ELU (Exponential Linear Unit) computes x for x > 0 and α·(exp(x) − 1) for x ≤ 0. Because it produces negative outputs for negative inputs, it pushes mean activations toward zero, which speeds up convergence in deep networks; its scaled variant SELU adds self-normalizing properties.
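For reference, the same piecewise formula on the host can be used to spot-check kernel output. A minimal sketch (elu_ref is an illustrative name, not part of the kernels below):

#include <math.h>

// Host-side reference: identical piecewise definition, useful for validating GPU results.
float elu_ref(float x, float alpha) {
    return (x > 0.0f) ? x : alpha * (expf(x) - 1.0f);
}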
Branchless implementation: use arithmetic selection to avoid warp divergence.
__device__ float elu_branchless(float x, float alpha) {
    // Split x into positive and negative parts with min/max instead of a branch.
    float pos = fmaxf(x, 0.0f);
    float neg = fminf(x, 0.0f);
    // For x > 0, neg == 0 and expf(0) - 1 == 0, so the (neg < 0) guard is redundant but harmless.
    return pos + alpha * (expf(neg) - 1.0f) * (neg < 0);
    // Or simply: x > 0 ? x : alpha * (expf(x) - 1.0f); a ternary this small
    // typically compiles to a predicated select rather than a divergent branch.
}

Conditional implementation.
__global__ void elu_naive(float* x, float* y, float alpha, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // One thread per element; the ternary applies the piecewise ELU definition.
        float v = x[idx];
        y[idx] = (v > 0.0f) ? v : alpha * (expf(v) - 1.0f);
    }
}

Vectorized with minimal divergence.
__global__ void elu_opt(float4* x, float4* y, float alpha, int n) {
    // n counts float4 elements, i.e. one quarter of the float count.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float4 v = x[idx];
        // Positive part of each lane.
        float4 pos = make_float4(fmaxf(v.x, 0.0f), fmaxf(v.y, 0.0f), fmaxf(v.z, 0.0f), fmaxf(v.w, 0.0f));
        // Negative part: __expf is the fast hardware exponential; the per-lane
        // ternaries compile to predicated selects, not divergent branches.
        float4 neg = make_float4(
            alpha * (v.x < 0.0f ? __expf(v.x) - 1.0f : 0.0f),
            alpha * (v.y < 0.0f ? __expf(v.y) - 1.0f : 0.0f),
            alpha * (v.z < 0.0f ? __expf(v.z) - 1.0f : 0.0f),
            alpha * (v.w < 0.0f ? __expf(v.w) - 1.0f : 0.0f));
        y[idx] = make_float4(pos.x + neg.x, pos.y + neg.y, pos.z + neg.z, pos.w + neg.w);
    }
}

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput | 380 GB/s | 620 GB/s | 63% faster |
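A host-side launch for the vectorized kernel could look like the sketch below. It assumes the element count is a multiple of 4 and that d_in and d_out are device buffers already allocated with cudaMalloc; launch_elu, d_in, d_out, and the block size of 256 are illustrative choices, not part of the benchmark above.

void launch_elu(float* d_in, float* d_out, float alpha, int n_floats) {
    // Treat the buffers as float4; assumes n_floats % 4 == 0 and 16-byte
    // alignment (cudaMalloc returns suitably aligned pointers).
    int n_vec4 = n_floats / 4;
    int block = 256;
    int grid = (n_vec4 + block - 1) / block;
    elu_opt<<<grid, block>>>(reinterpret_cast<float4*>(d_in),
                             reinterpret_cast<float4*>(d_out),
                             alpha, n_vec4);
    // A non-multiple-of-4 tail could be handled by a small elu_naive launch
    // over the remaining elements.
}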
The α parameter defaults to 1.0. SELU uses α ≈ 1.6733 together with a scale factor λ ≈ 1.0507 to achieve self-normalization.
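The same branchless pattern extends to SELU. A minimal sketch using the standard SELU constants (selu_branchless is an illustrative name):

__device__ float selu_branchless(float x) {
    // Standard SELU constants (assumed here; not part of the kernels above).
    const float alpha  = 1.6732632f;
    const float lambda = 1.0507010f;
    float pos = fmaxf(x, 0.0f);
    float neg = fminf(x, 0.0f);
    // lambda scales both branches; expf(0) - 1 == 0 keeps the positive path unchanged.
    return lambda * (pos + alpha * (expf(neg) - 1.0f));
}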
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.