Softplus is softplus(x) = log(1 + exp(x)), a smooth approximation to ReLU that outputs strictly positive values. It is commonly used for variance/scale parameters in probabilistic models and as a component of the Mish activation.
The stable version uses different formulas depending on the sign and magnitude of x:

```cuda
__device__ float softplus_stable(float x) {
    // For large positive x: softplus(x) ≈ x, since exp(x) >> 1
    // For large negative x: softplus(x) ≈ exp(x), since log1p(t) ≈ t when t is small
    if (x > 20.0f)  return x;
    if (x < -20.0f) return __expf(x);
    return log1pf(__expf(x));
}
```
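Since the intro mentions Mish, here is a minimal illustrative sketch (not from the original article) of how softplus_stable composes into it, using the standard definition mish(x) = x * tanh(softplus(x)):

```cuda
// Mish activation built on the stable softplus above: mish(x) = x * tanh(softplus(x)).
__device__ float mish(float x) {
    return x * tanhf(softplus_stable(x));
}
```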
Direct computation overflows for x > 88, where expf saturates to +inf:

```cuda
__global__ void softplus_naive(float* x, float* y, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) y[idx] = logf(1.0f + expf(x[idx])); // expf(x) overflows to inf for x > ~88
}
```
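To see the failure mode concretely, here is a minimal host-side sketch (illustrative only; the single-element input and <<<1,1>>> launch are arbitrary choices, not from the article):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical driver: runs softplus_naive on one large input to show the overflow.
int main() {
    float h_x = 100.0f, h_y = 0.0f;   // softplus(100) ≈ 100, but the naive kernel yields inf
    float *d_x, *d_y;
    cudaMalloc(&d_x, sizeof(float));
    cudaMalloc(&d_y, sizeof(float));
    cudaMemcpy(d_x, &h_x, sizeof(float), cudaMemcpyHostToDevice);

    softplus_naive<<<1, 1>>>(d_x, d_y, 1);

    cudaMemcpy(&h_y, d_y, sizeof(float), cudaMemcpyDeviceToHost);
    printf("softplus_naive(100.0f) = %f\n", h_y);  // prints inf: expf(100) overflows
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}
```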
The optimized kernel is numerically stable, vectorized with float4, and supports a configurable beta parameter (softplus(x) = log(1 + exp(beta*x)) / beta, with a linear cutoff once beta*x exceeds 20):

```cuda
__global__ void softplus_opt(float4* x, float4* y, float beta, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // n is the number of float4 elements
    if (idx < n) {
        float4 v = x[idx];
        float threshold = 20.0f / beta;  // beyond this, softplus(x) ≈ x to within float precision
        y[idx] = make_float4(
            (v.x > threshold) ? v.x : log1pf(__expf(beta*v.x)) / beta,
            (v.y > threshold) ? v.y : log1pf(__expf(beta*v.y)) / beta,
            (v.z > threshold) ? v.z : log1pf(__expf(beta*v.z)) / beta,
            (v.w > threshold) ? v.w : log1pf(__expf(beta*v.w)) / beta
        );
    }
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput | 350 GB/s | 580 GB/s | 66% faster |
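A minimal launch sketch for the vectorized kernel (illustrative; launch_softplus_opt, the 256-thread block size, and the assumption that num_floats is a multiple of 4 are my own choices, not from the article):

```cuda
// Hypothetical launch helper: num_floats must be a multiple of 4 here; a real
// implementation would also need a scalar tail path (or padding) when it is not.
void launch_softplus_opt(float* d_in, float* d_out, float beta, int num_floats) {
    int n = num_floats / 4;                    // number of float4 elements
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    softplus_opt<<<blocks, threads>>>(
        reinterpret_cast<float4*>(d_in),       // cudaMalloc'd buffers are 16-byte aligned
        reinterpret_cast<float4*>(d_out),
        beta, n);
}
```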
Softplus is smooth (differentiable everywhere) and strictly positive. Use it for variance outputs or when smoothness matters; plain ReLU is cheaper when those properties aren't needed.
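Since smoothness is the selling point, it is worth noting that the gradient has a closed form, d/dx softplus(x) = sigmoid(x). A minimal backward-kernel sketch (illustrative; the names are not from the article):

```cuda
// Backward pass: grad_x = grad_y * sigmoid(x), since d/dx log(1 + exp(x)) = 1 / (1 + exp(-x)).
__global__ void softplus_backward(const float* x, const float* grad_y, float* grad_x, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float sig = 1.0f / (1.0f + __expf(-x[idx]));  // saturates to the correct limit (0 or 1) for large |x|
        grad_x[idx] = grad_y[idx] * sig;
    }
}
```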
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.