Loading...
Mish is x*tanh(softplus(x)) = x*tanh(ln(1+exp(x))), a smooth self-regularized activation. Used in YOLOv4 and other detection models. Slightly better than Swish but more computationally expensive.
Compute softplus stably, then tanh.
__device__ float mish_stable(float x) {
// softplus(x) = ln(1 + exp(x)) = max(x,0) + ln(1 + exp(-|x|))
float sp = (x > 0) ? x + log1pf(__expf(-x)) : log1pf(__expf(x));
return x * tanhf(sp);
}Direct computation, overflows.
__global__ void mish_naive(float* x, float* y, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
float v = x[idx];
y[idx] = v * tanhf(logf(1.0f + expf(v))); // Overflows for v>88!
}
}Vectorized with stable computation.
__global__ void mish_opt(float4* x, float4* y, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
float4 v = x[idx];
y[idx] = make_float4(
mish_stable(v.x), mish_stable(v.y),
mish_stable(v.z), mish_stable(v.w)
);
}
}| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput | 280 GB/s | 450 GB/s | 61% faster |
| vs ReLU | 2.5x slower | 1.8x slower | Acceptable for accuracy gains |
For detection tasks (YOLO), yes. For general vision, Swish is usually sufficient with lower cost.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.