Dropout randomly zeroes elements during training as regularization. An efficient CUDA implementation requires fast random number generation, memory-efficient mask storage, and fusion with adjacent operations. Inverted dropout (scaling at training time) is the modern standard.
Generate random numbers and apply the mask in a single kernel:
```cuda
#include <curand_kernel.h>

__global__ void dropout_fused(float* x, float* y, float* mask,
                              unsigned long long seed, float p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // Philox is counter-based: seeding with (seed, subsequence=idx)
        // gives each thread an independent stream with no state array.
        curandStatePhilox4_32_10_t state;
        curand_init(seed, idx, 0, &state);
        float rand = curand_uniform(&state);
        float keep = (rand > p) ? 1.0f : 0.0f;
        float scale = 1.0f / (1.0f - p);  // Inverted dropout: scale at training time
        mask[idx] = keep;
        y[idx] = x[idx] * keep * scale;
    }
}
```
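A minimal host-side launch for the fused kernel, one thread per element; the wrapper name and launch configuration are illustrative, not part of the kernel above:

```cuda
// Illustrative launch wrapper: pass a fresh seed each training step so the
// dropout pattern changes between iterations.
void launch_dropout_fused(float* x, float* y, float* mask,
                          unsigned long long seed, float p, int n) {
    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // one thread per element
    dropout_fused<<<blocks, threads>>>(x, y, mask, seed, p, n);
}
```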
The naive alternative launches two kernels with synchronization overhead in between:

```cuda
// Separate kernels for mask generation and application (sketch:
// generate_mask, apply_mask, blocks, threads, seed defined elsewhere)
void dropout_naive(float* x, float* y, float* mask, float p, int n) {
    // First pass: write the random mask
    generate_mask<<<blocks, threads>>>(mask, p, n, seed);
    cudaDeviceSynchronize();  // stalls the CPU between the two launches
    // Second pass: re-read x and the mask, write y
    apply_mask<<<blocks, threads>>>(x, y, mask, 1.0f/(1.0f-p), n);
}
```

The optimized version is vectorized, stores the mask compactly, and uses a grid-stride loop:
```cuda
__global__ void dropout_opt(float* __restrict__ x, float* __restrict__ y,
                            uint8_t* __restrict__ mask,  // compact: 4 keep-bits per byte
                            unsigned long long seed, float p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    int n4 = n / 4;  // float4 groups; assumes n % 4 == 0 and 16-byte-aligned x, y
    float scale = 1.0f / (1.0f - p);
    curandStatePhilox4_32_10_t state;
    curand_init(seed, idx, 0, &state);
    for (int i = idx; i < n4; i += stride) {
        // One Philox call yields four uniforms: process 4 elements per iteration
        float4 rand4 = curand_uniform4(&state);
        float4 in = reinterpret_cast<float4*>(x)[i];
        float4 out;
        uint8_t m = 0;
        m |= (rand4.x > p) << 0; out.x = in.x * ( m       & 1) * scale;
        m |= (rand4.y > p) << 1; out.y = in.y * ((m >> 1) & 1) * scale;
        m |= (rand4.z > p) << 2; out.z = in.z * ((m >> 2) & 1) * scale;
        m |= (rand4.w > p) << 3; out.w = in.w * ((m >> 3) & 1) * scale;
        reinterpret_cast<float4*>(y)[i] = out;
        mask[i] = m;  // one byte records the keep bits for four elements
    }
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput (10M elements) | 85 GB/s | 420 GB/s | 4.9x faster |
| Mask memory | 40 MB (float) | 2.5 MB (packed uint8, one byte per 4 elements) | 16x smaller |
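A host-side launch sketch for `dropout_opt`: because of the grid-stride loop, the grid can be capped at a fixed size and still cover any `n`. The wrapper name and the 1024-block cap are illustrative, not the configuration behind the numbers above:

```cuda
// Illustrative launch: the grid-stride loop decouples grid size from n.
void launch_dropout_opt(float* x, float* y, uint8_t* mask,
                        unsigned long long seed, float p, int n) {
    int threads = 256;
    int n4 = n / 4;  // kernel consumes 4 elements per thread per iteration
    int blocks = (n4 + threads - 1) / threads;
    if (blocks > 1024) blocks = 1024;  // cap the grid; the loop strides over the rest
    dropout_opt<<<blocks, threads>>>(x, y, mask, seed, p, n);
}
```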
Inverted dropout applies the 1/(1-p) scaling during training, so inference needs no modification at all. Standard dropout instead scales activations by (1-p) at inference, adding an extra pass over every dropout layer at serving time.
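To see the cost inverted dropout avoids, here is a sketch of the extra elementwise pass that standard (non-inverted) dropout would need at inference; the kernel is hypothetical and exists only to illustrate the overhead:

```cuda
// Hypothetical inference-time kernel required by standard dropout:
// activations must be scaled by (1 - p) to match training-time expectations.
// With inverted dropout, this kernel simply does not exist.
__global__ void scale_at_inference(const float* __restrict__ x,
                                   float* __restrict__ y, float p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = x[idx] * (1.0f - p);
    }
}
```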
For the backward pass, reuse the mask saved during the forward pass: store it compactly (down to 1 bit per element) and expand the bits on the fly while scaling the incoming gradients, as sketched below.
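A sketch of the matching backward kernel, assuming the byte-per-four-elements packing that `dropout_opt` writes above (and n % 4 == 0); the kernel name is illustrative:

```cuda
// Backward pass: reuse the compact forward mask on the incoming gradient.
// Assumes the 4-bits-per-byte layout produced by dropout_opt.
__global__ void dropout_backward(const float* __restrict__ grad_out,
                                 float* __restrict__ grad_in,
                                 const uint8_t* __restrict__ mask,
                                 float p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    int n4 = n / 4;
    float scale = 1.0f / (1.0f - p);
    for (int i = idx; i < n4; i += stride) {
        float4 g = reinterpret_cast<const float4*>(grad_out)[i];
        uint8_t m = mask[i];  // expand the packed keep bits on the fly
        float4 out;
        out.x = g.x * ( m       & 1) * scale;
        out.y = g.y * ((m >> 1) & 1) * scale;
        out.z = g.z * ((m >> 2) & 1) * scale;
        out.w = g.w * ((m >> 3) & 1) * scale;
        reinterpret_cast<float4*>(grad_in)[i] = out;
    }
}
```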
Dropout is often fused with adjacent operations rather than launched alone:

- Dropout + activation in a single elementwise kernel (sketched below)
- Dropout applied in the GEMM epilogue, as output tiles are written back
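A sketch of the first pattern: ReLU and dropout fused into one elementwise kernel, so the pre-dropout activation never makes a round trip through global memory. The kernel name is illustrative, and the mask here is stored as one byte per element for simplicity:

```cuda
// Hypothetical fused ReLU + dropout: one read and one write per element
// instead of two of each for separate activation and dropout kernels.
__global__ void relu_dropout_fused(const float* __restrict__ x,
                                   float* __restrict__ y,
                                   uint8_t* __restrict__ mask,
                                   unsigned long long seed, float p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        curandStatePhilox4_32_10_t state;
        curand_init(seed, idx, 0, &state);
        float v = fmaxf(x[idx], 0.0f);              // ReLU
        uint8_t keep = curand_uniform(&state) > p;  // dropout decision
        mask[idx] = keep;                           // saved for the backward pass
        y[idx] = v * keep * (1.0f / (1.0f - p));    // inverted-dropout scaling
    }
}
```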
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.