Broadcasting enables element-wise operations between tensors of different shapes by virtually expanding the smaller tensor. No data is actually copied; the expansion is just an index mapping. Broadcasting is essential for bias addition, normalization, and masked operations.
Use a stride of 0 for broadcast dimensions: every position along a broadcast axis then reads the same element. For bias addition, the channel index alone selects the bias value:
```cuda
// Add bias [C] to activations [N, C, H, W]
// Bias has stride 0 for the N, H, W dimensions
__global__ void add_bias_broadcast(float* x, float* bias, float* y,
                                   int N, int C, int H, int W) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = N * C * H * W;
    if (idx >= total) return;
    int c = (idx / (H * W)) % C;   // Channel index
    y[idx] = x[idx] + bias[c];     // Bias broadcasts over N, H, W
}
```
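Launching it takes one thread per output element. A minimal host-side sketch, assuming `d_x`, `d_bias`, and `d_y` are device buffers that already hold the activations, bias, and output (the wrapper name and launch configuration are illustrative, not from the original):

```cuda
// Sketch: one thread per element of the [N, C, H, W] output.
// d_x, d_bias, d_y are assumed to be device pointers allocated elsewhere.
void launch_add_bias(float* d_x, float* d_bias, float* d_y,
                     int N, int C, int H, int W) {
    int total = N * C * H * W;
    int threads = 256;
    int blocks = (total + threads - 1) / threads;  // Ceiling division
    add_bias_broadcast<<<blocks, threads>>>(d_x, d_bias, d_y, N, C, H, W);
}
```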
The naive alternative materializes the expanded bias, wasting N × H × W times the memory. For the [64, 256, 56, 56] case benchmarked below, that is 200,704 copies of each bias value.

```cuda
// DON'T DO THIS - wastes memory!
void add_bias_naive(float* x, float* bias, float* y, int N, int C, int H, int W) {
    // Expand bias to [N, C, H, W]
    float* expanded_bias;
    cudaMalloc(&expanded_bias, N * C * H * W * sizeof(float));
    expand_kernel<<<...>>>(bias, expanded_bias, C, N, H, W);
    // Then add
    add_kernel<<<...>>>(x, expanded_bias, y, N * C * H * W);
}
```

Zero strides handle the broadcast dimensions in a fully general kernel:
```cuda
// General broadcast binary operation
__global__ void broadcast_binary_op(
    float* a, float* b, float* out,
    int* a_strides, int* b_strides, int* out_strides,
    int* out_shape, int ndim, int total) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total) return;

    // Compute multi-index from output linear index
    int a_idx = 0, b_idx = 0;
    int remaining = idx;
    for (int d = 0; d < ndim; d++) {
        int coord = remaining / out_strides[d];
        remaining %= out_strides[d];
        a_idx += coord * a_strides[d];  // 0 stride = broadcast
        b_idx += coord * b_strides[d];
    }
    out[idx] = a[a_idx] + b[b_idx];  // Or *, -, /, etc.
}
```

| Metric | Naive (materialized) | Optimized (broadcast) | Improvement |
|---|---|---|---|
| Bias add [64, 256, 56, 56] | OOM | 0.2 ms | Works vs. fails |
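The general kernel expects per-dimension stride arrays in which every broadcast dimension carries stride 0. A sketch of how those might be computed on the host, assuming NumPy-style broadcasting with input shapes already right-aligned and padded with 1s to `ndim` (the helper name is an assumption, not part of the original code):

```cuda
// Row-major strides for in_shape, with stride 0 wherever a size-1 input
// dimension is broadcast against a larger output dimension.
void compute_broadcast_strides(const int* in_shape, const int* out_shape,
                               int ndim, int* strides) {
    int stride = 1;
    for (int d = ndim - 1; d >= 0; d--) {   // Contiguous row-major layout
        strides[d] = stride;
        stride *= in_shape[d];
    }
    for (int d = 0; d < ndim; d++) {        // Zero out broadcast dimensions
        if (in_shape[d] == 1 && out_shape[d] != 1) strides[d] = 0;
    }
}
```

For a [C] bias padded to [1, C, 1, 1] against a [N, C, H, W] output, this yields strides {0, 1, 0, 0}, so every (n, h, w) position reads the same bias element. Since `broadcast_binary_op` receives these arrays through `int*` parameters, they must be copied into device memory before the launch.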
Broadcasting never copies the input data: it is virtual expansion via stride manipulation, and only the output is materialized.
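To make that concrete, here is a hedged end-to-end sketch that drives `broadcast_binary_op` with the helper above; the function and buffer names are illustrative. The inputs `d_a` and `d_bias` are used in place, and only the output buffer is allocated:

```cuda
// Sketch: add a [C] bias to a [N, C, H, W] tensor with the general kernel.
// d_a and d_bias are existing device buffers; only d_out is newly allocated.
void broadcast_bias_add(float* d_a, float* d_bias, float** d_out,
                        int N, int C, int H, int W) {
    const int ndim = 4;
    int out_shape[ndim]  = {N, C, H, W};
    int a_shape[ndim]    = {N, C, H, W};
    int bias_shape[ndim] = {1, C, 1, 1};   // [C] right-aligned and padded with 1s

    int a_strides[ndim], bias_strides[ndim], out_strides[ndim];
    compute_broadcast_strides(a_shape, out_shape, ndim, a_strides);
    compute_broadcast_strides(bias_shape, out_shape, ndim, bias_strides);
    compute_broadcast_strides(out_shape, out_shape, ndim, out_strides);

    int total = N * C * H * W;
    cudaMalloc(d_out, total * sizeof(float));   // The only new tensor allocation

    // The kernel takes device pointers, so the small metadata arrays are copied over
    int *d_as, *d_bs, *d_os, *d_shape;
    cudaMalloc(&d_as, ndim * sizeof(int));
    cudaMalloc(&d_bs, ndim * sizeof(int));
    cudaMalloc(&d_os, ndim * sizeof(int));
    cudaMalloc(&d_shape, ndim * sizeof(int));
    cudaMemcpy(d_as, a_strides, ndim * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_bs, bias_strides, ndim * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_os, out_strides, ndim * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_shape, out_shape, ndim * sizeof(int), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    broadcast_binary_op<<<blocks, threads>>>(d_a, d_bias, *d_out,
                                             d_as, d_bs, d_os, d_shape,
                                             ndim, total);

    cudaDeviceSynchronize();                    // Wait before freeing the metadata
    cudaFree(d_as); cudaFree(d_bs); cudaFree(d_os); cudaFree(d_shape);
}
```

In practice the stride metadata would typically be cached or placed in constant memory rather than allocated and copied on every call.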
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.