Warp primitives are the fastest communication mechanism in CUDA: a shuffle moves data directly between registers in a single instruction, while shared memory needs a store, a block-wide barrier, and a load. Mastering the shuffle, vote, and match functions enables highly optimized reductions, scans, and filtering. This guide covers the major warp primitives with practical examples for common patterns.
- Use __shfl_down_sync to reduce 32 values in 5 steps.
- Use __ballot_sync to analyze branch divergence across a warp.
- Use __match_any_sync to find lanes holding duplicate values.
A shared-memory reduction needs a __syncthreads() barrier at every level of the tree:
```cuda
// Reduction using shared memory
__global__ void reduce_shared(float* input, float* output, int n) {
    __shared__ float sdata[256]; // assumes blockDim.x <= 256
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;
    sdata[tid] = (i < n) ? input[i] : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads(); // Barrier at every level
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
```

Warp shuffles are roughly 10x faster than shared memory for intra-warp communication because they skip the shared-memory traffic and the per-level barriers.
```cuda
// Fast warp-level reduction using shuffle
__device__ float warp_reduce_sum(float val) {
    // Full warp mask
    unsigned mask = 0xffffffff;
    // Shift-down tree reduction: each step halves the number of partial sums
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_down_sync(mask, val, offset);
    }
    return val; // Full sum is valid only in lane 0
}

__device__ float warp_reduce_max(float val) {
    unsigned mask = 0xffffffff;
    for (int offset = 16; offset > 0; offset /= 2) {
        val = fmaxf(val, __shfl_down_sync(mask, val, offset));
    }
    return val; // Maximum is valid only in lane 0
}
```
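With __shfl_down_sync only lane 0 ends up with the final result. When every lane needs it, a butterfly pattern built on __shfl_xor_sync avoids a separate broadcast step. The sketch below follows the same structure as warp_reduce_sum above; the name warp_reduce_sum_all is introduced here for illustration.

```cuda
// Butterfly (XOR) reduction: after the loop every lane holds the full sum
__device__ float warp_reduce_sum_all(float val) {
    unsigned mask = 0xffffffff;
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(mask, val, offset);
    }
    return val; // Valid in all 32 lanes, no broadcast needed
}
```

The same shuffle family also covers scans. The following warp-level inclusive prefix sum is a sketch using __shfl_up_sync; warp_inclusive_scan is likewise an illustrative name.

```cuda
// Inclusive prefix sum across the warp (Hillis-Steele style)
__device__ float warp_inclusive_scan(float val) {
    unsigned mask = 0xffffffff;
    int lane = threadIdx.x % 32;
    for (int offset = 1; offset < 32; offset *= 2) {
        float up = __shfl_up_sync(mask, val, offset);
        if (lane >= offset) val += up; // lower lanes have no partner that far up
    }
    return val; // Lane i holds the sum of lanes 0..i
}
```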
```cuda
// Broadcast a value from src_lane (lane 0 by default) to all lanes
__device__ float warp_broadcast(float val, int src_lane = 0) {
    return __shfl_sync(0xffffffff, val, src_lane);
}

// Vote functions
__device__ bool warp_all(bool predicate) {
    return __all_sync(0xffffffff, predicate);
}

__device__ bool warp_any(bool predicate) {
    return __any_sync(0xffffffff, predicate);
}

// Get a bitmask of the lanes that satisfy the predicate
__device__ unsigned warp_ballot(bool predicate) {
    return __ballot_sync(0xffffffff, predicate);
}

// Find lanes holding the same value (Volta+)
__device__ unsigned warp_match(int val) {
    return __match_any_sync(0xffffffff, val);
}
```
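The bitmasks returned by the vote and match functions combine naturally with __popc (population count). The helpers below are sketches built on the wrappers above; count_positive_lanes and count_duplicate_keys are illustrative names, not part of the CUDA API.

```cuda
// How many lanes in the warp hold a positive value?
__device__ int count_positive_lanes(float x) {
    unsigned positive_mask = warp_ballot(x > 0.0f); // same mask in every lane
    return __popc(positive_mask);                   // count the set bits
}

// How many other lanes hold the same key as this lane? (Volta+)
__device__ int count_duplicate_keys(int key) {
    unsigned match_mask = warp_match(key);
    return __popc(match_mask) - 1; // subtract 1 to exclude this lane itself
}
```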
```cuda
// Block reduction using warp shuffle (assumes blockDim.x == 256)
__global__ void reduce_warp(float* input, float* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float val = (idx < n) ? input[idx] : 0.0f;

    // Warp-level reduction
    val = warp_reduce_sum(val);

    // Write each warp's result to shared memory
    __shared__ float warp_sums[8]; // 256 threads = 8 warps
    int lane = threadIdx.x % 32;
    int warp = threadIdx.x / 32;
    if (lane == 0) warp_sums[warp] = val;
    __syncthreads();

    // First warp reduces the per-warp results
    if (warp == 0) {
        val = (lane < 8) ? warp_sums[lane] : 0.0f;
        val = warp_reduce_sum(val);
        if (lane == 0) output[blockIdx.x] = val;
    }
}
```

| Metric | Shared memory (naive) | Warp shuffle (optimized) | Improvement |
|---|---|---|---|
| Reduction latency | 1x | 0.5x | Fewer sync points |
| Shared memory usage | 1 KB per block | 32 B per block | Data stays in registers |
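reduce_warp produces one partial sum per block, so a second step is needed for a grand total. The host-side sketch below assumes the 256-thread block size used above and finishes the last step on the CPU; buffer names are illustrative and error checking is omitted.

```cuda
#include <cstdio>
#include <numeric>
#include <vector>

void reduce_example(const float* h_input, int n) {
    const int block = 256;
    const int grid = (n + block - 1) / block;

    float *d_input, *d_partial;
    cudaMalloc(&d_input, n * sizeof(float));
    cudaMalloc(&d_partial, grid * sizeof(float));
    cudaMemcpy(d_input, h_input, n * sizeof(float), cudaMemcpyHostToDevice);

    // One partial sum per block lands in d_partial
    reduce_warp<<<grid, block>>>(d_input, d_partial, n);

    // Final pass: copy the per-block sums back and finish on the CPU
    std::vector<float> partial(grid);
    cudaMemcpy(partial.data(), d_partial, grid * sizeof(float), cudaMemcpyDeviceToHost);
    float total = std::accumulate(partial.begin(), partial.end(), 0.0f);
    printf("sum = %f\n", total);

    cudaFree(d_input);
    cudaFree(d_partial);
}
```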
The mask specifies which threads participate. Use 0xffffffff for full warp. Threads not in mask have undefined behavior - critical to handle inactive threads.
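One safe way to handle a partial tail is to keep the full 0xffffffff mask and let lanes without real work contribute the identity element, so all 32 lanes still execute the shuffle (the reduction kernels above already do this). The sketch below assumes every lane of the warp reaches the call; warp_reduce_sum_tail and the per-warp data pointer are illustrative.

```cuda
// Sum 'count' values (count <= 32) from a per-warp buffer, padding the rest with 0
__device__ float warp_reduce_sum_tail(const float* data, int count) {
    int lane = threadIdx.x % 32;
    float val = (lane < count) ? data[lane] : 0.0f; // identity for the padded lanes
    return warp_reduce_sum(val); // all 32 lanes participate with the full mask
}
```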
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.