Parallel reduction is a fundamental building block in GPU computing. Reducing an array of N elements to a single value (sum, max, min, etc.) in O(log N) parallel steps instead of O(N) sequential steps is one of the most important patterns to master. A naive reduction implementation can be 10-50x slower than an optimized one, even on the same GPU. This guide walks through seven progressive optimizations, from basic tree reduction to modern warp-level primitives, showing how each change impacts performance.
Replace interleaved addressing with sequential addressing to eliminate bank conflicts. Threads access consecutive addresses instead of strided patterns.
// BAD: Interleaved addressing (bank conflicts)
for (int s = 1; s < blockDim.x; s *= 2) {
if (tid % (2 * s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// GOOD: Sequential addressing (conflict-free)
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}Perform the first reduction step while loading data from global memory, effectively doubling the work per thread and halving the number of blocks needed.
__global__ void reduce_first_add(float* g_in, float* g_out, int n) {
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
// First add during load - each thread loads and adds 2 elements
float mySum = 0;
if (i < n) mySum = g_in[i];
if (i + blockDim.x < n) mySum += g_in[i + blockDim.x];
sdata[tid] = mySum;
__syncthreads();
// Continue with tree reduction
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_out[blockIdx.x] = sdata[0];
}When reduction fits in a single warp (32 threads), skip __syncthreads() since warp execution is synchronous. This eliminates synchronization overhead.
// Device function for warp-level reduction (no sync needed)
__device__ void warpReduce(volatile float* sdata, int tid) {
// volatile keyword prevents compiler from caching in registers
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
__global__ void reduce_unroll_warp(float* g_in, float* g_out, int n) {
extern __shared__ float sdata[];
// ... load and initial reduction ...
// Reduce until we're down to 64 elements
for (int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) sdata[tid] += sdata[tid + s];
__syncthreads();
}
// Warp-level reduction (no sync needed)
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) g_out[blockIdx.x] = sdata[0];
}Use __shfl_down_sync() to exchange values between threads in a warp without shared memory. This is faster and uses no shared memory bandwidth.
// Modern warp-level reduction using shuffle
__device__ float warpReduceSum(float val) {
// All threads in warp participate
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down_sync(0xffffffff, val, offset);
}
return val; // Only thread 0 has the final sum
}
__device__ float blockReduceSum(float val) {
__shared__ float shared[32]; // One slot per warp
int lane = threadIdx.x % 32;
int wid = threadIdx.x / 32;
// Reduce within warp
val = warpReduceSum(val);
// Write warp result to shared memory
if (lane == 0) shared[wid] = val;
__syncthreads();
// Final reduction by first warp
val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0;
if (wid == 0) val = warpReduceSum(val);
return val;
}Have each thread process multiple input elements before the tree reduction. This increases arithmetic intensity and reduces the number of threads needed.
__global__ void reduce_multi_element(float* g_in, float* g_out, int n) {
float sum = 0;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gridSize = blockDim.x * gridDim.x;
// Grid-stride loop: each thread sums multiple elements
for (int i = tid; i < n; i += gridSize) {
sum += g_in[i];
}
// Now reduce within block using shared memory or shuffles
sum = blockReduceSum(sum);
if (threadIdx.x == 0) {
g_out[blockIdx.x] = sum;
}
}
// Launch with fewer blocks
int blocks = min((n + 1023) / 1024, 1024);
reduce_multi_element<<<blocks, 256>>>(d_in, d_out, n);Use CUDA Cooperative Groups for flexible, composable synchronization primitives. This enables cleaner code and automatic optimization.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__device__ float reduceGroup(cg::thread_group g, float* sdata, float val) {
int lane = g.thread_rank();
// Reduce within group
for (int i = g.size() / 2; i > 0; i /= 2) {
sdata[lane] = val;
g.sync();
if (lane < i) val += sdata[lane + i];
g.sync();
}
return val;
}
__global__ void reduce_cg(float* g_in, float* g_out, int n) {
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
extern __shared__ float sdata[];
float sum = 0;
// Load and sum
for (int i = block.group_index().x * block.size() + block.thread_rank();
i < n; i += block.size() * gridDim.x) {
sum += g_in[i];
}
// Reduce within warp (no shared memory needed)
sum = cg::reduce(warp, sum, cg::plus<float>());
// Store warp result and reduce across warps
// ...
}This naive implementation suffers from warp divergence, bank conflicts, excessive synchronization, and poor thread utilization.
// Naive reduction with multiple performance issues
__global__ void reduce_naive(float* g_in, float* g_out, int n) {
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Load one element per thread
sdata[tid] = (i < n) ? g_in[i] : 0;
__syncthreads();
// Interleaved reduction (causes bank conflicts and divergence)
for (int s = 1; s < blockDim.x; s *= 2) {
// PROBLEM 1: Divergent warps
if (tid % (2 * s) == 0) {
// PROBLEM 2: Bank conflicts due to strided access
sdata[tid] += sdata[tid + s];
}
__syncthreads(); // PROBLEM 3: Sync even when not needed
}
if (tid == 0) g_out[blockIdx.x] = sdata[0];
}
// PROBLEM 4: Requires multiple kernel launches for large arrays
// First launch: N/256 blocks -> N/256 partial sums
// Second launch: reduce partial sums
// ... repeat until single valueThis optimized version combines grid-stride loops, first-add-during-load, template unrolling, sequential addressing, and warp shuffle instructions.
// Highly optimized reduction using all techniques
template<int BLOCK_SIZE>
__global__ void reduce_optimized(float* g_in, float* g_out, int n) {
__shared__ float sdata[BLOCK_SIZE];
int tid = threadIdx.x;
int i = blockIdx.x * (BLOCK_SIZE * 2) + tid;
int gridSize = BLOCK_SIZE * 2 * gridDim.x;
// Grid-stride loop with first add during load
float mySum = 0;
while (i < n) {
mySum += g_in[i];
if (i + BLOCK_SIZE < n) mySum += g_in[i + BLOCK_SIZE];
i += gridSize;
}
sdata[tid] = mySum;
__syncthreads();
// Sequential addressing reduction
if (BLOCK_SIZE >= 512) { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads(); }
if (BLOCK_SIZE >= 256) { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); }
if (BLOCK_SIZE >= 128) { if (tid < 64) sdata[tid] += sdata[tid + 64]; __syncthreads(); }
// Warp-level reduction with shuffles
if (tid < 32) {
float val = sdata[tid];
if (BLOCK_SIZE >= 64) val += sdata[tid + 32];
// Warp shuffle reduction
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down_sync(0xffffffff, val, offset);
}
if (tid == 0) g_out[blockIdx.x] = val;
}
}
// Single-pass reduction for common sizes
// For very large arrays, use atomicAdd for final combination
__global__ void reduce_atomic(float* g_in, float* g_out, int n) {
float sum = 0;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < n; i += blockDim.x * gridDim.x) {
sum += g_in[i];
}
sum = blockReduceSum(sum); // Uses shuffles internally
if (threadIdx.x == 0) atomicAdd(g_out, sum);
}| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Execution Time (16M elements) | 2.8 ms | 0.12 ms | 23x faster |
| Memory Bandwidth Utilization | 35% | 92% | 2.6x better |
| Effective Throughput (GB/s) | 85 GB/s | 420 GB/s | 5x higher |
| Thread Utilization | 25% | 95% | 3.8x better |
atomicAdd is convenient for single-pass reductions and works well when the number of atomic operations is small (hundreds to low thousands). For maximum performance, multi-pass reduction without atomics is faster. Use atomicAdd when code simplicity matters or when reducing to multiple output locations.
For most GPUs, 256 threads per block is a good starting point. This provides enough parallelism for warp-level optimizations while maintaining good occupancy. Powers of 2 (128, 256, 512) allow complete loop unrolling. Profile with Nsight Compute to find the optimal size for your specific kernel.
Use boundary checks when loading data (if i < n) and initialize out-of-bounds elements to the identity value (0 for sum, -INF for max, +INF for min). The grid-stride loop pattern handles arbitrary sizes elegantly without requiring padding.
Yes! NVIDIA CUB (CUDA Unbound) provides highly optimized reduction primitives that often match or exceed hand-tuned kernels. Use cub::DeviceReduce::Sum() for production code. Write custom kernels when you need to fuse reduction with other operations or need specialized reduction operators.
Same pattern with different operator
Generalization of reduction
Uses atomic operations like reduction
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.