Prefix scan (also called parallel scan) computes cumulative sums of an array in parallel. Given input [a, b, c, d], exclusive scan produces [0, a, a+b, a+b+c] and inclusive scan produces [a, a+b, a+b+c, a+b+c+d]. This seemingly simple operation is a fundamental building block for many parallel algorithms: it enables stream compaction (removing elements), radix sort, polynomial evaluation, and solving recurrence relations. A sequential scan takes O(N) time; a parallel scan runs in O(N/P + log N) time on P processors, which is essential for GPU-scale parallelism.
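As a reference for the kernels below, both variants are straightforward to compute sequentially on the host; a minimal sketch such as this (the function names are illustrative) is handy for validating GPU results.

#include <vector>

// Sequential reference scans (illustrative helpers for checking GPU output)
std::vector<float> inclusive_scan_ref(const std::vector<float>& in) {
    std::vector<float> out(in.size());
    float running = 0.0f;
    for (size_t i = 0; i < in.size(); ++i) {
        running += in[i];      // running total including element i
        out[i] = running;
    }
    return out;
}
std::vector<float> exclusive_scan_ref(const std::vector<float>& in) {
    std::vector<float> out(in.size());
    float running = 0.0f;
    for (size_t i = 0; i < in.size(); ++i) {
        out[i] = running;      // total of everything before element i
        running += in[i];
    }
    return out;
}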
The work-efficient Blelloch scan is a two-phase algorithm: an up-sweep (reduce) phase builds partial sums in a balanced tree, and a down-sweep phase distributes them back down. It performs O(N) work, matching the sequential scan.
// Work-efficient Blelloch scan for a single block.
// Assumes n is a power of two and n == 2 * blockDim.x.
__global__ void blelloch_scan(float* data, int n) {
extern __shared__ float temp[];
int tid = threadIdx.x;
int offset = 1;
// Load into shared memory
temp[2*tid] = data[2*tid];
temp[2*tid+1] = data[2*tid+1];
// Up-sweep (reduce) phase
for (int d = n >> 1; d > 0; d >>= 1) {
__syncthreads();
if (tid < d) {
int ai = offset * (2*tid+1) - 1;
int bi = offset * (2*tid+2) - 1;
temp[bi] += temp[ai];
}
offset *= 2;
}
// Clear last element for exclusive scan
if (tid == 0) temp[n-1] = 0;
// Down-sweep phase
for (int d = 1; d < n; d *= 2) {
offset >>= 1;
__syncthreads();
if (tid < d) {
int ai = offset * (2*tid+1) - 1;
int bi = offset * (2*tid+2) - 1;
float t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
data[2*tid] = temp[2*tid];
data[2*tid+1] = temp[2*tid+1];
}
Add padding to shared memory indices to prevent bank conflicts from power-of-2 stride patterns in the scan tree.
#define NUM_BANKS 32
#define LOG_NUM_BANKS 5
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
__global__ void scan_no_conflicts(float* data, int n) {
extern __shared__ float temp[];
int tid = threadIdx.x;
// Compute conflict-free indices
int ai = tid;
int bi = tid + (n/2);
int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
// Load with conflict-free addressing
temp[ai + bankOffsetA] = data[ai];
temp[bi + bankOffsetB] = data[bi];
int offset = 1;
// Up-sweep with conflict-free access
for (int d = n >> 1; d > 0; d >>= 1) {
__syncthreads();
if (tid < d) {
int ai = offset * (2*tid+1) - 1;
int bi = offset * (2*tid+2) - 1;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
temp[bi] += temp[ai];
}
offset *= 2;
}
// ... down-sweep with same addressing
}
For arrays larger than block size: scan within blocks, scan block sums, then add block sums back. Three-kernel approach handles arbitrary sizes.
// Three-phase large array scan
void scan_large(float* d_data, float* d_block_sums, int n, int block_size) {
int num_blocks = (n + block_size - 1) / block_size;
// Shared memory for one block's worth of elements
size_t smem_size = block_size * sizeof(float);
// Phase 1: Scan within each block, save block totals
scan_blocks<<<num_blocks, block_size/2, smem_size>>>(
d_data, d_block_sums, n);
// Phase 2: Scan the block sums (recurse for very large arrays;
// d_block_sums2 is a second-level sums buffer allocated by the caller)
if (num_blocks > block_size) {
scan_large(d_block_sums, d_block_sums2, num_blocks, block_size);
} else {
scan_blocks<<<1, num_blocks/2, smem_size>>>(
d_block_sums, nullptr, num_blocks);
}
// Phase 3: Add scanned block sums to each block
add_block_sums<<<num_blocks, block_size>>>(
d_data, d_block_sums, n);
}
__global__ void add_block_sums(float* data, float* block_sums, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n && blockIdx.x > 0) {
data[idx] += block_sums[blockIdx.x - 1];
}
}
Use warp shuffle instructions for scan within a warp. No shared memory or synchronization needed for 32 elements.
__device__ float warp_scan_inclusive(float val) {
int lane = threadIdx.x & 31;
// Kogge-Stone style parallel scan
for (int offset = 1; offset < 32; offset *= 2) {
float n = __shfl_up_sync(0xffffffff, val, offset);
if (lane >= offset) val += n;
}
return val;
}
__device__ float warp_scan_exclusive(float val) {
float inclusive = warp_scan_inclusive(val);
return inclusive - val; // Convert to exclusive
}
__device__ float block_scan(float val) {
__shared__ float warp_sums[32];
int lane = threadIdx.x & 31;
int wid = threadIdx.x >> 5;
// Scan within warp
float warp_result = warp_scan_inclusive(val);
// Store warp totals
if (lane == 31) warp_sums[wid] = warp_result;
__syncthreads();
// Scan warp sums
if (wid == 0) {
float warp_val = (lane < blockDim.x / 32) ? warp_sums[lane] : 0;
warp_sums[lane] = warp_scan_exclusive(warp_val);
}
__syncthreads();
// Add warp prefix to each element
return warp_result + warp_sums[wid];
}
Single-pass scan using inter-block communication. Blocks publish partial results and look back at predecessors. Used by the CUB library.
// Decoupled look-back scan (simplified)
// Each block has 3 states: INVALID, PARTIAL, COMPLETE
enum ScanState { INVALID = 0, PARTIAL = 1, COMPLETE = 2 };
struct BlockStatus {
int state;
float value;
};
__global__ void scan_lookback(float* data, BlockStatus* status, int n) {
__shared__ float block_sum;
int bid = blockIdx.x;
// Compute inclusive scan within the block (data indexing elided for brevity)
float local_result = block_scan(data[...]);
// Last thread's inclusive result is the block aggregate; it handles the look-back
if (threadIdx.x == blockDim.x - 1) {
// Publish partial aggregate
atomicExch(&status[bid].value, local_result);
__threadfence();
atomicExch(&status[bid].state, PARTIAL);
// Look back at predecessors
float prefix = 0;
for (int i = bid - 1; i >= 0; i--) {
int s;
while ((s = atomicAdd(&status[i].state, 0)) == INVALID);
float v = status[i].value;
prefix += v;
if (s == COMPLETE) break; // Found complete prefix
}
// Update to complete status
atomicExch(&status[bid].value, prefix + local_result);
__threadfence();
atomicExch(&status[bid].state, COMPLETE);
block_sum = prefix;
}
__syncthreads();
// Add prefix to local results
data[...] = local_result + block_sum;
}
Hillis-Steele is simple but performs log N times more work than necessary.
// Hillis-Steele parallel scan - NOT work-efficient
// Does O(N log N) work instead of O(N)
__global__ void hillis_steele_scan(float* data, float* out, int n) {
extern __shared__ float temp[];
int tid = threadIdx.x;
// Load into shared memory
temp[tid] = (tid < n) ? data[tid] : 0;
__syncthreads();
// Hillis-Steele: each step doubles the distance
for (int offset = 1; offset < n; offset *= 2) {
float val = temp[tid];
if (tid >= offset) {
val += temp[tid - offset];
}
__syncthreads();
temp[tid] = val;
__syncthreads();
}
// Write result
if (tid < n) out[tid] = temp[tid];
}
// Problems:
// 1. O(N log N) additions - work inefficient
// 2. Limited to a single block (1024 elements)
// 3. Double-buffering needed (or the extra __syncthreads used above)
Production scan uses warp shuffles, conflict-free shared memory, and hierarchical combination.
#include <cub/cub.cuh>
// Using CUB for production scan (highly optimized)
void optimized_scan(float* d_in, float* d_out, int n) {
// Determine temporary storage size
void* d_temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
// Allocate temporary storage
cudaMalloc(&d_temp, temp_bytes);
// Run exclusive scan
cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
cudaFree(d_temp);
}
// Custom optimized scan for special cases
template<int BLOCK_SIZE>
__global__ void scan_optimized(float* data, float* block_sums, int n) {
__shared__ float smem[32]; // one partial sum per warp (at most 32 warps per block)
int tid = threadIdx.x;
int gid = blockIdx.x * BLOCK_SIZE + tid;
// Load
float val = (gid < n) ? data[gid] : 0;
// Warp-level scan (no sync needed)
val = warp_scan_inclusive(val);
// Store warp results
int lane = tid & 31;
int wid = tid >> 5;
if (lane == 31) smem[wid] = val;
__syncthreads();
// Scan warp totals (first warp only)
if (wid == 0) {
float warp_sum = (lane < BLOCK_SIZE/32) ? smem[lane] : 0;
warp_sum = warp_scan_exclusive(warp_sum);
smem[lane] = warp_sum;
}
__syncthreads();
// Add warp prefix
val += smem[wid];
// Store result and block sum
if (gid < n) data[gid] = val;
if (tid == BLOCK_SIZE - 1 && block_sums)
block_sums[blockIdx.x] = val;
}
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput (256M elements) | 12 GB/s | 380 GB/s (CUB) | 32x higher |
| Work Efficiency | O(N log N) | O(N) | log N factor |
| Bank Conflicts | 32-way conflicts | 0 conflicts | Eliminated |
| Single Block Latency | 45 μs | 8 μs | 5.6x faster |
Inclusive scan: output[i] = sum(input[0..i]). Exclusive scan: output[i] = sum(input[0..i-1]), with output[0] = 0. Exclusive scan is more common as it gives the prefix sum before each element, useful for computing output indices in stream compaction.
Use CUB (cub::DeviceScan) for production code—it implements the fastest known algorithms including decoupled look-back. Write custom kernels only when fusing scan with other operations, using custom operators, or targeting specific memory layouts not supported by CUB.
Stream compaction removes unwanted elements. First, create a predicate array (1 if keep, 0 if discard). Exclusive scan gives the output index for each kept element. Then scatter: if predicate[i]==1, output[scan[i]] = input[i]. Total output size is scan[n-1] + predicate[n-1].
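A minimal sketch of that pipeline, using CUB for the scan step; the kernel and buffer names here are illustrative, not part of any library:

#include <cub/cub.cuh>

// Phase 1: build the predicate array (here: keep strictly positive values)
__global__ void mark_keep(const float* in, int* pred, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) pred[i] = (in[i] > 0.0f) ? 1 : 0;
}
// Phase 3: scatter kept elements to their scanned output positions
__global__ void scatter_keep(const float* in, const int* pred,
                             const int* scan, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && pred[i]) out[scan[i]] = in[i];
}
// Host driver: predicate -> exclusive scan -> scatter; returns the output count
int compact(const float* d_in, float* d_out, int* d_pred, int* d_scan, int n) {
    int threads = 256, blocks = (n + threads - 1) / threads;
    mark_keep<<<blocks, threads>>>(d_in, d_pred, n);
    // Phase 2: exclusive scan of the predicate array gives output indices
    void* d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_pred, d_scan, n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_pred, d_scan, n);
    cudaFree(d_temp);
    scatter_keep<<<blocks, threads>>>(d_in, d_pred, d_scan, d_out, n);
    // Output size = scan[n-1] + predicate[n-1]
    int last_scan = 0, last_pred = 0;
    cudaMemcpy(&last_scan, d_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&last_pred, d_pred + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
    return last_scan + last_pred;
}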
Scan works with any associative operator, not only addition, and the operator need not be commutative. For a matrix-product scan or string concatenation the algorithm is the same, but the operator order must be preserved. CUB supports custom operators via functors.
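As a sketch of a non-sum operator, a running-maximum scan with CUB could look like the following; cub::Max is CUB's built-in max functor, and the surrounding buffer handling is illustrative:

#include <cub/cub.cuh>

// Inclusive scan with a custom associative operator: running maximum
void running_max_scan(const float* d_in, float* d_out, int n) {
    void* d_temp = nullptr;
    size_t temp_bytes = 0;
    // First call only sizes the temporary storage; second call runs the scan
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, d_in, d_out, cub::Max(), n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, d_in, d_out, cub::Max(), n);
    cudaFree(d_temp);
}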
- Scan generalizes reduction
- Scan is used for bin offset computation
- Radix sort uses scan extensively