Parallel Prefix Sum (Scan) in CUDA
Cumulative sum (cumsum/scan) computes running totals. Despite seeming sequential, efficient parallel algorithms achieve O(n) work with O(log n) depth. Essential for stream compaction, radix sort, and many parallel algorithms.
Work-efficient O(n) algorithm with two phases.
// Work-efficient Blelloch exclusive scan of x[0..n-1], in place.
// Preconditions: single-block launch, blockDim.x >= n, n a power of two,
// and n * sizeof(float) bytes of dynamic shared memory passed at launch.
// O(n) total work, O(log n) parallel depth.
__global__ void blelloch_scan(float* x, int n) {
extern __shared__ float temp[];
int tid = threadIdx.x;
// Guard: block may be launched with more threads than elements.
if (tid < n) temp[tid] = x[tid];
__syncthreads();
// Up-sweep (reduce): build partial sums up a balanced binary tree.
for (int d = 1; d < n; d *= 2) {
int i = (tid + 1) * 2 * d - 1;
if (i < n) temp[i] += temp[i - d];
__syncthreads(); // outside the branch: every thread must reach it
}
// Clear last element (holds the grand total) to seed the exclusive scan.
if (tid == n - 1) temp[tid] = 0.0f;
__syncthreads();
// Down-sweep: at each node, pass the left child's value down and
// accumulate it into the right child.
for (int d = n / 2; d >= 1; d /= 2) {
int i = (tid + 1) * 2 * d - 1;
if (i < n) {
float t = temp[i - d];
temp[i - d] = temp[i];
temp[i] += t;
}
__syncthreads();
}
// Guarded store: same reason as the load above.
if (tid < n) x[tid] = temp[tid];
}
O(n) sequential, single thread.
// Baseline: a single thread computes the inclusive scan sequentially.
// Kept only for comparison -- O(n) depth, no parallelism.
__global__ void cumsum_naive(float* x, float* y, int n) {
// Restrict to one thread of one block: without the blockIdx guard,
// every launched block would redundantly (and racily) write y.
// The n > 0 check prevents an out-of-bounds access on empty input.
if (blockIdx.x == 0 && threadIdx.x == 0 && n > 0) {
y[0] = x[0];
for (int i = 1; i < n; i++)
y[i] = y[i-1] + x[i]; // Sequential!
}
}
CUB provides optimized multi-block scan.
#include <cstdio>

#include <cub/cub.cuh>
// Inclusive scan of device array x into device array y using CUB's
// two-pass API: the first call (null temp pointer) queries the required
// temp-storage size, the second performs the scan.
void cumsum_opt(float* x, float* y, int n) {
size_t temp_bytes = 0;
// Size query only -- no work is done when the temp pointer is null.
cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, x, y, n);
void* d_temp = nullptr;
cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
if (err != cudaSuccess) {
// Bail out instead of handing CUB an invalid temp buffer.
fprintf(stderr, "cumsum_opt: cudaMalloc failed: %s\n",
cudaGetErrorString(err));
return;
}
cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, x, y, n);
cudaFree(d_temp);
}
// Or exclusive scan (starts with 0)
cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, x, y, n);

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput | 0.5 GB/s | 350 GB/s | 700x faster |
Inclusive: output[i] = sum(input[0..i]). Exclusive: output[i] = sum(input[0..i-1]). Exclusive starts with identity (0 for sum).
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.