Histogram computation counts how many elements fall into each bin—fundamental to image processing, data analysis, and machine learning. The challenge on GPUs is handling concurrent updates to the same bin without losing counts or creating serialization bottlenecks. This guide covers three main approaches: atomic operations (simple but can bottleneck), privatization (each thread block has its own histogram), and sorting-based methods (avoid atomics entirely). The best choice depends on the number of bins and data distribution.
Each thread block maintains a private histogram in shared memory, then merges it into the global histogram. This reduces atomic contention dramatically.
#define NUM_BINS 256
#define BLOCK_SIZE 256
__global__ void histogram_privatized(unsigned char* data,
unsigned int* histogram, int n) {
__shared__ unsigned int private_hist[NUM_BINS];
int tid = threadIdx.x;
int gid = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize private histogram
if (tid < NUM_BINS) {
private_hist[tid] = 0;
}
__syncthreads();
// Process elements with grid-stride loop
for (int i = gid; i < n; i += blockDim.x * gridDim.x) {
unsigned char bin = data[i];
atomicAdd(&private_hist[bin], 1); // Shared memory atomic
}
__syncthreads();
// Merge to global histogram
if (tid < NUM_BINS) {
atomicAdd(&histogram[tid], private_hist[tid]);
}
}

Threads in a warp that target the same bin combine their updates into a single atomic operation, reducing atomic traffic by up to 32x.
__device__ void warp_aggregated_atomic(unsigned int* histogram,
unsigned char bin) {
// Find active threads in the warp that have the same bin.
// __match_any_sync requires compute capability 7.0+; use __activemask()
// because some lanes may have already exited the caller's loop.
unsigned int active = __activemask();
unsigned int peers = __match_any_sync(active, bin);
// Elect one thread per unique bin
int leader = __ffs(peers) - 1;
int lane = threadIdx.x & 31;
if (lane == leader) {
// Count threads with same bin
int count = __popc(peers);
atomicAdd(&histogram[bin], count);
}
}
__global__ void histogram_warp_aggregated(unsigned char* data,
unsigned int* histogram, int n) {
__shared__ unsigned int private_hist[256];
int tid = threadIdx.x;
if (tid < 256) private_hist[tid] = 0;
__syncthreads();
for (int i = blockIdx.x * blockDim.x + tid; i < n;
i += blockDim.x * gridDim.x) {
warp_aggregated_atomic(private_hist, data[i]);
}
__syncthreads();
if (tid < 256) atomicAdd(&histogram[tid], private_hist[tid]);
}

Each thread maintains private counters in registers for a subset of bins. This works best for very small histograms (8-32 bins).
// For 8-bin histogram, each thread keeps 8 counters
__global__ void histogram_8bin(unsigned char* data,
unsigned int* histogram, int n) {
unsigned int counts[8] = {0};
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Accumulate in registers (no atomics!)
for (int i = tid; i < n; i += stride) {
unsigned char val = data[i];
unsigned char bin = val >> 5; // 8 bins for byte data
counts[bin]++;
}
// Reduce each bin across the warp with shuffles (assumes blockDim.x is a multiple of 32)
for (int b = 0; b < 8; b++) {
unsigned int sum = counts[b];
for (int offset = 16; offset > 0; offset /= 2) {
sum += __shfl_down_sync(0xffffffff, sum, offset);
}
if ((threadIdx.x & 31) == 0) {
atomicAdd(&histogram[b], sum);
}
}
}

Sort the input by bin, then count run lengths. This avoids atomics entirely and works best for large bin counts where privatization would use too much shared memory.
#include <cub/cub.cuh>
void histogram_via_sort(unsigned int* data, unsigned int* histogram,
int n, int num_bins) {
unsigned int* sorted;
cudaMalloc(&sorted, n * sizeof(unsigned int));
// Sort by bin value
void* temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceRadixSort::SortKeys(temp, temp_bytes, data, sorted, n);
cudaMalloc(&temp, temp_bytes);
cub::DeviceRadixSort::SortKeys(temp, temp_bytes, data, sorted, n);
// Count runs (adjacent equal elements)
unsigned int* unique_out;
unsigned int* counts_out;
unsigned int* num_runs;
cudaMalloc(&unique_out, num_bins * sizeof(unsigned int));
cudaMalloc(&counts_out, num_bins * sizeof(unsigned int));
cudaMalloc(&num_runs, sizeof(unsigned int));
cudaFree(temp); // release the sort's scratch space
temp = nullptr; // a null pointer makes the next call query its size
temp_bytes = 0;
cub::DeviceRunLengthEncode::Encode(temp, temp_bytes,
sorted, unique_out, counts_out, num_runs, n);
cudaMalloc(&temp, temp_bytes);
cub::DeviceRunLengthEncode::Encode(temp, temp_bytes,
sorted, unique_out, counts_out, num_runs, n);
// Scatter counts to histogram bins
// ... kernel to place counts_out[i] at histogram[unique_out[i]]
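// One way to finish this step (a sketch, assuming Thrust is available and the
// output histogram was zeroed with cudaMemset beforehand): copy the run count
// back to the host and let thrust::scatter place each run's count at its bin.
// Requires #include <thrust/scatter.h> and <thrust/execution_policy.h>.
unsigned int h_num_runs = 0;
cudaMemcpy(&h_num_runs, num_runs, sizeof(unsigned int), cudaMemcpyDeviceToHost);
thrust::scatter(thrust::device,
counts_out, counts_out + h_num_runs, // run counts
unique_out, // destination bin for each run
histogram); // histogram[unique_out[i]] = counts_out[i]
// Remember to cudaFree sorted, temp, unique_out, counts_out, and num_runs when done.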
}

Process bins in chunks that fit in shared memory, making multiple passes over the data with each pass handling a subset of the bins.
#define BINS_PER_PASS 256
#define TOTAL_BINS 4096
__global__ void histogram_chunked(const unsigned short* data,
unsigned int* histogram,
int n, int bin_offset) {
__shared__ unsigned int local_hist[BINS_PER_PASS];
int tid = threadIdx.x;
if (tid < BINS_PER_PASS) local_hist[tid] = 0;
__syncthreads();
for (int i = blockIdx.x * blockDim.x + tid; i < n;
i += blockDim.x * gridDim.x) {
int bin = compute_bin(data[i]); // Full bin index in [0, TOTAL_BINS); compute_bin is user-defined (e.g., data[i] >> 4 for 16-bit input)
// Only process bins in current chunk
if (bin >= bin_offset && bin < bin_offset + BINS_PER_PASS) {
atomicAdd(&local_hist[bin - bin_offset], 1);
}
}
__syncthreads();
if (tid < BINS_PER_PASS) {
atomicAdd(&histogram[bin_offset + tid], local_hist[tid]);
}
}
// Host code launches multiple passes
for (int offset = 0; offset < TOTAL_BINS; offset += BINS_PER_PASS) {
histogram_chunked<<<blocks, threads>>>(data, hist, n, offset);
}

Direct global atomics create massive contention on popular bins.
// Naive histogram with global atomics
__global__ void histogram_naive(unsigned char* data,
unsigned int* histogram, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
unsigned char bin = data[idx];
// PROBLEM: Global atomic on every element
// Serializes threads hitting same bin
atomicAdd(&histogram[bin], 1);
}
}
// Problems:
// 1. Global memory atomics are slow (~100 cycles)
// 2. Hot bins cause severe serialization
// 3. For uniform distribution with 256 bins:
// - 32 threads in warp may hit ~32 different bins
// - But real data often has hot spots
// - Image histograms often have peaks at 0 and 255
// 4. Memory traffic: one atomic per element = terrible
// Performance example:
// 100M elements, 256 bins, uniform distribution
// Naive: ~50 ms
// Optimized: ~2 ms

The fully optimized version combines privatization, warp aggregation, vectorized loads, and instruction-level parallelism (ILP) for maximum throughput.
// Optimized histogram combining multiple techniques
template<int NUM_BINS, int BLOCK_SIZE>
__global__ void histogram_optimized(const unsigned char* __restrict__ data,
unsigned int* __restrict__ histogram,
int n) {
// Private histogram in shared memory
__shared__ unsigned int smem[NUM_BINS];
int tid = threadIdx.x;
int lane = tid & 31;
// Initialize shared memory
for (int i = tid; i < NUM_BINS; i += BLOCK_SIZE) {
smem[i] = 0;
}
__syncthreads();
// Process 4 elements per thread per iteration (ILP)
int idx = (blockIdx.x * BLOCK_SIZE + tid) * 4;
int stride = BLOCK_SIZE * gridDim.x * 4;
while (idx + 3 < n) {
// Load 4 bytes at once (assumes data is 4-byte aligned, true for cudaMalloc allocations)
uchar4 quad = *reinterpret_cast<const uchar4*>(data + idx);
// Warp-aggregated updates
#pragma unroll
for (int i = 0; i < 4; i++) {
unsigned char bin = ((unsigned char*)&quad)[i];
unsigned int active = __activemask(); // lanes can diverge once idx nears n
unsigned int peers = __match_any_sync(active, bin);
int leader = __ffs(peers) - 1;
if (lane == leader) {
atomicAdd(&smem[bin], __popc(peers));
}
}
idx += stride;
}
// Handle the final n % 4 elements that the vectorized loop cannot cover
for (int i = (n & ~3) + blockIdx.x * BLOCK_SIZE + tid; i < n;
i += BLOCK_SIZE * gridDim.x) {
atomicAdd(&smem[data[i]], 1);
}
__syncthreads();
// Merge to global with coalesced access
for (int i = tid; i < NUM_BINS; i += BLOCK_SIZE) {
unsigned int count = smem[i];
if (count > 0) {
atomicAdd(&histogram[i], count);
}
}
}

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput (256 bins, 100M elements) | 2.1 GB/s | 320 GB/s | 152x faster |
| Atomic Operations | 100M atomics | ~3M atomics | 33x fewer |
| Hot Bin Scenario (90% to one bin) | 0.3 GB/s | 85 GB/s | 283x faster |
| Large Histogram (64K bins) | 1.8 GB/s | 95 GB/s (sort-based) | 53x faster |
For 256-bin image histograms (8-bit data), use shared memory privatization with warp aggregation. The 256 bins fit easily in shared memory (1 KB of counters), and image data often has hot spots (blacks, whites) where aggregation helps most. This is the standard approach in image processing libraries.
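A minimal host-side sketch (d_data, d_hist, and the 1024-block cap are illustrative choices; it assumes the histogram_privatized kernel and the NUM_BINS/BLOCK_SIZE macros defined earlier):

unsigned int* d_hist;
cudaMalloc(&d_hist, NUM_BINS * sizeof(unsigned int));
cudaMemset(d_hist, 0, NUM_BINS * sizeof(unsigned int));  // bins must start at zero

int threads = BLOCK_SIZE;                                // 256
int blocks  = (n + threads - 1) / threads;
if (blocks > 1024) blocks = 1024;                        // cap the grid; the grid-stride loop covers the rest

histogram_privatized<<<blocks, threads>>>(d_data, d_hist, n);
cudaDeviceSynchronize();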
For floating-point data, compute the bin index as bin = (int)((value - min) / bin_width). For better performance, precompute inv_bin_width = 1.0f / bin_width and use bin = (int)((value - min) * inv_bin_width), since a multiply is much cheaper than a divide. Handle the edge cases: clamp out-of-range values, and decide whether the maximum value goes in the last bin or counts as out of range.
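A minimal device-side sketch of that mapping with clamping (float_to_bin, min_val, inv_bin_width, and num_bins are illustrative names, not used elsewhere in this guide):

__device__ __forceinline__ int float_to_bin(float value, float min_val,
                                            float inv_bin_width, int num_bins) {
    int bin = (int)((value - min_val) * inv_bin_width);
    // Clamp out-of-range values; this also places value == max into the last bin
    if (bin < 0) bin = 0;
    if (bin >= num_bins) bin = num_bins - 1;
    return bin;
}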
Use sorting when: (1) number of bins exceeds shared memory capacity (>~8K bins), (2) data is already sorted or nearly sorted, (3) you need exact counts without atomic races (verification), or (4) bin computation is expensive and benefits from cache locality after sorting.
For 2D histograms, flatten to 1D: bin = binX * numBinsY + binY. For small 2D histograms (e.g., 64x64 = 4096 bins), privatization still works, as sketched below. For larger ones, use the sorting-based approach, or compute the marginals separately if the dimensions can be assumed independent.
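A minimal sketch of the privatized 2D case, assuming two separate unsigned char arrays (one per dimension), each quantized to 64 bins:

#define NUM_BINS_X 64
#define NUM_BINS_Y 64

__global__ void histogram_2d(const unsigned char* dataX,
                             const unsigned char* dataY,
                             unsigned int* histogram, int n) {
    // 4096 shared counters = 16 KB, still within the shared memory budget
    __shared__ unsigned int smem[NUM_BINS_X * NUM_BINS_Y];
    for (int i = threadIdx.x; i < NUM_BINS_X * NUM_BINS_Y; i += blockDim.x)
        smem[i] = 0;
    __syncthreads();

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {
        int binX = dataX[i] >> 2;   // 256 values -> 64 bins
        int binY = dataY[i] >> 2;
        atomicAdd(&smem[binX * NUM_BINS_Y + binY], 1);   // flattened index
    }
    __syncthreads();

    for (int i = threadIdx.x; i < NUM_BINS_X * NUM_BINS_Y; i += blockDim.x)
        if (smem[i] > 0) atomicAdd(&histogram[i], smem[i]);
}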
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.