Stencil operations update each array element based on a fixed pattern of neighboring elements. They appear in finite difference methods (PDEs), image processing (blur, edge detection), cellular automata, and physics simulations. The challenge is managing the "halo" or "ghost" regions—neighbor data that must be loaded but doesn't produce output. Efficient stencil computation requires careful tiling to maximize data reuse. Each input element may be needed by multiple output elements, and temporal blocking can amortize halo overhead across multiple time steps. This guide covers the essential patterns for high-performance stencil codes.
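For reference, here is a scalar CPU version of the 5-point Laplacian stencil that the CUDA examples below optimize (a minimal sketch; boundary points are simply skipped, as in the naive GPU kernel later in this guide):

// Reference CPU implementation of the 5-point (2D Laplacian) stencil.
void stencil_2d_cpu(float* out, const float* in, int nx, int ny) {
    for (int y = 1; y < ny - 1; y++) {
        for (int x = 1; x < nx - 1; x++) {
            out[y * nx + x] = -4.0f * in[y * nx + x]
                            + in[(y - 1) * nx + x]
                            + in[(y + 1) * nx + x]
                            + in[y * nx + (x - 1)]
                            + in[y * nx + (x + 1)];
        }
    }
}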
Load a tile plus its halo into shared memory. Interior threads produce output, halo threads only load data. This enables massive neighbor reuse.
#define TILE_X 32
#define TILE_Y 16
#define RADIUS 1 // Stencil radius
#define BLOCK_X (TILE_X + 2*RADIUS)
#define BLOCK_Y (TILE_Y + 2*RADIUS)
__global__ void stencil_2d_tiled(float* out, const float* in,
int nx, int ny) {
__shared__ float smem[BLOCK_Y][BLOCK_X];
int tx = threadIdx.x;
int ty = threadIdx.y;
int gx = blockIdx.x * TILE_X + tx - RADIUS;
int gy = blockIdx.y * TILE_Y + ty - RADIUS;
// Load tile including halo
if (gx >= 0 && gx < nx && gy >= 0 && gy < ny) {
smem[ty][tx] = in[gy * nx + gx];
} else {
smem[ty][tx] = 0; // Boundary condition
}
__syncthreads();
// Only interior threads compute output
if (tx >= RADIUS && tx < TILE_X + RADIUS &&
ty >= RADIUS && ty < TILE_Y + RADIUS) {
int out_x = blockIdx.x * TILE_X + (tx - RADIUS);
int out_y = blockIdx.y * TILE_Y + (ty - RADIUS);
if (out_x < nx && out_y < ny) {
// 5-point stencil (2D Laplacian)
float result = -4.0f * smem[ty][tx]
+ smem[ty-1][tx]
+ smem[ty+1][tx]
+ smem[ty][tx-1]
+ smem[ty][tx+1];
out[out_y * nx + out_x] = result;
}
}
}

All threads participate in loading the tile, not just those producing output. Ensures coalesced loads and balanced work.
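The collaborative-load kernel below (and the temporal-blocking kernel later on) calls a stencil_op() helper that the listings leave undefined. Here is a minimal sketch for the 5-point Laplacian, assuming the BLOCK_Y x BLOCK_X tile layout defined above; the temporal kernel would need a variant (or a template) sized for its larger tile.

// Hypothetical helper: 5-point Laplacian read from a shared-memory tile.
// (lx, ly) must be an interior tile element, at least RADIUS away from the
// tile edge, so that all four neighbors are resident in the tile.
__device__ __forceinline__
float stencil_op(float smem[BLOCK_Y][BLOCK_X], int lx, int ly) {
    return -4.0f * smem[ly][lx]
         + smem[ly - 1][lx]
         + smem[ly + 1][lx]
         + smem[ly][lx - 1]
         + smem[ly][lx + 1];
}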
// Better loading pattern for arbitrary halo sizes
__global__ void stencil_collaborative_load(float* out, const float* in,
int nx, int ny) {
__shared__ float smem[BLOCK_Y][BLOCK_X];
int tx = threadIdx.x;
int ty = threadIdx.y;
int tid = ty * blockDim.x + tx;
int threads = blockDim.x * blockDim.y;
// Base position of tile
int tile_x = blockIdx.x * TILE_X - RADIUS;
int tile_y = blockIdx.y * TILE_Y - RADIUS;
// Collaborative load: each thread loads multiple elements
for (int i = tid; i < BLOCK_X * BLOCK_Y; i += threads) {
int lx = i % BLOCK_X;
int ly = i / BLOCK_X;
int gx = tile_x + lx;
int gy = tile_y + ly;
// Clamp to boundaries
gx = max(0, min(gx, nx - 1));
gy = max(0, min(gy, ny - 1));
smem[ly][lx] = in[gy * nx + gx];
}
__syncthreads();
// Compute output
int out_x = blockIdx.x * TILE_X + tx;
int out_y = blockIdx.y * TILE_Y + ty;
if (out_x < nx && out_y < ny) {
int lx = tx + RADIUS;
int ly = ty + RADIUS;
float result = stencil_op(smem, lx, ly);
out[out_y * nx + out_x] = result;
}
}

For 3D stencils, process one Z-plane at a time, keeping multiple planes in shared memory/registers. Trades parallelism for memory efficiency.
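The kernel below relies on an idx() linear-index helper and a load_plane_with_halo() routine that are not shown in the original listing. A minimal sketch of both, assuming a row-major, x-fastest layout, radius 1, a blockDim of (TILE_X, TILE_Y), and clamp-to-edge boundaries:

// Hypothetical linear-index helper; written as a macro so it can pick up
// nx and ny from the enclosing kernel or function scope.
#define idx(x, y, z) (((size_t)(z) * ny + (y)) * nx + (x))

// Hypothetical cooperative load of one Z-plane, including a 1-element halo.
__device__ void load_plane_with_halo(float curr[TILE_Y + 2][TILE_X + 2],
                                     const float* in,
                                     int gx, int gy, int z, int nx, int ny) {
    // Recover the tile origin (halo included) from this thread's coordinates
    int ox = gx - (int)threadIdx.x - 1;
    int oy = gy - (int)threadIdx.y - 1;
    int tid = threadIdx.y * blockDim.x + threadIdx.x;
    int nthreads = blockDim.x * blockDim.y;
    // All threads cooperatively fill the (TILE_Y+2) x (TILE_X+2) tile,
    // clamping out-of-domain reads to the nearest edge element
    for (int i = tid; i < (TILE_Y + 2) * (TILE_X + 2); i += nthreads) {
        int lx = i % (TILE_X + 2);
        int ly = i / (TILE_X + 2);
        int x = max(0, min(ox + lx, nx - 1));
        int y = max(0, min(oy + ly, ny - 1));
        curr[ly][lx] = in[idx(x, y, z)];
    }
}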
// 3D 7-point stencil with 2.5D blocking
__global__ void stencil_3d_25d(float* out, const float* in,
int nx, int ny, int nz) {
__shared__ float curr[TILE_Y + 2][TILE_X + 2];
int tx = threadIdx.x + 1;
int ty = threadIdx.y + 1;
int gx = blockIdx.x * TILE_X + threadIdx.x;
int gy = blockIdx.y * TILE_Y + threadIdx.y;
// Registers for Z neighbors
float below, center, above;
// Initialize: load z=0 plane
below = 0.0f; // z = -1 boundary
center = in[idx(min(gx, nx - 1), min(gy, ny - 1), 0)]; // clamp so blocks overhanging the domain edge stay in bounds
for (int z = 0; z < nz; z++) {
// Load next plane
above = (z + 1 < nz) ? in[idx(min(gx, nx - 1), min(gy, ny - 1), z + 1)] : 0.0f;
// Load current plane into shared memory (with halo)
load_plane_with_halo(curr, in, gx, gy, z, nx, ny);
__syncthreads();
// Compute 7-point stencil
if (gx < nx && gy < ny) {
float result = -6.0f * curr[ty][tx]
+ curr[ty-1][tx]
+ curr[ty+1][tx]
+ curr[ty][tx-1]
+ curr[ty][tx+1]
+ below
+ above;
out[idx(gx, gy, z)] = result;
}
__syncthreads();
// Shift planes
below = center;
center = above;
}
}

Compute multiple time steps per kernel launch. Increases arithmetic intensity but requires larger halos. Best for iterative solvers.
#define TIME_STEPS 4
#define EFFECTIVE_HALO (RADIUS * TIME_STEPS)
__global__ void stencil_temporal(float* out, float* in,
int nx, int ny, int total_steps) {
// Larger shared memory for temporal tile
__shared__ float smem[2][TILE_Y + 2*EFFECTIVE_HALO]
[TILE_X + 2*EFFECTIVE_HALO];
int current = 0;
// Load initial tile with enlarged halo
load_temporal_tile(smem[current], in, ...);
__syncthreads();
// Multiple time steps in shared memory
for (int t = 0; t < TIME_STEPS; t++) {
int next = 1 - current;
int halo_shrink = RADIUS * t;
// Compute with shrinking valid region
int valid_x = TILE_X + 2*(EFFECTIVE_HALO - halo_shrink - RADIUS);
int valid_y = TILE_Y + 2*(EFFECTIVE_HALO - halo_shrink - RADIUS);
if (threadIdx.x < valid_x && threadIdx.y < valid_y) {
int lx = threadIdx.x + halo_shrink + RADIUS;
int ly = threadIdx.y + halo_shrink + RADIUS;
float result = stencil_op(smem[current], lx, ly);
smem[next][ly][lx] = result;
}
__syncthreads();
current = next;
}
// Write final results
write_results(out, smem[current], ...);
}

For 1D stencils, keep a sliding window in registers. Eliminates shared memory overhead for simple patterns.
// 1D stencil with register window (radius 2)
__global__ void stencil_1d_registers(float* out, const float* in, int n) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < n; i += stride) {
// Load window into registers
float v[5]; // radius 2 = 5 elements
#pragma unroll
for (int j = -2; j <= 2; j++) {
int idx = i + j;
// Clamp to boundaries
idx = max(0, min(idx, n - 1));
v[j + 2] = in[idx];
}
// Compute stencil
float result = -2.0f * v[2]
+ v[0] + v[1] + v[3] + v[4];
out[i] = result;
}
}
// With warp-level data sharing for better efficiency
__global__ void stencil_1d_warp_shared(float* out, const float* in, int n) {
// Assumes blockDim.x is a multiple of 32 so each warp maps to 32 consecutive elements
int lane = threadIdx.x & 31;
int warp_start = (blockIdx.x * blockDim.x + threadIdx.x) & ~31;
int gid = warp_start + lane;   // == blockIdx.x * blockDim.x + threadIdx.x
// Each thread loads one element (index clamped so every lane can take part in the shuffles)
float my_val = in[min(gid, n - 1)];
// Share with neighbors using shuffles
float left2 = __shfl_up_sync(0xffffffff, my_val, 2);
float left1 = __shfl_up_sync(0xffffffff, my_val, 1);
float right1 = __shfl_down_sync(0xffffffff, my_val, 1);
float right2 = __shfl_down_sync(0xffffffff, my_val, 2);
// Handle warp boundaries
if (lane < 2) {
left2 = in[max(0, warp_start + lane - 2)];
left1 = in[max(0, warp_start + lane - 1)];
}
if (lane >= 30) {
right1 = in[min(n-1, warp_start + lane + 1)];
right2 = in[min(n-1, warp_start + lane + 2)];
}
float result = -2.0f * my_val + left2 + left1 + right1 + right2;
if (gid < n) out[gid] = result; // Guard the store at the tail of the array
}

A naive stencil loads each neighbor element multiple times across adjacent outputs.
// Naive 2D stencil - no data reuse
__global__ void stencil_naive(float* out, const float* in,
int nx, int ny) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x > 0 && x < nx - 1 && y > 0 && y < ny - 1) {
// PROBLEM: Each neighbor loaded from global memory
// No reuse despite neighbors being shared between outputs
float center = in[y * nx + x];
float north = in[(y - 1) * nx + x]; // Also needed by (x, y-1)
float south = in[(y + 1) * nx + x]; // Also needed by (x, y+1)
float west = in[y * nx + (x - 1)]; // Also needed by (x-1, y)
float east = in[y * nx + (x + 1)]; // Also needed by (x+1, y)
out[y * nx + x] = -4.0f * center + north + south + west + east;
}
}
// Problems:
// 1. 5 global memory loads per output (4 could be shared)
// 2. Up to ~80% of the loads re-fetch values that neighboring outputs also need
//    (caches recover some of this, but not reliably)
// 3. Arithmetic intensity: ~5 flops per 20 bytes loaded = 0.25 flops/byte,
//    far below what modern GPUs need to be compute-bound
// 4. Branch divergence at boundaries

The tiled stencil loads each element into shared memory once, achieving a 4-5x speedup over the naive version.
#define TILE_X 64
#define TILE_Y 8
#define RADIUS 1
__global__ void stencil_optimized(float* __restrict__ out,
const float* __restrict__ in,
int nx, int ny) {
// Shared memory with halo
__shared__ float smem[TILE_Y + 2*RADIUS][TILE_X + 2*RADIUS];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x * TILE_X;
int by = blockIdx.y * TILE_Y;
// Global coordinates for loading (including halo)
int gx = bx + tx - RADIUS;
int gy = by + ty - RADIUS;
// Load center region (each thread loads one element)
if (tx < TILE_X + 2*RADIUS && ty < TILE_Y + 2*RADIUS) {
int clamped_x = max(0, min(gx, nx - 1));
int clamped_y = max(0, min(gy, ny - 1));
smem[ty][tx] = in[clamped_y * nx + clamped_x];
}
__syncthreads();
// Output coordinates
int out_x = bx + tx;
int out_y = by + ty;
// Only threads in valid output region compute
if (tx < TILE_X && ty < TILE_Y &&
out_x >= RADIUS && out_x < nx - RADIUS &&
out_y >= RADIUS && out_y < ny - RADIUS) {
// Shared memory coordinates
int sx = tx + RADIUS;
int sy = ty + RADIUS;
// 5-point stencil from shared memory
float result = -4.0f * smem[sy][sx]
+ smem[sy - 1][sx]
+ smem[sy + 1][sx]
+ smem[sy][sx - 1]
+ smem[sy][sx + 1];
out[out_y * nx + out_x] = result;
}
}
// Launch configuration for good occupancy
void launch_stencil(float* d_out, float* d_in, int nx, int ny) {
dim3 block(TILE_X + 2*RADIUS, TILE_Y + 2*RADIUS);
dim3 grid((nx + TILE_X - 1) / TILE_X, (ny + TILE_Y - 1) / TILE_Y);
stencil_optimized<<<grid, block>>>(d_out, d_in, nx, ny);
}

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| 2D Laplacian (4096x4096) | 12 ms | 2.5 ms | 4.8x faster |
| Global memory loads per output | 5 | 1.1 | 4.5x fewer |
| 3D 7-point (256³) | 45 ms | 8 ms (2.5D) | 5.6x faster |
| Bandwidth Utilization | 85 GB/s | 380 GB/s | 4.5x higher |
Tile size depends on stencil radius, shared memory capacity, and occupancy. For radius-1 2D stencils, 32x16 or 64x8 tiles often work well. The tile should be large enough that halo overhead is small (<20%), but small enough to fit in shared memory with good occupancy. Profile with Nsight Compute to find the optimum.
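Before profiling, the halo overhead and shared-memory footprint of a candidate tile can be estimated on the host. A minimal sketch (hypothetical helper; the <20% threshold above is a rule of thumb, not a hard limit):

#include <stdio.h>

// Report halo overhead and shared-memory footprint for a candidate 2D tile.
// 'buffers' is how many tiles live in shared memory at once (e.g. 2 for the
// double-buffered temporal-blocking kernel).
void report_tile(int tile_x, int tile_y, int radius, int buffers) {
    long long useful = (long long)tile_x * tile_y;
    long long loaded = (long long)(tile_x + 2 * radius) * (tile_y + 2 * radius);
    double halo_frac = 1.0 - (double)useful / (double)loaded;
    long long smem = loaded * buffers * (long long)sizeof(float);
    printf("%dx%d (r=%d): halo overhead %.1f%%, smem %lld bytes\n",
           tile_x, tile_y, radius, 100.0 * halo_frac, smem);
}

int main(void) {
    report_tile(32, 16, 1, 1);  // ~16% halo overhead, 2448 bytes
    report_tile(64,  8, 1, 1);  // ~22% halo overhead, 2640 bytes
    report_tile(16, 16, 1, 1);  // ~21% halo overhead, 1296 bytes
    return 0;
}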
Common approaches: (1) Clamp to edge: use nearest valid value. (2) Zero/constant: use fixed value outside domain. (3) Periodic: wrap around (modulo indexing). (4) Reflective: mirror values at boundary. Implement in the loading phase by adjusting indices or using conditional assignments.
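These policies reduce to a small index-mapping step before the global load. A minimal sketch for one dimension (hypothetical helpers; the zero/constant case is handled at the load itself, as in the first tiled kernel above):

// Map a possibly out-of-range coordinate i into [0, n) under each policy.
__device__ __forceinline__ int clamp_index(int i, int n) {
    return max(0, min(i, n - 1));           // clamp to edge
}
__device__ __forceinline__ int periodic_index(int i, int n) {
    return ((i % n) + n) % n;               // wrap around (handles negatives)
}
__device__ __forceinline__ int reflect_index(int i, int n) {
    if (i < 0)  i = -i - 1;                 // mirror below 0
    if (i >= n) i = 2 * n - 1 - i;          // mirror above n-1
    return i;                               // valid for overshoots up to n
}

// Example use in a loading loop, here with periodic boundaries:
//   smem[ly][lx] = in[periodic_index(gy, ny) * nx + periodic_index(gx, nx)];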
Use temporal blocking when: (1) running many iterations (iterative solvers), (2) arithmetic intensity is low (simple stencils), (3) memory bandwidth is the bottleneck. Temporal blocking increases arithmetic intensity but requires larger halos (radius * timesteps). The break-even is typically 2-4 time steps per tile.
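The trade-off can be estimated before writing any kernel code: fusing T steps of a radius-R stencil over a TILE x TILE tile loads a (TILE + 2RT)^2 footprint once but produces T sweeps of TILE^2 outputs, so ideal global traffic per output falls roughly as 1/T while redundant halo work grows with T. A minimal sketch of that estimate (illustrative only; it ignores stores, caching, and the extra shared-memory pressure):

#include <stdio.h>

// Ideal global loads per output element when 'steps' time steps of a
// radius-'radius' 2D stencil are fused over one tile.
double loads_per_output(int tile, int radius, int steps) {
    double side = tile + 2.0 * radius * steps;    // footprint edge length
    return (side * side) / ((double)tile * tile * steps);
}

int main(void) {
    for (int t = 1; t <= 6; t++)
        printf("T=%d: %.2f loads/output\n", t, loads_per_output(32, 1, t));
    // T=1: 1.13, T=2: 0.63, T=4: 0.39 -- diminishing returns as the
    // enlarged halo starts to dominate the footprint.
    return 0;
}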
Partition the domain across GPUs with halo exchange regions. For each time step: (1) compute the interior asynchronously (it does not depend on the incoming halos), (2) exchange halos using cudaMemcpyPeerAsync or CUDA-aware MPI, (3) compute the boundary regions once the halos have arrived. Overlap computation and communication using CUDA streams for best performance.
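A minimal two-GPU sketch of this pattern, assuming a 1D split along y, peer access already enabled between the devices, per-GPU compute and comm streams, and a stencil_kernel that updates a given row range of a local slab stored with one halo row above and below (all names here are illustrative, not from the original; the physical boundaries at the very bottom and top of the domain are assumed to be handled separately, and error checking is omitted):

// Assumed kernel: updates rows [y_begin, y_end) of a local slab of width nx
// and height ny_local (owned rows plus two halo rows).
__global__ void stencil_kernel(float* out, const float* in,
                               int nx, int ny_local, int y_begin, int y_end);

// One time step on two GPUs. GPU g owns 'half' rows; local row 0 is the halo
// from below and local row half+1 is the halo from above.
void step_two_gpus(float* d_out[2], float* d_in[2], int nx, int half,
                   cudaStream_t compute[2], cudaStream_t comm[2]) {
    dim3 block(32, 8);
    dim3 grid((nx + 31) / 32, (half + 7) / 8);
    size_t row_bytes = (size_t)nx * sizeof(float);

    // 1. Interior rows (2 .. half-1) do not touch the halos: start them first.
    for (int g = 0; g < 2; g++) {
        cudaSetDevice(g);
        stencil_kernel<<<grid, block, 0, compute[g]>>>(
            d_out[g], d_in[g], nx, half + 2, 2, half);
    }

    // 2. Exchange halo rows while the interior kernels run.
    cudaSetDevice(1);   // GPU1 receives GPU0's top owned row into its bottom halo
    cudaMemcpyPeerAsync(d_in[1], 1,
                        d_in[0] + (size_t)half * nx, 0, row_bytes, comm[1]);
    cudaSetDevice(0);   // GPU0 receives GPU1's bottom owned row into its top halo
    cudaMemcpyPeerAsync(d_in[0] + (size_t)(half + 1) * nx, 0,
                        d_in[1] + (size_t)nx, 1, row_bytes, comm[0]);

    // 3. Boundary rows (1 and half) run once their halo has arrived.
    for (int g = 0; g < 2; g++) {
        cudaSetDevice(g);
        cudaStreamSynchronize(comm[g]);
        stencil_kernel<<<grid, block, 0, compute[g]>>>(
            d_out[g], d_in[g], nx, half + 2, 1, 2);
        stencil_kernel<<<grid, block, 0, compute[g]>>>(
            d_out[g], d_in[g], nx, half + 2, half, half + 1);
        cudaStreamSynchronize(compute[g]);
    }
}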
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.