Asynchronous copy decouples data movement from computation, enabling overlap and latency hiding. Ampere (compute capability 8.0) and later GPUs can copy from global to shared memory asynchronously in hardware (`cp.async`), bypassing the register file and proceeding independently of the SM's compute pipelines. Used well, async copy can speed up memory-bound kernels by roughly 30-50%. This guide covers `memcpy_async`, pipeline stages, and software pipelining patterns.
What the async-copy machinery gives you:

- Multi-stage pipelines with N-buffering (double buffering and beyond).
- Non-blocking copies from global to shared memory via `cuda::memcpy_async`.
- Fine-grained synchronization between pipeline stages (producer/consumer commit and wait).
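The hardware path requires compute capability 8.0 (Ampere) or newer; on older GPUs `cuda::memcpy_async` still compiles but degrades to a plain copy performed by the calling threads, so the overlap benefit largely disappears. If that matters for your deployment, a small runtime check can gate the async path. A minimal sketch (`has_async_copy_hw` is just an illustrative helper name):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Returns true if the device has the Ampere-or-newer hardware path for
// asynchronous global-to-shared copies (cp.async).
bool has_async_copy_hw(int device = 0) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);
    return prop.major >= 8;  // sm_80 (A100) and later
}

int main() {
    std::printf("async copy hardware: %s\n", has_async_copy_hw() ? "yes" : "no");
    return 0;
}
```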
The baseline to beat is a synchronous copy, where threads wait for the copy into shared memory to complete before computing:

```cpp
// Synchronous: compute waits for copy
__global__ void kernel_sync(float* global_in, float* global_out, int n) {
    __shared__ float smem[256];
    int tid = threadIdx.x;
    // Synchronous copy - threads wait here
    smem[tid] = global_in[blockIdx.x * 256 + tid];
    __syncthreads();
    // Compute
    float val = smem[tid] * 2.0f;
    __syncthreads();
    // Store
    global_out[blockIdx.x * 256 + tid] = val;
}
```

Async copy lets the transfer of the next tile overlap with compute on the current one. The double-buffered pipeline below uses `cuda::memcpy_async` with a block-scoped `cuda::pipeline` to prefetch tile `t+1` while computing on tile `t`:

```cpp
#include <cuda/pipeline>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void kernel_async_pipeline(float* global_in, float* global_out,
                                      int n, int tiles) {
    __shared__ float smem[2][256];  // Double buffer: one tile in flight, one in use
    auto block = cg::this_thread_block();
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> pipe_state;
    auto pipe = cuda::make_pipeline(block, &pipe_state);

    // Each block owns `tiles` consecutive 256-element tiles (n itself is unused here)
    const size_t block_base = (size_t)blockIdx.x * tiles;

    // Prologue: prefetch the first tile
    int tile = 0;
    pipe.producer_acquire();
    cuda::memcpy_async(block, smem[0],
                       global_in + (block_base + tile) * 256,
                       sizeof(float) * 256, pipe);
    pipe.producer_commit();

    for (tile = 1; tile <= tiles; tile++) {
        // Stage 1: start the async copy for the next tile (if any)
        if (tile < tiles) {
            pipe.producer_acquire();
            cuda::memcpy_async(block, smem[tile % 2],
                               global_in + (block_base + tile) * 256,
                               sizeof(float) * 256, pipe);
            pipe.producer_commit();
        }
        // Stage 2: wait for the previous tile's data to arrive
        pipe.consumer_wait();
        block.sync();
        // Stage 3: compute on the tile that is now resident
        int buf = (tile - 1) % 2;
        float val = smem[buf][threadIdx.x] * 2.0f;
        // Stage 4: store the result and release the buffer for reuse
        global_out[(block_base + tile - 1) * 256 + threadIdx.x] = val;
        pipe.consumer_release();
    }
}
```
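A host-side launch for the pipeline kernel might look like the following minimal sketch (the `TILES_PER_BLOCK` constant and problem size are illustrative, and error checking is omitted). Compile with `nvcc -arch=sm_80` or newer so the copies take the hardware path:

```cpp
#include <cuda_runtime.h>

int main() {
    const int TILES_PER_BLOCK = 8;                   // tiles walked by each block
    const int n = 1 << 20;                           // 1M floats, a multiple of 256 * TILES_PER_BLOCK
    const int blocks = n / (256 * TILES_PER_BLOCK);  // one 256-thread block per tile range

    float *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));

    kernel_async_pipeline<<<blocks, 256>>>(d_in, d_out, n, TILES_PER_BLOCK);
    cudaDeviceSynchronize();

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```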
A simpler single-stage version uses the low-level `__pipeline_*` primitives from `<cuda_pipeline.h>`; here each thread issues and waits on its own copy:

```cpp
#include <cuda_pipeline.h>

__global__ void simple_async(float* in, float* out, int n) {
    __shared__ float smem[256];
    // Async copy - returns immediately; the 4-byte copy maps to cp.async on sm_80+
    __pipeline_memcpy_async(&smem[threadIdx.x],
                            &in[blockIdx.x * 256 + threadIdx.x],
                            sizeof(float));
    __pipeline_commit();        // close this batch of async copies
    // Do other work while the copy is in flight...
    // Wait until all committed copies have completed
    __pipeline_wait_prior(0);
    __syncthreads();
    // Now safe to use smem
    out[blockIdx.x * 256 + threadIdx.x] = smem[threadIdx.x] * 2.0f;
}
```

| Metric | Naive | With async copy | Why |
|---|---|---|---|
| Memory-bound kernel speedup | 1x | 1.3-1.5x | Latency hiding |
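Numbers like these are straightforward to check with CUDA events. The sketch below times `kernel_sync` against `simple_async` back to back (it assumes both kernels live in the same `.cu` file; the problem size and iteration count are illustrative and error checking is omitted). The toy multiply-by-two kernels above are too small to show the full benefit, but the same harness applies to real kernels:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Times `iters` launches of a (float*, float*, int) kernel and returns milliseconds.
float time_kernel(void (*k)(float*, float*, int),
                  float* d_in, float* d_out, int n, int iters = 100) {
    const int blocks = n / 256;
    k<<<blocks, 256>>>(d_in, d_out, n);  // warm-up launch
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int i = 0; i < iters; ++i)
        k<<<blocks, 256>>>(d_in, d_out, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

int main() {
    const int n = 1 << 24;  // 16M floats
    float *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));

    float t_sync  = time_kernel(kernel_sync,  d_in, d_out, n);
    float t_async = time_kernel(simple_async, d_in, d_out, n);
    std::printf("sync: %.2f ms  async: %.2f ms  ratio: %.2fx\n",
                t_sync, t_async, t_sync / t_async);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```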
How many pipeline stages? Typically 2-4. More stages hide more latency but consume more shared memory, which can lower occupancy. Profile to find the sweet spot for your kernel.
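To show how the stage count generalizes, here is a minimal sketch of an N-stage variant of the pipeline kernel above (the name `kernel_async_nstage`, the `STAGES` template parameter, and the 256-wide tile layout are illustrative assumptions, not a library API). `STAGES = 2` reproduces the double-buffered kernel:

```cpp
#include <cuda/pipeline>
#include <cooperative_groups.h>

// N-stage software pipeline: up to STAGES tile copies are kept in flight
// ahead of the compute position.
template <int STAGES>
__global__ void kernel_async_nstage(const float* global_in, float* global_out,
                                    int tiles) {
    __shared__ float smem[STAGES][256];
    auto block = cooperative_groups::this_thread_block();
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, STAGES> state;
    auto pipe = cuda::make_pipeline(block, &state);

    const size_t base = (size_t)blockIdx.x * tiles;  // this block's first tile

    int fetch = 0;  // next tile to start copying
    for (int compute = 0; compute < tiles; ++compute) {
        // Fill the pipeline: keep up to STAGES copies in flight.
        for (; fetch < tiles && fetch < compute + STAGES; ++fetch) {
            pipe.producer_acquire();
            cuda::memcpy_async(block, smem[fetch % STAGES],
                               global_in + (base + fetch) * 256,
                               sizeof(float) * 256, pipe);
            pipe.producer_commit();
        }
        // Drain the oldest stage, compute on it, then free its buffer.
        pipe.consumer_wait();
        block.sync();
        float val = smem[compute % STAGES][threadIdx.x] * 2.0f;
        global_out[(base + compute) * 256 + threadIdx.x] = val;
        pipe.consumer_release();
    }
}

// Example launch: kernel_async_nstage<4><<<blocks, 256>>>(d_in, d_out, tiles);
```

With 256 floats per tile, each extra stage costs 1 KB of shared memory per block; kernels with larger tiles hit the shared-memory/occupancy tradeoff much sooner, which is why 2-4 stages is usually enough.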
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.