Asynchronous memory transfers are essential for achieving maximum GPU utilization. While cudaMemcpy blocks CPU execution until the transfer completes, cudaMemcpyAsync returns immediately, allowing transfers and kernels to execute concurrently. This overlap can hide most memory transfer latency, and in transfer-bound workloads it commonly improves end-to-end throughput by 2-3x.

The key to effective async transfers is understanding CUDA streams and pinned (page-locked) memory. Regular pageable host memory cannot be accessed by the GPU's DMA engines, so the driver must first copy it into an internal pinned staging buffer. Allocating pinned memory directly eliminates this extra copy and enables truly asynchronous behavior.

Modern GPUs have copy engines separate from the compute units, allowing simultaneous bidirectional transfers (H2D and D2H) while kernels execute. Properly orchestrated multi-stream patterns keep all of these engines busy, maximizing both memory bandwidth and compute throughput.
Use cudaHostAlloc or cudaMallocHost to allocate page-locked memory that can be accessed directly by GPU DMA engines. This eliminates staging copies and enables true async transfers.
// BAD: pageable memory (forces a staging copy)
float* h_pageable = (float*)malloc(size);
cudaMemcpyAsync(d_data, h_pageable, size, cudaMemcpyHostToDevice, stream);
// Effectively synchronous: the host must first copy into a driver staging buffer

// GOOD: pinned memory (true async)
float* h_data;
cudaHostAlloc(&h_data, size, cudaHostAllocDefault);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);
// Returns immediately; the DMA engine proceeds in the background

// Write-combined memory: faster H2D transfers, but slow for CPU reads
cudaHostAlloc(&h_data, size, cudaHostAllocWriteCombined);

// Portable allocation: pinned for all CUDA contexts, not just the current device
cudaHostAlloc(&h_data, size, cudaHostAllocPortable);

Use multiple CUDA streams to overlap H2D transfers, kernel execution, and D2H transfers. This keeps all GPU engines (copy and compute) busy simultaneously.
#define NUM_STREAMS 4

cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) {
    cudaStreamCreate(&streams[i]);
}

// Process the data in chunks with overlapped execution
int chunk_n = total_n / NUM_STREAMS;            // elements per chunk
size_t chunk_bytes = chunk_n * sizeof(float);   // bytes per chunk
for (int i = 0; i < NUM_STREAMS; i++) {
    int offset = i * chunk_n;
    // H2D transfer
    cudaMemcpyAsync(&d_input[offset], &h_input[offset],
                    chunk_bytes, cudaMemcpyHostToDevice, streams[i]);
    // Kernel execution (overlaps with the next chunk's H2D transfer)
    kernel<<<grid, block, 0, streams[i]>>>(&d_input[offset],
                                           &d_output[offset], chunk_n);
    // D2H transfer (overlaps with kernels and H2D transfers in other streams)
    cudaMemcpyAsync(&h_output[offset], &d_output[offset],
                    chunk_bytes, cudaMemcpyDeviceToHost, streams[i]);
}

// Wait for all streams to finish
for (int i = 0; i < NUM_STREAMS; i++) {
    cudaStreamSynchronize(streams[i]);
}

Use stream priorities to ensure critical operations are scheduled first. Stream callbacks let CPU-side work run when a stream reaches a given point, without blocking the host.
// Create high- and low-priority streams
cudaStream_t high_priority, low_priority;
int least, greatest;
cudaDeviceGetStreamPriorityRange(&least, &greatest);   // numerically lower = higher priority
cudaStreamCreateWithPriority(&high_priority, cudaStreamNonBlocking, greatest);
cudaStreamCreateWithPriority(&low_priority, cudaStreamNonBlocking, least);

// The critical path uses the high-priority stream
cudaMemcpyAsync(d_critical, h_critical, size,
                cudaMemcpyHostToDevice, high_priority);
critical_kernel<<<grid, block, 0, high_priority>>>(d_critical);

// Callback for CPU-side work without blocking the host
// (a capture-less lambda converts to the cudaStreamCallback_t function pointer;
//  cudaLaunchHostFunc is the newer alternative to cudaStreamAddCallback)
ResultData results;
auto callback = [](cudaStream_t stream, cudaError_t status, void* data) {
    // Runs on a CPU thread once all preceding work in the stream has completed
    ResultData* r = (ResultData*)data;
    r->process();
};
cudaStreamAddCallback(high_priority, callback, &results, 0);

Reuse pinned memory allocations across iterations to avoid allocation overhead. cudaHostAlloc is expensive because it must pin pages in the OS, so allocate once and reuse.
class PinnedMemoryPool {
    std::vector<void*> buffers;
    size_t buffer_size;
public:
    PinnedMemoryPool(size_t size, int count) : buffer_size(size) {
        buffers.resize(count);
        for (int i = 0; i < count; i++) {
            cudaHostAlloc(&buffers[i], size, cudaHostAllocDefault);
        }
    }
    void* acquire() {
        if (buffers.empty()) return nullptr;   // pool exhausted
        void* buf = buffers.back();
        buffers.pop_back();
        return buf;
    }
    void release(void* buf) {
        buffers.push_back(buf);
    }
    ~PinnedMemoryPool() {
        // Callers must release all acquired buffers before destruction, or they will leak
        for (void* buf : buffers) cudaFreeHost(buf);
    }
};
// Typical usage: void* buf = pool.acquire(); ... use as a staging buffer ...; pool.release(buf);

Synchronous transfers serialize execution: the CPU blocks on each cudaMemcpy and the GPU sits idle during transfers, so no overlap is possible.
void process_data_sync(float* h_input, float* h_output, int n) {
    float *d_input, *d_output;
    size_t size = n * sizeof(float);
    cudaMalloc(&d_input, size);
    cudaMalloc(&d_output, size);

    // Blocking H2D transfer
    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    // Kernel execution (GPU idle during transfers)
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    process_kernel<<<blocks, threads>>>(d_input, d_output, n);

    // Blocking D2H transfer
    cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);
}
// Total time = T_h2d + T_kernel + T_d2h
// GPU idle during transfers, CPU blocked the entire time

An async pipeline overlaps transfers and compute across multiple streams, achieving near-theoretical peak throughput.
class AsyncPipeline {
    static const int NUM_STREAMS = 4;
    cudaStream_t streams[NUM_STREAMS];
    float* d_buffers[NUM_STREAMS];
    float* h_pinned[NUM_STREAMS];
    size_t chunk_elems;   // elements per chunk, not bytes
public:
    explicit AsyncPipeline(size_t total_elems) {
        chunk_elems = total_elems / NUM_STREAMS;
        for (int i = 0; i < NUM_STREAMS; i++) {
            cudaStreamCreate(&streams[i]);
            cudaMalloc(&d_buffers[i], chunk_elems * sizeof(float));
            cudaHostAlloc(&h_pinned[i], chunk_elems * sizeof(float),
                          cudaHostAllocDefault);
        }
    }

    // total_n must match the total_elems passed to the constructor
    void process(float* h_input, float* h_output, int total_n) {
        int chunk_n = total_n / NUM_STREAMS;
        int threads = 256;
        int blocks = (chunk_n + threads - 1) / threads;
        size_t chunk_bytes = chunk_n * sizeof(float);
        for (int i = 0; i < NUM_STREAMS; i++) {
            // Stage input in the pinned buffer (this host-side copy can be optimized further)
            memcpy(h_pinned[i], &h_input[i * chunk_n], chunk_bytes);
            // Async H2D
            cudaMemcpyAsync(d_buffers[i], h_pinned[i], chunk_bytes,
                            cudaMemcpyHostToDevice, streams[i]);
            // Kernel (overlaps with transfers in other streams)
            process_kernel<<<blocks, threads, 0, streams[i]>>>(
                d_buffers[i], d_buffers[i], chunk_n);
            // Async D2H back into the same pinned buffer
            cudaMemcpyAsync(h_pinned[i], d_buffers[i], chunk_bytes,
                            cudaMemcpyDeviceToHost, streams[i]);
        }
        // Synchronize each stream, then copy its results out of the pinned buffer
        for (int i = 0; i < NUM_STREAMS; i++) {
            cudaStreamSynchronize(streams[i]);
            memcpy(&h_output[i * chunk_n], h_pinned[i], chunk_bytes);
        }
    }

    ~AsyncPipeline() {
        for (int i = 0; i < NUM_STREAMS; i++) {
            cudaStreamDestroy(streams[i]);
            cudaFree(d_buffers[i]);
            cudaFreeHost(h_pinned[i]);
        }
    }
};
// Total time ≈ max(T_h2d, T_kernel, T_d2h) + pipeline fill time
// All GPU engines (copy and compute) stay busy simultaneously

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| End-to-end latency (256MB) | 48ms (serial) | 18ms (pipelined) | 2.67x faster |
| GPU utilization | 35% (idle during transfers) | 92% (overlap) | 2.6x higher |
| Memory bandwidth (PCIe 3.0) | 8.2 GB/s (staged) | 11.8 GB/s (pinned) | 1.44x higher |
| Throughput (items/sec) | 5.3M items/sec | 14.2M items/sec | 2.68x higher |
Use cudaMemcpyAsync when you have other work (kernels or other transfers) that can overlap with the copy. Async transfers only pay off with pinned memory and non-default streams; for one-off operations or pageable memory, cudaMemcpyAsync adds call overhead without any benefit over synchronous cudaMemcpy.
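If a code path can receive either pinned or pageable buffers, it helps to detect which one it got before choosing the async path. A minimal sketch, assuming CUDA 10+ (where cudaPointerAttributes exposes a type field); the helper name is illustrative:

bool is_pinned_host(const void* p) {
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, p) != cudaSuccess) {
        cudaGetLastError();                    // older toolkits report an error for unregistered pointers
        return false;
    }
    return attr.type == cudaMemoryTypeHost;    // page-locked (registered) host memory
}

// Choose the path accordingly:
//   if (is_pinned_host(h_buf)) issue cudaMemcpyAsync on a non-default stream;
//   otherwise fall back to plain cudaMemcpy.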
Limit pinned memory to 25-50% of system RAM. Excessive pinned memory reduces available pageable memory and can cause system instability. Allocate pinned buffers once at initialization and reuse them. Modern systems with 64GB+ RAM can safely use 8-16GB pinned.
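One way to enforce such a budget is sketched below. It is Linux-specific (it queries physical RAM via sysconf), the helper name is hypothetical, and the single-threaded bookkeeping is an assumption; beyond the budget it falls back to pageable memory.

#include <unistd.h>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

void* alloc_pinned_capped(size_t bytes, double max_fraction = 0.25) {
    static size_t pinned_in_use = 0;   // assumption: single-threaded allocation
    size_t phys = (size_t)sysconf(_SC_PHYS_PAGES) * (size_t)sysconf(_SC_PAGE_SIZE);
    size_t budget = (size_t)(phys * max_fraction);

    if (pinned_in_use + bytes > budget) {
        fprintf(stderr, "pinned budget exceeded, falling back to pageable memory\n");
        return malloc(bytes);          // this buffer loses true-async behavior
    }
    void* p = nullptr;
    if (cudaHostAlloc(&p, bytes, cudaHostAllocDefault) != cudaSuccess) return nullptr;
    pinned_in_use += bytes;
    return p;
}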
Pageable memory allocated with malloc cannot be accessed by GPU DMA engines. CUDA must first copy to an internal pinned staging buffer, which requires blocking. Use cudaHostAlloc or cudaMallocHost to allocate pinned memory that supports true async DMA.
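To see the difference on your own hardware, a small timing sketch like the one below (names illustrative) can compare H2D bandwidth from a pageable malloc buffer against a cudaHostAlloc buffer using CUDA events:

float time_h2d_ms(float* d_dst, const float* h_src, size_t bytes) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaMemcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

// Call once with a malloc'd buffer and once with a cudaHostAlloc'd buffer of the same size,
// then convert to GB/s: (bytes / 1e9) / (ms / 1e3). The pinned buffer should be noticeably faster.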
Start with 3-4 streams. More streams provide diminishing returns and increase synchronization overhead. Profile your application - if GPU utilization is below 90%, add streams. If stream overhead dominates (many tiny operations), reduce streams and increase chunk sizes.
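A simple way to tune this empirically is to time the same workload with several candidate stream counts and keep the fastest. The sketch below assumes a run_pipeline(h_in, h_out, n, num_streams) function that wraps your own pipelined processing; both it and pick_stream_count are hypothetical names.

#include <chrono>
#include <cuda_runtime.h>

void run_pipeline(float* h_in, float* h_out, int n, int num_streams);  // hypothetical: your pipelined processing

int pick_stream_count(float* h_in, float* h_out, int n) {
    const int candidates[] = {1, 2, 3, 4, 6, 8};
    int best = 1;
    double best_ms = 1e30;
    for (int s : candidates) {
        cudaDeviceSynchronize();                  // exclude prior GPU work from the measurement
        auto t0 = std::chrono::steady_clock::now();
        run_pipeline(h_in, h_out, n, s);
        cudaDeviceSynchronize();                  // include all queued GPU work
        auto t1 = std::chrono::steady_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        if (ms < best_ms) { best_ms = ms; best = s; }
    }
    return best;
}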
Related topics: extending these async patterns to 2D pitched memory (cudaMemcpy2DAsync), managed/Unified Memory as an alternative to explicit transfers, and direct GPU-to-GPU transfers that bypass the host (peer-to-peer).
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.