Memory management is crucial for CUDA performance. A single cudaMalloc call is expensive (roughly 1-100μs) and cudaFree can force device synchronization, so allocation strategy matters. Memory pools, stream-ordered (async) allocation, and the right choice of memory type (device, pageable host, pinned host, unified) all have a significant performance impact. This guide covers allocation strategies, memory types, and best practices for high-performance CUDA applications.
Key takeaways:

- Reuse allocations to avoid cudaMalloc overhead.
- Use cudaMallocHost (pinned memory) for faster CPU-GPU transfers.
- Use cudaMallocAsync (CUDA 11.2+) for stream-ordered allocation from a pool.

Allocating per batch adds 100μs+ of overhead to every iteration:
```cpp
// Anti-pattern: allocating and freeing inside the hot loop
void process_batches(int num_batches, int batch_size) {
    for (int i = 0; i < num_batches; i++) {
        float* d_data;
        cudaMalloc(&d_data, batch_size * sizeof(float));  // Slow!
        process_kernel<<<grid, block>>>(d_data);
        cudaFree(d_data);  // Also slow: implicitly synchronizes the device
    }
}
```

The simplest fix is to allocate once up front and reuse the buffer, as sketched below. Memory pools generalize this pattern and eliminate per-allocation overhead even when buffers are created and destroyed dynamically.
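A minimal sketch of the reuse pattern (same illustrative `process_kernel`, `grid`, and `block` as above):

```cpp
// Allocate once up front, reuse the same buffer for every batch
void process_batches_reuse(int num_batches, int batch_size) {
    float* d_data;
    cudaMalloc(&d_data, batch_size * sizeof(float));  // One allocation total

    for (int i = 0; i < num_batches; i++) {
        process_kernel<<<grid, block>>>(d_data);  // No per-iteration alloc/free
    }

    cudaFree(d_data);  // One free total
}
```

When buffer sizes or lifetimes vary per batch, the stream-ordered allocator in CUDA 11.2+ serves allocations from a pool instead: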
```cpp
// CUDA 11.2+ stream-ordered allocation from the default memory pool
void process_with_pool(int num_batches, int batch_size) {
    // Get the default memory pool for device 0
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, 0);

    // Raise the release threshold so freed memory stays in the pool
    // for reuse instead of being returned to the OS
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    for (int i = 0; i < num_batches; i++) {
        float* d_data;
        // Stream-ordered async allocation - served from the pool after warm-up
        cudaMallocAsync(&d_data, batch_size * sizeof(float), stream);
        process_kernel<<<grid, block, 0, stream>>>(d_data);
        // Stream-ordered free - memory returns to the pool, not the OS
        cudaFreeAsync(d_data, stream);
    }

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
```
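With the release threshold at UINT64_MAX the pool never shrinks on its own: only the first iterations pay the real allocation cost, and subsequent cudaMallocAsync calls are served from cached blocks. If you need to hand memory back to the system at a quiet point, cudaMemPoolTrimTo can shrink the pool explicitly.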
For CUDA versions before 11.2, a simple fixed-size-block pool achieves the same effect manually:

```cpp
// Manual memory pool for fixed-size blocks (pre-CUDA 11.2)
#include <vector>

class MemoryPool {
    std::vector<void*> free_blocks;
    size_t block_size;
public:
    explicit MemoryPool(size_t size) : block_size(size) {}

    ~MemoryPool() {
        // Release all cached blocks back to the driver
        for (void* ptr : free_blocks) cudaFree(ptr);
    }

    void* allocate() {
        // Serve from the free list when possible
        if (!free_blocks.empty()) {
            void* ptr = free_blocks.back();
            free_blocks.pop_back();
            return ptr;
        }
        // Otherwise fall back to a real allocation
        void* ptr;
        cudaMalloc(&ptr, block_size);
        return ptr;
    }

    void deallocate(void* ptr) {
        // Cache the block for reuse instead of calling cudaFree
        free_blocks.push_back(ptr);
    }
};
```
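A hypothetical usage sketch (same illustrative `process_kernel`, `grid`, and `block` as before):

```cpp
// One pool per buffer size class; blocks are recycled across iterations
MemoryPool pool(batch_size * sizeof(float));

for (int i = 0; i < num_batches; i++) {
    float* d_data = static_cast<float*>(pool.allocate());  // cudaMalloc only on a cold pool
    process_kernel<<<grid, block>>>(d_data);
    pool.deallocate(d_data);  // Back to the free list, no cudaFree
}
```

Recycling a block on the same stream is safe because work on a stream executes in order; reusing blocks across streams would need explicit synchronization.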
Pinned (page-locked) host memory transfers roughly twice as fast as pageable memory and lets cudaMemcpyAsync overlap with computation:

```cpp
// Pinned memory for fast CPU-GPU transfers
void pinned_transfer_example(float* d_data, size_t size, cudaStream_t stream) {
    float* h_pinned;
    cudaMallocHost(&h_pinned, size);  // Pinned (page-locked) allocation

    // ... fill h_pinned on the host ...

    // Roughly 2x the bandwidth of a pageable-memory transfer
    cudaMemcpyAsync(d_data, h_pinned, size,
                    cudaMemcpyHostToDevice, stream);

    // Wait for the copy to complete before freeing the pinned buffer
    cudaStreamSynchronize(stream);
    cudaFreeHost(h_pinned);
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Allocation time | 100-500μs | <1μs | 100x+ with pool |
| Transfer speed | 12 GB/s | 25 GB/s | 2x with pinned |
Unified memory (cudaMallocManaged) simplifies code but can incur page-fault overhead on first access. It is best suited to development and prototyping, oversubscribing GPU memory (working sets larger than the GPU), and workloads with unpredictable access patterns.
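A minimal sketch, assuming a single-GPU setup; `N`, `grid`, `block`, and `my_kernel` are placeholders:

```cpp
// Unified memory: one pointer valid on both host and device
float* data;
size_t bytes = N * sizeof(float);
cudaMallocManaged(&data, bytes);

for (size_t i = 0; i < N; i++) data[i] = 1.0f;  // Initialize directly on the host

// Optional: prefetch to device 0 to avoid page faults on first GPU access
cudaMemPrefetchAsync(data, bytes, 0);

my_kernel<<<grid, block>>>(data, N);
cudaDeviceSynchronize();  // Required before the host touches the data again

float first = data[0];  // Readable on the host with no explicit copy
cudaFree(data);
```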
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.