cudaErrorInvalidMemcpyDirection (21)

cudaErrorInvalidMemcpyDirection (error code 21) occurs when you specify an invalid or incompatible direction parameter in a CUDA memory copy operation. This happens when the cudaMemcpyKind parameter doesn't match the actual memory locations being copied. This error typically appears when using the wrong cudaMemcpyKind enum value (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.) or when trying to use a direction that isn't supported by the specific cudaMemcpy variant being called. This guide explains the valid memory copy directions, common mistakes, and best practices for reliable memory transfers in CUDA.
CUDA error: invalid memcpy direction
cudaErrorInvalidMemcpyDirection: invalid memcpy direction
RuntimeError: CUDA error: invalid memcpy direction
Error: invalid cudaMemcpyKind
Ensure you are using the appropriate direction constant for your copy operation.
// Valid cudaMemcpyKind values
cudaMemcpyHostToDevice // CPU -> GPU: source is host memory, destination is device memory
cudaMemcpyDeviceToHost // GPU -> CPU: source is device memory, destination is host memory
cudaMemcpyDeviceToDevice // GPU -> GPU (same device)
cudaMemcpyHostToHost // CPU -> CPU (rarely used; plain memcpy is usually better)
cudaMemcpyDefault // Auto-detect from pointer values (requires unified virtual addressing; recommended mainly with managed memory)
// Example: Copy from host to device
float* h_data = new float[1024]; // pageable host buffer (contents uninitialized here)
float* d_data;
cudaMalloc(&d_data, 1024 * sizeof(float)); // device allocation; check the returned cudaError_t in real code
cudaMemcpy(d_data, h_data, 1024 * sizeof(float), cudaMemcpyHostToDevice); // destination first, then source; kind matches dst=device, src=host
// Example: Copy from device to host
cudaMemcpy(h_data, d_data, 1024 * sizeof(float), cudaMemcpyDeviceToHost);

When using unified memory, cudaMemcpyDefault auto-detects the direction.
// Unified memory allocation
float* unified_data;
cudaMallocManaged(&unified_data, 1024 * sizeof(float)); // managed buffer, accessible from both host and device
// cudaMemcpyDefault works with unified memory
float* h_buffer = new float[1024];
cudaMemcpy(h_buffer, unified_data, 1024 * sizeof(float), cudaMemcpyDefault); // runtime infers the direction from the pointers
// Or just access directly (no explicit copy needed)
for (int i = 0; i < 1024; i++) {
h_buffer[i] = unified_data[i]; // Automatic migration: pages fault over to the host on first touch
}
cudaFree(unified_data);

For CPU-to-CPU copies, use standard C memcpy instead of the CUDA API.
#include <cstring> // for memcpy
float* h_src = new float[1024];
float* h_dst = new float[1024];
// Wrong: Using CUDA API for host-to-host
// cudaMemcpy(h_dst, h_src, 1024 * sizeof(float), cudaMemcpyHostToHost);
// Correct: Use standard memcpy (no CUDA runtime overhead for pure CPU copies)
memcpy(h_dst, h_src, 1024 * sizeof(float));
// Or use std::copy for C++
#include <algorithm> // for std::copy (place includes at the top of the file in real code)
std::copy(h_src, h_src + 1024, h_dst);

For GPU-to-GPU copies on the same device, use proper synchronization.
float* d_src;
float* d_dst;
cudaMalloc(&d_src, 1024 * sizeof(float)); // error checks omitted for brevity
cudaMalloc(&d_dst, 1024 * sizeof(float));
// Synchronous device-to-device copy
cudaMemcpy(d_dst, d_src, 1024 * sizeof(float), cudaMemcpyDeviceToDevice);
// Or asynchronous with stream
cudaStream_t stream;
cudaStreamCreate(&stream);
// NOTE: release the stream with cudaStreamDestroy(stream) once all work queued on it has completed
cudaMemcpyAsync(d_dst, d_src, 1024 * sizeof(float),
cudaMemcpyDeviceToDevice, stream);
cudaStreamSynchronize(stream);

Double-check that your source and destination pointers match the direction.
// Helper: print whether a pointer refers to host, device, or managed memory.
// Useful before a cudaMemcpy to verify that the cudaMemcpyKind matches the pointers.
void checkPointerType(void* ptr) {
    cudaPointerAttributes attr;
    // The return code must be checked: pre-CUDA-11 runtimes return
    // cudaErrorInvalidValue for plain (unregistered) host pointers, leaving
    // attr unpopulated and a sticky error in the runtime.
    cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
    if (err != cudaSuccess) {
        cudaGetLastError(); // clear the sticky error so later CUDA calls are unaffected
        printf("Unknown memory type (%s)\n", cudaGetErrorString(err));
        return;
    }
    switch (attr.type) {
        case cudaMemoryTypeHost:
            printf("Host memory\n");
            break;
        case cudaMemoryTypeDevice:
            printf("Device memory\n");
            break;
        case cudaMemoryTypeManaged:
            printf("Managed/Unified memory\n");
            break;
        case cudaMemoryTypeUnregistered:
            // CUDA 11+: ordinary host allocations report this instead of failing.
            printf("Unregistered host memory\n");
            break;
        default:
            printf("Unknown memory type\n");
    }
}
// Use before memcpy to verify the direction flag
checkPointerType(d_data); // Should be Device
checkPointerType(h_data); // Should be Host

For copying between different GPUs, enable peer access first.
int canAccessPeer;
// Query whether device 0 can directly access device 1's memory
cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1);
if (canAccessPeer) {
    // Enable direct P2P transfers from device 0 to device 1
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0); // second argument is flags and must be 0
    // Allocate one buffer on each GPU
    float *d_data0, *d_data1;
    cudaSetDevice(0);
    cudaMalloc(&d_data0, 1024 * sizeof(float));
    cudaSetDevice(1);
    cudaMalloc(&d_data1, 1024 * sizeof(float));
    // Copy from GPU 0 to GPU 1: (dst, dstDevice, src, srcDevice, bytes)
    cudaMemcpyPeer(d_data1, 1, d_data0, 0, 1024 * sizeof(float));
    // Clean up: free each buffer on the device that owns it
    cudaSetDevice(1);
    cudaFree(d_data1);
    cudaSetDevice(0);
    cudaFree(d_data0);
} else {
    // No direct P2P path. cudaMemcpyPeer still works (the runtime stages the
    // transfer through host memory), or copy device -> host -> device explicitly.
}
The direction flags are backwards. First copy should be HostToDevice, second should be DeviceToHost.
// INTENTIONALLY WRONG: demonstrates the mistake that raises cudaErrorInvalidMemcpyDirection
float* h_data = new float[1024];
float* d_data;
cudaMalloc(&d_data, 1024 * sizeof(float));
// Wrong direction - reversed! dst is device and src is host, so this must be HostToDevice
cudaMemcpy(d_data, h_data, 1024 * sizeof(float), cudaMemcpyDeviceToHost);
// Process...
// Wrong again - should be DeviceToHost
cudaMemcpy(h_data, d_data, 1024 * sizeof(float), cudaMemcpyHostToDevice);

Direction flags correctly match the source and destination pointers: HostToDevice for upload, DeviceToHost for download.
float* h_data = new float[1024];
float* d_data;
cudaMalloc(&d_data, 1024 * sizeof(float));
// Correct: Host to Device (dst is device, src is host)
cudaMemcpy(d_data, h_data, 1024 * sizeof(float), cudaMemcpyHostToDevice);
// Process on GPU...
myKernel<<<blocks, threads>>>(d_data);
cudaDeviceSynchronize(); // wait for the kernel; also surfaces any in-kernel errors
// Correct: Device to Host (dst is host, src is device)
cudaMemcpy(h_data, d_data, 1024 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
delete[] h_data;

cudaMemcpyDefault auto-detects the memory location of the source and destination pointers. It requires unified virtual addressing and works most reliably with unified memory (cudaMallocManaged). For explicit allocations, always use the specific direction flags.
Not directly. For multi-GPU copies, use cudaMemcpyPeer after enabling peer access with cudaDeviceEnablePeerAccess. Alternatively, copy to host first, then to the second GPU.
The cudaMemcpyKind parameter is an enum, so incorrect values compile fine but fail at runtime. Use the predefined constants (cudaMemcpyHostToDevice, etc.) rather than integer values.
Rarely. It exists for API completeness but standard memcpy is more efficient for host-to-host copies. The CUDA runtime has additional overhead that provides no benefit for CPU memory.
Invalid parameters in API calls
Memory allocation before copy operations
Invalid pointer in memory operations
Need help debugging CUDA errors? Download RightNow AI for intelligent error analysis and optimization suggestions.