Permute reorders tensor dimensions; it is the generalized form of transpose. It is critical for layout conversions such as NCHW↔NHWC. A permute can be a pure view (only the strides change, no data moves), but subsequent ops often require a contiguous copy.
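When a permute stays a view, only the shape/stride metadata is rewritten; no bytes move until something forces a contiguous layout. A minimal sketch in host C++ (the `TensorView` struct and `permute_view` helper are illustrative, not from any particular library):

```cpp
#include <cstddef>
#include <vector>

// Illustrative minimal view: a shape and strides over an unowned buffer.
struct TensorView {
    float* data;
    std::vector<std::size_t> shape;
    std::vector<std::size_t> strides; // in elements; row-major when contiguous
};

// Permute as a view: reorder shape and strides by perm, touch no data.
TensorView permute_view(const TensorView& t, const std::vector<int>& perm) {
    TensorView out{t.data, {}, {}};
    for (int d : perm) {
        out.shape.push_back(t.shape[d]);
        out.strides.push_back(t.strides[d]);
    }
    return out; // consumers must honor these strides, or make a contiguous copy
}
```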
Use shared memory tiles for coalesced access.
```cuda
#define TILE 32

__global__ void transpose_tiled(float* in, float* out, int H, int W) {
    __shared__ float tile[TILE][TILE + 1]; // +1 avoids bank conflicts
    int x = blockIdx.x * TILE + threadIdx.x;
    int y = blockIdx.y * TILE + threadIdx.y;
    // Load tile (coalesced read)
    if (x < W && y < H)
        tile[threadIdx.y][threadIdx.x] = in[y * W + x];
    __syncthreads();
    // Write transposed (coalesced write)
    x = blockIdx.y * TILE + threadIdx.x;
    y = blockIdx.x * TILE + threadIdx.y;
    if (x < H && y < W)
        out[y * H + x] = tile[threadIdx.x][threadIdx.y];
}
```
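A minimal host-side launch sketch for the tiled kernel, assuming `d_in` is an H×W device buffer and `d_out` a W×H device buffer (the wrapper name is just for illustration):

```cuda
void launch_transpose_tiled(float* d_in, float* d_out, int H, int W) {
    dim3 block(TILE, TILE);               // one thread per element of a 32x32 tile
    dim3 grid((W + TILE - 1) / TILE,      // tiles across the input width
              (H + TILE - 1) / TILE);     // tiles down the input height
    transpose_tiled<<<grid, block>>>(d_in, d_out, H, W);
    cudaDeviceSynchronize();              // surface launch errors / wait before timing
}
```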
Strided memory access hurts bandwidth.

```cuda
__global__ void transpose_naive(float* in, float* out, int H, int W) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < W && y < H)
        out[x * H + y] = in[y * W + x]; // Uncoalesced write!
}
```

General permute with stride computation.
```cuda
// For an arbitrary permutation, compute the index mapping per element
__global__ void permute_nd(float* in, float* out,
                           int* in_strides, int* out_strides,
                           int* perm, int ndim, int total) {
    int out_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (out_idx >= total) return;
    // Convert linear index to multi-index, then map through perm
    int in_idx = 0;
    int remaining = out_idx;
    for (int d = 0; d < ndim; d++) {
        int coord = remaining / out_strides[d];
        remaining %= out_strides[d];
        in_idx += coord * in_strides[perm[d]];
    }
    out[out_idx] = in[in_idx];
}
```
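A host-side sketch of how the stride and permutation arrays could be set up for an NCHW→NHWC conversion (the wrapper and buffer names are assumptions, and error checks are omitted):

```cuda
void permute_nchw_to_nhwc(float* d_in, float* d_out, int N, int C, int H, int W) {
    int total = N * C * H * W;
    // perm[d] = input dim that feeds output dim d: NHWC pulls from {N, H, W, C}
    int h_perm[4]        = {0, 2, 3, 1};
    int h_in_strides[4]  = {C * H * W, H * W, W, 1}; // row-major strides of [N, C, H, W]
    int h_out_strides[4] = {H * W * C, W * C, C, 1}; // row-major strides of [N, H, W, C]

    int *d_perm, *d_in_strides, *d_out_strides;
    cudaMalloc(&d_perm, 4 * sizeof(int));
    cudaMalloc(&d_in_strides, 4 * sizeof(int));
    cudaMalloc(&d_out_strides, 4 * sizeof(int));
    cudaMemcpy(d_perm, h_perm, 4 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_in_strides, h_in_strides, 4 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out_strides, h_out_strides, 4 * sizeof(int), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (total + threads - 1) / threads;
    permute_nd<<<blocks, threads>>>(d_in, d_out, d_in_strides, d_out_strides, d_perm, 4, total);
    cudaDeviceSynchronize();

    cudaFree(d_perm); cudaFree(d_in_strides); cudaFree(d_out_strides);
}
```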
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| 2D Transpose (4K x 4K) | 2.1 ms | 0.4 ms | 5.25x faster |
| NCHW→NHWC (batch=64) | 3.5 ms | 0.8 ms | 4.4x faster |
Transpose swaps two dimensions; permute reorders all of them. transpose(0, 1) is equivalent to permute([1, 0, 2, ..., ndim-1]).
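For example, building the permutation vector for a two-dimension swap (a hypothetical helper, not from any library):

```cpp
#include <utility>
#include <vector>

// transpose(d0, d1) expressed as a permutation over ndim dims.
std::vector<int> transpose_as_perm(int ndim, int d0, int d1) {
    std::vector<int> perm(ndim);
    for (int d = 0; d < ndim; ++d) perm[d] = d; // identity permutation
    std::swap(perm[d0], perm[d1]);              // e.g. ndim=4, (0,1) -> {1, 0, 2, 3}
    return perm;
}
```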
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.