Matrix transpose is the classic example of memory access pattern optimization in CUDA. The challenge is fundamental: a row-major to column-major conversion means either reads or writes will be non-coalesced. You cannot coalesce both without using shared memory as an intermediary. This seemingly simple operation teaches critical GPU optimization concepts. A naive transpose achieves only 10-20% of memory bandwidth, while an optimized version with shared memory tiling reaches 80-90% of peak bandwidth—a 5-10x speedup for the same algorithm.
Load a tile from input with coalesced reads, store to shared memory, then write to output with coalesced writes. Shared memory acts as a fast transpose buffer.
#define TILE_DIM 32
#define BLOCK_ROWS 8

__global__ void transpose_tiled(float* out, float* in,
                                int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // +1 to avoid bank conflicts

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;

    // Coalesced read from input
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < width && (y + j) < height) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
        }
    }
    __syncthreads();

    // Transposed block position
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;

    // Coalesced write to output
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < height && (y + j) < width) {
            out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

Add one element of padding to each row of shared memory. This breaks the power-of-2 stride that causes bank conflicts during column access.
// WITHOUT padding - 32-way bank conflict on column access
__shared__ float tile[32][32];
// tile[0][0], tile[1][0], tile[2][0]... all map to bank 0
// WITH padding - no bank conflicts
__shared__ float tile[32][33]; // Extra column
// tile[0][0] -> bank 0
// tile[1][0] -> bank 1 (offset by 33 = 1 mod 32)
// tile[2][0] -> bank 2
// ...all different banks!
// Access pattern analysis:
// Row access: tile[ty][0..31] -> banks 0..31 (no conflict)
// Column access: tile[0..31][tx] -> with padding, each hits different bank
// The +1 padding costs only 32 extra floats (128 bytes)
// but eliminates all 32-way bank conflicts

Process blocks in diagonal order so that blocks running concurrently spread their reads and writes across DRAM partitions instead of concentrating them on the same few (the classic "partition camping" problem); this can also improve L2 cache behavior.
__global__ void transpose_diagonal(float* out, float* in,
                                   int width, int height) {
    // Remap block indices for a diagonal traversal order
    int blockIdx_x, blockIdx_y;
    if (width == height) {
        // Square matrix: simple diagonal remapping
        blockIdx_y = blockIdx.x;
        blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    } else {
        // Non-square: flatten the block index, then remap
        int bid = blockIdx.x + gridDim.x * blockIdx.y;
        blockIdx_y = bid % gridDim.y;
        blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
    }

    // Continue with remapped block indices...
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int x = blockIdx_x * TILE_DIM + threadIdx.x;
    int y = blockIdx_y * TILE_DIM + threadIdx.y;
    // ... rest of transpose (same as transpose_tiled, using blockIdx_x / blockIdx_y)
}

For square matrices, transpose in place by swapping A[i][j] with A[j][i]. Data races are avoided by making each block responsible for one tile and its mirror across the diagonal, synchronizing within the block.
// In-place transpose for square matrices
// Each block handles one upper-triangle tile plus its mirror across the diagonal
__global__ void transpose_inplace(float* matrix, int N) {
    __shared__ float tile1[TILE_DIM][TILE_DIM + 1];
    __shared__ float tile2[TILE_DIM][TILE_DIM + 1];

    int bx = blockIdx.x, by = blockIdx.y;
    // Only process upper-triangle blocks (including the diagonal)
    if (bx > by) return;

    // Coordinates of the tile and its mirror
    int x1 = bx * TILE_DIM + threadIdx.x;
    int y1 = by * TILE_DIM + threadIdx.y;
    int x2 = by * TILE_DIM + threadIdx.x;
    int y2 = bx * TILE_DIM + threadIdx.y;

    // Load both tiles
    if (x1 < N && y1 < N)
        tile1[threadIdx.y][threadIdx.x] = matrix[y1 * N + x1];
    if (bx != by && x2 < N && y2 < N)
        tile2[threadIdx.y][threadIdx.x] = matrix[y2 * N + x2];
    __syncthreads();

    // Write both tiles back transposed (swapped across the diagonal)
    if (x2 < N && y2 < N)
        matrix[y2 * N + x2] = tile1[threadIdx.x][threadIdx.y];
    if (bx != by && x1 < N && y1 < N)
        matrix[y1 * N + x1] = tile2[threadIdx.x][threadIdx.y];
}

// Launch: grid covers the full matrix; lower-triangle blocks exit early.
// One thread per tile element, so the block is TILE_DIM x TILE_DIM.
dim3 block(TILE_DIM, TILE_DIM);
dim3 grid((N + TILE_DIM - 1) / TILE_DIM,
          (N + TILE_DIM - 1) / TILE_DIM);
transpose_inplace<<<grid, block>>>(d_matrix, N);  // d_matrix: device pointer to the N x N matrix

Use float4 or int4 to load and store 4 elements at once. This reduces instruction count and improves memory throughput on aligned data.
// Vectorized load/store for aligned data
// Assumes width and height are multiples of 4 (and of TILE_DIM for full tiles),
// and a launch with blockDim = (TILE_DIM/4, TILE_DIM)
__global__ void transpose_vectorized(float4* out, float4* in,
                                     int width, int height) {
    int width4  = width / 4;    // input row length in float4s
    int height4 = height / 4;   // output row length in float4s
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // +1 keeps scalar accesses conflict-free

    int x = blockIdx.x * (TILE_DIM / 4) + threadIdx.x;  // float4 column index
    int y = blockIdx.y * TILE_DIM + threadIdx.y;        // row index

    // Load one float4 (4 consecutive floats) per thread
    if (x < width4 && y < height) {
        float4 val = in[y * width4 + x];
        int tx = threadIdx.x * 4;
        tile[threadIdx.y][tx]     = val.x;
        tile[threadIdx.y][tx + 1] = val.y;
        tile[threadIdx.y][tx + 2] = val.z;
        tile[threadIdx.y][tx + 3] = val.w;
    }
    __syncthreads();

    // Store transposed: gather 4 floats from a shared-memory column into one float4
    int ox = blockIdx.y * (TILE_DIM / 4) + threadIdx.x;  // float4 column in the output
    int oy = blockIdx.x * TILE_DIM + threadIdx.y;        // row in the output
    if (ox < height4 && oy < width) {
        int tx = threadIdx.x * 4;
        float4 val;
        val.x = tile[tx][threadIdx.y];
        val.y = tile[tx + 1][threadIdx.y];
        val.z = tile[tx + 2][threadIdx.y];
        val.w = tile[tx + 3][threadIdx.y];
        out[oy * height4 + ox] = val;
    }
}

A naive transpose must choose between coalesced reads and coalesced writes; it cannot have both without shared memory.
// Naive transpose: coalesced reads, non-coalesced writes
__global__ void transpose_naive(float* out, float* in,
                                int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // Read:  in[y][x]  - adjacent threads read consecutive x values (coalesced)
        // Write: out[x][y] - adjacent threads (x, x+1, x+2, ...) write to
        //        out[x*height + y], out[(x+1)*height + y], ...
        //        These addresses are `height` floats apart (non-coalesced!)
        out[x * height + y] = in[y * width + x];
    }
}

// Alternative naive: coalesced writes, non-coalesced reads
__global__ void transpose_naive_v2(float* out, float* in,
                                   int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < height && y < width) {
        // Read:  in[x][y]  - strided access (non-coalesced)
        // Write: out[y][x] - consecutive access (coalesced)
        out[y * height + x] = in[x * width + y];
    }
}
// Both versions achieve only ~20% of peak memory bandwidth.

This optimized transpose uses shared memory tiling with bank-conflict padding to achieve coalesced reads AND coalesced writes.
#define TILE_DIM 32
#define BLOCK_ROWS 8

__global__ void transpose_optimized(float* __restrict__ out,
                                    const float* __restrict__ in,
                                    int width, int height) {
    // Shared memory with padding to avoid bank conflicts
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    // Input coordinates
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;

    // Coalesced read from input into shared memory
    // Each thread handles TILE_DIM / BLOCK_ROWS elements
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < width && (y + j) < height) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
        }
    }
    __syncthreads();

    // Output coordinates (transposed block position)
    x = blockIdx.y * TILE_DIM + threadIdx.x;  // Swapped!
    y = blockIdx.x * TILE_DIM + threadIdx.y;  // Swapped!

    // Coalesced write from shared memory to output
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < height && (y + j) < width) {
            // Read from transposed position in shared memory
            out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

// Launch configuration
dim3 block(TILE_DIM, BLOCK_ROWS);
dim3 grid((width + TILE_DIM - 1) / TILE_DIM,
          (height + TILE_DIM - 1) / TILE_DIM);
transpose_optimized<<<grid, block>>>(d_out, d_in, width, height);

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Effective Bandwidth (4096x4096) | 42 GB/s | 385 GB/s | 9.2x higher |
| Bandwidth Efficiency | 11% | 89% | 8x better |
| Kernel Time (4096x4096) | 3.2 ms | 0.35 ms | 9.1x faster |
| Bank Conflicts | N/A (no shared mem) | 0 (with padding) | Eliminated |
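For reference, effective-bandwidth figures like those above are usually measured with a CUDA-event timing harness. The helper below is a minimal sketch (not part of the original post): the name report_bandwidth, the warm-up launch, and REPS are illustrative choices, and it assumes d_in and d_out are already allocated device buffers.

#include <cstdio>

// Times REPS launches of transpose_optimized and prints effective bandwidth.
void report_bandwidth(float* d_out, const float* d_in,
                      int width, int height, dim3 grid, dim3 block) {
    const int REPS = 100;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Warm-up launch so one-time costs don't skew the measurement
    transpose_optimized<<<grid, block>>>(d_out, d_in, width, height);

    cudaEventRecord(start);
    for (int i = 0; i < REPS; ++i)
        transpose_optimized<<<grid, block>>>(d_out, d_in, width, height);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    double seconds = (ms / REPS) / 1e3;

    // Every element is read once and written once
    double gbytes = 2.0 * width * height * sizeof(float) / 1e9;
    printf("Effective bandwidth: %.1f GB/s\n", gbytes / seconds);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}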
Memory coalescing requires consecutive threads to access consecutive memory addresses. In a naive transpose, either the reads or the writes access memory with a stride equal to the matrix width or height. Instead of one 128-byte transaction serving 32 threads, each thread may need its own transaction, a 32x reduction in effective bandwidth.
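To see this effect in isolation, you can benchmark two copy kernels that move the same data with different access patterns. This is an illustrative sketch, not part of the transpose code; copy_coalesced and copy_strided are hypothetical names.

// Coalesced: adjacent threads touch adjacent addresses
__global__ void copy_coalesced(float* out, const float* in, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n * n) out[i] = in[i];
}

// Strided: adjacent threads touch addresses n floats apart, so a warp's
// 32 accesses land in 32 different memory segments
__global__ void copy_strided(float* out, const float* in, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n * n) {
        int row = i % n;   // varies by 1 across adjacent threads
        int col = i / n;
        out[row * n + col] = in[row * n + col];  // stride-n access pattern
    }
}

// Launch both over the same n x n buffer, e.g. <<<(n*n + 255) / 256, 256>>>,
// and compare the measured bandwidth.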
32x32 tiles work well for most GPUs because: (1) 32 matches warp size, enabling warp-aligned access, (2) 32x32 = 1024 elements = 4KB, fitting easily in shared memory, and (3) 32 matches bank count, so padding by 1 eliminates all bank conflicts. Smaller tiles (16x16) may work better on older GPUs or when occupancy is limited.
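If you want to check how a given tile size interacts with occupancy on your GPU, the runtime's occupancy API reports how many transpose blocks fit per SM. The helper below is a hypothetical sketch that queries cudaOccupancyMaxActiveBlocksPerMultiprocessor for the 256-thread block used by transpose_optimized.

#include <cstdio>

void check_occupancy() {
    int blocks_per_sm = 0;
    // transpose_optimized uses a (TILE_DIM x BLOCK_ROWS) = 256-thread block and
    // TILE_DIM * (TILE_DIM + 1) * sizeof(float) bytes of static shared memory;
    // the query accounts for both automatically (0 = no dynamic shared memory).
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocks_per_sm, transpose_optimized, TILE_DIM * BLOCK_ROWS, 0);
    printf("Resident transpose blocks per SM: %d\n", blocks_per_sm);
}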
The algorithm works unchanged for non-square matrices. The key is swapping block indices when computing output coordinates: output block (x,y) receives data from input block (y,x). Add boundary checks for matrices not divisible by tile size. For very rectangular matrices (e.g., 1000x10), consider different tile shapes.
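As a concrete (hypothetical) example, a 1024x4096 input transposes with the exact launch pattern shown earlier; only the grid dimensions change with the matrix shape.

// Input: 1024 rows x 4096 columns (height x width), output: 4096 x 1024
int width = 4096, height = 1024;
dim3 block(TILE_DIM, BLOCK_ROWS);
dim3 grid((width + TILE_DIM - 1) / TILE_DIM,     // tiles across the input width
          (height + TILE_DIM - 1) / TILE_DIM);   // tiles down the input height
transpose_optimized<<<grid, block>>>(d_out, d_in, width, height);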
Yes, for production code! cublasXgeam (general matrix addition/transposition) with transa=CUBLAS_OP_T, alpha=1, and beta=0 performs an optimized transpose; B is not referenced when beta is 0. It handles all edge cases and is thoroughly optimized. Write custom kernels when fusing transpose with other operations or when using non-standard data types.
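For illustration, here is a sketch of an out-of-place transpose through cublasSgeam. The wrapper name transpose_cublas is hypothetical, error checking is omitted, and it assumes a valid cuBLAS handle plus device buffers of width * height floats.

#include <cublas_v2.h>

// d_in holds a row-major height x width matrix; d_out receives width x height.
void transpose_cublas(cublasHandle_t handle, float* d_out, const float* d_in,
                      int width, int height) {
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // cuBLAS is column-major, so the row-major input already looks transposed
    // to it; CUBLAS_OP_T on A then yields the desired output layout.
    cublasSgeam(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                height, width,          // rows and columns of the result (column-major)
                &alpha, d_in, width,    // A = input, lda = width
                &beta,  d_out, height,  // B is not referenced because beta == 0
                d_out, height);         // C = output, ldc = height
}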
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.