Tensor Cores are specialized matrix units on NVIDIA GPUs (Volta and later) that compute a small matrix multiply-accumulate (D = A*B + C on 4x4 tiles) in a single clock cycle, delivering roughly 8-16x the throughput of CUDA cores for supported operations. Direct programming uses the WMMA (Warp Matrix Multiply-Accumulate) API. This guide covers WMMA programming, memory layout requirements, and when to use Tensor Cores.
- Tile large matrices into Tensor Core-sized fragments.
- Stage fragments in shared memory with the correct layout.
- Use FP16 inputs with FP32 accumulation for accuracy (a conversion sketch follows below).
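To illustrate the last point, here is a minimal sketch of converting FP32 input data to FP16 on the device before feeding it to a GEMM, while accumulation stays in FP32 inside the GEMM kernel itself. The kernel name and launch parameters are illustrative, not part of any library.

```cuda
#include <cuda_fp16.h>

// Illustrative helper: convert an FP32 buffer to FP16 so it can feed a
// Tensor Core GEMM. Accumulation still happens in FP32 inside the GEMM kernel.
__global__ void float_to_half(const float* in, half* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = __float2half(in[i]);  // round-to-nearest-even conversion
    }
}

// Example launch (sizes are placeholders):
// float_to_half<<<(n + 255) / 256, 256>>>(d_in, d_out, n);
```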
A plain CUDA-core GEMM is limited by CUDA-core compute throughput:
```cuda
#include <cuda_fp16.h>

// Standard CUDA core matrix multiply: one thread per output element
__global__ void gemm_cuda_core(half* A, half* B, float* C,
                               int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += __half2float(A[row * K + k]) *
                   __half2float(B[k * N + col]);
        }
        C[row * N + col] = sum;
    }
}
```

WMMA enables direct Tensor Core programming for a 10x+ speedup:
```cuda
#include <mma.h>
#include <cuda_fp16.h>

using namespace nvcuda;

// Tensor Core GEMM using WMMA
// Fragment shape: M=16, N=16, K=16 for FP16 inputs
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;

// Assumes M, N, K are multiples of 16 (WMMA loads full 16x16 tiles),
// blockDim.x is a multiple of 32, and blockDim.y == 1.
__global__ void gemm_tensor_core(half* A, half* B, float* C,
                                 int M, int N, int K) {
    // Warp and tile indices: each warp computes one 16x16 tile of C
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = blockIdx.y;

    // Declare matrix fragments (row-major A and B, FP32 accumulator)
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;

    // Initialize accumulator to zero
    wmma::fill_fragment(c_frag, 0.0f);

    // Walk the K dimension, accumulating C = A * B
    for (int k = 0; k < K; k += WMMA_K) {
        int aRow = warpM * WMMA_M;
        int aCol = k;
        int bRow = k;
        int bCol = warpN * WMMA_N;

        // Bounds check on tile origins (full tiles only)
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            // Load A and B fragments; the last argument is the leading dimension
            wmma::load_matrix_sync(a_frag, A + aRow * K + aCol, K);
            wmma::load_matrix_sync(b_frag, B + bRow * N + bCol, N);

            // Tensor Core matrix multiply-accumulate
            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        }
    }

    // Store the 16x16 result tile
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cRow * N + cCol, c_frag, N, wmma::mem_row_major);
    }
}
```
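A host-side launch that matches the indexing above might look like the following sketch (placed in the same file as the kernel so it can reuse WMMA_M and WMMA_N). The choice of 128 threads per block, i.e. four warps and therefore four M-tiles per block, is an assumption, not a requirement of the kernel.

```cuda
// Hypothetical launch helper for gemm_tensor_core (M, N, K multiples of 16).
// Each warp produces one 16x16 tile of C: blockDim.x = 128 gives 4 warps,
// so one block covers 4 tiles along M and grid.y covers the tiles along N.
void launch_gemm_tensor_core(half* dA, half* dB, float* dC,
                             int M, int N, int K) {
    dim3 block(128, 1);
    int warpsPerBlock = block.x / 32;  // 4 warps
    dim3 grid((M / WMMA_M + warpsPerBlock - 1) / warpsPerBlock,
              N / WMMA_N);
    gemm_tensor_core<<<grid, block>>>(dA, dB, dC, M, N, K);
}
```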
For FP8 on Hopper (H100), the __nv_fp8_e4m3 and __nv_fp8_e5m2 types are used through cuBLASLt or CUTLASS; the WMMA API above does not expose FP8 fragments.

| Metric | CUDA cores (naive) | Tensor Cores (optimized) | Improvement |
|---|---|---|---|
| TFLOPS (RTX 4090) | 82 (FP32) | 660 (FP16 Tensor Core) | 8x |
| GEMM throughput | 1x | 8-16x | Tensor Core vs CUDA core |
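On Ampere and later GPUs, the same WMMA API also accepts TF32 fragments with a 16x16x8 shape, which lets FP32 data use Tensor Cores at reduced mantissa precision. The sketch below follows the documented load-convert-MMA pattern; the function name and the column-major B tile are assumptions for illustration.

```cuda
#include <mma.h>
using namespace nvcuda;

// Sketch: one 16x16x8 TF32 tile MMA (requires Ampere, sm_80 or later).
// A tile is row-major FP32, B tile is assumed column-major FP32 here;
// accumulation is full FP32.
__device__ void tf32_tile_mma(const float* A, const float* B, float* C,
                              int lda, int ldb, int ldc) {
    wmma::fragment<wmma::matrix_a, 16, 16, 8, wmma::precision::tf32, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 8, wmma::precision::tf32, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 8, float> c_frag;

    wmma::fill_fragment(c_frag, 0.0f);
    wmma::load_matrix_sync(a_frag, A, lda);
    wmma::load_matrix_sync(b_frag, B, ldb);

    // Round the loaded FP32 values to TF32 precision before the MMA
    for (int i = 0; i < a_frag.num_elements; i++)
        a_frag.x[i] = wmma::__float_to_tf32(a_frag.x[i]);
    for (int i = 0; i < b_frag.num_elements; i++)
        b_frag.x[i] = wmma::__float_to_tf32(b_frag.x[i]);

    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    wmma::store_matrix_sync(C, c_frag, ldc, wmma::mem_row_major);
}
```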
cuBLAS, cuDNN, and PyTorch use Tensor Cores automatically when matrix dimensions are multiples of 8, data types are FP16/BF16/TF32, and the GPU has Tensor Cores (Volta or later).
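For example, when those conditions hold, a single cublasGemmEx call like the sketch below will typically be routed to a Tensor Core kernel; the handle, buffers, and sizes are placeholders, and remember that cuBLAS expects column-major storage.

```cuda
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch: let cuBLAS pick a Tensor Core kernel for C = A * B.
// dA, dB are FP16 device buffers, dC is FP32; all matrices column-major.
// M, N, K should be multiples of 8 for Tensor Core eligibility.
void gemm_with_cublas(cublasHandle_t handle,
                      const half* dA, const half* dB, float* dC,
                      int M, int N, int K) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasGemmEx(handle,
                 CUBLAS_OP_N, CUBLAS_OP_N,
                 M, N, K,
                 &alpha,
                 dA, CUDA_R_16F, M,    // A is M x K, leading dimension M
                 dB, CUDA_R_16F, K,    // B is K x N, leading dimension K
                 &beta,
                 dC, CUDA_R_32F, M,    // C is M x N, leading dimension M
                 CUBLAS_COMPUTE_32F,   // accumulate in FP32
                 CUBLAS_GEMM_DEFAULT);
}
```

In PyTorch, the TF32 path for FP32 matmuls is controlled by torch.backends.cuda.matmul.allow_tf32 (or torch.set_float32_matmul_precision).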
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.