GMRES minimizes the residual ||b - Ax||₂ over the Krylov subspace K_m = span{r, Ar, A²r, ..., A^(m-1)r}. It works for any non-singular matrix but must store the Krylov basis vectors, so memory grows with the iteration count (m vectors of length n). GMRES is therefore typically restarted every m iterations; the choice of m balances convergence speed against memory.
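Both implementations below exploit the standard reduction of this minimization: m Arnoldi steps produce an orthonormal basis V_m plus a small upper Hessenberg matrix, so the n-dimensional residual minimization collapses to an (m+1) × m least-squares problem:

```latex
% Arnoldi relation after m steps, with v_1 = r_0 / \|r_0\|_2:
A V_m = V_{m+1} \bar{H}_m, \qquad \bar{H}_m \in \mathbb{R}^{(m+1)\times m}.
% Writing x = x_0 + V_m y and using orthonormality of V_{m+1}:
\min_{x \in x_0 + K_m} \|b - Ax\|_2
  = \min_{y \in \mathbb{R}^m} \big\| \beta e_1 - \bar{H}_m y \big\|_2,
\qquad \beta = \|r_0\|_2.
```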
Common refinements to the basic algorithm:

- Flexible GMRES (FGMRES): allow the preconditioner to change between iterations.
- Modified Gram-Schmidt: more stable orthogonalization than classical Gram-Schmidt.
- Deflated restarting: keep approximate eigenvectors across restarts, so the restart does not throw away all spectral information.
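Both listings below lean on a few small device helpers (spmv, subtract, scale, apply_precond) whose bodies aren't shown. As one plausible sketch, spmv could wrap cuSPARSE's generic SpMV API (CUDA 11+); creating the descriptors and work buffer on every call is wasteful and shown only for brevity — a real implementation would cache them across iterations:

```cpp
#include <cuda_runtime.h>
#include <cusparse.h>

// y = A * x for a zero-based CSR matrix with 32-bit indices and float values.
void spmv(cusparseHandle_t sparse, int* rowPtr, int* colIdx, float* vals,
          float* x, float* y, int n) {
    // Zero-based CSR stores nnz in rowPtr[n].
    int nnz;
    cudaMemcpy(&nnz, rowPtr + n, sizeof(int), cudaMemcpyDeviceToHost);
    float alpha = 1.0f, beta = 0.0f;
    cusparseSpMatDescr_t A;
    cusparseDnVecDescr_t vx, vy;
    cusparseCreateCsr(&A, n, n, nnz, rowPtr, colIdx, vals,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
    cusparseCreateDnVec(&vx, n, x, CUDA_R_32F);
    cusparseCreateDnVec(&vy, n, y, CUDA_R_32F);
    size_t bufSize = 0;
    void* buf = nullptr;
    cusparseSpMV_bufferSize(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, A, vx, &beta, vy, CUDA_R_32F,
                            CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
    cudaMalloc(&buf, bufSize);
    cusparseSpMV(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, A, vx, &beta, vy, CUDA_R_32F,
                 CUSPARSE_SPMV_ALG_DEFAULT, buf);
    cudaFree(buf);
    cusparseDestroySpMat(A);
    cusparseDestroyDnVec(vx);
    cusparseDestroyDnVec(vy);
}
```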
Basic GMRES stores the full Krylov basis, which is memory-intensive for large m.
```cpp
// Naive restarted GMRES(m): stores the Krylov basis as m+1 separate device vectors.
void gmres_naive(cusparseHandle_t sparse, cublasHandle_t blas,
                 int n, int nnz, int* rowPtr, int* colIdx, float* vals,
                 float* b, float* x, int restart, int max_iter, float tol) {
    int m = restart;
    float** V = new float*[m + 1];      // Krylov basis: m+1 vectors of length n
    float* H = new float[(m + 1) * m];  // Hessenberg matrix, column-major, ld = m+1
    for (int j = 0; j <= m; j++) cudaMalloc(&V[j], n * sizeof(float));
    for (int cycle = 0; cycle < max_iter / m; cycle++) {
        // r = b - Ax
        spmv(sparse, rowPtr, colIdx, vals, x, V[0], n);
        subtract(b, V[0], V[0], n);     // V[0] = b - V[0]
        float beta;
        cublasSnrm2(blas, n, V[0], 1, &beta);
        if (beta < tol) break;          // converged
        // V[0] = r / ||r||
        float inv_beta = 1.0f / beta;
        cublasSscal(blas, n, &inv_beta, V[0], 1);
        // Arnoldi process (modified Gram-Schmidt)
        for (int j = 0; j < m; j++) {
            spmv(sparse, rowPtr, colIdx, vals, V[j], V[j+1], n);
            // Orthogonalize against all previous basis vectors
            for (int i = 0; i <= j; i++) {
                cublasSdot(blas, n, V[i], 1, V[j+1], 1, &H[i + j*(m+1)]);
                float neg_h = -H[i + j*(m+1)];
                cublasSaxpy(blas, n, &neg_h, V[i], 1, V[j+1], 1);
            }
            cublasSnrm2(blas, n, V[j+1], 1, &H[j+1 + j*(m+1)]);
            if (H[j+1 + j*(m+1)] == 0.0f) break;  // happy breakdown: solution is exact
            float inv_h = 1.0f / H[j+1 + j*(m+1)];
            cublasSscal(blas, n, &inv_h, V[j+1], 1);
        }
        // Solve least squares min ||beta*e1 - H*y|| and update x = x + V*y
        // (elided; pgmres below does this incrementally with Givens rotations)
    }
    for (int j = 0; j <= m; j++) cudaFree(V[j]);
    delete[] V;
    delete[] H;
}
```

Preconditioned GMRES with Givens rotations maintains numerical stability and tracks the residual norm as the Hessenberg matrix grows; minimal sketches of the rotation and solve helpers follow the results table below.
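Each rotation is chosen to annihilate the subdiagonal entry that the Arnoldi step just produced (entries taken after the previous rotations have been applied to the new column):

```latex
G_j \begin{pmatrix} h_{j,j} \\ h_{j+1,j} \end{pmatrix}
  = \begin{pmatrix} r \\ 0 \end{pmatrix},
\qquad
c_j = \frac{h_{j,j}}{\sqrt{h_{j,j}^2 + h_{j+1,j}^2}}, \quad
s_j = \frac{h_{j+1,j}}{\sqrt{h_{j,j}^2 + h_{j+1,j}^2}},
\qquad
G_j = \begin{pmatrix} c_j & s_j \\ -s_j & c_j \end{pmatrix}.
```

After applying G₁, ..., G_j to both the Hessenberg matrix and g = βe₁, the current residual norm is simply |g_{j+1}|, which is what the inner loop below tests against tol without ever forming the iterate.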
```cpp
// Preconditioned GMRES(m): contiguous basis storage; Givens rotations on the
// host-side Hessenberg keep the least-squares problem triangular as it grows.
void pgmres(cusparseHandle_t sparse, cublasHandle_t blas,
            int n, int nnz, int* rowPtr, int* colIdx, float* vals,
            float* b, float* x, void* precond,
            int restart, int max_iter, float tol) {
    int m = restart;
    // Device allocations
    float* d_V;  // all basis vectors, contiguous: (m+1) * n
    float* d_w;  // work vector
    cudaMalloc(&d_V, (m + 1) * (size_t)n * sizeof(float));
    cudaMalloc(&d_w, n * sizeof(float));
    // The Hessenberg matrix is tiny ((m+1) x m), so keep it on the host,
    // where the Givens rotations and the triangular solve run.
    float* H  = new float[(m + 1) * m];
    float* cs = new float[m];      // Givens cosines
    float* sn = new float[m];      // Givens sines
    float* g  = new float[m + 1];  // RHS of the least-squares problem
    for (int cycle = 0; cycle < max_iter / m; cycle++) {
        // r = M^{-1}(b - Ax)
        spmv(sparse, rowPtr, colIdx, vals, x, d_w, n);
        subtract(b, d_w, d_w, n);
        apply_precond(precond, d_w, d_V, n);  // V[0] = M^{-1} r
        float beta;
        cublasSnrm2(blas, n, d_V, 1, &beta);
        if (beta < tol) break;
        scale(d_V, 1.0f / beta, n);
        g[0] = beta;
        for (int i = 1; i <= m; i++) g[i] = 0.0f;
        int j;
        for (j = 0; j < m && fabsf(g[j]) > tol; j++) {
            // w = M^{-1} A v_j
            spmv(sparse, rowPtr, colIdx, vals, d_V + j*n, d_w, n);
            apply_precond(precond, d_w, d_V + (j+1)*n, n);
            // Modified Gram-Schmidt
            for (int i = 0; i <= j; i++) {
                float h;
                cublasSdot(blas, n, d_V + i*n, 1, d_V + (j+1)*n, 1, &h);
                H[i + j*(m+1)] = h;
                float neg_h = -h;
                cublasSaxpy(blas, n, &neg_h, d_V + i*n, 1, d_V + (j+1)*n, 1);
            }
            float h_jp1;
            cublasSnrm2(blas, n, d_V + (j+1)*n, 1, &h_jp1);
            H[j+1 + j*(m+1)] = h_jp1;
            if (h_jp1 != 0.0f)  // guard against the happy-breakdown case
                scale(d_V + (j+1)*n, 1.0f / h_jp1, n);
            // Apply previous Givens rotations to the new column
            for (int i = 0; i < j; i++)
                apply_givens(H, m+1, i, j, cs[i], sn[i]);
            // Compute and apply the rotation annihilating H[j+1][j]
            compute_givens(H[j + j*(m+1)], H[j+1 + j*(m+1)], &cs[j], &sn[j]);
            apply_givens(H, m+1, j, j, cs[j], sn[j]);
            apply_givens_vec(g, j, cs[j], sn[j]);  // |g[j+1]| = residual norm
            if (h_jp1 == 0.0f) { j++; break; }     // happy breakdown: x is exact
        }
        // Solve the j x j upper-triangular system H*y = g (y overwrites g),
        // then update x = x + V*y
        solve_upper(H, g, j, m+1);
        update_solution(blas, d_V, g, x, n, j);
        if (fabsf(g[j]) < tol) break;  // converged
    }
    cudaFree(d_V);
    cudaFree(d_w);
    delete[] H;
    delete[] cs;
    delete[] sn;
    delete[] g;
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Non-symmetric n=100k | 850 iters | 120 iters (ILU) | 7x fewer |
| Memory (m=50, n=1M) | 200MB basis | 200MB (same) | Trade-off |
| Time per iteration | 15ms | 18ms (+precond) | Similar |
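The routines above also call several small host-side helpers (compute_givens, apply_givens, apply_givens_vec, solve_upper, update_solution) whose bodies aren't shown. The following is a minimal sketch consistent with the calling conventions used above — column-major H with leading dimension ld = m+1, and y overwriting g — not a hardened implementation:

```cpp
#include <math.h>
#include <cublas_v2.h>

// Build the rotation [c s; -s c] that zeroes b in (a, b)^T.
void compute_givens(float a, float b, float* c, float* s) {
    float r = hypotf(a, b);
    if (r == 0.0f) { *c = 1.0f; *s = 0.0f; return; }
    *c = a / r;
    *s = b / r;
}

// Rotate rows i and i+1 of column j of H (column-major, leading dimension ld).
void apply_givens(float* H, int ld, int i, int j, float c, float s) {
    float h0 = H[i     + j*ld];
    float h1 = H[i + 1 + j*ld];
    H[i     + j*ld] =  c*h0 + s*h1;
    H[i + 1 + j*ld] = -s*h0 + c*h1;
}

// Rotate entries j and j+1 of the least-squares RHS; afterwards
// |g[j+1]| is the residual norm of the current iterate.
void apply_givens_vec(float* g, int j, float c, float s) {
    float g0 = g[j], g1 = g[j + 1];
    g[j]     =  c*g0 + s*g1;
    g[j + 1] = -s*g0 + c*g1;
}

// Back-substitution: solve the k x k upper-triangular system R*y = g,
// overwriting g with y. R is the rotated Hessenberg stored in H.
void solve_upper(float* H, float* g, int k, int ld) {
    for (int i = k - 1; i >= 0; i--) {
        for (int j = i + 1; j < k; j++) g[i] -= H[i + j*ld] * g[j];
        g[i] /= H[i + i*ld];
    }
}

// x += V*y: accumulate k basis vectors weighted by the host-side y.
void update_solution(cublasHandle_t blas, float* d_V, float* y,
                     float* d_x, int n, int k) {
    for (int i = 0; i < k; i++)
        cublasSaxpy(blas, n, &y[i], d_V + (size_t)i * n, 1, d_x, 1);
}
```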
Typical values: m = 20-50. Larger m gives better convergence per cycle but costs more memory ((m+1)·n floats for the basis). If GMRES(m) stagnates, increase m; very ill-conditioned systems may need m = 100 or more. The hard limit is memory: roughly m < free_device_memory / (n · sizeof(float)), which can be checked at runtime as sketched below.
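A quick runtime bound can be computed with cudaMemGetInfo. This sketch is illustrative: max_restart is a hypothetical helper, and the 0.8 safety factor (headroom for the matrix, work vectors, and allocator overhead) is an arbitrary choice.

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: the largest restart length whose (m+1)-vector basis
// fits in free device memory, with headroom for everything else.
int max_restart(int n, int requested_m) {
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    size_t budget = (size_t)(0.8 * (double)free_bytes);  // arbitrary safety factor
    size_t per_vector = (size_t)n * sizeof(float);       // one basis vector
    int fit = (int)(budget / per_vector) - 1;            // basis holds m+1 vectors
    return fit < requested_m ? fit : requested_m;
}
```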
GMRES may stagnate (the residual stops decreasing) for highly non-normal matrices. Remedies: increase the restart length, use a better preconditioner, or switch to BiCGSTAB or IDR(s). Unlike BiCG-type methods, GMRES cannot break down for a non-singular matrix — the only "breakdown" in Arnoldi is the happy one, which signals an exact solution — but restarted GMRES(m) can converge arbitrarily slowly.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.