Graph algorithms on GPU are challenging due to irregular memory access and load imbalance from power-law degree distributions. Techniques like edge-parallel processing, work-stealing, and frontier compaction are essential for good performance. This guide covers fundamental graph algorithms and the specialized techniques needed for efficient GPU execution.
Assign threads to edges instead of vertices.
Distribute high-degree vertices across multiple warps.
Use stream compaction for active vertex sets.
Vertex-parallel suffers from load imbalance for high-degree vertices.
// Vertex-parallel BFS: one thread per vertex.
// Graph is CSR: row_ptr[v]..row_ptr[v+1] indexes this vertex's slice of col_idx.
// distances[] holds -1 for unvisited vertices, else the BFS level; frontier[]
// is a 0/1 mask of vertices discovered for the next level.
// Launch: 1D grid covering num_vertices threads.
__global__ void bfs_vertex_parallel(const int* __restrict__ row_ptr,
                                    const int* __restrict__ col_idx,
                                    int* distances, int* frontier,
                                    int num_vertices, int level) {
    int v = blockIdx.x * blockDim.x + threadIdx.x;
    if (v >= num_vertices) return;

    // Only vertices on the current frontier expand.
    if (distances[v] != level) return;

    // Scan this vertex's adjacency list. A high-degree vertex makes this
    // loop long while its warp-mates sit idle — the load-imbalance problem
    // that motivates the edge-parallel formulation below.
    int begin = row_ptr[v];
    int end = row_ptr[v + 1];
    for (int e = begin; e < end; e++) {
        int neighbor = col_idx[e];
        if (distances[neighbor] == -1) {
            // Benign race: concurrent discoverers all store the same value.
            distances[neighbor] = level + 1;
            frontier[neighbor] = 1;
        }
    }
}
Edge-parallel with load balancing handles power-law graphs efficiently.
// Edge-parallel: threads assigned to edges
// Step 1: Expand frontier vertices to edge list
// Step 2: Process edges in parallel
// CSR to edge list expansion
// STUB (review): body intentionally omitted in this snippet. The intent is to
// write, for every edge of every frontier vertex, that vertex's id into
// edge_src so the edge-parallel kernel below can look up its source in O(1).
// NOTE(review): a correct implementation also needs per-vertex output offsets
// (an exclusive scan of frontier-vertex degrees) or an atomic output counter —
// neither is present in this signature; presumably supplied elsewhere. Verify
// against the caller.
__global__ void expand_frontier(int* row_ptr, int* frontier_vertices,
int* edge_src, int num_frontier) {
// Each frontier vertex's edges expanded
}
// Edge-parallel processing: one thread per edge of the expanded frontier.

// Output cursor for compacting newly discovered vertices into new_frontier.
// Was referenced but never declared in the original snippet (compile error).
// Must be reset to 0 (e.g. cudaMemset on its symbol) before each BFS level.
__device__ int frontier_count;

// edge_src/edge_dst: flat edge list produced by expand_frontier.
// distances[] holds -1 for unvisited vertices, else the BFS level.
// new_frontier receives each newly discovered vertex exactly once.
// Launch: 1D grid covering num_edges threads.
__global__ void bfs_edge_parallel(const int* __restrict__ edge_src,
                                  const int* __restrict__ edge_dst,
                                  int* distances, int* new_frontier,
                                  int num_edges, int level) {
    int e = blockIdx.x * blockDim.x + threadIdx.x;
    if (e >= num_edges) return;

    int src = edge_src[e];
    int dst = edge_dst[e];

    // Only edges whose source is on the current frontier relax.
    if (distances[src] != level) return;

    // atomicCAS guarantees exactly one thread claims each newly discovered
    // vertex, so new_frontier contains no duplicates.
    if (atomicCAS(&distances[dst], -1, level + 1) == -1) {
        int slot = atomicAdd(&frontier_count, 1);
        new_frontier[slot] = dst;
    }
}
// For high-degree vertices, use work group assignment.
// A precomputed schedule (vertex_to_warp, warp_offsets) splits each heavy
// vertex's edge range across one or more warps; the 32 lanes of a warp then
// stride over the assigned sub-range together, so no single thread serializes
// a hub vertex.
// Launch: 1D grid with exactly (num schedule entries) * 32 threads;
// blockDim.x must be a multiple of 32. No bounds check is possible here
// without a schedule-length parameter, so the launch must match exactly.
__global__ void bfs_load_balanced(const int* __restrict__ row_ptr,
                                  const int* __restrict__ col_idx,
                                  int* distances, int level,
                                  const int* __restrict__ vertex_to_warp,
                                  const int* __restrict__ warp_offsets) {
    // Global warp index = flat thread id / warp size (was an unfilled
    // placeholder in the original, which did not compile).
    int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
    int lane = threadIdx.x & 31;  // lane within this warp

    int vertex = vertex_to_warp[warp_id];   // the vertex this warp serves
    int edge_start = warp_offsets[warp_id];
    int edge_end = warp_offsets[warp_id + 1];

    // Lanes cooperatively stride the warp's edge sub-range; consecutive
    // lanes touch consecutive edges, so col_idx reads coalesce.
    for (int e = edge_start + lane; e < edge_end; e += 32) {
        int neighbor = col_idx[e];
        // Process neighbor... (e.g., atomicCAS on distances[neighbor] as in
        // bfs_edge_parallel, gated on distances[vertex] == level)
    }
}
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| BFS throughput (GTEPS) | 5 | 35 | 7x |
| PageRank iteration time | 3x | 1x | 3x faster (pull-based vs push-based) |
For graphs too large to fit in GPU memory: use streaming/chunked processing, partition the graph with minimal edge cuts (e.g., METIS), or use unified memory with manual prefetching hints (cudaMemPrefetchAsync/cudaMemAdvise).
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.