Top-k selection finds the k largest elements of an array, an operation at the heart of beam search decoding and nucleus (top-p) sampling in LLMs. A full sort costs O(n log n); efficient top-k uses radix select (O(n)) or a size-k heap (O(n log k)), and sorting just the selected k elements afterwards adds only O(k log k).
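For intuition, the same O(n + k log k) bound is easy to see on the host: a linear-time selection (std::nth_element here) isolates the k largest, and only those k get sorted. The sketch below is a hypothetical CPU reference, assuming 0 < k ≤ n, useful mainly as a correctness check for the GPU paths; it is not part of the original code.

// Host reference, requires <algorithm>, <functional>, <vector>.
// O(n) selection followed by an O(k log k) sort of the winners.
std::vector<float> topk_cpu(std::vector<float> vals, int k) {
    // After this call the k largest values occupy [0, k), unordered.
    std::nth_element(vals.begin(), vals.begin() + k, vals.end(),
                     std::greater<float>());
    // Sort only those k values in descending order.
    std::sort(vals.begin(), vals.begin() + k, std::greater<float>());
    vals.resize(k);
    return vals;
}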
Radix select: a binary search on the bit patterns of the values finds the k-th largest element, and a second pass keeps everything at or above that threshold.
// Find threshold, then filter
__global__ void topk_radix(float* vals, int n, int k,
                           float* topk_vals, int* topk_idx) {
    // Single-block kernel: one shared counter for the output slots.
    __shared__ int count;
    if (threadIdx.x == 0) count = 0;
    __syncthreads();

    // 1. Radix select to find the k-th largest value (every thread computes
    //    the same threshold; see the device helper sketched after this kernel).
    float threshold = radix_select_kth(vals, n, k);

    // 2. Filter: keep elements >= threshold, claiming output slots atomically.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        if (vals[i] >= threshold) {
            int pos = atomicAdd(&count, 1);
            if (pos < k) {  // ties at the threshold can exceed k; drop the extras
                topk_vals[pos] = vals[i];
                topk_idx[pos] = i;
            }
        }
    }
}
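radix_select_kth is not defined in the snippet above; below is one possible device-side sketch of the bit-pattern binary search, not a tuned implementation. It maps each float to an unsigned integer with the same ordering (the standard sign-flip transform, so negatives are handled), then greedily builds, from the most significant bit down, the largest threshold that at least k elements reach. It assumes 1 ≤ k ≤ n, costs O(32·n) per calling thread, and in a real translation unit would sit above the kernel that calls it.

__device__ __forceinline__ unsigned int float_to_ordered(float f) {
    unsigned int u = __float_as_uint(f);
    // Map float bits to an unsigned integer that sorts in the same order.
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}

__device__ float radix_select_kth(const float* vals, int n, int k) {
    // Greedily build, MSB first, the largest threshold T such that
    // at least k elements are >= T. That T is the k-th largest value.
    unsigned int threshold = 0;
    for (int bit = 31; bit >= 0; --bit) {
        unsigned int candidate = threshold | (1u << bit);
        int count = 0;
        for (int i = 0; i < n; ++i)
            if (float_to_ordered(vals[i]) >= candidate) ++count;
        if (count >= k) threshold = candidate;
    }
    // Undo the ordering transform to recover the float value.
    unsigned int u = (threshold & 0x80000000u) ? (threshold & 0x7fffffffu)
                                               : ~threshold;
    return __uint_as_float(u);
}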
The naive baseline is an O(n log n) full sort, wasteful for small k:
void topk_naive(float* vals, int* idx, int n, int k,
                float* topk_vals, int* topk_idx) {
    // Sort everything in descending order, then copy the first k (device-to-device).
    thrust::sort_by_key(thrust::device, vals, vals + n, idx,
                        thrust::greater<float>());
    cudaMemcpy(topk_vals, vals, k * sizeof(float), cudaMemcpyDeviceToDevice);
    cudaMemcpy(topk_idx, idx, k * sizeof(int), cudaMemcpyDeviceToDevice);
}
For k ≤ 32, a warp-cooperative scheme keeps the running top-k in registers, one element per lane; the lane holding the current minimum is replaced whenever a larger candidate arrives.
// For small k (≤ 32), each lane holds one element of the current top-k set
__device__ void warp_topk(const float* vals, int n, int k,
                          float* topk_vals, int* topk_idx) {
    int lane = threadIdx.x & 31;
    // Lanes >= k start at +INFINITY so they never hold the minimum slot
    float my_val = (lane < k) ? -INFINITY : INFINITY;
    int my_idx = -1;

    for (int base = 0; base < n; base += 32) {
        int i = base + lane;
        float v = (i < n) ? vals[i] : -INFINITY;

        // Insert the warp's 32 new candidates one at a time
        for (int src = 0; src < 32; ++src) {
            if (base + src >= n) break;
            float cand = __shfl_sync(0xffffffffu, v, src);

            // Warp-wide minimum of the currently held top-k values
            float min_val = my_val;
            for (int off = 16; off > 0; off >>= 1)
                min_val = fminf(min_val, __shfl_xor_sync(0xffffffffu, min_val, off));

            // Ballot to coordinate replacement: the lowest lane holding the
            // minimum takes the candidate if it is larger
            unsigned int holders = __ballot_sync(0xffffffffu, my_val == min_val);
            if (cand > min_val && lane == __ffs(holders) - 1) {
                my_val = cand;
                my_idx = base + src;
            }
        }
    }
    // Lanes 0..k-1 now hold the top-k values (unsorted)
    if (lane < k) {
        topk_vals[lane] = my_val;
        topk_idx[lane] = my_idx;
    }
}
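warp_topk is a __device__ function, so it needs a wrapping kernel to be launched. The wrapper below is a hypothetical example (not from the original code): one 32-thread warp handles one row of a score matrix, which matches how per-row top-k shows up in sampling.

// Hypothetical wrapper: one warp per row of a [rows x n] score matrix.
__global__ void topk_rows(const float* scores, int rows, int n, int k,
                          float* topk_vals, int* topk_idx) {
    int row = blockIdx.x;
    if (row < rows)
        warp_topk(scores + row * n, n, k,
                  topk_vals + row * k, topk_idx + row * k);
}

// Launch one 32-thread warp per row, e.g.:
//   topk_rows<<<rows, 32>>>(d_scores, rows, n, k, d_topk_vals, d_topk_idx);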
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Latency (n=50000, k=50) | 1.2ms | 0.08ms | 15x faster |
Choosing a strategy: for k ≤ 32, use the warp-level heap; for k ≤ 1024, a block-level heap; for larger k, radix select; and for very large k (> n/10), a full sort is competitive. A dispatch sketch follows below.
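As a rough illustration of that decision rule, here is a hypothetical host-side dispatcher (not from the original post). It reuses topk_rows, topk_radix, and topk_naive from above; since a block-level heap kernel is not shown here, the 32 < k ≤ 1024 range falls back to radix select.

// Hypothetical dispatcher applying the rules of thumb above.
// All pointers are device pointers; d_idx is a 0..n-1 index array for the sort path.
void topk_dispatch(float* d_vals, int* d_idx, int n, int k,
                   float* topk_vals, int* topk_idx) {
    if (k <= 32) {
        // One warp scans the whole array.
        topk_rows<<<1, 32>>>(d_vals, /*rows=*/1, n, k, topk_vals, topk_idx);
    } else if (k <= n / 10) {
        // Radix select (a block-level heap, not shown, would also fit for k <= 1024).
        topk_radix<<<1, 256>>>(d_vals, n, k, topk_vals, topk_idx);
    } else {
        // Very large k: sorting everything is competitive.
        topk_naive(d_vals, d_idx, n, k, topk_vals, topk_idx);
    }
}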
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.