Loading...
Argsort returns indices that would sort an array, essential for ranking, percentiles, and reordering. CUDA radix sort with index tracking is typically fastest for large arrays.
Track indices through radix sort passes.
void argsort(float* vals, int* indices, int n) {
// Initialize indices
thrust::sequence(thrust::device, indices, indices + n);
// Sort by value, carrying indices
thrust::sort_by_key(thrust::device,
vals, vals + n, indices,
thrust::greater<float>()); // descending
}Indirect comparisons cause poor cache behavior.
// Slow: copy-based approach
void argsort_naive(float* vals, int* idx, int n) {
for (int i = 0; i < n; i++) idx[i] = i;
thrust::sort(idx, idx + n, [vals](int a, int b) {
return vals[a] > vals[b]; // Indirect access - slow!
});
}CUB radix sort is fastest for float arrays.
#include <cub/cub.cuh>
void argsort_cub(float* vals, int* indices, int n) {
thrust::sequence(thrust::device, indices, indices + n);
// Allocate temporary storage
size_t temp_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
nullptr, temp_bytes, vals, vals_out, indices, indices_out, n);
void* d_temp;
cudaMalloc(&d_temp, temp_bytes);
cub::DeviceRadixSort::SortPairsDescending(
d_temp, temp_bytes, vals, vals_out, indices, indices_out, n);
}| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput (1M elements) | 150 M/s | 800 M/s | 5.3x faster |
Radix sort is naturally stable. Use thrust::stable_sort_by_key if needed with comparison sorts.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.