Bitonic sort is a parallel sorting algorithm based on sorting networks—fixed sequences of compare-exchange operations. While its O(N log² N) complexity is worse than O(N log N) comparison sorts, its regular, data-independent pattern maps perfectly to GPU architecture. Bitonic sort excels for small to medium arrays (up to ~1M elements) where its simplicity and predictable memory access patterns outweigh the extra comparisons. For larger arrays, radix sort is typically faster, but bitonic sort remains valuable for key-value sorting and as a building block in more complex algorithms.
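Before looking at the GPU kernels, a minimal CPU reference (assuming the input length is a power of two) shows the stage/stride structure that every kernel below parallelizes: each (size, stride) pair becomes one parallel compare-exchange pass.
#include <utility>   // std::swap
#include <vector>

void bitonic_sort_cpu(std::vector<float>& a) {
    int n = (int)a.size();                         // must be a power of two
    for (int size = 2; size <= n; size *= 2) {     // build sorted runs of 'size'
        for (int stride = size / 2; stride > 0; stride /= 2) {
            for (int pos = 0; pos < n; pos++) {    // this loop runs in parallel on the GPU
                int partner = pos ^ stride;
                if (partner > pos) {
                    bool ascending = ((pos & size) == 0);
                    if ((a[pos] > a[partner]) == ascending)
                        std::swap(a[pos], a[partner]);
                }
            }
        }
    }
}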
Perform as many sorting phases as possible in shared memory before falling back to global memory. Shared memory latency is roughly 100x lower than uncached global memory latency.
#include <float.h>   // FLT_MAX, used to pad partial blocks

#define SHARED_SIZE 1024   // Elements per block; launch with SHARED_SIZE/2 threads

__global__ void bitonic_sort_shared(float* data, int n) {
    __shared__ float shared[SHARED_SIZE];
    int tid = threadIdx.x;                    // 0 .. SHARED_SIZE/2 - 1
    int base = blockIdx.x * SHARED_SIZE;
    // Load two elements per thread; pad out-of-range slots with FLT_MAX
    shared[tid] = (base + tid < n) ? data[base + tid] : FLT_MAX;
    shared[tid + SHARED_SIZE/2] = (base + tid + SHARED_SIZE/2 < n)
                                ? data[base + tid + SHARED_SIZE/2] : FLT_MAX;
    __syncthreads();
    // Bitonic sort phases in shared memory
    for (int size = 2; size <= SHARED_SIZE; size *= 2) {
        // Bitonic merge
        for (int stride = size / 2; stride > 0; stride /= 2) {
            // Each thread owns the lower element of one compare-exchange pair
            int pos = 2 * tid - (tid & (stride - 1));
            int partner = pos + stride;
            // Sort direction alternates between blocks of 'size' elements
            bool ascending = ((pos & size) == 0);
            float a = shared[pos];
            float b = shared[partner];
            if ((a > b) == ascending) {
                shared[pos] = b;
                shared[partner] = a;
            }
            __syncthreads();
        }
    }
    // Write back
    if (base + tid < n) data[base + tid] = shared[tid];
    if (base + tid + SHARED_SIZE/2 < n)
        data[base + tid + SHARED_SIZE/2] = shared[tid + SHARED_SIZE/2];
}
For arrays larger than shared memory, perform global memory phases with coalesced access patterns.
// Global bitonic merge step (n is the padded, power-of-two length)
__global__ void bitonic_merge_global(float* data, int n,
                                     int stage, int substage) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pair_distance = 1 << (stage - substage);
    int block_size = 1 << stage;
    // Find partner
    int pos = tid;
    int partner = pos ^ pair_distance;
    if (partner > pos && partner < n) {
        // Direction: ascending if in the first half of the bitonic sequence
        bool ascending = ((pos / block_size) % 2) == 0;
        float a = data[pos];
        float b = data[partner];
        if ((a > b) == ascending) {
            data[pos] = b;
            data[partner] = a;
        }
    }
}
// Host code for full bitonic sort
// (padding fill uses <thrust/fill.h> and <thrust/execution_policy.h>)
void bitonic_sort(float* d_data, int n) {
    int padded_n = next_power_of_2(n);   // round up to the next power of two
    // Initialize padding with FLT_MAX so it sorts to the end.
    // Note: cudaMemset writes a repeated byte (0xFF would produce NaNs),
    // so use a 32-bit fill instead.
    if (padded_n > n) {
        thrust::fill(thrust::device, d_data + n, d_data + padded_n, FLT_MAX);
    }
    int threads = 256;
    int blocks = (padded_n + threads - 1) / threads;   // one thread per element
    int num_stages = 0;
    for (int s = padded_n; s > 1; s >>= 1) num_stages++;
    // All stages
    for (int stage = 1; stage <= num_stages; stage++) {
        for (int substage = 1; substage <= stage; substage++) {
            int pair_distance = 1 << (stage - substage);
            if (pair_distance < SHARED_SIZE) {
                // Strides now fit in one block: a merge-only variant of
                // bitonic_sort_shared (not shown) can finish all remaining
                // substages of this stage in a single launch
                bitonic_merge_shared<<<(padded_n + SHARED_SIZE - 1) / SHARED_SIZE,
                                       SHARED_SIZE / 2>>>(
                    d_data, padded_n, stage, substage);
                break;
            } else {
                // Large strides: global memory kernel, one substage per launch
                bitonic_merge_global<<<blocks, threads>>>(
                    d_data, padded_n, stage, substage);
            }
        }
    }
}
Add padding or remap indices to avoid bank conflicts in shared memory access patterns.
// Remap indices to avoid bank conflicts
__device__ int conflict_free_idx(int idx) {
    // Insert one padding slot every 32 elements (one per bank row)
    return idx + (idx >> 5);
}

__global__ void bitonic_no_conflicts(float* data, int n) {
    // Shared memory with padding
    __shared__ float shared[SHARED_SIZE + SHARED_SIZE/32];
    int tid = threadIdx.x;                        // one element per thread
    int gid = blockIdx.x * SHARED_SIZE + tid;
    // Load with conflict-free addressing; pad out-of-range slots
    shared[conflict_free_idx(tid)] = (gid < n) ? data[gid] : FLT_MAX;
    __syncthreads();
    for (int size = 2; size <= SHARED_SIZE; size *= 2) {
        for (int stride = size / 2; stride > 0; stride /= 2) {
            int pos = tid;
            int partner = pos ^ stride;
            if (partner > pos) {   // lower index of each pair does the work
                int idx_a = conflict_free_idx(pos);
                int idx_b = conflict_free_idx(partner);
                bool ascending = ((pos & size) == 0);
                float a = shared[idx_a];
                float b = shared[idx_b];
                if ((a > b) == ascending) {
                    shared[idx_a] = b;
                    shared[idx_b] = a;
                }
            }
            __syncthreads();
        }
    }
    // Write back the sorted tile
    if (gid < n) data[gid] = shared[conflict_free_idx(tid)];
}
Sort key-value pairs together by always moving the value with its key. Essential for sorting indices or associated data.
__global__ void bitonic_sort_kv(float* keys, int* values, int n) {
    __shared__ float sh_keys[SHARED_SIZE];
    __shared__ int sh_vals[SHARED_SIZE];
    int tid = threadIdx.x;
    int gid = blockIdx.x * SHARED_SIZE + tid;
    // Load key-value pairs
    sh_keys[tid] = (gid < n) ? keys[gid] : FLT_MAX;
    sh_vals[tid] = (gid < n) ? values[gid] : -1;
    __syncthreads();
    // Sort phases
    for (int size = 2; size <= SHARED_SIZE; size *= 2) {
        for (int stride = size / 2; stride > 0; stride /= 2) {
            __syncthreads();
            int pos = tid;
            int partner = pos ^ stride;
            if (partner > pos) {
                bool ascending = ((pos & size) == 0);
                float key_a = sh_keys[pos];
                float key_b = sh_keys[partner];
                if ((key_a > key_b) == ascending) {
                    // Swap both key and value
                    sh_keys[pos] = key_b;
                    sh_keys[partner] = key_a;
                    int tmp = sh_vals[pos];
                    sh_vals[pos] = sh_vals[partner];
                    sh_vals[partner] = tmp;
                }
            }
        }
    }
    __syncthreads();
    // Store sorted pairs
    if (gid < n) {
        keys[gid] = sh_keys[tid];
        values[gid] = sh_vals[tid];
    }
}
Process multiple elements per thread using vector types and SIMD-style comparisons.
// Sort 4 elements per thread in registers
__device__ void sort4(float& a, float& b, float& c, float& d) {
    // Compare-exchange network for 4 elements
    if (a > b) { float t = a; a = b; b = t; }
    if (c > d) { float t = c; c = d; d = t; }
    if (a > c) { float t = a; a = c; c = t; }
    if (b > d) { float t = b; b = d; d = t; }
    if (b > c) { float t = b; b = c; c = t; }
}

__global__ void bitonic_vectorized(float4* data, int n4) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n4) {
        float4 v = data[tid];
        // Sort within float4
        sort4(v.x, v.y, v.z, v.w);
        data[tid] = v;
    }
    // Continue with cross-float4 merging...
}
Naive implementation launches too many kernels and uses only global memory.
// Naive bitonic sort - all phases in global memory
__global__ void bitonic_naive(float* data, int n, int stage, int substage) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pair_distance = 1 << (stage - substage);
    int block_size = 1 << stage;
    int pos = tid;
    int partner = pos ^ pair_distance;
    // PROBLEM 1: Every comparison goes through global memory
    // PROBLEM 2: Partner accesses are strided and poorly coalesced for large pair_distance
    // PROBLEM 3: Many kernel launches (O(log^2 n) phases)
    if (partner > pos && partner < n) {
        bool ascending = ((pos / block_size) % 2) == 0;
        float a = data[pos];       // Global load
        float b = data[partner];   // Global load (possibly non-coalesced)
        if ((a > b) == ascending) {
            data[pos] = b;         // Global store
            data[partner] = a;     // Global store
        }
    }
}

// Host: O(log^2 n) kernel launches
void sort_naive(float* d_data, int n) {    // n assumed to be a power of two
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    int num_stages = 0;
    for (int s = n; s > 1; s >>= 1) num_stages++;
    for (int stage = 1; stage <= num_stages; stage++) {
        for (int substage = 1; substage <= stage; substage++) {
            bitonic_naive<<<blocks, threads>>>(d_data, n, stage, substage);
        }
    }
}
Hybrid sort maximizes shared memory usage and minimizes global memory phases.
#define SHARED_SIZE 2048
#define THREADS_PER_BLOCK 512      // 4 elements per thread

// Fully shared memory sort for small arrays / final stages
template<bool ASCENDING>
__global__ void bitonic_sort_block(float* __restrict__ data, int n) {
    __shared__ float shared[SHARED_SIZE + SHARED_SIZE/32];  // Padded
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    // Vectorized load (4 elements per thread)
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = tid * 4 + i;
        int global_idx = bid * SHARED_SIZE + idx;
        int sh_idx = idx + (idx >> 5);  // Conflict-free
        shared[sh_idx] = (global_idx < n) ? data[global_idx] : FLT_MAX;
    }
    __syncthreads();
    // Register-level sort of each thread's 4 elements
    float reg[4];
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = tid * 4 + i;
        reg[i] = shared[idx + (idx >> 5)];
    }
    sort4(reg[0], reg[1], reg[2], reg[3]);
    // Reverse every other 4-run so each 8-element window is bitonic
    // before the merge phases below start at size = 8
    if (tid & 1) {
        float t = reg[0]; reg[0] = reg[3]; reg[3] = t;
        t = reg[1]; reg[1] = reg[2]; reg[2] = t;
    }
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = tid * 4 + i;
        shared[idx + (idx >> 5)] = reg[i];
    }
    __syncthreads();
    // Bitonic merge phases
    for (int size = 8; size <= SHARED_SIZE; size *= 2) {
        for (int stride = size / 2; stride > 0; stride /= 2) {
            #pragma unroll
            for (int i = 0; i < 4; i++) {
                int pos = tid * 4 + i;
                int partner = pos ^ stride;
                if (partner > pos) {
                    int sh_pos = pos + (pos >> 5);
                    int sh_partner = partner + (partner >> 5);
                    bool asc = ASCENDING ? ((pos & size) == 0)
                                         : ((pos & size) != 0);
                    float a = shared[sh_pos];
                    float b = shared[sh_partner];
                    if ((a > b) == asc) {
                        shared[sh_pos] = b;
                        shared[sh_partner] = a;
                    }
                }
            }
            __syncthreads();
        }
    }
    // Vectorized store
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        int idx = tid * 4 + i;
        int global_idx = bid * SHARED_SIZE + idx;
        if (global_idx < n) {
            data[global_idx] = shared[idx + (idx >> 5)];
        }
    }
}

// Global merge for cross-block sorting
__global__ void bitonic_merge_blocks(float* data, int n,
                                     int stage, int substage) {
    // Coalesced global memory merge
    // ... (as shown earlier)
}
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Sort Time (1M floats) | 45 ms | 3.2 ms | 14x faster |
| Sort Time (64K floats) | 8.5 ms | 0.4 ms | 21x faster |
| Kernel Launches (1M elements) | 400 launches | 20 launches | 20x fewer |
| vs CUB RadixSort (10M) | 320 ms | 45 ms (CUB: 12 ms) | Radix 3.7x faster |
Use bitonic sort for: (1) small arrays (<1M elements), (2) key-value sorting with complex value types, (3) partial sorting (top-K), (4) when you need deterministic, data-independent execution time, or (5) as a building block (sorting within blocks before merging). Use radix sort for large arrays of integers or floats, where its linear work per element (for fixed-width keys) wins.
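For the radix-sort path referenced in the table above, CUB's DeviceRadixSort is the usual choice. A minimal sketch, assuming d_keys_in and d_keys_out are device buffers of num_items floats that you have already allocated and filled:
#include <cub/cub.cuh>

void radix_sort_floats(float* d_keys_in, float* d_keys_out, int num_items) {
    void*  d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    // First call only computes the required temporary storage size
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_keys_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Second call performs the sort
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                   d_keys_in, d_keys_out, num_items);
    cudaFree(d_temp_storage);
}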
Pad to the next power of 2 with maximum values (for ascending sort) or minimum values (for descending). After sorting, the padding elements will be at the end and can be ignored. Alternatively, use odd-even merge sort which handles arbitrary sizes more gracefully.
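A possible shape for that padding step, including the next_power_of_2 helper referenced in the host code above (pad_with_max is an illustrative name; any 32-bit fill works, since cudaMemset can only write a repeated byte):
#include <float.h>                    // FLT_MAX
#include <thrust/fill.h>
#include <thrust/execution_policy.h>

// Round n up to the next power of two
static int next_power_of_2(int n) {
    int p = 1;
    while (p < n) p <<= 1;
    return p;
}

// Fill the tail [n, padded_n) with FLT_MAX so it sorts to the end
void pad_with_max(float* d_data, int n, int padded_n) {
    if (padded_n > n)
        thrust::fill(thrust::device, d_data + n, d_data + padded_n, FLT_MAX);
}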
Standard bitonic sort is not stable (equal elements may be reordered). To make it stable, include the original index as a secondary sort key: sort by (key, index) pairs. This ensures equal keys maintain their original relative order.
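One way to express that tie-break, sketched as a helper the compare-exchange in the key-value kernel above could call (the function name is illustrative):
// Decide whether the pair (key_a, idx_a) at the lower position and
// (key_b, idx_b) at the partner position should be swapped.
// Equal keys are ordered by original index, which makes the sort stable.
__device__ bool should_swap_stable(float key_a, int idx_a,
                                   float key_b, int idx_b,
                                   bool ascending) {
    if (key_a == key_b)
        return idx_a > idx_b;          // preserve original relative order
    return ascending ? (key_a > key_b) : (key_a < key_b);
}
If the values array in bitonic_sort_kv is initialized to 0..n-1, sh_vals already carries the original index this comparator needs.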
Flip the ascending/descending direction in the compare-exchange operations. Instead of swapping when (a > b) == ascending, use (a < b) == ascending, or simply reverse the direction flag computation. Alternatively, negate keys before sorting and negate back after.
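The negate-keys alternative is a quick way to get a descending sort without touching the kernels; a sketch assuming finite keys and the bitonic_sort host routine above:
// Negate keys in place (assumes finite values; NaNs/infinities need care)
__global__ void negate_keys(float* keys, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) keys[i] = -keys[i];
}

void bitonic_sort_descending(float* d_keys, int n) {
    int threads = 256, blocks = (n + threads - 1) / threads;
    negate_keys<<<blocks, threads>>>(d_keys, n);
    bitonic_sort(d_keys, n);                 // ascending sort from earlier
    negate_keys<<<blocks, threads>>>(d_keys, n);
}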
Related topics: sorting as a building block for selection and top-K, radix sort (built on parallel scan), and sort-based histogramming as an alternative approach.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.