Loading...
Finding unique elements removes duplicates from a tensor. Two main approaches: sort then adjacent-different (stable, O(n log n)), or hash set (O(n) but uses more memory).
Sort then remove adjacent duplicates.
int find_unique(int* data, int* unique_out, int n) {
// 1. Sort
thrust::device_vector<int> sorted(data, data + n);
thrust::sort(sorted.begin(), sorted.end());
// 2. Remove adjacent duplicates
auto new_end = thrust::unique(sorted.begin(), sorted.end());
int unique_count = new_end - sorted.begin();
// 3. Copy result
thrust::copy(sorted.begin(), new_end, unique_out);
return unique_count;
}O(n²) nested loops.
// Slow: O(n²) for each element check
int unique_naive(int* data, int* out, int n) {
int count = 0;
for (int i = 0; i < n; i++) {
bool found = false;
for (int j = 0; j < count; j++) {
if (out[j] == data[i]) { found = true; break; }
}
if (!found) out[count++] = data[i];
}
return count;
}CUB provides optimized select-unique.
#include <cub/cub.cuh>
int unique_cub(int* data, int* unique_out, int n) {
// Sort first
thrust::sort(thrust::device, data, data + n);
// Use CUB for unique
int* d_num_selected;
cudaMalloc(&d_num_selected, sizeof(int));
size_t temp_bytes = 0;
cub::DeviceSelect::Unique(nullptr, temp_bytes, data, unique_out,
d_num_selected, n);
void* d_temp;
cudaMalloc(&d_temp, temp_bytes);
cub::DeviceSelect::Unique(d_temp, temp_bytes, data, unique_out,
d_num_selected, n);
int num_unique;
cudaMemcpy(&num_unique, d_num_selected, sizeof(int), D2H);
return num_unique;
}| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Unique (10M, 50% dups) | Timeout | 8ms | >1000x |
Track original indices, sort by (value, index), unique on value, sort result by original index.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.