Cosine similarity measures the cosine of the angle between two vectors, making it essential for embedding comparison, semantic search, and recommendation systems. An efficient CUDA implementation fuses the dot product with the norm computations and supports batched pairwise similarity matrices.
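For vectors $a, b \in \mathbb{R}^n$:

$$
\text{sim}(a, b) = \frac{a \cdot b}{\lVert a \rVert_2 \, \lVert b \rVert_2}
$$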
Compute the dot product and both norms in a single pass.
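The kernel relies on a `blockReduceSum` helper that the original snippet doesn't show. A minimal sketch using warp shuffles, assuming the block size is a multiple of 32 (and at most 1024 threads):

```cuda
// Reduce within a warp using shuffle intrinsics.
__inline__ __device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

// Reduce across the block: per-warp sums go through shared memory,
// then the first warp reduces them. Thread 0 ends up with the total.
__inline__ __device__ float blockReduceSum(float val) {
    __shared__ float shared[32];          // one slot per warp
    int lane = threadIdx.x % 32;
    int wid  = threadIdx.x / 32;
    val = warpReduceSum(val);
    __syncthreads();                      // protect shared[] across back-to-back calls
    if (lane == 0) shared[wid] = val;
    __syncthreads();
    int n_warps = (blockDim.x + 31) / 32;
    val = (threadIdx.x < n_warps) ? shared[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);
    return val;
}
```

With a helper like this in place, the fused kernel needs only one launch: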
```cuda
__global__ void cosine_sim_fused(const float* __restrict__ a,
                                 const float* __restrict__ b,
                                 float* sim, int n) {
    float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
    // Block-stride loop: each thread accumulates partial sums.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        float ai = a[i], bi = b[i];
        dot    += ai * bi;
        norm_a += ai * ai;
        norm_b += bi * bi;
    }
    // One block-wide reduction per accumulator; thread 0 holds the totals.
    dot    = blockReduceSum(dot);
    norm_a = blockReduceSum(norm_a);
    norm_b = blockReduceSum(norm_b);
    if (threadIdx.x == 0) {
        float denom = sqrtf(norm_a) * sqrtf(norm_b);
        *sim = (denom > 1e-8f) ? dot / denom : 0.0f;
    }
}
```
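Because the whole reduction happens inside one block, the kernel is launched with a single block per vector pair. A minimal usage sketch (`d_a`, `d_b`, and `d_sim` are assumed pre-allocated device buffers):

```cuda
// One block cooperatively reduces the whole pair of vectors.
cosine_sim_fused<<<1, 256>>>(d_a, d_b, d_sim, n);

float sim;
cudaMemcpy(&sim, d_sim, sizeof(float), cudaMemcpyDeviceToHost);
```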
Compare the naive approach: three separate kernels and extra memory for the normalized vectors.

```cuda
// Naive approach: three launches plus two temporary buffers
// (a_norm, b_norm) holding normalized copies of the inputs.
void cosine_sim_naive(float* a, float* b, float* sim, int n) {
    l2_normalize<<<...>>>(a, a_norm, n);          // normalize a into a_norm
    l2_normalize<<<...>>>(b, b_norm, n);          // normalize b into b_norm
    dot_product<<<...>>>(a_norm, b_norm, sim, n); // dot of the normalized vectors
}
```

Pre-compute norms, then compute all pairwise similarities:
```cuda
// Compute the N x M similarity matrix between query and database embeddings.
// Norms arrive pre-computed, so each thread only accumulates a dot product.
__global__ void cosine_sim_matrix(const float* __restrict__ queries,
                                  const float* __restrict__ database,
                                  float* sims, const float* query_norms,
                                  const float* db_norms, int N, int M, int D) {
    int query_idx = blockIdx.y;                          // one query per grid row
    int db_idx = blockIdx.x * blockDim.x + threadIdx.x;  // one db vector per thread
    if (db_idx >= M) return;
    float dot = 0.0f;
    for (int d = 0; d < D; d++) {
        dot += queries[query_idx * D + d] * database[db_idx * D + d];
    }
    float denom = query_norms[query_idx] * db_norms[db_idx];
    sims[query_idx * M + db_idx] = (denom > 1e-8f) ? dot / denom : 0.0f;
}
```
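The batched wrapper below also calls a `compute_l2_norms` kernel that isn't shown. A minimal one-block-per-row sketch, reusing the `blockReduceSum` helper from above (the body is an assumption; only the name and argument order come from the call site):

```cuda
// One block per row: threads accumulate squared elements of the row,
// a block reduction sums them, and thread 0 writes the L2 norm.
__global__ void compute_l2_norms(const float* __restrict__ vecs,
                                 float* norms, int num_rows, int D) {
    int row = blockIdx.x;
    if (row >= num_rows) return;
    float sq = 0.0f;
    for (int d = threadIdx.x; d < D; d += blockDim.x)
        sq += vecs[row * D + d] * vecs[row * D + d];
    sq = blockReduceSum(sq);
    if (threadIdx.x == 0) norms[row] = sqrtf(sq);
}
```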
```cuda
// Pre-compute norms once, reuse them for every query.
// q_norms (size N) and db_norms (size M) are pre-allocated device buffers.
void batch_cosine_sim(float* Q, float* DB, float* sims, int N, int M, int D) {
    compute_l2_norms<<<...>>>(Q, q_norms, N, D);
    compute_l2_norms<<<...>>>(DB, db_norms, M, D);
    // Round the grid up so every database vector is covered
    // (plain M/256 would drop the tail when M isn't a multiple of 256).
    cosine_sim_matrix<<<dim3((M + 255) / 256, N), 256>>>(
        Q, DB, sims, q_norms, db_norms, N, M, D);
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| 1M vector search (768-dim) | 45ms | 8ms | 5.6x faster |
| Norm computation | Recomputed per pair | Cached and reused | O(N+M) vs O(N·M) |
Use cosine similarity when magnitude should not affect the score (e.g., text embeddings); use the raw dot product when magnitude is meaningful (e.g., models trained with a dot-product loss). Always check for zero norms and return 0 similarity; in practice, adding a small epsilon to the denominator avoids the branch entirely.
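For example, the branch in the fused kernel could be replaced with (the epsilon value is illustrative):

```cuda
// Epsilon keeps the division well-defined even for zero-norm inputs;
// a zero vector has dot == 0, so the result is still 0.
*sim = dot / (sqrtf(norm_a) * sqrtf(norm_b) + 1e-8f);
```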
- Cosine similarity is the dot product of L2-normalized vectors.
- Batched cosine similarity is therefore a normalized GEMM.
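Since the batched case is a plain GEMM once the rows are normalized, it can also be handed to cuBLAS. A sketch under those assumptions (the wrapper name is hypothetical; `Q` is N x D row-major, `DB` is M x D row-major, both already L2-normalized, all pointers on device):

```cuda
#include <cublas_v2.h>

// sims (row-major N x M) = Q * DB^T.
// cuBLAS is column-major, so compute sims^T = DB * Q^T; the result
// laid out column-major as M x N is exactly the row-major N x M sims.
void batch_cosine_sim_gemm(cublasHandle_t handle, const float* Q,
                           const float* DB, float* sims,
                           int N, int M, int D) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                /*m=*/M, /*n=*/N, /*k=*/D,
                &alpha, DB, /*lda=*/D, Q, /*ldb=*/D,
                &beta, sims, /*ldc=*/M);
}
```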
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.