Cumulative Product (cumprod) in CUDA
Cumulative product computes running products using the same scan algorithm as cumsum, but with multiplication as the combining operator. Watch for numerical overflow/underflow when multiplying many values: for stability, convert to logarithms, take a cumulative sum, then exponentiate.
// cumprod(x) = exp(cumsum(log(x)))
// Numerically stable cumulative product: work in the log domain so many
// multiplications become additions. Requires all x[i] > 0 — see the note on
// zeros below.

// Unary functors for the log/exp transforms (device lambdas would need
// --extended-lambda; plain functors work everywhere).
struct LogFunctor {
    __host__ __device__ float operator()(float v) const { return logf(v); }
};
struct ExpFunctor {
    __host__ __device__ float operator()(float v) const { return expf(v); }
};

// x, y: device pointers of length n (may not alias).
// Fix vs. original: the original wrote into an undeclared `temp` buffer and
// used undefined functor types. No scratch buffer is needed — every stage can
// stream through y in place. Raw device pointers are wrapped in
// thrust::device_ptr so thrust dispatches to the device backend.
void cumprod_stable(float* x, float* y, int n) {
    if (n <= 0) return;  // empty input: nothing to do
    thrust::device_ptr<float> xp(x);
    thrust::device_ptr<float> yp(y);
    // 1. log transform: y = log(x)
    thrust::transform(xp, xp + n, yp, LogFunctor());
    // 2. cumsum in log domain (in place)
    thrust::inclusive_scan(yp, yp + n, yp);
    // 3. exp transform back: y = exp(y)
    thrust::transform(yp, yp + n, yp, ExpFunctor());
}
Sequential, prone to overflow.
// Naive sequential cumprod: a single thread computes the whole running
// product. Deliberately slow reference implementation — intended for a
// <<<1, 1>>> launch.
// Fix vs. original: the guard checked only threadIdx.x, so with a multi-block
// launch every block's thread 0 would race on y; also guard n > 0 so an empty
// input never writes y[0] out of bounds.
__global__ void cumprod_naive(float* x, float* y, int n) {
    if (threadIdx.x == 0 && blockIdx.x == 0 && n > 0) {
        y[0] = x[0];
        for (int i = 1; i < n; i++)
            y[i] = y[i-1] * x[i]; // running product may overflow to inf
    }
}
CUB scan with custom multiply operator.
#include <cub/cub.cuh>
// Binary multiply functor used as the scan operator.
struct MultOp {
    __device__ float operator()(float a, float b) const { return a * b; }
};
// Inclusive product-scan of x into y (device pointers, length n) using CUB's
// work-efficient parallel scan. Standard two-phase CUB pattern: the first call
// (null temp storage) only queries the required scratch size; the second call
// performs the scan.
// Fixes vs. original: d_temp was leaked (no cudaFree), `init` was an unused
// local, d_temp was uninitialized, and n <= 0 was not handled.
void cumprod_opt(float* x, float* y, int n) {
    if (n <= 0) return;  // nothing to scan
    size_t temp_bytes = 0;
    MultOp mult_op;
    // Phase 1: size query — d_temp_storage == nullptr means "report bytes only".
    cub::DeviceScan::InclusiveScan(
        nullptr, temp_bytes, x, y, mult_op, n);
    void* d_temp = nullptr;
    if (cudaMalloc(&d_temp, temp_bytes) != cudaSuccess) return;  // allocation failed
    // Phase 2: run the scan with the allocated scratch space.
    cub::DeviceScan::InclusiveScan(
        d_temp, temp_bytes, x, y, mult_op, n);
    cudaFree(d_temp);  // release scratch — original leaked this
}
| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Throughput | 0.4 GB/s | 280 GB/s | 700x faster |
Caveat: log(0) is undefined, and a zero anywhere in the input propagates to every subsequent product. For the log-domain approach, either use log(x + epsilon) to avoid the singularity or track zero positions separately and zero-fill the tail of the output.
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.