Stack combines tensors along a new dimension (unlike concat, which joins along an existing dimension). All input tensors must have the same shape. It is commonly used for batching individual samples.
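As a quick reference, here is a CPU-side sketch added for illustration (`stack_cpu` is not from the original post): stacking same-sized inputs just lays each one out as a contiguous slice of the output, which is what creates the new leading batch dimension.

```cpp
// Illustrative CPU reference (not from the original post): stack lays each
// same-sized input out as one contiguous slice of the output, producing a new
// leading batch dimension [n_tensors, tensor_size].
#include <cstring>

void stack_cpu(float** inputs, float* output, int n_tensors, int tensor_size) {
    for (int i = 0; i < n_tensors; i++) {
        // Input i fills output[i * tensor_size .. (i + 1) * tensor_size).
        std::memcpy(output + (size_t)i * tensor_size, inputs[i],
                    tensor_size * sizeof(float));
    }
}
```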
Copy all tensors in a single kernel launch:

```cpp
__global__ void stack_kernel(float** inputs, float* output,
                             int n_tensors, int tensor_size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int total = n_tensors * tensor_size;
    if (tid >= total) return;

    // Map the flat output index to (input tensor, element within it).
    int tensor_idx = tid / tensor_size;
    int elem_idx = tid % tensor_size;
    output[tid] = inputs[tensor_idx][elem_idx];
}
```
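For example (illustrative numbers, added here for clarity): with `tensor_size = 10000`, flat index 25003 maps to input tensor 2, element 5003. A CPU-side sanity check of the same index math:

```cpp
// Illustrative only: the same index math as the kernel, checked on the CPU.
#include <cstdio>

int main() {
    const int tensor_size = 10000;       // elements per input tensor
    const int tid = 25003;               // an example flat output index
    int tensor_idx = tid / tensor_size;  // -> 2 (third input tensor)
    int elem_idx = tid % tensor_size;    // -> 5003 (offset inside it)
    printf("tid %d -> input %d, element %d\n", tid, tensor_idx, elem_idx);
    return 0;
}
```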
N separate memcpy calls:

```cpp
void stack_naive(float** inputs, float* output, int n, int size) {
    // One device-to-device copy per input tensor.
    for (int i = 0; i < n; i++) {
        cudaMemcpy(output + i * size, inputs[i],
                   size * sizeof(float), cudaMemcpyDeviceToDevice);
    }
}
```
Single kernel launch for all tensors:

```cpp
void stack_opt(float** inputs, float* output, int n, int size) {
    // Copy the host array of device pointers to the device.
    float** d_inputs;
    cudaMalloc(&d_inputs, n * sizeof(float*));
    cudaMemcpy(d_inputs, inputs, n * sizeof(float*), cudaMemcpyHostToDevice);

    // One thread per output element.
    int total = n * size;
    int blocks = (total + 255) / 256;
    stack_kernel<<<blocks, 256>>>(d_inputs, output, n, size);

    cudaFree(d_inputs);
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Latency (100 tensors × 10K elements) | 3.2 ms | 0.15 ms | 21x faster |
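A minimal usage sketch, assuming the `stack_opt` defined above is in scope; the tensor count and size are chosen to mirror the benchmark row above, and the rest of the setup is an assumption, not from the original post.

```cpp
// Usage sketch (assumed setup, not from the original): stack 100 device
// tensors of 10K floats each into one [100, 10000] output.
#include <cuda_runtime.h>
#include <vector>

void stack_opt(float** inputs, float* output, int n, int size);  // defined above

int main() {
    const int n = 100, size = 10000;

    // Allocate the inputs and the stacked output on the device.
    std::vector<float*> inputs(n);
    for (int i = 0; i < n; i++)
        cudaMalloc(&inputs[i], size * sizeof(float));
    float* output;
    cudaMalloc(&output, (size_t)n * size * sizeof(float));

    // stack_opt expects a host array of device pointers.
    stack_opt(inputs.data(), output, n, size);
    cudaDeviceSynchronize();

    for (int i = 0; i < n; i++) cudaFree(inputs[i]);
    cudaFree(output);
    return 0;
}
```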
Stack: same-shaped tensors, creates a new batch dim. Concat: tensors may differ along the joined dim, extends an existing dim.
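For contrast, a minimal concat sketch (my illustration, not from the original; `concat_naive` and the `sizes` array are assumed names): because inputs can differ in length along the joined dim, each copy lands at a running offset instead of `i * size`.

```cpp
// Contrast sketch (illustrative; concat_naive and sizes[] are assumed names):
// inputs may differ in length, so each copy lands at a running offset.
void concat_naive(float** inputs, float* output, const int* sizes, int n) {
    size_t offset = 0;
    for (int i = 0; i < n; i++) {
        cudaMemcpy(output + offset, inputs[i],
                   sizes[i] * sizeof(float), cudaMemcpyDeviceToDevice);
        offset += sizes[i];
    }
}
```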
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.