Split divides a tensor into multiple tensors along a dimension. For a contiguous tensor split along its first dimension, this is a zero-copy view operation: each chunk is just a pointer offset into the same buffer.
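The snippets below assume a minimal tensor type with a raw data pointer, a shape, and per-dimension strides. The struct and the compute_shape helper shown here are a sketch written to match the code that follows, not an API from any particular library; real tensor types carry more state.

```cpp
#include <vector>

// Minimal tensor sketch (assumed, not a real library type): a raw pointer
// plus shape and element strides, row-major layout.
struct Tensor {
    float* data;                 // pointer to the first element
    std::vector<int> shape;      // size of each dimension
    std::vector<int> strides;    // element stride of each dimension

    int numel() const {
        int n = 1;
        for (int s : shape) n *= s;
        return n;
    }

    // Row-major contiguity: the stride of each dim equals the product
    // of the sizes of all later dims.
    bool is_contiguous() const {
        int expected = 1;
        for (int d = (int)shape.size() - 1; d >= 0; --d) {
            if (strides[d] != expected) return false;
            expected *= shape[d];
        }
        return true;
    }
};

// Shape of a chunk: identical to the parent except `size` along `dim`.
std::vector<int> compute_shape(const Tensor& t, int dim, int size) {
    std::vector<int> shape = t.shape;
    shape[dim] = size;
    return shape;
}
```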
Use pointer offsets instead of copying.
```cpp
#include <algorithm>
#include <vector>

std::vector<Tensor> split(Tensor& t, int chunks, int dim = 0) {
    int dim_size = t.shape[dim];
    int chunk_size = (dim_size + chunks - 1) / chunks;   // ceil division
    std::vector<Tensor> result;
    int offset = 0;
    for (int i = 0; i < chunks; i++) {
        int size = std::min(chunk_size, dim_size - offset);
        if (size <= 0) break;   // dimension exhausted: return fewer chunks
        // For dim=0 on a contiguous tensor, a chunk is just a pointer offset
        // into the same buffer: no allocation, no copy.
        if (dim == 0 && t.is_contiguous()) {
            int elements_per_slice = t.numel() / t.shape[0];
            result.push_back({
                t.data + offset * elements_per_slice,
                compute_shape(t, dim, size),
                t.strides
            });
        }
        // Splitting along other dims, or non-contiguous tensors, needs strided
        // views (or a copy); that path is omitted here.
        offset += size;
    }
    return result;
}
```
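As a quick illustration (a sketch building on the assumed Tensor type above, not code from the original post), the returned chunks alias the parent's buffer:

```cpp
#include <cassert>
#include <vector>

int main() {
    // 6 x 4 row-major buffer; the same logic applies to device memory.
    std::vector<float> buf(24);
    for (int i = 0; i < 24; ++i) buf[i] = (float)i;

    Tensor t{buf.data(), {6, 4}, {4, 1}};        // contiguous: strides {4, 1}

    std::vector<Tensor> chunks = split(t, 3);    // three 2 x 4 views

    // Each view points into the original buffer; nothing was copied.
    assert(chunks.size() == 3);
    assert(chunks[1].data == t.data + 2 * 4);    // skips the first 2 rows
    return 0;
}
```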

Unnecessary copies.

```cpp
#include <cuda_runtime.h>

// Naive split: allocate a fresh buffer per chunk and copy device-to-device.
void split_naive(float* input, float** outputs, int n, int chunk_size) {
    for (int i = 0; i < n / chunk_size; i++) {
        cudaMalloc(&outputs[i], chunk_size * sizeof(float));
        cudaMemcpy(outputs[i], input + i * chunk_size,
                   chunk_size * sizeof(float), cudaMemcpyDeviceToDevice);
    }
}
```

Pointer arithmetic only.

```cpp
#include <algorithm>
#include <vector>

// For a dim=0 split of a contiguous tensor, each chunk is just a view:
// a pointer into the original buffer plus a length.
struct TensorView { float* data; int size; };

std::vector<TensorView> split_dim0(float* data, int total, int chunks) {
    std::vector<TensorView> result;
    int chunk_size = (total + chunks - 1) / chunks;   // ceil division
    for (int i = 0; i < chunks; i++) {
        int offset = i * chunk_size;
        int size = std::min(chunk_size, total - offset);
        if (size > 0) {
            result.push_back({data + offset, size});
        }
    }
    return result;   // No data copies!
}
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Split 100M elements into 10 chunks | 40 ms | ~0 µs | Instant |
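
The naive figure can be reproduced roughly with a CUDA-event timing harness along these lines; the sizes and the harness itself are illustrative, not the original benchmark, and the view-based split needs no timing because it launches no device work at all.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int n = 100 * 1000 * 1000;   // 100M floats
    const int chunks = 10;
    const int chunk_size = n / chunks;

    float* input = nullptr;
    cudaMalloc(&input, n * sizeof(float));
    float* outputs[chunks];

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Time the copy-based split from above.
    cudaEventRecord(start);
    split_naive(input, outputs, n, chunk_size);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("copy-based split: %.2f ms\n", ms);

    for (int i = 0; i < chunks; ++i) cudaFree(outputs[i]);
    cudaFree(input);
    return 0;
}
```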
The difference between the two related ops: `split` lets you specify each chunk's size explicitly, while `chunk` takes only the number of chunks and divides the tensor into (roughly) equal sizes.
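A sketch of the two calling conventions, reusing the TensorView type from above; the names split_with_sizes and chunk are chosen here for illustration (they mirror common tensor-library naming) rather than taken from the original code:

```cpp
#include <vector>

// split: the caller lists the size of every chunk explicitly.
std::vector<TensorView> split_with_sizes(float* data,
                                         const std::vector<int>& sizes) {
    std::vector<TensorView> result;
    int offset = 0;
    for (int size : sizes) {
        result.push_back({data + offset, size});
        offset += size;
    }
    return result;
}

// chunk: the caller gives only the number of chunks; sizes come out (roughly) equal.
std::vector<TensorView> chunk(float* data, int total, int chunks) {
    return split_dim0(data, total, chunks);   // same ceil-divide logic as above
}
```

For a 10-element buffer, split_with_sizes(d, {3, 3, 4}) yields chunks of 3, 3, and 4 elements, while chunk(d, 10, 3) yields 4, 4, and 2.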
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.