Flatten converts a multi-dimensional tensor to 1D, typically before feeding it to linear layers. For contiguous tensors it is a free view operation. The common pattern is conv → flatten → linear.

For contiguous tensors, only the shape metadata changes:
```cpp
Tensor flatten(Tensor& t, int start_dim = 0, int end_dim = -1) {
    if (end_dim < 0) end_dim = t.ndim - 1;

    // Compute the collapsed size over [start_dim, end_dim]
    int flat_size = 1;
    for (int d = start_dim; d <= end_dim; d++)
        flat_size *= t.shape[d];

    // Build the new shape: leading dims, flat_size, trailing dims
    std::vector<int> new_shape;
    for (int d = 0; d < start_dim; d++) new_shape.push_back(t.shape[d]);
    new_shape.push_back(flat_size);
    for (int d = end_dim + 1; d < t.ndim; d++) new_shape.push_back(t.shape[d]);

    if (t.is_contiguous()) {
        return Tensor(t.data, new_shape);   // Zero-copy view
    } else {
        Tensor c = t.contiguous();          // Non-contiguous: materialize a copy first
        return flatten(c, start_dim, end_dim);
    }
}
```
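To make the conv → flatten → linear pattern concrete, here is a small shape-only sketch; the batch size, channel count, and 7×7 feature map are made-up example values, not from the original:

```cpp
#include <cstdio>
#include <vector>

// Shape arithmetic for the conv -> flatten -> linear pattern.
// Example sizes are hypothetical: batch 32, 64 channels, 7x7 feature map.
int main() {
    std::vector<int> conv_out = {32, 64, 7, 7};   // conv output shape (NCHW)

    // flatten(start_dim = 1): keep dim 0 (the batch), collapse the rest
    int flat_size = 1;
    for (size_t d = 1; d < conv_out.size(); d++) flat_size *= conv_out[d];
    std::vector<int> flattened = {conv_out[0], flat_size};   // {32, 3136}

    std::printf("flattened shape: [%d, %d]\n", flattened[0], flattened[1]);
    // A linear layer consuming this view expects a weight of shape [3136, out_features].
    return 0;
}
```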
Unnecessary copy:

```cpp
// Naive: a device-to-device memcpy of the whole tensor, even though the
// flattened result could share the original buffer.
void flatten_naive(float* in, float* out, int n) {
    cudaMemcpy(out, in, n * sizeof(float), cudaMemcpyDeviceToDevice);
}
```

Framework handles the view vs. copy decision:
```cpp
// In practice, use the framework's view operation.
// PyTorch: x.flatten(1)  # keep the batch dim
// This is just a metadata change for contiguous tensors.
// If implementing manually:
struct Tensor {
    float* data;
    std::vector<int> shape;

    Tensor flatten(int start = 0) {
        // Collapse all dims from `start` onward into one
        int flat_size = 1;
        for (size_t i = start; i < shape.size(); i++)
            flat_size *= shape[i];
        std::vector<int> new_shape(shape.begin(), shape.begin() + start);
        new_shape.push_back(flat_size);
        return {data, new_shape};   // Same data pointer -- zero-copy view
    }
};
```

| Metric | Naive | Optimized | Improvement |
|---|---|---|---|
| Latency (contiguous) | 50 μs | ~0 μs (metadata only) | Copy eliminated |
The naive copy and the view produce an equivalent 1D result; the view simply skips the transfer. flatten(1) preserves the batch dim, the common pattern for CNN feature maps feeding a linear layer.
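As a usage sketch of the manual struct above (the buffer size and shapes are illustrative assumptions), flatten(1) keeps the batch dimension and returns a view over the same pointer:

```cpp
#include <cassert>

// Assumes the minimal Tensor struct defined above is in scope.
void flatten_example() {
    static float buf[32 * 64 * 7 * 7];   // hypothetical activation buffer
    Tensor x{buf, {32, 64, 7, 7}};       // conv output, NCHW

    Tensor flat = x.flatten(1);          // shape becomes {32, 3136}: batch dim kept
    assert(flat.shape.size() == 2);
    assert(flat.data == x.data);         // same buffer -- no copy was made
}
```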
Ready to optimize your CUDA code? Download RightNow AI and get real-time performance analysis for your kernels.