/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/wrapper/operations/copy.hpp File Reference#
copy.hpp File Reference
#include "ck/wrapper/utils/tensor_utils.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
Go to the source code of this file.
Functions | |
template<typename DimAccessOrderTuple , index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType , typename DstTensorType > | |
__device__ void | copy (const SrcTensorType &src_tensor, DstTensorType &dst_tensor) |
Perform optimized copy between two tensor partitions (threadwise copy). Tensors must have the same size. More... | |
template<typename SrcTensorType , typename DstTensorType > | |
__host__ __device__ void | copy (const SrcTensorType &src_tensor, DstTensorType &dst_tensor) |
Perform generic copy between two tensor partitions (threadwise copy). Tensors must have the same size. More... | |
template<typename DimAccessOrderTuple , index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType , typename DstTensorType , typename ThreadShape , typename ThreadUnrolledDesc > | |
__device__ void | blockwise_copy (const SrcTensorType &src_tensor, DstTensorType &dst_tensor, [[maybe_unused]] const Layout< ThreadShape, ThreadUnrolledDesc > &thread_layout) |
Perform optimized blockwise copy between two tensors. Tensors must have the same size. More... | |
Function Documentation
◆ blockwise_copy()
template<typename DimAccessOrderTuple , index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType , typename DstTensorType , typename ThreadShape , typename ThreadUnrolledDesc >
__device__ void blockwise_copy | ( | const SrcTensorType & | src_tensor, |
DstTensorType & | dst_tensor, | ||
[[maybe_unused]] const Layout< ThreadShape, ThreadUnrolledDesc > & | thread_layout | ||
) |
Perform optimized blockwise copy between two tensors. Tensors must have the same size.
- Note
- Currently, Vgpr and Sgpr are not supported.
- Template Parameters
-
DimAccessOrderTuple Tuple with dimension access order. VectorDim Dimension for vectorized read and write. ScalarPerVector Number of scalars per vectorized read and write.
- Parameters
-
src_tensor Source tensor. dst_tensor Destination tensor. thread_layout Thread layout for each dimension of the copy.
◆ copy() [1/2]
template<typename DimAccessOrderTuple , index_t VectorDim, index_t ScalarPerVector, typename SrcTensorType , typename DstTensorType >
__device__ void copy | ( | const SrcTensorType & | src_tensor, |
DstTensorType & | dst_tensor | ||
) |
Perform optimized copy between two tensor partitions (threadwise copy). Tensors must have the same size.
- Template Parameters
-
DimAccessOrderTuple Tuple with dimension access order. VectorDim Dimension for vectorized read and write. ScalarPerVector Number of scalars per vectorized read and write.
- Parameters
-
src_tensor Source tensor. dst_tensor Destination tensor.
◆ copy() [2/2]
template<typename SrcTensorType , typename DstTensorType >
__host__ __device__ void copy | ( | const SrcTensorType & | src_tensor, |
DstTensorType & | dst_tensor | ||
) |
Perform generic copy between two tensor partitions (threadwise copy). Tensors must have the same size.
- Parameters
-
src_tensor Source tensor. dst_tensor Destination tensor.