/**
 * \brief Perform optimized copy between two tensor partitions (threadwise
 * copy). Tensors must have the same size.
 */
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType>
__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();
    // Each thread transfers its whole (compile-time) partition shape.
    constexpr auto thread_slice_lengths =
        generate_sequence_v2([](auto I) { return size<I>(SrcShapeType{}); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
    if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Dynamic -> dynamic buffer (e.g. global/LDS to global/LDS).
        auto transfer = ThreadwiseTensorSliceTransfer_v7<
            Tuple<typename SrcTensorType::TensorElementType>,
            Tuple<typename DstTensorType::TensorElementType>,
            decltype(tie(in_grid_desc)),
            decltype(tie(out_grid_desc)),
            tensor_operation::element_wise::PassThrough,
            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            Sequence<true>,
            Sequence<true>>{in_grid_desc,
                            make_tuple(src_tensor.GetMultiIdxOffsets()),
                            out_grid_desc,
                            make_tuple(dst_tensor.GetMultiIdxOffsets()),
                            tensor_operation::element_wise::PassThrough{}};

        transfer.Run(tie(in_grid_desc),
                     tie(src_tensor.GetBuffer()),
                     tie(out_grid_desc),
                     tie(dst_tensor.GetBuffer()));
    }
    else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Static (register) -> dynamic buffer.
        const auto src_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});

        auto transfer = ThreadwiseTensorSliceTransfer_v1r3<
            typename SrcTensorType::TensorElementType,
            typename DstTensorType::TensorElementType,
            decltype(in_grid_desc),
            decltype(out_grid_desc),
            tensor_operation::element_wise::PassThrough,
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            InMemoryDataOperationEnum::Set,
            I1,
            true>{out_grid_desc,
                  dst_tensor.GetMultiIdxOffsets(),
                  tensor_operation::element_wise::PassThrough{}};

        transfer.Run(in_grid_desc,
                     src_slice_origin_idxs,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_tensor.GetBuffer());
    }
    else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
    {
        // Dynamic -> static (register) buffer.
        const auto dst_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});

        auto transfer = ThreadwiseTensorSliceTransfer_v2<
            std::remove_const_t<typename SrcTensorType::TensorElementType>,
            std::remove_const_t<typename DstTensorType::TensorElementType>,
            decltype(in_grid_desc),
            decltype(out_grid_desc),
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            I1,
            false>{in_grid_desc, src_tensor.GetMultiIdxOffsets()};

        transfer.Run(in_grid_desc,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_slice_origin_idxs,
                     dst_tensor.GetBuffer());
    }
    else
    {
        // Static -> static: plain element-wise assignment.
        static_for<0, SrcShapeType::Size(), 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
    }
}
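// Usage sketch (illustrative only, all names hypothetical): copy one
// thread's partition of a 2D global tile into a register tensor, traversing
// dimension 0 then dimension 1 and vectorizing 4 scalars along the
// innermost dimension.
//
//   using AccessOrder = Tuple<Number<0>, Number<1>>;
//   constexpr index_t vector_dim        = 1; // innermost dimension
//   constexpr index_t scalar_per_vector = 4; // 4 scalars per read/write
//   copy<AccessOrder, vector_dim, scalar_per_vector>(src_thread_partition,
//                                                    dst_vgpr_tensor);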
/**
 * \brief Perform generic copy between two tensor partitions (threadwise
 * copy) with default traversal order and no vectorization. Tensors must
 * have the same size.
 */
template <typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    // Identity dimension access order (0, 1, ..., num_dims - 1).
    constexpr auto dim_access_order_tuple =
        generate_tuple([](auto i) { return Number<i>{}; }, Number<num_dims>{});
    // Vectorize along the innermost dimension, one scalar per access.
    constexpr index_t vector_dim        = num_dims - 1;
    constexpr index_t scalar_per_vector = 1;
    copy<decltype(dim_access_order_tuple), vector_dim, scalar_per_vector>(src_tensor, dst_tensor);
}
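// Usage sketch (illustrative only): this overload derives an identity
// access order and scalar (ScalarPerVector = 1) transfers from the source
// shape, so equally-sized partitions copy with a bare call:
//
//   copy(src_thread_partition, dst_vgpr_tensor); // hypothetical tensors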
/**
 * \brief Perform optimized blockwise copy between two tensors. Tensors must
 * have the same size.
 */
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType,
          typename ThreadShape,
          typename ThreadUnrolledDesc>
__device__ void
blockwise_copy(const SrcTensorType& src_tensor,
               DstTensorType& dst_tensor,
               [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout)
{
    static_assert(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer);
    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
    using SrcShapeType = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();
    // Tile lengths come from the (static) tensor shape; per-dimension thread
    // counts come from the thread layout shape.
    constexpr auto tile_lengths_seq =
        generate_sequence_v2([](auto I) { return size<I>(SrcShapeType{}); }, Number<num_dims>{});
    constexpr auto thread_layout_seq =
        generate_sequence_v2([](auto I) { return size<I>(ThreadShape{}); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
    using ThisThreadBlock = ThisThreadBlock<size(ThreadShape{})>;
    auto transfer = ThreadGroupTensorSliceTransfer_v7<
        ThisThreadBlock,
        Tuple<typename SrcTensorType::TensorElementType>,
        Tuple<typename DstTensorType::TensorElementType>,
        decltype(tie(in_grid_desc)),
        decltype(tie(out_grid_desc)),
        tensor_operation::element_wise::PassThrough,
        Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
        std::remove_const_t<decltype(tile_lengths_seq)>,
        std::remove_const_t<decltype(thread_layout_seq)>,
        std::remove_const_t<decltype(dim_access_order)>,
        std::remove_const_t<decltype(dim_access_order)>,
        VectorDim,
        ScalarPerVector,
        Sequence<true>,
        Sequence<true>>{in_grid_desc,
                        make_tuple(src_tensor.GetMultiIdxOffsets()),
                        out_grid_desc,
                        make_tuple(dst_tensor.GetMultiIdxOffsets()),
                        tensor_operation::element_wise::PassThrough{}};

    transfer.Run(tie(in_grid_desc),
                 tie(src_tensor.GetBuffer()),
                 tie(out_grid_desc),
                 tie(dst_tensor.GetBuffer()));
}
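// Usage sketch (illustrative only, all names hypothetical): cooperatively
// stage a global-memory tile into LDS with an 8x8 thread layout; assumes a
// make_layout helper from the wrapper layout utilities.
//
//   const auto thread_layout =
//       make_layout(make_tuple(Number<8>{}, Number<8>{}));
//   using AccessOrder = Tuple<Number<0>, Number<1>>;
//   blockwise_copy<AccessOrder, 1, 4>(global_tile_tensor, lds_tile_tensor,
//                                     thread_layout);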