copy.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

// Disable from doxygen docs generation
namespace ck {
namespace wrapper {

/**
 * \brief Perform optimized copy between two tensor partitions (threadwise
 * copy). Tensors must have the same size.
 *
 * \tparam DimAccessOrderTuple Tuple with the dimension access order.
 * \tparam VectorDim Dimension for vectorized read and write.
 * \tparam ScalarPerVector Number of scalars per vectorized read and write.
 * \param src_tensor Source tensor.
 * \param dst_tensor Destination tensor.
 */
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType>
__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();

    using SrcShapeType         = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    // Per-thread slice lengths and traversal order, derived at compile time
    // from the source shape and DimAccessOrderTuple.
    constexpr auto thread_slice_lengths = generate_sequence_v2(
        [](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});

    if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Perform a copy between DynamicBuffers
        auto transfer = ThreadwiseTensorSliceTransfer_v7<
            Tuple<typename SrcTensorType::TensorElementType>,
            Tuple<typename DstTensorType::TensorElementType>,
            decltype(tie(in_grid_desc)),
            decltype(tie(out_grid_desc)),
            tensor_operation::element_wise::PassThrough,
            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            Sequence<true>,
            Sequence<true>>{in_grid_desc,
                            make_tuple(src_tensor.GetMultiIdxOffsets()),
                            out_grid_desc,
                            make_tuple(dst_tensor.GetMultiIdxOffsets()),
                            tensor_operation::element_wise::PassThrough{}};

        transfer.Run(tie(in_grid_desc),
                     tie(src_tensor.GetBuffer()),
                     tie(out_grid_desc),
                     tie(dst_tensor.GetBuffer()));
    }
    else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
    {
        // Perform copy from StaticBuffer to DynamicBuffer
        const auto src_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});

        auto transfer =
            ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
                                               typename DstTensorType::TensorElementType,
                                               remove_cvref_t<decltype(in_grid_desc)>,
                                               remove_cvref_t<decltype(out_grid_desc)>,
                                               tensor_operation::element_wise::PassThrough,
                                               decltype(thread_slice_lengths),
                                               decltype(dim_access_order),
                                               VectorDim,
                                               ScalarPerVector,
                                               InMemoryDataOperationEnum::Set,
                                               I1,
                                               true>{out_grid_desc,
                                                     dst_tensor.GetMultiIdxOffsets(),
                                                     tensor_operation::element_wise::PassThrough{}};

        transfer.Run(in_grid_desc,
                     src_slice_origin_idxs,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_tensor.GetBuffer());
    }
    else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
    {
        // Perform copy from DynamicBuffer to StaticBuffer
        const auto dst_slice_origin_idxs =
            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
        auto transfer = ThreadwiseTensorSliceTransfer_v2<
            std::remove_const_t<typename SrcTensorType::TensorElementType>,
            std::remove_const_t<typename DstTensorType::TensorElementType>,
            remove_cvref_t<decltype(in_grid_desc)>,
            remove_cvref_t<decltype(out_grid_desc)>,
            decltype(thread_slice_lengths),
            decltype(dim_access_order),
            VectorDim,
            ScalarPerVector,
            I1,
            false,
            false>{in_grid_desc, src_tensor.GetMultiIdxOffsets()};

        transfer.Run(in_grid_desc,
                     src_tensor.GetBuffer(),
                     out_grid_desc,
                     dst_slice_origin_idxs,
                     dst_tensor.GetBuffer());
    }
    else
    {
        // Perform copy between StaticBuffers
        static_for<0, SrcShapeType::Size(), 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
    }
}

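// Illustrative usage sketch (not part of the original file): each thread
// copies its own partition of a 2D tile, vectorizing along the innermost
// dimension. The partition tensors (src_thread_partition/dst_thread_partition)
// are hypothetical and would come from the wrapper's partitioning helpers.
//
// \code
// using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
// constexpr ck::index_t vector_dim        = 1; // vectorize the last dim
// constexpr ck::index_t scalar_per_vector = 4; // e.g. 4 floats per access
// ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
//     src_thread_partition, dst_thread_partition);
// \endcode
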
/**
 * \brief Perform optimized copy between two tensor partitions (threadwise
 * copy) using default parameters. Tensors must have the same size.
 *
 * \param src_tensor Source tensor.
 * \param dst_tensor Destination tensor.
 */
template <typename SrcTensorType, typename DstTensorType>
__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
{
    // Generate default params
    using SrcShapeType         = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();
    // Incrementing dims 0, 1, 2 ... num_dims - 1
    constexpr auto dim_access_order_tuple =
        generate_tuple([](auto i) { return Number<i>{}; }, Number<num_dims>{});
    constexpr index_t vector_dim        = num_dims - 1;
    constexpr index_t scalar_per_vector = 1;
    copy<decltype(dim_access_order_tuple), vector_dim, scalar_per_vector>(src_tensor, dst_tensor);
}

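// Illustrative usage sketch: the parameterless overload applies the defaults
// generated above, i.e. access order 0, 1, ..., num_dims - 1, vectorization
// on the last dimension, and one scalar per vector access.
//
// \code
// ck::wrapper::copy(src_thread_partition, dst_thread_partition);
// \endcode
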
/**
 * \brief Perform optimized blockwise copy between two tensors. Tensors must
 * have the same size.
 *
 * \tparam DimAccessOrderTuple Tuple with the dimension access order.
 * \tparam VectorDim Dimension for vectorized read and write.
 * \tparam ScalarPerVector Number of scalars per vectorized read and write.
 * \param src_tensor Source tensor.
 * \param dst_tensor Destination tensor.
 * \param thread_layout Layout of threads in the block performing the copy.
 */
template <typename DimAccessOrderTuple,
          index_t VectorDim,
          index_t ScalarPerVector,
          typename SrcTensorType,
          typename DstTensorType,
          typename ThreadShape,
          typename ThreadUnrolledDesc>
__device__ void
blockwise_copy(const SrcTensorType& src_tensor,
               DstTensorType& dst_tensor,
               [[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout)
{
    // Blockwise copy is implemented only for dynamic (global/LDS) buffers.
    static_assert(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer);

    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();

    using SrcShapeType         = remove_cvref_t<decltype(shape(src_tensor))>;
    constexpr index_t num_dims = SrcShapeType::Size();

    constexpr auto tile_lengths_seq = generate_sequence_v2(
        [](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
    constexpr auto thread_layout_seq = generate_sequence_v2(
        [](auto I) { return size<I>(ThreadShape{}); }, Number<num_dims>{});
    constexpr auto dim_access_order = generate_sequence_v2(
        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});

    using ThisThreadBlock = ThisThreadBlock<size(ThreadShape{})>;

    // Perform copy between DynamicBuffers
    auto transfer = ThreadGroupTensorSliceTransfer_v7<
        ThisThreadBlock,
        Tuple<typename SrcTensorType::TensorElementType>,
        Tuple<typename DstTensorType::TensorElementType>,
        decltype(tie(in_grid_desc)),
        decltype(tie(out_grid_desc)),
        tensor_operation::element_wise::PassThrough,
        Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
        std::remove_const_t<decltype(tile_lengths_seq)>,
        std::remove_const_t<decltype(thread_layout_seq)>,
        std::remove_const_t<decltype(dim_access_order)>,
        std::remove_const_t<decltype(dim_access_order)>,
        VectorDim,
        ScalarPerVector,
        Sequence<true>,
        Sequence<true>>{in_grid_desc,
                        make_tuple(src_tensor.GetMultiIdxOffsets()),
                        out_grid_desc,
                        make_tuple(dst_tensor.GetMultiIdxOffsets()),
                        tensor_operation::element_wise::PassThrough{}};

    transfer.Run(tie(in_grid_desc),
                 tie(src_tensor.GetBuffer()),
                 tie(out_grid_desc),
                 tie(dst_tensor.GetBuffer()));
}

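// Illustrative usage sketch (not part of the original file): a whole thread
// block cooperatively copies a tile from global memory to LDS. The tile
// tensors and the 8x32 thread layout are hypothetical; make_layout is assumed
// from the wrapper's layout utilities.
//
// \code
// using DimAccessOrder     = ck::Tuple<ck::Number<0>, ck::Number<1>>;
// const auto thread_layout = ck::wrapper::make_layout(
//     ck::make_tuple(ck::Number<8>{}, ck::Number<32>{}),
//     ck::make_tuple(ck::Number<32>{}, ck::Number<1>{}));
// ck::wrapper::blockwise_copy<DimAccessOrder, 1, 4>(global_tile, lds_tile,
//                                                   thread_layout);
// \endcode
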
} // namespace wrapper
} // namespace ck