/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/tensor/load_tile_transpose.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/tensor/load_tile_transpose.hpp Source File

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/tensor/load_tile_transpose.hpp Source File
load_tile_transpose.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
17 
18 namespace ck_tile {
19 
// Returns the constant 8; being constexpr, it can be used in constant expressions.
// NOTE(review): the name suggests this is a size tied to the DS_READ_TR (transposed
// read) instruction, but neither the unit nor any use is visible here -- confirm.
20 constexpr int DS_READ_TR_SIZE()
21 {
22  return 8; // Literal constant, evaluated at compile time
23 }
24 
25 namespace util {
// Compile-time predicate: `value` is true when Suffix is no longer than Sequence and
// equals the trailing Suffix::size() elements of Sequence.
// NOTE(review): the declaration line (original line 27) is absent from this listing;
// presumably `struct is_sequence_suffix` -- confirm against the real header.
26 template <typename Suffix, typename Sequence>
28 {
// A suffix can never be longer than the sequence it is matched against.
29  static constexpr bool size_check = (Suffix::size() <= Sequence::size());
30 
// Position in Sequence at which the candidate tail begins.
31  static constexpr index_t start_pos = Sequence::size() - Suffix::size();
// Index list handed to Sequence::extract to pull out that tail
// (generated from start_pos towards Sequence::size() with step 1).
32  using extract_indices = typename arithmetic_sequence_gen<start_pos, Sequence::size(), 1>::type;
33 
// NOTE(review): start_pos and extract_indices are formed even when size_check is
// false (start_pos is then negative) -- confirm that case is never instantiated.
34  static constexpr bool value =
35  size_check && (Suffix{} == decltype(Sequence::extract(extract_indices{})){});
36 };
37 
// Specialization whose `value` is unconditionally true.
// NOTE(review): the specialization header (original line 39) is absent from this
// listing; presumably this is the empty-suffix case over sequence<Xs...> -- confirm.
38 template <index_t... Xs>
40 {
41  static constexpr bool value = true;
42 };
43 
// Template header of the `is_sequence_suffix_v` bool variable template used by the
// validation code further down this file.
// NOTE(review): its declaration (original line 45) is absent from this listing.
44 template <typename Suffix, typename Sequence>
46 
47 } // namespace util
48 
49 // Default policy: Retains original 2D transpose behavior
// Transpose policy parameterized on the element type. Other templates in this file
// name it as their default policy (`DefaultTranspose<DataType_>`).
// It bundles: the per-lane-group "quad" encodings (Quad16 / Quad8), the aliases that
// choose between them from sizeof(DataType), the dimension permutation, an element
// grouping hook, and the traits that validate an input distribution encoding.
// NOTE(review): the declaration line (original line 51) and several interior lines
// are absent from this listing; comments below describe only what is visible.
50 template <typename DataType>
52 {
 // Quad encodings selected when sizeof(DataType) == 2 (see QuadInputEncoding below).
 // Only fragments of the aliases are visible; the alias ending at original line 72
 // is presumably OutputEncoding -- confirm.
53  template <index_t LaneGroupSize>
54  struct Quad16
55  {
56  static_assert(LaneGroupSize == 64 || LaneGroupSize == 32 || LaneGroupSize == 16,
57  "LaneGroupSize must be 64, 32, or 16");
58  using InputEncoding =
60  tuple<sequence<4>, sequence<LaneGroupSize / 16, 4, 4>>,
64  sequence<2>>;
65 
72  sequence<0>>;
73  };
74 
 // Quad encodings selected when sizeof(DataType) != 2. Same shape as Quad16 but with
 // different H lengths; the same caveat about missing lines applies.
75  template <index_t LaneGroupSize>
76  struct Quad8
77  {
78  static_assert(LaneGroupSize == 64 || LaneGroupSize == 32 || LaneGroupSize == 16,
79  "LaneGroupSize must be 64, 32, or 16");
80  using InputEncoding =
82  tuple<sequence<8>, sequence<LaneGroupSize / 16, 2, 8>>,
86  sequence<2>>;
87 
94  sequence<0>>;
95  };
96 
97  // Select based on data size
 // (the Quad16 alternative is taken when sizeof(DataType) == 2; the remaining
 // template arguments are on lines absent from this listing)
98  template <index_t LaneGroupSize>
99  using QuadInputEncoding = std::conditional_t<sizeof(DataType) == 2,
102 
103  template <index_t LaneGroupSize>
104  using QuadOutputEncoding = std::conditional_t<sizeof(DataType) == 2,
107 
108  // Always swap last two dimensions
109  static constexpr auto transpose_dims = sequence<1, 0>{};
110 
111  // Programmable: Element grouping function
112  static constexpr auto group_func = [](auto idx) {
113  return idx; // Identity mapping
114  };
115 
 // Checks whether InDstrEncode can be transposed with the quad pattern of the given
 // LaneGroupSize. With ReverseDirection the input is matched against the other quad
 // encoding instead (the conditional's arguments are on missing lines).
 // NOTE(review): the struct's declaration line (original line 117) is absent.
116  template <typename InDstrEncode, bool ReverseDirection, index_t LaneGroupSize>
118  {
119  using QuadEncoding = std::conditional_t<ReverseDirection,
122  static constexpr auto I0 = number<0>{};
123  static constexpr auto I1 = number<1>{};
124  static constexpr auto input_hs = InDstrEncode::hs_lengthss_;
125  static constexpr auto quad_hs = QuadEncoding::hs_lengthss_;
126  // 1. Must be 2D tensor
127  static constexpr bool dims_valid = (InDstrEncode::NDimX == 2);
128  // 2. Quad pattern must be suffix of input pattern
129  static constexpr bool suffix_valid_dim0 =
130  util::is_sequence_suffix_v<decltype(quad_hs[I0]), decltype(input_hs[I0])>;
131  static constexpr bool suffix_valid_dim1 =
132  util::is_sequence_suffix_v<decltype(quad_hs[I1]), decltype(input_hs[I1])>;
133 
134  // 3. PS→RHS mapping constraints
135  static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_;
136  static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_;
137 
138  static constexpr auto quad_ps_major0 = QuadEncoding::ps_to_rhss_major_[I0];
139  static constexpr auto quad_ps_minor0 = QuadEncoding::ps_to_rhss_minor_[I0];
140 
 // Only the last P sequence of the input is compared against the quad pattern.
141  static constexpr auto input_ps_major_last =
142  input_ps_major[number<input_ps_major.size() - 1>{}];
143  static constexpr auto input_ps_minor_last =
144  input_ps_minor[number<input_ps_minor.size() - 1>{}];
145 
 // Per X dimension: how many H levels the input has in front of the quad levels.
 // Added to the quad's minor indices so they address the same levels inside the
 // (longer) input H sequences. The "- 1" turns major index 1/2 into offset slot 0/1.
146  using psys_offset = ck_tile::sequence<input_hs[I0].size() - quad_hs[I0].size(),
147  input_hs[I1].size() - quad_hs[I1].size()>;
 // NOTE(review): the declaration receiving this generator (original line 148) is absent.
149  [](auto i) {
150  return number<quad_ps_minor0[i] + psys_offset{}[quad_ps_major0[i] - 1]>{};
151  },
152  number<quad_ps_minor0.size()>{});
153 
 // NOTE(review): most of this expression (original lines 155-156) is absent.
154  static constexpr bool ps_mapping_valid =
157  decltype(input_ps_minor_last)>;
158 
159  // 4. YS→RHS mapping constraints
160  static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_;
161  static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_;
162  static constexpr auto quad_ys_major = QuadEncoding::ys_to_rhs_major_;
163  static constexpr auto quad_ys_minor = QuadEncoding::ys_to_rhs_minor_;
164 
 // Sanity checks on the quad encoding itself (independent of the input).
165  static_assert(quad_ys_major.size() == 1 && quad_ys_minor.size() == 1,
166  "YS->RHS mapping must be single dimension");
167  static_assert(quad_ys_major.back() == 2 && quad_ys_minor.back() == quad_hs[I1].size() - 1,
168  "YS->RHS mapping must be the last dimension");
 // The input's last Y must map to major 2 and to the last H level of dimension 1.
169  static constexpr bool ys_mapping_valid =
170  (input_ys_major.back() == 2) && (input_ys_minor.back() == input_hs[I1].size() - 1);
171 
 // Conjunction of the checks above (the tail of the expression, original line 173,
 // is absent from this listing).
172  static constexpr bool value = dims_valid && suffix_valid_dim0 && suffix_valid_dim1 &&
174  };
175 
 // Public validation entry point used as `Policy::template ValidationTraits<...>`:
 // exposes `value` and the `LaneGroupSize` to use; the visible tail shows
 // LaneGroupSize falling back to 0.
 // NOTE(review): the declaration line (177) and the bodies of both initializers
 // (180-182, 184-186) are absent; how the lane-group size is chosen is not visible.
176  template <typename InDstrEncode, bool ReverseDirection = false>
178  {
179  static constexpr bool value =
183  static constexpr index_t LaneGroupSize =
187  : 0;
188  };
189 };
// Exposes `distr_encoding_valid`: whether the distribution's encoding passes the
// policy's ValidationTraits (with ReverseDirection left at its default).
// load_tile_transpose uses this flag in its enable_if constraint.
// NOTE(review): the struct's declaration line (original line 191) and the
// InDstrEncode alias (original line 193) are absent from this listing; DataType_ is
// not referenced in the visible lines.
190 template <typename TileDistribution_, typename DataType_, typename Policy>
192 {
194 
195  using Validator = typename Policy::template ValidationTraits<InDstrEncode>;
196 
197  static constexpr bool distr_encoding_valid = Validator::value;
198 };
199 
200 // this is used to generate the transposed output tile distribution encoding
201 // based on the input tile distribution encoding
// Derives the pieces of the transposed tile distribution encoding (H lengths,
// P->RH and Y->RH major/minor maps) from an input encoding and a transpose policy.
// With ReverseDirection = true the roles of the policy's quad input and quad output
// encodings are exchanged.
// NOTE(review): the declaration line (original line 206) and several interior lines
// are absent from this listing; comments below describe only what is visible.
202 template <typename TileDistributionEncoding_,
203  typename DataType_,
204  typename Policy = DefaultTranspose<DataType_>,
205  bool ReverseDirection = false>
207 {
209  static constexpr auto input_hs_lengthss = InDstrEncode::hs_lengthss_;
 // Lane-group size reported by the policy's validation traits; the static_assert
 // rejects encodings the policy cannot transpose.
210  static constexpr index_t LaneGroupSize =
211  Policy::template ValidationTraits<InDstrEncode, ReverseDirection>::LaneGroupSize;
212  static_assert(Policy::template ValidationTraits<InDstrEncode, ReverseDirection>::value,
213  "The input tile distribution encoding is not valid for transpose!");
214 
 // Two std::conditional_t aliases (their heads, original lines 215 and 219, are
 // absent): the first picks the policy's output quad when reversed, otherwise its
 // input quad; the second makes the opposite choice.
216  ReverseDirection,
217  typename Policy::template QuadOutputEncoding<LaneGroupSize>,
218  typename Policy::template QuadInputEncoding<LaneGroupSize>>;
220  ReverseDirection,
221  typename Policy::template QuadInputEncoding<LaneGroupSize>,
222  typename Policy::template QuadOutputEncoding<LaneGroupSize>>;
223 
224  static constexpr auto quad_input_hs_lengthss = QuadInputEncoding::hs_lengthss_;
225  static constexpr auto quad_output_hs_lengthss = QuadOutputEncoding::hs_lengthss_;
226 
227  static constexpr auto input_ps_to_rhss_major = InDstrEncode::ps_to_rhss_major_;
228  static constexpr auto input_ps_to_rhss_minor = InDstrEncode::ps_to_rhss_minor_;
229  static constexpr auto input_ys_to_rhs_major = InDstrEncode::ys_to_rhs_major_;
230  static constexpr auto input_ys_to_rhs_minor = InDstrEncode::ys_to_rhs_minor_;
231 
232  static constexpr auto I0 = number<0>{};
233  static constexpr auto quad_input_ps_to_rhss_major0 = QuadInputEncoding::ps_to_rhss_major_[I0];
234  static constexpr auto quad_input_ps_to_rhss_minor0 = QuadInputEncoding::ps_to_rhss_minor_[I0];
235  static constexpr auto quad_output_ps_to_rhss_major0 = QuadOutputEncoding::ps_to_rhss_major_[I0];
236  static constexpr auto quad_output_ps_to_rhss_minor0 = QuadOutputEncoding::ps_to_rhss_minor_[I0];
237  static constexpr auto quad_output_ys_to_rhs_major = QuadOutputEncoding::ys_to_rhs_major_;
238  static constexpr auto quad_output_ys_to_rhs_minor = QuadOutputEncoding::ys_to_rhs_minor_;
239 
 // The two dimensions named by the policy's transpose_dims.
 // NOTE(review): dim0/dim1 are not referenced in the visible lines -- confirm use.
240  static constexpr index_t dim0 = Policy::transpose_dims[0];
241  static constexpr index_t dim1 = Policy::transpose_dims[1];
242 
 // Exchanges major indices 1 and 2; any other value is returned unchanged.
243  static constexpr auto swap_one_and_two = [](const index_t idx) {
244  return (idx == 1) ? 2 : (idx == 2) ? 1 : idx;
245  };
246 
247  // For the transpose load:
248  // strip the quad_input_hs_lengthss levels from the end of each dimension of
249  // input_hs_lengthss, reverse the dimensions, then append quad_output_hs_lengthss to each
250  static constexpr auto outer_hs_lengthss = generate_tuple(
251  [](auto i) {
252  constexpr auto input_i = input_hs_lengthss[i];
253  constexpr auto outer_len = input_i.size() - quad_input_hs_lengthss[i].size();
254  return typename sequence_split<decltype(input_i), outer_len>::left_type{};
255  },
 // NOTE(review): the end of this call and the definition of
 // reversed_outer_hs_lengthss (original lines 256-257) are absent from this listing.
258  static constexpr auto dst_out_hs_lengthss = generate_tuple(
259  [](auto i) {
260  auto outer_i = reversed_outer_hs_lengthss[i];
261  // append the quad output hs lengths to the outer hs lengths
262  return outer_i.push_back(quad_output_hs_lengthss[i]);
263  },
265 
266  // For the PS→RHS mapping (both major and minor) only the last P sequence (the one
267  // for thread distribution) has its quad part replaced
268  static constexpr auto dst_ps_to_rhss_major = generate_tuple(
269  // dst_out_hs_lengthss is reversed, so the major indices are swapped (1 <-> 2) as well
270  [](auto i) {
271  if constexpr(i == input_ps_to_rhss_major.size() - 1)
272  {
273  constexpr auto current_size = input_ps_to_rhss_major[i].size();
274  constexpr auto reduce_size = quad_input_ps_to_rhss_major0.size();
275  constexpr auto quad_out = quad_output_ps_to_rhss_major0;
 // NOTE(review): the arguments of extract (original line 277) are absent.
276  constexpr auto reduced_ps_to_rhss_major = input_ps_to_rhss_major[i].extract(
278  return reduced_ps_to_rhss_major.transform(swap_one_and_two).push_back(quad_out);
279  }
280  else
281  {
282  // All other sequences (i.e. warp) keep their entries, with majors 1 and 2 swapped
283  return input_ps_to_rhss_major[i].transform(swap_one_and_two);
284  }
285  },
286  number<input_ps_to_rhss_major.size()>{});
287 
 // Per dimension: number of outer H levels that precede the appended quad levels.
288  static constexpr auto quad_idx_offset =
289  transform_tuples([](auto x) { return number<x.size()>{}; }, reversed_outer_hs_lengthss);
290 
291  // minus 1 because RsLength is not counted
 // NOTE(review): the heads of the next two definitions (original lines 292 and
 // 294) are absent; both look up quad_idx_offset by (major - 1).
293  [](auto x) { return quad_idx_offset[number<x - 1>{}]; }, quad_output_ps_to_rhss_major0));
295  [](auto x) { return quad_idx_offset[number<x - 1>{}]; }, quad_output_ys_to_rhs_major));
296 
297  static constexpr auto dst_ps_to_rhss_minor = generate_tuple(
298  [](auto i) {
299  constexpr auto input_i = input_ps_to_rhss_minor[i];
300  if constexpr(i == input_ps_to_rhss_minor.size() - 1)
301  {
 // Keep the outer minors, then append the quad output minors shifted by the
 // per-dimension offset (second operand is on the absent original line 307).
302  constexpr auto outer_len = input_i.size() - quad_input_ps_to_rhss_minor0.size();
303  constexpr auto outer_ps =
304  typename sequence_split<decltype(input_i), outer_len>::left_type{};
305 
306  return outer_ps.push_back(quad_output_ps_minor_offset +
308  }
309  else
310  {
311  // For all other sequences, keep them unchanged
312  return input_i;
313  }
314  },
315  number<input_ps_to_rhss_minor.size()>{});
316 
 // Input Y->RH majors without the last entry.
317  static constexpr auto outer_input_ys_to_rhs_major = input_ys_to_rhs_major.pop_back();
318 
319  // dst_out_hs_lengthss is reversed, so the major indices must be reversed as well
 // NOTE(review): the initializers of the next two definitions (original lines 321
 // and 324) are absent from this listing.
320  static constexpr auto dst_ys_to_rhs_major =
322 
323  static constexpr auto dst_ys_to_rhs_minor = input_ys_to_rhs_minor.pop_back().push_back(
325 
 // Resulting encoding; it keeps the input's RsLengths. The alias head (original
 // line 326) and remaining arguments (328-332) are absent -- load_tile_transpose
 // refers to this member as TransposedDstrEncode.
327  tile_distribution_encoding<typename InDstrEncode::RsLengths,
333 };
334 
// Template header taking an encoding, a data type, and a policy that defaults to
// DefaultTranspose<DataType_>.
// NOTE(review): the declaration it introduces (original lines 338-339) is absent
// from this listing, so its name and definition cannot be documented here.
335 template <typename TileDistributionEncoding_,
336  typename DataType_,
337  typename Policy = DefaultTranspose<DataType_>>
// Second template header with the same parameter list (encoding, data type, policy
// defaulting to DefaultTranspose<DataType_>).
// NOTE(review): the declaration it introduces (original lines 343-344) is absent
// from this listing, so its name and definition cannot be documented here.
340 template <typename TileDistributionEncoding_,
341  typename DataType_,
342  typename Policy = DefaultTranspose<DataType_>>
345 
// Builds a block-level distribution encoding by embedding InnerEncode into an outer
// encoding and returns the combined encoding.
// NOTE(review): the signature line (original line 351) and most of the outer
// encoding's definition (original lines 354-359) are absent from this listing; the
// outer encoding presumably uses the four iteration / warp-count parameters -- confirm.
346 template <typename InnerEncode,
347  index_t kLeadIterPerWarp,
348  index_t kSecondIterPerWarp,
349  index_t kLeadNumWarps,
350  index_t kSecondNumWarps>
352 {
353  constexpr auto block_outer_dst_encoding =
360  sequence<0, 0>>{};
 // Combine the outer encoding with the caller-supplied inner one.
361  constexpr auto blk_distr_encode =
362  detail::make_embed_tile_distribution_encoding(block_outer_dst_encoding, InnerEncode{});
363 
364  return blk_distr_encode;
365 }
366 
// Transpose-loads a tile through `tile_window` and returns a static distributed
// tensor that carries the transposed tile distribution.
// The overload only participates when TransposeTileDistrChecker reports the
// window's distribution encoding as valid for Policy.
// NOTE(review): the line carrying the function name and the start of its parameter
// (original line 403) and the DataVec alias (original line 436) are absent from
// this listing.
392 template <
393  typename BottomTensorView_,
394  typename WindowLengths_,
395  typename TileDistribution_,
396  index_t NumCoord,
397  typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>,
398  typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_,
399  typename BottomTensorView_::DataType,
400  Policy>::distr_encoding_valid,
401  Policy>>
402 CK_TILE_DEVICE auto
404  WindowLengths_,
405  TileDistribution_,
406  NumCoord>& tile_window)
407 {
 // NOTE(review): Policy is not forwarded here, so OutputTileDistributionTraits
 // falls back to its default policy even when the caller supplied a custom one,
 // while the load below does use Policy -- confirm this is intended.
408  using OutTileDstrEncode = typename OutputTileDistributionTraits<
409  typename TileDistribution_::DstrEncode,
410  typename BottomTensorView_::DataType>::TransposedDstrEncode;
 // Result tensor, laid out with the transposed distribution.
411  auto out_tensor = make_static_distributed_tensor<typename BottomTensorView_::DataType>(
412  make_static_tile_distribution(OutTileDstrEncode{}));
 // The actual transposing load is delegated to the tile window.
413  auto trans_tensor = tile_window.template load_transpose<Policy>();
414  constexpr auto input_distr = TileDistribution_{};
415  constexpr auto output_distr = make_static_tile_distribution(OutTileDstrEncode{});
416 
417  constexpr auto y_in_desc = input_distr.get_ys_to_d_descriptor();
418  constexpr auto y_out_desc = output_distr.get_ys_to_d_descriptor();
419 
420  constexpr index_t NDimYIn = input_distr.get_num_of_dimension_y();
421  constexpr index_t NDimYOut = output_distr.get_num_of_dimension_y();
422 
423  constexpr auto y_in_lengths = to_sequence(y_in_desc.get_lengths());
424  constexpr auto y_out_lengths = to_sequence(y_out_desc.get_lengths());
425 
 // The copy below is index-for-index, so both layouts must have the same element
 // space size and the same length in their last Y dimension.
426  constexpr auto y_in_element_space_size = y_in_desc.get_element_space_size();
427  constexpr auto y_out_element_space_size = y_out_desc.get_element_space_size();
428  static_assert(y_in_element_space_size == y_out_element_space_size,
429  "the element space size is not the same!");
430  static_assert(y_in_lengths[NDimYIn - 1] == y_out_lengths[NDimYOut - 1],
431  "the vector length is not the same!");
 // One access moves one vector of the last-Y length; the access count is the
 // product of all Y lengths divided by that vector length.
432  constexpr index_t vecLoadSize = y_in_lengths[NDimYIn - 1];
433  constexpr index_t num_of_access =
434  reduce_on_sequence(y_in_lengths, multiplies{}, number<1>{}) / vecLoadSize;
435 
 // Copy the loaded data into the output tensor one DataVec at a time, using the
 // same access index on both thread buffers.
437  static_for<0, num_of_access, 1>{}([&](auto iAccess) {
438  out_tensor.get_thread_buffer().template set_as<DataVec>(
439  number<iAccess>{},
440  trans_tensor.get_thread_buffer().template get_as<DataVec>(number<iAccess>{}));
441  });
442 
443  return out_tensor;
444 }
445 
446 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
constexpr CK_TILE_HOST_DEVICE auto make_embed_tile_distribution_encoding(OuterDstr, InnerDstr)
Definition: tile_distribution_encoding.hpp:457
constexpr bool is_sequence_suffix_v
Definition: load_tile_transpose.hpp:45
Definition: cluster_descriptor.hpp:13
int32_t index_t
Definition: integer.hpp:9
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constexpr CK_TILE_HOST_DEVICE auto generate_sequence_v2(F &&f, number< N >)
Definition: sequence.hpp:1042
constexpr CK_TILE_HOST_DEVICE index_t reduce_on_sequence(Seq, Reduce f, number< Init >)
Definition: sequence.hpp:979
CK_TILE_DEVICE auto load_tile_transpose(const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window)
transpose loads tile from a tensor and returns the resulting tensor with a new (transposed) tile dist...
Definition: load_tile_transpose.hpp:403
constexpr CK_TILE_HOST_DEVICE auto to_sequence(tuple< number< Is >... >)
Definition: sequence.hpp:1052
constexpr CK_TILE_HOST_DEVICE auto tuple_reverse(const tuple< Ts... > &t)
Definition: tuple.hpp:583
constexpr CK_TILE_HOST_DEVICE auto generate_tuple_for(F &&f, sequence< ids... >)
Definition: tuple.hpp:423
constexpr CK_TILE_HOST_DEVICE auto InputTileDistributionEncoding()
Definition: load_tile_transpose.hpp:351
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:429
constexpr int DS_READ_TR_SIZE()
Definition: load_tile_transpose.hpp:20
constexpr CK_TILE_HOST_DEVICE auto make_static_tile_distribution(StaticTileDistributionEncoding_)
Definition: tile_distribution.hpp:480
constexpr CK_TILE_HOST_DEVICE auto transform_tuples(F f, const X &x)
Definition: tuple.hpp:505
typename conditional< predicate, X, Y >::type conditional_t
Definition: functional.hpp:115
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
Definition: load_tile_transpose.hpp:55
Definition: load_tile_transpose.hpp:77
Definition: load_tile_transpose.hpp:178
static constexpr index_t LaneGroupSize
Definition: load_tile_transpose.hpp:183
static constexpr bool value
Definition: load_tile_transpose.hpp:179
Definition: load_tile_transpose.hpp:118
static constexpr bool ys_mapping_valid
Definition: load_tile_transpose.hpp:169
static constexpr auto I1
Definition: load_tile_transpose.hpp:123
static constexpr bool suffix_valid_dim1
Definition: load_tile_transpose.hpp:131
static constexpr bool value
Definition: load_tile_transpose.hpp:172
static constexpr auto quad_ys_major
Definition: load_tile_transpose.hpp:162
static constexpr auto quad_ys_minor
Definition: load_tile_transpose.hpp:163
static constexpr auto quad_ps_minor0
Definition: load_tile_transpose.hpp:139
static constexpr auto quad_hs
Definition: load_tile_transpose.hpp:125
static constexpr auto input_ys_minor
Definition: load_tile_transpose.hpp:161
static constexpr auto input_ps_major_last
Definition: load_tile_transpose.hpp:141
static constexpr auto input_ps_minor_last
Definition: load_tile_transpose.hpp:143
static constexpr auto I0
Definition: load_tile_transpose.hpp:122
static constexpr auto shifted_quad_ps_minor0
Definition: load_tile_transpose.hpp:148
static constexpr bool ps_mapping_valid
Definition: load_tile_transpose.hpp:154
static constexpr auto input_ps_minor
Definition: load_tile_transpose.hpp:136
static constexpr auto input_hs
Definition: load_tile_transpose.hpp:124
static constexpr auto input_ps_major
Definition: load_tile_transpose.hpp:135
static constexpr bool dims_valid
Definition: load_tile_transpose.hpp:127
std::conditional_t< ReverseDirection, QuadOutputEncoding< LaneGroupSize >, QuadInputEncoding< LaneGroupSize > > QuadEncoding
Definition: load_tile_transpose.hpp:121
static constexpr bool suffix_valid_dim0
Definition: load_tile_transpose.hpp:129
static constexpr auto quad_ps_major0
Definition: load_tile_transpose.hpp:138
static constexpr auto input_ys_major
Definition: load_tile_transpose.hpp:160
Definition: load_tile_transpose.hpp:52
std::conditional_t< sizeof(DataType)==2, typename Quad16< LaneGroupSize >::InputEncoding, typename Quad8< LaneGroupSize >::InputEncoding > QuadInputEncoding
Definition: load_tile_transpose.hpp:101
static constexpr auto group_func
Definition: load_tile_transpose.hpp:112
static constexpr auto transpose_dims
Definition: load_tile_transpose.hpp:109
std::conditional_t< sizeof(DataType)==2, typename Quad16< LaneGroupSize >::OutputEncoding, typename Quad8< LaneGroupSize >::OutputEncoding > QuadOutputEncoding
Definition: load_tile_transpose.hpp:106
Definition: load_tile_transpose.hpp:192
static constexpr bool distr_encoding_valid
Definition: load_tile_transpose.hpp:197
typename Policy::template ValidationTraits< InDstrEncode > Validator
Definition: load_tile_transpose.hpp:195
typename remove_cvref_t< TileDistribution_ >::DstrEncode InDstrEncode
Definition: load_tile_transpose.hpp:193
Definition: load_tile_transpose.hpp:207
static constexpr auto quad_output_ps_to_rhss_minor0
Definition: load_tile_transpose.hpp:236
static constexpr auto input_ys_to_rhs_major
Definition: load_tile_transpose.hpp:229
static constexpr auto quad_input_ps_to_rhss_major0
Definition: load_tile_transpose.hpp:233
static constexpr auto outer_input_ys_to_rhs_major
Definition: load_tile_transpose.hpp:317
static constexpr auto quad_output_ps_to_rhss_major0
Definition: load_tile_transpose.hpp:235
static constexpr index_t dim1
Definition: load_tile_transpose.hpp:241
static constexpr auto dst_ps_to_rhss_major
Definition: load_tile_transpose.hpp:268
static constexpr index_t LaneGroupSize
Definition: load_tile_transpose.hpp:210
static constexpr auto quad_idx_offset
Definition: load_tile_transpose.hpp:288
remove_cvref_t< TileDistributionEncoding_ > InDstrEncode
Definition: load_tile_transpose.hpp:208
static constexpr auto dst_ys_to_rhs_minor
Definition: load_tile_transpose.hpp:323
static constexpr auto dst_out_hs_lengthss
Definition: load_tile_transpose.hpp:258
static constexpr auto quad_output_ys_to_rhs_minor
Definition: load_tile_transpose.hpp:238
static constexpr auto I0
Definition: load_tile_transpose.hpp:232
std::conditional_t< ReverseDirection, typename Policy::template QuadInputEncoding< LaneGroupSize >, typename Policy::template QuadOutputEncoding< LaneGroupSize > > QuadOutputEncoding
Definition: load_tile_transpose.hpp:222
static constexpr auto quad_input_ps_to_rhss_minor0
Definition: load_tile_transpose.hpp:234
static constexpr auto quad_input_hs_lengthss
Definition: load_tile_transpose.hpp:224
std::conditional_t< ReverseDirection, typename Policy::template QuadOutputEncoding< LaneGroupSize >, typename Policy::template QuadInputEncoding< LaneGroupSize > > QuadInputEncoding
Definition: load_tile_transpose.hpp:218
static constexpr auto swap_one_and_two
Definition: load_tile_transpose.hpp:243
static constexpr auto outer_hs_lengthss
Definition: load_tile_transpose.hpp:250
static constexpr auto reversed_outer_hs_lengthss
Definition: load_tile_transpose.hpp:257
static constexpr auto dst_ps_to_rhss_minor
Definition: load_tile_transpose.hpp:297
static constexpr index_t dim0
Definition: load_tile_transpose.hpp:240
static constexpr auto input_ys_to_rhs_minor
Definition: load_tile_transpose.hpp:230
static constexpr auto quad_output_ps_minor_offset
Definition: load_tile_transpose.hpp:292
static constexpr auto input_ps_to_rhss_major
Definition: load_tile_transpose.hpp:227
static constexpr auto dst_ys_to_rhs_major
Definition: load_tile_transpose.hpp:320
static constexpr auto quad_output_ys_to_rhs_major
Definition: load_tile_transpose.hpp:237
static constexpr auto quad_output_hs_lengthss
Definition: load_tile_transpose.hpp:225
static constexpr auto input_ps_to_rhss_minor
Definition: load_tile_transpose.hpp:228
static constexpr auto quad_output_ys_minor_offset
Definition: load_tile_transpose.hpp:294
static constexpr auto input_hs_lengthss
Definition: load_tile_transpose.hpp:209
Definition: sequence.hpp:284
typename std::conditional< kHasContent, type0, type1 >::type type
Definition: sequence.hpp:299
A fixed-size array container similar to std::array with additional utilities.
Definition: array.hpp:43
Definition: integral_constant.hpp:13
Definition: math.hpp:98
Definition: sequence.hpp:349
Definition: sequence.hpp:49
Definition: functional.hpp:43
Definition: tile_distribution_encoding.hpp:26
This class provides tile (windowed) view and access to the device memory.
Definition: tile_window.hpp:46
Definition: tuple.hpp:192
Definition: load_tile_transpose.hpp:28
typename arithmetic_sequence_gen< start_pos, Sequence::size(), 1 >::type extract_indices
Definition: load_tile_transpose.hpp:32
static constexpr bool value
Definition: load_tile_transpose.hpp:34
static constexpr bool size_check
Definition: load_tile_transpose.hpp:29
static constexpr index_t start_pos
Definition: load_tile_transpose.hpp:31