Composable Kernel: include/ck_tile/core/tensor/tile_window_linear.hpp
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

namespace ck_tile {

#define WINDOW_DISPATCH_ISSUE()                                     \
    if constexpr(i_access < 0)                                      \
    {                                                               \
        static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
    }                                                               \
    else                                                            \
    {                                                               \
        static_assert(i_access < NumAccess);                        \
        issue(number<i_access>{});                                  \
    }
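
// Dispatch semantics (illustrative): every member function below that takes a
// number<i_access> argument funnels through this macro. Assuming NumAccess == 4:
//   win.load(number<-1>{}); // negative index -> issues accesses 0,1,2,3 in order
//   win.load(number<2>{});  // non-negative   -> issues only access #2
// (`win` is a hypothetical tile_window_linear instance.)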

//
// This version of tile window will pre-cache offsets/flags based on need.
//
// LinearBottomDims_, e.g. seq<0, 1> for a 2d tensor: here the last dim is the linear dim,
// so the last dim can be indexed with an immediate offset, which saves registers.
// TODO: if using this struct, prefer load_raw()/store_raw(), which can control
// the immediate offset on the fly.
// The space-filling curve is non-snaked here!
// This struct inherits from tile_window_with_tile_dstr_base, which is an intermediary base class
// with the ultimate parent class being tile_window_base.
template <typename BottomTensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          typename LinearBottomDims_>
struct tile_window_linear
    : public tile_window_with_tile_dstr_base<tile_window_linear<BottomTensorView_,
                                                                WindowLengths_,
                                                                StaticTileDistribution_,
                                                                LinearBottomDims_>,
                                             BottomTensorView_,
                                             WindowLengths_,
                                             StaticTileDistribution_>
{
    using Base = tile_window_with_tile_dstr_base<tile_window_linear<BottomTensorView_,
                                                                    WindowLengths_,
                                                                    StaticTileDistribution_,
                                                                    LinearBottomDims_>,
                                                 BottomTensorView_,
                                                 WindowLengths_,
                                                 StaticTileDistribution_>;

    using LinearBottomDims = remove_cvref_t<LinearBottomDims_>;

    static_assert(LinearBottomDims::size() == Base::BottomTensorView::get_num_of_dimension());

    static constexpr auto I0 = number<0>{};
    static constexpr auto I1 = number<1>{};

    struct traits
    {
        private:
        static constexpr auto get_num_non_linear_access()
        {
            constexpr auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
            using ys_to_rhs_major =
                typename decltype(typename Base::TileDstr{}
                                      .get_static_tile_distribution_encoding())::Ys2RHsMajor;

            constexpr auto non_linear = [&]() {
                index_t cnt = 1;
                static_for<0, Base::NDimY, 1>{}([&](auto i_dim_y) {
                    constexpr auto rhs_major    = ys_to_rhs_major{}[i_dim_y];
                    constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
                    if constexpr(LinearBottomDims{}[target_h_dim] == 0)
                    {
                        cnt *= sfc_access_lens[i_dim_y];
                    }
                });
                return cnt;
            }();

            return non_linear;
        }

        // example:
        // non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 accesses, 2 registers
        // used in total
        //   -> histogram : sequence<4, 4>
        //   -> prefixsum : sequence<0, 4, 8>
        // non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 accesses, 8 registers
        // used in total, will pre-cache 8
        //   -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1>
        //   -> prefixsum : sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>
        // non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 accesses, 4 registers
        // used in total, will pre-cache 4
        //   -> histogram : sequence<2, 2, 2, 2>
        //   -> prefixsum : sequence<0, 2, 4, 6, 8>
        static constexpr auto get_non_linear_access_map()
        {
            constexpr auto sfc_access_lens = Base::Traits::SFC_Ys::access_lengths;
            using ys_to_rhs_major =
                typename decltype(typename Base::TileDstr{}
                                      .get_static_tile_distribution_encoding())::Ys2RHsMajor;
            constexpr auto non_linear_map = [&]() {
                array<index_t, Base::Traits::NumAccess> m_{0};
                array<index_t, Base::Traits::NumAccess> current_m_{0};
                index_t cumulative_len_            = 1;
                index_t cumulative_non_linear_len_ = 1;
                static_for<0, Base::NDimY, 1>{}([&](auto i_y) {
                    constexpr auto i_dim_y = number<Base::NDimY - i_y - 1>{}; // from right to left
                    constexpr auto rhs_major     = ys_to_rhs_major{}[i_dim_y];
                    constexpr auto target_h_dim  = number<rhs_major - 1>{}; // no r dim here!
                    constexpr auto is_linear_dim = LinearBottomDims{}[target_h_dim];

                    constexpr auto current_len_ = sfc_access_lens[i_dim_y];

                    // copy the pattern accumulated so far
                    for(auto i_ = 0; i_ < cumulative_len_; i_++)
                    {
                        current_m_(i_) = m_[i_];
                    }
                    for(auto j_ = 0; j_ < current_len_; j_++)
                    {
                        auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
                        for(auto i_ = 0; i_ < cumulative_len_; i_++)
                        {
                            m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
                        }
                    }
                    cumulative_len_ *= current_len_;
                    if(!is_linear_dim)
                        cumulative_non_linear_len_ *= current_len_;
                });
                return m_;
            }();

            return TO_SEQUENCE(non_linear_map, Base::Traits::NumAccess);
        }
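
        // Worked example (hypothetical shapes): with sfc_access_lens = [2, 4] and the
        // right-most Y dim mapped to a linear bottom dim (LinearBottomDims = seq<0, 1>),
        // the inner 4 accesses reuse one cached coordinate while the outer 2 advance it:
        //   access id : 0 1 2 3 4 5 6 7
        //   map       : 0 0 0 0 1 1 1 1   -> get_num_non_linear_access() == 2
        // With no linear dim at all (seq<0, 0>), the map would be 0 1 2 3 4 5 6 7 and
        // all 8 coordinates would be pre-cached.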

        static constexpr auto get_non_linear_access_histogram()
        {
            constexpr auto m_ = get_non_linear_access_map();

            constexpr auto r_ =
                typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};

            constexpr auto h_ = histogram_sorted_sequence(m_, r_);

            return h_;
        }

        static constexpr auto get_non_linear_access_histogram_prefix_sum()
        {
            constexpr auto h_            = get_non_linear_access_histogram();
            constexpr auto h_prefix_sum_ = prefix_sum_sequence(h_);
            return h_prefix_sum_;
        }

        public:
        static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access();
        using AccessMap_NonLinear       = decltype(get_non_linear_access_map()); // sequence
        using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram());
        using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum());
    };
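
    // Continuing the hypothetical map sequence<0, 0, 0, 0, 1, 1, 1, 1> above:
    // AccessPrefixSum_NonLinear = sequence<0, 4, 8>, so accesses 0 and 4 are the first
    // to touch cached coords 0 and 1 respectively; the constructor below saves a
    // coordinate exactly when AccessPrefixSum_NonLinear[map[i]] == i.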

    static constexpr index_t NumAccess           = Base::Traits::NumAccess;
    static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear;
    using AccessMap_NonLinear       = typename traits::AccessMap_NonLinear;
    using AccessHistogram_NonLinear = typename traits::AccessHistogram_NonLinear;
    using AccessPrefixSum_NonLinear = typename traits::AccessPrefixSum_NonLinear;

    CK_TILE_DEVICE constexpr tile_window_linear() = default;

    CK_TILE_DEVICE constexpr tile_window_linear(
        const typename Base::BottomTensorView& bottom_tensor_view,
        const typename Base::WindowLengths& window_lengths,
        const typename Base::BottomTensorIndex& window_origin,
        const typename Base::TileDstr& tile_distribution)
    {
        this->bottom_tensor_view_ = bottom_tensor_view;
        this->window_lengths_     = window_lengths;
        this->window_origin_      = window_origin;
        this->tile_dstr_          = tile_distribution;
        auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
            tile_distribution.get_ps_ys_to_xs_adaptor(),
            container_concat(
                make_tuple(get_warp_id(), get_lane_id()),
                generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimY>{})));

        typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
            window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();

        auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
            this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);

        // pre-cache the coordinates/flags used by future load/store() calls
        // (might allocate more registers)
        using SFC_Ys = typename Base::Traits::SFC_Ys;

        static_for<0, NumAccess, 1>{}([&](auto i_access) {
            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
            constexpr auto need_save_non_linear_coord =
                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};

            if constexpr(need_save_non_linear_coord)
            {
                cached_coords_(non_linear_id)                = bottom_tensor_thread_coord_tmp;
                cached_window_adaptor_coords_(non_linear_id) = window_adaptor_thread_coord_tmp;
            }

            // TODO: need pad_tensor_view to tell which dims require a flag check.
            // The cached flag is independent of the non-linear coord, but must be
            // updated in move_tile with the proper dims.
            cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
                this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp);

            if constexpr(i_access != (NumAccess - 1))
            {
                constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
                constexpr auto idx_diff_ps_ys = container_concat(
                    generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimP>{}),
                    idx_diff_ys);

                this->move_window_adaptor_and_bottom_tensor_thread_coordinate(
                    window_adaptor_thread_coord_tmp,
                    bottom_tensor_thread_coord_tmp,
                    idx_diff_ps_ys);
            }
        });
    }

    template <index_t i_access>
    static constexpr CK_TILE_DEVICE auto get_bottom_linear_coordinate(number<i_access>)
    {
        using SFC_Ys          = typename Base::Traits::SFC_Ys;
        constexpr auto idx_ys = SFC_Ys::get_index(number<i_access>{});
        using ys_to_rhs_major =
            typename decltype(typename Base::TileDstr{}
                                  .get_static_tile_distribution_encoding())::Ys2RHsMajor;

        constexpr auto modified_idx_ys = generate_tuple(
            [&](auto i_dim_y) {
                constexpr auto rhs_major    = ys_to_rhs_major{}[i_dim_y];
                constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
                if constexpr(LinearBottomDims{}[target_h_dim] == 0)
                {
                    return number<0>{};
                }
                else
                {
                    return number<idx_ys[i_dim_y]>{};
                }
            },
            number<Base::NDimY>{});

        constexpr auto adaptor_ = typename Base::TileDstr{}.get_ps_ys_to_xs_adaptor();
        constexpr auto idx_ =
            container_concat(make_tuple(number<0>{}, number<0>{}), modified_idx_ys);

        return adaptor_.calculate_bottom_index(idx_);
    }
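
    // E.g. (assumed 2D case, LinearBottomDims = seq<0, 1>): for an access whose Y index
    // maps to (h0, h1), the non-linear component h0 is masked to 0 above, so the returned
    // bottom index reflects only the linear dim, a purely compile-time quantity.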

    template <index_t i_access>
    static constexpr CK_TILE_DEVICE index_t get_bottom_linear_offset(number<i_access>)
    {
        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
        constexpr auto is_pure_linear_tensor =
            reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{}) == 1; // all dims linear
        if constexpr(is_pure_linear_tensor)
        {
            // this case is usually an LDS window, where everything is known at compile time.
            // we directly use the BottomTensorView transform to compute the offset, in case of
            // padding
            auto bottom_tensor_coord = make_tensor_coordinate(
                typename Base::BottomTensorView{}.get_tensor_descriptor(), linear_coord);
            return bottom_tensor_coord.get_offset();
        }
        else
        {
            // this case is usually a global window, where the last dim can be linear.
            // we take a shortcut here and use the original TileDstr to compute the linear
            // offset ... hoping that there is no extra padding between the other dims, which
            // makes sense, since such padding would introduce a runtime length (so a linear
            // offset could not be used)
            constexpr index_t linear_offset = [&]() {
                constexpr auto x_idx_ = linear_coord;
                constexpr auto x_len_ = typename Base::TileDstr{}.get_lengths();
                static_assert(x_idx_.size() == x_len_.size());
                constexpr index_t x_dims_ = x_idx_.size();
                index_t cu_stride_        = 1;
                index_t cu_offset_        = 0;
                static_for<0, x_dims_, 1>{}([&](auto i_) {
                    auto r_i_ = number<x_dims_ - i_ - 1>{};
                    cu_offset_ += x_idx_[r_i_] * cu_stride_;
                    cu_stride_ *= x_len_[r_i_];
                });
                return cu_offset_;
            }();
            return linear_offset;
        }
    }
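
    // The fold above is plain row-major arithmetic. Worked with hypothetical
    // lengths x_len_ = (4, 8) and linear_coord = (0, 3):
    //   i_ = 0 (right-most dim): cu_offset_ = 3 * 1 = 3, cu_stride_ = 8
    //   i_ = 1                 : cu_offset_ = 3 + 0 * 8 = 3
    // i.e. offset = x0 * len1 + x1, which ends up as the instruction's immediate.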

    template <index_t i_access = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE auto load(number<i_access> = {}, bool_constant<oob_conditional_check> = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess = number<i_access_>{};

            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);

            // read from bottom tensor
            const vector_t vec_value =
                this->get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
                    bottom_tensor_thread_coord,
                    linear_offset,
                    bottom_tensor_flag,
                    bool_constant<oob_conditional_check>{});

            // data index [y0, y1, ...]
            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
            // write into distributed tensor
            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
                                                              : idx_diff_ys[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

                dst_tensor.get_thread_buffer().template at<d>() =
                    vec_value
                        .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
            });
        };

        WINDOW_DISPATCH_ISSUE();

        return dst_tensor;
    }
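
    // Typical use (illustrative; `win` is a hypothetical window over a global view):
    //   auto tile = win.load();             // issue all NumAccess accesses
    //   auto head = win.load(number<0>{});  // issue only the first access
    // The result is a static_distributed_tensor holding this thread's elements.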

    template <typename DstTile, index_t i_access = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE auto load(DstTile& dst_tensor,
                             number<i_access>                     = {},
                             bool_constant<oob_conditional_check> = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        // auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess = number<i_access_>{};

            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);

            // read from bottom tensor
            const vector_t vec_value =
                this->get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
                    bottom_tensor_thread_coord,
                    linear_offset,
                    bottom_tensor_flag,
                    bool_constant<oob_conditional_check>{});
            // data index [y0, y1, ...]
            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
            // write into distributed tensor
            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_diff_ys[jj] + j)
                                                              : idx_diff_ys[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

                dst_tensor.get_thread_buffer().template at<d>() =
                    vec_value
                        .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
            });
        };

        WINDOW_DISPATCH_ISSUE();

        return dst_tensor;
    }

    template <typename DstTile,
              index_t i_access           = -1,
              bool oob_conditional_check = true,
              bool pre_nop               = false>
    CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
                                 number<i_access> = {}, // negative means loop over all num_access
                                 bool_constant<oob_conditional_check> = {},
                                 bool_constant<pre_nop>               = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;
        static constexpr index_t YElementSize =
            typename Base::TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
        static_assert(YElementSize % (Base::Traits::PackedSize * Base::Traits::ScalarPerVector) ==
                      0);
        using vectorized_tbuf =
            array<vector_t,
                  YElementSize / (Base::Traits::PackedSize * Base::Traits::ScalarPerVector)>;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        auto& dst_vec_tbuf = reinterpret_cast<vectorized_tbuf&>(dst_tensor.get_thread_buffer());

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess  = number<i_access_>{};
            constexpr auto pre_nop_ = [&]() {
                if constexpr(pre_nop && i_access_ == 0 &&
                             Base::BottomTensorView::buffer_view::get_address_space() ==
                                 address_space_enum::global)
                    return bool_constant<true>{};
                else
                    return bool_constant<false>{};
            }();

            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
            constexpr index_t d =
                tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
                Base::Traits::PackedSize;
            static_assert(d % Base::Traits::ScalarPerVector == 0);

            this->get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
                dst_vec_tbuf.template at<d / Base::Traits::ScalarPerVector>(),
                bottom_tensor_thread_coord,
                linear_offset,
                bottom_tensor_flag,
                bool_constant<oob_conditional_check>{},
                pre_nop_);
#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
    CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
            asm volatile(""); // this also occurs starting from rocm-6.2 with the same symptom,
                              // so the flag is reused
#endif
        };

        WINDOW_DISPATCH_ISSUE();
    }

    // TODO: currently async load is only implemented in inline asm
    template <typename LdsTileWindow_,
              index_t i_access           = -1,
              bool oob_conditional_check = true,
              bool pre_nop               = false>
    CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
                                       number<i_access>                     = {},
                                       bool_constant<oob_conditional_check> = {},
                                       bool_constant<pre_nop>               = {}) const
    {
        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
        using LdsDataType   = typename LdsTileWindow::DataType;

        // currently we only support the case where every dim is non-linear;
        // a linear (e.g. fast-changing) dim would not be performant here anyway
        static_assert(NumAccess_NonLinear == NumAccess);
        static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
                      address_space_enum::global);

        // issues * warps * lanes
        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded

        const index_t size_per_buf =
            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
                make_tuple(number<0>{}, number<0>{}, number<0>{})) *
            sizeof(LdsDataType);

        const index_t size_per_wave =
            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
                make_tuple(number<0>{}, number<1>{}, number<0>{})) *
                sizeof(LdsDataType) -
            size_per_buf;

        const index_t size_per_issue =
            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
                make_tuple(number<1>{}, number<0>{}, number<0>{})) *
                sizeof(LdsDataType) -
            size_per_buf;

        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
        m0_set_with_memory(
            amd_wave_read_first_lane(m0_init_value)); // this should be wave independent

        using vector_t = typename Base::Traits::vector_t;

        LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;

        // loop over thread tensor space [y0, y1, ...]
        auto issue = [&](auto i_access_) {
            constexpr auto IAccess  = number<i_access_>{};
            constexpr auto pre_nop_ = [&]() {
                if constexpr(pre_nop && i_access_ == 0)
                    return bool_constant<true>{};
                else
                    return bool_constant<false>{};
            }();

            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            auto bottom_tensor_flag         = cached_flags_[IAccess]; // get this flag anyway

            // read from bottom tensor
            this->get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
                smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);

            // move thread coordinate
            if constexpr(i_access_ != (NumAccess - 1))
            {
                m0_inc_with_memory(size_per_issue);
            }
        };

        WINDOW_DISPATCH_ISSUE();
    }

    template <typename LdsTileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
                                   number<i_access>                     = {},
                                   bool_constant<oob_conditional_check> = {}) const
    {
        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
        using LdsDataType   = typename LdsTileWindow::DataType;
        using vector_t      = typename traits::vector_t;

        static_assert(NumAccess_NonLinear == NumAccess, "Unsupported configuration");
        static_assert(Base::BottomTensorView::buffer_view::get_address_space() ==
                          address_space_enum::global,
                      "Requires global memory");

        // Precompute invariant values outside the lambda
        const auto window_origin       = lds_tile.get_window_origin();
        const auto& bottom_tensor_view = lds_tile.get_bottom_tensor_view();
        const auto& tensor_descriptor  = bottom_tensor_view.get_tensor_descriptor();
        auto smem_base_ptr             = bottom_tensor_view.get_buffer_view().p_data_;

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess       = number<i_access_>{};
            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};

            // Use precomputed values
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            auto window_adaptor_coord       = cached_window_adaptor_coords_[non_linear_id];
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            auto lds_bottom_tensor_thread_idx =
                window_origin + window_adaptor_coord.get_bottom_index();
            const auto lds_coord =
                make_tensor_coordinate(tensor_descriptor, lds_bottom_tensor_thread_idx);

            CK_TILE_LDS_ADDR LdsDataType* smem = smem_base_ptr + lds_coord.get_offset();

            // Read from bottom tensor
            this->get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
                smem,
                bottom_tensor_thread_coord,
                0,
                bottom_tensor_flag,
                bool_constant<oob_conditional_check>{});
        };

        WINDOW_DISPATCH_ISSUE();
    }
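
    // Illustrative pipeline (hypothetical names): async_load() issues the
    // global->LDS copies for this window; the kernel must then wait for the
    // outstanding loads and synchronize before reading the LDS tile, e.g.
    //   win.async_load(lds_window);
    //   /* wait on the outstanding vector loads, then an LDS barrier */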

    template <typename Policy, index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE auto load_transpose() const
    {
        constexpr auto tile_dstr = typename Base::TileDstr{};
        auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
        this->template load_transpose_linear<Policy>(
            dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
        return dst_tensor;
    }

    template <typename Policy,
              typename DistributedTensor,
              index_t i_access           = -1,
              bool oob_conditional_check = true>
    CK_TILE_DEVICE auto load_transpose_linear(DistributedTensor& dst_tensor,
                                              number<i_access>                     = {},
                                              bool_constant<oob_conditional_check> = {}) const
    {
        using vector_t = typename traits::vector_t;
        using SFC_Ys   = typename traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        constexpr auto group_func = Policy::group_func;

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess          = number<i_access_>{};
            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);

            // read from bottom tensor
            const vector_t vec_value =
                this->get_bottom_tensor_view()
                    .template get_transpose_vectorized_elements<vector_t>(
                        bottom_tensor_thread_coord, 0);
            // write into distributed tensor
            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t linear_distributed_index =
                    tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
                dst_tensor.get_thread_buffer().template at<linear_distributed_index>() =
                    vec_value.template get_as<typename Base::DataType>()[j];
            });
        };
        WINDOW_DISPATCH_ISSUE();
    }

    template <index_t i_access = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE void store(const static_distributed_tensor<typename Base::DataType,
                                                              typename Base::TileDstr>& dstr_tensor,
                              number<i_access>                     = {},
                              bool_constant<oob_conditional_check> = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        // loop over thread tensor space [y0, y1, ...]
        auto issue = [&](auto i_access_) {
            constexpr auto IAccess          = number<i_access_>{};
            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
            auto bottom_tensor_flag         = cached_flags_[IAccess];
            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);

            // read from distributed tensor
            vector_t vec_value;

            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                              : idx_ys_start[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

            // write into bottom tensor
            this->get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
                bottom_tensor_thread_coord,
                linear_offset,
                bottom_tensor_flag,
                vec_value,
                bool_constant<oob_conditional_check>{});
        };

        WINDOW_DISPATCH_ISSUE();
    }
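
    // Illustrative round trip between two windows sharing one distribution
    // (hypothetical names):
    //   auto tile = src_win.load();
    //   dst_win.store(tile); // same SFC order; cached offsets/flags are reused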

    template <index_t i_access = -1>
    CK_TILE_DEVICE void
    store_raw(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
                  dstr_tensor,
              number<i_access> = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr                    = typename Base::TileDstr{};
        static constexpr bool oob_conditional_check = true;

        // loop over thread tensor space [y0, y1, ...]
        auto issue = [&](auto i_access_) {
            constexpr auto IAccess          = number<i_access_>{};
            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);

            // read from distributed tensor
            vector_t vec_value;
            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                              : idx_ys_start[jj];
                    },
                    number<Base::NDimY>{});
                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;
                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

            // write into bottom tensor
            this->get_bottom_tensor_view()
                .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
                    bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
        };

        WINDOW_DISPATCH_ISSUE();
    }

    template <index_t i_access = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE void
    update(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
               dstr_tensor,
           number<i_access>                     = {},
           bool_constant<oob_conditional_check> = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        // loop over thread tensor space [y0, y1, ...]
        auto issue = [&](auto i_access_) {
            constexpr auto IAccess          = number<i_access_>{};
            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);

            // read from distributed tensor
            vector_t vec_value;

            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                              : idx_ys_start[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

            // write into bottom tensor
            this->get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
                bottom_tensor_thread_coord,
                linear_offset,
                bottom_tensor_flag,
                vec_value,
                bool_constant<oob_conditional_check>{});
        };

        WINDOW_DISPATCH_ISSUE();
    }

    template <index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
    CK_TILE_DEVICE void
    update_raw(const static_distributed_tensor<typename Base::DataType, typename Base::TileDstr>&
                   dstr_tensor,
               number<i_access>                     = {},
               bool_constant<oob_conditional_check> = {},
               bool_constant<pre_nop>               = {}) const
    {
        using vector_t = typename Base::Traits::vector_t;
        using SFC_Ys   = typename Base::Traits::SFC_Ys;

        constexpr auto tile_dstr = typename Base::TileDstr{};

        // loop over thread tensor space [y0, y1, ...]
        auto issue = [&](auto i_access_) {
            constexpr auto IAccess          = number<i_access_>{};
            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
            auto bottom_tensor_flag         = cached_flags_[IAccess];

            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);

            // read from distributed tensor
            vector_t vec_value;

            static_for<0, Base::Traits::ScalarPerVector, Base::Traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == Base::Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                              : idx_ys_start[jj];
                    },
                    number<Base::NDimY>{});

                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

            // write into bottom tensor
            this->get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
                bottom_tensor_thread_coord,
                linear_offset,
                bottom_tensor_flag,
                vec_value,
                bool_constant<oob_conditional_check>{},
                bool_constant<pre_nop>{});
        };

        WINDOW_DISPATCH_ISSUE();
    }
    // *_extended() functions act like virtual functions with a default implementation existing
    // in the base class
    CK_TILE_DEVICE void move_extended(const typename Base::BottomTensorIndex& step)
    {
        static_for<0, NumAccess, 1>{}([&](auto i_access) {
            constexpr auto IAccess       = number<i_access>{};
            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
            constexpr auto need_update_non_linear_coord =
                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};

            if constexpr(need_update_non_linear_coord)
            {
                move_tensor_coordinate(this->bottom_tensor_view_.get_tensor_descriptor(),
                                       cached_coords_(non_linear_id),
                                       step);
            }

            // move the current coord with linear_coords
            auto tmp_coords             = cached_coords_[non_linear_id];
            constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess);
            move_tensor_coordinate(
                this->bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord);

            cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
                this->bottom_tensor_view_.get_tensor_descriptor(), tmp_coords);
        });
    }
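
    // E.g. stepping a 2D window by one tile along the last dim (illustrative;
    // kNPerTile is a hypothetical step):
    //   move_tile_window(win, {0, kNPerTile});
    // Only the NumAccess_NonLinear cached coords are moved; every per-access OOB
    // flag is then recomputed at coord + linear offset.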

    CK_TILE_DEVICE void set_window_origin_extended(const typename Base::BottomTensorIndex&)
    {
        auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
            typename Base::TileDstr{}.get_ps_ys_to_xs_adaptor(),
            container_concat(
                make_tuple(get_warp_id(), get_lane_id()),
                generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimY>{})));

        typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
            this->window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();

        auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
            this->bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);

        // pre-cache the coordinates used by future load/store() calls
        // (might allocate more registers)
        using SFC_Ys = typename Base::Traits::SFC_Ys;

        static_for<0, NumAccess, 1>{}([&](auto i_access) {
            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
            constexpr auto need_save_non_linear_coord =
                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};

            if constexpr(need_save_non_linear_coord)
            {
                cached_coords_(non_linear_id)                = bottom_tensor_thread_coord_tmp;
                cached_window_adaptor_coords_(non_linear_id) = window_adaptor_thread_coord_tmp;
            }

            if constexpr(i_access != (NumAccess - 1))
            {
                constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
                constexpr auto idx_diff_ps_ys = container_concat(
                    generate_tuple([&](auto) { return number<0>{}; }, number<Base::NDimP>{}),
                    idx_diff_ys);

                this->move_window_adaptor_and_bottom_tensor_thread_coordinate(
                    window_adaptor_thread_coord_tmp,
                    bottom_tensor_thread_coord_tmp,
                    idx_diff_ps_ys);
            }
        });
    }

    // per-thread caches: one coordinate per non-linear access, plus a validity
    // flag per access
    array<typename Base::BottomTensorCoord, traits::NumAccess_NonLinear> cached_coords_;
    array<typename Base::WindowAdaptorCoord, traits::NumAccess_NonLinear>
        cached_window_adaptor_coords_;
    array<bool, Base::Traits::NumAccess> cached_flags_;
};

#undef WINDOW_DISPATCH_ISSUE

namespace impl {
template <address_space_enum, index_t len_>
struct default_linear_bottom_dims_impl
{
    using type = typename uniform_sequence_gen<len_, 0>::type;
};

template <index_t len_>
struct default_linear_bottom_dims_impl<address_space_enum::global, len_>
{
    // global defaults to seq<0, 0, ..., 1>
    using type = typename sequence_merge<typename uniform_sequence_gen<len_ - 1, 0>::type,
                                         sequence<1>>::type;
};

template <index_t len_>
struct default_linear_bottom_dims_impl<address_space_enum::lds, len_>
{
    // lds defaults to seq<1, 1, ..., 1>
    using type = typename uniform_sequence_gen<len_, 1>::type;
};
} // namespace impl
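
// Resulting defaults for an illustrative 2D view:
//   global view -> seq<0, 1>: only the last dim is linear (immediate offsets there)
//   lds view    -> seq<1, 1>: every dim is linear, so a single vgpr holds the base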

template <typename TensorView_>
using default_linear_bottom_dims =
    typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
                                                   TensorView_::get_num_of_dimension()>::type;

// If using this API, a tile_window_linear will be created.
// This structure gets the chance to use immediate values and save registers.
// LinearBottomDims_ must be passed in properly to control which dim is linear,
// so that a constexpr offset can be generated as the linear_offset for that dim
// (and finally passed to the immediate offset of the buffer/lds instruction).
//
// Note: there is no internal check for which dim is OK to use a linear offset;
// users must make sure of this themselves.
//
// e.g.
// 2d global matrix: set LinearBottomDims_=seq<0, 1>, and the last dim will generate an
// immediate offset if each thread has multiple issues along the last dim.
//
// 2d LDS buffer: set LinearBottomDims_=seq<1, 1>, and only one vgpr is used as the offset;
// everything else just uses immediate offsets.
//
template <typename TensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE constexpr auto
make_tile_window_linear(const TensorView_& tensor_view,
                        const WindowLengths_& window_lengths,
                        const multi_index<TensorView_::get_num_of_dimension()>& origin,
                        const StaticTileDistribution_& tile_distribution,
                        LinearBottomDims_ = {})
{
    static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
    return tile_window_linear<remove_cvref_t<TensorView_>,
                              remove_cvref_t<WindowLengths_>,
                              remove_cvref_t<StaticTileDistribution_>,
                              remove_cvref_t<LinearBottomDims_>>{
        tensor_view, window_lengths, origin, tile_distribution};
}
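
// Illustrative call (view/distribution names are hypothetical):
//   auto win = make_tile_window_linear(a_view,
//                                      make_tuple(number<64>{}, number<64>{}),
//                                      {i_m, 0},
//                                      a_dram_dist,
//                                      sequence<0, 1>{});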

template <
    typename TileWindow_,
    typename StaticTileDistribution_,
    typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto
make_tile_window_linear(const TileWindow_& tile_window,
                        const StaticTileDistribution_& tile_distribution,
                        LinearBottomDims_ = {})
{
    return make_tile_window_linear(tile_window.get_bottom_tensor_view(),
                                   tile_window.get_window_lengths(),
                                   tile_window.get_window_origin(),
                                   tile_distribution,
                                   LinearBottomDims_{});
}
1023 
1024 // this version must not be called under a constexpr context
1025 template <typename TensorView_,
1026  typename WindowLengths_,
1027  typename StaticTileDistribution_,
1028  typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
1029 CK_TILE_DEVICE auto
1031  const WindowLengths_& window_lengths,
1032  const multi_index<TensorView_::get_num_of_dimension()>& origin,
1033  const StaticTileDistribution_& tile_distribution,
1034  LinearBottomDims_ = {})
1035 {
1036  static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
1037  auto w = tile_window_linear<remove_cvref_t<TensorView_>,
1038  remove_cvref_t<WindowLengths_>,
1039  remove_cvref_t<StaticTileDistribution_>,
1040  remove_cvref_t<LinearBottomDims_>>{
1041  tensor_view, window_lengths, origin, tile_distribution};
1042  w.init_raw();
1043  return w;
1044 }
1045 
1046 template <
1047  typename TileWindow_,
1048  typename StaticTileDistribution_,
1049  typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
1050 CK_TILE_DEVICE constexpr auto
1051 make_tile_window_linear_raw(const TileWindow_& tile_window,
1052  const StaticTileDistribution_& tile_distribution,
1053  LinearBottomDims_ = {})
1054 {
1055  return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(),
1056  tile_window.get_window_lengths(),
1057  tile_window.get_window_origin(),
1058  tile_distribution,
1059  LinearBottomDims_{});
1060 }

template <typename TensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          typename LinearBottomDims_>
CK_TILE_DEVICE void
move_tile_window(tile_window_linear<TensorView_,
                                    WindowLengths_,
                                    StaticTileDistribution_,
                                    LinearBottomDims_>& window,
                 const typename tile_window_linear<TensorView_,
                                                   WindowLengths_,
                                                   StaticTileDistribution_,
                                                   LinearBottomDims_>::BottomTensorIndex& step)
{
    window.move(step);
}

/**
 * @brief Type trait to determine if a type is a linear tile window.
 */
template <typename T>
struct is_tile_window_linear : std::false_type
{
};

template <typename BottomTensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          typename LinearBottomDims_>
struct is_tile_window_linear<tile_window_linear<BottomTensorView_,
                                                WindowLengths_,
                                                StaticTileDistribution_,
                                                LinearBottomDims_>> : std::true_type
{
};

/**
 * @brief Helper variable template to check if a type is a linear tile window.
 */
template <typename T>
inline constexpr bool is_tile_window_linear_v = is_tile_window_linear<T>::value;

} // namespace ck_tile