/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp Source File
reduce2d_kernel.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
6 #include "ck_tile/core.hpp"
7 #include "ck_tile/ops/common.hpp"
10 
11 // Reduce2d Kernel:
12 // =======================================
13 // This kernel implements a 2D reduction operation that reduces data along the second dimension
14 // of a matrix. The reduction is performed in multiple hierarchical stages.
15 
16 namespace ck_tile {
17 
18 template <typename Problem_, typename Policy_ = Reduce2dDefaultPolicy>
20 {
23 
27 
28  static constexpr index_t kBlockSize = Problem::BlockShape::BlockSize;
29  CK_TILE_HOST static constexpr auto BlockSize()
30  {
31  return is_wave32() ? kBlockSize / 2 : kBlockSize;
32  }
33 
34  private:
35  // Helper function to calculate optimal vector size for input tensor
36  template <typename ReduceDims, index_t Rank, index_t NumReduceDim>
37  static constexpr index_t CalculateInputVectorSize()
38  {
39  using S = typename Problem::BlockShape;
40  constexpr index_t memory_vector_size = 16 / sizeof(XDataType);
41  constexpr index_t thread_tile_vector_size = S::ThreadTile_N;
42 
43  // Check if innermost reduce dimension is the last dimension (stride 1).
44  constexpr index_t innermost_reduce_dim = ReduceDims::at(number<NumReduceDim - 1>{});
45  constexpr bool is_innermost_contiguous = (innermost_reduce_dim == Rank - 1);
46 
47  // If innermost reduce dimension is not the last dim (not contiguous), limit vectorization
48  constexpr index_t stride_based_vector_size =
49  is_innermost_contiguous ? ck_tile::min(memory_vector_size, thread_tile_vector_size) : 1;
50 
51  return stride_based_vector_size;
52  }
53 
54  // Helper function to calculate optimal vector size for output tensor
55  static constexpr index_t CalculateOutputVectorSize()
56  {
57  using S = typename Problem::BlockShape;
58  constexpr index_t memory_vector_size = 16 / sizeof(YDataType);
59  constexpr index_t thread_tile_vector_size = S::ThreadTile_M;
60  constexpr index_t vector_size = ck_tile::min(memory_vector_size, thread_tile_vector_size);
61 
62  return vector_size;
63  }
64 
65  public:
// Kernel entry point: reduces the input tensor over Problem::ReduceDims with
// Problem::ReduceOp and stores the result, cast to YDataType, through p_y.
//
// NOTE(review): listing line 67 is missing from this extract. According to the
// cross-reference index at the end of this page the full signature is
//   CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y,
//                                  InputShape input_shape, InputStrides input_strides) const
//
// p_x           - input data; wrapped below in a global-address-space buffer view
// p_y           - output data; wrapped below in a global-address-space tensor view
// input_shape   - lengths of the input tensor, indexed with compile-time numbers
// input_strides - strides of the input tensor, forwarded to the naive descriptor
66  template <typename InputShape, typename InputStrides>
68  YDataType* p_y,
69  InputShape input_shape,
70  InputStrides input_strides) const
71  {
72  using S = typename Problem::BlockShape;
// Offset of this block's tile along the merged kept ("M") axis: one Block_M-sized step per block id.
73  const auto iM = get_block_id() * S::Block_M;
74 
// Every input dimension must be listed exactly once across KeptDim and ReduceDims.
75  static_assert(Problem::KeptDim::size() + Problem::ReduceDims::size() == Problem::Rank,
76  "Size of kept dimensions + reduced dimensions must equal input tensor rank");
77 
78  // Gather the input lengths of the kept dimensions and of the reduced dimensions into two tuples
79  const auto kept_lens = [&]() {
80  return generate_tuple(
81  [&](auto I) { return input_shape.at(number<Problem::KeptDim::at(I)>{}); },
82  number<Problem::KeptDim::size()>{});
83  }();
84  const auto reduce_lens = [&]() {
85  return generate_tuple(
86  [&](auto I) { return input_shape.at(number<Problem::ReduceDims::at(I)>{}); },
87  number<Problem::ReduceDims::size()>{});
88  }();
89 
// One merge transform per group: all kept dims collapse into one axis, all reduced dims into another.
90  const auto kept_merge_transform = make_merge_transform(kept_lens);
91  const auto reduce_merge_transform = make_merge_transform(reduce_lens);
92 
// The reduce op's identity value (computed in ComputeDataType, converted to XDataType) is handed
// to the input buffer view below.
// NOTE(review): presumably this is the value returned for padded / out-of-bounds reads so that
// they do not affect the reduction — confirm against make_buffer_view.
93  auto reduce_func = typename Problem::ReduceOp{};
94  const XDataType custom_padding_value =
95  type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());
96 
97  // Compile-time vector width for the input tensor (1 unless the last reduce dim is dim Rank - 1)
98  constexpr auto x_tensor_vector_size = CalculateInputVectorSize<typename Problem::ReduceDims,
99  Problem::Rank,
100  Problem::NumReduceDim>();
101 
102  // Describe the input layout from the caller-supplied shape and strides
103  auto desc = make_naive_tensor_descriptor(
104  input_shape, input_strides, number<x_tensor_vector_size>{});
105 
106  // Create a global-memory buffer view over p_x carrying the custom padding value
107  auto buffer_view = make_buffer_view<address_space_enum::global>(
108  p_x, desc.get_element_space_size(), custom_padding_value);
109 
110  // Combine buffer view and descriptor into the input tensor view, then build the 2D view
// NOTE(review): listing lines 113, 117 and 118 are missing from this extract, so the call
// below is incomplete as shown. The index at the end of this page lists transform_tensor_view
// and a three-argument pad_tensor_view(tensor_view, tile_lengths, DoPads); restore the call
// from the original header rather than from this listing.
111  const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
112  const auto transformed_x_tensor = pad_tensor_view(
114  x_tensor,
115  make_tuple(kept_merge_transform, reduce_merge_transform),
116  make_tuple(typename Problem::KeptDim{}, typename Problem::ReduceDims{}),
119  sequence<0, 1>{});
120 
121  // Output strides are derived from kept_lens only (the caller passes no output strides)
122  const auto kept_strides = [&]() {
123  return generate_tuple(
124  [&](auto I) {
125  // Stride of kept dimension I = product of the lengths of all kept dimensions after I
126  index_t stride = 1;
127  static_for<I + 1, Problem::KeptDim::size(), 1>{}(
128  [&](auto J) { stride *= kept_lens.at(number<J>{}); });
129  return stride;
130  },
131  number<Problem::KeptDim::size()>{});
132  }();
133 
134  // Compile-time vector width for the output tensor
135  constexpr auto y_tensor_vector_size = CalculateOutputVectorSize();
136 
// Output tensor view over p_y with one dimension per kept input dimension.
137  const auto y_m = make_naive_tensor_view<address_space_enum::global>(
138  p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{});
139 
140  // Transform output tensor to 1D merged view
141  // This creates a view compatible with the 2D reduction pattern
// NOTE(review): listing line 146 (the final argument and closing of this call) is missing
// from this extract.
142  const auto y_merged = transform_tensor_view(
143  y_m,
144  make_tuple(kept_merge_transform),
145  make_tuple(typename arithmetic_sequence_gen<0, Problem::KeptDim::size(), 1>::type{}),
147 
// Input window starting at {iM, 0} with the policy's X tile distribution.
// NOTE(review): listing line 149 (the window-lengths argument) is missing from this extract.
148  auto x_window = make_tile_window(transformed_x_tensor,
150  {iM, 0},
151  Policy::template MakeXBlockTileDistribution<Problem>());
152 
// Output window of Block_M elements starting at iM on the merged output view.
153  auto y_window = make_tile_window(y_merged, make_tuple(number<S::Block_M>{}), {iM});
154 
// Shared-memory scratch sized by the policy; passed to the cross-warp step below.
155  __shared__ char smem[Policy::template GetSmemSize<Problem>()];
156 
157  // Get the merged dimension size from the transformed tensor
158  const auto merged_reduce_len =
159  transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
// Number of Block_N-wide steps needed to cover the merged reduce axis (rounded up).
160  index_t num_n_tile_iteration =
161  amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
162 
// Reduction stages supplied by the policy.
// NOTE(review): judging by the names, the first accumulates loaded tiles, the second combines
// within a warp and the third combines across warps — confirm against the policy.
163  auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
164  auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
165  auto block_reduce2d_cross_warp_sync =
166  Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
167 
// Accumulator tile, initialised to the reduce op's identity value in ComputeDataType.
168  using XTensorType = decltype(load_tile(x_window));
169  auto y_compute = block_reduce2d.template MakeYBlockTile<XTensorType>();
170  set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
171 
// Walk the reduce axis: load one tile, fold it into y_compute, advance the window by Block_N.
172  for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
173  {
174  const auto x = load_tile(x_window);
175  block_reduce2d(x, y_compute, reduce_func);
176  move_tile_window(x_window, {0, S::Block_N});
177  }
178 
// Finish the reduction of y_compute; the cross-warp step uses the shared-memory scratch.
179  block_reduce2d_sync(y_compute, reduce_func);
180  block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
181 
// Convert the accumulator to YDataType and write it through the output window.
182  store_tile(y_window, cast_tile<YDataType>(y_compute));
183  }
184 };
185 
186 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_HOST
Definition: config.hpp:44
Definition: cluster_descriptor.hpp:13
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor(const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{})
Definition: tensor_descriptor.hpp:274
constexpr CK_TILE_HOST_DEVICE auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:145
CK_TILE_DEVICE void set_tile(DstrTensors &dstr_tensor, const T &value)
Definition: tile_elementwise.hpp:95
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:36
constexpr CK_TILE_HOST_DEVICE auto transform_tensor_view(const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition: tensor_view.hpp:526
constexpr CK_TILE_HOST_DEVICE auto make_merge_transform(const LowLengths &low_lengths)
Definition: coordinate_transform.hpp:1690
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto pad_tensor_view(const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads)
Definition: tensor_view.hpp:545
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constexpr CK_TILE_DEVICE auto make_tile_window(null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
Definition: null_tile_window.hpp:75
CK_TILE_DEVICE void move_tile_window(null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
Definition: null_tile_window.hpp:95
constexpr CK_TILE_HOST_DEVICE auto generate_tuple(F &&f, number< N >)
Definition: tuple.hpp:429
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:360
CK_TILE_DEVICE void store_tile(tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
Definition: store_tile.hpp:24
constexpr CK_TILE_HOST_DEVICE T min(T x)
Definition: math.hpp:206
CK_TILE_DEVICE auto load_tile(const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
Definition: load_tile.hpp:36
Definition: reduce2d_kernel.hpp:20
ck_tile::remove_cvref_t< typename Problem::XDataType > XDataType
Definition: reduce2d_kernel.hpp:24
static constexpr index_t kBlockSize
Definition: reduce2d_kernel.hpp:28
ck_tile::remove_cvref_t< typename Problem::YDataType > YDataType
Definition: reduce2d_kernel.hpp:26
static constexpr CK_TILE_HOST auto BlockSize()
Definition: reduce2d_kernel.hpp:29
ck_tile::remove_cvref_t< typename Problem::ComputeDataType > ComputeDataType
Definition: reduce2d_kernel.hpp:25
CK_TILE_DEVICE void operator()(const XDataType *p_x, YDataType *p_y, InputShape input_shape, InputStrides input_strides) const
Definition: reduce2d_kernel.hpp:67
ck_tile::remove_cvref_t< Problem_ > Problem
Definition: reduce2d_kernel.hpp:21
ck_tile::remove_cvref_t< Policy_ > Policy
Definition: reduce2d_kernel.hpp:22
Definition: sequence.hpp:298
Definition: buffer_view.hpp:35
Definition: integral_constant.hpp:13
Definition: sequence.hpp:49
Definition: functional.hpp:43
Definition: tensor_view.hpp:41