include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp Source File

include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp Source File#

Composable Kernel: include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp Source File
topk_softmax_kernel.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #include "ck_tile/core.hpp"
7 #include "ck_tile/ops/common.hpp"
10 #include <string>
11 #include <type_traits>
12 
13 namespace ck_tile {
14 
16 {
17  const void* p_input;
18  void* p_output;
19  void* p_indices;
23  index_t stride_input; // row stride for input, at least experts
24  index_t stride_output; // row stride for output/indices, at least tpok
25 };
26 
27 template <typename Pipeline_>
29 {
32 
33  using InputType = typename Problem::InputType;
34  using WeightType = typename Problem::WeightType;
35  using IndexType = typename Problem::IndexType;
36 
38  {
39  const void* p_input;
40  void* p_output;
41  void* p_indices;
45  index_t stride_input; // row stride for input, at least experts
46  index_t stride_output; // row stride for output/indices, at least tpok
47  };
48 
51 
52  CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
53  {
54  if constexpr(Problem::LaunchType > 0)
55  {
56  int num_cu = [&]() {
57  hipDeviceProp_t dev_prop;
58  hipDevice_t dev;
59  HIP_CHECK_ERROR(hipGetDevice(&dev));
60  HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
61  return dev_prop.multiProcessorCount;
62  }();
63  return dim3(num_cu * Problem::LaunchType);
64  }
65  else
66  {
67  const int num_warps = (h.num_rows + Problem::RowsPerWarp - 1) / Problem::RowsPerWarp;
68  const int num_blocks =
69  (num_warps + Problem::WarpsPerBlock - 1) / Problem::WarpsPerBlock;
70  return dim3(num_blocks);
71  }
72  }
73 
74  CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
75  {
76  Kargs k;
77  k.p_input = h.p_input;
78  k.p_output = h.p_output;
79  k.p_indices = h.p_indices;
80  k.num_rows = h.num_rows;
82  k.topk = h.topk;
85  return k;
86  }
87 
88  CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
89 
90  CK_TILE_DEVICE void operator()(Kargs kargs) const
91  {
92  index_t block_row_id = static_cast<index_t>(blockIdx.x * Problem::RowsPerBlock);
93 
94  if(block_row_id > kargs.num_rows)
95  return;
96 
97  index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
98  index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
99  index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
100 
101  const auto input_window = [&]() {
102  const InputType* p_input =
103  reinterpret_cast<const InputType*>(kargs.p_input) + block_os_inp;
104 
105  auto tmp = make_naive_tensor_view<address_space_enum::global>(
106  p_input,
107  make_tuple(num_rows_rem, kargs.num_experts),
108  make_tuple(kargs.stride_input, 1),
110  number<1>{});
111 
112  auto view = pad_tensor_view(
113  tmp,
115  sequence<0, 1>{}); // out-most dim no need pad(leverage oob)
116 
117  return make_tile_window(
118  view,
120  {0, 0});
121  }();
122 
123  auto output_window = [&]() {
124  WeightType* p_output = reinterpret_cast<WeightType*>(kargs.p_output) + block_os_out;
125  auto tmp = make_naive_tensor_view<address_space_enum::global>(
126  p_output,
127  make_tuple(num_rows_rem, kargs.topk),
128  make_tuple(kargs.stride_output, 1),
130  number<1>{});
131  auto view =
132  pad_tensor_view(tmp,
134  sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob)
135  // 2. we loop over topk 1-1, no need padding
136  return make_tile_window(
138  }();
139 
140  auto indices_window = [&]() {
141  IndexType* p_indices = reinterpret_cast<IndexType*>(kargs.p_indices) + block_os_out;
142  auto tmp = make_naive_tensor_view<address_space_enum::global>(
143  p_indices,
144  make_tuple(num_rows_rem, kargs.topk),
145  make_tuple(kargs.stride_output, 1),
147  number<1>{});
148  auto view =
149  pad_tensor_view(tmp,
151  sequence<0, 0>{}); // 1. out-most dim no need pad(leverage oob)
152  // 2. we loop over topk 1-1, no need padding
153  return make_tile_window(
155  }();
156 
157  Pipeline{}(input_window,
158  output_window,
159  indices_window,
160  kargs.num_rows,
161  kargs.num_experts,
162  kargs.topk,
163  block_row_id);
164  }
165 };
166 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:40
#define CK_TILE_HOST
Definition: config.hpp:39
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
#define HIP_CHECK_ERROR(retval_or_funcall)
Definition: hip_check_error.hpp:22
Definition: cluster_descriptor.hpp:13
int32_t index_t
Definition: integer.hpp:9
constexpr CK_TILE_HOST_DEVICE auto pad_tensor_view(const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads)
Definition: tensor_view.hpp:480
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:20
constexpr CK_TILE_DEVICE auto make_tile_window(null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
Definition: null_tile_window.hpp:72
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:337
Definition: topk_softmax_kernel.hpp:16
index_t num_experts
Definition: topk_softmax_kernel.hpp:21
index_t topk
Definition: topk_softmax_kernel.hpp:22
index_t stride_output
Definition: topk_softmax_kernel.hpp:24
const void * p_input
Definition: topk_softmax_kernel.hpp:17
index_t num_rows
Definition: topk_softmax_kernel.hpp:20
void * p_indices
Definition: topk_softmax_kernel.hpp:19
index_t stride_input
Definition: topk_softmax_kernel.hpp:23
void * p_output
Definition: topk_softmax_kernel.hpp:18
Definition: topk_softmax_kernel.hpp:38
const void * p_input
Definition: topk_softmax_kernel.hpp:39
void * p_output
Definition: topk_softmax_kernel.hpp:40
index_t stride_output
Definition: topk_softmax_kernel.hpp:46
index_t stride_input
Definition: topk_softmax_kernel.hpp:45
index_t num_rows
Definition: topk_softmax_kernel.hpp:42
void * p_indices
Definition: topk_softmax_kernel.hpp:41
index_t topk
Definition: topk_softmax_kernel.hpp:44
index_t num_experts
Definition: topk_softmax_kernel.hpp:43
Definition: topk_softmax_kernel.hpp:29
remove_cvref_t< typename Pipeline::Problem > Problem
Definition: topk_softmax_kernel.hpp:31
remove_cvref_t< Pipeline_ > Pipeline
Definition: topk_softmax_kernel.hpp:30
typename Problem::InputType InputType
Definition: topk_softmax_kernel.hpp:33
static constexpr CK_TILE_HOST auto MakeKargs(const Hargs &h)
Definition: topk_softmax_kernel.hpp:74
static constexpr CK_TILE_HOST auto GridSize(const Hargs &h)
Definition: topk_softmax_kernel.hpp:52
static constexpr CK_TILE_HOST_DEVICE auto BlockSize()
Definition: topk_softmax_kernel.hpp:88
typename Problem::WeightType WeightType
Definition: topk_softmax_kernel.hpp:34
CK_TILE_DEVICE void operator()(Kargs kargs) const
Definition: topk_softmax_kernel.hpp:90
typename Problem::IndexType IndexType
Definition: topk_softmax_kernel.hpp:35
Definition: integral_constant.hpp:13
Definition: sequence.hpp:52