/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File
arch.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 // Address Space for AMDGCN
7 // https://llvm.org/docs/AMDGPUUsage.html#address-space
8 
13 
14 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
15 #define CK_TILE_VMCNT(cnt) \
16  ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
17  ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
18 #define CK_TILE_EXPCNT(cnt) \
19  ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
20 #define CK_TILE_LGKMCNT(cnt) \
21  ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
22 
23 namespace ck_tile {
24 
25 template <typename, bool>
26 struct safe_underlying_type;
27 
28 template <typename T>
29 struct safe_underlying_type<T, true>
30 {
31  using type = std::underlying_type_t<T>;
32 };
33 
34 template <typename T>
35 struct safe_underlying_type<T, false>
36 {
37  using type = void;
38 };
39 
40 template <typename T>
41 using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;
42 
43 enum struct address_space_enum : std::uint16_t
44 {
45  generic = 0,
46  global,
47  lds,
48  sgpr,
49  constant,
50  vgpr
51 };
52 
53 enum struct memory_operation_enum : std::uint16_t
54 {
55  set = 0,
56  atomic_add,
57  atomic_max,
58  add
59 };
60 
62 {
63 #if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
64  return 64;
65 #else
66  return 32;
67 #endif
68 }
69 
70 CK_TILE_HOST bool is_wave32()
71 {
72  hipDeviceProp_t props{};
73  int device;
74  auto status = hipGetDevice(&device);
75  if(status != hipSuccess)
76  {
77  return false;
78  }
79  status = hipGetDeviceProperties(&props, device);
80  if(status != hipSuccess)
81  {
82  return false;
83  }
84  return props.major > 9;
85 }
86 
87 CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }
88 
89 CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }
90 
91 // TODO: deprecate these
92 CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }
93 
94 CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
95 
96 CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
97 
98 // Use these instead
99 CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }
100 
101 template <bool ReturnSgpr = true>
102 CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
103 {
104  const index_t warp_id = threadIdx.x / get_warp_size();
105  if constexpr(ReturnSgpr)
106  {
107  return __builtin_amdgcn_readfirstlane(warp_id);
108  }
109  else
110  {
111  return warp_id;
112  }
113 }
114 
115 CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }
116 
117 CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
118 
119 CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
120 {
121 #ifdef __gfx12__
122  asm volatile("s_wait_loadcnt %0 \n"
123  "s_barrier_signal -1 \n"
124  "s_barrier_wait -1"
125  :
126  : "n"(cnt)
127  : "memory");
128 #else
129  asm volatile("s_waitcnt vmcnt(%0) \n"
130  "s_barrier"
131  :
132  : "n"(cnt)
133  : "memory");
134 #endif
135 }
136 
137 // https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html
138 struct waitcnt_arg
139 {
140  // bit numbers (hex) -------------------------> FE'DC'BA98'7'654'3210
141  // [V]M [E]XP [L]GKM counters and [U]NUSED ---> VV'UU'LLLL'U'EEE'VVVV
142  CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111;
143 
144  CK_TILE_DEVICE static constexpr index_t kMaxVmCnt = 0b111111;
145  CK_TILE_DEVICE static constexpr index_t kMaxExpCnt = 0b111;
146  CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b1111;
147 
148  template <index_t cnt>
149  CK_TILE_DEVICE static constexpr index_t from_vmcnt()
150  {
151  static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
152  return MAX & ((cnt & 0b1111) | ((cnt & 0b110000) << 10));
153  }
154 
155  template <index_t cnt>
156  CK_TILE_DEVICE static constexpr index_t from_expcnt()
157  {
158  static_assert(cnt >= 0 && !(cnt >> 3), "valid range is [0..7]");
159  return MAX & (cnt << 4);
160  }
161 
162  template <index_t cnt>
163  CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
164  {
165  static_assert(cnt >= 0 && !(cnt >> 4), "valid range is [0..15]");
166  return MAX & (cnt << 8);
167  }
168 };
169 
170 template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
171  index_t expcnt = waitcnt_arg::kMaxExpCnt,
172  index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
174 {
175  __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
176  waitcnt_arg::from_expcnt<expcnt>() |
177  waitcnt_arg::from_lgkmcnt<lgkmcnt>());
178 }
179 
180 template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
181  index_t expcnt = waitcnt_arg::kMaxExpCnt,
182  index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
184 {
185  s_waitcnt<vmcnt, expcnt, lgkmcnt>();
186  __builtin_amdgcn_s_barrier();
187 }
188 
189 template <index_t lgkmcnt = 0>
191 {
192  s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
193 }
194 
195 template <index_t vmcnt = 0>
197 {
198  s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
199 }
200 
202 {
203 #if 1
204  asm volatile("s_nop %0" : : "n"(cnt) :);
205 #else
206  __builtin_amdgcn_sched_barrier(cnt);
207 #endif
208 }
209 
210 #define CK_CONSTANT_ADDRESS_SPACE \
211  __attribute__((address_space( \
212  static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
213 
214 template <typename T>
216 {
217  // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
218  // only c-style pointer cast seems be able to be compiled
219 #pragma clang diagnostic push
220 #pragma clang diagnostic ignored "-Wold-style-cast"
221  return (T*)(p); // NOLINT(old-style-cast)
222 #pragma clang diagnostic pop
223 }
224 
225 template <typename T>
227 {
228  // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
229  // only c-style pointer cast seems be able to be compiled;
230 #pragma clang diagnostic push
231 #pragma clang diagnostic ignored "-Wold-style-cast"
232  return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
233 #pragma clang diagnostic pop
234 }
235 
237 {
238 #if defined(__gfx950__)
239  return 163840;
240 #else
241  return 65536;
242 #endif
243 }
244 
246 CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_enum addr_space)
247 {
248  switch(addr_space)
249  {
250  case address_space_enum::generic: return "generic";
251  case address_space_enum::global: return "global";
252  case address_space_enum::lds: return "lds";
253  case address_space_enum::sgpr: return "sgpr";
254  case address_space_enum::constant: return "constant";
255  case address_space_enum::vgpr: return "vgpr";
256  default: return "unknown";
257  }
258 }
259 
260 // Architecture tags
261 struct gfx11_t
262 {
263 };
264 struct gfx12_t
265 {
266 };
267 
268 CK_TILE_DEVICE static constexpr auto get_device_arch()
269 {
270 #if defined(__gfx11__)
271  return gfx11_t{};
272 #else // if defined(__gfx12__)
273  return gfx12_t{};
274 #endif
275 }
276 } // namespace ck_tile
#define CK_CONSTANT_ADDRESS_SPACE
Definition: arch.hpp:210
constexpr CK_TILE_HOST_DEVICE const char * address_space_to_string(address_space_enum addr_space)
Helper function to convert address space enum to string.
Definition: arch.hpp:246
constexpr CK_TILE_HOST_DEVICE index_t get_smem_capacity()
Definition: arch.hpp:236
CK_TILE_DEVICE void s_nop(index_t cnt=0)
Definition: arch.hpp:201
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: arch.hpp:215
CK_TILE_DEVICE void block_sync_lds_direct_load()
Definition: arch.hpp:196
CK_TILE_DEVICE void s_waitcnt_barrier()
Definition: arch.hpp:183
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: arch.hpp:226
CK_TILE_DEVICE void block_sync_lds()
Definition: arch.hpp:190
CK_TILE_DEVICE void s_waitcnt()
Definition: arch.hpp:173
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_HOST
Definition: config.hpp:40
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
int32_t index_t
Definition: integer.hpp:9
__device__ index_t get_grid_size()
Definition: get_id.hpp:60
__host__ constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:42
__device__ index_t get_block_size()
Definition: get_id.hpp:62
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:58
__device__ index_t get_thread_global_1d_id()
Definition: get_id.hpp:54
__device__ X atomic_max(X *p_dst, const X &x)
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:52
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
unsigned short uint16_t
Definition: stdint.h:125
Definition: arch.hpp:262
Definition: arch.hpp:265