/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/arch.hpp Source File
arch.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
6 // Address Space for AMDGCN
7 // https://llvm.org/docs/AMDGPUUsage.html#address-space
8 
16 
17 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
18 #define CK_TILE_VMCNT(cnt) \
19  ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
20  ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
21 #define CK_TILE_EXPCNT(cnt) \
22  ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
23 #define CK_TILE_LGKMCNT(cnt) \
24  ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
25 
26 namespace ck_tile {
27 
28 template <typename, bool>
29 struct safe_underlying_type;
30 
31 template <typename T>
32 struct safe_underlying_type<T, true>
33 {
34  using type = std::underlying_type_t<T>;
35 };
36 
37 template <typename T>
38 struct safe_underlying_type<T, false>
39 {
40  using type = void;
41 };
42 
43 template <typename T>
44 using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;
45 
46 enum struct address_space_enum : std::uint16_t
47 {
48  generic = 0,
49  global,
50  lds,
51  sgpr,
52  constant,
53  vgpr
54 };
55 
56 enum struct memory_operation_enum : std::uint16_t
57 {
58  set = 0,
59  atomic_add,
60  atomic_max,
61  add
62 };
63 
64 namespace core::arch {
65 
70 enum struct amdgcn_target_id
71 {
72  GFX908 = 0x0908, // MI-100...
73  GFX90A = 0x090A,
74  GFX942 = 0x0942,
75  GFX950 = 0x0950,
76  GFX1030 = 0x1030,
77  GFX1031 = 0x1031,
78  GFX1032 = 0x1032,
79  GFX1034 = 0x1034,
80  GFX1035 = 0x1035,
81  GFX1036 = 0x1036,
82  GFX103_GENERIC = 0x103F,
83  GFX1100 = 0x1100,
84  GFX1101 = 0x1101,
85  GFX1102 = 0x1102,
86  GFX1103 = 0x1103,
87  GFX1150 = 0x1150,
88  GFX1151 = 0x1151,
89  GFX1152 = 0x1152,
90  GFX1153 = 0x1153,
91  GFX11_GENERIC = 0x11FF,
92  GFX1200 = 0x1200,
93  GFX1201 = 0x1201,
94  GFX12_GENERIC = 0x12FF,
95  HOST = 0x0000,
96 };
97 
98 enum struct amdgcn_target_family_id
99 {
100  GFX9 = 0x09,
101  GFX10_3 = 0x10,
102  GFX11 = 0x11,
103  GFX12 = 0x12,
104  HOST = 0x00,
105 };
106 
107 enum struct amdgcn_target_arch_id
108 {
109  CDNA = 0x01,
110  RDNA = 0x02,
111  HOST = 0x00,
112 };
113 
114 enum struct amdgcn_target_wave_size_id
115 {
116  WAVE32 = 32u,
117  WAVE64 = 64u,
118  HOST = 64u, // TODO: Is this correct? Should the host default to 64 or 1?
119 };
120 
121 #if 1 //__cplusplus <= 201703L
122 
123 template <amdgcn_target_id TargetId = amdgcn_target_id::HOST,
124  amdgcn_target_family_id FamilyId = amdgcn_target_family_id::HOST,
125  amdgcn_target_arch_id ArchId = amdgcn_target_arch_id::HOST,
126  amdgcn_target_wave_size_id WaveSizeId = amdgcn_target_wave_size_id::HOST>
127 struct amdgcn_target
128 {
129  static constexpr amdgcn_target_id TARGET_ID = TargetId;
130  static constexpr amdgcn_target_family_id FAMILY_ID = FamilyId;
131  static constexpr amdgcn_target_arch_id ARCH_ID = ArchId;
132  static constexpr amdgcn_target_wave_size_id WAVE_SIZE_ID = WaveSizeId;
133 };
134 
135 template <amdgcn_target_id targetId>
136 static constexpr auto make_amdgcn_gfx9_target()
137 {
138  return amdgcn_target<targetId,
139  amdgcn_target_family_id::GFX9,
140  amdgcn_target_arch_id::CDNA,
141  amdgcn_target_wave_size_id::WAVE64>{};
142 }
143 
144 template <amdgcn_target_id targetId>
145 static constexpr auto make_amdgcn_gfx10_3_target()
146 {
147  return amdgcn_target<targetId,
148  amdgcn_target_family_id::GFX10_3,
149  amdgcn_target_arch_id::RDNA,
150  amdgcn_target_wave_size_id::WAVE32>{};
151 }
152 
153 template <amdgcn_target_id targetId>
154 static constexpr auto make_amdgcn_gfx11_target()
155 {
156  return amdgcn_target<targetId,
157  amdgcn_target_family_id::GFX11,
158  amdgcn_target_arch_id::RDNA,
159  amdgcn_target_wave_size_id::WAVE32>{};
160 }
161 
162 template <amdgcn_target_id targetId>
163 static constexpr auto make_amdgcn_gfx12_target()
164 {
165  return amdgcn_target<targetId,
166  amdgcn_target_family_id::GFX12,
167  amdgcn_target_arch_id::RDNA,
168  amdgcn_target_wave_size_id::WAVE32>{};
169 }
170 
171 template <typename CompilerTarget, amdgcn_target_id... TargetIds>
172 static constexpr auto is_target_id_any_of()
173 {
174  return is_any_value_of(CompilerTarget::TARGET_ID, TargetIds...);
175 }
176 
177 template <typename CompilerTarget, amdgcn_target_family_id... FamilyIds>
178 static constexpr auto is_target_family_any_of()
179 {
180  return is_any_value_of(CompilerTarget::FAMILY_ID, FamilyIds...);
181 }
182 
183 template <typename CompilerTarget>
184 static constexpr bool is_target_family_gfx9()
185 {
186  return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX9;
187 }
188 
189 template <typename CompilerTarget>
190 static constexpr bool is_target_family_gfx10_3()
191 {
192  return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX10_3;
193 }
194 
195 template <typename CompilerTarget>
196 static constexpr bool is_target_family_gfx11()
197 {
198  return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX11;
199 }
200 
201 template <typename CompilerTarget>
202 static constexpr bool is_target_family_gfx12()
203 {
204  return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX12;
205 }
206 
207 template <typename CompilerTarget>
208 static constexpr bool is_target_arch_cdna()
209 {
210  return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::CDNA;
211 }
212 
213 template <typename CompilerTarget>
214 static constexpr bool is_target_arch_rdna()
215 {
216  return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::RDNA;
217 }
218 
219 template <typename CompilerTarget>
220 static constexpr bool is_target_wave_size_32()
221 {
222  return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
223 }
224 
225 template <typename CompilerTarget>
226 static constexpr bool is_target_wave_size_64()
227 {
228  return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
229 }
230 
231 // Helper to map compiler state to target arch id
232 
233 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
234  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
235  { \
236  return make_amdgcn_gfx9_target<amdgcn_target_id::TARGET_ID>(); \
237  } \
238  else
239 
240 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
241  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
242  { \
243  return make_amdgcn_gfx10_3_target<amdgcn_target_id::TARGET_ID>(); \
244  } \
245  else
246 
247 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
248  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
249  { \
250  return make_amdgcn_gfx11_target<amdgcn_target_id::TARGET_ID>(); \
251  } \
252  else
253 
254 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
255  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
256  { \
257  return make_amdgcn_gfx12_target<amdgcn_target_id::TARGET_ID>(); \
258  } \
259  else
260 
266 constexpr auto get_compiler_target()
267 {
268  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
269  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
270  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
271  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
272  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
273  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
274  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
275  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
276  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
277  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
278  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
279  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
280  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
281  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
282  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
283  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
284  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
285  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
286  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1153, GFX1153);
287  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
288  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
289  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
290  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
291 
292  // Return HOST by default
293  if constexpr(amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE)
294  {
295  return amdgcn_target<>{};
296  }
297 }
298 
299 // Cleanup
300 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
301 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
302 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
303 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
304 
305 // Sanity check: device compile must have a valid target architecture
306 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
307  get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
308  "Device compile must have a valid target device architecture");
309 
310 // Sanity check: host compile must have HOST target architecture
311 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
312  get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
313  "Host compile must target HOST architecture");
314 
315 // TODO: c++20 use the make functions and constexpr if to avoid string construction and find at
316 // runtime
317 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(NAME_STRING, TARGET_ID) \
318  if(str.find(NAME_STRING) != std::string::npos) \
319  { \
320  return amdgcn_target_id::TARGET_ID; \
321  } \
322  else
323 
330 // TODO: c++20 constexpr if and string_view to avoid std::string construction and find at runtime
331 // TODO: c++20 return amdgcn_target instance instead of just the target id
332 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target_id(char const* testStr)
333 {
334  auto str = std::string(testStr);
335  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx908", GFX908);
336  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx90a", GFX90A);
337  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx942", GFX942);
338  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx950", GFX950);
339  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1030", GFX1030);
340  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1031", GFX1031);
341  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1032", GFX1032);
342  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1034", GFX1034);
343  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1035", GFX1035);
344  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1036", GFX1036);
345  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx10_3_generic", GFX103_GENERIC);
346  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1100", GFX1100);
347  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1101", GFX1101);
348  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1102", GFX1102);
349  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1103", GFX1103);
350  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1150", GFX1150);
351  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1151", GFX1151);
352  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1152", GFX1152);
353  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1153", GFX1153);
354  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx11_generic", GFX11_GENERIC);
355  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1200", GFX1200);
356  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx1201", GFX1201);
357  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID("gfx12_generic", GFX12_GENERIC);
358 
359  // Default case: return HOST target if no match is found
360  return amdgcn_target_id::HOST;
361 }
362 
363 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID
364 
371 template <typename CompilerTarget, amdgcn_target_id... SupportedTargetIds>
372 using enable_if_target_id_t =
373  std::enable_if_t<is_any_value_of(CompilerTarget::TARGET_ID, SupportedTargetIds...)>;
374 
382 template <typename CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
383 using enable_if_target_family_id_t =
384  std::enable_if_t<is_any_value_of(CompilerTarget::FAMILY_ID, SupportedTargetFamilyIds...)>;
385 
391 template <typename CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
392 using enable_if_target_arch_id_t =
393  std::enable_if_t<is_any_value_of(CompilerTarget::ARCH_ID, SupportedTargetArchIds...)>;
394 
402 template <typename CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
403 using enable_if_target_wave_size_id_t =
404  std::enable_if_t<is_any_value_of(CompilerTarget::WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
405 
407 
412 template <typename CompilerTarget>
413 using enable_if_target_family_gfx9_t =
414  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
415 
420 template <typename CompilerTarget>
421 using enable_if_target_family_gfx10_3_t =
422  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
423 
428 template <typename CompilerTarget>
429 using enable_if_target_family_gfx11_t =
430  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
431 
436 template <typename CompilerTarget>
437 using enable_if_target_family_gfx12_t =
438  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
439 
444 template <typename CompilerTarget>
445 using enable_if_target_arch_cdna_t =
446  enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
447 
452 template <typename CompilerTarget>
453 using enable_if_target_arch_rdna_t =
454  enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
455 
460 template <typename CompilerTarget>
461 using enable_if_target_wave32_t =
462  enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
463 
468 template <typename CompilerTarget>
469 using enable_if_target_wave64_t =
470  enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
471 
472 #elif __cplusplus >= 202002L
473 
474 struct amdgcn_target
475 {
476  // Target architecture identifiers
477  // These are set to HOST (0) by default
478  // TARGET_ID is the specific architecture id (e.g., GFX908)
479  // FAMILY_ID is the architecture family id (e.g., GFX9)
480  // ARCH_ID is the architecture class id (e.g., CDNA, RDNA)
481  // WAVE_SIZE_ID is the wavefront size id (e.g., WAVE32, WAVE64)
482  const amdgcn_target_id TARGET_ID = amdgcn_target_id::HOST;
483  const amdgcn_target_family_id FAMILY_ID = amdgcn_target_family_id::HOST;
484  const amdgcn_target_arch_id ARCH_ID = amdgcn_target_arch_id::HOST;
485  const amdgcn_target_wave_size_id WAVE_SIZE_ID = amdgcn_target_wave_size_id::HOST;
486 };
487 
488 static constexpr auto make_amdgcn_gfx10_3_target(amdgcn_target_id targetId)
489 {
490  return amdgcn_target{.TARGET_ID = targetId,
491  .FAMILY_ID = amdgcn_target_family_id::GFX10_3,
492  .ARCH_ID = amdgcn_target_arch_id::RDNA,
493  .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
494 }
495 
496 static constexpr auto make_amdgcn_gfx9_target(amdgcn_target_id targetId)
497 {
498  return amdgcn_target{.TARGET_ID = targetId,
499  .FAMILY_ID = amdgcn_target_family_id::GFX9,
500  .ARCH_ID = amdgcn_target_arch_id::CDNA,
501  .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE64};
502 }
503 
504 static constexpr auto make_amdgcn_gfx11_target(amdgcn_target_id targetId)
505 {
506  return amdgcn_target{.TARGET_ID = targetId,
507  .FAMILY_ID = amdgcn_target_family_id::GFX11,
508  .ARCH_ID = amdgcn_target_arch_id::RDNA,
509  .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
510 }
511 
512 static constexpr auto make_amdgcn_gfx12_target(amdgcn_target_id targetId)
513 {
514  return amdgcn_target{.TARGET_ID = targetId,
515  .FAMILY_ID = amdgcn_target_family_id::GFX12,
516  .ARCH_ID = amdgcn_target_arch_id::RDNA,
517  .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
518 }
519 
520 static constexpr bool is_target_family_gfx9(amdgcn_target target)
521 {
522  return target.FAMILY_ID == amdgcn_target_family_id::GFX9;
523 }
524 
525 static constexpr bool is_target_family_gfx10_3(amdgcn_target target)
526 {
527  return target.FAMILY_ID == amdgcn_target_family_id::GFX10_3;
528 }
529 
530 static constexpr bool is_target_family_gfx11(amdgcn_target target)
531 {
532  return target.FAMILY_ID == amdgcn_target_family_id::GFX11;
533 }
534 
535 static constexpr bool is_target_family_gfx12(amdgcn_target target)
536 {
537  return target.FAMILY_ID == amdgcn_target_family_id::GFX12;
538 }
539 
540 static constexpr bool is_target_arch_cdna(amdgcn_target target)
541 {
542  return target.ARCH_ID == amdgcn_target_arch_id::CDNA;
543 }
544 
545 static constexpr bool is_target_arch_rdna(amdgcn_target target)
546 {
547  return target.ARCH_ID == amdgcn_target_arch_id::RDNA;
548 }
549 
550 static constexpr bool is_target_wave_size_32(amdgcn_target target)
551 {
552  return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
553 }
554 
555 static constexpr bool is_target_wave_size_64(amdgcn_target target)
556 {
557  return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
558 }
559 
560 // Helper to map compiler state to target arch id
561 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
562  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
563  { \
564  return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
565  }
566 
567 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
568  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
569  { \
570  return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
571  }
572 
573 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
574  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
575  { \
576  return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
577  }
578 
579 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
580  if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
581  { \
582  return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
583  }
584 
589 CK_TILE_HOST_DEVICE constexpr auto get_compiler_target()
590 {
591  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
592  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
593  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
594  MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
595  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
596  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
597  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
598  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
599  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
600  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
601  MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
602  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
603  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
604  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
605  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
606  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
607  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
608  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
609  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1153, GFX1153);
610  MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
611  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
612  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
613  MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
614 
615  // Default to HOST
616  return amdgcn_target{};
617 }
618 
619 // Cleanup
620 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
621 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
622 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
623 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
624 
625 // Sanity check: device compile must have a valid target architecture
626 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
627  get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
628  "Device compile must have a valid target device architecture");
629 
630 // Sanity check: host compile must have HOST target architecture
631 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
632  get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
633  "Host compile must target HOST architecture");
634 
635 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(NAME_STRING, TARGET_ID) \
636  if constexpr(str.find(NAME_STRING) != std::string::npos) \
637  { \
638  return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
639  } \
640  else
641 
642 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(NAME_STRING, TARGET_ID) \
643  if constexpr(str.find(NAME_STRING) != std::string::npos) \
644  { \
645  return make_amdgcn_gfx10_3_target(amdgcn_target_id::TARGET_ID); \
646  } \
647  else
648 
649 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(NAME_STRING, TARGET_ID) \
650  if constexpr(str.find(NAME_STRING) != std::string::npos) \
651  { \
652  return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
653  } \
654  else
655 
656 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(NAME_STRING, TARGET_ID) \
657  if constexpr(str.find(NAME_STRING) != std::string::npos) \
658  { \
659  return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
660  } \
661  else
662 
669 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target(char const* testStr)
670 {
671  auto str = std::string(testStr);
672  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET("gfx908", GFX908);
673  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET("gfx90a", GFX90A);
674  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET("gfx942", GFX942);
675  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET("gfx950", GFX950);
676  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1030", GFX1030);
677  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1031", GFX1031);
678  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1032", GFX1032);
679  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1034", GFX1034);
680  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1035", GFX1035);
681  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx1036", GFX1036);
682  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET("gfx10_3_generic", GFX103_GENERIC);
683  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1100", GFX1100);
684  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1101", GFX1101);
685  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1102", GFX1102);
686  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1103", GFX1103);
687  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1150", GFX1150);
688  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1151", GFX1151);
689  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1152", GFX1152);
690  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx1153", GFX1153);
691  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET("gfx11_generic", GFX11_GENERIC);
692  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET("gfx1200", GFX1200);
693  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET("gfx1201", GFX1201);
694  MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET("gfx12_generic", GFX12_GENERIC);
695 
696  // Default case
697  return amdgcn_target{};
698 }
699 
700 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET
701 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET
702 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET
703 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET
704 
711 template <amdgcn_target CompilerTarget, amdgcn_target_id... SupportedTargetIds>
712 using enable_if_target_id_t =
713  std::enable_if_t<is_any_value_of(CompilerTarget.TARGET_ID, SupportedTargetIds...)>;
714 
722 template <amdgcn_target CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
723 using enable_if_target_family_id_t =
724  std::enable_if_t<is_any_value_of(CompilerTarget.FAMILY_ID, SupportedTargetFamilyIds...)>;
725 
731 template <amdgcn_target CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
732 using enable_if_target_arch_id_t =
733  std::enable_if_t<is_any_value_of(CompilerTarget.ARCH_ID, SupportedTargetArchIds...)>;
734 
742 template <amdgcn_target CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
743 using enable_if_target_wave_size_id_t =
744  std::enable_if_t<is_any_value_of(CompilerTarget.WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
745 
747 
752 template <amdgcn_target CompilerTarget>
753 using enable_if_target_family_gfx9_t =
754  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
755 
760 template <amdgcn_target CompilerTarget>
761 using enable_if_target_family_gfx10_3_t =
762  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
763 
768 template <amdgcn_target CompilerTarget>
769 using enable_if_target_family_gfx11_t =
770  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
771 
776 template <amdgcn_target CompilerTarget>
777 using enable_if_target_family_gfx12_t =
778  enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
779 
784 template <amdgcn_target CompilerTarget>
785 using enable_if_target_arch_cdna_t =
786  enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
787 
792 template <amdgcn_target CompilerTarget>
793 using enable_if_target_arch_rdna_t =
794  enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
795 
800 template <amdgcn_target CompilerTarget>
801 using enable_if_target_wave32_t =
802  enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
803 
808 template <amdgcn_target CompilerTarget>
809 using enable_if_target_wave64_t =
810  enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
811 
812 #endif // __cplusplus <= 201703L
813 
814 } // namespace core::arch
815 
816 CK_TILE_HOST bool is_wave32()
817 {
818  hipDeviceProp_t props{};
819  int device;
820  auto status = hipGetDevice(&device);
821  if(status != hipSuccess)
822  {
823  return false;
824  }
825  status = hipGetDeviceProperties(&props, device);
826  if(status != hipSuccess)
827  {
828  return false;
829  }
830  return props.major > 9;
831 }
832 
836 {
837  return static_cast<index_t>(core::arch::get_compiler_target().WAVE_SIZE_ID);
838 }
839 
840 CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }
841 
842 CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }
843 
844 // TODO: deprecate these
845 CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }
846 
847 CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
848 
849 CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
850 
851 // Use these instead
852 CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }
853 
854 template <bool ReturnSgpr = true>
855 CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
856 {
857  const index_t warp_id = threadIdx.x / get_warp_size();
858  if constexpr(ReturnSgpr)
859  {
860  return amd_wave_read_first_lane(warp_id);
861  }
862  else
863  {
864  return warp_id;
865  }
866 }
867 
868 CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }
869 
870 CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
871 
872 CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
873 {
874 #ifdef __gfx12__
875  asm volatile("s_wait_loadcnt %0 \n"
876  "s_barrier_signal -1 \n"
877  "s_barrier_wait -1"
878  :
879  : "n"(cnt)
880  : "memory");
881 #else
882  asm volatile("s_waitcnt vmcnt(%0) \n"
883  "s_barrier"
884  :
885  : "n"(cnt)
886  : "memory");
887 #endif
888 }
889 
890 struct WaitcntLayoutGfx12
891 { // s_wait_loadcnt_dscnt: mem[13:8], ds[5:0]
892  CK_TILE_DEVICE static constexpr index_t VM_MASK = 0x3F; // mem
893  CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x3F; // ds
894  CK_TILE_DEVICE static constexpr bool HAS_EXP = false;
895 
896  CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c) { return ((c & VM_MASK) << 8); }
897  CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 0); }
898  CK_TILE_DEVICE static constexpr index_t pack_exp(index_t) { return 0; }
899 };
900 
901 struct WaitcntLayoutGfx11
902 { // vm[15:10] (6), lgkm[9:4] (6), exp unused
903  CK_TILE_DEVICE static constexpr index_t VM_MASK = 0x3F;
904  CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x3F;
905  CK_TILE_DEVICE static constexpr bool HAS_EXP = false;
906 
907  CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c) { return ((c & VM_MASK) << 10); }
908  CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 4); }
909  CK_TILE_DEVICE static constexpr index_t pack_exp(index_t) { return 0; }
910 };
911 
912 struct WaitcntLayoutLegacy
913 { // FE'DC'BA98'7'654'3210 => VV'UU'LLLL'U'EEE'VVVV
914  CK_TILE_DEVICE static constexpr index_t VM_MASK = 0x3F; // split: low4 + hi2
915  CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x0F; // [11:8]
916  CK_TILE_DEVICE static constexpr index_t EXP_MASK = 0x07; // [6:4]
917  CK_TILE_DEVICE static constexpr bool HAS_EXP = true;
918 
919  CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c)
920  {
921  c &= VM_MASK;
922  return ((c & 0xF) << 0) | ((c & 0x30) << 10);
923  }
924  CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 8); }
925  CK_TILE_DEVICE static constexpr index_t pack_exp(index_t c) { return ((c & EXP_MASK) << 4); }
926 };
927 
928 // Select active layout
929 #if defined(__gfx12__)
930 using Waitcnt = WaitcntLayoutGfx12;
931 #elif defined(__gfx11__)
932 using Waitcnt = WaitcntLayoutGfx11;
933 #else
934 using Waitcnt = WaitcntLayoutLegacy;
935 #endif
936 
937 //----------------------------------------------
938 // Public API: only from_* (constexpr templates)
939 //----------------------------------------------
940 struct waitcnt_arg
941 {
942  // kMax* exposed for callers; match field widths per-arch
943 #if defined(__gfx12__) || defined(__gfx11__)
944  CK_TILE_DEVICE static constexpr index_t kMaxVmCnt = 0x3F; // 6 bits
945  CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0x3F; // 6 bits
946  CK_TILE_DEVICE static constexpr index_t kMaxExpCnt = 0x0; // none
947 #else
948  CK_TILE_DEVICE static constexpr index_t kMaxVmCnt = 0x3F; // 6 bits (split)
949  CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0x0F; // 4 bits
950  CK_TILE_DEVICE static constexpr index_t kMaxExpCnt = 0x07; // 3 bits
951 #endif
952 
953  template <index_t cnt>
954  CK_TILE_DEVICE static constexpr index_t from_vmcnt()
955  {
956  static_assert((cnt & ~Waitcnt::VM_MASK) == 0, "vmcnt out of range");
957  return Waitcnt::pack_vm(cnt);
958  }
959 
960  template <index_t cnt>
961  CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
962  {
963  static_assert((cnt & ~Waitcnt::LGKM_MASK) == 0, "lgkmcnt out of range");
964  return Waitcnt::pack_lgkm(cnt);
965  }
966 
967  template <index_t cnt>
968  CK_TILE_DEVICE static constexpr index_t from_expcnt()
969  {
970  if constexpr(Waitcnt::HAS_EXP)
971  {
972  // EXP_MASK only exists on legacy
973 #if !defined(__gfx12__) && !defined(__gfx11__)
974  static_assert((cnt & ~Waitcnt::EXP_MASK) == 0, "expcnt out of range");
975  return Waitcnt::pack_exp(cnt);
976 #else
977  (void)cnt;
978  return 0;
979 #endif
980  }
981  else
982  {
983  static_assert(cnt == 0, "expcnt unsupported on this arch");
984  return 0;
985  }
986  }
987 };
988 
989 template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
990  index_t expcnt = waitcnt_arg::kMaxExpCnt,
991  index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
992 CK_TILE_DEVICE void s_waitcnt()
993 {
994 #if defined(__gfx12__)
995  // GFX12 do't use __builtin_amdgcn_s_waitcnt
996  constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
997  waitcnt_arg::from_expcnt<expcnt>() |
998  waitcnt_arg::from_lgkmcnt<lgkmcnt>();
999 
1000  asm volatile("s_wait_loadcnt_dscnt %0" : : "n"(wait_mask) : "memory");
1001 #else
1002  __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
1003  waitcnt_arg::from_expcnt<expcnt>() |
1004  waitcnt_arg::from_lgkmcnt<lgkmcnt>());
1005 #endif
1006 }
1007 
1008 template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
1009  index_t expcnt = waitcnt_arg::kMaxExpCnt,
1010  index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
1011 CK_TILE_DEVICE void s_waitcnt_barrier()
1012 {
1013 #if defined(__gfx12__)
1014  // GFX12 optimization: Manual barrier implementation avoids performance penalty
1015  // from __builtin_amdgcn_s_barrier which inserts extra s_wait_loadcnt_dscnt 0x0
1016  constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
1017  waitcnt_arg::from_expcnt<expcnt>() |
1018  waitcnt_arg::from_lgkmcnt<lgkmcnt>();
1019 
1020  asm volatile("s_wait_loadcnt_dscnt %0\n"
1021  "s_barrier_signal -1\n"
1022  "s_barrier_wait -1"
1023  :
1024  : "n"(wait_mask)
1025  : "memory");
1026 #else
1027  s_waitcnt<vmcnt, expcnt, lgkmcnt>();
1028  __builtin_amdgcn_s_barrier();
1029 #endif
1030 }
1031 
1032 template <index_t lgkmcnt = 0>
1034 {
1035  s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
1036 }
1037 
1038 template <index_t vmcnt = 0>
1040 {
1041  s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
1042 }
1043 
1044 CK_TILE_DEVICE void s_nop(index_t cnt = 0)
1045 {
1046 #if 1
1047  asm volatile("s_nop %0" : : "n"(cnt) :);
1048 #else
1049  __builtin_amdgcn_sched_barrier(cnt);
1050 #endif
1051 }
1052 
1053 #define CK_TILE_CONSTANT_ADDRESS_SPACE \
1054  __attribute__((address_space( \
1055  static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
1056 
1057 template <typename T>
1058 __device__ T* cast_pointer_to_generic_address_space(T CK_TILE_CONSTANT_ADDRESS_SPACE* p)
1059 {
1060  // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
1061  // only c-style pointer cast seems be able to be compiled
1062 #pragma clang diagnostic push
1063 #pragma clang diagnostic ignored "-Wold-style-cast"
1064  return (T*)(p); // NOLINT(old-style-cast)
1065 #pragma clang diagnostic pop
1066 }
1067 
1068 template <typename T>
1069 __host__ __device__ T CK_TILE_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
1070 {
1071  // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
1072  // only c-style pointer cast seems be able to be compiled;
1073 #pragma clang diagnostic push
1074 #pragma clang diagnostic ignored "-Wold-style-cast"
1075  return (T CK_TILE_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
1076 #pragma clang diagnostic pop
1077 }
1078 
1079 CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity()
1080 {
1081 #if defined(__gfx950__)
1082  return 163840;
1083 #else
1084  return 65536;
1085 #endif
1086 }
1087 
1089 CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_enum addr_space)
1090 {
1091  switch(addr_space)
1092  {
1093  case address_space_enum::generic: return "generic";
1094  case address_space_enum::global: return "global";
1095  case address_space_enum::lds: return "lds";
1096  case address_space_enum::sgpr: return "sgpr";
1097  case address_space_enum::constant: return "constant";
1098  case address_space_enum::vgpr: return "vgpr";
1099  default: return "unknown";
1100  }
1101 }
1102 
1103 // Architecture tags
1104 struct gfx9_t
1105 {
1106 };
1107 struct gfx950_t
1108 {
1109 };
1110 struct gfx103_t
1111 {
1112 };
1113 struct gfx11_t
1114 {
1115 };
1116 struct gfx12_t
1117 {
1118 };
1119 struct gfx_invalid_t
1120 {
1121 };
1122 
1123 CK_TILE_DEVICE static constexpr auto get_device_arch()
1124 {
1125 // FIXME(0): on all devices except gfx11 it returns gfx12_t
1126 // FIXME(1): during the host compilation pass it returns gfx12_t
1127 #if defined(__gfx103__)
1128  return gfx103_t{};
1129 #elif defined(__gfx11__)
1130  return gfx11_t{};
1131 #elif defined(__gfx950__)
1132  return gfx950_t{};
1133 #elif defined(__gfx9__)
1134  return gfx9_t{};
1135 #else
1136  return gfx12_t{};
1137 #endif
1138 }
1139 
1140 CK_TILE_DEVICE static constexpr auto get_n_words_per_128b() { return 4; }
1141 
1142 namespace detail {
1143 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx9_t) { return 32; }
1144 
1145 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx103_t) { return 32; }
1146 
1147 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx11_t) { return 32; }
1148 
1149 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx12_t) { return 32; }
1150 
1151 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx950_t) { return 64; }
1152 
1153 CK_TILE_DEVICE static constexpr auto get_n_lds_banks(gfx_invalid_t) { return 0; }
1154 
1155 } // namespace detail
1156 CK_TILE_DEVICE static constexpr auto get_n_lds_banks()
1157 {
1158  return detail::get_n_lds_banks(get_device_arch());
1159 }
1160 
1161 enum LLVMSchedGroupMask : int32_t
1162 {
1163  NONE = 0,
1164  ALU = 1 << 0,
1165  VALU = 1 << 1,
1166  SALU = 1 << 2,
1167  MFMA = 1 << 3,
1168  VMEM = 1 << 4,
1169  VMEM_READ = 1 << 5,
1170  VMEM_WRITE = 1 << 6,
1171  DS = 1 << 7,
1172  DS_READ = 1 << 8,
1173  DS_WRITE = 1 << 9,
1174  ALL = (DS_WRITE << 1) - 1,
1175 };
1176 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_HOST
Definition: config.hpp:44
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:46
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:36
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
int32_t index_t
Definition: integer.hpp:9
__device__ index_t get_grid_size()
Definition: get_id.hpp:49
__device__ void s_nop()
Definition: synchronization.hpp:61
__device__ index_t get_block_size()
Definition: get_id.hpp:51
__device__ void block_sync_lds_direct_load()
Definition: synchronization.hpp:43
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:47
__device__ index_t get_thread_global_1d_id()
Definition: get_id.hpp:43
__device__ X atomic_max(X *p_dst, const X &x)
constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: amd_address_space.hpp:35
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: amd_address_space.hpp:24
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:41
__device__ void block_sync_lds()
Definition: synchronization.hpp:16
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
unsigned short uint16_t
Definition: stdint.h:125
signed int int32_t
Definition: stdint.h:123