17 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
18 #define CK_TILE_VMCNT(cnt) \
19 ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
20 ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
21 #define CK_TILE_EXPCNT(cnt) \
22 ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
23 #define CK_TILE_LGKMCNT(cnt) \
24 ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
28 template <
typename,
bool>
29 struct safe_underlying_type;
32 struct safe_underlying_type<T, true>
34 using type = std::underlying_type_t<T>;
38 struct safe_underlying_type<T, false>
64 namespace core::arch {
70 enum struct amdgcn_target_id
82 GFX103_GENERIC = 0x103F,
91 GFX11_GENERIC = 0x11FF,
94 GFX12_GENERIC = 0x12FF,
98 enum struct amdgcn_target_family_id
107 enum struct amdgcn_target_arch_id
114 enum struct amdgcn_target_wave_size_id
123 template <amdgcn_target_id TargetId = amdgcn_target_id::HOST,
124 amdgcn_target_family_id FamilyId = amdgcn_target_family_id::HOST,
125 amdgcn_target_arch_id ArchId = amdgcn_target_arch_id::HOST,
126 amdgcn_target_wave_size_id WaveSizeId = amdgcn_target_wave_size_id::HOST>
129 static constexpr amdgcn_target_id TARGET_ID = TargetId;
130 static constexpr amdgcn_target_family_id FAMILY_ID = FamilyId;
131 static constexpr amdgcn_target_arch_id ARCH_ID = ArchId;
132 static constexpr amdgcn_target_wave_size_id WAVE_SIZE_ID = WaveSizeId;
135 template <amdgcn_target_
id targetId>
136 static constexpr
auto make_amdgcn_gfx9_target()
138 return amdgcn_target<targetId,
139 amdgcn_target_family_id::GFX9,
140 amdgcn_target_arch_id::CDNA,
141 amdgcn_target_wave_size_id::WAVE64>{};
144 template <amdgcn_target_
id targetId>
145 static constexpr
auto make_amdgcn_gfx10_3_target()
147 return amdgcn_target<targetId,
148 amdgcn_target_family_id::GFX10_3,
149 amdgcn_target_arch_id::RDNA,
150 amdgcn_target_wave_size_id::WAVE32>{};
153 template <amdgcn_target_
id targetId>
154 static constexpr
auto make_amdgcn_gfx11_target()
156 return amdgcn_target<targetId,
157 amdgcn_target_family_id::GFX11,
158 amdgcn_target_arch_id::RDNA,
159 amdgcn_target_wave_size_id::WAVE32>{};
162 template <amdgcn_target_
id targetId>
163 static constexpr
auto make_amdgcn_gfx12_target()
165 return amdgcn_target<targetId,
166 amdgcn_target_family_id::GFX12,
167 amdgcn_target_arch_id::RDNA,
168 amdgcn_target_wave_size_id::WAVE32>{};
171 template <
typename CompilerTarget, amdgcn_target_id... TargetIds>
172 static constexpr
auto is_target_id_any_of()
174 return is_any_value_of(CompilerTarget::TARGET_ID, TargetIds...);
177 template <
typename CompilerTarget, amdgcn_target_family_id... FamilyIds>
178 static constexpr
auto is_target_family_any_of()
180 return is_any_value_of(CompilerTarget::FAMILY_ID, FamilyIds...);
183 template <
typename CompilerTarget>
184 static constexpr
bool is_target_family_gfx9()
186 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX9;
189 template <
typename CompilerTarget>
190 static constexpr
bool is_target_family_gfx10_3()
192 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX10_3;
195 template <
typename CompilerTarget>
196 static constexpr
bool is_target_family_gfx11()
198 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX11;
201 template <
typename CompilerTarget>
202 static constexpr
bool is_target_family_gfx12()
204 return CompilerTarget::FAMILY_ID == amdgcn_target_family_id::GFX12;
207 template <
typename CompilerTarget>
208 static constexpr
bool is_target_arch_cdna()
210 return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::CDNA;
213 template <
typename CompilerTarget>
214 static constexpr
bool is_target_arch_rdna()
216 return CompilerTarget::ARCH_ID == amdgcn_target_arch_id::RDNA;
219 template <
typename CompilerTarget>
220 static constexpr
bool is_target_wave_size_32()
222 return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
225 template <
typename CompilerTarget>
226 static constexpr
bool is_target_wave_size_64()
228 return CompilerTarget::WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
233 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
234 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
236 return make_amdgcn_gfx9_target<amdgcn_target_id::TARGET_ID>(); \
240 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
241 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
243 return make_amdgcn_gfx10_3_target<amdgcn_target_id::TARGET_ID>(); \
247 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
248 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
250 return make_amdgcn_gfx11_target<amdgcn_target_id::TARGET_ID>(); \
254 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
255 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
257 return make_amdgcn_gfx12_target<amdgcn_target_id::TARGET_ID>(); \
266 constexpr
auto get_compiler_target()
268 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
269 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
270 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
271 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
272 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
273 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
274 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
275 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
276 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
277 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
278 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
279 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
280 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
281 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
282 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
283 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
284 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
285 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
286 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1153, GFX1153);
287 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
288 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
289 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
290 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
293 if constexpr(amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE)
295 return amdgcn_target<>{};
300 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
301 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
302 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
303 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
306 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
307 get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
308 "Device compile must have a valid target device architecture");
311 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
312 get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
313 "Host compile must target HOST architecture");
317 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(NAME_STRING, TARGET_ID) \
318 if(str.find(NAME_STRING) != std::string::npos) \
320 return amdgcn_target_id::TARGET_ID; \
332 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target_id(
char const* testStr)
334 auto str = std::string(testStr);
335 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx908", GFX908);
336 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx90a", GFX90A);
337 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx942", GFX942);
338 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx950", GFX950);
339 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1030", GFX1030);
340 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1031", GFX1031);
341 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1032", GFX1032);
342 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1034", GFX1034);
343 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1035", GFX1035);
344 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1036", GFX1036);
345 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx10_3_generic", GFX103_GENERIC);
346 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1100", GFX1100);
347 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1101", GFX1101);
348 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1102", GFX1102);
349 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1103", GFX1103);
350 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1150", GFX1150);
351 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1151", GFX1151);
352 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1152", GFX1152);
353 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1153", GFX1153);
354 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx11_generic", GFX11_GENERIC);
355 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1200", GFX1200);
356 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx1201", GFX1201);
357 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID(
"gfx12_generic", GFX12_GENERIC);
360 return amdgcn_target_id::HOST;
363 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_TARGET_ID
371 template <
typename CompilerTarget, amdgcn_target_id... SupportedTargetIds>
372 using enable_if_target_id_t =
373 std::enable_if_t<is_any_value_of(CompilerTarget::TARGET_ID, SupportedTargetIds...)>;
382 template <
typename CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
383 using enable_if_target_family_id_t =
384 std::enable_if_t<is_any_value_of(CompilerTarget::FAMILY_ID, SupportedTargetFamilyIds...)>;
391 template <
typename CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
392 using enable_if_target_arch_id_t =
393 std::enable_if_t<is_any_value_of(CompilerTarget::ARCH_ID, SupportedTargetArchIds...)>;
402 template <
typename CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
403 using enable_if_target_wave_size_id_t =
404 std::enable_if_t<is_any_value_of(CompilerTarget::WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
412 template <
typename CompilerTarget>
413 using enable_if_target_family_gfx9_t =
414 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
420 template <
typename CompilerTarget>
421 using enable_if_target_family_gfx10_3_t =
422 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
428 template <
typename CompilerTarget>
429 using enable_if_target_family_gfx11_t =
430 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
436 template <
typename CompilerTarget>
437 using enable_if_target_family_gfx12_t =
438 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
444 template <
typename CompilerTarget>
445 using enable_if_target_arch_cdna_t =
446 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
452 template <
typename CompilerTarget>
453 using enable_if_target_arch_rdna_t =
454 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
460 template <
typename CompilerTarget>
461 using enable_if_target_wave32_t =
462 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
468 template <
typename CompilerTarget>
469 using enable_if_target_wave64_t =
470 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
472 #elif __cplusplus >= 202002L
482 const amdgcn_target_id TARGET_ID = amdgcn_target_id::HOST;
483 const amdgcn_target_family_id FAMILY_ID = amdgcn_target_family_id::HOST;
484 const amdgcn_target_arch_id ARCH_ID = amdgcn_target_arch_id::HOST;
485 const amdgcn_target_wave_size_id WAVE_SIZE_ID = amdgcn_target_wave_size_id::HOST;
488 static constexpr
auto make_amdgcn_gfx10_3_target(amdgcn_target_id targetId)
490 return amdgcn_target{.TARGET_ID = targetId,
491 .FAMILY_ID = amdgcn_target_family_id::GFX10_3,
492 .ARCH_ID = amdgcn_target_arch_id::RDNA,
493 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
496 static constexpr
auto make_amdgcn_gfx9_target(amdgcn_target_id targetId)
498 return amdgcn_target{.TARGET_ID = targetId,
499 .FAMILY_ID = amdgcn_target_family_id::GFX9,
500 .ARCH_ID = amdgcn_target_arch_id::CDNA,
501 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE64};
504 static constexpr
auto make_amdgcn_gfx11_target(amdgcn_target_id targetId)
506 return amdgcn_target{.TARGET_ID = targetId,
507 .FAMILY_ID = amdgcn_target_family_id::GFX11,
508 .ARCH_ID = amdgcn_target_arch_id::RDNA,
509 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
512 static constexpr
auto make_amdgcn_gfx12_target(amdgcn_target_id targetId)
514 return amdgcn_target{.TARGET_ID = targetId,
515 .FAMILY_ID = amdgcn_target_family_id::GFX12,
516 .ARCH_ID = amdgcn_target_arch_id::RDNA,
517 .WAVE_SIZE_ID = amdgcn_target_wave_size_id::WAVE32};
520 static constexpr
bool is_target_family_gfx9(amdgcn_target target)
522 return target.FAMILY_ID == amdgcn_target_family_id::GFX9;
525 static constexpr
bool is_target_family_gfx10_3(amdgcn_target target)
527 return target.FAMILY_ID == amdgcn_target_family_id::GFX10_3;
530 static constexpr
bool is_target_family_gfx11(amdgcn_target target)
532 return target.FAMILY_ID == amdgcn_target_family_id::GFX11;
535 static constexpr
bool is_target_family_gfx12(amdgcn_target target)
537 return target.FAMILY_ID == amdgcn_target_family_id::GFX12;
540 static constexpr
bool is_target_arch_cdna(amdgcn_target target)
542 return target.ARCH_ID == amdgcn_target_arch_id::CDNA;
545 static constexpr
bool is_target_arch_rdna(amdgcn_target target)
547 return target.ARCH_ID == amdgcn_target_arch_id::RDNA;
550 static constexpr
bool is_target_wave_size_32(amdgcn_target target)
552 return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE32;
555 static constexpr
bool is_target_wave_size_64(amdgcn_target target)
557 return target.WAVE_SIZE_ID == amdgcn_target_wave_size_id::WAVE64;
561 #define MAP_COMPILER_STATE_TO_GFX10_3_TARGET(COMPILER_STATE, TARGET_ID) \
562 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
564 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
567 #define MAP_COMPILER_STATE_TO_GFX9_TARGET(COMPILER_STATE, TARGET_ID) \
568 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
570 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
573 #define MAP_COMPILER_STATE_TO_GFX11_TARGET(COMPILER_STATE, TARGET_ID) \
574 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
576 return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
579 #define MAP_COMPILER_STATE_TO_GFX12_TARGET(COMPILER_STATE, TARGET_ID) \
580 if constexpr(amdgcn_compiler_target_state::COMPILER_STATE) \
582 return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
591 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX908, GFX908);
592 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX90A, GFX90A);
593 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX942, GFX942);
594 MAP_COMPILER_STATE_TO_GFX9_TARGET(CK_TILE_ARCH_GFX950, GFX950);
595 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1030, GFX1030);
596 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1031, GFX1031);
597 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1032, GFX1032);
598 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1034, GFX1034);
599 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1035, GFX1035);
600 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX1036, GFX1036);
601 MAP_COMPILER_STATE_TO_GFX10_3_TARGET(CK_TILE_ARCH_GFX10_3_GENERIC, GFX103_GENERIC);
602 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1100, GFX1100);
603 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1101, GFX1101);
604 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1102, GFX1102);
605 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1103, GFX1103);
606 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1150, GFX1150);
607 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1151, GFX1151);
608 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1152, GFX1152);
609 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX1153, GFX1153);
610 MAP_COMPILER_STATE_TO_GFX11_TARGET(CK_TILE_ARCH_GFX11_GENERIC, GFX11_GENERIC);
611 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1200, GFX1200);
612 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX1201, GFX1201);
613 MAP_COMPILER_STATE_TO_GFX12_TARGET(CK_TILE_ARCH_GFX12_GENERIC, GFX12_GENERIC);
616 return amdgcn_target{};
620 #undef MAP_COMPILER_STATE_TO_GFX9_TARGET
621 #undef MAP_COMPILER_STATE_TO_GFX10_3_TARGET
622 #undef MAP_COMPILER_STATE_TO_GFX11_TARGET
623 #undef MAP_COMPILER_STATE_TO_GFX12_TARGET
626 static_assert(!amdgcn_compiler_target_state::CK_TILE_DEVICE_COMPILE ||
627 get_compiler_target().TARGET_ID != amdgcn_target_id::HOST,
628 "Device compile must have a valid target device architecture");
631 static_assert(!amdgcn_compiler_target_state::CK_TILE_HOST_COMPILE ||
632 get_compiler_target().TARGET_ID == amdgcn_target_id::HOST,
633 "Host compile must target HOST architecture");
635 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(NAME_STRING, TARGET_ID) \
636 if constexpr(str.find(NAME_STRING) != std::string::npos) \
638 return make_amdgcn_gfx9_target(amdgcn_target_id::TARGET_ID); \
642 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(NAME_STRING, TARGET_ID) \
643 if constexpr(str.find(NAME_STRING) != std::string::npos) \
645 return make_amdgcn_gfx10_3_target(amdgcn_target_id::TARGET_ID); \
649 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(NAME_STRING, TARGET_ID) \
650 if constexpr(str.find(NAME_STRING) != std::string::npos) \
652 return make_amdgcn_gfx11_target(amdgcn_target_id::TARGET_ID); \
656 #define MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(NAME_STRING, TARGET_ID) \
657 if constexpr(str.find(NAME_STRING) != std::string::npos) \
659 return make_amdgcn_gfx12_target(amdgcn_target_id::TARGET_ID); \
669 CK_TILE_HOST auto hip_device_prop_gcn_arch_name_to_amdgcn_target(
char const* testStr)
671 auto str = std::string(testStr);
672 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx908", GFX908);
673 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx90a", GFX90A);
674 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx942", GFX942);
675 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET(
"gfx950", GFX950);
676 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1030", GFX1030);
677 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1031", GFX1031);
678 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1032", GFX1032);
679 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1034", GFX1034);
680 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1035", GFX1035);
681 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx1036", GFX1036);
682 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET(
"gfx10_3_generic", GFX103_GENERIC);
683 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1100", GFX1100);
684 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1101", GFX1101);
685 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1102", GFX1102);
686 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1103", GFX1103);
687 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1150", GFX1150);
688 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1151", GFX1151);
689 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1152", GFX1152);
690 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx1153", GFX1153);
691 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET(
"gfx11_generic", GFX11_GENERIC);
692 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx1200", GFX1200);
693 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx1201", GFX1201);
694 MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET(
"gfx12_generic", GFX12_GENERIC);
697 return amdgcn_target{};
700 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX9_TARGET
701 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX10_3_TARGET
702 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX11_TARGET
703 #undef MAP_HIP_DEVICE_PROP_GCN_ARCH_NAME_STRING_TO_GFX12_TARGET
711 template <amdgcn_target CompilerTarget, amdgcn_target_id... SupportedTargetIds>
712 using enable_if_target_id_t =
713 std::enable_if_t<is_any_value_of(CompilerTarget.TARGET_ID, SupportedTargetIds...)>;
722 template <amdgcn_target CompilerTarget, amdgcn_target_family_id... SupportedTargetFamilyIds>
723 using enable_if_target_family_id_t =
724 std::enable_if_t<is_any_value_of(CompilerTarget.FAMILY_ID, SupportedTargetFamilyIds...)>;
731 template <amdgcn_target CompilerTarget, amdgcn_target_arch_id... SupportedTargetArchIds>
732 using enable_if_target_arch_id_t =
733 std::enable_if_t<is_any_value_of(CompilerTarget.ARCH_ID, SupportedTargetArchIds...)>;
742 template <amdgcn_target CompilerTarget, amdgcn_target_wave_size_id... SupportedTargetWaveSizeIds>
743 using enable_if_target_wave_size_id_t =
744 std::enable_if_t<is_any_value_of(CompilerTarget.WAVE_SIZE_ID, SupportedTargetWaveSizeIds...)>;
752 template <amdgcn_target CompilerTarget>
753 using enable_if_target_family_gfx9_t =
754 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX9>;
760 template <amdgcn_target CompilerTarget>
761 using enable_if_target_family_gfx10_3_t =
762 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX10_3>;
768 template <amdgcn_target CompilerTarget>
769 using enable_if_target_family_gfx11_t =
770 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX11>;
776 template <amdgcn_target CompilerTarget>
777 using enable_if_target_family_gfx12_t =
778 enable_if_target_family_id_t<CompilerTarget, amdgcn_target_family_id::GFX12>;
784 template <amdgcn_target CompilerTarget>
785 using enable_if_target_arch_cdna_t =
786 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::CDNA>;
792 template <amdgcn_target CompilerTarget>
793 using enable_if_target_arch_rdna_t =
794 enable_if_target_arch_id_t<CompilerTarget, amdgcn_target_arch_id::RDNA>;
800 template <amdgcn_target CompilerTarget>
801 using enable_if_target_wave32_t =
802 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE32>;
808 template <amdgcn_target CompilerTarget>
809 using enable_if_target_wave64_t =
810 enable_if_target_wave_size_id_t<CompilerTarget, amdgcn_target_wave_size_id::WAVE64>;
818 hipDeviceProp_t props{};
820 auto status = hipGetDevice(&device);
821 if(status != hipSuccess)
825 status = hipGetDeviceProperties(&props, device);
826 if(status != hipSuccess)
830 return props.major > 9;
837 return static_cast<index_t>(core::arch::get_compiler_target().WAVE_SIZE_ID);
854 template <
bool ReturnSgpr = true>
858 if constexpr(ReturnSgpr)
875 asm volatile(
"s_wait_loadcnt %0 \n"
876 "s_barrier_signal -1 \n"
882 asm volatile(
"s_waitcnt vmcnt(%0) \n"
890 struct WaitcntLayoutGfx12
901 struct WaitcntLayoutGfx11
912 struct WaitcntLayoutLegacy
922 return ((c & 0xF) << 0) | ((c & 0x30) << 10);
929 #if defined(__gfx12__)
930 using Waitcnt = WaitcntLayoutGfx12;
931 #elif defined(__gfx11__)
932 using Waitcnt = WaitcntLayoutGfx11;
934 using Waitcnt = WaitcntLayoutLegacy;
943 #if defined(__gfx12__) || defined(__gfx11__)
953 template <index_t cnt>
956 static_assert((cnt & ~Waitcnt::VM_MASK) == 0,
"vmcnt out of range");
957 return Waitcnt::pack_vm(cnt);
960 template <index_t cnt>
963 static_assert((cnt & ~Waitcnt::LGKM_MASK) == 0,
"lgkmcnt out of range");
964 return Waitcnt::pack_lgkm(cnt);
967 template <index_t cnt>
970 if constexpr(Waitcnt::HAS_EXP)
973 #if !defined(__gfx12__) && !defined(__gfx11__)
974 static_assert((cnt & ~Waitcnt::EXP_MASK) == 0,
"expcnt out of range");
975 return Waitcnt::pack_exp(cnt);
983 static_assert(cnt == 0,
"expcnt unsupported on this arch");
989 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
990 index_t expcnt = waitcnt_arg::kMaxExpCnt,
991 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
994 #if defined(__gfx12__)
996 constexpr
index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
997 waitcnt_arg::from_expcnt<expcnt>() |
998 waitcnt_arg::from_lgkmcnt<lgkmcnt>();
1000 asm volatile(
"s_wait_loadcnt_dscnt %0" : :
"n"(wait_mask) :
"memory");
1002 __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
1003 waitcnt_arg::from_expcnt<expcnt>() |
1004 waitcnt_arg::from_lgkmcnt<lgkmcnt>());
1008 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
1009 index_t expcnt = waitcnt_arg::kMaxExpCnt,
1010 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
1013 #if defined(__gfx12__)
1016 constexpr
index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
1017 waitcnt_arg::from_expcnt<expcnt>() |
1018 waitcnt_arg::from_lgkmcnt<lgkmcnt>();
1020 asm volatile(
"s_wait_loadcnt_dscnt %0\n"
1021 "s_barrier_signal -1\n"
1027 s_waitcnt<vmcnt, expcnt, lgkmcnt>();
1028 __builtin_amdgcn_s_barrier();
1032 template <index_t lgkmcnt = 0>
1035 s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
1038 template <index_t vmcnt = 0>
1041 s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
1047 asm volatile(
"s_nop %0" : :
"n"(cnt) :);
1049 __builtin_amdgcn_sched_barrier(cnt);
1053 #define CK_TILE_CONSTANT_ADDRESS_SPACE \
1054 __attribute__((address_space( \
1055 static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
1057 template <
typename T>
1062 #pragma clang diagnostic push
1063 #pragma clang diagnostic ignored "-Wold-style-cast"
1065 #pragma clang diagnostic pop
1068 template <
typename T>
1073 #pragma clang diagnostic push
1074 #pragma clang diagnostic ignored "-Wold-style-cast"
1075 return (T CK_TILE_CONSTANT_ADDRESS_SPACE*)p;
1076 #pragma clang diagnostic pop
1081 #if defined(__gfx950__)
1089 CK_TILE_HOST_DEVICE constexpr
const char* address_space_to_string(address_space_enum addr_space)
1093 case address_space_enum::generic:
return "generic";
1094 case address_space_enum::global:
return "global";
1095 case address_space_enum::lds:
return "lds";
1096 case address_space_enum::sgpr:
return "sgpr";
1097 case address_space_enum::constant:
return "constant";
1098 case address_space_enum::vgpr:
return "vgpr";
1099 default:
return "unknown";
1119 struct gfx_invalid_t
1127 #if defined(__gfx103__)
1129 #elif defined(__gfx11__)
1131 #elif defined(__gfx950__)
1133 #elif defined(__gfx9__)
1140 CK_TILE_DEVICE static constexpr
auto get_n_words_per_128b() {
return 4; }
1143 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx9_t) {
return 32; }
1145 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx103_t) {
return 32; }
1147 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx11_t) {
return 32; }
1149 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx12_t) {
return 32; }
1151 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx950_t) {
return 64; }
1153 CK_TILE_DEVICE static constexpr
auto get_n_lds_banks(gfx_invalid_t) {
return 0; }
1158 return detail::get_n_lds_banks(get_device_arch());
1161 enum LLVMSchedGroupMask :
int32_t
1170 VMEM_WRITE = 1 << 6,
1174 ALL = (DS_WRITE << 1) - 1,
#define CK_TILE_DEVICE
Definition: config.hpp:45
#define CK_TILE_HOST
Definition: config.hpp:44
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:46
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition: amd_buffer_addressing.hpp:36
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
int32_t index_t
Definition: integer.hpp:9
__device__ index_t get_grid_size()
Definition: get_id.hpp:49
__device__ void s_nop()
Definition: synchronization.hpp:61
__device__ index_t get_block_size()
Definition: get_id.hpp:51
__device__ void block_sync_lds_direct_load()
Definition: synchronization.hpp:43
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:47
__device__ index_t get_thread_global_1d_id()
Definition: get_id.hpp:43
__device__ X atomic_max(X *p_dst, const X &x)
constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: amd_address_space.hpp:35
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: amd_address_space.hpp:24
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:41
__device__ void block_sync_lds()
Definition: synchronization.hpp:16
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
unsigned short uint16_t
Definition: stdint.h:125
signed int int32_t
Definition: stdint.h:123