14 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
15 #define CK_TILE_VMCNT(cnt) \
16 ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \
17 ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))
18 #define CK_TILE_EXPCNT(cnt) \
19 ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))
20 #define CK_TILE_LGKMCNT(cnt) \
21 ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))
25 template <
typename,
bool>
26 struct safe_underlying_type;
29 struct safe_underlying_type<T, true>
31 using type = std::underlying_type_t<T>;
35 struct safe_underlying_type<T, false>
63 #if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
72 hipDeviceProp_t props{};
74 auto status = hipGetDevice(&device);
75 if(status != hipSuccess)
79 status = hipGetDeviceProperties(&props, device);
80 if(status != hipSuccess)
84 return props.major > 9;
101 template <
bool ReturnSgpr = true>
105 if constexpr(ReturnSgpr)
107 return __builtin_amdgcn_readfirstlane(warp_id);
122 asm volatile(
"s_wait_loadcnt %0 \n"
123 "s_barrier_signal -1 \n"
129 asm volatile(
"s_waitcnt vmcnt(%0) \n"
148 template <index_t cnt>
151 static_assert(cnt >= 0 && !(cnt >> 6),
"valid range is [0..63]");
152 return MAX & ((cnt & 0b1111) | ((cnt & 0b110000) << 10));
155 template <index_t cnt>
158 static_assert(cnt >= 0 && !(cnt >> 3),
"valid range is [0..7]");
159 return MAX & (cnt << 4);
162 template <index_t cnt>
165 static_assert(cnt >= 0 && !(cnt >> 4),
"valid range is [0..15]");
166 return MAX & (cnt << 8);
170 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
171 index_t expcnt = waitcnt_arg::kMaxExpCnt,
172 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
175 __builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
176 waitcnt_arg::from_expcnt<expcnt>() |
177 waitcnt_arg::from_lgkmcnt<lgkmcnt>());
180 template <
index_t vmcnt = waitcnt_arg::kMaxVmCnt,
181 index_t expcnt = waitcnt_arg::kMaxExpCnt,
182 index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
185 s_waitcnt<vmcnt, expcnt, lgkmcnt>();
186 __builtin_amdgcn_s_barrier();
189 template <index_t lgkmcnt = 0>
192 s_waitcnt_barrier<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, lgkmcnt>();
195 template <index_t vmcnt = 0>
198 s_waitcnt_barrier<vmcnt, waitcnt_arg::kMaxExpCnt, waitcnt_arg::kMaxLgkmCnt>();
204 asm volatile(
"s_nop %0" : :
"n"(cnt) :);
206 __builtin_amdgcn_sched_barrier(cnt);
210 #define CK_CONSTANT_ADDRESS_SPACE \
211 __attribute__((address_space( \
212 static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
214 template <
typename T>
219 #pragma clang diagnostic push
220 #pragma clang diagnostic ignored "-Wold-style-cast"
222 #pragma clang diagnostic pop
225 template <
typename T>
230 #pragma clang diagnostic push
231 #pragma clang diagnostic ignored "-Wold-style-cast"
233 #pragma clang diagnostic pop
238 #if defined(__gfx950__)
250 case address_space_enum::generic:
return "generic";
251 case address_space_enum::global:
return "global";
252 case address_space_enum::lds:
return "lds";
253 case address_space_enum::sgpr:
return "sgpr";
254 case address_space_enum::constant:
return "constant";
255 case address_space_enum::vgpr:
return "vgpr";
256 default:
return "unknown";
270 #if defined(__gfx11__)
#define CK_CONSTANT_ADDRESS_SPACE
Definition: arch.hpp:210
constexpr CK_TILE_HOST_DEVICE const char * address_space_to_string(address_space_enum addr_space)
Helper function to convert address space enum to string.
Definition: arch.hpp:246
constexpr CK_TILE_HOST_DEVICE index_t get_smem_capacity()
Definition: arch.hpp:236
CK_TILE_DEVICE void s_nop(index_t cnt=0)
Definition: arch.hpp:201
__device__ T * cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE *p)
Definition: arch.hpp:215
CK_TILE_DEVICE void block_sync_lds_direct_load()
Definition: arch.hpp:196
CK_TILE_DEVICE void s_waitcnt_barrier()
Definition: arch.hpp:183
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space(T *p)
Definition: arch.hpp:226
CK_TILE_DEVICE void block_sync_lds()
Definition: arch.hpp:190
CK_TILE_DEVICE void s_waitcnt()
Definition: arch.hpp:173
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_HOST
Definition: config.hpp:40
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE void atomic_add(X *p_dst, const X &x)
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
int32_t index_t
Definition: integer.hpp:9
__device__ index_t get_grid_size()
Definition: get_id.hpp:60
__host__ constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:42
__device__ index_t get_block_size()
Definition: get_id.hpp:62
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:58
__device__ index_t get_thread_global_1d_id()
Definition: get_id.hpp:54
__device__ X atomic_max(X *p_dst, const X &x)
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:52
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
unsigned short uint16_t
Definition: stdint.h:125