Go to the source code of this file.
|
#define | CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111#define CK_TILE_VMCNT(cnt) \ ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \ ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))#define CK_TILE_EXPCNT(cnt) \ ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))#define CK_TILE_LGKMCNT(cnt) \ ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))namespace ck_tile {template <typename, bool>struct safe_underlying_type;template <typename T>struct safe_underlying_type<T, true>{ using type = std::underlying_type_t<T>;};template <typename T>struct safe_underlying_type<T, false>{ using type = void;};template <typename T>using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;enum struct address_space_enum : std::uint16_t{ generic = 0, global, lds, sgpr, constant, vgpr};enum struct memory_operation_enum : std::uint16_t{ set = 0, atomic_add, atomic_max, add};CK_TILE_HOST_DEVICE constexpr index_t get_warp_size(){#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__) return 64;#else return 32;#endif}CK_TILE_HOST bool is_wave32(){ hipDeviceProp_t props{}; int device; auto status = hipGetDevice(&device); if(status != hipSuccess) { return false; } status = hipGetDeviceProperties(&props, device); if(status != hipSuccess) { return false; } return props.major > 9;}CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }template <bool ReturnSgpr = true>CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {}){ const index_t warp_id = threadIdx.x / get_warp_size(); if constexpr(ReturnSgpr) { return __builtin_amdgcn_readfirstlane(warp_id); } else { return warp_id; }}CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0){#ifdef __gfx12__ asm volatile("s_wait_loadcnt %0 \n" "s_barrier_signal -1 \n" "s_barrier_wait -1" : : "n"(cnt) : "memory");#else asm volatile("s_waitcnt vmcnt(%0) \n" "s_barrier" : : "n"(cnt) : "memory");#endif}struct waitcnt_arg{ CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111; |
|
#define | CK_CONSTANT_ADDRESS_SPACE |
|
◆ CK_CONSTANT_ADDRESS_SPACE
#define CK_CONSTANT_ADDRESS_SPACE |
Value: __attribute__((address_space( \
static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
◆ CK_TILE_S_CNT_MAX
#define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111#define CK_TILE_VMCNT(cnt) \ ([]() { static_assert(!((cnt) >> 6), "VMCNT only has 6 bits"); }(), \ ((cnt) & 0b1111) | (((cnt) & 0b110000) << 10))#define CK_TILE_EXPCNT(cnt) \ ([]() { static_assert(!((cnt) >> 3), "EXP only has 3 bits"); }(), ((cnt) << 4))#define CK_TILE_LGKMCNT(cnt) \ ([]() { static_assert(!((cnt) >> 4), "LGKM only has 4 bits"); }(), ((cnt) << 8))namespace ck_tile {template <typename, bool>struct safe_underlying_type;template <typename T>struct safe_underlying_type<T, true>{ using type = std::underlying_type_t<T>;};template <typename T>struct safe_underlying_type<T, false>{ using type = void;};template <typename T>using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;enum struct address_space_enum : std::uint16_t{ generic = 0, global, lds, sgpr, constant, vgpr};enum struct memory_operation_enum : std::uint16_t{ set = 0, atomic_add, atomic_max, add};CK_TILE_HOST_DEVICE constexpr index_t get_warp_size(){#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__) return 64;#else return 32;#endif}CK_TILE_HOST bool is_wave32(){ hipDeviceProp_t props{}; int device; auto status = hipGetDevice(&device); if(status != hipSuccess) { return false; } status = hipGetDeviceProperties(&props, device); if(status != hipSuccess) { return false; } return props.major > 9;}CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }template <bool ReturnSgpr = true>CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {}){ const index_t warp_id = threadIdx.x / get_warp_size(); if constexpr(ReturnSgpr) { return __builtin_amdgcn_readfirstlane(warp_id); } else { return warp_id; }}CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0){#ifdef __gfx12__ asm volatile("s_wait_loadcnt %0 \n" "s_barrier_signal -1 \n" "s_barrier_wait -1" : : "n"(cnt) : "memory");#else asm volatile("s_waitcnt vmcnt(%0) \n" "s_barrier" : : "n"(cnt) : "memory");#endif}struct waitcnt_arg{ CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111; |
◆ address_space_to_string()
constexpr CK_TILE_HOST_DEVICE const char* address_space_to_string |
( |
address_space_enum |
addr_space | ) |
|
|
constexpr |
Helper function to convert address space enum to string.
◆ block_sync_lds()
template<index_t lgkmcnt = 0>
◆ block_sync_lds_direct_load()
template<index_t vmcnt = 0>
◆ cast_pointer_to_constant_address_space()
◆ cast_pointer_to_generic_address_space()
◆ get_smem_capacity()
◆ s_nop()
◆ s_waitcnt()
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
◆ s_waitcnt_barrier()
template<index_t vmcnt = waitcnt_arg::kMaxVmCnt, index_t expcnt = waitcnt_arg::kMaxExpCnt, index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>