21 asm volatile(
"s_mov_b32 m0, %0" : :
"s"(v) :
"memory");
27 asm volatile(
"s_add_u32 m0, %0, m0" : :
"n"(v) :
"memory");
34 return __shfl_up(v_local, lane_delta);
36 static_assert(
sizeof(T) ==
sizeof(
int32_t),
"wrong!");
40 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
41 (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
43 return bit_cast<T>(v_remote_tmp);
51 return __shfl_down(v_local, lane_delta);
53 static_assert(
sizeof(T) ==
sizeof(
int32_t),
"wrong!");
55 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
56 (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
58 return bit_cast<T>(v_remote_tmp);
65 static_assert(
sizeof(T) ==
sizeof(
int32_t),
"wrong!");
67 const int32x2_t x = __builtin_amdgcn_permlane32_swap(
68 bit_cast<int32_t>(v_local), bit_cast<int32_t>(v_local),
false,
false);
71 v(0) = bit_cast<T>(x[0]);
72 v(1) = bit_cast<T>(x[1]);
81 return __shfl(v_local, src_lane);
83 if constexpr(
sizeof(
int32_t) >
sizeof(T))
93 p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
97 else if constexpr(
sizeof(
int32_t) ==
sizeof(T))
100 __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
102 return bit_cast<T>(v_remote_tmp);
106 static_assert(
sizeof(T) %
sizeof(
int32_t) == 0,
"wrong!");
109 auto vs = bit_cast<vector_type>(v_local);
110 auto vs_remote = vector_type{};
112 int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
113 vs_remote(i_e) = tmp;
115 return bit_cast<T>(vs_remote);
120 template <
typename T>
123 static_assert(
sizeof(T) == 4);
126 asm volatile(
"v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
127 : [s_exec_flag]
"=s"(exec_flag)
128 : [v_flag]
"v"(v_flag));
132 template <
typename X,
typename Y>
135 static_assert(
sizeof(X) == 4 &&
sizeof(Y) == 4);
138 asm volatile(
"v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
139 : [s_exec_flag]
"=s"(exec_flag)
140 : [v_x]
"v"(x), [v_y]
"v"(y));
#define CK_TILE_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:133
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
uint32_t uint32x2_t
Definition: vector_type.hpp:152
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:78
int32_t index_t
Definition: integer.hpp:9
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
CK_TILE_DEVICE auto warp_shuffle_down_pair(const T &v_local)
Definition: utility.hpp:63
int32_t int32_t
Definition: integer.hpp:10
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
int32_t int32x2_t
Definition: vector_type.hpp:143
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:121
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
__host__ constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:42
unsigned int uint32_t
Definition: stdint.h:126
Definition: functional.hpp:43