21 asm volatile(
"s_mov_b32 m0, %0" : :
"s"(v) :
"memory");
27 asm volatile(
"s_add_u32 m0, %0, m0" : :
"n"(v) :
"memory");
34 return __shfl_up(v_local, lane_delta);
36 static_assert(
sizeof(T) ==
sizeof(int32_t),
"wrong!");
38 const uint32_t wrap_around_lane_delta = warpSize - lane_delta;
40 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
41 (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
43 return bit_cast<T>(v_remote_tmp);
51 return __shfl_down(v_local, lane_delta);
53 static_assert(
sizeof(T) ==
sizeof(int32_t),
"wrong!");
55 const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
56 (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
58 return bit_cast<T>(v_remote_tmp);
66 return __shfl(v_local, src_lane);
68 if constexpr(
sizeof(int32_t) >
sizeof(T))
78 p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
82 else if constexpr(
sizeof(int32_t) ==
sizeof(T))
84 const int32_t v_remote_tmp =
85 __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
87 return bit_cast<T>(v_remote_tmp);
91 static_assert(
sizeof(T) %
sizeof(int32_t) == 0,
"wrong!");
92 constexpr
index_t elm =
sizeof(T) /
sizeof(int32_t);
94 auto vs = bit_cast<vector_type>(v_local);
95 auto vs_remote = vector_type{};
97 int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
100 return bit_cast<T>(vs_remote);
105 template <
typename T>
108 static_assert(
sizeof(T) == 4);
111 asm volatile(
"v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
112 : [s_exec_flag]
"=s"(exec_flag)
113 : [v_flag]
"v"(v_flag));
117 template <
typename X,
typename Y>
120 static_assert(
sizeof(X) == 4 &&
sizeof(Y) == 4);
123 asm volatile(
"v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
124 : [s_exec_flag]
"=s"(exec_flag)
125 : [v_x]
"v"(x), [v_y]
"v"(y));
#define CK_TILE_DEVICE
Definition: config.hpp:40
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:118
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
uint32_t uint32x2_t
Definition: vector_type.hpp:122
tuple_array< T, N > thread_buffer
Definition: thread_buffer.hpp:14
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:63
int32_t index_t
Definition: integer.hpp:9
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:106
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
Definition: functional.hpp:43