21     asm volatile(
"s_mov_b32 m0, %0" : : 
"s"(v) : 
"memory");
 
   27     asm volatile(
"s_add_u32 m0, %0, m0" : : 
"n"(v) : 
"memory");
 
   34     return  __shfl_up(v_local, lane_delta);
 
   36     static_assert(
sizeof(T) == 
sizeof(
int32_t), 
"wrong!");
 
   40     const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
 
   41         (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
 
   43     return bit_cast<T>(v_remote_tmp);
 
   51     return  __shfl_down(v_local, lane_delta);
 
   53     static_assert(
sizeof(T) == 
sizeof(
int32_t), 
"wrong!");
 
   55     const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
 
   56         (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
 
   58     return bit_cast<T>(v_remote_tmp);
 
   65     static_assert(
sizeof(T) == 
sizeof(
int32_t), 
"wrong!");
 
   67     const int32x2_t x = __builtin_amdgcn_permlane32_swap(
 
   68         bit_cast<int32_t>(v_local), bit_cast<int32_t>(v_local), 
false, 
false);
 
   71     v(0) = bit_cast<T>(x[0]);
 
   72     v(1) = bit_cast<T>(x[1]);
 
   81     return  __shfl(v_local, src_lane);
 
   83     if constexpr(
sizeof(
int32_t) > 
sizeof(T))
 
   93         p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
 
   97     else if constexpr(
sizeof(
int32_t) == 
sizeof(T))
 
  100             __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
 
  102         return bit_cast<T>(v_remote_tmp);
 
  106         static_assert(
sizeof(T) % 
sizeof(
int32_t) == 0, 
"wrong!");
 
  109         auto vs               = bit_cast<vector_type>(v_local);
 
  110         auto vs_remote        = vector_type{};
 
  112             int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
 
  113             vs_remote(i_e) = tmp;
 
  115         return bit_cast<T>(vs_remote);
 
  120 template <
typename T>
 
  123     static_assert(
sizeof(T) == 4);
 
  126     asm volatile(
"v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1" 
  127                  : [s_exec_flag] 
"=s"(exec_flag)
 
  128                  : [v_flag] 
"v"(v_flag));
 
  132 template <
typename X, 
typename Y>
 
  135     static_assert(
sizeof(X) == 4 && 
sizeof(Y) == 4);
 
  138     asm volatile(
"v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]" 
  139                  : [s_exec_flag] 
"=s"(exec_flag)
 
  140                  : [v_x] 
"v"(x), [v_y] 
"v"(y));
 
#define CK_TILE_DEVICE
Definition: config.hpp:41
 
Definition: cluster_descriptor.hpp:13
 
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:133
 
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
 
uint32_t uint32x2_t
Definition: vector_type.hpp:163
 
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:78
 
int32_t index_t
Definition: integer.hpp:9
 
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
 
CK_TILE_DEVICE auto warp_shuffle_down_pair(const T &v_local)
Definition: utility.hpp:63
 
int32_t int32_t
Definition: integer.hpp:10
 
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
 
int32_t int32x2_t
Definition: vector_type.hpp:154
 
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:121
 
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
 
constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
 
unsigned int uint32_t
Definition: stdint.h:126
 
Definition: functional.hpp:43