/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/utility.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/utility.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/arch/utility.hpp Source File
utility.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 // Address Space for AMDGCN
7 // https://llvm.org/docs/AMDGPUUsage.html#address-space
8 
13 
14 #include <stdint.h>
15 
16 namespace ck_tile {
17 
18 // TODO: we have "memory" clobber here because this inline asm is used for async copy
20 {
21  asm volatile("s_mov_b32 m0, %0" : : "s"(v) : "memory");
22 }
23 
24 // NOTE: this is an immediate value
26 {
27  asm volatile("s_add_u32 m0, %0, m0" : : "n"(v) : "memory");
28 }
29 
30 template <typename T>
31 CK_TILE_DEVICE T warp_shuffle_up(const T& v_local, uint32_t lane_delta)
32 {
33 #if 0
34  return __shfl_up(v_local, lane_delta);
35 #elif 1
36  static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
37 
38  const uint32_t wrap_around_lane_delta = get_warp_size() - lane_delta;
39 
40  const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
41  (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
42 
43  return bit_cast<T>(v_remote_tmp);
44 #endif
45 }
46 
47 template <typename T>
48 CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
49 {
50 #if 0
51  return __shfl_down(v_local, lane_delta);
52 #elif 1
53  static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
54 
55  const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
56  (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
57 
58  return bit_cast<T>(v_remote_tmp);
59 #endif
60 }
61 
62 template <typename T>
63 CK_TILE_DEVICE auto warp_shuffle_down_pair(const T& v_local)
64 {
65  static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
66 
67  const int32x2_t x = __builtin_amdgcn_permlane32_swap(
68  bit_cast<int32_t>(v_local), bit_cast<int32_t>(v_local), false, false);
69 
71  v(0) = bit_cast<T>(x[0]);
72  v(1) = bit_cast<T>(x[1]);
73 
74  return v;
75 }
76 
77 template <typename T>
78 CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
79 {
80 #if 0
81  return __shfl(v_local, src_lane);
82 #elif 1
83  if constexpr(sizeof(int32_t) > sizeof(T))
84  {
85  union packet
86  {
87  int32_t x;
88  T v;
89  };
90  packet p;
91  p.v = v_local;
92  packet p_remote;
93  p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
94 
95  return p_remote.v;
96  }
97  else if constexpr(sizeof(int32_t) == sizeof(T))
98  {
99  const int32_t v_remote_tmp =
100  __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
101 
102  return bit_cast<T>(v_remote_tmp);
103  }
104  else
105  {
106  static_assert(sizeof(T) % sizeof(int32_t) == 0, "wrong!");
107  constexpr index_t elm = sizeof(T) / sizeof(int32_t);
108  using vector_type = thread_buffer<int32_t, elm>;
109  auto vs = bit_cast<vector_type>(v_local);
110  auto vs_remote = vector_type{};
111  static_for<0, elm, 1>{}([&](auto i_e) {
112  int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
113  vs_remote(i_e) = tmp;
114  });
115  return bit_cast<T>(vs_remote);
116  }
117 #endif
118 }
119 
120 template <typename T>
121 CK_TILE_DEVICE auto flag_to_exec(const T& v_flag)
122 {
123  static_assert(sizeof(T) == 4);
124  // per-thread v_flag store into 2x sgpr
125  uint32x2_t exec_flag;
126  asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
127  : [s_exec_flag] "=s"(exec_flag)
128  : [v_flag] "v"(v_flag));
129  return exec_flag;
130 }
131 
132 template <typename X, typename Y>
133 CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y)
134 {
135  static_assert(sizeof(X) == 4 && sizeof(Y) == 4);
136  // per-thread cmp store into 2x sgpr
137  uint32x2_t exec_flag;
138  asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
139  : [s_exec_flag] "=s"(exec_flag)
140  : [v_x] "v"(x), [v_y] "v"(y));
141  return exec_flag;
142 }
143 
144 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:41
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:133
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
uint32_t uint32x2_t
Definition: vector_type.hpp:152
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:78
int32_t index_t
Definition: integer.hpp:9
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
CK_TILE_DEVICE auto warp_shuffle_down_pair(const T &v_local)
Definition: utility.hpp:63
int32_t int32_t
Definition: integer.hpp:10
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
int32_t int32x2_t
Definition: vector_type.hpp:143
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:121
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
__host__ constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:42
unsigned int uint32_t
Definition: stdint.h:126
Definition: functional.hpp:43
Definition: debug.hpp:67