include/ck_tile/core/arch/utility.hpp Source File

include/ck_tile/core/arch/utility.hpp Source File#

Composable Kernel: include/ck_tile/core/arch/utility.hpp Source File
utility.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 // Address Space for AMDGCN
7 // https://llvm.org/docs/AMDGPUUsage.html#address-space
8 
13 
14 #include <stdint.h>
15 
16 namespace ck_tile {
17 
18 // TODO: we have "memory" clobber here because this inline asm is used for async copy
20 {
21  asm volatile("s_mov_b32 m0, %0" : : "s"(v) : "memory");
22 }
23 
24 // NOTE: this is an immediate value
26 {
27  asm volatile("s_add_u32 m0, %0, m0" : : "n"(v) : "memory");
28 }
29 
30 template <typename T>
31 CK_TILE_DEVICE T warp_shuffle_up(const T& v_local, uint32_t lane_delta)
32 {
33 #if 0
34  return __shfl_up(v_local, lane_delta);
35 #elif 1
36  static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
37 
38  const uint32_t wrap_around_lane_delta = warpSize - lane_delta;
39 
40  const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
41  (__lane_id() << 2) + (wrap_around_lane_delta << 2), bit_cast<int32_t>(v_local));
42 
43  return bit_cast<T>(v_remote_tmp);
44 #endif
45 }
46 
47 template <typename T>
48 CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
49 {
50 #if 0
51  return __shfl_down(v_local, lane_delta);
52 #elif 1
53  static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
54 
55  const int32_t v_remote_tmp = __builtin_amdgcn_ds_bpermute(
56  (__lane_id() << 2) + (lane_delta << 2), bit_cast<int32_t>(v_local));
57 
58  return bit_cast<T>(v_remote_tmp);
59 #endif
60 }
61 
62 template <typename T>
63 CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
64 {
65 #if 0
66  return __shfl(v_local, src_lane);
67 #elif 1
68  if constexpr(sizeof(int32_t) > sizeof(T))
69  {
70  union packet
71  {
72  int32_t x;
73  T v;
74  };
75  packet p;
76  p.v = v_local;
77  packet p_remote;
78  p_remote.x = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(p));
79 
80  return p_remote.v;
81  }
82  else if constexpr(sizeof(int32_t) == sizeof(T))
83  {
84  const int32_t v_remote_tmp =
85  __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
86 
87  return bit_cast<T>(v_remote_tmp);
88  }
89  else
90  {
91  static_assert(sizeof(T) % sizeof(int32_t) == 0, "wrong!");
92  constexpr index_t elm = sizeof(T) / sizeof(int32_t);
93  using vector_type = thread_buffer<int32_t, elm>;
94  auto vs = bit_cast<vector_type>(v_local);
95  auto vs_remote = vector_type{};
96  static_for<0, elm, 1>{}([&](auto i_e) {
97  int32_t tmp = __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(vs[i_e]));
98  vs_remote(i_e) = tmp;
99  });
100  return bit_cast<T>(vs_remote);
101  }
102 #endif
103 }
104 
105 template <typename T>
106 CK_TILE_DEVICE auto flag_to_exec(const T& v_flag)
107 {
108  static_assert(sizeof(T) == 4);
109  // per-thread v_flag store into 2x sgpr
110  uint32x2_t exec_flag;
111  asm volatile("v_cmp_ge_u32 %[s_exec_flag], %[v_flag], 1"
112  : [s_exec_flag] "=s"(exec_flag)
113  : [v_flag] "v"(v_flag));
114  return exec_flag;
115 }
116 
117 template <typename X, typename Y>
118 CK_TILE_DEVICE auto cmp_lt_to_exec(const X& x, const Y& y)
119 {
120  static_assert(sizeof(X) == 4 && sizeof(Y) == 4);
121  // per-thread cmp store into 2x sgpr
122  uint32x2_t exec_flag;
123  asm volatile("v_cmp_lt_u32 %[s_exec_flag], %[v_x], %[v_y]"
124  : [s_exec_flag] "=s"(exec_flag)
125  : [v_x] "v"(x), [v_y] "v"(y));
126  return exec_flag;
127 }
128 
129 } // namespace ck_tile
#define CK_TILE_DEVICE
Definition: config.hpp:40
Definition: cluster_descriptor.hpp:13
CK_TILE_DEVICE auto cmp_lt_to_exec(const X &x, const Y &y)
Definition: utility.hpp:118
CK_TILE_DEVICE T warp_shuffle_up(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:31
uint32_t uint32x2_t
Definition: vector_type.hpp:122
tuple_array< T, N > thread_buffer
Definition: thread_buffer.hpp:14
CK_TILE_DEVICE T warp_shuffle(const T &v_local, uint32_t src_lane)
Definition: utility.hpp:63
int32_t index_t
Definition: integer.hpp:9
CK_TILE_DEVICE T warp_shuffle_down(const T &v_local, uint32_t lane_delta)
Definition: utility.hpp:48
CK_TILE_DEVICE void m0_set_with_memory(index_t v)
Definition: utility.hpp:19
CK_TILE_DEVICE auto flag_to_exec(const T &v_flag)
Definition: utility.hpp:106
CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
Definition: utility.hpp:25
Definition: functional.hpp:43