/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_wave_read_first_lane.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_wave_read_first_lane.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_wave_read_first_lane.hpp Source File
amd_wave_read_first_lane.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #include "ck/ck.hpp"
8 #include "ck/utility/math.hpp"
9 
10 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
11 #include <array>
12 #include <cstddef>
13 #include <cstdint>
14 #include <type_traits>
15 #endif
16 
17 namespace ck {
18 namespace detail {
19 
20 template <unsigned SizeInBytes>
21 struct get_carrier;
22 
23 template <>
24 struct get_carrier<1>
25 {
26  using type = uint8_t;
27 };
28 
29 template <>
30 struct get_carrier<2>
31 {
32  using type = uint16_t;
33 };
34 
35 template <>
36 struct get_carrier<3>
37 {
38  using type = class carrier
39  {
40  using value_type = uint32_t;
41 
43  static_assert(sizeof(bytes) <= sizeof(value_type));
44 
45  // replacement of host std::copy_n()
46  template <typename InputIterator, typename Size, typename OutputIterator>
47  __device__ static OutputIterator copy_n(InputIterator from, Size size, OutputIterator to)
48  {
49  if(0 < size)
50  {
51  *to = *from;
52  ++to;
53  for(Size count = 1; count < size; ++count)
54  {
55  *to = *++from;
56  ++to;
57  }
58  }
59 
60  return to;
61  }
62 
63  // method to trigger template substitution failure
64  __device__ carrier(const carrier& other) noexcept
65  {
66  copy_n(other.bytes.begin(), bytes.Size(), bytes.begin());
67  }
68 
69  public:
70  __device__ carrier& operator=(value_type value) noexcept
71  {
72  copy_n(reinterpret_cast<const ck::byte*>(&value), bytes.Size(), bytes.begin());
73 
74  return *this;
75  }
76 
77  __device__ operator value_type() const noexcept
78  {
79  ck::byte result[sizeof(value_type)];
80 
81  copy_n(bytes.begin(), bytes.Size(), result);
82 
83  return *reinterpret_cast<const value_type*>(result);
84  }
85  };
86 };
87 static_assert(sizeof(get_carrier<3>::type) == 3);
88 
89 template <>
90 struct get_carrier<4>
91 {
92  using type = uint32_t;
93 };
94 
95 template <unsigned SizeInBytes>
97 
98 } // namespace detail
99 
101 {
102  return __builtin_amdgcn_readfirstlane(value);
103 }
104 
106 {
107  return __builtin_amdgcn_readfirstlane(value);
108 }
109 
111 {
112  constexpr unsigned object_size = sizeof(int64_t);
113  constexpr unsigned second_part_offset = object_size / 2;
114  auto* const from_obj = reinterpret_cast<const ck::byte*>(&value);
115  alignas(int64_t) ck::byte to_obj[object_size];
116 
117  using Sgpr = uint32_t;
118 
119  *reinterpret_cast<Sgpr*>(to_obj) =
120  amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj));
121  *reinterpret_cast<Sgpr*>(to_obj + second_part_offset) =
122  amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + second_part_offset));
123 
124  return *reinterpret_cast<int64_t*>(to_obj);
125 }
126 
127 template <typename Object,
128  typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
129 __device__ auto amd_wave_read_first_lane(const Object& obj)
130 {
131  using Size = unsigned;
132  constexpr Size SgprSize = 4;
133  constexpr Size ObjectSize = sizeof(Object);
134 
135  auto* const from_obj = reinterpret_cast<const ck::byte*>(&obj);
136  alignas(Object) ck::byte to_obj[ObjectSize];
137 
138  constexpr Size RemainedSize = ObjectSize % SgprSize;
139  constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
140  for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
141  {
143 
144  *reinterpret_cast<Sgpr*>(to_obj + offset) =
145  amd_wave_read_first_lane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
146  }
147 
148  if constexpr(0 < RemainedSize)
149  {
150  using Carrier = detail::get_carrier_t<RemainedSize>;
151 
152  *reinterpret_cast<Carrier*>(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane(
153  *reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
154  }
155 
158  return *reinterpret_cast<Object*>(to_obj);
159 }
160 
161 } // namespace ck
typename get_carrier< SizeInBytes >::type get_carrier_t
Definition: amd_wave_read_first_lane.hpp:96
Definition: ck.hpp:267
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition: amd_wave_read_first_lane.hpp:100
long int64_t
Definition: data_type.hpp:461
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
unsigned short uint16_t
Definition: stdint.h:125
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
unsigned char uint8_t
Definition: stdint.h:124
__host__ constexpr __device__ const TData * begin() const
Definition: array.hpp:39
__host__ static constexpr __device__ index_t Size()
Definition: array.hpp:20
uint8_t type
Definition: amd_wave_read_first_lane.hpp:26
uint16_t type
Definition: amd_wave_read_first_lane.hpp:32
static __device__ OutputIterator copy_n(InputIterator from, Size size, OutputIterator to)
Definition: amd_wave_read_first_lane.hpp:47
__device__ carrier & operator=(value_type value) noexcept
Definition: amd_wave_read_first_lane.hpp:70
Array< ck::byte, 3 > bytes
Definition: amd_wave_read_first_lane.hpp:42
class carrier { using value_type=uint32_t type
Definition: amd_wave_read_first_lane.hpp:40
__device__ carrier(const carrier &other) noexcept
Definition: amd_wave_read_first_lane.hpp:64
uint32_t type
Definition: amd_wave_read_first_lane.hpp:92
Definition: amd_wave_read_first_lane.hpp:21