include/ck/utility/magic_division.hpp Source File

include/ck/utility/magic_division.hpp Source File#

Composable Kernel: include/ck/utility/magic_division.hpp Source File
magic_division.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #include "ck/ck.hpp"
7 #include "integral_constant.hpp"
8 #include "number.hpp"
9 #include "type.hpp"
10 #include "tuple.hpp"
11 
12 #ifdef CK_CODE_GEN_RTC
13 #define INT32_MAX 2147483647
14 #endif
15 
16 namespace ck {
17 
18 // magic number division
19 // Caution:
20 // 1. For uint32_t as dividend: magic number division implementation being used would produce
21 // correct result if the dividend is uint32_t and its value is within 31-bit value range.
22 // 2. For int32_t as dividendd: magic number division for int32_t dividened has not been
23 // implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number
24 // division implementation for uint32_t is then used. Therefore, dividend value need to be
25 // non-negative.
26 // TODO:
27 // 1. Implement magic number divison for int32_t
28 // 2. Implement magic number divison for unit32_t with 32-bit value range
30 {
31  // uint32_t
32  __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
33  {
34  // WARNING: magic division is only applicable for division inside this range.
35  // You should use the return value of CalculateMagicNumbers, if division is not inside this
36  // range. The "else" logic below is to quiet down run-time error.
37  if(divisor >= 1 && divisor <= INT32_MAX)
38  {
39  uint32_t shift = 0;
40  for(shift = 0; shift < 32; ++shift)
41  {
42  if((1U << shift) >= divisor)
43  {
44  break;
45  }
46  }
47 
48  uint64_t one = 1;
49  uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
50  // assert(multiplier <= 0xffffffffUL);
51 
52  return make_tuple(uint32_t(multiplier), shift);
53  }
54  else
55  {
56  return make_tuple(uint32_t(0), uint32_t(0));
57  }
58  }
59 
60  __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
61  {
62  auto tmp = CalculateMagicNumbers(divisor);
63 
64  return tmp[Number<0>{}];
65  }
66 
67  __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor)
68  {
69  auto tmp = CalculateMagicNumbers(divisor);
70 
71  return tmp[Number<1>{}];
72  }
73 
74  // integral_constant<uint32_t, .>
75  template <uint32_t Divisor>
76  __host__ __device__ static constexpr auto
78  {
79  constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});
80 
81  constexpr uint32_t multiplier = tmp[Number<0>{}];
82  constexpr uint32_t shift = tmp[Number<1>{}];
83 
86  }
87 
88  template <uint32_t Divisor>
89  __host__ __device__ static constexpr auto
91  {
92  constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor});
93 
95  }
96 
97  template <uint32_t Divisor>
98  __host__ __device__ static constexpr auto
100  {
101  constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor});
102 
104  }
105 
106  // integral_constant<int32_t, .>
107  template <int32_t Divisor>
108  __host__ __device__ static constexpr auto
110  {
112  }
113 
114  template <int32_t Divisor>
115  __host__ __device__ static constexpr auto
117  {
119  }
120 
121  template <int32_t Divisor>
122  __host__ __device__ static constexpr auto
124  {
126  }
127 
128  // magic division for uint32_t
129  __device__ static constexpr uint32_t
130  DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
131  {
132  uint32_t tmp = __umulhi(dividend, multiplier);
133  return (tmp + dividend) >> shift;
134  }
135 
136  __host__ static constexpr uint32_t
137  DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
138  {
139  uint32_t tmp = static_cast<uint64_t>(dividend) * multiplier >> 32;
140  return (tmp + dividend) >> shift;
141  }
142 
143  // magic division for int32_t
144  // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
145  // non-negative for result to be correct
146  // TODO: figure out how to do magic number divison for int32_t as dividended
147  __device__ static constexpr int32_t
148  DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
149  {
150  uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
151  uint32_t tmp = __umulhi(dividend_u32, multiplier);
152  return (tmp + dividend_u32) >> shift;
153  }
154 
155  __host__ static constexpr int32_t
156  DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
157  {
158  uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
159  uint32_t tmp = static_cast<uint64_t>(dividend_u32) * multiplier >> 32;
160  return (tmp + dividend_u32) >> shift;
161  }
162 };
163 
164 struct MDiv
165 {
166  // 1 dword -> 3 dword storage
167  uint32_t divisor;
168  uint32_t multiplier;
169  uint32_t shift; // TODO: 8 bit is enough
170 
171  // prefer construct on host
172  __host__ __device__ MDiv(uint32_t divisor_) : divisor(divisor_)
173  {
174  auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
175 
176  multiplier = tmp[Number<0>{}];
177  shift = tmp[Number<1>{}];
178  }
179 
180  __host__ __device__ MDiv() : divisor(0), multiplier(0), shift(0) {}
181 
182  __host__ __device__ void update(uint32_t divisor_)
183  {
184  divisor = divisor_;
185  auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
186 
187  multiplier = tmp[Number<0>{}];
188  shift = tmp[Number<1>{}];
189  }
190 
191  __host__ __device__ uint32_t div(uint32_t dividend_) const
192  {
193  return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
194  }
195 
196  __host__ __device__ void
197  divmod(uint32_t dividend_, uint32_t& quotient_, uint32_t& remainder_) const
198  {
199  quotient_ = div(dividend_);
200  remainder_ = dividend_ - (quotient_ * divisor);
201  }
202 
203  __host__ __device__ uint32_t get() const { return divisor; }
204 };
205 
206 struct MDiv2
207 {
208  // 1 dword -> 2 dword storage, divisor need compute from runtime
209  uint32_t multiplier;
210  uint32_t shift; // TODO: 8 bit is enough
211 
212  // prefer construct on host
213  __host__ __device__ MDiv2(uint32_t divisor_)
214  {
215  auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
216 
217  multiplier = tmp[Number<0>{}];
218  shift = tmp[Number<1>{}];
219  }
220 
221  __host__ __device__ MDiv2() : multiplier(0), shift(0) {}
222 
223  __host__ __device__ uint32_t div(uint32_t dividend_) const
224  {
225  return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
226  }
227 
228  __host__ __device__ void
229  divmod(uint32_t dividend_, uint32_t divisor_, uint32_t& quotient_, uint32_t& remainder_) const
230  {
231  quotient_ = div(dividend_);
232  remainder_ = dividend_ - (quotient_ * divisor_);
233  }
234 };
235 
236 } // namespace ck
Definition: ck.hpp:264
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
Definition: magic_division.hpp:207
__host__ __device__ MDiv2(uint32_t divisor_)
Definition: magic_division.hpp:213
uint32_t shift
Definition: magic_division.hpp:210
uint32_t multiplier
Definition: magic_division.hpp:209
__host__ __device__ void divmod(uint32_t dividend_, uint32_t divisor_, uint32_t &quotient_, uint32_t &remainder_) const
Definition: magic_division.hpp:229
__host__ __device__ uint32_t div(uint32_t dividend_) const
Definition: magic_division.hpp:223
__host__ __device__ MDiv2()
Definition: magic_division.hpp:221
Definition: magic_division.hpp:165
__host__ __device__ MDiv()
Definition: magic_division.hpp:180
uint32_t divisor
Definition: magic_division.hpp:167
__host__ __device__ uint32_t get() const
Definition: magic_division.hpp:203
__host__ __device__ void divmod(uint32_t dividend_, uint32_t &quotient_, uint32_t &remainder_) const
Definition: magic_division.hpp:197
__host__ __device__ void update(uint32_t divisor_)
Definition: magic_division.hpp:182
__host__ __device__ MDiv(uint32_t divisor_)
Definition: magic_division.hpp:172
uint32_t multiplier
Definition: magic_division.hpp:168
uint32_t shift
Definition: magic_division.hpp:169
__host__ __device__ uint32_t div(uint32_t dividend_) const
Definition: magic_division.hpp:191
Definition: magic_division.hpp:30
static constexpr __host__ uint32_t DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
Definition: magic_division.hpp:137
__host__ static constexpr __device__ auto CalculateMagicShift(integral_constant< int32_t, Divisor >)
Definition: magic_division.hpp:123
__host__ static constexpr __device__ auto CalculateMagicMultiplier(integral_constant< uint32_t, Divisor >)
Definition: magic_division.hpp:90
__host__ static constexpr __device__ auto CalculateMagicNumbers(uint32_t divisor)
Definition: magic_division.hpp:32
static constexpr __device__ uint32_t DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
Definition: magic_division.hpp:130
__host__ static constexpr __device__ auto CalculateMagicShift(integral_constant< uint32_t, Divisor >)
Definition: magic_division.hpp:99
__host__ static constexpr __device__ uint32_t CalculateMagicShift(uint32_t divisor)
Definition: magic_division.hpp:67
static constexpr __device__ int32_t DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
Definition: magic_division.hpp:148
static constexpr __host__ int32_t DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
Definition: magic_division.hpp:156
__host__ static constexpr __device__ auto CalculateMagicNumbers(integral_constant< int32_t, Divisor >)
Definition: magic_division.hpp:109
__host__ static constexpr __device__ auto CalculateMagicNumbers(integral_constant< uint32_t, Divisor >)
Definition: magic_division.hpp:77
__host__ static constexpr __device__ uint32_t CalculateMagicMultiplier(uint32_t divisor)
Definition: magic_division.hpp:60
__host__ static constexpr __device__ auto CalculateMagicMultiplier(integral_constant< int32_t, Divisor >)
Definition: magic_division.hpp:116
Definition: integral_constant.hpp:10