/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_threefry4_impl.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_threefry4_impl.h Source File

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_threefry4_impl.h Source File
rocrand_threefry4_impl.h
1 // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 /*
22 Copyright 2010-2011, D. E. Shaw Research.
23 All rights reserved.
24 
25 Redistribution and use in source and binary forms, with or without
26 modification, are permitted provided that the following conditions are
27 met:
28 
29 * Redistributions of source code must retain the above copyright
30  notice, this list of conditions, and the following disclaimer.
31 
32 * Redistributions in binary form must reproduce the above copyright
33  notice, this list of conditions, and the following disclaimer in the
34  documentation and/or other materials provided with the distribution.
35 
36 * Neither the name of D. E. Shaw Research nor the names of its
37  contributors may be used to endorse or promote products derived from
38  this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 */
52 
53 #ifndef ROCRAND_THREEFRY4_IMPL_H_
54 #define ROCRAND_THREEFRY4_IMPL_H_
55 
56 #include "rocrand/rocrand_common.h"
57 #include "rocrand/rocrand_threefry_common.h"
58 
59 #include <hip/hip_runtime.h>
60 
// Fallback for THREEFRY4x32_DEFAULT_ROUNDS: only defined here when the
// including translation unit has not already provided its own value.
#if !defined(THREEFRY4x32_DEFAULT_ROUNDS)
    #define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

// Fallback for THREEFRY4x64_DEFAULT_ROUNDS, following the same rule as above.
#if !defined(THREEFRY4x64_DEFAULT_ROUNDS)
    #define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
68 
69 namespace rocrand_device
70 {
71 
72 template<class value>
73 __forceinline__ __device__ __host__ int threefry_rotation_array(int indexX, int indexY) = delete;
74 
75 template<>
76 __forceinline__ __device__ __host__ int threefry_rotation_array<unsigned int>(int indexX,
77  int indexY)
78 {
79  // Output from skein_rot_search: (srs-B128-X5000.out)
80  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
81  // Start: Mon Aug 24 22:41:36 2009
82  // ...
83  // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
84  static constexpr int THREEFRY_ROTATION_32_4[8][2] = {
85  {10, 26},
86  {11, 21},
87  {13, 27},
88  {23, 5},
89  { 6, 20},
90  {17, 11},
91  {25, 10},
92  {18, 20}
93  };
94  return THREEFRY_ROTATION_32_4[indexX][indexY];
95 }
96 
97 template<>
98 __forceinline__ __device__ __host__ int threefry_rotation_array<unsigned long long>(int indexX,
99  int indexY)
100 {
101  // These are the R_256 constants from the Threefish reference sources
102  // with names changed to R_64x4... */
103  static constexpr int THREEFRY_ROTATION_64_4[8][2] = {
104  {14, 16},
105  {52, 57},
106  {23, 40},
107  { 5, 37},
108  {25, 33},
109  {46, 12},
110  {58, 22},
111  {32, 32}
112  };
113  return THREEFRY_ROTATION_64_4[indexX][indexY];
114 }
115 
116 template<typename state_value, typename value, unsigned int Nrounds>
117 class threefry_engine4_base
118 {
119 public:
120  struct threefry_state_4
121  {
122  state_value counter;
123  state_value key;
124  state_value result;
125  unsigned int substate;
126  };
127  using state_type = threefry_state_4;
128  using state_vector_type = state_value;
129 
131  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
132  {
133  this->discard_impl(offset);
134  this->m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
135  }
136 
142  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
143  {
144  this->discard_subsequence_impl(subsequence);
145  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
146  }
147 
148  __forceinline__ __device__ __host__ value operator()()
149  {
150  return this->next();
151  }
152 
153  __forceinline__ __device__ __host__ value next()
154  {
155 #if defined(__HIP_PLATFORM_AMD__)
156  value ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
157 #else
158  value ret = (&m_state.result.x)[m_state.substate];
159 #endif
160  m_state.substate++;
161  if(m_state.substate == 4)
162  {
163  m_state.substate = 0;
164  m_state.counter = this->bump_counter(m_state.counter);
165  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
166  }
167  return ret;
168  }
169 
170  __forceinline__ __device__ __host__ state_value next4()
171  {
172  state_value ret = m_state.result;
173  m_state.counter = this->bump_counter(m_state.counter);
174  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
175 
176  return this->interleave(ret, m_state.result);
177  }
178 
179 protected:
180  __forceinline__ __device__ __host__ static state_value threefry_rounds(state_value counter,
181  state_value key)
182  {
183  state_value X;
184  value ks[4 + 1];
185 
186  static_assert(Nrounds <= 72, "72 or less only supported in threefry rounds");
187 
188  ks[4] = skein_ks_parity<value>();
189 
190  ks[0] = key.x;
191  ks[1] = key.y;
192  ks[2] = key.z;
193  ks[3] = key.w;
194 
195  X.x = counter.x;
196  X.y = counter.y;
197  X.z = counter.z;
198  X.w = counter.w;
199 
200  ks[4] ^= key.x;
201  ks[4] ^= key.y;
202  ks[4] ^= key.z;
203  ks[4] ^= key.w;
204 
205  /* Insert initial key before round 0 */
206  X.x += ks[0];
207  X.y += ks[1];
208  X.z += ks[2];
209  X.w += ks[3];
210 
211  for(unsigned int round_idx = 0; round_idx < Nrounds; round_idx++)
212  {
213  int rot_0 = threefry_rotation_array<value>(round_idx & 7u, 0);
214  int rot_1 = threefry_rotation_array<value>(round_idx & 7u, 1);
215  if((round_idx & 2u) == 0)
216  {
217  X.x += X.y;
218  X.y = rotl<value>(X.y, rot_0);
219  X.y ^= X.x;
220  X.z += X.w;
221  X.w = rotl<value>(X.w, rot_1);
222  X.w ^= X.z;
223  }
224  else
225  {
226  X.x += X.w;
227  X.w = rotl<value>(X.w, rot_0);
228  X.w ^= X.x;
229  X.z += X.y;
230  X.y = rotl<value>(X.y, rot_1);
231  X.y ^= X.z;
232  }
233 
234  if((round_idx & 3u) == 3)
235  {
236  unsigned int inject_idx = round_idx / 4;
237  // InjectKey(r = 1 + inject_idx)
238  X.x += ks[(1 + inject_idx) % 5];
239  X.y += ks[(2 + inject_idx) % 5];
240  X.z += ks[(3 + inject_idx) % 5];
241  X.w += ks[(4 + inject_idx) % 5];
242  X.w += 1 + inject_idx;
243  }
244  }
245 
246  return X;
247  }
248 
251  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
252  {
253  // Adjust offset for subset
254  m_state.substate += offset & 3;
255  unsigned long long counter_offset = offset / 4;
256  counter_offset += m_state.substate < 4 ? 0 : 1;
257  m_state.substate += m_state.substate < 4 ? 0 : -4;
258  // Discard states
259  this->discard_state(counter_offset);
260  }
261 
263  __forceinline__ __device__ __host__ void
264  discard_subsequence_impl(unsigned long long subsequence)
265  {
266  value lo, hi;
267  ::rocrand_device::detail::split_ull(lo, hi, subsequence);
268 
269  value old_counter = m_state.counter.z;
270  m_state.counter.z += lo;
271  m_state.counter.w += hi + (m_state.counter.z < old_counter ? 1 : 0);
272  }
273 
276  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
277  {
278  value lo, hi;
279  ::rocrand_device::detail::split_ull(lo, hi, offset);
280 
281  state_value old_counter = m_state.counter;
282  m_state.counter.x += lo;
283  m_state.counter.y += hi + (m_state.counter.x < old_counter.x ? 1 : 0);
284  m_state.counter.z += (m_state.counter.y < old_counter.y ? 1 : 0);
285  m_state.counter.w += (m_state.counter.z < old_counter.z ? 1 : 0);
286  }
287 
288  __forceinline__ __device__ __host__ static state_value bump_counter(state_value counter)
289  {
290  counter.x++;
291  value add = counter.x == 0 ? 1 : 0;
292  counter.y += add;
293  add = counter.y == 0 ? add : 0;
294  counter.z += add;
295  add = counter.z == 0 ? add : 0;
296  counter.w += add;
297  return counter;
298  }
299 
300  __forceinline__ __device__ __host__ state_value interleave(const state_value prev,
301  const state_value next) const
302  {
303  switch(m_state.substate)
304  {
305  case 0: return prev;
306  case 1: return state_value{prev.y, prev.z, prev.w, next.x};
307  case 2: return state_value{prev.z, prev.w, next.x, next.y};
308  case 3: return state_value{prev.w, next.x, next.y, next.z};
309  }
310  __builtin_unreachable();
311  }
312 
313 protected:
314  threefry_state_4 m_state;
315 }; // threefry_engine4_base class
316 
317 } // end namespace rocrand_device
318 
319 #endif // ROCRAND_THREEFRY4_IMPL_H_