/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_mrg32k3a.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_mrg32k3a.h Source File#

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/projects/rocrand/library/include/rocrand/rocrand_mrg32k3a.h Source File
rocrand_mrg32k3a.h
1 // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCRAND_MRG32K3A_H_
22 #define ROCRAND_MRG32K3A_H_
23 
24 #include "rocrand/rocrand_common.h"
25 #include "rocrand/rocrand_mrg32k3a_precomputed.h"
26 
27 #include <hip/hip_runtime.h>
28 
29 #define ROCRAND_MRG32K3A_POW32 4294967296U
30 #define ROCRAND_MRG32K3A_M1 4294967087U
31 #define ROCRAND_MRG32K3A_M1C 209U
32 #define ROCRAND_MRG32K3A_M2 4294944443U
33 #define ROCRAND_MRG32K3A_M2C 22853U
34 #define ROCRAND_MRG32K3A_A12 1403580U
35 #define ROCRAND_MRG32K3A_A13 (4294967087U - 810728U)
36 #define ROCRAND_MRG32K3A_A13N 810728U
37 #define ROCRAND_MRG32K3A_A21 527612U
38 #define ROCRAND_MRG32K3A_A23 (4294944443U - 1370589U)
39 #define ROCRAND_MRG32K3A_A23N 1370589U
40 #define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10) // 1/ROCRAND_MRG32K3A_M1
41 #define ROCRAND_MRG32K3A_UINT_NORM \
42  (1.000000048661607) // (ROCRAND_MRG32K3A_POW32 - 1)/(ROCRAND_MRG32K3A_M1 - 1)
43 
52  #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL // end of group rocranddevice
54 
55 namespace rocrand_device {
56 
57 class mrg32k3a_engine
58 {
59 public:
60  struct mrg32k3a_state
61  {
62  unsigned int g1[3];
63  unsigned int g2[3];
64 
65  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
66  // The Box–Muller transform requires two inputs to convert uniformly
67  // distributed real values [0; 1] to normally distributed real values
68  // (with mean = 0, and stddev = 1). Often user wants only one
69  // normally distributed number, to save performance and random
70  // numbers the 2nd value is saved for future requests.
71  unsigned int boxmuller_float_state; // is there a float in boxmuller_float
72  unsigned int boxmuller_double_state; // is there a double in boxmuller_double
73  float boxmuller_float; // normally distributed float
74  double boxmuller_double; // normally distributed double
75  #endif
76  };
77 
78  __forceinline__ __device__ __host__ mrg32k3a_engine()
79  {
80  this->seed(ROCRAND_MRG32K3A_DEFAULT_SEED, 0, 0);
81  }
82 
91  __forceinline__ __device__ __host__ mrg32k3a_engine(const unsigned long long seed,
92  const unsigned long long subsequence,
93  const unsigned long long offset)
94  {
95  this->seed(seed, subsequence, offset);
96  }
97 
106  __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
107  const unsigned long long subsequence,
108  const unsigned long long offset)
109  {
110  if(seed_value == 0)
111  {
112  seed_value = ROCRAND_MRG32K3A_DEFAULT_SEED;
113  }
114  unsigned int x = (unsigned int) seed_value ^ 0x55555555U;
115  unsigned int y = (unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
116  m_state.g1[0] = mod_mul_m1(x, seed_value);
117  m_state.g1[1] = mod_mul_m1(y, seed_value);
118  m_state.g1[2] = mod_mul_m1(x, seed_value);
119  m_state.g2[0] = mod_mul_m2(y, seed_value);
120  m_state.g2[1] = mod_mul_m2(x, seed_value);
121  m_state.g2[2] = mod_mul_m2(y, seed_value);
122  this->restart(subsequence, offset);
123  }
124 
126  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
127  {
128  this->discard_impl(offset);
129  }
130 
133  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
134  {
135  this->discard_subsequence_impl(subsequence);
136  }
137 
140  __forceinline__ __device__ __host__ void discard_sequence(unsigned long long sequence)
141  {
142  this->discard_sequence_impl(sequence);
143  }
144 
145  __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
146  const unsigned long long offset)
147  {
148  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
149  m_state.boxmuller_float_state = 0;
150  m_state.boxmuller_double_state = 0;
151  #endif
152  this->discard_subsequence_impl(subsequence);
153  this->discard_impl(offset);
154  }
155 
156  __forceinline__ __device__ __host__ unsigned int operator()()
157  {
158  return this->next();
159  }
160 
161  // Returned value is in range [1, ROCRAND_MRG32K3A_M1],
162  // where ROCRAND_MRG32K3A_M1 < UINT_MAX
163  __forceinline__ __device__ __host__
164  unsigned int next()
165  {
166  const unsigned int p1 = mod_m1(detail::mad_u64_u32(
167  ROCRAND_MRG32K3A_A12,
168  m_state.g1[1],
169  detail::mul_u64_u32(ROCRAND_MRG32K3A_A13N, (ROCRAND_MRG32K3A_M1 - m_state.g1[0]))));
170 
171  m_state.g1[0] = m_state.g1[1];
172  m_state.g1[1] = m_state.g1[2];
173  m_state.g1[2] = p1;
174 
175  const unsigned int p2 = mod_m2(detail::mad_u64_u32(
176  ROCRAND_MRG32K3A_A21,
177  m_state.g2[2],
178  detail::mul_u64_u32(ROCRAND_MRG32K3A_A23N, (ROCRAND_MRG32K3A_M2 - m_state.g2[0]))));
179 
180  m_state.g2[0] = m_state.g2[1];
181  m_state.g2[1] = m_state.g2[2];
182  m_state.g2[2] = p2;
183 
184  return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
185  }
186 
187 protected:
188  // Advances the internal state to skip \p offset numbers.
189  // DOES NOT CALCULATE NEW ULONGLONG
190  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
191  {
192  discard_state(offset);
193  }
194 
195  // DOES NOT CALCULATE NEW ULONGLONG
196  __forceinline__ __device__ __host__ void
197  discard_subsequence_impl(unsigned long long subsequence)
198  {
199  int i = 0;
200 
201  while(subsequence > 0) {
202  if (subsequence & 1) {
203  #if defined(__HIP_DEVICE_COMPILE__)
204  mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
205  mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
206  #else
207  mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
208  mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
209  #endif
210  }
211  subsequence >>= 1;
212  i += 9;
213  }
214  }
215 
216  // DOES NOT CALCULATE NEW ULONGLONG
217  __forceinline__ __device__ __host__ void discard_sequence_impl(unsigned long long sequence)
218  {
219  int i = 0;
220 
221  while(sequence > 0) {
222  if (sequence & 1) {
223  #if defined(__HIP_DEVICE_COMPILE__)
224  mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
225  mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
226  #else
227  mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
228  mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
229  #endif
230  }
231  sequence >>= 1;
232  i += 9;
233  }
234  }
235 
236  // Advances the internal state by offset times.
237  // DOES NOT CALCULATE NEW ULONGLONG
238  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
239  {
240  int i = 0;
241 
242  while(offset > 0) {
243  if (offset & 1) {
244  #if defined(__HIP_DEVICE_COMPILE__)
245  mod_mat_vec_m1(d_A1 + i, m_state.g1);
246  mod_mat_vec_m2(d_A2 + i, m_state.g2);
247  #else
248  mod_mat_vec_m1(h_A1 + i, m_state.g1);
249  mod_mat_vec_m2(h_A2 + i, m_state.g2);
250  #endif
251  }
252  offset >>= 1;
253  i += 9;
254  }
255  }
256 
257  // Advances the internal state to the next state
258  // DOES NOT CALCULATE NEW ULONGLONG
259  __forceinline__ __device__ __host__ void discard_state()
260  {
261  discard_state(1);
262  }
263 
264 private:
265  __forceinline__ __device__ __host__
266  static void mod_mat_vec_m1(const unsigned int* A, unsigned int* s)
267  {
268  unsigned long long x[3] = {s[0], s[1], s[2]};
269 
270  s[0] = mod_m1(mod_m1(A[0] * x[0]) + mod_m1(A[1] * x[1]) + mod_m1(A[2] * x[2]));
271 
272  s[1] = mod_m1(mod_m1(A[3] * x[0]) + mod_m1(A[4] * x[1]) + mod_m1(A[5] * x[2]));
273 
274  s[2] = mod_m1(mod_m1(A[6] * x[0]) + mod_m1(A[7] * x[1]) + mod_m1(A[8] * x[2]));
275  }
276 
277  __forceinline__ __device__ __host__
278  static void mod_mat_vec_m2(const unsigned int* A, unsigned int* s)
279  {
280  unsigned long long x[3] = {s[0], s[1], s[2]};
281 
282  s[0] = mod_m2(mod_m2(A[0] * x[0]) + mod_m2(A[1] * x[1]) + mod_m2(A[2] * x[2]));
283 
284  s[1] = mod_m2(mod_m2(A[3] * x[0]) + mod_m2(A[4] * x[1]) + mod_m2(A[5] * x[2]));
285 
286  s[2] = mod_m2(mod_m2(A[6] * x[0]) + mod_m2(A[7] * x[1]) + mod_m2(A[8] * x[2]));
287  }
288 
289  __forceinline__ __device__ __host__ static unsigned long long mod_mul_m1(unsigned int i,
290  unsigned long long j)
291  {
292  long long hi, lo, temp1, temp2;
293 
294  hi = i / 131072;
295  lo = i - (hi * 131072);
296  temp1 = mod_m1(hi * j) * 131072;
297  temp2 = mod_m1(lo * j);
298  lo = mod_m1(temp1 + temp2);
299 
300  if (lo < 0)
301  lo += ROCRAND_MRG32K3A_M1;
302  return lo;
303  }
304 
305  __forceinline__ __device__ __host__
306  static unsigned long long mod_m1(unsigned long long p)
307  {
308  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C,
309  static_cast<unsigned int>(p >> 32),
310  static_cast<unsigned int>(p));
311  if(p >= ROCRAND_MRG32K3A_M1)
312  p -= ROCRAND_MRG32K3A_M1;
313 
314  return p;
315  }
316 
317  __forceinline__ __device__ __host__
318  static unsigned long long mod_mul_m2(unsigned int i, unsigned long long j)
319  {
320  long long hi, lo, temp1, temp2;
321 
322  hi = i / 131072;
323  lo = i - (hi * 131072);
324  temp1 = mod_m2(hi * j) * 131072;
325  temp2 = mod_m2(lo * j);
326  lo = mod_m2(temp1 + temp2);
327 
328  if (lo < 0)
329  lo += ROCRAND_MRG32K3A_M2;
330  return lo;
331  }
332 
333  __forceinline__ __device__ __host__
334  static unsigned long long mod_m2(unsigned long long p)
335  {
336  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
337  static_cast<unsigned int>(p >> 32),
338  static_cast<unsigned int>(p));
339  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
340  static_cast<unsigned int>(p >> 32),
341  static_cast<unsigned int>(p));
342  if(p >= ROCRAND_MRG32K3A_M2)
343  p -= ROCRAND_MRG32K3A_M2;
344 
345  return p;
346  }
347 
348 protected:
349  // State
350  mrg32k3a_state m_state;
351 
352  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
353  friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
354  #endif
355 
356 }; // mrg32k3a_engine class
357 
358 } // end namespace rocrand_device
359 
366 typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
368 
380 __forceinline__ __device__ __host__
381 void rocrand_init(const unsigned long long seed,
382  const unsigned long long subsequence,
383  const unsigned long long offset,
384  rocrand_state_mrg32k3a* state)
385 {
386  *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
387 }
388 
401 __forceinline__ __device__ __host__
402 unsigned int rocrand(rocrand_state_mrg32k3a* state)
403 {
404  // next() in [1, ROCRAND_MRG32K3A_M1]
405  return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
406 }
407 
416 __forceinline__ __device__ __host__
417 void skipahead(unsigned long long offset, rocrand_state_mrg32k3a* state)
418 {
419  return state->discard(offset);
420 }
421 
431 __forceinline__ __device__ __host__
432 void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a* state)
433 {
434  return state->discard_subsequence(subsequence);
435 }
436 
446 __forceinline__ __device__ __host__
447 void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a* state)
448 {
449  return state->discard_sequence(sequence);
450 }
451  // end of group rocranddevice
453 
454 #endif // ROCRAND_MRG32K3A_H_
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition: rocrand_mrg32k3a.h:52
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition: rocrand_mrg32k3a.h:381
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition: rocrand_mrg32k3a.h:432
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition: rocrand_mrg32k3a.h:417
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mrg32k3a.h:402
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition: rocrand_mrg32k3a.h:447