53 #ifndef ROCRAND_PHILOX4X32_10_H_
54 #define ROCRAND_PHILOX4X32_10_H_
56 #include "rocrand/rocrand_common.h"
58 #include <hip/hip_runtime.h>
62 #define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
63 #define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U
64 #define ROCRAND_PHILOX_W32_0 0x9E3779B9U
65 #define ROCRAND_PHILOX_W32_1 0xBB67AE85U
75 #define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
78 namespace rocrand_device
81 class philox4x32_10_engine
84 struct philox4x32_10_state
89 unsigned int substate;
91 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
97 unsigned int boxmuller_float_state;
98 unsigned int boxmuller_double_state;
99 float boxmuller_float;
100 double boxmuller_double;
104 __forceinline__ __device__ __host__ philox4x32_10_engine()
114 __forceinline__ __device__ __host__ philox4x32_10_engine(
const unsigned long long seed,
115 const unsigned long long subsequence,
116 const unsigned long long offset)
118 this->seed(seed, subsequence, offset);
126 __forceinline__ __device__ __host__
void seed(
unsigned long long seed_value,
127 const unsigned long long subsequence,
128 const unsigned long long offset)
130 m_state.key.x =
static_cast<unsigned int>(seed_value);
131 m_state.key.y =
static_cast<unsigned int>(seed_value >> 32);
132 this->restart(subsequence, offset);
136 __forceinline__ __device__ __host__
void discard(
unsigned long long offset)
138 this->discard_impl(offset);
139 this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
146 __forceinline__ __device__ __host__
void discard_subsequence(
unsigned long long subsequence)
148 this->discard_subsequence_impl(subsequence);
149 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
152 __forceinline__ __device__ __host__
void restart(
const unsigned long long subsequence,
153 const unsigned long long offset)
155 m_state.counter = {0, 0, 0, 0};
156 m_state.result = {0, 0, 0, 0};
157 m_state.substate = 0;
158 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
159 m_state.boxmuller_float_state = 0;
160 m_state.boxmuller_double_state = 0;
162 this->discard_subsequence_impl(subsequence);
163 this->discard_impl(offset);
164 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
167 __forceinline__ __device__ __host__
unsigned int operator()()
172 __forceinline__ __device__ __host__
unsigned int next()
174 #if defined(__HIP_PLATFORM_AMD__)
175 unsigned int ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
177 unsigned int ret = (&m_state.result.x)[m_state.substate];
181 if(m_state.substate == 4)
183 m_state.substate = 0;
184 this->discard_state();
185 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
190 __forceinline__ __device__ __host__ uint4 next4()
192 uint4 ret = m_state.result;
193 this->discard_state();
194 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
195 return this->interleave(ret, m_state.result);
201 __forceinline__ __device__ __host__
void discard_impl(
unsigned long long offset)
204 m_state.substate += offset & 3;
205 unsigned long long counter_offset = offset / 4;
206 counter_offset += m_state.substate < 4 ? 0 : 1;
207 m_state.substate += m_state.substate < 4 ? 0 : -4;
209 this->discard_state(counter_offset);
213 __forceinline__ __device__ __host__
void
214 discard_subsequence_impl(
unsigned long long subsequence)
216 unsigned int lo =
static_cast<unsigned int>(subsequence);
217 unsigned int hi =
static_cast<unsigned int>(subsequence >> 32);
219 unsigned int temp = m_state.counter.z;
220 m_state.counter.z += lo;
221 m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
226 __forceinline__ __device__ __host__
void discard_state(
unsigned long long offset)
228 unsigned int lo =
static_cast<unsigned int>(offset);
229 unsigned int hi =
static_cast<unsigned int>(offset >> 32);
231 uint4 temp = m_state.counter;
232 m_state.counter.x += lo;
233 m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
234 m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
235 m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
240 __forceinline__ __device__ __host__
void discard_state()
242 m_state.counter = this->bump_counter(m_state.counter);
245 __forceinline__ __device__ __host__
static uint4 bump_counter(uint4 counter)
248 unsigned int add = counter.x == 0 ? 1 : 0;
249 counter.y += add; add = counter.y == 0 ? add : 0;
250 counter.z += add; add = counter.z == 0 ? add : 0;
255 __forceinline__ __device__ __host__ uint4 interleave(
const uint4 prev,
const uint4 next)
const
257 switch(m_state.substate)
262 return uint4{ prev.y, prev.z, prev.w, next.x };
264 return uint4{ prev.z, prev.w, next.x, next.y };
266 return uint4{ prev.w, next.x, next.y, next.z };
268 __builtin_unreachable();
272 __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
274 counter = this->single_round(counter, key); key = this->bumpkey(key);
275 counter = this->single_round(counter, key); key = this->bumpkey(key);
276 counter = this->single_round(counter, key); key = this->bumpkey(key);
277 counter = this->single_round(counter, key); key = this->bumpkey(key);
278 counter = this->single_round(counter, key); key = this->bumpkey(key);
279 counter = this->single_round(counter, key); key = this->bumpkey(key);
280 counter = this->single_round(counter, key); key = this->bumpkey(key);
281 counter = this->single_round(counter, key); key = this->bumpkey(key);
282 counter = this->single_round(counter, key); key = this->bumpkey(key);
283 return this->single_round(counter, key);
288 __forceinline__ __device__ __host__
static uint4 single_round(uint4 counter, uint2 key)
291 unsigned long long mul0 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_0, counter.x);
292 unsigned int hi0 =
static_cast<unsigned int>(mul0 >> 32);
293 unsigned int lo0 =
static_cast<unsigned int>(mul0);
294 unsigned long long mul1 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_1, counter.z);
295 unsigned int hi1 =
static_cast<unsigned int>(mul1 >> 32);
296 unsigned int lo1 =
static_cast<unsigned int>(mul1);
297 return uint4{hi1 ^ counter.y ^ key.x, lo1, hi0 ^ counter.w ^ key.y, lo0};
300 __forceinline__ __device__ __host__
static uint2 bumpkey(uint2 key)
302 key.x += ROCRAND_PHILOX_W32_0;
303 key.y += ROCRAND_PHILOX_W32_1;
309 philox4x32_10_state m_state;
311 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
312 friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
325 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
339 __forceinline__ __device__ __host__
341 const unsigned long long subsequence,
342 const unsigned long long offset,
343 rocrand_state_philox4x32_10* state)
345 *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
360 __forceinline__ __device__ __host__
361 unsigned int rocrand(rocrand_state_philox4x32_10* state)
363 return state->next();
378 __forceinline__ __device__ __host__
381 return state->next4();
392 __forceinline__ __device__ __host__
393 void skipahead(
unsigned long long offset, rocrand_state_philox4x32_10* state)
395 return state->discard(offset);
407 __forceinline__ __device__ __host__
410 return state->discard_subsequence(subsequence);
422 __forceinline__ __device__ __host__
425 return state->discard_subsequence(sequence);
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:393
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:340
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:75
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:379
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:423
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:361
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:408