21 #ifndef ROCRAND_NORMAL_H_
22 #define ROCRAND_NORMAL_H_
29 #include "rocrand/rocrand_lfsr113.h"
30 #include "rocrand/rocrand_mrg31k3p.h"
31 #include "rocrand/rocrand_mrg32k3a.h"
32 #include "rocrand/rocrand_mtgp32.h"
33 #include "rocrand/rocrand_philox4x32_10.h"
34 #include "rocrand/rocrand_scrambled_sobol32.h"
35 #include "rocrand/rocrand_scrambled_sobol64.h"
36 #include "rocrand/rocrand_sobol32.h"
37 #include "rocrand/rocrand_sobol64.h"
38 #include "rocrand/rocrand_threefry2x32_20.h"
39 #include "rocrand/rocrand_threefry2x64_20.h"
40 #include "rocrand/rocrand_threefry4x32_20.h"
41 #include "rocrand/rocrand_threefry4x64_20.h"
42 #include "rocrand/rocrand_xorwow.h"
44 #include "rocrand/rocrand_uniform.h"
46 #include <hip/hip_runtime.h>
50 namespace rocrand_device {
53 __forceinline__ __device__ __host__ float2 box_muller(
unsigned int x,
unsigned int y)
56 float u = ROCRAND_2POW32_INV + (x * ROCRAND_2POW32_INV);
57 float v = ROCRAND_2POW32_INV_2PI + (y * ROCRAND_2POW32_INV_2PI);
58 float s = sqrtf(-2.0f * logf(u));
59 #ifdef __HIP_DEVICE_COMPILE__
60 __sincosf(v, &result.x, &result.y);
64 result.x = sinf(v) * s;
65 result.y = cosf(v) * s;
70 __forceinline__ __device__ __host__ float2 box_muller(
unsigned long long v)
72 unsigned int x =
static_cast<unsigned int>(v);
73 unsigned int y =
static_cast<unsigned int>(v >> 32);
75 return box_muller(x, y);
78 __forceinline__ __device__ __host__ double2 box_muller_double(uint4 v)
81 unsigned long long int v1 = (
unsigned long long int)v.x ^
82 ((
unsigned long long int)v.y << (53 - 32));
83 double u = ROCRAND_2POW53_INV_DOUBLE + (v1 * ROCRAND_2POW53_INV_DOUBLE);
84 unsigned long long int v2 = (
unsigned long long int)v.z ^
85 ((
unsigned long long int)v.w << (53 - 32));
86 double w = (ROCRAND_2POW53_INV_DOUBLE * 2.0) +
87 (v2 * (ROCRAND_2POW53_INV_DOUBLE * 2.0));
88 double s = sqrt(-2.0 * log(u));
89 #ifdef __HIP_DEVICE_COMPILE__
90 sincospi(w, &result.x, &result.y);
94 result.x = sin(w * ROCRAND_PI_DOUBLE) * s;
95 result.y = cos(w * ROCRAND_PI_DOUBLE) * s;
100 __forceinline__ __device__ __host__ double2 box_muller_double(ulonglong2 v)
102 unsigned int x =
static_cast<unsigned int>(v.x);
103 unsigned int y =
static_cast<unsigned int>(v.x >> 32);
104 unsigned int z =
static_cast<unsigned int>(v.y);
105 unsigned int w =
static_cast<unsigned int>(v.y >> 32);
107 return box_muller_double(make_uint4(x, y, z, w));
110 __forceinline__ __device__ __host__ __half2 box_muller_half(
unsigned short x,
unsigned short y)
112 #if defined(ROCRAND_HALF_MATH_SUPPORTED)
113 __half u = __float2half(ROCRAND_2POW16_INV + (x * ROCRAND_2POW16_INV));
114 __half v = __float2half(ROCRAND_2POW16_INV_2PI + (y * ROCRAND_2POW16_INV_2PI));
115 __half s = hsqrt(__hmul(__float2half(-2.0f), hlog(u)));
122 float u = ROCRAND_2POW16_INV + (x * ROCRAND_2POW16_INV);
123 float v = ROCRAND_2POW16_INV_2PI + (y * ROCRAND_2POW16_INV_2PI);
124 float s = sqrtf(-2.0f * logf(u));
125 #ifdef __HIP_DEVICE_COMPILE__
126 __sincosf(v, &r.x, &r.y);
140 template<
typename state_type>
141 __forceinline__ __device__ __host__ float2 mrg_box_muller(
unsigned int x,
unsigned int y)
144 float u = rocrand_device::detail::mrg_uniform_distribution<state_type>(x);
145 float v = rocrand_device::detail::mrg_uniform_distribution<state_type>(y) * ROCRAND_2PI;
146 float s = sqrtf(-2.0f * logf(u));
147 #ifdef __HIP_DEVICE_COMPILE__
148 __sincosf(v, &result.x, &result.y);
152 result.x = sinf(v) * s;
153 result.y = cosf(v) * s;
158 template<
typename state_type>
159 __forceinline__ __device__ __host__ double2 mrg_box_muller_double(
unsigned int x,
unsigned int y)
162 double u = rocrand_device::detail::mrg_uniform_distribution<state_type>(x);
163 double v = rocrand_device::detail::mrg_uniform_distribution<state_type>(y) * 2.0;
164 double s = sqrt(-2.0 * log(u));
165 #ifdef __HIP_DEVICE_COMPILE__
166 sincospi(v, &result.x, &result.y);
170 result.x = sin(v * ROCRAND_PI_DOUBLE) * s;
171 result.y = cos(v * ROCRAND_PI_DOUBLE) * s;
176 __forceinline__ __device__ __host__
float roc_f_erfinv(
float x)
178 float tt1, tt2, lnx, sgn;
179 sgn = (x < 0.0f) ? -1.0f : 1.0f;
181 x = (1.0f - x) * (1.0f + x);
184 #ifdef __HIP_DEVICE_COMPILE__
190 #ifdef __HIP_DEVICE_COMPILE__
193 else if (std::isinf(lnx))
197 tt1 = 2.0f / (ROCRAND_PI * 0.147f) + 0.5f * lnx;
198 tt2 = 1.0f / (0.147f) * lnx;
200 return(sgn * sqrtf(-tt1 + sqrtf(tt1 * tt1 - tt2)));
203 __forceinline__ __device__ __host__
double roc_d_erfinv(
double x)
205 double tt1, tt2, lnx, sgn;
206 sgn = (x < 0.0) ? -1.0 : 1.0;
208 x = (1.0 - x) * (1.0 + x);
211 #ifdef __HIP_DEVICE_COMPILE__
217 #ifdef __HIP_DEVICE_COMPILE__
220 else if (std::isinf(lnx))
224 tt1 = 2.0 / (ROCRAND_PI_DOUBLE * 0.147) + 0.5 * lnx;
225 tt2 = 1.0 / (0.147) * lnx;
227 return(sgn * sqrt(-tt1 + sqrt(tt1 * tt1 - tt2)));
230 __forceinline__ __device__ __host__
float normal_distribution(
unsigned int x)
232 float p = ::rocrand_device::detail::uniform_distribution(x);
233 float v = ROCRAND_SQRT2 * ::rocrand_device::detail::roc_f_erfinv(2.0f * p - 1.0f);
237 __forceinline__ __device__ __host__
float normal_distribution(
unsigned long long int x)
239 float p = ::rocrand_device::detail::uniform_distribution(x);
240 float v = ROCRAND_SQRT2 * ::rocrand_device::detail::roc_f_erfinv(2.0f * p - 1.0f);
244 __forceinline__ __device__ __host__ float2 normal_distribution2(
unsigned int v1,
unsigned int v2)
246 return ::rocrand_device::detail::box_muller(v1, v2);
249 __forceinline__ __device__ __host__ float2 normal_distribution2(uint2 v)
251 return ::rocrand_device::detail::box_muller(v.x, v.y);
254 __forceinline__ __device__ __host__ float2 normal_distribution2(
unsigned long long v)
256 return ::rocrand_device::detail::box_muller(v);
259 __forceinline__ __device__ __host__ float4 normal_distribution4(uint4 v)
261 float2 r1 = ::rocrand_device::detail::box_muller(v.x, v.y);
262 float2 r2 = ::rocrand_device::detail::box_muller(v.z, v.w);
271 __forceinline__ __device__ __host__ float4 normal_distribution4(longlong2 v)
273 float2 r1 = ::rocrand_device::detail::box_muller(v.x);
274 float2 r2 = ::rocrand_device::detail::box_muller(v.y);
275 return float4{r1.x, r1.y, r2.x, r2.y};
278 __forceinline__ __device__ __host__ float4 normal_distribution4(
unsigned long long v1,
279 unsigned long long v2)
281 float2 r1 = ::rocrand_device::detail::box_muller(v1);
282 float2 r2 = ::rocrand_device::detail::box_muller(v2);
283 return float4{r1.x, r1.y, r2.x, r2.y};
286 __forceinline__ __device__ __host__
double normal_distribution_double(
unsigned int x)
288 double p = ::rocrand_device::detail::uniform_distribution_double(x);
289 double v = ROCRAND_SQRT2 * ::rocrand_device::detail::roc_d_erfinv(2.0 * p - 1.0);
293 __forceinline__ __device__ __host__
double normal_distribution_double(
unsigned long long int x)
295 double p = ::rocrand_device::detail::uniform_distribution_double(x);
296 double v = ROCRAND_SQRT2 * ::rocrand_device::detail::roc_d_erfinv(2.0 * p - 1.0);
300 __forceinline__ __device__ __host__ double2 normal_distribution_double2(uint4 v)
302 return ::rocrand_device::detail::box_muller_double(v);
305 __forceinline__ __device__ __host__ double2 normal_distribution_double2(ulonglong2 v)
307 return ::rocrand_device::detail::box_muller_double(v);
310 __forceinline__ __device__ __host__ __half2 normal_distribution_half2(
unsigned int v)
312 return ::rocrand_device::detail::box_muller_half(
313 static_cast<unsigned short>(v),
314 static_cast<unsigned short>(v >> 16)
318 __forceinline__ __device__ __host__ __half2 normal_distribution_half2(
unsigned long long v)
320 return ::rocrand_device::detail::box_muller_half(
static_cast<unsigned short>(v),
321 static_cast<unsigned short>(v >> 32));
324 template<
typename state_type>
325 __forceinline__ __device__ __host__ float2 mrg_normal_distribution2(
unsigned int v1,
328 return ::rocrand_device::detail::mrg_box_muller<state_type>(v1, v2);
331 template<
typename state_type>
332 __forceinline__ __device__ __host__ double2 mrg_normal_distribution_double2(
unsigned int v1,
335 return ::rocrand_device::detail::mrg_box_muller_double<state_type>(v1, v2);
338 template<
typename state_type>
339 __forceinline__ __device__ __host__ __half2 mrg_normal_distribution_half2(
unsigned int v)
341 v = rocrand_device::detail::mrg_uniform_distribution_uint<state_type>(v);
342 return ::rocrand_device::detail::box_muller_half(
343 static_cast<unsigned short>(v),
344 static_cast<unsigned short>(v >> 16)
365 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
366 __forceinline__ __device__ __host__
float rocrand_normal(rocrand_state_philox4x32_10* state)
368 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_philox4x32_10> bm_helper;
370 if(bm_helper::has_float(state))
372 return bm_helper::get_float(state);
378 float2 r = rocrand_device::detail::normal_distribution2(state1, state2);
379 bm_helper::save_float(state, r.y);
398 __forceinline__ __device__ __host__
404 return rocrand_device::detail::normal_distribution2(state1, state2);
421 __forceinline__ __device__ __host__
424 return rocrand_device::detail::normal_distribution4(
rocrand4(state));
441 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
444 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_philox4x32_10> bm_helper;
446 if(bm_helper::has_double(state))
448 return bm_helper::get_double(state);
450 double2 r = rocrand_device::detail::normal_distribution_double2(
rocrand4(state));
451 bm_helper::save_double(state, r.y);
470 __forceinline__ __device__ __host__
473 return rocrand_device::detail::normal_distribution_double2(
rocrand4(state));
490 __forceinline__ __device__ __host__
494 r1 = rocrand_device::detail::normal_distribution_double2(
rocrand4(state));
495 r2 = rocrand_device::detail::normal_distribution_double2(
rocrand4(state));
497 r1.x, r1.y, r2.x, r2.y
515 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
516 __forceinline__ __device__ __host__
float rocrand_normal(rocrand_state_mrg31k3p* state)
518 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_mrg31k3p> bm_helper;
520 if(bm_helper::has_float(state))
522 return bm_helper::get_float(state);
525 auto state1 = state->next();
526 auto state2 = state->next();
529 = rocrand_device::detail::mrg_normal_distribution2<rocrand_state_mrg31k3p>(state1, state2);
530 bm_helper::save_float(state, r.y);
549 __forceinline__ __device__ __host__
552 auto state1 = state->next();
553 auto state2 = state->next();
555 return rocrand_device::detail::mrg_normal_distribution2<rocrand_state_mrg31k3p>(state1, state2);
572 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
575 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_mrg31k3p> bm_helper;
577 if(bm_helper::has_double(state))
579 return bm_helper::get_double(state);
582 auto state1 = state->next();
583 auto state2 = state->next();
586 = rocrand_device::detail::mrg_normal_distribution_double2<rocrand_state_mrg31k3p>(state1,
588 bm_helper::save_double(state, r.y);
607 __forceinline__ __device__ __host__
610 auto state1 = state->next();
611 auto state2 = state->next();
613 return rocrand_device::detail::mrg_normal_distribution_double2<rocrand_state_mrg31k3p>(state1,
631 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
632 __forceinline__ __device__ __host__
float rocrand_normal(rocrand_state_mrg32k3a* state)
634 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_mrg32k3a> bm_helper;
636 if(bm_helper::has_float(state))
638 return bm_helper::get_float(state);
641 auto state1 = state->next();
642 auto state2 = state->next();
645 = rocrand_device::detail::mrg_normal_distribution2<rocrand_state_mrg32k3a>(state1, state2);
646 bm_helper::save_float(state, r.y);
665 __forceinline__ __device__ __host__
668 auto state1 = state->next();
669 auto state2 = state->next();
671 return rocrand_device::detail::mrg_normal_distribution2<rocrand_state_mrg32k3a>(state1, state2);
688 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
691 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_mrg32k3a> bm_helper;
693 if(bm_helper::has_double(state))
695 return bm_helper::get_double(state);
698 auto state1 = state->next();
699 auto state2 = state->next();
702 = rocrand_device::detail::mrg_normal_distribution_double2<rocrand_state_mrg32k3a>(state1,
704 bm_helper::save_double(state, r.y);
723 __forceinline__ __device__ __host__
726 auto state1 = state->next();
727 auto state2 = state->next();
729 return rocrand_device::detail::mrg_normal_distribution_double2<rocrand_state_mrg32k3a>(state1,
747 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
748 __forceinline__ __device__ __host__
float rocrand_normal(rocrand_state_xorwow* state)
750 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_xorwow> bm_helper;
752 if(bm_helper::has_float(state))
754 return bm_helper::get_float(state);
758 float2 r = rocrand_device::detail::normal_distribution2(state1, state2);
759 bm_helper::save_float(state, r.y);
778 __forceinline__ __device__ __host__
783 return rocrand_device::detail::normal_distribution2(state1, state2);
800 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
803 typedef rocrand_device::detail::engine_boxmuller_helper<rocrand_state_xorwow> bm_helper;
805 if(bm_helper::has_double(state))
807 return bm_helper::get_double(state);
815 double2 r = rocrand_device::detail::normal_distribution_double2(
816 uint4 { state1, state2, state3, state4 }
818 bm_helper::save_double(state, r.y);
837 __forceinline__ __device__ __host__
845 return rocrand_device::detail::normal_distribution_double2(
846 uint4 { state1, state2, state3, state4 }
862 __forceinline__ __device__
865 return rocrand_device::detail::normal_distribution(
rocrand(state));
882 __forceinline__ __device__
887 return rocrand_device::detail::normal_distribution2(state1, state2);
902 __forceinline__ __device__
905 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
922 __forceinline__ __device__
930 return rocrand_device::detail::normal_distribution_double2(
931 uint4{state1, state2, state3, state4});
946 __forceinline__ __device__ __host__
949 return rocrand_device::detail::normal_distribution(
rocrand(state));
964 __forceinline__ __device__ __host__
967 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
982 __forceinline__ __device__ __host__
985 return rocrand_device::detail::normal_distribution(
rocrand(state));
1000 __forceinline__ __device__ __host__
1003 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1018 __forceinline__ __device__ __host__
1021 return rocrand_device::detail::normal_distribution(
rocrand(state));
1036 __forceinline__ __device__ __host__
1039 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1054 __forceinline__ __device__ __host__
1057 return rocrand_device::detail::normal_distribution(
rocrand(state));
1072 __forceinline__ __device__ __host__
1075 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1090 __forceinline__ __device__ __host__
1093 return rocrand_device::detail::normal_distribution(
rocrand(state));
1110 __forceinline__ __device__ __host__
1116 return rocrand_device::detail::normal_distribution2(state1, state2);
1131 __forceinline__ __device__ __host__
1134 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1151 __forceinline__ __device__ __host__
1159 return rocrand_device::detail::normal_distribution_double2(
1160 uint4{state1, state2, state3, state4});
1175 __forceinline__ __device__ __host__
1178 return rocrand_device::detail::normal_distribution(
rocrand(state));
1195 __forceinline__ __device__ __host__
1198 return rocrand_device::detail::normal_distribution2(rocrand2(state));
1213 __forceinline__ __device__ __host__
1216 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1233 __forceinline__ __device__ __host__
1236 auto state1 = rocrand2(state);
1237 auto state2 = rocrand2(state);
1239 return rocrand_device::detail::normal_distribution_double2(
1240 uint4{state1.x, state1.y, state2.x, state2.y});
1255 __forceinline__ __device__ __host__
1258 return rocrand_device::detail::normal_distribution(
rocrand(state));
1275 __forceinline__ __device__ __host__
1278 return rocrand_device::detail::normal_distribution2(
rocrand(state));
1293 __forceinline__ __device__ __host__
1296 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1313 __forceinline__ __device__ __host__
1316 return rocrand_device::detail::normal_distribution_double2(rocrand2(state));
1331 __forceinline__ __device__ __host__
1334 return rocrand_device::detail::normal_distribution(
rocrand(state));
1351 __forceinline__ __device__ __host__
1357 return rocrand_device::detail::normal_distribution2(state1, state2);
1372 __forceinline__ __device__ __host__
1375 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1392 __forceinline__ __device__ __host__
1395 return rocrand_device::detail::normal_distribution_double2(
rocrand4(state));
1410 __forceinline__ __device__ __host__
1413 return rocrand_device::detail::normal_distribution(
rocrand(state));
1430 __forceinline__ __device__ __host__
1436 return rocrand_device::detail::normal_distribution2(state1, state2);
1451 __forceinline__ __device__ __host__
1454 return rocrand_device::detail::normal_distribution_double(
rocrand(state));
1471 __forceinline__ __device__ __host__
1477 return rocrand_device::detail::normal_distribution_double2(ulonglong2{state1, state2});
__forceinline__ __device__ __host__ double4 rocrand_normal_double4(rocrand_state_philox4x32_10 *state)
Returns four normally distributed double values.
Definition: rocrand_normal.h:491
__forceinline__ __device__ __host__ double2 rocrand_normal_double2(rocrand_state_philox4x32_10 *state)
Returns two normally distributed double values.
Definition: rocrand_normal.h:471
__forceinline__ __device__ __host__ float rocrand_normal(rocrand_state_philox4x32_10 *state)
Returns a normally distributed float value.
Definition: rocrand_normal.h:366
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:379
__forceinline__ __device__ __host__ double rocrand_normal_double(rocrand_state_philox4x32_10 *state)
Returns a normally distributed double value.
Definition: rocrand_normal.h:442
__forceinline__ __device__ __host__ float4 rocrand_normal4(rocrand_state_philox4x32_10 *state)
Returns four normally distributed float values.
Definition: rocrand_normal.h:422
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_lfsr113 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_lfsr113.h:277
__forceinline__ __device__ __host__ float2 rocrand_normal2(rocrand_state_philox4x32_10 *state)
Returns two normally distributed float values.
Definition: rocrand_normal.h:399