4 #include <hip/hip_runtime.h>
21 state.
s[3] = state.
s[2];
22 state.
s[2] = state.
s[1];
23 state.
s[1] = state.
s[0];
26 state.
s[0] = tmp ^ state.
s[0] ^ (state.
s[0] >> 19);
34 state.
s[0] = (blockDim.x * blockIdx.x + threadIdx.x) * 8912741 + 2313212 + seed;
36 (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 5013829 + 6012697;
37 state.
s[2] = (blockDim.x * blockIdx.x + threadIdx.x) * 3412309 + 2912479;
39 (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 1001447 + 9912307;
42 for(
int i = 0; i < 20; i++)
57 for(
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
58 i < buffer_element_size / ck::packed_size_v<T>;
59 i += blockDim.x * gridDim.x)
61 if constexpr(ck::is_same_v<T, ck::pk_i4_t>)
70 p[i] = ck::type_convert<T, int>(
84 for(
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
85 i < buffer_element_size / ck::packed_size_v<T>;
86 i += blockDim.x * gridDim.x)
88 if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
91 ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
93 ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
95 p[i] = ck::type_convert<ck::f4x2_t>(
ck::float2_t{u1, u2});
100 p[i] = ck::type_convert<T, float>(ran * (max_value - min_value) + min_value);
105 template <
typename T>
109 static constexpr
float PI = 3.141592653f;
113 for(
uint64_t i = blockIdx.x * blockDim.x + threadIdx.x, j = 0; i < buffer_element_size;
114 i += blockDim.x * gridDim.x, j++)
116 if(j % (2 / ck::packed_size_v<T>) == 0)
120 float scale = sigma * ck::math::sqrt(-2.0f *
ck::math::log(u1));
125 if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
127 p[i] = ck::type_convert<ck::f4x2_t>(
ck::float2_t{norm[0], norm[1]});
131 p[i] = ck::type_convert<T, float>(norm[j % 2]);
__device__ uint32_t ran_gen_round_u32(ran_state_u32 &state)
Definition: device_tensor_generator.hpp:18
__global__ void fill_tensor_uniform_rand_fp_values(T *p, float min_value, float max_value, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:77
__global__ void fill_tensor_uniform_rand_int_values(T *p, int min_value, int max_value, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:50
__device__ ran_state_u32 ran_init(uint32_t seed=0)
Definition: device_tensor_generator.hpp:30
__global__ void fill_tensor_norm_rand_fp_values(T *p, float sigma, float mean, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:107
__host__ T log(T x)
Definition: math_v2.hpp:409
__host__ T sin(T x)
Definition: math_v2.hpp:187
__host__ T cos(T x)
Definition: math_v2.hpp:241
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
unsigned int uint32_t
Definition: stdint.h:126
unsigned char uint8_t
Definition: stdint.h:124
unsigned __int64 uint64_t
Definition: stdint.h:136
Definition: data_type.hpp:187
Definition: device_tensor_generator.hpp:14
uint32_t s[4]
Definition: device_tensor_generator.hpp:15