/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/device_tensor_generator.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/device_tensor_generator.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/library/utility/device_tensor_generator.hpp Source File
device_tensor_generator.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 #pragma once
4 #include <hip/hip_runtime.h>
5 
6 #include "ck/ck.hpp"
10 
11 // use xorshift for now since it is simple. Should be suitable enough, but feel free to switch in
12 // the future
14 {
15  uint32_t s[4];
16 };
17 
19 {
20  uint32_t tmp = state.s[3];
21  state.s[3] = state.s[2];
22  state.s[2] = state.s[1];
23  state.s[1] = state.s[0];
24  tmp ^= tmp << 11;
25  tmp ^= tmp >> 8;
26  state.s[0] = tmp ^ state.s[0] ^ (state.s[0] >> 19);
27  return state.s[0];
28 }
29 
30 __device__ ran_state_u32 ran_init(uint32_t seed = 0)
31 {
32  ran_state_u32 state;
33  // use primes for initialization
34  state.s[0] = (blockDim.x * blockIdx.x + threadIdx.x) * 8912741 + 2313212 + seed;
35  state.s[1] =
36  (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 5013829 + 6012697;
37  state.s[2] = (blockDim.x * blockIdx.x + threadIdx.x) * 3412309 + 2912479;
38  state.s[3] =
39  (gridDim.x * blockDim.x - (blockDim.x * blockIdx.x + threadIdx.x)) * 1001447 + 9912307;
40 
41  // run 20 rounds
42  for(int i = 0; i < 20; i++)
43  {
44  ran_gen_round_u32(state);
45  }
46  return state;
47 }
48 
49 template <typename T>
51  int min_value,
52  int max_value,
53  uint64_t buffer_element_size)
54 {
55  // initial values
56  ran_state_u32 s = ran_init();
57  for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
58  i < buffer_element_size / ck::packed_size_v<T>;
59  i += blockDim.x * gridDim.x)
60  {
61  if constexpr(ck::is_same_v<T, ck::pk_i4_t>)
62  {
63  uint8_t hi = ((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value + 8;
64  uint8_t lo = ((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value + 8;
65  ck::pk_i4_t res = ((hi & 0xf) << 4) + (lo & 0xf);
66  p[i] = res;
67  }
68  else
69  {
70  p[i] = ck::type_convert<T, int>(
71  static_cast<int>((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value);
72  }
73  }
74 }
75 
76 template <typename T>
78  float min_value,
79  float max_value,
80  uint64_t buffer_element_size)
81 {
82  // initial values
83  ran_state_u32 s = ran_init();
84  for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x;
85  i < buffer_element_size / ck::packed_size_v<T>;
86  i += blockDim.x * gridDim.x)
87  {
88  if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
89  {
90  float u1 =
91  ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
92  float u2 =
93  ran_gen_round_u32(s) * (1.0f / 4294967296.0f) * (max_value - min_value) + min_value;
94 
95  p[i] = ck::type_convert<ck::f4x2_t>(ck::float2_t{u1, u2});
96  }
97  else
98  {
99  float ran = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
100  p[i] = ck::type_convert<T, float>(ran * (max_value - min_value) + min_value);
101  }
102  }
103 }
104 
105 template <typename T>
106 __global__ void
107 fill_tensor_norm_rand_fp_values(T* p, float sigma, float mean, uint64_t buffer_element_size)
108 {
109  static constexpr float PI = 3.141592653f;
110  // initial values
111  ran_state_u32 s = ran_init();
112  float norm[2];
113  for(uint64_t i = blockIdx.x * blockDim.x + threadIdx.x, j = 0; i < buffer_element_size;
114  i += blockDim.x * gridDim.x, j++)
115  {
116  if(j % (2 / ck::packed_size_v<T>) == 0)
117  {
118  float u1 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
119  float u2 = ran_gen_round_u32(s) * (1.0f / 4294967296.0f);
120  float scale = sigma * ck::math::sqrt(-2.0f * ck::math::log(u1));
121  norm[0] = scale * ck::math::cos(2.0f * PI * u2) + mean;
122  norm[1] = scale * ck::math::sin(2.0f * PI * u2) + mean;
123  }
124 
125  if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
126  {
127  p[i] = ck::type_convert<ck::f4x2_t>(ck::float2_t{norm[0], norm[1]});
128  }
129  else
130  {
131  p[i] = ck::type_convert<T, float>(norm[j % 2]);
132  }
133  }
134 }
__device__ uint32_t ran_gen_round_u32(ran_state_u32 &state)
Definition: device_tensor_generator.hpp:18
__global__ void fill_tensor_uniform_rand_fp_values(T *p, float min_value, float max_value, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:77
__global__ void fill_tensor_uniform_rand_int_values(T *p, int min_value, int max_value, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:50
__device__ ran_state_u32 ran_init(uint32_t seed=0)
Definition: device_tensor_generator.hpp:30
__global__ void fill_tensor_norm_rand_fp_values(T *p, float sigma, float mean, uint64_t buffer_element_size)
Definition: device_tensor_generator.hpp:107
__host__ T log(T x)
Definition: math_v2.hpp:409
__host__ T sin(T x)
Definition: math_v2.hpp:187
__host__ T cos(T x)
Definition: math_v2.hpp:241
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
unsigned int uint32_t
Definition: stdint.h:126
unsigned char uint8_t
Definition: stdint.h:124
unsigned __int64 uint64_t
Definition: stdint.h:136
Definition: data_type.hpp:187
Definition: device_tensor_generator.hpp:14
uint32_t s[4]
Definition: device_tensor_generator.hpp:15