12 #include <type_traits>
14 #include <unordered_set>
38 template <
typename T =
void>
43 std::optional<uint32_t>
seed_{11939};
45 template <
typename ForwardIter>
46 void operator()(ForwardIter first, ForwardIter last)
const
50 using T_iter = std::decay_t<decltype(*first)>;
51 static_assert(std::is_same_v<T, T_iter> || std::is_void_v<T>,
52 "Iterator value type must match template type T");
54 const auto total =
static_cast<size_t>(std::distance(first, last));
55 const auto total_bytes = total *
sizeof(T_iter);
59 constexpr
uint64_t MAX_THREAD_COUNT = 80;
60 const size_t num_thread =
min(
62 constexpr
size_t BLOCK_BYTES = 64;
63 constexpr
size_t BLOCK_SIZE = BLOCK_BYTES /
sizeof(T_iter);
68 std::minstd_rand gen(
seed_.has_value() ? *
seed_ : std::random_device{}());
69 std::uniform_real_distribution<float> dis(
a_,
b_);
71 std::vector<joinable_thread> threads;
72 threads.reserve(num_thread - 1);
73 for(
int it = num_thread - 1; it >= 0; --it)
75 const size_t ib_begin = it * blocks_per_thread;
76 const size_t ib_end =
min(ib_begin + blocks_per_thread, num_blocks);
81 g_.discard(ib_begin * BLOCK_SIZE * PackedSize);
83 if constexpr(PackedSize == 2)
84 return type_convert<T_iter>(
fp32x2_t{d_(g_), d_(g_)});
86 return type_convert<T_iter>(d_(g_));
90 for(; ib < ib_end - 1; ++ib)
92 constexpr
size_t iw = iw_.value;
93 *(first + ib * BLOCK_SIZE + iw) = t_fn();
95 for(
size_t iw = 0; iw < BLOCK_SIZE; ++iw)
96 if(ib * BLOCK_SIZE + iw < total)
97 *(first + ib * BLOCK_SIZE + iw) = t_fn();
101 threads.emplace_back(std::move(job));
107 template <
typename ForwardRange>
109 -> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
110 std::begin(std::forward<ForwardRange>(range)),
111 std::end(std::forward<ForwardRange>(range))))>
113 (*this)(std::begin(std::forward<ForwardRange>(range)),
114 std::end(std::forward<ForwardRange>(range)));
124 std::optional<uint32_t>
seed_{11939};
125 template <
typename ForwardIter>
128 if(a_ < -8.0f || b_ > 7.0f)
130 throw std::runtime_error(
131 "a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
134 int min_value =
static_cast<int>(
a_);
135 int max_value =
static_cast<int>(
b_);
136 constexpr
auto int4_array = std::array<uint8_t, 16>{0x88,
152 std::mt19937 gen(
seed_.has_value() ? *
seed_ : std::random_device{}());
153 std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value + 1);
156 int randomInt = dis(gen);
157 *first = int4_array[randomInt + (min_value + 8)];
161 template <
typename ForwardRange>
163 -> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
164 std::begin(std::forward<ForwardRange>(range)),
165 std::end(std::forward<ForwardRange>(range))))>
167 (*this)(std::begin(std::forward<ForwardRange>(range)),
168 std::end(std::forward<ForwardRange>(range)));
182 template <
typename T>
187 template <
typename T>
192 std::optional<uint32_t>
seed_{11939};
195 std::unordered_set<impl::RawIntegerType<T>>
set_{};
199 std::optional<uint32_t> seed = {11939})
208 template <
typename ForwardIter>
211 std::mt19937& gen =
gen_;
212 std::uniform_real_distribution<float> dis(
a_,
b_);
214 std::generate(first, last, [&dis, &gen, &set]() {
215 T v =
static_cast<T
>(0);
218 v = ck_tile::type_convert<T>(dis(gen));
226 template <
typename ForwardRange>
228 -> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
229 std::begin(std::forward<ForwardRange>(range)),
230 std::end(std::forward<ForwardRange>(range))))>
232 (*this)(std::begin(std::forward<ForwardRange>(range)),
233 std::end(std::forward<ForwardRange>(range)));
239 template <
typename T>
244 std::optional<uint32_t>
seed_{11939};
248 template <
typename ForwardIter>
253 uint32_t num_thread = std::thread::hardware_concurrency();
254 auto total =
static_cast<std::size_t
>(std::distance(first, last));
255 auto work_per_thread =
static_cast<std::size_t
>((total + num_thread - 1) / num_thread);
257 std::vector<joinable_thread> threads(num_thread);
258 for(std::size_t it = 0; it < num_thread; ++it)
260 std::size_t iw_begin = it * work_per_thread;
261 std::size_t iw_end =
std::min((it + 1) * work_per_thread, total);
262 auto thread_f = [
this, total, iw_begin, iw_end, &first] {
263 if(iw_begin > total || iw_end > total)
266 std::mt19937 gen(
seed_.has_value() ? (*
seed_ + iw_begin)
267 : std::random_device{}());
269 std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
270 return ck_tile::type_convert<T>(dis(gen));
278 std::mt19937 gen(
seed_.has_value() ? *
seed_ : std::random_device{}());
281 first, last, [&dis, &gen]() {
return ck_tile::type_convert<T>(dis(gen)); });
285 template <
typename ForwardRange>
287 -> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
288 std::begin(std::forward<ForwardRange>(range)),
289 std::end(std::forward<ForwardRange>(range))))>
291 (*this)(std::begin(std::forward<ForwardRange>(range)),
292 std::end(std::forward<ForwardRange>(range)));
315 template <
typename T>
320 std::optional<uint32_t>
seed_{11939};
322 template <
typename ForwardIter>
325 std::mt19937 gen(
seed_.has_value() ? *
seed_ : std::random_device{}());
326 std::uniform_real_distribution<float> dis(
a_,
b_);
328 first, last, [&dis, &gen]() {
return ck_tile::type_convert<T>(std::round(dis(gen))); });
331 template <
typename ForwardRange>
333 -> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
334 std::begin(std::forward<ForwardRange>(range)),
335 std::end(std::forward<ForwardRange>(range))))>
337 (*this)(std::begin(std::forward<ForwardRange>(range)),
338 std::end(std::forward<ForwardRange>(range)));
342 template <
typename T>
347 std::optional<uint32_t>
seed_{11939};
349 template <
typename ForwardIter>
352 std::mt19937 gen(
seed_.has_value() ? *
seed_ : std::random_device{}());
355 first, last, [&dis, &gen]() {
return ck_tile::type_convert<T>(std::round(dis(gen))); });
358 template <
typename ForwardRange>
360 -> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
361 std::begin(std::forward<ForwardRange>(range)),
362 std::end(std::forward<ForwardRange>(range))))>
364 (*this)(std::begin(std::forward<ForwardRange>(range)),
365 std::end(std::forward<ForwardRange>(range)));
369 template <
typename T>
375 template <
typename ForwardIter>
378 std::generate(first, last, [=, *
this, n =
init_value_]()
mutable {
382 n.data +=
step_.data;
392 template <
typename ForwardRange>
394 -> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
395 std::begin(std::forward<ForwardRange>(range)),
396 std::end(std::forward<ForwardRange>(range))))>
398 (*this)(std::begin(std::forward<ForwardRange>(range)),
399 std::end(std::forward<ForwardRange>(range)));
403 template <
typename T,
bool IsAscending = true>
410 template <
typename ForwardIter>
413 std::generate(first, last, [=, *
this, n =
start_value_]()
mutable {
416 if constexpr(IsAscending)
427 return type_convert<T>(tmp);
431 template <
typename ForwardRange>
433 -> std::void_t<decltype(std::declval<const FillStepRange&>()(
434 std::begin(std::forward<ForwardRange>(range)),
435 std::end(std::forward<ForwardRange>(range))))>
437 (*this)(std::begin(std::forward<ForwardRange>(range)),
438 std::end(std::forward<ForwardRange>(range)));
442 template <
typename T>
447 template <
typename ForwardIter>
453 template <
typename ForwardRange>
455 -> std::void_t<decltype(std::declval<const FillConstant&>()(
456 std::begin(std::forward<ForwardRange>(range)),
457 std::end(std::forward<ForwardRange>(range))))>
459 (*this)(std::begin(std::forward<ForwardRange>(range)),
460 std::end(std::forward<ForwardRange>(range)));
467 template <
typename T>
485 template <
typename ForwardIter>
492 return type_convert<T>(tmp);
496 template <
typename ForwardRange>
498 -> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
499 std::begin(std::forward<ForwardRange>(range)),
500 std::end(std::forward<ForwardRange>(range))))>
502 (*this)(std::begin(std::forward<ForwardRange>(range)),
503 std::end(std::forward<ForwardRange>(range)));
507 template <
typename T,
bool UseCos = true,
bool UseAbs = false>
510 template <
typename T_,
bool UseCos_ = true,
bool UseAbs_ = false>
517 if constexpr(UseCos_)
525 if constexpr(UseAbs_)
528 return ck_tile::type_convert<T_>(v);
531 template <
typename ForwardIter>
535 std::generate(first, last, gen);
538 template <
typename ForwardRange>
540 -> std::void_t<decltype(std::declval<const FillTrigValue&>()(
541 std::begin(std::forward<ForwardRange>(range)),
542 std::end(std::forward<ForwardRange>(range))))>
544 (*this)(std::begin(std::forward<ForwardRange>(range)),
545 std::end(std::forward<ForwardRange>(range)));
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
auto fill(OutputRange &&range, const T &init) -> std::void_t< decltype(std::fill(std::begin(std::forward< OutputRange >(range)), std::end(std::forward< OutputRange >(range)), init))>
Definition: algorithm.hpp:25
auto transform(InputRange &&range, OutputIterator iter, UnaryOperation unary_op) -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
Definition: algorithm.hpp:36
typename RawIntegerType_< sizeof(T)>::type RawIntegerType
Definition: fill.hpp:183
Definition: cluster_descriptor.hpp:13
constexpr CK_TILE_HOST_DEVICE auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:145
constexpr CK_TILE_HOST_DEVICE Y bit_cast(const X &x)
Definition: bit_cast.hpp:11
CK_TILE_HOST T cos(T x)
Definition: math.hpp:745
float fp32x2_t
Definition: bfloat16.hpp:434
CK_TILE_HOST T sin(T x)
Definition: math.hpp:691
int32_t int32_t
Definition: integer.hpp:10
CK_TILE_HOST_DEVICE bfloat16_t abs(const bfloat16_t &x)
Definition: bfloat16.hpp:403
unsigned int get_available_cpu_cores()
Definition: joinable_thread.hpp:31
constexpr CK_TILE_HOST_DEVICE T min(T x)
Definition: math.hpp:206
constexpr bool is_same_v
Definition: type.hpp:283
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1517
unsigned short uint16_t
Definition: stdint.h:125
unsigned int uint32_t
Definition: stdint.h:126
unsigned char uint8_t
Definition: stdint.h:124
unsigned __int64 uint64_t
Definition: stdint.h:136
Transforms given input to fit 2:4 structured sparsity pattern so every subgroup of 4 elements contain...
Definition: fill.hpp:469
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const AdjustToStructuredSparsity & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:497
size_t start
Definition: fill.hpp:470
static constexpr int32_t masks[]
Definition: fill.hpp:473
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:486
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillConstant & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:454
T value_
Definition: fill.hpp:445
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:448
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillMonotonicSeq & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:393
T init_value_
Definition: fill.hpp:372
T step_
Definition: fill.hpp:373
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:376
std::optional< uint32_t > seed_
Definition: fill.hpp:244
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:249
float variance_
Definition: fill.hpp:243
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillNormalDistribution & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:286
bool threaded
Definition: fill.hpp:246
float mean_
Definition: fill.hpp:242
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:350
float mean_
Definition: fill.hpp:345
float variance_
Definition: fill.hpp:346
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillNormalDistributionIntegerValue & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:359
std::optional< uint32_t > seed_
Definition: fill.hpp:347
float end_value_
Definition: fill.hpp:407
float start_value_
Definition: fill.hpp:406
float step_
Definition: fill.hpp:408
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:411
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillStepRange & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:432
int i
Definition: fill.hpp:513
auto operator()()
Definition: fill.hpp:514
void operator()(ForwardIter first, ForwardIter last) const
Definition: fill.hpp:532
auto operator()(ForwardRange &&range) const -> std::void_t< decltype(std::declval< const FillTrigValue & >()(std::begin(std::forward< ForwardRange >(range)), std::end(std::forward< ForwardRange >(range))))>
Definition: fill.hpp:539
uint8_t type
Definition: fill.hpp:176
uint16_t type
Definition: fill.hpp:177
uint32_t type
Definition: fill.hpp:178
uint64_t type
Definition: fill.hpp:179
Definition: joinable_thread.hpp:15
Definition: numeric.hpp:81
Definition: pk_int4.hpp:21
Definition: functional.hpp:43