24 template <
typename Range>
25 std::ostream&
LogRange(std::ostream& os, Range&& range, std::string delim)
39 template <
typename T,
typename Range>
40 std::ostream&
LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
51 if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
52 std::is_same_v<RangeType, ck::bhalf_t>)
54 os << ck::type_convert<float>(v);
56 else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t> ||
57 std::is_same_v<RangeType, ck::f4x2_pk_t>)
59 const auto packed_floats = ck::type_convert<ck::float2_t>(v);
61 os << vector_of_floats.template AsType<float>()[
ck::Number<0>{}] << delim
62 << vector_of_floats.template AsType<float>()[
ck::Number<1>{}];
66 os << static_cast<T>(v);
72 template <
typename F,
typename T, std::size_t... Is>
75 return f(std::get<Is>(args)...);
78 template <
typename F,
typename T>
81 constexpr std::size_t N = std::tuple_size<T>{};
86 template <
typename F,
typename T, std::size_t... Is>
89 return F(std::get<Is>(args)...);
92 template <
typename F,
typename T>
95 constexpr std::size_t N = std::tuple_size<T>{};
97 return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
106 template <
typename X,
typename = std::enable_if_t<std::is_convertible_v<X, std::
size_t>>>
113 : mLens(lens.begin(), lens.end())
118 template <
typename Lengths,
120 std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
121 std::is_convertible_v<ck::ranges::range_value_t<Lengths>,
ck::long_index_t>>>
127 template <
typename X,
129 typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
130 std::is_convertible_v<Y, std::size_t>>>
132 const std::initializer_list<Y>& strides)
133 : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
138 const std::initializer_list<ck::long_index_t>& strides)
139 : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
143 template <
typename Lengths,
146 (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
147 std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
151 : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
162 template <
typename... Is>
166 std::initializer_list<std::size_t> iss{
static_cast<std::size_t
>(is)...};
178 std::vector<std::size_t> mLens;
179 std::vector<std::size_t> mStrides;
182 template <
typename New2Old>
184 const New2Old& new2old)
186 std::vector<std::size_t> new_lengths(
a.GetNumOfDimension());
187 std::vector<std::size_t> new_strides(
a.GetNumOfDimension());
189 for(std::size_t i = 0; i <
a.GetNumOfDimension(); i++)
191 new_lengths[i] =
a.GetLengths()[new2old[i]];
192 new_strides[i] =
a.GetStrides()[new2old[i]];
200 template <
typename... Xs>
215 template <
typename F,
typename... Xs>
219 static constexpr std::size_t
NDIM =
sizeof...(Xs);
220 std::array<std::size_t, NDIM>
mLens;
227 std::partial_sum(
mLens.rbegin(),
230 std::multiplies<std::size_t>());
236 std::array<std::size_t, NDIM> indices;
238 for(std::size_t idim = 0; idim <
NDIM; ++idim)
241 i -= indices[idim] *
mStrides[idim];
249 std::size_t work_per_thread = (
mN1d + num_thread - 1) / num_thread;
251 std::vector<joinable_thread> threads(num_thread);
253 for(std::size_t it = 0; it < num_thread; ++it)
255 std::size_t iw_begin = it * work_per_thread;
256 std::size_t iw_end =
std::min((it + 1) * work_per_thread,
mN1d);
258 auto f = [=, *
this] {
259 for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
269 template <
typename F,
typename... Xs>
275 template <
typename T>
281 template <
typename X>
286 template <
typename X,
typename Y>
287 Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
292 template <
typename Lengths>
297 template <
typename Lengths,
typename Str
ides>
298 Tensor(
const Lengths& lens,
const Strides& strides)
305 template <
typename OutT>
311 mData, ret.
mData.begin(), [](
auto value) { return ck::type_convert<OutT>(value); });
325 template <
typename FromT>
329 void savetxt(std::string file_name, std::string dtype =
"float")
331 std::ofstream file(file_name);
335 for(
auto& itm :
mData)
338 file << ck::type_convert<float>(itm) << std::endl;
339 else if(dtype ==
"int")
340 file << ck::type_convert<int>(itm) << std::endl;
344 file << ck::type_convert<float>(itm) << std::endl;
352 throw std::runtime_error(std::string(
"unable to open file:") + file_name);
379 template <
typename F>
395 template <
typename F>
402 template <
typename F>
418 template <
typename F>
425 template <
typename G>
431 auto f = [&](
auto i) { (*this)(i) = g(i); };
436 auto f = [&](
auto i0,
auto i1) { (*this)(i0, i1) = g(i0, i1); };
441 auto f = [&](
auto i0,
auto i1,
auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
447 auto f = [&](
auto i0,
auto i1,
auto i2,
auto i3) {
448 (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
458 auto f = [&](
auto i0,
auto i1,
auto i2,
auto i3,
auto i4) {
459 (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
470 auto f = [&](
auto i0,
auto i1,
auto i2,
auto i3,
auto i4,
auto i5) {
471 (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
483 auto f = [&](
auto i0,
495 (*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
496 g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
513 default:
throw std::runtime_error(
"unspported dimension");
519 template <
typename Distribution = std::uniform_real_distribution<
float>,
520 typename Mapping = ck::
identity,
521 typename Generator = std::minstd_rand>
524 const Generator g = Generator(0),
525 std::size_t num_thread = -1)
529 if(num_thread == -1ULL)
533 constexpr std::size_t BLOCK_BYTES = 64;
534 constexpr std::size_t BLOCK_SIZE = BLOCK_BYTES /
sizeof(T);
539 std::vector<std::thread> threads;
540 threads.reserve(num_thread - 1);
541 const auto dst =
const_cast<T*
>(this->
mData.data());
543 for(
int it = num_thread - 1; it >= 0; --it)
545 std::size_t ib_begin = it * blocks_per_thread;
546 std::size_t ib_end =
min(ib_begin + blocks_per_thread, num_blocks);
551 g_.discard(ib_begin * BLOCK_SIZE * ck::packed_size_v<T>);
558 if constexpr(ck::is_same_v<T, ck::f8_t> || ck::is_same_v<T, ck::bf8_t>)
559 return ck::type_convert<T>(
static_cast<float>(fn(dis_(g_))));
560 else if constexpr(ck::packed_size_v<T> == 1)
563 return
ck::f4x2_pk_t{ck::type_convert<ck::f4x2_t>(
565 ck::type_convert<float>(fn(dis_(g_)))})};
566 else if constexpr(ck::is_same_v<T, ck::f6x32_pk_t> ||
567 ck::is_same_v<T, ck::bf6x32_pk_t>)
569 return ck::type_convert<T>(
571 ck::type_convert<float>(fn(dis_(g_))),
572 ck::type_convert<float>(fn(dis_(g_))),
573 ck::type_convert<float>(fn(dis_(g_))),
574 ck::type_convert<float>(fn(dis_(g_))),
575 ck::type_convert<float>(fn(dis_(g_))),
576 ck::type_convert<float>(fn(dis_(g_))),
577 ck::type_convert<float>(fn(dis_(g_))),
578 ck::type_convert<float>(fn(dis_(g_))),
579 ck::type_convert<float>(fn(dis_(g_))),
580 ck::type_convert<float>(fn(dis_(g_))),
581 ck::type_convert<float>(fn(dis_(g_))),
582 ck::type_convert<float>(fn(dis_(g_))),
583 ck::type_convert<float>(fn(dis_(g_))),
584 ck::type_convert<float>(fn(dis_(g_))),
585 ck::type_convert<float>(fn(dis_(g_))),
586 ck::type_convert<float>(fn(dis_(g_))),
587 ck::type_convert<float>(fn(dis_(g_))),
588 ck::type_convert<float>(fn(dis_(g_))),
589 ck::type_convert<float>(fn(dis_(g_))),
590 ck::type_convert<float>(fn(dis_(g_))),
591 ck::type_convert<float>(fn(dis_(g_))),
592 ck::type_convert<float>(fn(dis_(g_))),
593 ck::type_convert<float>(fn(dis_(g_))),
594 ck::type_convert<float>(fn(dis_(g_))),
595 ck::type_convert<float>(fn(dis_(g_))),
596 ck::type_convert<float>(fn(dis_(g_))),
597 ck::type_convert<float>(fn(dis_(g_))),
598 ck::type_convert<float>(fn(dis_(g_))),
599 ck::type_convert<float>(fn(dis_(g_))),
600 ck::type_convert<float>(fn(dis_(g_))),
601 ck::type_convert<float>(fn(dis_(g_)))});
603 else if constexpr(ck::is_same_v<T, ck::f6x16_pk_t> ||
604 ck::is_same_v<T, ck::bf6x16_pk_t>)
606 return ck::type_convert<T>(
608 ck::type_convert<float>(fn(dis_(g_))),
609 ck::type_convert<float>(fn(dis_(g_))),
610 ck::type_convert<float>(fn(dis_(g_))),
611 ck::type_convert<float>(fn(dis_(g_))),
612 ck::type_convert<float>(fn(dis_(g_))),
613 ck::type_convert<float>(fn(dis_(g_))),
614 ck::type_convert<float>(fn(dis_(g_))),
615 ck::type_convert<float>(fn(dis_(g_))),
616 ck::type_convert<float>(fn(dis_(g_))),
617 ck::type_convert<float>(fn(dis_(g_))),
618 ck::type_convert<float>(fn(dis_(g_))),
619 ck::type_convert<float>(fn(dis_(g_))),
620 ck::type_convert<float>(fn(dis_(g_))),
621 ck::type_convert<float>(fn(dis_(g_))),
622 ck::type_convert<float>(fn(dis_(g_)))});
625 static_assert(
false,
"Unsupported packed size for T");
628 std::size_t ib = ib_begin;
629 for(; ib < ib_end - 1; ++ib)
631 constexpr
size_t iw = iw_.value;
632 dst[ib * BLOCK_SIZE + iw] = t_fn();
634 for(std::size_t iw = 0; iw < BLOCK_SIZE; ++iw)
635 if(ib * BLOCK_SIZE + iw < element_space_size)
636 dst[ib * BLOCK_SIZE + iw] = t_fn();
640 threads.emplace_back(std::move(job));
644 for(
auto& t : threads)
648 template <
typename... Is>
654 template <
typename... Is>
658 ck::packed_size_v<ck::remove_cvref_t<T>>];
661 template <
typename... Is>
665 ck::packed_size_v<ck::remove_cvref_t<T>>];
673 const T&
operator()(
const std::vector<std::size_t>& idx)
const
680 typename Data::iterator
end() {
return mData.end(); }
684 typename Data::const_iterator
begin()
const {
return mData.begin(); }
686 typename Data::const_iterator
end()
const {
return mData.end(); }
688 typename Data::const_pointer
data()
const {
return mData.data(); }
690 typename Data::size_type
size()
const {
return mData.size(); }
692 template <
typename U = T>
695 constexpr std::size_t FromSize =
sizeof(T);
696 constexpr std::size_t ToSize =
sizeof(U);
698 using Element = std::add_const_t<std::remove_reference_t<U>>;
702 template <
typename U = T>
705 constexpr std::size_t FromSize =
sizeof(T);
706 constexpr std::size_t ToSize =
sizeof(U);
708 using Element = std::remove_reference_t<U>;
__host__ constexpr __device__ auto rank([[maybe_unused]] const Layout< Shape, UnrolledDescriptorType > &layout)
Get layout rank (num elements in shape).
Definition: layout_utils.hpp:310
auto call_f_unpack_args_impl(F f, T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:73
std::ostream & LogRangeAsType(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:40
auto construct_f_unpack_args_impl(T args, std::index_sequence< Is... >)
Definition: host_tensor.hpp:87
auto call_f_unpack_args(F f, T args)
Definition: host_tensor.hpp:79
auto construct_f_unpack_args(F, T args)
Definition: host_tensor.hpp:93
auto make_ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:270
std::ostream & LogRange(std::ostream &os, Range &&range, std::string delim)
Definition: host_tensor.hpp:25
HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor &a, const New2Old &new2old)
Definition: host_tensor.hpp:183
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
auto transform(InputRange &&range, OutputIterator iter, UnaryOperation unary_op) -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op))
Definition: algorithm.hpp:36
iter_value_t< ranges::iterator_t< R > > range_value_t
Definition: ranges.hpp:28
typename vector_type< float, 16 >::type float16_t
Definition: dtype_vector.hpp:2134
unsigned int get_available_cpu_cores()
Definition: thread.hpp:11
int64_t long_index_t
Definition: ck.hpp:299
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2131
__host__ constexpr __device__ Y type_convert(X x)
Definition: type_convert.hpp:98
constexpr bool is_same_v
Definition: type.hpp:283
constexpr bool is_packed_type_v
Definition: data_type.hpp:411
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
typename std::enable_if< B, T >::type enable_if_t
Definition: enable_if.hpp:27
__device__ void inner_product(const TA &a, const TB &b, TC &c)
typename vector_type< float, 32 >::type float32_t
Definition: dtype_vector.hpp:2135
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
const GenericPointer< typename T::ValueType > & pointer
Definition: pointer.h:1249
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition: pointer.h:1249
Definition: host_tensor.hpp:101
HostTensorDescriptor(const Lengths &lens)
Definition: host_tensor.hpp:122
const std::vector< std::size_t > & GetStrides() const
HostTensorDescriptor(const std::initializer_list< X > &lens)
Definition: host_tensor.hpp:107
std::size_t GetElementSize() const
const std::vector< std::size_t > & GetLengths() const
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:163
HostTensorDescriptor(const std::initializer_list< X > &lens, const std::initializer_list< Y > &strides)
Definition: host_tensor.hpp:131
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens, const std::initializer_list< ck::long_index_t > &strides)
Definition: host_tensor.hpp:137
std::size_t GetNumOfDimension() const
std::size_t GetElementSpaceSize() const
HostTensorDescriptor()=default
HostTensorDescriptor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:150
std::size_t GetOffsetFromMultiIndex(const std::vector< std::size_t > &iss) const
Definition: host_tensor.hpp:170
friend std::ostream & operator<<(std::ostream &os, const HostTensorDescriptor &desc)
HostTensorDescriptor(const std::initializer_list< ck::long_index_t > &lens)
Definition: host_tensor.hpp:112
Definition: host_tensor.hpp:217
std::array< std::size_t, NDIM > GetNdIndices(std::size_t i) const
Definition: host_tensor.hpp:234
F mF
Definition: host_tensor.hpp:218
std::size_t mN1d
Definition: host_tensor.hpp:222
ParallelTensorFunctor(F f, Xs... xs)
Definition: host_tensor.hpp:224
std::array< std::size_t, NDIM > mLens
Definition: host_tensor.hpp:220
std::array< std::size_t, NDIM > mStrides
Definition: host_tensor.hpp:221
void operator()(std::size_t num_thread=1) const
Definition: host_tensor.hpp:247
static constexpr std::size_t NDIM
Definition: host_tensor.hpp:219
Tensor wrapper that performs static and dynamic buffer logic. The tensor is based on a descriptor sto...
Definition: host_tensor.hpp:277
auto AsSpan() const
Definition: host_tensor.hpp:693
Tensor(const Lengths &lens, const Strides &strides)
Definition: host_tensor.hpp:298
std::size_t GetNumOfDimension() const
Definition: host_tensor.hpp:359
T & operator()(const std::vector< std::size_t > &idx)
Definition: host_tensor.hpp:668
void ForEach(const F &&f) const
Definition: host_tensor.hpp:419
decltype(auto) GetLengths() const
Definition: host_tensor.hpp:355
Data::const_iterator end() const
Definition: host_tensor.hpp:686
std::size_t GetOffsetFromMultiIndex(Is... is) const
Definition: host_tensor.hpp:649
Tensor< OutT > CopyAsType() const
Definition: host_tensor.hpp:306
const T & operator()(const std::vector< std::size_t > &idx) const
Definition: host_tensor.hpp:673
void ForEach(F &&f)
Definition: host_tensor.hpp:396
Data::pointer data()
Definition: host_tensor.hpp:682
void ForEach_impl(F &&f, std::vector< size_t > &idx, size_t rank)
Definition: host_tensor.hpp:380
std::size_t GetElementSpaceSizeInBytes() const
Definition: host_tensor.hpp:375
void ForEach_impl(const F &&f, std::vector< size_t > &idx, size_t rank) const
Definition: host_tensor.hpp:403
Tensor & operator=(const Tensor &)=default
std::vector< T > Data
Definition: host_tensor.hpp:279
Data mData
Definition: host_tensor.hpp:713
Data::iterator end()
Definition: host_tensor.hpp:680
void GenerateTensorDistr(Distribution dis={0.f, 1.f}, Mapping fn={}, const Generator g=Generator(0), std::size_t num_thread=-1)
Definition: host_tensor.hpp:522
std::size_t GetElementSize() const
Definition: host_tensor.hpp:361
void SetZero()
Definition: host_tensor.hpp:377
Tensor(const Lengths &lens)
Definition: host_tensor.hpp:293
void savetxt(std::string file_name, std::string dtype="float")
Definition: host_tensor.hpp:329
Tensor(Tensor &&)=default
const T & operator()(Is... is) const
Definition: host_tensor.hpp:662
Data::const_pointer data() const
Definition: host_tensor.hpp:688
auto AsSpan()
Definition: host_tensor.hpp:703
Data::iterator begin()
Definition: host_tensor.hpp:678
Tensor(std::initializer_list< X > lens, std::initializer_list< Y > strides)
Definition: host_tensor.hpp:287
Tensor(const Tensor &)=default
Tensor(const Descriptor &desc)
Definition: host_tensor.hpp:303
Descriptor mDesc
Definition: host_tensor.hpp:712
Tensor & operator=(Tensor &&)=default
Data::const_iterator begin() const
Definition: host_tensor.hpp:684
std::size_t GetElementSpaceSize() const
Definition: host_tensor.hpp:363
Tensor(const Tensor< FromT > &other)
Definition: host_tensor.hpp:326
Data::size_type size() const
Definition: host_tensor.hpp:690
void GenerateTensorValue(G g, std::size_t num_thread=1)
Definition: host_tensor.hpp:426
decltype(auto) GetStrides() const
Definition: host_tensor.hpp:357
T & operator()(Is... is)
Definition: host_tensor.hpp:655
Tensor(std::initializer_list< X > lens)
Definition: host_tensor.hpp:282
Definition: integral_constant.hpp:20
Definition: functional2.hpp:33
Definition: dtype_vector.hpp:10
Definition: host_tensor.hpp:199
joinable_thread(joinable_thread &&)=default
joinable_thread(Xs &&... xs)
Definition: host_tensor.hpp:201
~joinable_thread()
Definition: host_tensor.hpp:208
joinable_thread & operator=(joinable_thread &&)=default