Composable Kernel: include/ck/library/utility/host_tensor.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <algorithm>
#include <array>
#include <cassert>
#include <fstream>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "ck/utility/data_type.hpp"
#include "ck/utility/span.hpp"

template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << v;
    }
    return os;
}
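
// Illustrative usage (not part of the original header): LogRange streams any
// iterable with a delimiter and returns the stream, so calls can be chained.
//
//     std::vector<int> v{1, 2, 3};
//     LogRange(std::cout, v, ", ") << std::endl; // prints "1, 2, 3"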

template <typename T, typename Range>
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;

        using RangeType = ck::remove_cvref_t<decltype(v)>;
        if constexpr(std::is_same_v<RangeType, ck::f8_t> || std::is_same_v<RangeType, ck::bf8_t> ||
                     std::is_same_v<RangeType, ck::bhalf_t>)
        {
            os << ck::type_convert<float>(v);
        }
        else if constexpr(std::is_same_v<RangeType, ck::pk_i4_t> ||
                          std::is_same_v<RangeType, ck::f4x2_pk_t>)
        {
            const auto packed_floats = ck::type_convert<ck::float2_t>(v);
            const ck::vector_type<float, 2> vector_of_floats{packed_floats};
            os << vector_of_floats.template AsType<float>()[ck::Number<0>{}] << delim
               << vector_of_floats.template AsType<float>()[ck::Number<1>{}];
        }
        else
        {
            os << static_cast<T>(v);
        }
    }
    return os;
}

template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}

template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}

template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};

    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
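
// Illustrative usage (not part of the original header): both helpers expand a
// std::tuple into an argument list, either calling a callable or constructing
// an object of the given type.
//
//     auto sum = call_f_unpack_args([](int a, int b) { return a + b; },
//                                   std::make_tuple(1, 2));      // sum == 3
//     auto p   = construct_f_unpack_args(std::pair<int, int>{},
//                                        std::make_tuple(1, 2)); // p == {1, 2}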

// A descriptor class for host tensors that manages tensor dimensions, strides, and layout.
struct HostTensorDescriptor
{
    using BaseTensorLayout = ck::tensor_layout::BaseTensorLayout;
    using DefaultLayout    = BaseTensorLayout;

    // Runtime tag describing which layout is picked when layout is not specified explicitly at
    // construction time.
    enum class ChosenLayout
    {
        Original,
        RowMajor,
        ColumnMajor,
    };

    // Master constructor
    template <typename Layout>
    HostTensorDescriptor(std::vector<std::size_t> lens,
                         std::vector<std::size_t> strides,
                         const Layout& layout = DefaultLayout())
        : mLens(std::move(lens)), mStrides(std::move(strides))
    {
        // To support legacy use cases when layout is not passed in
        const auto new_layout = HandleDefaultLayout(layout);
        if(dbg)
        {
            std::cout << "Original Lens: [";
            LogRange(std::cout, mLens, ", ") << "] and Strides: [";
            LogRange(std::cout, mStrides, ", ") << "]" << std::endl;
            std::cout << "Layout: " << layout << " --> " << new_layout << std::endl;
        }

        // Handle the strides and validation based on the chosen layout
        DispatchChosenLayout(new_layout, layout, [&](auto selected_layout) {
            this->CalculateStrides(selected_layout);
            this->ValidateStrides(selected_layout);
        });
    }

    HostTensorDescriptor() = default;

    // Helper that invokes a callable with a concrete layout object whose type
    // matches the chosen tag (so template code depending on the layout type
    // can still leverage if constexpr branches).
    template <typename F, typename OrigLayout>
    void DispatchChosenLayout(ChosenLayout tag, const OrigLayout& orig, F&& f) const
    {
        switch(tag)
        {
        case ChosenLayout::RowMajor: f(ck::tensor_layout::gemm::RowMajor{}); break;
        case ChosenLayout::ColumnMajor: f(ck::tensor_layout::gemm::ColumnMajor{}); break;
        default: f(orig); break;
        }
    }

    template <typename Layout>
    ChosenLayout HandleDefaultLayout(const Layout&)
    {
        if constexpr(!std::is_same_v<Layout, DefaultLayout>)
        {
            return ChosenLayout::Original;
        }
        else
        {
            if(mStrides.empty())
            {
                // No strides provided -> assume RowMajor
                return ChosenLayout::RowMajor;
            }

            const auto rank = mLens.size();

            if(rank > 2)
            {
                // Keep as-is - validation will warn/throw later
                return ChosenLayout::Original;
            }

            if(rank == 0)
            {
                // Keep as-is - validation will warn/throw later
                return ChosenLayout::Original;
            }

            if(rank == 1)
            {
                // Treat 1D tensor as RowMajor
                return ChosenLayout::RowMajor;
            }

            // rank == 2
            if(mStrides.size() == 2)
            {
                // RowMajor pattern (?, 1)
                if(mStrides[1] == 1)
                {
                    return ChosenLayout::RowMajor;
                }

                // ColumnMajor pattern (1, ?)
                if(mStrides[0] == 1)
                {
                    return ChosenLayout::ColumnMajor;
                }
            }

            // Fallback: leave as-is
            return ChosenLayout::Original;
        }
    }

    template <typename Layout>
    void CalculateStrides(const Layout& layout)
    {
        if constexpr(std::is_same_v<Layout, ck::tensor_layout::BypassLayoutVerification>)
            return;
        // This is a workaround for when an original stride value of -1 (meaning "unknown")
        // has been passed in and cast to size_t (unsigned).
        auto strides_int = AsInt(mStrides);

        // Case of empty or all-zero strides: auto-calculate based on layout and tensor
        // dimensions
        if(mStrides.empty() || std::all_of(strides_int.begin(), strides_int.end(), [](int stride) {
               return stride <= 0;
           }))
        {

            if constexpr(!(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                           std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>))
            {
                std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty "
                             "strides, got "
                          << layout << ". Will calculate strides as RowMajor." << std::endl;
            }

            mStrides.clear();
            mStrides.resize(mLens.size(), 0);
            if(mStrides.empty())
                return;

            mStrides.back() = 1;
            std::partial_sum(mLens.rbegin(),
                             mLens.rend() - 1,
                             mStrides.rbegin() + 1,
                             std::multiplies<std::size_t>());

            if constexpr(std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
            {
                // swap the last two strides
                if(mStrides.size() >= 2)
                    std::swap(mStrides[mStrides.size() - 1], mStrides[mStrides.size() - 2]);
            }
        }
        // The other case is when one of the strides is unknown.
        // Currently, only GEMM RowMajor and ColumnMajor layouts are supported, and only in the
        // lower two dimensions, e.g. {..., 0, N} or {..., M, 0}. The higher dimensions are left
        // untouched.
        else if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                          std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
        {
            auto rank = mStrides.size();
            if(mLens.size() >= 2 && rank >= 2)
            {
                const auto inner_idx =
                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;
                if(mStrides[inner_idx] <= 0)
                {
                    mStrides[inner_idx] = 1;
                }
                if(mStrides[outer_idx] <= 0)
                {
                    mStrides[outer_idx] = mLens[inner_idx] * mStrides[inner_idx];
                }
            }
        }
    }
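
    // Illustrative example (not part of the original header): with RowMajor and
    // all-zero strides, the partial_sum above derives packed strides from the
    // lengths, innermost first.
    //
    //     HostTensorDescriptor d({2, 3, 4}, {0, 0, 0}, ck::tensor_layout::gemm::RowMajor{});
    //     // d.GetStrides() == {12, 4, 1}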

    template <typename Layout>
    void ValidateStrides(const Layout& layout) const
    {
        if constexpr(std::is_same_v<ck::tensor_layout::BypassLayoutVerification, Layout>)
        {
            return;
        }

        if(mLens.empty())
        {
            throw std::runtime_error(
                "HostTensorDescriptor::ValidateStrides: empty tensor dimensions are not allowed.");
        }

        const int rank = mLens.size();
        if(rank == 1) // skip any 1D tensors
        {
            return;
        }

        if constexpr(std::is_same_v<ck::tensor_layout::BaseTensorLayout, Layout>)
        {
            // Any legacy code that doesn't pass a layout to the HostTensorDescriptor ctor will
            // hit this case (unless it is a special case - see `HandleDefaultLayout`).
            throw std::runtime_error("HostTensorDescriptor::ValidateStrides: Abstract tensor "
                                     "layout BaseTensorLayout can't be verified. Please "
                                     "pass a specific tensor layout to HostTensorDescriptor (or "
                                     "ck::tensor_layout::BypassLayoutVerification)");
        }

        // GEMM cases
        if constexpr(std::is_base_of_v<ck::tensor_layout::gemm::BaseGemmLayout, Layout>)
        {
            if(mLens.size() != mStrides.size())
            {
                std::ostringstream oss;
                oss << "HostTensorDescriptor::ValidateStrides: mismatch between tensor rank and "
                       "size of strides: "
                    << *this;
                throw std::runtime_error(oss.str());
            }

            // In GEMM, strides must be all positive or all zeros (auto-derived from tensor
            // dimensions)
            auto strides_int = AsInt(mStrides);
            if(std::any_of(
                   strides_int.begin(), strides_int.end(), [](int stride) { return stride <= 0; }))
            {
                std::ostringstream oss;
                oss << "Stride values must be positive or all-zeros (auto-derived from tensor "
                       "dimensions). Instead got ";
                std::copy(
                    strides_int.begin(), strides_int.end(), std::ostream_iterator<int>(oss, " "));
                throw std::runtime_error(oss.str());
            }

            if constexpr(std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ||
                         std::is_same_v<ck::tensor_layout::gemm::ColumnMajor, Layout>)
            {
                // The logic here assumes that a GEMM tensor with more than 2 dims always has
                // the HW dimensions as the inner ones, e.g. batched GEMM is either BHW or BWH
                const auto inner_idx =
                    std::is_same_v<ck::tensor_layout::gemm::RowMajor, Layout> ? rank - 1 : rank - 2;
                const auto outer_idx = inner_idx == rank - 1 ? rank - 2 : rank - 1;

                if(mStrides[outer_idx] < mLens[inner_idx] * mStrides[inner_idx])
                {
                    std::ostringstream oss;
                    oss << "Invalid strides for " << layout << ": " << *this;
                    throw std::runtime_error(oss.str());
                }

                // For higher dimensions, validate strides assuming RowMajor
                for(int i = 1; i < rank - 2; ++i)
                {
                    if(mStrides[i - 1] < mStrides[i] * mLens[i])
                    {
                        std::ostringstream oss;
                        oss << "Invalid strides for higher dimensions in " << layout << ": "
                            << *this;
                        throw std::runtime_error(oss.str());
                    }
                }
            }
            else
            {
                std::ostringstream oss;
                oss << "Error: Unsupported GEMM layout: " << layout;
                throw std::runtime_error(oss.str());
            }
        }
        // Convolution cases
        // (the exact convolution base-layout type on the next line was lost in extraction;
        // BaseConvLayout is a reconstruction by analogy with gemm::BaseGemmLayout)
        else if constexpr(std::is_base_of_v<ck::tensor_layout::convolution::BaseConvLayout,
                                            Layout>)
        {
            // TBD: implement verification for Conv layouts
            // For now, just print a warning and return
            std::cerr << "Warning: Tensor layout verification for ck::tensor_layout::convolution "
                         "layouts is not supported yet. Skipping..."
                      << std::endl;
            return;
        }
        else
        {
            std::ostringstream oss;
            oss << "Error: Tensor layout verification for " << layout << " is not supported yet.";
            throw std::runtime_error(oss.str());
        }
    }

    template <typename X,
              typename Layout = DefaultLayout,
              typename        = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens, const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    template <typename Layout = DefaultLayout,
              typename        = std::enable_if_t<std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    template <typename Lengths,
              typename Layout = DefaultLayout,
              typename        = std::enable_if_t<
                  (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> ||
                   std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t>) &&
                  std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const Lengths& lens, const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()), {}, layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    template <typename X,
              typename Y,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Y, std::size_t>>,
              typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // HostTensorDescriptor({row, col}, {row_stride, col_stride})
    template <typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<ck::long_index_t>& lens,
                         const std::initializer_list<ck::long_index_t>& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    // HostTensorDescriptor({row, col}, strides)
    template <typename Strides, typename Layout = DefaultLayout>
    HostTensorDescriptor(const std::initializer_list<std::size_t>& lens,
                         const Strides& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    template <typename Lengths,
              typename Strides,
              typename Layout = DefaultLayout,
              typename        = std::enable_if_t<
                  ((std::is_convertible_v<ck::ranges::range_value_t<Lengths>, std::size_t> &&
                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, std::size_t>) ||
                   (std::is_convertible_v<ck::ranges::range_value_t<Lengths>, ck::long_index_t> &&
                    std::is_convertible_v<ck::ranges::range_value_t<Strides>, ck::long_index_t>)) &&
                  std::is_convertible_v<Layout, BaseTensorLayout>>>
    HostTensorDescriptor(const Lengths& lens,
                         const Strides& strides,
                         const Layout& layout = Layout{})
        : HostTensorDescriptor(std::vector<std::size_t>(lens.begin(), lens.end()),
                               std::vector<std::size_t>(strides.begin(), strides.end()),
                               layout)
    {
        if(dbg)
            std::cout << "HostTensorDescriptor ctor (" << __LINE__ << ")" << std::endl;
    }

    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpaceSize() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
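
    // Illustrative example (not part of the original header): the offset is the
    // inner product of the multi-index with the strides.
    //
    //     HostTensorDescriptor d({2, 3}, {3, 1}); // 2x3 row-major
    //     d.GetOffsetFromMultiIndex(1, 2);        // 1*3 + 2*1 == 5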

    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
    friend std::ostream& operator<<(std::ostream& os, ChosenLayout tag);

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
    static constexpr bool dbg = false;

    // Reinterpret the unsigned strides as ints so that "unknown" strides that were
    // passed in as -1 and wrapped around to a huge size_t show up as negative values.
    std::vector<int> AsInt(const std::vector<size_t>& vec) const
    {
        std::vector<int> strides_int(vec.size());
        std::transform(vec.begin(), vec.end(), strides_int.begin(), [](std::size_t stride) {
            return static_cast<int>(stride);
        });
        return strides_int;
    }
};
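
// Illustrative usage (not part of the original header): descriptors can be
// built from lengths alone (packed row-major strides are derived), or from
// lengths plus strides with an explicit layout for validation.
//
//     HostTensorDescriptor a({4, 8});                   // strides {8, 1}
//     HostTensorDescriptor b({4, 8}, {8, 1},
//                            ck::tensor_layout::gemm::RowMajor{});
//     HostTensorDescriptor c({4, 8}, {1, 4},
//                            ck::tensor_layout::gemm::ColumnMajor{});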

template <typename New2Old, typename NewLayout = HostTensorDescriptor::BaseTensorLayout>
HostTensorDescriptor
transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
                                               const New2Old& new2old,
                                               const NewLayout& new_layout = NewLayout())
{
    std::vector<std::size_t> new_lengths(a.GetNumOfDimension());
    std::vector<std::size_t> new_strides(a.GetNumOfDimension());

    for(std::size_t i = 0; i < a.GetNumOfDimension(); i++)
    {
        new_lengths[i] = a.GetLengths()[new2old[i]];
        new_strides[i] = a.GetStrides()[new2old[i]];
    }

    return HostTensorDescriptor(new_lengths, new_strides, new_layout);
}
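
// Illustrative usage (not part of the original header): new2old[i] names the
// old dimension that becomes new dimension i, so lengths and strides are
// permuted together and the underlying data is untouched.
//
//     HostTensorDescriptor a({2, 3}, {3, 1}); // 2x3 row-major
//     auto t = transpose_host_tensor_descriptor_given_new2old(
//         a, std::vector<std::size_t>{1, 0});
//     // t.GetLengths() == {3, 2}, t.GetStrides() == {1, 3}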

struct joinable_thread : std::thread
{
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides;
    std::size_t mN1d;

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }
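
    // Illustrative example (not part of the original header): with lengths {2, 3}
    // the packed strides are {3, 1}, so the flat index 4 decomposes as
    // GetNdIndices(4) == {1, 1} (4 / 3 == 1, remainder 1, then 1 / 1 == 1).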

    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [=, *this] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};

template <typename F, typename... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
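
// Illustrative usage (not part of the original header): iterate a 2D index
// space on 4 threads; each joinable_thread joins when the vector of threads
// goes out of scope at the end of operator().
//
//     std::vector<float> out(6, 0.f);
//     make_ParallelTensorFunctor(
//         [&](std::size_t i, std::size_t j) { out[i * 3 + j] = float(i + j); },
//         2, 3)(4);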

// Tensor wrapper that performs static and dynamic buffer logic.
template <typename T>
struct Tensor
{
    using Descriptor = HostTensorDescriptor;
    using Data       = std::vector<T>;

    template <typename X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(GetElementSpaceSize())
    {
    }

    template <typename X, typename Y>
    Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
        : mDesc(lens, strides), mData(GetElementSpaceSize())
    {
    }

    template <typename Lengths>
    Tensor(const Lengths& lens) : mDesc(lens), mData(GetElementSpaceSize())
    {
    }

    template <typename Lengths, typename Strides>
    Tensor(const Lengths& lens, const Strides& strides)
        : mDesc(lens, strides), mData(GetElementSpaceSize())
    {
    }

    template <typename X, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(std::initializer_list<X> lens, Rest&&... rest)
        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    template <typename X,
              typename Y,
              typename... Rest,
              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(std::initializer_list<X> lens, std::initializer_list<Y> strides, Rest&&... rest)
        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    template <typename Lengths, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(const Lengths& lens, Rest&&... rest)
        : mDesc(lens, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    template <typename Lengths,
              typename Strides,
              typename... Rest,
              std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
    Tensor(const Lengths& lens, const Strides& strides, Rest&&... rest)
        : mDesc(lens, strides, std::forward<Rest>(rest)...), mData(GetElementSpaceSize())
    {
    }

    Tensor(const Descriptor& desc) : mDesc(desc), mData(GetElementSpaceSize()) {}

    template <typename OutT>
    Tensor<OutT> CopyAsType() const
    {
        Tensor<OutT> ret(mDesc);

        ck::ranges::transform(
            mData, ret.mData.begin(), [](auto value) { return ck::type_convert<OutT>(value); });

        return ret;
    }

    Tensor() = delete;
    Tensor(const Tensor&) = default;
    Tensor(Tensor&&) = default;

    ~Tensor() = default;

    Tensor& operator=(const Tensor&) = default;
    Tensor& operator=(Tensor&&) = default;

    template <typename FromT>
    explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
    {
    }

    void savetxt(std::string file_name, std::string dtype = "float")
    {
        std::ofstream file(file_name);

        if(file.is_open())
        {
            for(auto& itm : mData)
            {
                if(dtype == "float")
                    file << ck::type_convert<float>(itm) << std::endl;
                else if(dtype == "int")
                    file << ck::type_convert<int>(itm) << std::endl;
                else
                    // TODO: operator<< is not implemented for all custom data
                    // types, so fall back to float to avoid a compile error
                    file << ck::type_convert<float>(itm) << std::endl;
            }
            file.close();
        }
        else
        {
            // Throw if the file cannot be opened.
            throw std::runtime_error(std::string("unable to open file:") + file_name);
        }
    }

    decltype(auto) GetLengths() const { return mDesc.GetLengths(); }

    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }

    std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }

    std::size_t GetElementSize() const { return mDesc.GetElementSize(); }

    std::size_t GetElementSpaceSize() const
    {
        if constexpr(ck::is_packed_type_v<ck::remove_cvref_t<T>>)
        {
            return (mDesc.GetElementSpaceSize() + 1) / ck::packed_size_v<ck::remove_cvref_t<T>>;
        }
        else
        {
            return mDesc.GetElementSpaceSize();
        }
    }

    std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }

    void SetZero() { ck::ranges::fill<T>(mData, T{0}); }
    template <typename F>
    void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
    {
        if(rank == mDesc.GetNumOfDimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<F>(f), idx, rank + 1);
        }
    }

    template <typename F>
    void ForEach(F&& f)
    {
        std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
        ForEach_impl(std::forward<F>(f), idx, size_t(0));
    }

    template <typename F>
    void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
    {
        if(rank == mDesc.GetNumOfDimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<const F>(f), idx, rank + 1);
        }
    }

    template <typename F>
    void ForEach(const F&& f) const
    {
        std::vector<size_t> idx(mDesc.GetNumOfDimension(), 0);
        ForEach_impl(std::forward<const F>(f), idx, size_t(0));
    }
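
    // Illustrative usage (not part of the original header): ForEach visits every
    // multi-index in row-major order and passes the tensor plus the index vector
    // to the callback.
    //
    //     Tensor<float> t({2, 3});
    //     t.ForEach([](auto& self, const std::vector<std::size_t>& idx) {
    //         self(idx) = static_cast<float>(idx[0] * 3 + idx[1]);
    //     });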

    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
        case 1: {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
            break;
        }
        case 2: {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
            break;
        }
        case 3: {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(
                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
            break;
        }
        case 4: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3])(num_thread);
            break;
        }
        case 5: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
                (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3],
                                       mDesc.GetLengths()[4])(num_thread);
            break;
        }
        case 6: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
                (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3],
                                       mDesc.GetLengths()[4],
                                       mDesc.GetLengths()[5])(num_thread);
            break;
        }
        case 12: {
            auto f = [&](auto i0,
                         auto i1,
                         auto i2,
                         auto i3,
                         auto i4,
                         auto i5,
                         auto i6,
                         auto i7,
                         auto i8,
                         auto i9,
                         auto i10,
                         auto i11) {
                (*this)(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) =
                    g(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3],
                                       mDesc.GetLengths()[4],
                                       mDesc.GetLengths()[5],
                                       mDesc.GetLengths()[6],
                                       mDesc.GetLengths()[7],
                                       mDesc.GetLengths()[8],
                                       mDesc.GetLengths()[9],
                                       mDesc.GetLengths()[10],
                                       mDesc.GetLengths()[11])(num_thread);
            break;
        }
        default: throw std::runtime_error("unsupported dimension");
        }
    }
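
    // Illustrative usage (not part of the original header): fill a 2D tensor from
    // a generator that maps the multi-index to a value, using 2 threads.
    //
    //     Tensor<float> t({2, 3});
    //     t.GenerateTensorValue([](auto i0, auto i1) { return float(i0 * 3 + i1); }, 2);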

    // Generate random values with multiple threads. Guaranteed to give the same sequence with
    // any number of threads provided.
    template <typename Distribution = std::uniform_real_distribution<float>,
              typename Mapping = ck::identity,
              typename Generator = std::minstd_rand>
    void GenerateTensorDistr(Distribution dis = {0.f, 1.f},
                             Mapping fn = {},
                             const Generator g = Generator(0), // default seed 0
                             std::size_t num_thread = -1)
    {
        using ck::integer_divide_ceil;
        using ck::math::min;
        if(num_thread == -1ULL)
            num_thread = min(ck::get_available_cpu_cores(), 80U); // max 80 threads
        // At least 2MB per thread
        num_thread = min(num_thread, integer_divide_ceil(this->GetElementSpaceSize(), 0x200000));
        constexpr std::size_t BLOCK_BYTES = 64;
        constexpr std::size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T);

        const std::size_t num_blocks = integer_divide_ceil(this->GetElementSpaceSize(), BLOCK_SIZE);
        const std::size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);

        std::vector<std::thread> threads;
        threads.reserve(num_thread - 1);
        const auto dst = const_cast<T*>(this->mData.data());
        const auto element_space_size = this->GetElementSpaceSize();
        for(int it = num_thread - 1; it >= 0; --it)
        {
            std::size_t ib_begin = it * blocks_per_thread;
            std::size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);

            auto job = [=]() {
                auto g_ = g;     // copy
                auto dis_ = dis; // copy
                g_.discard(ib_begin * BLOCK_SIZE * ck::packed_size_v<T>);
                auto t_fn = [&]() {
                    // As the user can pass an integer distribution in dis, we must ensure that
                    // the correct constructor/converter is called at all times. For f4/f6/f8
                    // types, to ensure correct results, we convert from float to the target
                    // type. In these cases integer constructors are interpreted as direct
                    // initialization of the internal storage with binary values instead of
                    // treating integers as a subset of floats.
                    if constexpr(ck::is_same_v<T, ck::f8_t> || ck::is_same_v<T, ck::bf8_t>)
                        return ck::type_convert<T>(static_cast<float>(fn(dis_(g_))));
                    else if constexpr(ck::packed_size_v<T> == 1)
                        return ck::type_convert<T>(fn(dis_(g_)));
                    else if constexpr(ck::is_same_v<T, ck::f4x2_pk_t>)
                        return ck::f4x2_pk_t{ck::type_convert<ck::f4x2_t>(
                            ck::float2_t{ck::type_convert<float>(fn(dis_(g_))),
                                         ck::type_convert<float>(fn(dis_(g_)))})};
                    else if constexpr(ck::is_same_v<T, ck::f6x32_pk_t> ||
                                      ck::is_same_v<T, ck::bf6x32_pk_t>)
                    {
                        return ck::type_convert<T>(
                            ck::float32_t{ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_)))});
                    }
                    else if constexpr(ck::is_same_v<T, ck::f6x16_pk_t> ||
                                      ck::is_same_v<T, ck::bf6x16_pk_t>)
                    {
                        return ck::type_convert<T>(
                            ck::float16_t{ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_))),
                                          ck::type_convert<float>(fn(dis_(g_)))});
                    }
                    else
                        static_assert(false, "Unsupported packed size for T");
                };

                std::size_t ib = ib_begin;
                for(; ib < ib_end - 1; ++ib)
                    ck::static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
                        constexpr size_t iw = iw_.value;
                        dst[ib * BLOCK_SIZE + iw] = t_fn();
                    });
                for(std::size_t iw = 0; iw < BLOCK_SIZE; ++iw)
                    if(ib * BLOCK_SIZE + iw < element_space_size)
                        dst[ib * BLOCK_SIZE + iw] = t_fn();
            };

            if(it > 0)
                threads.emplace_back(std::move(job));
            else
                job(); // last job runs in the main thread
        }
        for(auto& t : threads)
            t.join();
    }
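
    // Illustrative usage (not part of the original header): deterministic random
    // fill; the same seed yields the same contents regardless of thread count,
    // since each thread discards ahead in its own copy of the generator.
    //
    //     Tensor<float> t({64, 64});
    //     t.GenerateTensorDistr(std::uniform_real_distribution<float>{-1.f, 1.f});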

    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        return mDesc.GetOffsetFromMultiIndex(is...) / ck::packed_size_v<ck::remove_cvref_t<T>>;
    }

    template <typename... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...) /
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    T& operator()(const std::vector<std::size_t>& idx)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

    typename Data::iterator begin() { return mData.begin(); }

    typename Data::iterator end() { return mData.end(); }

    typename Data::pointer data() { return mData.data(); }

    typename Data::const_iterator begin() const { return mData.begin(); }

    typename Data::const_iterator end() const { return mData.end(); }

    typename Data::const_pointer data() const { return mData.data(); }

    typename Data::size_type size() const { return mData.size(); }

    template <typename U = T>
    auto AsSpan() const
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::add_const_t<std::remove_reference_t<U>>;
        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
    }

    template <typename U = T>
    auto AsSpan()
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize = sizeof(U);

        using Element = std::remove_reference_t<U>;
        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
    }

    Descriptor mDesc;
    Data mData;
};
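
// Illustrative usage (not part of the original header): element access goes
// through the descriptor's stride arithmetic, and AsSpan reinterprets the
// underlying storage at a different element granularity.
//
//     Tensor<float> t({2, 3});
//     t(1, 2) = 5.f;                   // strided element access
//     auto s = t.AsSpan<const char>(); // byte view, s.size() == 24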