Composable Kernel: include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp Source File
gridwise_gemm_xdlops_v2r3.hpp
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
17 
18 namespace ck {
19 
20 template <typename GridwiseGemm,
21  typename FloatAB,
22  typename FloatC,
23  typename AGridDesc_K0_M_K1,
24  typename BGridDesc_K0_N_K1,
25  typename CGridDesc_M_N,
26  bool HasMainKBlockLoop>
27 __global__ void
28 #if CK_USE_LAUNCH_BOUNDS
29  __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
30 #endif
31 #if CK_USE_WAVES_PER_EU
32  __attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
33 #endif
34  kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
35  const FloatAB* __restrict__ p_b_grid,
36  FloatC* __restrict__ p_c_grid,
37  const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
38  const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
39  const CGridDesc_M_N c_grid_desc_m_n)
40 {
41 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
42  defined(__gfx94__))
43  __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
44 
45  GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
46  p_b_grid,
47  p_c_grid,
48  p_shared,
49  a_grid_desc_k0_m_k1,
50  b_grid_desc_k0_n_k1,
51  c_grid_desc_m_n);
52 #else
53  ignore = p_a_grid;
54  ignore = p_b_grid;
55  ignore = p_c_grid;
56  ignore = a_grid_desc_k0_m_k1;
57  ignore = b_grid_desc_k0_n_k1;
58  ignore = c_grid_desc_m_n;
59 #endif // end of if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
60 }
61 
62 template <typename GridwiseGemm, bool HasMainKBlockLoop>
63 __global__ void
64 #if CK_USE_LAUNCH_BOUNDS
65  __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
66 #endif
67 #if CK_USE_WAVES_PER_EU
68  __attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
69 #endif
70  kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
71 {
72 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
73  defined(__gfx94__))
74  __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
75 
76  const auto a_grid_desc_k0_m_k1 =
77  amd_wave_read_first_lane(GridwiseGemm::MakeAGridDescriptor_K0_M_K1(
78  karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA));
79  const auto b_grid_desc_k0_n_k1 =
80  amd_wave_read_first_lane(GridwiseGemm::MakeBGridDescriptor_K0_N_K1(
81  karg.K, karg.N, karg.NPadded, karg.K0, karg.StrideB));
82  const auto c_grid_desc_m_n = amd_wave_read_first_lane(GridwiseGemm::MakeCGridDescriptor_M_N(
83  karg.M, karg.MPadded, karg.N, karg.NPadded, karg.StrideC));
84 
85  GridwiseGemm::template Run<HasMainKBlockLoop>(karg.p_a_grid,
86  karg.p_b_grid,
87  karg.p_c_grid,
88  p_shared,
89  a_grid_desc_k0_m_k1,
90  b_grid_desc_k0_n_k1,
91  c_grid_desc_m_n);
92 #else
93  ignore = karg;
94 #endif // end of if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__))
95 }
96 
97 template <index_t BlockSize,
98  typename FloatAB,
99  typename FloatAcc,
100  typename FloatC,
101  InMemoryDataOperationEnum CGlobalMemoryDataOperation,
102  typename AElementwiseOperation,
103  typename BElementwiseOperation,
104  typename CElementwiseOperation,
105  index_t MPerBlock,
106  index_t NPerBlock,
107  index_t K0PerBlock,
108  index_t MPerXDL,
109  index_t NPerXDL,
110  index_t K1Value,
111  index_t MXdlPerWave,
112  index_t NXdlPerWave,
113  typename ABlockTransferThreadClusterLengths_K0_M_K1,
114  typename ABlockTransferThreadClusterArrangeOrder,
115  typename ABlockTransferSrcAccessOrder,
116  index_t ABlockTransferSrcVectorDim,
117  index_t ABlockTransferSrcScalarPerVector,
118  index_t ABlockTransferDstScalarPerVector_K1,
119  bool AThreadTransferSrcResetCoordinateAfterRun,
120  bool ABlockLdsExtraM,
121  typename BBlockTransferThreadClusterLengths_K0_N_K1,
122  typename BBlockTransferThreadClusterArrangeOrder,
123  typename BBlockTransferSrcAccessOrder,
124  index_t BBlockTransferSrcVectorDim,
125  index_t BBlockTransferSrcScalarPerVector,
126  index_t BBlockTransferDstScalarPerVector_K1,
127  bool BThreadTransferSrcResetCoordinateAfterRun,
128  bool BBlockLdsExtraN,
129  typename CThreadTransferSrcDstAccessOrder,
130  index_t CThreadTransferSrcDstVectorDim,
131  index_t CThreadTransferDstScalarPerVector,
132  index_t NumGemmKPrefetchStage = 1,
134  PipelineVersion PipelineVer = PipelineVersion::v1>
136 {
137  static constexpr auto I0 = Number<0>{};
138  static constexpr auto I1 = Number<1>{};
139  static constexpr auto I2 = Number<2>{};
140  static constexpr auto I3 = Number<3>{};
141  static constexpr auto I4 = Number<4>{};
142  static constexpr auto I5 = Number<5>{};
143  static constexpr auto I6 = Number<6>{};
144  static constexpr auto I7 = Number<7>{};
145 
146  // K1 should be Number<...>
147  static constexpr auto K1 = Number<K1Value>{};
148 
150 
151  __host__ static auto CalculateGridSize(index_t M, index_t N)
152  {
153  return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
154  }
155 
156  template <typename CGridDesc_M_N>
157  __host__ static auto CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n)
158  {
159  return std::make_tuple(Block2CTileMap::CalculateGridSize(c_grid_desc_m_n), 1, 1);
160  }
161 
162  template <typename>
163  __host__ static auto CalculateGridSize(index_t M, index_t N)
164  {
165  return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, 1);
166  }
167 
168  __host__ static auto CalculateMPadded(index_t M)
169  {
170  return math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
171  }
172 
173  __host__ static auto CalculateNPadded(index_t N)
174  {
175  return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
176  }
177 
178  __host__ static auto CalculateK0(index_t K) { return math::integer_divide_ceil(K, K1Value); }
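// Worked example (hypothetical tuning parameters and problem shape): with
// MPerBlock = 256, NPerBlock = 128, K1 = 8 and M = 1000, N = 512, K = 320:
//   CalculateMPadded(1000) = ceil(1000 / 256) * 256 = 1024
//   CalculateNPadded(512)  = ceil(512 / 128) * 128  = 512
//   CalculateK0(320)       = ceil(320 / 8)          = 40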
179 
180  // Problem
181  struct Problem
182  {
183  __host__ Problem(index_t M_,
184  index_t N_,
185  index_t K_,
186  index_t StrideA_,
187  index_t StrideB_,
188  index_t StrideC_)
189  : M{M_},
190  N{N_},
191  K{K_},
192  StrideA{StrideA_},
193  StrideB{StrideB_},
194  StrideC{StrideC_},
197  K0{CalculateK0(K_)}
198  {
199  }
200 
201  __host__ void Print() const
202  {
203  std::cout << "problem {"
204  << "M:" << M << ", "
205  << "N:" << N << ", "
206  << "K:" << K << ", "
207  << "SA:" << StrideA << ", "
208  << "SB:" << StrideB << ", "
209  << "SC:" << StrideC << ", "
210  << "MP:" << MPadded << ", "
211  << "NP:" << NPadded << ", "
212  << "K0:" << K0 << "}" << std::endl;
213  }
214 
224  };
225 
226  // Argument
228  {
229  __host__ Argument(const FloatAB* p_a_grid_,
230  const FloatAB* p_b_grid_,
231  FloatC* p_c_grid_,
232  index_t M_,
233  index_t N_,
234  index_t K_,
235  index_t StrideA_,
236  index_t StrideB_,
237  index_t StrideC_)
238  : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_},
239  p_a_grid{p_a_grid_},
240  p_b_grid{p_b_grid_},
241  p_c_grid{p_c_grid_}
242  {
243  }
244 
245  const FloatAB* p_a_grid;
246  const FloatAB* p_b_grid;
247  FloatC* p_c_grid;
248  };
249 
251  decltype(GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
252 
253  // denorm test fix, required to work around fp16 mfma issue
254  // we convert fp16->fp32->bf16 and execute bf16 mfma instruction
255  // when the mfma issue is fixed, remove this section and update
256  // FloatABAdjusted -> FloatAB throughout this file
257 #if CK_GFX90A_DENORM_WORKAROUND
259 #else
260  using FloatABAdjusted = FloatAB;
261 #endif
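// Minimal sketch of the workaround path selected above, assuming FloatAB is
// ck::half_t: each fp16 operand is widened to fp32 and then narrowed to bf16 so
// that the bf16 mfma can be used in place of the affected fp16 mfma, roughly
//   bhalf_t x_adj = type_convert<bhalf_t>(type_convert<float>(x));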
262 
263  __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
264  {
265  constexpr auto max_lds_align = K1;
266 
267  // A matrix in LDS memory, dst of blockwise copy
268  constexpr auto a_block_desc_k0_m_k1 = [&]() {
269  if constexpr(ABlockLdsExtraM)
270  {
274  }
275  else
276  {
278  make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
279  }
280  }();
281 
282  return a_block_desc_k0_m_k1;
283  }
284 
285  __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1()
286  {
287  constexpr auto max_lds_align = K1;
288 
289  // B matrix in LDS memory, dst of blockwise copy
290  constexpr auto b_block_desc_k0_n_k1 = [&]() {
291  if constexpr(BBlockLdsExtraN)
292  {
296  }
297  else
298  {
300  make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
301  }
302  }();
303 
304  return b_block_desc_k0_n_k1;
305  }
306 
307  __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
308  {
309  // LDS allocation for A and B: be careful of alignment
310  constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
311 
312  constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
313 
314  constexpr auto max_lds_align = K1;
315 
316  constexpr auto a_block_space_size_aligned =
317  math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
318 
319  constexpr auto b_block_space_size_aligned =
320  math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align);
321 
322  return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB);
323  }
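// Worked example (hypothetical parameters, no extra LDS padding): with
// K0PerBlock = 4, MPerBlock = 256, NPerBlock = 128, K1 = 8 and FloatAB = fp16
// (2 bytes), the A tile holds 4 * 256 * 8 = 8192 elements and the B tile
// 4 * 128 * 8 = 4096 elements; both are already multiples of K1, so this
// returns (8192 + 4096) * 2 = 24576 bytes of LDS.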
324 
325  template <typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N>
326  __host__ __device__ static constexpr bool
327  CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
328  const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
329  const CGridDesc_M_N& c_grid_desc_m_n)
330  {
331  static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
332  "wrong! K1 needs to be known at compile-time");
333 
334  static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
335  (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
336  "Invalid tuning param!");
337 
338  const auto M = a_grid_desc_k0_m_k1.GetLength(I1);
339  const auto N = b_grid_desc_k0_n_k1.GetLength(I1);
340  const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
341 
342  if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) &&
343  K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) &&
344  K1 == b_grid_desc_k0_n_k1.GetLength(I2)))
345  return false;
346 
347  if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0))
348  return false;
349 
350  // check gridwise gemm pipeline
351  const auto num_k_loop = K0 / K0PerBlock;
352 
353  if(!GridwiseGemmPipe::IsSupported(num_k_loop))
354  {
355  return false;
356  }
357 
358  // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
359  return true;
360  }
361 
362  __host__ static constexpr bool CheckValidity(const Problem& problem)
363  {
364  static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
365  "wrong! K1 needs to be known at compile-time");
366 
367  static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
368  (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
369  "Invalid tuning param!");
370 
371  // check gridwise gemm pipeline
372  const auto num_k_loop = math::integer_divide_ceil(problem.K0, K0PerBlock);
373  if(!GridwiseGemmPipe::IsSupported(num_k_loop))
374  {
375  return false;
376  }
377 
378  // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
379  return true;
380  }
381 
382  __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
383  {
384  const index_t num_loop = math::integer_divide_ceil(K, K0PerBlock * K1);
385 
386  return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
387  }
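// Worked example (hypothetical parameters): with K0PerBlock = 4 and K1 = 8,
// each pipeline iteration consumes 32 elements along K, so K = 320 yields
// num_loop = ceil(320 / 32) = 10; the selected pipeline then decides whether
// this count requires a main K loop.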
388 
389  template <typename CGridDesc>
390  __host__ __device__ static constexpr auto
391  MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc& c_grid_desc_m_n)
392  {
393  constexpr auto max_lds_align = K1;
394 
395  // A matrix in LDS memory, dst of blockwise copy
396  constexpr auto a_block_desc_k0_m_k1 = [&]() {
397  if constexpr(ABlockLdsExtraM)
398  {
402  }
403  else
404  {
406  make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
407  }
408  }();
409 
410  // B matrix in LDS memory, dst of blockwise copy
411  constexpr auto b_block_desc_k0_n_k1 = [&]() {
412  if constexpr(BBlockLdsExtraN)
413  {
417  }
418  else
419  {
421  make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
422  }
423  }();
424 
425  using BlockwiseGemm =
429  FloatAcc,
430  decltype(a_block_desc_k0_m_k1),
431  decltype(b_block_desc_k0_n_k1),
432  MPerXDL,
433  NPerXDL,
434  MXdlPerWave,
435  NXdlPerWave,
436  K1>;
437 
438  return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n);
439  }
440 
441  // block_id to C-matrix tile index (m0, n0) mapping
443 
444  template <bool HasMainKBlockLoop,
445  typename AGridDesc_K0_M_K1,
446  typename BGridDesc_K0_N_K1,
447  typename CGridDesc_M_N>
448  __device__ static void Run(const FloatAB* p_a_grid,
449  const FloatAB* p_b_grid,
450  FloatC* p_c_grid,
451  void* __restrict__ p_shared,
452  const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
453  const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
454  const CGridDesc_M_N& c_grid_desc_m_n)
455  {
456  const auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
458 
459  const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
460  p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
461  const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
462  p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
463  auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
464  p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());
465 
466  const AElementwiseOperation a_element_op{};
467  const BElementwiseOperation b_element_op{};
468  const CElementwiseOperation c_element_op{};
469 
470  const auto block_2_ctile_map =
471  Block2CTileMap{c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1)};
472 
473  // divide block work by [M, N]
474  const auto block_work_idx =
475  block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
476 
477  if(!block_2_ctile_map.ValidCTileIndex(
478  block_work_idx,
479  make_tuple(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0),
480  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1))))
481  {
482  return;
483  }
484 
485  // HACK: this forces m/n_block_data_idx_on_grid into SGPRs
486  const index_t m_block_data_idx_on_grid =
487  __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
488 
489  const index_t n_block_data_idx_on_grid =
490  __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
491 
492  // lds max alignment
493  constexpr auto max_lds_align = K1;
494 
495  // A matrix in LDS memory, dst of blockwise copy
496  constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
497 
498  // B matrix in LDS memory, dst of blockwise copy
499  constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
500 
501  // A matrix blockwise copy
502  auto a_blockwise_copy =
504  AElementwiseOperation,
508  ABlockTransferThreadClusterLengths_K0_M_K1,
509  ABlockTransferThreadClusterArrangeOrder,
510  FloatAB,
512  decltype(a_grid_desc_k0_m_k1),
513  decltype(a_block_desc_k0_m_k1),
514  ABlockTransferSrcAccessOrder,
516  ABlockTransferSrcVectorDim,
517  2,
518  ABlockTransferSrcScalarPerVector,
519  ABlockTransferDstScalarPerVector_K1,
520  1,
521  1,
522  AThreadTransferSrcResetCoordinateAfterRun,
523  true,
524  NumGemmKPrefetchStage>(
525  a_grid_desc_k0_m_k1,
526  make_multi_index(0, m_block_data_idx_on_grid, 0),
527  a_element_op,
528  a_block_desc_k0_m_k1,
529  make_multi_index(0, 0, 0),
531 
532  // B matrix blockwise copy
533  auto b_blockwise_copy =
535  BElementwiseOperation,
539  BBlockTransferThreadClusterLengths_K0_N_K1,
540  BBlockTransferThreadClusterArrangeOrder,
541  FloatAB,
543  decltype(b_grid_desc_k0_n_k1),
544  decltype(b_block_desc_k0_n_k1),
545  BBlockTransferSrcAccessOrder,
547  BBlockTransferSrcVectorDim,
548  2,
549  BBlockTransferSrcScalarPerVector,
550  BBlockTransferDstScalarPerVector_K1,
551  1,
552  1,
553  BThreadTransferSrcResetCoordinateAfterRun,
554  true,
555  NumGemmKPrefetchStage>(
556  b_grid_desc_k0_n_k1,
557  make_multi_index(0, n_block_data_idx_on_grid, 0),
558  b_element_op,
559  b_block_desc_k0_n_k1,
560  make_multi_index(0, 0, 0),
562 
563  // GEMM definition
564  // c_mtx += transpose(a_mtx) * b_mtx
565  // a_mtx[K0PerBlock, MPerBlock] is in LDS
566  // b_mtx[K0PerBlock, NPerBlock] is in LDS
567  // c_mtx[MPerBlock, NPerBlock] is distributed among threads and kept in
568  // registers
569  // sanity check
571  BlockSize,
574  FloatAcc,
575  decltype(a_block_desc_k0_m_k1),
576  decltype(b_block_desc_k0_n_k1),
577  MPerXDL,
578  NPerXDL,
579  MXdlPerWave,
580  NXdlPerWave,
581  K1,
582  LoopSched>();
583 
584  auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
585 
586  // LDS allocation for A and B: be careful of alignment
587  constexpr auto a_block_space_size_aligned =
588  math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
589 
590  auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
591  static_cast<FloatABAdjusted*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
592 
593  auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
594  static_cast<FloatABAdjusted*>(p_shared) + a_block_space_size_aligned,
595  b_block_desc_k0_n_k1.GetElementSpaceSize());
596 
597  constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
598  constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
599 
600  // gridwise GEMM pipeline
601  const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
602  const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
603 
604  GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
605  a_block_desc_k0_m_k1,
606  a_blockwise_copy,
607  a_grid_buf,
608  a_block_buf,
609  a_block_slice_copy_step,
610  b_grid_desc_k0_n_k1,
611  b_block_desc_k0_n_k1,
612  b_blockwise_copy,
613  b_grid_buf,
614  b_block_buf,
615  b_block_slice_copy_step,
616  blockwise_gemm,
617  c_thread_buf,
618  num_k_block_main_loop);
619 
620  // output: register to global memory
621  {
622  constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
623  blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
624 
625  constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
626  blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
627 
628  constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0);
629  constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1);
630  constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I2);
631  constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I3);
632  constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I4);
633  constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I5);
634  constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I6);
635  constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I7);
636 
637  // calculate origin of thread output tensor on global memory
638  // blockwise GEMM c matrix starting index
639  const auto c_thread_mtx_on_block =
640  blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
641 
642  const index_t m_thread_data_on_grid =
643  m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
644 
645  const index_t n_thread_data_on_grid =
646  n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
647 
648  const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor =
650  make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
653 
654  const auto m_thread_data_on_grid_idx =
655  m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
656  make_multi_index(m_thread_data_on_grid));
657 
658  const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor(
662 
663  const auto n_thread_data_on_grid_idx =
664  n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex(
665  make_multi_index(n_thread_data_on_grid));
666 
667  auto c_thread_copy =
669  FloatC,
670  decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
671  decltype(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2),
672  CElementwiseOperation,
674  CThreadTransferSrcDstAccessOrder,
675  CThreadTransferSrcDstVectorDim,
676  CThreadTransferDstScalarPerVector,
677  CGlobalMemoryDataOperation,
678  1,
679  true>{
680  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
681  make_multi_index(m_thread_data_on_grid_idx[I0],
682  n_thread_data_on_grid_idx[I0],
683  m_thread_data_on_grid_idx[I1],
684  n_thread_data_on_grid_idx[I1],
685  m_thread_data_on_grid_idx[I2],
686  m_thread_data_on_grid_idx[I3],
687  m_thread_data_on_grid_idx[I4],
688  n_thread_data_on_grid_idx[I2]),
689  c_element_op};
690 
691  c_thread_copy.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
692  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
693  c_thread_buf,
694  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
695  c_grid_buf);
696  }
697  }
698 };
699 
700 template <index_t BlockSize,
701  typename FloatAB,
702  typename FloatAcc,
703  typename FloatC,
704  InMemoryDataOperationEnum CGlobalMemoryDataOperation,
705  typename ALayout,
706  typename BLayout,
707  typename CLayout,
708  typename AElementwiseOperation,
709  typename BElementwiseOperation,
710  typename CElementwiseOperation,
712  index_t MPerBlock,
713  index_t NPerBlock,
714  index_t K0PerBlock,
715  index_t MPerXDL,
716  index_t NPerXDL,
717  index_t K1Value,
718  index_t MXdlPerWave,
719  index_t NXdlPerWave,
720  typename ABlockTransferThreadClusterLengths_K0_M_K1,
721  typename ABlockTransferThreadClusterArrangeOrder,
722  typename ABlockTransferSrcAccessOrder,
723  index_t ABlockTransferSrcVectorDim,
724  index_t ABlockTransferSrcScalarPerVector,
725  index_t ABlockTransferDstScalarPerVector_K1,
726  bool AThreadTransferSrcResetCoordinateAfterRun,
727  bool ABlockLdsExtraM,
728  typename BBlockTransferThreadClusterLengths_K0_N_K1,
729  typename BBlockTransferThreadClusterArrangeOrder,
730  typename BBlockTransferSrcAccessOrder,
731  index_t BBlockTransferSrcVectorDim,
732  index_t BBlockTransferSrcScalarPerVector,
733  index_t BBlockTransferDstScalarPerVector_K1,
734  bool BThreadTransferSrcResetCoordinateAfterRun,
735  bool BBlockLdsExtraN,
736  typename CThreadTransferSrcDstAccessOrder,
737  index_t CThreadTransferSrcDstVectorDim,
738  index_t CThreadTransferDstScalarPerVector,
739  index_t NumGemmKPrefetchStage = 1,
741  PipelineVersion PipelineVer = PipelineVersion::v1>
744  FloatAB,
745  FloatAcc,
746  FloatC,
747  CGlobalMemoryDataOperation,
748  AElementwiseOperation,
749  BElementwiseOperation,
750  CElementwiseOperation,
751  MPerBlock,
752  NPerBlock,
753  K0PerBlock,
754  MPerXDL,
755  NPerXDL,
756  K1Value,
757  MXdlPerWave,
758  NXdlPerWave,
759  ABlockTransferThreadClusterLengths_K0_M_K1,
760  ABlockTransferThreadClusterArrangeOrder,
761  ABlockTransferSrcAccessOrder,
762  ABlockTransferSrcVectorDim,
763  ABlockTransferSrcScalarPerVector,
764  ABlockTransferDstScalarPerVector_K1,
765  AThreadTransferSrcResetCoordinateAfterRun,
766  ABlockLdsExtraM,
767  BBlockTransferThreadClusterLengths_K0_N_K1,
768  BBlockTransferThreadClusterArrangeOrder,
769  BBlockTransferSrcAccessOrder,
770  BBlockTransferSrcVectorDim,
771  BBlockTransferSrcScalarPerVector,
772  BBlockTransferDstScalarPerVector_K1,
773  BThreadTransferSrcResetCoordinateAfterRun,
774  BBlockLdsExtraN,
775  CThreadTransferSrcDstAccessOrder,
776  CThreadTransferSrcDstVectorDim,
777  CThreadTransferDstScalarPerVector,
778  NumGemmKPrefetchStage,
779  LoopSched,
780  PipelineVer>
781 {
782  using Parent =
784  FloatAB,
785  FloatAcc,
786  FloatC,
787  CGlobalMemoryDataOperation,
788  AElementwiseOperation,
789  BElementwiseOperation,
790  CElementwiseOperation,
791  MPerBlock,
792  NPerBlock,
793  K0PerBlock,
794  MPerXDL,
795  NPerXDL,
796  K1Value,
797  MXdlPerWave,
798  NXdlPerWave,
799  ABlockTransferThreadClusterLengths_K0_M_K1,
800  ABlockTransferThreadClusterArrangeOrder,
801  ABlockTransferSrcAccessOrder,
802  ABlockTransferSrcVectorDim,
803  ABlockTransferSrcScalarPerVector,
804  ABlockTransferDstScalarPerVector_K1,
805  AThreadTransferSrcResetCoordinateAfterRun,
806  ABlockLdsExtraM,
807  BBlockTransferThreadClusterLengths_K0_N_K1,
808  BBlockTransferThreadClusterArrangeOrder,
809  BBlockTransferSrcAccessOrder,
810  BBlockTransferSrcVectorDim,
811  BBlockTransferSrcScalarPerVector,
812  BBlockTransferDstScalarPerVector_K1,
813  BThreadTransferSrcResetCoordinateAfterRun,
814  BBlockLdsExtraN,
815  CThreadTransferSrcDstAccessOrder,
816  CThreadTransferSrcDstVectorDim,
817  CThreadTransferDstScalarPerVector,
818  NumGemmKPrefetchStage,
819  LoopSched,
820  PipelineVer>;
821 
822  using typename Parent::GridwiseGemmPipe;
823  using typename Parent::Problem;
824 
825  using Parent::I1;
826 
827  using Parent::K1;
828 
829  __device__ static auto
831  {
832  const auto a_grid_desc_m_k = [&]() {
834  {
835  return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
836  }
838  {
839  return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
840  }
841  }();
842 
844  {
845  const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
846  const auto KPad = K0Pad * K1Value;
847 
848  const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
849  a_grid_desc_m_k,
853 
855  a_grid_desc_m_kpad,
857  make_right_pad_transform(M, MPad - M)),
860  }
861  else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
862  {
864  a_grid_desc_m_k,
866  make_right_pad_transform(M, MPad - M)),
869  }
870  else
871  {
873  a_grid_desc_m_k,
878  }
879  }
880 
881  __device__ static auto
883  {
884  const auto b_grid_desc_k_n = [&]() {
886  {
887  return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
888  }
890  {
891  return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
892  }
893  }();
894 
896  {
897  const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
898  const auto KPad = K0Pad * K1Value;
899 
900  const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
901  b_grid_desc_k_n,
905 
907  b_grid_desc_kpad_n,
909  make_right_pad_transform(N, NPad - N)),
912  }
913 
914  else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
915  {
917  b_grid_desc_k_n,
919  make_right_pad_transform(N, NPad - N)),
922  }
923  else
924  {
926  b_grid_desc_k_n,
931  }
932  }
933 
934  __device__ static auto
936  {
937  const auto c_grid_desc_m_n = [&]() {
939  {
940  return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
941  }
943  {
944  return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
945  }
946  }();
947 
950  {
951  return transform_tensor_descriptor(c_grid_desc_m_n,
953  make_right_pad_transform(N, NPad - N)),
956  }
957  else
958  {
959 
961  c_grid_desc_m_n,
965  }
966  }
967 
968  __host__ static constexpr bool CheckValidity(const Problem& problem)
969  {
970  static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
971  "wrong! K1 needs to be known at compile-time");
972 
973  static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) &&
974  (NPerBlock % (NXdlPerWave * NPerXDL)) == 0,
975  "Invalid tuning param!");
976 
977  if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
981  {
982  if(!(problem.M % MPerBlock == 0))
983  {
984  return false;
985  }
986  }
987 
988  if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
992  {
993  if(!(problem.N % NPerBlock == 0))
994  {
995  return false;
996  }
997  }
998 
999  if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding ||
1003  {
1004  if(!(problem.K0 % K0PerBlock == 0))
1005  {
1006  return false;
1007  }
1008  }
1009 
1011  {
1012  if(problem.K % ABlockTransferSrcScalarPerVector != 0)
1013  {
1014  return false;
1015  }
1016  }
1017  else
1018  {
1019  if(problem.M % ABlockTransferSrcScalarPerVector != 0)
1020  {
1021  return false;
1022  }
1023  }
1024 
1026  {
1027  if(problem.N % BBlockTransferSrcScalarPerVector != 0)
1028  {
1029  return false;
1030  }
1031  }
1032  else
1033  {
1034  if(problem.K % BBlockTransferSrcScalarPerVector != 0)
1035  {
1036  return false;
1037  }
1038  }
1039 
1040  // check gridwise gemm pipeline
1041  const auto num_k_loop = math::integer_divide_ceil(problem.K0, K0PerBlock);
1042 
1043  if(!GridwiseGemmPipe::IsSupported(num_k_loop))
1044  {
1045  return false;
1046  }
1047 
1048  // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
1049  return true;
1050  }
1051 };
1052 
1053 } // namespace ck
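The host-side helpers above are pure bookkeeping, so their effect is easy to check in isolation. The standalone sketch below (with hypothetical tuning parameters and problem shape) mirrors the padding, grid-size, divisibility and main-loop arithmetic of CalculateMPadded/CalculateNPadded/CalculateK0, CalculateGridSize, CheckValidity and CalculateHasMainKBlockLoop; it does not instantiate the gridwise GEMM or launch kernel_gemm_xdlops_v2r3.

#include <cstdio>

int main()
{
    // Hypothetical compile-time tuning parameters of the gridwise GEMM.
    constexpr int MPerBlock  = 256;
    constexpr int NPerBlock  = 128;
    constexpr int K0PerBlock = 4;
    constexpr int K1         = 8;

    // Hypothetical problem shape.
    const int M = 1000, N = 512, K = 320;

    const auto ceil_div = [](int x, int y) { return (x + y - 1) / y; };

    // Mirrors CalculateMPadded / CalculateNPadded / CalculateK0.
    const int MPadded = ceil_div(M, MPerBlock) * MPerBlock; // 1024
    const int NPadded = ceil_div(N, NPerBlock) * NPerBlock; // 512
    const int K0      = ceil_div(K, K1);                    // 40

    // One workgroup per MPerBlock x NPerBlock tile of C; CalculateGridSize
    // wraps this count into a (grid_size, 1, 1) tuple.
    const int grid_size = (MPadded / MPerBlock) * (NPadded / NPerBlock); // 16

    // Divisibility condition checked by the descriptor-based CheckValidity;
    // false here, so a padding GemmSpecialization would be required.
    const bool divisible =
        (M % MPerBlock == 0) && (N % NPerBlock == 0) && (K0 % K0PerBlock == 0);

    // K-iteration count of the pipeline, as in CalculateHasMainKBlockLoop:
    // ceil(K / (K0PerBlock * K1)).
    const int num_k_block_main_loop = ceil_div(K, K0PerBlock * K1); // 10

    std::printf("MPadded=%d NPadded=%d K0=%d grid=%d divisible=%d k_loops=%d\n",
                MPadded, NPadded, K0, grid_size, divisible ? 1 : 0,
                num_k_block_main_loop);
    return 0;
}

For this shape the sketch prints MPadded=1024 NPadded=512 K0=40 grid=16 divisible=0 k_loops=10, matching the expressions in the listing above.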
#define CK_MIN_BLOCK_PER_CU
Definition: ck.hpp:34
#define CK_MAX_THREAD_PER_BLOCK
Definition: ck.hpp:33
__host__ constexpr __device__ auto integer_least_multiple(X x, Y y)
Definition: math.hpp:78
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
GemmSpecialization
Definition: gemm_specialization.hpp:11
Definition: ck.hpp:264
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
Definition: blockwise_gemm_xdlops.hpp:606
__host__ constexpr __device__ auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition: tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition: ck.hpp:267
__host__ constexpr __device__ auto make_merge_transform(const LowLengths &low_lengths)
Definition: multi_index_transform_helper.hpp:55
__host__ constexpr __device__ auto make_naive_tensor_descriptor_aligned(const Tuple< Lengths... > &lengths, Align align)
Definition: tensor_descriptor_helper.hpp:132
__host__ constexpr __device__ auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition: tensor_adaptor.hpp:429
ushort bhalf_t
Definition: data_type.hpp:24
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition: amd_wave_read_first_lane.hpp:100
__device__ index_t get_block_1d_id()
Definition: get_id.hpp:22
typename conditional< predicate, X, Y >::type conditional_t
Definition: functional.hpp:115
__host__ constexpr __device__ auto make_pass_through_transform(const LowLength &low_length)
Definition: multi_index_transform_helper.hpp:12
__global__ void kernel_gemm_xdlops_v2r3(const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n)
Definition: gridwise_gemm_xdlops_v2r3.hpp:34
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:300
__host__ constexpr __device__ auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition: multi_index_transform_helper.hpp:90
LoopScheduler
Definition: loop_scheduler.hpp:15
int32_t index_t
Definition: ck.hpp:289
__host__ constexpr __device__ auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition: tensor_descriptor.hpp:319
__host__ constexpr __device__ auto make_right_pad_transform(const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
Definition: multi_index_transform_helper.hpp:37
PipelineVersion
Definition: gridwise_gemm_pipeline_selector.hpp:17
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:298
constexpr LoopScheduler make_default_loop_scheduler()
Definition: loop_scheduler.hpp:20
Definition: blockwise_gemm_smfmac_xdlops.hpp:44
Definition: gridwise_gemm_xdlops_v2r3.hpp:228
const FloatAB * p_a_grid
Definition: gridwise_gemm_xdlops_v2r3.hpp:245
FloatC * p_c_grid
Definition: gridwise_gemm_xdlops_v2r3.hpp:247
const FloatAB * p_b_grid
Definition: gridwise_gemm_xdlops_v2r3.hpp:246
__host__ Argument(const FloatAB *p_a_grid_, const FloatAB *p_b_grid_, FloatC *p_c_grid_, index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_)
Definition: gridwise_gemm_xdlops_v2r3.hpp:229
Definition: gridwise_gemm_xdlops_v2r3.hpp:182
index_t NPadded
Definition: gridwise_gemm_xdlops_v2r3.hpp:222
index_t StrideC
Definition: gridwise_gemm_xdlops_v2r3.hpp:220
index_t M
Definition: gridwise_gemm_xdlops_v2r3.hpp:215
index_t StrideA
Definition: gridwise_gemm_xdlops_v2r3.hpp:218
index_t N
Definition: gridwise_gemm_xdlops_v2r3.hpp:216
index_t K
Definition: gridwise_gemm_xdlops_v2r3.hpp:217
index_t StrideB
Definition: gridwise_gemm_xdlops_v2r3.hpp:219
index_t K0
Definition: gridwise_gemm_xdlops_v2r3.hpp:223
__host__ Problem(index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_)
Definition: gridwise_gemm_xdlops_v2r3.hpp:183
__host__ void Print() const
Definition: gridwise_gemm_xdlops_v2r3.hpp:201
index_t MPadded
Definition: gridwise_gemm_xdlops_v2r3.hpp:221
Definition: gridwise_gemm_xdlops_v2r3.hpp:781
static __device__ auto MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
Definition: gridwise_gemm_xdlops_v2r3.hpp:935
static constexpr __host__ bool CheckValidity(const Problem &problem)
Definition: gridwise_gemm_xdlops_v2r3.hpp:968
static constexpr auto I1
Definition: gridwise_gemm_xdlops_v2r3.hpp:138
static constexpr auto K1
Definition: gridwise_gemm_xdlops_v2r3.hpp:147
static __device__ auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t MPad, index_t K, index_t K0, index_t StrideA)
Definition: gridwise_gemm_xdlops_v2r3.hpp:830
static __device__ auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t NPad, index_t K0, index_t StrideB)
Definition: gridwise_gemm_xdlops_v2r3.hpp:882
Definition: gridwise_gemm_xdlops_v2r3.hpp:136
ThisThreadBlock< BlockSize > ThisThreadBlock
Definition: gridwise_gemm_xdlops_v2r3.hpp:149
static __host__ auto CalculateMPadded(index_t M)
Definition: gridwise_gemm_xdlops_v2r3.hpp:168
static __device__ void Run(const FloatAB *p_a_grid, const FloatAB *p_b_grid, FloatC *p_c_grid, void *__restrict__ p_shared, const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1, const CGridDesc_M_N &c_grid_desc_m_n)
Definition: gridwise_gemm_xdlops_v2r3.hpp:448
__host__ static constexpr __device__ bool CheckValidity(const AGridDesc_K0_M_K1 &a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 &b_grid_desc_k0_n_k1, const CGridDesc_M_N &c_grid_desc_m_n)
Definition: gridwise_gemm_xdlops_v2r3.hpp:327
__host__ static constexpr __device__ auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1()
Definition: gridwise_gemm_xdlops_v2r3.hpp:285
static constexpr __host__ bool CalculateHasMainKBlockLoop(index_t K)
Definition: gridwise_gemm_xdlops_v2r3.hpp:382
static constexpr auto I7
Definition: gridwise_gemm_xdlops_v2r3.hpp:144
static constexpr auto I2
Definition: gridwise_gemm_xdlops_v2r3.hpp:139
static __host__ auto CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n)
Definition: gridwise_gemm_xdlops_v2r3.hpp:157
static constexpr auto I5
Definition: gridwise_gemm_xdlops_v2r3.hpp:142
static __host__ auto CalculateNPadded(index_t N)
Definition: gridwise_gemm_xdlops_v2r3.hpp:173
FloatAB FloatABAdjusted
Definition: gridwise_gemm_xdlops_v2r3.hpp:260
static constexpr auto I0
Definition: gridwise_gemm_xdlops_v2r3.hpp:137
static constexpr auto I4
Definition: gridwise_gemm_xdlops_v2r3.hpp:141
static __host__ auto CalculateGridSize(index_t M, index_t N)
Definition: gridwise_gemm_xdlops_v2r3.hpp:163
static constexpr auto I1
Definition: gridwise_gemm_xdlops_v2r3.hpp:138
static constexpr auto K1
Definition: gridwise_gemm_xdlops_v2r3.hpp:147
static constexpr __host__ bool CheckValidity(const Problem &problem)
Definition: gridwise_gemm_xdlops_v2r3.hpp:362
static constexpr auto I6
Definition: gridwise_gemm_xdlops_v2r3.hpp:143
__host__ static constexpr __device__ auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc &c_grid_desc_m_n)
Definition: gridwise_gemm_xdlops_v2r3.hpp:391
remove_cvref_t< decltype(GridwiseGemmPipeline_Selector< PipelineVer, NumGemmKPrefetchStage, LoopSched >())> GridwiseGemmPipe
Definition: gridwise_gemm_xdlops_v2r3.hpp:251
__host__ static constexpr __device__ auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
Definition: gridwise_gemm_xdlops_v2r3.hpp:263
static __host__ auto CalculateK0(index_t K)
Definition: gridwise_gemm_xdlops_v2r3.hpp:178
static constexpr auto I3
Definition: gridwise_gemm_xdlops_v2r3.hpp:140
__host__ static constexpr __device__ index_t GetSharedMemoryNumberOfByte()
Definition: gridwise_gemm_xdlops_v2r3.hpp:307
static __host__ auto CalculateGridSize(index_t M, index_t N)
Definition: gridwise_gemm_xdlops_v2r3.hpp:151
Definition: sequence.hpp:43
Blockwise data transfer.
Definition: thread_group_tensor_slice_transfer_v4r1.hpp:46
Definition: threadwise_tensor_slice_transfer.hpp:39
Definition: integral_constant.hpp:10
Definition: is_known_at_compile_time.hpp:14
Definition: type.hpp:177
Definition: device_base.hpp:50
Definition: unary_element_wise_operation.hpp:241