25 template <
typename SrcData,
29 typename ElementwiseOperation,
30 typename SliceLengths,
31 typename DimAccessOrder,
35 index_t DstScalarStrideInVector,
36 bool DstResetCoordinateAfterRun,
37 typename enable_if<SrcDesc::IsKnownAtCompileTime(),
bool>::type =
false>
49 const Index& dst_slice_origin_idx,
50 const ElementwiseOperation& element_op)
52 element_op_{element_op}
54 static_assert(SrcDesc::IsKnownAtCompileTime(),
55 "wrong! SrcDesc need to known at compile-time");
57 "wrong! Not divisible");
65 template <
typename SrcSliceOriginIdx,
typename SrcBuffer,
typename DstBuffer>
66 __device__
void Run(
const SrcDesc&,
67 const SrcSliceOriginIdx&,
68 const SrcBuffer& src_buf,
69 const DstDesc& dst_desc,
72 static_assert(SrcDesc::IsKnownAtCompileTime(),
73 "wrong! SrcDesc need to known at compile-time");
76 "wrong! SrcSliceOrigin need to known at compile-time");
78 static_assert(SrcBuffer::IsStaticBuffer(),
"wrong! SrcBuffer need to be StaticBuffer");
82 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
89 constexpr
auto dst_scalar_step_in_vector =
98 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
110 constexpr
index_t src_offset = src_desc.CalculateOffset(
111 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
118 dst_vector.template AsType<DstData>()(i) = v;
121 const bool is_dst_valid =
125 dst_buf.template Update<DstInMemOp, dst_vector_t>(
126 dst_coord_.GetOffset(),
128 dst_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
130 if constexpr(idx_1d.value != num_access - 1)
140 if constexpr(DstResetCoordinateAfterRun)
142 const auto dst_reset_step =
159 if constexpr(num_access == 0)
165 constexpr
auto reset_step =
174 const Index& dst_slice_origin_step_idx)
177 const auto adjusted_step_idx =
178 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
189 const ElementwiseOperation element_op_;
221 template <
typename SrcData,
225 typename SliceLengths,
226 typename DimAccessOrder,
229 index_t SrcScalarStrideInVector,
230 bool SrcResetCoordinateAfterRun,
231 bool InvalidElementAsNaN =
false,
232 typename enable_if<DstDesc::IsKnownAtCompileTime(),
bool>::type =
false>
236 (!InvalidElementAsNaN),
237 "Filling invalid element as NaN is only for floating point types");
255 const Index& src_slice_origin_idx)
258 static_assert(DstDesc::IsKnownAtCompileTime(),
259 "wrong! SrcDesc need to known at compile-time");
261 "wrong! Not divisible");
266 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
275 template <
typename SrcBuffer,
typename DstBuffer,
typename DstSliceOriginIdx>
276 __device__
void Run(
const SrcDesc& src_desc,
277 const SrcBuffer& src_buf,
279 const DstSliceOriginIdx&,
282 static_assert(DstDesc::IsKnownAtCompileTime(),
283 "wrong! DstDesc need to known at compile-time");
286 "wrong! DstSliceOrigin need to known at compile-time");
290 "wrong! inconsistent type");
294 constexpr
auto dst_slice_origin_idx = DstSliceOriginIdx{};
301 constexpr
auto src_scalar_step_in_vector =
318 const bool is_src_valid =
322 src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
323 src_buf.template Get<src_vector_t>(src_coord_.GetOffset() /
PackedSize,
329 dst_desc.CalculateOffset(
to_multi_index(dst_slice_origin_idx) + src_data_idx +
330 i * src_scalar_step_in_vector);
332 if constexpr(InvalidElementAsNaN)
336 ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
342 type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
346 if constexpr(idx_1d.value != num_access - 1)
356 if constexpr(SrcResetCoordinateAfterRun)
358 const auto src_reset_step =
375 if constexpr(num_access == 0)
381 constexpr
auto reset_step =
390 const Index& src_slice_origin_step_idx)
393 const auto adjusted_step_idx =
394 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
404 template <
typename SrcMoveSliceWindowStepHack>
407 const Index& src_slice_origin_step_idx,
408 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
411 const auto adjusted_step_idx =
412 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
417 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
426 template <
typename SrcData,
430 typename SliceLengths,
431 typename DimAccessOrder,
434 index_t SrcScalarStrideInVector,
435 bool SrcResetCoordinateAfterRun,
437 bool InvalidElementAsNaN =
false,
438 typename enable_if<DstDesc::IsKnownAtCompileTime(),
bool>::type =
false>
442 (!InvalidElementAsNaN),
443 "Filling invalid element as NaN is only for floating point types");
461 const SrcDesc& src_desc,
462 const Index& src_slice_origin_idx,
465 scale_gather_offsets_(scale_gather_offsets)
467 static_assert(DstDesc::IsKnownAtCompileTime(),
468 "wrong! SrcDesc need to known at compile-time");
470 "wrong! Not divisible");
474 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
480 auto adjusted_origin_idx = [&]() {
484 [&](
auto i) { idx(i) = i.value == 0 ? 0 : src_slice_origin_idx[
Number<i>{}]; });
492 template <
typename SrcBuffer,
typename DstBuffer,
typename DstSliceOriginIdx>
493 __device__
void Run(
const SrcDesc& src_desc,
494 const SrcBuffer& src_buf,
496 const DstSliceOriginIdx&,
499 static_assert(DstDesc::IsKnownAtCompileTime(),
500 "wrong! DstDesc need to known at compile-time");
503 "wrong! DstSliceOrigin need to known at compile-time");
507 "wrong! inconsistent type");
511 constexpr
auto dst_slice_origin_idx = DstSliceOriginIdx{};
518 constexpr
auto src_scalar_step_in_vector =
529 constexpr
auto current_dst_origin =
541 const bool is_src_valid =
546 src_vector.template AsType<src_vector_t>()(
Number<0>{}) =
547 src_buf.template Get<src_vector_t>(src_coord_.GetOffset() /
PackedSize +
548 scale_gather_offsets_(gather_idx),
555 src_data_idx + i * src_scalar_step_in_vector);
556 constexpr
auto full_dst_offset =
557 dst_desc.CalculateOffset(current_dst_origin) + dst_offset;
559 if constexpr(InvalidElementAsNaN)
561 dst_buf(full_dst_offset) =
563 ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
569 type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
573 if constexpr(idx_1d.value != num_access - 1)
584 if constexpr(SrcResetCoordinateAfterRun)
586 const auto src_reset_step =
603 if constexpr(num_access == 0)
609 constexpr
auto reset_step =
618 const Index& src_slice_origin_step_idx)
621 const auto adjusted_step_idx =
622 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
632 template <
typename SrcMoveSliceWindowStepHack>
635 const Index& src_slice_origin_step_idx,
636 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
639 const auto adjusted_step_idx =
640 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
645 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
660 template <
typename SliceLengths,
666 typename SrcDimAccessOrder,
667 typename DstDimAccessOrder,
672 index_t SrcScalarStrideInVector,
673 index_t DstScalarStrideInVector,
674 bool SrcResetCoordinateAfterRun,
677 bool DstResetCoordinateAfterRun>
692 const Index& src_slice_origin,
693 const DstDesc& dst_desc,
694 const Index& dst_slice_origin)
699 "wrong! Not divisible");
701 "wrong! Not divisible");
714 template <
typename SrcBuffer,
typename SrcStepHacks>
716 RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const SrcStepHacks& src_step_hacks)
724 "wrong! SrcBuffer and SrcData data type are inconsistent");
734 constexpr
auto src_scalar_step_in_vector =
737 constexpr
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
739 constexpr
auto src_dim_access_order = SrcDimAccessOrder{};
741 constexpr
auto ordered_src_access_lengths =
747 Index forward_step_idx;
750 forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
754 src_desc, forward_step_idx, src_step_hacks[I0][i]);
761 Index backward_step_idx;
764 backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
768 src_desc, backward_step_idx, src_step_hacks[I1][i]);
773 static_ford<decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
775 constexpr
auto forward_sweep = [&]() {
778 forward_sweep_(I0) =
true;
781 index_t tmp = ordered_src_access_idx[I0];
784 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
787 forward_sweep_(i) = tmp % 2 == 0;
790 return forward_sweep_;
794 constexpr
auto src_data_idx = [&]() {
798 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
799 : ordered_src_access_lengths[i] - 1 -
800 ordered_src_access_idx[i];
804 src_scalar_per_access;
809 using src_vector_t =
typename decltype(src_tmp_vector)::type;
811 const bool is_src_valid =
815 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
816 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
820 constexpr
index_t buffer_offset =
821 buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
826 constexpr
auto move_on_dim = [&]() constexpr {
830 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
834 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
843 if constexpr(move_on_dim[i])
845 if constexpr(forward_sweep[i])
848 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
853 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
860 if constexpr(SrcResetCoordinateAfterRun)
862 const auto src_reset_step =
869 template <
typename DstBuffer,
typename DstStepHacks>
871 RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf,
const DstStepHacks& dst_step_hacks)
879 "wrong! SrcBuffer or DstBuffer data type is wrong");
889 constexpr
auto dst_scalar_step_in_vector =
892 constexpr
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
894 constexpr
auto dst_dim_access_order = DstDimAccessOrder{};
896 constexpr
auto ordered_dst_access_lengths =
902 Index forward_step_idx;
905 forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
909 dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
916 Index backward_step_idx;
919 backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
923 dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
928 static_ford<decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
930 constexpr
auto forward_sweep = [&]() {
933 forward_sweep_(I0) =
true;
936 index_t tmp = ordered_dst_access_idx[I0];
939 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
942 forward_sweep_(i) = tmp % 2 == 0;
945 return forward_sweep_;
949 constexpr
auto dst_data_idx = [&]() {
953 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
954 : ordered_dst_access_lengths[i] - 1 -
955 ordered_dst_access_idx[i];
959 dst_scalar_per_access;
966 constexpr
index_t buffer_offset =
967 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
969 dst_tmp_vector.template AsType<DstData>()(i) =
973 using dst_vector_t =
typename decltype(dst_tmp_vector)::type;
976 const bool is_dst_valid =
979 dst_buf.template Set<dst_vector_t>(
980 dst_coord_.GetOffset(),
982 dst_tmp_vector.template AsType<dst_vector_t>()[
Number<0>{}]);
984 constexpr
auto move_on_dim = [&]() constexpr {
988 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
992 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
1001 if constexpr(move_on_dim[i])
1003 if constexpr(forward_sweep[i])
1006 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
1011 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
1018 if constexpr(DstResetCoordinateAfterRun)
1020 const auto dst_reset_step =
1027 template <
typename SrcBuffer>
1028 __device__
void RunRead(
const SrcDesc& src_desc,
const SrcBuffer& src_buf)
1030 constexpr
index_t ntransform_src = SrcDesc::GetNumOfTransform();
1034 constexpr
auto src_step_hacks =
1038 RunRead(src_desc, src_buf, src_step_hacks);
1041 template <
typename DstBuffer>
1042 __device__
void RunWrite(
const DstDesc& dst_desc, DstBuffer& dst_buf)
1044 constexpr
index_t ntransform_dst = DstDesc::GetNumOfTransform();
1048 constexpr
auto dst_step_hacks =
1052 RunWrite(dst_desc, dst_buf, dst_step_hacks);
1064 constexpr
auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
1066 constexpr
auto src_dim_access_order = SrcDimAccessOrder{};
1068 constexpr
auto ordered_src_access_lengths =
1072 constexpr
auto forward_sweep = [&]() {
1075 forward_sweep_(I0) =
true;
1078 index_t tmp = ordered_src_access_lengths[I0] - 1;
1081 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
1084 forward_sweep_(i) = tmp % 2 == 0;
1087 return forward_sweep_;
1092 constexpr
auto src_data_idx = [&]() {
1096 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
1100 src_scalar_per_access;
1104 constexpr
auto reset_src_data_step = [&]() {
1105 Index reset_src_data_step_;
1109 return reset_src_data_step_;
1112 return reset_src_data_step;
1124 constexpr
auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
1126 constexpr
auto dst_dim_access_order = DstDimAccessOrder{};
1128 constexpr
auto ordered_dst_access_lengths =
1132 constexpr
auto forward_sweep = [&]() {
1135 forward_sweep_(I0) =
true;
1138 index_t tmp = ordered_dst_access_lengths[I0] - 1;
1141 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
1144 forward_sweep_(i) = tmp % 2 == 0;
1147 return forward_sweep_;
1152 constexpr
auto dst_data_idx = [&]() {
1156 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
1160 dst_scalar_per_access;
1164 constexpr
auto reset_dst_data_step = [&]() {
1165 Index reset_dst_data_step_;
1169 return reset_dst_data_step_;
1172 return reset_dst_data_step;
1177 const Index& src_slice_origin_step_idx)
1180 const auto adjusted_step_idx =
1181 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
1191 template <
typename SrcMoveSliceWindowStepHack>
1194 const Index& src_slice_origin_step_idx,
1195 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
1198 const auto adjusted_step_idx =
1199 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
1204 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
1210 const Index& dst_slice_origin_step_idx)
1213 const auto adjusted_step_idx =
1214 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
1224 static constexpr
auto buffer_desc_ =
1227 static constexpr
auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
1229 StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
1248 template <
typename SrcData,
1252 typename SliceLengths,
1253 typename DimAccessOrder,
1256 index_t SrcScalarStrideInVector,
1257 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1258 bool>::type =
false>
1279 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1280 "wrong! SrcDesc and DstDesc need to known at compile-time");
1285 static_assert(SrcScalarPerVector %
PackedSize == 0,
"pk data N cannot be 1");
1289 template <
typename SrcRefToOriginDisplacement,
1290 typename DstOriginIdx,
1293 __device__
void Run(
const SrcDesc&,
1294 const SrcRefToOriginDisplacement&,
1295 const SrcBuffer& src_buf,
1297 const DstOriginIdx&,
1298 DstBuffer& dst_buf)
const
1300 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1301 "wrong! SrcDesc and DstDesc need to known at compile-time");
1306 "wrong! SrcBuffer or DstBuffer data type is wrong");
1308 static_assert(DstBuffer::IsStaticBuffer(),
"wrong! DstBuffer need to be StaticBuffer");
1312 "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
1320 constexpr
auto src_ref_to_origin_disp_idx =
to_multi_index(SrcRefToOriginDisplacement{});
1325 [&](
auto i) constexpr {
1326 if constexpr(i == SrcVectorDim)
1339 [&](
auto i) constexpr {
1340 if constexpr(i == SrcVectorDim)
1351 constexpr
auto access_lengths = SliceLengths{} / src_scalar_per_access;
1353 constexpr
auto dim_access_order = DimAccessOrder{};
1355 constexpr
auto ordered_access_lengths =
1358 static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
1362 constexpr
auto data_to_origin_disp_idx =
1364 src_scalar_per_access;
1367 constexpr
auto data_to_origin_disp_idx =
1368 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
1371 constexpr
auto src_ref_to_data_disp_idx =
1372 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
1374 constexpr
auto src_ref_to_data_disp_coord_step =
1377 auto src_data_coord = src_ref_coord_;
1383 using src_vector_t =
typename decltype(src_tmp_vector)::type;
1386 src_desc, src_data_coord);
1389 if constexpr(SrcBuffer::IsDynamicBuffer())
1391 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
1392 src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() /
PackedSize,
1395 else if constexpr(SrcBuffer::IsStaticBuffer())
1398 constexpr
index_t src_offset = src_desc.CalculateOffset(
1399 src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
1400 i * src_scalar_step_in_vector);
1412 constexpr
index_t pack_size = 8;
1414 static_assert(SrcScalarPerVector % pack_size == 0,
"");
1419 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1421 dst_tmp_vector.template AsType<dst_v_t>()(i),
1422 src_tmp_vector.template AsType<src_v_t>()[i]);
1427 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1428 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1435 SrcScalarPerVector % 2 == 0)
1441 constexpr
index_t pack_size = 2;
1445 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1447 dst_tmp_vector.template AsType<dst_v_t>()(i),
1448 src_tmp_vector.template AsType<src_v_t>()[i]);
1453 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1454 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1467 dst_tmp_vector.template AsType<DstData>()(i) =
1468 type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
1473 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1474 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1483 template <
typename SrcRefToOriginDisplacement,
1484 typename DstOriginIdx,
1487 __device__
void Run(
const SrcDesc&,
1488 const SrcRefToOriginDisplacement&,
1489 const SrcBuffer& src_buf,
1490 const DstData& scale,
1492 const DstOriginIdx&,
1493 DstBuffer& dst_buf)
const
1495 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1496 "wrong! SrcDesc and DstDesc need to known at compile-time");
1501 "wrong! SrcBuffer or DstBuffer data type is wrong");
1503 static_assert(DstBuffer::IsStaticBuffer(),
"wrong! DstBuffer need to be StaticBuffer");
1507 "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
1515 constexpr
auto src_ref_to_origin_disp_idx =
to_multi_index(SrcRefToOriginDisplacement{});
1520 [&](
auto i) constexpr {
1521 if constexpr(i == SrcVectorDim)
1534 [&](
auto i) constexpr {
1535 if constexpr(i == SrcVectorDim)
1546 constexpr
auto access_lengths = SliceLengths{} / src_scalar_per_access;
1548 constexpr
auto dim_access_order = DimAccessOrder{};
1550 constexpr
auto ordered_access_lengths =
1553 static_ford<decltype(ordered_access_lengths)>{}([&](
auto ordered_access_idx) {
1557 constexpr
auto data_to_origin_disp_idx =
1559 src_scalar_per_access;
1562 constexpr
auto data_to_origin_disp_idx =
1563 ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
1566 constexpr
auto src_ref_to_data_disp_idx =
1567 src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
1569 constexpr
auto src_ref_to_data_disp_coord_step =
1572 auto src_data_coord = src_ref_coord_;
1578 using src_vector_t =
typename decltype(src_tmp_vector)::type;
1581 src_desc, src_data_coord);
1584 if constexpr(SrcBuffer::IsDynamicBuffer())
1586 src_tmp_vector.template AsType<src_vector_t>()(
Number<0>{}) =
1587 src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() /
PackedSize,
1590 else if constexpr(SrcBuffer::IsStaticBuffer())
1593 constexpr
index_t src_offset = src_desc.CalculateOffset(
1594 src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
1595 i * src_scalar_step_in_vector);
1607 scale_vector.template AsType<DstData>()(
Number<0>{}) = scale;
1608 scale_vector.template AsType<DstData>()(
Number<1>{}) = scale;
1610 constexpr
index_t pack_size = 8;
1612 static_assert(SrcScalarPerVector % pack_size == 0,
"");
1618 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1620 dst_tmp_vector.template AsType<dst_v_t>()(i),
1621 src_tmp_vector.template AsType<src_v_t>()[i],
1622 scale_vector.template AsType<scale_v_t>()[
Number<0>{}]);
1627 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1628 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1635 SrcScalarPerVector % 2 == 0)
1641 constexpr
index_t pack_size = 2;
1645 static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](
auto i) {
1647 dst_tmp_vector.template AsType<dst_v_t>()(i),
1648 src_tmp_vector.template AsType<src_v_t>()[i]);
1653 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1654 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1667 dst_tmp_vector.template AsType<DstData>()(i) =
1668 type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
1673 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1674 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
1682 template <
typename SrcSliceMoveStepIdx>
1684 const SrcSliceMoveStepIdx& src_slice_move_step_idx)
1686 constexpr
auto src_desc = SrcDesc{};
1688 const auto src_slice_move_step_iter =
1708 template <
typename SrcData,
1712 typename ElementwiseOperation,
1713 typename SliceLengths,
1714 typename DimAccessOrder,
1717 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1718 bool>::type =
false>
1733 const ElementwiseOperation& element_op)
1736 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1737 "wrong! Desc need to known at compile-time");
1740 "wrong! Not divisible");
1743 template <
typename SrcSliceOriginIdx,
1744 typename DstSliceOriginIdx,
1747 __device__
void Run(
const SrcDesc&,
1748 const SrcSliceOriginIdx&,
1749 const SrcBuffer& src_buf,
1751 const DstSliceOriginIdx&,
1752 DstBuffer& dst_buf)
const
1754 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1755 "wrong! Desc need to known at compile-time");
1759 "wrong! SliceOrigin need to known at compile-time");
1761 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1762 "wrong! Buffer need to be StaticBuffer");
1767 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
1768 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
1774 constexpr
auto dst_scalar_step_in_vector =
1782 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1796 constexpr
index_t src_offset = src_desc.CalculateOffset(
1797 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1806 constexpr
index_t pack_size = 8;
1808 static_assert(DstScalarPerVector % pack_size == 0,
"");
1813 static_for<0, DstScalarPerVector / pack_size, 1>{}([&](
auto i) {
1815 dst_tmp_vector.template AsType<dst_v_t>()(i),
1816 src_tmp_vector.template AsType<src_v_t>()[i]);
1821 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1822 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1835 constexpr
index_t src_offset = src_desc.CalculateOffset(
1836 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1838 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1839 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1862 template <
typename SrcData,
1866 typename ElementwiseOperation,
1867 typename SliceLengths,
1868 typename DimAccessOrder,
1871 uint32_t LowEightRowlaneIdx,
1872 uint32_t HighEightRowLaneIdx,
1873 bool IntraRowSwizzlePerm,
1874 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1875 bool>::type =
false>
1884 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1885 "wrong! Desc need to known at compile-time");
1888 "wrong! Not divisible");
1892 template <
typename SrcSliceOriginIdx,
1893 typename DstSliceOriginIdx,
1896 __device__
void Run(
const SrcDesc&,
1897 const SrcSliceOriginIdx&,
1898 const SrcBuffer& src_buf,
1900 const DstSliceOriginIdx&,
1901 DstBuffer& dst_buf)
const
1903 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
1904 "wrong! Desc need to known at compile-time");
1908 "wrong! SliceOrigin need to known at compile-time");
1910 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
1911 "wrong! Buffer need to be StaticBuffer");
1916 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
1917 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
1923 constexpr
auto dst_scalar_step_in_vector =
1931 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
1941 constexpr
index_t src_offset = src_desc.CalculateOffset(
1942 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1944 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
1945 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
1947 SrcData v_this_row, v_theother_row;
1955 if constexpr(IntraRowSwizzlePerm)
1957 temp = __builtin_amdgcn_permlane16(
1958 temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
1959 v_this_row = type_convert_sp<SrcData>(temp);
1963 temp = __builtin_amdgcn_permlanex16(temp,
1964 type_convert_sp<int>(v_this_row),
1966 HighEightRowLaneIdx,
1969 v_theother_row = type_convert_sp<SrcData>(temp);
1976 type_convert_sp<DstData>(v_theother_row);
1982 type_convert_sp<DstData>(v_this_row);
1992 template <
typename SrcData,
1996 typename ElementwiseOperation,
1997 typename SliceLengths,
1998 typename DimAccessOrder,
2001 bool IntraRowSwizzlePerm,
2002 typename enable_if<SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2003 bool>::type =
false>
2012 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2013 "wrong! Desc need to known at compile-time");
2016 "wrong! Not divisible");
2020 template <
typename SrcSliceOriginIdx,
2021 typename DstSliceOriginIdx,
2024 __device__
void Run(
const SrcDesc&,
2025 const SrcSliceOriginIdx&,
2026 const SrcBuffer& src_buf,
2028 const DstSliceOriginIdx&,
2029 DstBuffer& dst_buf)
const
2031 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
2032 "wrong! Desc need to known at compile-time");
2036 "wrong! SliceOrigin need to known at compile-time");
2038 static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
2039 "wrong! Buffer need to be StaticBuffer");
2044 constexpr
auto src_slice_origin_idx =
to_multi_index(SrcSliceOriginIdx{});
2045 constexpr
auto dst_slice_origin_idx =
to_multi_index(DstSliceOriginIdx{});
2051 constexpr
auto dst_scalar_step_in_vector =
2059 "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
2069 constexpr
index_t src_offset = src_desc.CalculateOffset(
2070 src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2072 constexpr
index_t dst_offset = dst_desc.CalculateOffset(
2073 dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
2083 if constexpr(IntraRowSwizzlePerm)
2085 temp = __builtin_amdgcn_permlane16(
2086 temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
2087 v_this_row = type_convert_sp<SrcData>(temp);
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__host__ constexpr __device__ bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition: tensor_descriptor.hpp:560
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
__host__ constexpr __device__ auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:21
InMemoryDataOperationEnum
Definition: ck.hpp:275
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
f8_fnuz_t f8_t
Definition: amd_ck_fp8.hpp:1737
__host__ constexpr __device__ auto to_multi_index(const T &x)
Definition: array_multi_index.hpp:28
_Float16 half_t
Definition: data_type.hpp:30
__host__ constexpr __device__ auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition: tensor_descriptor.hpp:407
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
__host__ constexpr __device__ auto generate_sequence(F, Number< N >)
Definition: sequence_helper.hpp:18
__host__ constexpr __device__ auto generate_sequence_v2(F &&f, Number< N >)
Definition: sequence_helper.hpp:25
__host__ constexpr __device__ auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition: container_helper.hpp:380
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
__host__ constexpr __device__ auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition: container_helper.hpp:43
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:297
int32_t index_t
Definition: ck.hpp:297
__host__ constexpr __device__ void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition: tensor_descriptor.hpp:508
__host__ constexpr __device__ auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition: tensor_descriptor.hpp:444
__device__ index_t get_thread_local_1d_id()
Definition: get_id.hpp:52
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__host__ constexpr __device__ auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition: container_helper.hpp:54
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition: dtype_vector.hpp:54
__host__ static constexpr __device__ T QuietNaN()
Definition: numeric_limits.hpp:313
Definition: tensor_space_filling_curve.hpp:20
static __device__ constexpr __host__ auto GetForwardStep(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:66
__host__ static constexpr __device__ index_t GetNumOfAccess()
Definition: tensor_space_filling_curve.hpp:41
static constexpr index_t ScalarPerVector
Definition: tensor_space_filling_curve.hpp:25
static __device__ constexpr __host__ Index GetIndex(Number< AccessIdx1d >)
Definition: tensor_space_filling_curve.hpp:81
static __device__ constexpr __host__ auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition: tensor_space_filling_curve.hpp:52
Definition: threadwise_tensor_slice_transfer.hpp:1877
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1878
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1882
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1988
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1896
Definition: threadwise_tensor_slice_transfer.hpp:2005
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:2006
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index &src_idx)
Definition: threadwise_tensor_slice_transfer.hpp:2010
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:2095
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:2024
Threadwise data transfer.
Definition: threadwise_tensor_slice_transfer.hpp:1720
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1725
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1747
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1721
ElementwiseOperation element_op_
Definition: threadwise_tensor_slice_transfer.hpp:1853
constexpr __device__ ThreadwiseTensorSliceTransfer_StaticToStatic(const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:1732
Definition: threadwise_tensor_slice_transfer.hpp:39
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:149
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:40
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:42
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:44
constexpr __device__ ThreadwiseTensorSliceTransfer_v1r3(const DstDesc &dst_desc, const Index &dst_slice_origin_idx, const ElementwiseOperation &element_op)
Definition: threadwise_tensor_slice_transfer.hpp:48
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:46
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:173
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:60
__device__ void Run(const SrcDesc &, const SrcSliceOriginIdx &, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:66
Definition: threadwise_tensor_slice_transfer.hpp:440
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:493
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:453
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:634
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:478
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:449
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:447
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:451
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:593
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:617
constexpr __device__ ThreadwiseTensorSliceTransfer_v2_gather(const SrcDesc &src_desc, const Index &src_slice_origin_idx, const StaticallyIndexedArray< index_t, scale_gather_num > &scale_gather_offsets)
Definition: threadwise_tensor_slice_transfer.hpp:460
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:445
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition: threadwise_tensor_slice_transfer.hpp:234
constexpr __device__ ThreadwiseTensorSliceTransfer_v2(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:254
__device__ void Run(const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &, const DstSliceOriginIdx &, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:276
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:241
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:389
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:365
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:406
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:270
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:239
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:245
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:247
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:243
Definition: threadwise_tensor_slice_transfer.hpp:681
decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition: threadwise_tensor_slice_transfer.hpp:686
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:688
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:683
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition: threadwise_tensor_slice_transfer.hpp:1193
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, const SrcStepHacks &src_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:716
decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:689
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1209
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:709
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1042
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition: threadwise_tensor_slice_transfer.hpp:704
static constexpr __device__ auto GetSrcCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1055
static constexpr __device__ auto GetDstCoordinateResetStep()
Definition: threadwise_tensor_slice_transfer.hpp:1115
constexpr __device__ ThreadwiseTensorSliceTransfer_v3(const SrcDesc &src_desc, const Index &src_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin)
Definition: threadwise_tensor_slice_transfer.hpp:691
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:685
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:682
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf)
Definition: threadwise_tensor_slice_transfer.hpp:1028
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1176
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, const DstStepHacks &dst_step_hacks)
Definition: threadwise_tensor_slice_transfer.hpp:871
Definition: threadwise_tensor_slice_transfer.hpp:1260
static constexpr index_t nDim
Definition: threadwise_tensor_slice_transfer.hpp:1261
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1293
static constexpr index_t PackedSize
Definition: threadwise_tensor_slice_transfer.hpp:1269
decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition: threadwise_tensor_slice_transfer.hpp:1265
constexpr __device__ ThreadwiseTensorSliceTransfer_v4(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1276
decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition: threadwise_tensor_slice_transfer.hpp:1267
__device__ void Run(const SrcDesc &, const SrcRefToOriginDisplacement &, const SrcBuffer &src_buf, const DstData &scale, const DstDesc &, const DstOriginIdx &, DstBuffer &dst_buf) const
Definition: threadwise_tensor_slice_transfer.hpp:1487
__device__ void SetSrcCoord(const Index &src_ref_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1693
MultiIndex< nDim > Index
Definition: threadwise_tensor_slice_transfer.hpp:1263
__device__ void MoveSrcSliceWindow(const SrcDesc &, const SrcSliceMoveStepIdx &src_slice_move_step_idx)
Definition: threadwise_tensor_slice_transfer.hpp:1683
Definition: threadwise_tensor_slice_transfer_util.hpp:20
Definition: threadwise_tensor_slice_transfer_util.hpp:29
Definition: data_type.hpp:41
Definition: integral_constant.hpp:20
Definition: is_known_at_compile_time.hpp:14
Definition: data_type.hpp:186
Definition: functional2.hpp:33
Definition: functional3.hpp:97
Definition: unary_element_wise_operation.hpp:241
Definition: unary_element_wise_operation.hpp:277
Definition: unary_element_wise_operation.hpp:133
Definition: dtype_vector.hpp:30
Definition: dtype_vector.hpp:10