8 #if __clang_major__ >= 20
33 template <address_space_enum BufferAddressSpace,
35 typename BufferSizeType,
36 bool InvalidElementUseNumericalZeroValue,
47 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
51 InvalidElementUseNumericalZeroValue,
61 : p_data_{}, buffer_size_{}, invalid_element_value_{}
66 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
71 BufferSizeType buffer_size,
72 T invalid_element_value)
73 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
81 return address_space_enum::generic;
94 bool oob_conditional_check =
true,
96 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
101 bool is_valid_element,
109 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
110 "wrong! X should contain multiple T");
114 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
117 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
121 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
126 if constexpr(InvalidElementUseNumericalZeroValue)
128 return X{numeric<remove_cvref_t<T>>::zero()};
132 return X{invalid_element_value_};
141 template <
typename X,
142 bool oob_conditional_check =
true,
144 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
145 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
149 bool is_valid_element,
152 static_assert(
false,
"Error: transpose load not supported in global memory space.");
155 ignore = is_valid_element;
160 template <memory_operation_enum Op,
163 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
164 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
168 if constexpr(Op == memory_operation_enum::set)
170 this->
template set<X>(i, linear_offset, is_valid_element, x);
175 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
176 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
181 template <
typename X,
183 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
193 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
194 "wrong! X should contain multiple T");
198 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
201 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
203 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
222 template <
typename T,
223 typename BufferSizeType,
224 bool InvalidElementUseNumericalZeroValue,
229 InvalidElementUseNumericalZeroValue,
234 T* p_data_ =
nullptr;
242 : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
248 buffer_size_{buffer_size / PackedSize},
250 invalid_element_value_{0}
255 BufferSizeType buffer_size,
256 T invalid_element_value)
258 buffer_size_{buffer_size / PackedSize},
260 invalid_element_value_{invalid_element_value}
273 return address_space_enum::global;
285 template <
typename X,
286 bool oob_conditional_check =
true,
288 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
293 bool is_valid_element,
301 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
302 "wrong! X should contain multiple T");
304 #if CK_TILE_USE_AMD_BUFFER_LOAD
305 bool constexpr use_amd_buffer_addressing =
true;
307 bool constexpr use_amd_buffer_addressing =
false;
310 if constexpr(use_amd_buffer_addressing)
312 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
314 if constexpr(InvalidElementUseNumericalZeroValue)
316 return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
319 oob_conditional_check>(
320 p_data_, i + linear_offset, is_valid_element, buffer_size_);
328 oob_conditional_check>(p_data_,
332 invalid_element_value_);
339 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
342 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
346 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
351 if constexpr(InvalidElementUseNumericalZeroValue)
353 return X{numeric<remove_cvref_t<T>>::zero()};
357 return X{invalid_element_value_};
367 template <
typename X,
368 bool oob_conditional_check =
true,
370 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
371 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
375 bool is_valid_element,
378 static_assert(
false,
"Error: transpose load not supported in global memory space.");
381 ignore = is_valid_element;
386 template <
typename X,
387 bool oob_conditional_check =
true,
388 bool pre_nop =
false,
390 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
391 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
396 bool is_valid_element,
403 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
404 "wrong! X should contain multiple T");
406 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
408 amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
413 template <
typename X,
414 bool oob_conditional_check =
true,
416 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
417 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
422 bool is_valid_element,
429 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
430 "wrong! X should contain multiple T");
432 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
433 const int32x4_t src_wave_buffer_resource =
436 amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
438 src_wave_buffer_resource,
446 template <
typename X,
447 bool pre_nop =
false,
449 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
450 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
462 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
463 "wrong! X should contain multiple T");
465 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
467 amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
472 template <memory_operation_enum Op,
474 bool oob_conditional_check =
true,
476 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
477 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
481 bool is_valid_element,
485 if constexpr(Op == memory_operation_enum::set)
487 this->
template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
491 this->
template atomic_add<X, oob_conditional_check>(
492 i, linear_offset, is_valid_element, x);
496 this->
template atomic_max<X, oob_conditional_check>(
497 i, linear_offset, is_valid_element, x);
503 this->
template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
504 this->
template set<X, oob_conditional_check>(
505 i, linear_offset, is_valid_element, x + tmp);
512 template <memory_operation_enum Op,
514 bool oob_conditional_check =
true,
515 bool pre_nop =
false,
517 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
518 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
522 bool is_valid_element,
527 if constexpr(Op == memory_operation_enum::set)
529 this->
template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
533 this->
template atomic_add_raw<X, oob_conditional_check, pre_nop>(
534 i, linear_offset, is_valid_element, x);
543 template <
typename X,
544 bool oob_conditional_check =
true,
546 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
547 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
556 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
557 "wrong! X should contain multiple T");
559 #if CK_TILE_USE_AMD_BUFFER_STORE
560 bool constexpr use_amd_buffer_addressing =
true;
562 bool constexpr use_amd_buffer_addressing =
false;
565 if constexpr(use_amd_buffer_addressing)
567 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
569 amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
570 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
576 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
579 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
581 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
588 template <
typename X,
589 bool oob_conditional_check =
true,
591 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
601 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
602 "wrong! X should contain multiple T");
604 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
605 amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
606 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
609 template <
typename X,
610 bool oob_conditional_check =
true,
612 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
625 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
626 "wrong! X should contain multiple T");
628 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
630 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
631 bool constexpr use_amd_buffer_addressing =
632 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t> ||
633 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
634 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
635 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
636 bool constexpr use_amd_buffer_addressing =
637 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t>;
638 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
639 bool constexpr use_amd_buffer_addressing =
640 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
641 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
643 bool constexpr use_amd_buffer_addressing =
false;
646 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
648 if constexpr(use_amd_buffer_addressing)
650 amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
651 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
657 atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
662 template <
typename X,
663 bool oob_conditional_check =
true,
666 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
679 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
680 "wrong! X should contain multiple T");
682 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
684 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
686 amd_buffer_atomic_add_raw<remove_cvref_t<T>,
689 oob_conditional_check,
691 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
694 template <
typename X,
695 bool oob_conditional_check =
true,
697 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
708 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
709 "wrong! X should contain multiple T");
711 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
713 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
715 bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>,
double>;
717 bool constexpr use_amd_buffer_addressing =
false;
720 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
722 if constexpr(use_amd_buffer_addressing)
724 amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
725 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
727 else if(is_valid_element)
729 atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
747 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
751 InvalidElementUseNumericalZeroValue,
756 T* p_data_ =
nullptr;
761 : p_data_{}, buffer_size_{}, invalid_element_value_{}
766 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
771 BufferSizeType buffer_size,
772 T invalid_element_value)
773 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
781 return address_space_enum::lds;
793 template <
typename X,
794 bool oob_conditional_check =
true,
796 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
801 bool is_valid_element,
809 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
810 "wrong! X should contain multiple T");
814 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
817 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
822 scalar_per_t_vector * scalar_per_x_vector>;
824 auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
825 return bit_cast<X>(rtn);
830 if constexpr(InvalidElementUseNumericalZeroValue)
832 return X{numeric<remove_cvref_t<T>>::zero()};
836 return X{invalid_element_value_};
842 template <
typename X,
843 bool oob_conditional_check =
true,
844 bool pre_nop =
false,
846 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
847 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
855 smem_load<
sizeof(X)>{}(dst, v_offset *
sizeof(T), i_offset *
sizeof(T));
858 template <
typename X,
860 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
861 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
864 [[maybe_unused]]
index_t linear_offset,
865 bool is_valid_element)
const
872 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
873 "wrong! X should contain multiple T");
877 #if defined(__gfx950__)
878 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
879 constexpr address_space_enum addr_space = get_address_space();
880 return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x, addr_space>(
881 p_data_ + i + linear_offset);
888 if constexpr(InvalidElementUseNumericalZeroValue)
894 return X{invalid_element_value_};
900 template <memory_operation_enum Op,
903 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
908 if constexpr(Op == memory_operation_enum::set)
910 this->
template set<X>(i, linear_offset, is_valid_element, x);
915 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
916 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
921 template <
typename X,
923 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
933 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
934 "wrong! X should contain multiple T");
936 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
937 bool constexpr workaround_int8_ds_write_issue =
true;
939 bool constexpr workaround_int8_ds_write_issue =
false;
945 workaround_int8_ds_write_issue)
978 "wrong! not implemented for this combination, please add "
991 *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
992 *c_style_pointer_cast<const int8_t*>(&x);
1003 *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
1004 *c_style_pointer_cast<const int16_t*>(&x);
1015 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1016 *c_style_pointer_cast<const int32_t*>(&x);
1027 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1028 *c_style_pointer_cast<const int32x2_t*>(&x);
1039 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1040 *c_style_pointer_cast<const int32x4_t*>(&x);
1049 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1050 *c_style_pointer_cast<const int32_t*>(&x);
1059 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1060 *c_style_pointer_cast<const int32x2_t*>(&x);
1069 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1070 *c_style_pointer_cast<const int32x4_t*>(&x);
1076 if(is_valid_element)
1078 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1081 __builtin_memcpy(&(p_data_[i]), &tmp,
sizeof(X));
1084 scalar_per_t_vector * scalar_per_x_vector>;
1086 *c_style_pointer_cast<buf_t*>(&p_data_[i]) =
reinterpret_cast<const buf_t&
>(x);
1106 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
1110 InvalidElementUseNumericalZeroValue,
1115 T* p_data_ =
nullptr;
1120 : p_data_{}, buffer_size_{}, invalid_element_value_{}
1125 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1130 BufferSizeType buffer_size,
1131 T invalid_element_value)
1132 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1140 return address_space_enum::vgpr;
1152 template <
typename X,
1153 bool oob_conditional_check =
true,
1155 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1157 bool>::type =
false>
1160 bool is_valid_element,
1168 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1169 "wrong! X should contain multiple T");
1171 if(is_valid_element)
1173 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1176 __builtin_memcpy(&tmp, &(p_data_[i]),
sizeof(X));
1180 return *c_style_pointer_cast<const X*>(&p_data_[i]);
1185 if constexpr(InvalidElementUseNumericalZeroValue)
1187 return X{numeric<remove_cvref_t<T>>::zero()};
1191 return X{invalid_element_value_};
1197 template <memory_operation_enum Op,
1200 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1201 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
1202 bool>::type =
false>
1205 if constexpr(Op == memory_operation_enum::set)
1207 this->
template set<X>(i, linear_offset, is_valid_element, x);
1212 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
1213 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
1218 template <
typename X,
1220 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1222 bool>::type =
false>
1230 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1231 "wrong! X should contain multiple T");
1233 if(is_valid_element)
1235 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1238 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
1240 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1252 template <address_space_enum BufferAddressSpace,
1255 typename BufferSizeType>
1261 template <address_space_enum BufferAddressSpace,
1264 typename BufferSizeType,
1266 typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>
::value,
1267 bool>::type =
false>
1272 p, buffer_size, invalid_element_value};
1276 template <address_space_enum BufferAddressSpace,
1278 typename BufferSizeType,
1279 bool InvalidElementUseNumericalZeroValue,
1284 InvalidElementUseNumericalZeroValue,
1287 printf(
"buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ",
1290 print(bv.buffer_size_);
1291 printf(
", invalid_element_value_: ");
1292 print(bv.invalid_element_value_);
constexpr CK_TILE_HOST_DEVICE const char * address_space_to_string(address_space_enum addr_space)
Helper function to convert address space enum to string.
Definition: arch.hpp:246
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_LDS_ADDR
Definition: config.hpp:58
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
Definition: cluster_descriptor.hpp:13
int8_t int8x16_t
Definition: vector_type.hpp:182
int8_t int8x4_t
Definition: vector_type.hpp:180
int8_t int8x8_t
Definition: vector_type.hpp:181
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2471
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1332
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *__restrict__ p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1256
int32_t index_t
Definition: integer.hpp:9
CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &)
Definition: static_encoding_pattern.hpp:342
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
int8_t pk_int4x4_t
Definition: vector_type.hpp:236
int8_t pk_int4x16_t
Definition: vector_type.hpp:238
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:83
int32_t int32_t
Definition: integer.hpp:10
int8_t int8x2_t
Definition: pk_int4.hpp:103
int32_t int32x4_t
Definition: vector_type.hpp:144
int8_t pk_int4x8_t
Definition: vector_type.hpp:237
_Float16 half_t
Definition: half.hpp:111
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff)
Definition: amd_buffer_addressing.hpp:40
__device__ X atomic_max(X *p_dst, const X &x)
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
__device__ X atomic_add(X *p_dst, const X &x)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:186
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:65
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:60
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:147
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:77
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:90
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:70
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:212
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:166
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:79
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:99
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:86
T type
Definition: buffer_view.hpp:54
BufferSizeType buffer_size_
Definition: buffer_view.hpp:57
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:209
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:373
int32x4_t cached_buf_res_
Definition: buffer_view.hpp:236
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:737
BufferSizeType buffer_size_
Definition: buffer_view.hpp:235
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:452
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:278
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:266
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:282
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:291
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:479
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:271
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:520
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:419
T type
Definition: buffer_view.hpp:232
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:393
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:616
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:246
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:734
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:594
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:241
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:549
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:701
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:670
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:254
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:926
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:760
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1093
BufferSizeType buffer_size_
Definition: buffer_view.hpp:757
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:770
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1096
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:777
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:799
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:906
constexpr CK_TILE_DEVICE auto transpose_get([[maybe_unused]] index_t i, [[maybe_unused]] index_t linear_offset, bool is_valid_element) const
Definition: buffer_view.hpp:863
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:765
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:786
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:790
T type
Definition: buffer_view.hpp:754
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:849
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:779
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1249
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1124
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:1136
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1246
T type
Definition: buffer_view.hpp:1113
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1129
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:1149
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1223
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:1119
BufferSizeType buffer_size_
Definition: buffer_view.hpp:1116
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:1145
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1203
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1158
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:1138
Definition: buffer_view.hpp:38
Definition: integral_constant.hpp:13
Definition: numeric.hpp:81
Definition: numeric.hpp:18
Definition: pk_int4.hpp:21
Definition: amd_buffer_addressing.hpp:832
Definition: vector_type.hpp:89