30 template <address_space_enum BufferAddressSpace,
32 typename BufferSizeType,
33 bool InvalidElementUseNumericalZeroValue,
44 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
48 InvalidElementUseNumericalZeroValue,
58 : p_data_{}, buffer_size_{}, invalid_element_value_{}
63 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
68 BufferSizeType buffer_size,
69 T invalid_element_value)
70 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
78 return address_space_enum::generic;
91 bool oob_conditional_check =
true,
93 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
98 bool is_valid_element,
106 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
107 "wrong! X should contain multiple T");
111 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
114 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
118 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
123 if constexpr(InvalidElementUseNumericalZeroValue)
125 return X{numeric<remove_cvref_t<T>>::zero()};
129 return X{invalid_element_value_};
138 template <
typename X,
139 bool oob_conditional_check =
true,
141 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
142 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
146 bool is_valid_element,
149 static_assert(
false,
"Error: transpose load not supported in global memory space.");
152 ignore = is_valid_element;
157 template <memory_operation_enum Op,
160 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
161 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
165 if constexpr(Op == memory_operation_enum::set)
167 this->
template set<X>(i, linear_offset, is_valid_element, x);
172 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
173 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
178 template <
typename X,
180 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
190 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
191 "wrong! X should contain multiple T");
195 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
198 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
200 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
219 template <
typename T,
220 typename BufferSizeType,
221 bool InvalidElementUseNumericalZeroValue,
226 InvalidElementUseNumericalZeroValue,
231 T* p_data_ =
nullptr;
239 : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
245 buffer_size_{buffer_size / PackedSize},
247 invalid_element_value_{}
252 BufferSizeType buffer_size,
253 T invalid_element_value)
255 buffer_size_{buffer_size / PackedSize},
257 invalid_element_value_{invalid_element_value}
270 return address_space_enum::global;
282 template <
typename X,
283 bool oob_conditional_check =
true,
285 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
290 bool is_valid_element,
298 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
299 "wrong! X should contain multiple T");
301 #if CK_TILE_USE_AMD_BUFFER_LOAD
302 bool constexpr use_amd_buffer_addressing =
true;
304 bool constexpr use_amd_buffer_addressing =
false;
307 if constexpr(use_amd_buffer_addressing)
309 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
311 if constexpr(InvalidElementUseNumericalZeroValue)
313 return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
316 oob_conditional_check>(
317 p_data_, i + linear_offset, is_valid_element, buffer_size_);
325 oob_conditional_check>(p_data_,
329 invalid_element_value_);
336 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
339 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
343 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
348 if constexpr(InvalidElementUseNumericalZeroValue)
350 return X{numeric<remove_cvref_t<T>>::zero()};
354 return X{invalid_element_value_};
364 template <
typename X,
365 bool oob_conditional_check =
true,
367 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
368 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
372 bool is_valid_element,
375 static_assert(
false,
"Error: transpose load not supported in global memory space.");
378 ignore = is_valid_element;
383 template <
typename X,
384 bool oob_conditional_check =
true,
385 bool pre_nop =
false,
387 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
388 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
393 bool is_valid_element,
400 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
401 "wrong! X should contain multiple T");
403 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
405 amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
410 template <
typename X,
411 bool oob_conditional_check =
true,
413 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
414 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
419 bool is_valid_element,
426 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
427 "wrong! X should contain multiple T");
429 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
430 const int32x4_t src_wave_buffer_resource =
433 amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
435 src_wave_buffer_resource,
443 template <
typename X,
444 bool pre_nop =
false,
446 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
447 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
459 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
460 "wrong! X should contain multiple T");
462 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
464 amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
469 template <memory_operation_enum Op,
471 bool oob_conditional_check =
true,
473 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
474 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
478 bool is_valid_element,
482 if constexpr(Op == memory_operation_enum::set)
484 this->
template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
488 this->
template atomic_add<X, oob_conditional_check>(
489 i, linear_offset, is_valid_element, x);
493 this->
template atomic_max<X, oob_conditional_check>(
494 i, linear_offset, is_valid_element, x);
500 this->
template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
501 this->
template set<X, oob_conditional_check>(
502 i, linear_offset, is_valid_element, x + tmp);
509 template <memory_operation_enum Op,
511 bool oob_conditional_check =
true,
512 bool pre_nop =
false,
514 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
515 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
519 bool is_valid_element,
524 if constexpr(Op == memory_operation_enum::set)
526 this->
template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
530 this->
template atomic_add_raw<X, oob_conditional_check, pre_nop>(
531 i, linear_offset, is_valid_element, x);
540 template <
typename X,
541 bool oob_conditional_check =
true,
543 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
544 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
553 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
554 "wrong! X should contain multiple T");
556 #if CK_TILE_USE_AMD_BUFFER_STORE
557 bool constexpr use_amd_buffer_addressing =
true;
559 bool constexpr use_amd_buffer_addressing =
false;
562 if constexpr(use_amd_buffer_addressing)
564 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
566 amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
567 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
573 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
576 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
578 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
585 template <
typename X,
586 bool oob_conditional_check =
true,
588 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
598 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
599 "wrong! X should contain multiple T");
601 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
602 amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
603 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
606 template <
typename X,
607 bool oob_conditional_check =
true,
609 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
622 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
623 "wrong! X should contain multiple T");
625 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
627 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
628 bool constexpr use_amd_buffer_addressing =
629 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t> ||
630 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
631 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0)
632 #
if defined(__gfx950__)
634 (std::is_same_v<remove_cvref_t<scalar_t>,
bfloat16_t> && scalar_per_x_vector % 2 == 0)
637 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
638 bool constexpr use_amd_buffer_addressing =
639 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t>;
640 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
641 bool constexpr use_amd_buffer_addressing =
642 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
643 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0)
644 #
if defined(__gfx950__)
646 (std::is_same_v<remove_cvref_t<scalar_t>,
bfloat16_t> && scalar_per_x_vector % 2 == 0)
650 bool constexpr use_amd_buffer_addressing =
false;
653 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
655 if constexpr(use_amd_buffer_addressing)
657 amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
658 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
664 atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
669 template <
typename X,
670 bool oob_conditional_check =
true,
673 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
686 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
687 "wrong! X should contain multiple T");
689 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
691 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
693 amd_buffer_atomic_add_raw<remove_cvref_t<T>,
696 oob_conditional_check,
698 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
701 template <
typename X,
702 bool oob_conditional_check =
true,
704 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
715 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
716 "wrong! X should contain multiple T");
718 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
720 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
722 bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>,
double>;
724 bool constexpr use_amd_buffer_addressing =
false;
727 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
729 if constexpr(use_amd_buffer_addressing)
731 amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
732 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
734 else if(is_valid_element)
736 atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
754 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
758 InvalidElementUseNumericalZeroValue,
763 T* p_data_ =
nullptr;
768 : p_data_{}, buffer_size_{}, invalid_element_value_{}
773 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
778 BufferSizeType buffer_size,
779 T invalid_element_value)
780 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
788 return address_space_enum::lds;
800 template <
typename X,
801 bool oob_conditional_check =
true,
803 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
808 bool is_valid_element,
816 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
817 "wrong! X should contain multiple T");
821 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
824 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
829 scalar_per_t_vector * scalar_per_x_vector>;
831 auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
832 return bit_cast<X>(rtn);
837 if constexpr(InvalidElementUseNumericalZeroValue)
839 return X{numeric<remove_cvref_t<T>>::zero()};
843 return X{invalid_element_value_};
849 template <
typename X,
850 bool oob_conditional_check =
true,
851 bool pre_nop =
false,
853 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
854 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
862 smem_load<
sizeof(X)>{}(dst, v_offset *
sizeof(T), i_offset *
sizeof(T));
865 template <
typename X,
867 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
868 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
871 [[maybe_unused]]
index_t linear_offset,
872 bool is_valid_element)
const
879 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
880 "wrong! X should contain multiple T");
884 #if defined(__gfx950__)
885 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
886 return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x>(p_data_ + i +
894 if constexpr(InvalidElementUseNumericalZeroValue)
900 return X{invalid_element_value_};
906 template <memory_operation_enum Op,
909 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
914 if constexpr(Op == memory_operation_enum::set)
916 this->
template set<X>(i, linear_offset, is_valid_element, x);
921 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
922 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
927 template <
typename X,
929 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
939 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
940 "wrong! X should contain multiple T");
942 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
943 bool constexpr workaround_int8_ds_write_issue =
true;
945 bool constexpr workaround_int8_ds_write_issue =
false;
951 workaround_int8_ds_write_issue)
984 "wrong! not implemented for this combination, please add "
997 *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
998 *c_style_pointer_cast<const int8_t*>(&x);
1009 *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
1010 *c_style_pointer_cast<const int16_t*>(&x);
1021 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1022 *c_style_pointer_cast<const int32_t*>(&x);
1033 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1034 *c_style_pointer_cast<const int32x2_t*>(&x);
1045 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1046 *c_style_pointer_cast<const int32x4_t*>(&x);
1055 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1056 *c_style_pointer_cast<const int32_t*>(&x);
1065 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1066 *c_style_pointer_cast<const int32x2_t*>(&x);
1075 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1076 *c_style_pointer_cast<const int32x4_t*>(&x);
1082 if(is_valid_element)
1084 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1087 __builtin_memcpy(&(p_data_[i]), &tmp,
sizeof(X));
1090 scalar_per_t_vector * scalar_per_x_vector>;
1092 *c_style_pointer_cast<buf_t*>(&p_data_[i]) =
reinterpret_cast<const buf_t&
>(x);
1112 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
1116 InvalidElementUseNumericalZeroValue,
1121 T* p_data_ =
nullptr;
1126 : p_data_{}, buffer_size_{}, invalid_element_value_{}
1131 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1136 BufferSizeType buffer_size,
1137 T invalid_element_value)
1138 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1146 return address_space_enum::vgpr;
1158 template <
typename X,
1159 bool oob_conditional_check =
true,
1161 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1163 bool>::type =
false>
1166 bool is_valid_element,
1174 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1175 "wrong! X should contain multiple T");
1177 if(is_valid_element)
1179 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1182 __builtin_memcpy(&tmp, &(p_data_[i]),
sizeof(X));
1186 return *c_style_pointer_cast<const X*>(&p_data_[i]);
1191 if constexpr(InvalidElementUseNumericalZeroValue)
1193 return X{numeric<remove_cvref_t<T>>::zero()};
1197 return X{invalid_element_value_};
1203 template <memory_operation_enum Op,
1206 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1207 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
1208 bool>::type =
false>
1211 if constexpr(Op == memory_operation_enum::set)
1213 this->
template set<X>(i, linear_offset, is_valid_element, x);
1218 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
1219 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
1224 template <
typename X,
1226 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1228 bool>::type =
false>
1236 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1237 "wrong! X should contain multiple T");
1239 if(is_valid_element)
1241 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1244 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
1246 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1258 template <address_space_enum BufferAddressSpace,
1261 typename BufferSizeType>
1267 template <address_space_enum BufferAddressSpace,
1270 typename BufferSizeType,
1272 typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>
::value,
1273 bool>::type =
false>
1278 p, buffer_size, invalid_element_value};
1282 template <address_space_enum BufferAddressSpace,
1284 typename BufferSizeType,
1285 bool InvalidElementUseNumericalZeroValue,
1290 InvalidElementUseNumericalZeroValue,
1293 printf(
"buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ",
1296 print(bv.buffer_size_);
1297 printf(
", invalid_element_value_: ");
1298 print(bv.invalid_element_value_);
constexpr CK_TILE_HOST_DEVICE const char * address_space_to_string(address_space_enum addr_space)
Helper function to convert address space enum to string.
Definition: arch.hpp:301
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_LDS_ADDR
Definition: config.hpp:58
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
Definition: cluster_descriptor.hpp:13
ushort bfloat16_t
Definition: bfloat16.hpp:111
int8_t int8x16_t
Definition: vector_type.hpp:193
int8_t int8x4_t
Definition: vector_type.hpp:191
int8_t int8x8_t
Definition: vector_type.hpp:192
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2580
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1404
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *__restrict__ p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1262
int32_t index_t
Definition: integer.hpp:9
CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &)
Definition: static_encoding_pattern.hpp:341
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
int8_t pk_int4x4_t
Definition: vector_type.hpp:247
int8_t pk_int4x16_t
Definition: vector_type.hpp:249
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:84
int32_t int32_t
Definition: integer.hpp:10
int8_t int8x2_t
Definition: pk_int4.hpp:103
int32_t int32x4_t
Definition: vector_type.hpp:155
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
Definition: amd_buffer_addressing.hpp:97
int8_t pk_int4x8_t
Definition: vector_type.hpp:248
_Float16 half_t
Definition: half.hpp:111
__device__ X atomic_max(X *p_dst, const X &x)
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
__device__ X atomic_add(X *p_dst, const X &x)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1697
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:183
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:62
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:57
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:144
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:74
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:87
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:67
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:209
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:163
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:76
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:96
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:83
T type
Definition: buffer_view.hpp:51
BufferSizeType buffer_size_
Definition: buffer_view.hpp:54
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:206
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:370
int32x4_t cached_buf_res_
Definition: buffer_view.hpp:233
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:744
BufferSizeType buffer_size_
Definition: buffer_view.hpp:232
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:449
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:275
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:263
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:279
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:288
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:476
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:268
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:517
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:416
T type
Definition: buffer_view.hpp:229
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:390
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:613
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:243
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:741
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:591
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:238
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:546
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:708
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:677
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:251
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:932
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:767
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1099
BufferSizeType buffer_size_
Definition: buffer_view.hpp:764
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:777
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1102
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:784
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:806
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:912
constexpr CK_TILE_DEVICE auto transpose_get([[maybe_unused]] index_t i, [[maybe_unused]] index_t linear_offset, bool is_valid_element) const
Definition: buffer_view.hpp:870
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:772
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:793
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:797
T type
Definition: buffer_view.hpp:761
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:856
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:786
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1255
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1130
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:1142
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1252
T type
Definition: buffer_view.hpp:1119
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1135
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:1155
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1229
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:1125
BufferSizeType buffer_size_
Definition: buffer_view.hpp:1122
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:1151
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1209
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1164
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:1144
Definition: buffer_view.hpp:35
Definition: integral_constant.hpp:13
Definition: numeric.hpp:81
Definition: numeric.hpp:18
Definition: pk_int4.hpp:21
Definition: amd_buffer_addressing.hpp:895
Definition: vector_type.hpp:90