8 #if __clang_major__ >= 20
33 template <address_space_enum BufferAddressSpace,
35 typename BufferSizeType,
36 bool InvalidElementUseNumericalZeroValue,
47 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
51 InvalidElementUseNumericalZeroValue,
61 : p_data_{}, buffer_size_{}, invalid_element_value_{}
66 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
71 BufferSizeType buffer_size,
72 T invalid_element_value)
73 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
81 return address_space_enum::generic;
94 bool oob_conditional_check =
true,
96 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
101 bool is_valid_element,
109 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
110 "wrong! X should contain multiple T");
114 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
117 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
121 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
126 if constexpr(InvalidElementUseNumericalZeroValue)
128 return X{numeric<remove_cvref_t<T>>::zero()};
132 return X{invalid_element_value_};
141 template <
typename X,
142 bool oob_conditional_check =
true,
144 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
145 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
149 bool is_valid_element,
152 static_assert(
false,
"Error: transpose load not supported in global memory space.");
155 ignore = is_valid_element;
160 template <memory_operation_enum Op,
163 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
164 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
168 if constexpr(Op == memory_operation_enum::set)
170 this->
template set<X>(i, linear_offset, is_valid_element, x);
175 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
176 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
181 template <
typename X,
183 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
193 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
194 "wrong! X should contain multiple T");
198 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
201 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
203 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
222 template <
typename T,
223 typename BufferSizeType,
224 bool InvalidElementUseNumericalZeroValue,
229 InvalidElementUseNumericalZeroValue,
234 T* p_data_ =
nullptr;
242 : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
248 buffer_size_{buffer_size / PackedSize},
250 invalid_element_value_{0}
255 BufferSizeType buffer_size,
256 T invalid_element_value)
258 buffer_size_{buffer_size / PackedSize},
260 invalid_element_value_{invalid_element_value}
273 return address_space_enum::global;
285 template <
typename X,
286 bool oob_conditional_check =
true,
288 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
293 bool is_valid_element,
301 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
302 "wrong! X should contain multiple T");
304 #if CK_TILE_USE_AMD_BUFFER_LOAD
305 bool constexpr use_amd_buffer_addressing =
true;
307 bool constexpr use_amd_buffer_addressing =
false;
310 if constexpr(use_amd_buffer_addressing)
312 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
314 if constexpr(InvalidElementUseNumericalZeroValue)
316 return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
319 oob_conditional_check>(
320 p_data_, i + linear_offset, is_valid_element, buffer_size_);
328 oob_conditional_check>(p_data_,
332 invalid_element_value_);
339 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
342 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
346 return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
351 if constexpr(InvalidElementUseNumericalZeroValue)
353 return X{numeric<remove_cvref_t<T>>::zero()};
357 return X{invalid_element_value_};
367 template <
typename X,
368 bool oob_conditional_check =
true,
370 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
371 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
375 bool is_valid_element,
378 static_assert(
false,
"Error: transpose load not supported in global memory space.");
381 ignore = is_valid_element;
386 template <
typename X,
387 bool oob_conditional_check =
true,
388 bool pre_nop =
false,
390 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
391 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
396 bool is_valid_element,
403 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
404 "wrong! X should contain multiple T");
406 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
408 amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
413 template <
typename X,
414 bool oob_conditional_check =
true,
416 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
417 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
422 bool is_valid_element,
429 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
430 "wrong! X should contain multiple T");
432 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
433 const int32x4_t src_wave_buffer_resource =
436 amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
438 src_wave_buffer_resource,
446 template <
typename X,
447 bool pre_nop =
false,
449 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
450 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
462 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
463 "wrong! X should contain multiple T");
465 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
467 amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
472 template <memory_operation_enum Op,
474 bool oob_conditional_check =
true,
476 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
477 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
481 bool is_valid_element,
485 if constexpr(Op == memory_operation_enum::set)
487 this->
template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
491 this->
template atomic_add<X, oob_conditional_check>(
492 i, linear_offset, is_valid_element, x);
496 this->
template atomic_max<X, oob_conditional_check>(
497 i, linear_offset, is_valid_element, x);
503 this->
template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
504 this->
template set<X, oob_conditional_check>(
505 i, linear_offset, is_valid_element, x + tmp);
512 template <memory_operation_enum Op,
514 bool oob_conditional_check =
true,
515 bool pre_nop =
false,
517 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
518 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
522 bool is_valid_element,
527 if constexpr(Op == memory_operation_enum::set)
529 this->
template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
533 this->
template atomic_add_raw<X, oob_conditional_check, pre_nop>(
534 i, linear_offset, is_valid_element, x);
543 template <
typename X,
544 bool oob_conditional_check =
true,
546 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
547 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
556 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
557 "wrong! X should contain multiple T");
559 #if CK_TILE_USE_AMD_BUFFER_STORE
560 bool constexpr use_amd_buffer_addressing =
true;
562 bool constexpr use_amd_buffer_addressing =
false;
565 if constexpr(use_amd_buffer_addressing)
567 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
569 amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
570 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
576 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
579 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
581 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
588 template <
typename X,
589 bool oob_conditional_check =
true,
591 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
601 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
602 "wrong! X should contain multiple T");
604 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
605 amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
606 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
609 template <
typename X,
610 bool oob_conditional_check =
true,
612 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
625 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
626 "wrong! X should contain multiple T");
628 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
630 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
631 bool constexpr use_amd_buffer_addressing =
632 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t> ||
633 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
634 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
635 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
636 bool constexpr use_amd_buffer_addressing =
637 std::is_same_v<remove_cvref_t<scalar_t>,
int32_t>;
638 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
639 bool constexpr use_amd_buffer_addressing =
640 std::is_same_v<remove_cvref_t<scalar_t>,
float> ||
641 (std::is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0);
643 bool constexpr use_amd_buffer_addressing =
false;
646 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
648 if constexpr(use_amd_buffer_addressing)
650 amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
651 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
657 atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
662 template <
typename X,
663 bool oob_conditional_check =
true,
666 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
679 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
680 "wrong! X should contain multiple T");
682 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
684 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
686 amd_buffer_atomic_add_raw<remove_cvref_t<T>,
689 oob_conditional_check,
691 x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
694 template <
typename X,
695 bool oob_conditional_check =
true,
697 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
708 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
709 "wrong! X should contain multiple T");
711 static_assert(get_address_space() == address_space_enum::global,
"only support global mem");
713 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
715 bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>,
double>;
717 bool constexpr use_amd_buffer_addressing =
false;
720 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
722 if constexpr(use_amd_buffer_addressing)
724 amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
725 x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
727 else if(is_valid_element)
729 atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
747 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
751 InvalidElementUseNumericalZeroValue,
756 T* p_data_ =
nullptr;
761 : p_data_{}, buffer_size_{}, invalid_element_value_{}
766 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
771 BufferSizeType buffer_size,
772 T invalid_element_value)
773 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
781 return address_space_enum::lds;
793 template <
typename X,
794 bool oob_conditional_check =
true,
796 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
801 bool is_valid_element,
809 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
810 "wrong! X should contain multiple T");
814 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
817 __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]),
sizeof(X));
822 scalar_per_t_vector * scalar_per_x_vector>;
824 auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
825 return bit_cast<X>(rtn);
830 if constexpr(InvalidElementUseNumericalZeroValue)
832 return X{numeric<remove_cvref_t<T>>::zero()};
836 return X{invalid_element_value_};
842 template <
typename X,
843 bool oob_conditional_check =
true,
844 bool pre_nop =
false,
846 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
847 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
855 smem_load<
sizeof(X)>{}(dst, v_offset *
sizeof(T), i_offset *
sizeof(T));
858 template <
typename X,
860 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
861 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
864 [[maybe_unused]]
index_t linear_offset,
865 bool is_valid_element)
const
872 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
873 "wrong! X should contain multiple T");
877 #if defined(__gfx950__)
878 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
879 return amd_transpose_load_to_vgpr<remove_cvref_t<T>, t_per_x>(p_data_ + i +
887 if constexpr(InvalidElementUseNumericalZeroValue)
893 return X{invalid_element_value_};
899 template <memory_operation_enum Op,
902 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
907 if constexpr(Op == memory_operation_enum::set)
909 this->
template set<X>(i, linear_offset, is_valid_element, x);
914 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
915 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
920 template <
typename X,
922 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
932 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
933 "wrong! X should contain multiple T");
935 #if CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
936 bool constexpr workaround_int8_ds_write_issue =
true;
938 bool constexpr workaround_int8_ds_write_issue =
false;
944 workaround_int8_ds_write_issue)
977 "wrong! not implemented for this combination, please add "
990 *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
991 *c_style_pointer_cast<const int8_t*>(&x);
1002 *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
1003 *c_style_pointer_cast<const int16_t*>(&x);
1014 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1015 *c_style_pointer_cast<const int32_t*>(&x);
1026 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1027 *c_style_pointer_cast<const int32x2_t*>(&x);
1038 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1039 *c_style_pointer_cast<const int32x4_t*>(&x);
1048 *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
1049 *c_style_pointer_cast<const int32_t*>(&x);
1058 *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
1059 *c_style_pointer_cast<const int32x2_t*>(&x);
1068 *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
1069 *c_style_pointer_cast<const int32x4_t*>(&x);
1075 if(is_valid_element)
1077 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1080 __builtin_memcpy(&(p_data_[i]), &tmp,
sizeof(X));
1083 scalar_per_t_vector * scalar_per_x_vector>;
1085 *c_style_pointer_cast<buf_t*>(&p_data_[i]) =
reinterpret_cast<const buf_t&
>(x);
1105 template <
typename T,
typename BufferSizeType,
bool Inval
idElementUseNumericalZeroValue>
1109 InvalidElementUseNumericalZeroValue,
1114 T* p_data_ =
nullptr;
1119 : p_data_{}, buffer_size_{}, invalid_element_value_{}
1124 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
1129 BufferSizeType buffer_size,
1130 T invalid_element_value)
1131 : p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
1139 return address_space_enum::vgpr;
1151 template <
typename X,
1152 bool oob_conditional_check =
true,
1154 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1156 bool>::type =
false>
1159 bool is_valid_element,
1167 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1168 "wrong! X should contain multiple T");
1170 if(is_valid_element)
1172 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1175 __builtin_memcpy(&tmp, &(p_data_[i]),
sizeof(X));
1179 return *c_style_pointer_cast<const X*>(&p_data_[i]);
1184 if constexpr(InvalidElementUseNumericalZeroValue)
1186 return X{numeric<remove_cvref_t<T>>::zero()};
1190 return X{invalid_element_value_};
1196 template <memory_operation_enum Op,
1199 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1200 typename vector_traits<remove_cvref_t<T>>::scalar_type>
::value,
1201 bool>::type =
false>
1204 if constexpr(Op == memory_operation_enum::set)
1206 this->
template set<X>(i, linear_offset, is_valid_element, x);
1211 auto tmp = this->
template get<X>(i, linear_offset, is_valid_element);
1212 this->
template set<X>(i, linear_offset, is_valid_element, x + tmp);
1217 template <
typename X,
1219 std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
1221 bool>::type =
false>
1229 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
1230 "wrong! X should contain multiple T");
1232 if(is_valid_element)
1234 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
1237 __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp,
sizeof(X));
1239 *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
1251 template <address_space_enum BufferAddressSpace,
1254 typename BufferSizeType>
1260 template <address_space_enum BufferAddressSpace,
1263 typename BufferSizeType,
1265 typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>
::value,
1266 bool>::type =
false>
1271 p, buffer_size, invalid_element_value};
1275 template <address_space_enum BufferAddressSpace,
1277 typename BufferSizeType,
1278 bool InvalidElementUseNumericalZeroValue,
1283 InvalidElementUseNumericalZeroValue,
1286 printf(
"buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ",
1289 print(bv.buffer_size_);
1290 printf(
", invalid_element_value_: ");
1291 print(bv.invalid_element_value_);
constexpr CK_TILE_HOST_DEVICE const char * address_space_to_string(address_space_enum addr_space)
Helper function to convert address space enum to string.
Definition: arch.hpp:248
#define CK_TILE_DEVICE
Definition: config.hpp:41
#define CK_TILE_LDS_ADDR
Definition: config.hpp:58
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:42
Definition: cluster_descriptor.hpp:13
int8_t int8x16_t
Definition: vector_type.hpp:182
int8_t int8x4_t
Definition: vector_type.hpp:180
int8_t int8x8_t
Definition: vector_type.hpp:181
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:2471
int8_t int8_t
Definition: int8.hpp:20
amd_buffer_coherence_enum
Definition: amd_buffer_addressing.hpp:1332
CK_TILE_HOST_DEVICE T add(const T &a, const T &b)
Definition: generic_memory_space_atomic.hpp:16
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view(T *__restrict__ p, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1255
int32_t index_t
Definition: integer.hpp:9
CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding_pattern_2d< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &)
Definition: static_encoding_pattern.hpp:342
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
int8_t pk_int4x4_t
Definition: vector_type.hpp:236
int8_t pk_int4x16_t
Definition: vector_type.hpp:238
constexpr detail::ignore_t ignore
Definition: ignore.hpp:20
typename impl::ext_vector< T, N >::type ext_vector_t
Definition: vector_type.hpp:83
int32_t int32_t
Definition: integer.hpp:10
int8_t int8x2_t
Definition: pk_int4.hpp:103
int32_t int32x4_t
Definition: vector_type.hpp:144
int8_t pk_int4x8_t
Definition: vector_type.hpp:237
_Float16 half_t
Definition: half.hpp:111
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff)
Definition: amd_buffer_addressing.hpp:40
__device__ X atomic_max(X *p_dst, const X &x)
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:24
constexpr bool is_same_v
Definition: type.hpp:283
__device__ X atomic_add(X *p_dst, const X &x)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1350
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:186
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:65
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:60
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:147
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:77
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:90
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:70
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:212
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:166
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:79
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:99
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:86
T type
Definition: buffer_view.hpp:54
BufferSizeType buffer_size_
Definition: buffer_view.hpp:57
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:209
constexpr CK_TILE_DEVICE auto transpose_get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:373
int32x4_t cached_buf_res_
Definition: buffer_view.hpp:236
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:737
BufferSizeType buffer_size_
Definition: buffer_view.hpp:235
constexpr CK_TILE_DEVICE auto async_get_raw(remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:452
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:278
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:266
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:282
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:291
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={})
Definition: buffer_view.hpp:479
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:271
CK_TILE_DEVICE void update_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
Definition: buffer_view.hpp:520
constexpr CK_TILE_DEVICE auto async_get(CK_TILE_LDS_ADDR remove_cvref_t< T > *smem, index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:419
T type
Definition: buffer_view.hpp:232
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool is_valid_element, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:393
CK_TILE_DEVICE void atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:616
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:246
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:734
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:594
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:241
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:549
CK_TILE_DEVICE void atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:701
CK_TILE_DEVICE void atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:670
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:254
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:925
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:760
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1092
BufferSizeType buffer_size_
Definition: buffer_view.hpp:757
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:770
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1095
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:777
constexpr CK_TILE_DEVICE auto get(index_t i, index_t linear_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:799
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:905
constexpr CK_TILE_DEVICE auto transpose_get([[maybe_unused]] index_t i, [[maybe_unused]] index_t linear_offset, bool is_valid_element) const
Definition: buffer_view.hpp:863
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:765
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:786
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:790
T type
Definition: buffer_view.hpp:754
constexpr CK_TILE_DEVICE auto get_raw(remove_cvref_t< X > &dst, index_t v_offset, index_t i_offset, bool, bool_constant< pre_nop >={}) const
Definition: buffer_view.hpp:849
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:779
static constexpr CK_TILE_DEVICE bool is_dynamic_buffer()
Definition: buffer_view.hpp:1248
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size)
Definition: buffer_view.hpp:1123
CK_TILE_HOST_DEVICE void init_raw()
Definition: buffer_view.hpp:1135
static constexpr CK_TILE_DEVICE bool is_static_buffer()
Definition: buffer_view.hpp:1245
T type
Definition: buffer_view.hpp:1112
constexpr CK_TILE_HOST_DEVICE buffer_view(T *__restrict__ p_data, BufferSizeType buffer_size, T invalid_element_value)
Definition: buffer_view.hpp:1128
constexpr CK_TILE_DEVICE T & operator()(index_t i)
Definition: buffer_view.hpp:1148
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1222
constexpr CK_TILE_HOST_DEVICE buffer_view()
Definition: buffer_view.hpp:1118
BufferSizeType buffer_size_
Definition: buffer_view.hpp:1115
constexpr CK_TILE_DEVICE const T & operator[](index_t i) const
Definition: buffer_view.hpp:1144
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X &x)
Definition: buffer_view.hpp:1202
constexpr CK_TILE_DEVICE auto get(index_t i, index_t, bool is_valid_element, bool_constant< oob_conditional_check >={}) const
Definition: buffer_view.hpp:1157
static constexpr CK_TILE_DEVICE address_space_enum get_address_space()
Definition: buffer_view.hpp:1137
Definition: buffer_view.hpp:38
Definition: integral_constant.hpp:13
Definition: numeric.hpp:81
Definition: numeric.hpp:18
Definition: pk_int4.hpp:21
Definition: amd_buffer_addressing.hpp:832
Definition: vector_type.hpp:89