#if __clang_major__ >= 20
#include "amd_buffer_addressing_builtins.hpp"
#else
#include "amd_buffer_addressing.hpp"
#endif

namespace ck {
template <AddressSpaceEnum BufferAddressSpace,
          typename T,
          typename ElementSpaceSize,
          bool InvalidElementUseNumericalZeroValue,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
          typename IndexType = index_t>
struct DynamicBuffer
{
    using type = T;

    T* p_data_;
    ElementSpaceSize element_space_size_;
    T invalid_element_value_;

    __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
        : p_data_{p_data}, element_space_size_{element_space_size}, invalid_element_value_{0}
    {
    }

    __host__ __device__ constexpr DynamicBuffer(T* p_data,
                                                ElementSpaceSize element_space_size,
                                                T invalid_element_value)
        : p_data_{p_data},
          element_space_size_{element_space_size},
          invalid_element_value_{invalid_element_value}
    {
    }

    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
    {
        return BufferAddressSpace;
    }

    __host__ __device__ constexpr const T& operator[](IndexType i) const { return p_data_[i]; }

    __host__ __device__ constexpr T& operator()(IndexType i) { return p_data_[i]; }
    template <typename X,
              bool DoTranspose = false,
              typename enable_if<
                  is_same<typename scalar_type<remove_cvref_t<T>>::type,
                          typename scalar_type<remove_cvref_t<X>>::type>::value,
                  bool>::type = false>
    __host__ __device__ constexpr auto Get(IndexType i, bool is_valid_element) const
    {
        // X is a vector of T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

#if CK_USE_AMD_BUFFER_LOAD
        bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t);
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            if constexpr(InvalidElementUseNumericalZeroValue)
            {
                return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
                                                                   t_per_x,
                                                                   coherence>(
                    p_data_, i, is_valid_element, element_space_size_);
            }
            else
            {
                return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
                                                                               t_per_x,
                                                                               coherence>(
                    p_data_, i, is_valid_element, element_space_size_, invalid_element_value_);
            }
        }
        else if constexpr(DoTranspose)
        {
#if defined(__gfx12__) // arch guard reconstructed; exact condition elided in source
            return amd_global_load_transpose_to_vgpr(p_data_ + i);
#else
            static_assert(!DoTranspose, "load-with-transpose only supported on gfx12+");
#endif
        }
        else
        {
            if(is_valid_element)
            {
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp;

                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));

                return tmp;
#else
                return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
            }
            else
            {
                if constexpr(InvalidElementUseNumericalZeroValue)
                {
                    return X{0};
                }
                else
                {
                    return X{invalid_element_value_};
                }
            }
        }
    }
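
    // Usage sketch (illustrative, not part of this header): with a global float
    // buffer `buf` created by make_dynamic_buffer below, a 4-wide vector load
    // that returns zero for invalid accesses is
    //
    //   auto v = buf.Get<float4_t>(i, i + 4 <= n);
    //
    // On global memory with 32-bit offsets this lowers to a single buffer_load.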
    template <InMemoryDataOperationEnum Op, typename X>
    __host__ __device__ void Update(IndexType i, bool is_valid_element, const X& x)
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set)
        {
            this->template Set<X>(i, is_valid_element, x);
        }
        else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
        {
            this->template AtomicAdd<X>(i, is_valid_element, x);
        }
        else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
        {
            this->template AtomicMax<X>(i, is_valid_element, x);
        }
        else if constexpr(Op == InMemoryDataOperationEnum::Add)
        {
            auto tmp = this->template Get<X>(i, is_valid_element);

            using scalar_t = typename scalar_type<remove_cvref_t<X>>::type;

            // bhalf_t has no native add, so round-trip through float
            if constexpr(is_same_v<scalar_t, bhalf_t>)
            {
                constexpr index_t vector_size = scalar_type<remove_cvref_t<X>>::vector_size;

                if constexpr(vector_size == 1)
                {
                    auto result =
                        type_convert<X>(type_convert<float>(x) + type_convert<float>(tmp));
                    this->template Set<X>(i, is_valid_element, result);
                }
                else
                {
                    vector_type<scalar_t, vector_size> a_vector{tmp};
                    vector_type<scalar_t, vector_size> b_vector{x};

                    static_for<0, vector_size, 1>{}([&](auto idx) {
                        auto result = type_convert<scalar_t>(
                            type_convert<float>(a_vector.template AsType<scalar_t>()[idx]) +
                            type_convert<float>(b_vector.template AsType<scalar_t>()[idx]));
                        this->template Set<scalar_t>(i + idx, is_valid_element, result);
                    });
                }
            }
            else
            {
                this->template Set<X>(i, is_valid_element, x + tmp);
            }
        }
    }
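
    // Usage sketch: the memory operation is a template parameter, so the dispatch
    // above is resolved at compile time, e.g. (hypothetical names)
    //
    //   c_buf.Update<InMemoryDataOperationEnum::AtomicAdd>(i, is_valid, partial_sum);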
    template <typename DstBuffer, index_t NumElemsPerThread>
    __host__ __device__ void DirectCopyToLds(DstBuffer& dst_buf,
                                             IndexType src_offset,
                                             IndexType dst_offset,
                                             bool is_valid_element) const
    {
        // Copy data from global memory to LDS without going through VGPRs.
        static_assert(GetAddressSpace() == AddressSpaceEnum::Global,
                      "Source data must come from a global memory buffer.");
        static_assert(remove_cvref_t<DstBuffer>::GetAddressSpace() == AddressSpaceEnum::Lds,
                      "Destination data must be stored in an LDS memory buffer.");

        amd_direct_load_global_to_lds<T, NumElemsPerThread>(p_data_,
                                                            src_offset,
                                                            dst_buf.p_data_,
                                                            dst_offset,
                                                            is_valid_element,
                                                            element_space_size_);
    }
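
    // Usage sketch (hypothetical tile size): stage a global tile into LDS without
    // routing it through VGPRs first:
    //
    //   __shared__ float tile[256];
    //   auto lds_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(tile, 256);
    //   glb_buf.DirectCopyToLds<decltype(lds_buf), 4>(lds_buf, src_off, dst_off, true);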
    template <typename X,
              typename enable_if<
                  is_same<typename scalar_type<remove_cvref_t<T>>::type,
                          typename scalar_type<remove_cvref_t<X>>::type>::value ||
                      !is_native_type<X>(),
                  bool>::type = false>
    __host__ __device__ void Set(IndexType i, bool is_valid_element, const X& x)
    {
        // X is a vector of T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

#if CK_USE_AMD_BUFFER_LOAD
        bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t);
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
        bool constexpr workaround_int8_ds_write_issue = true;
#else
        bool constexpr workaround_int8_ds_write_issue = false;
#endif

        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
                          workaround_int8_ds_write_issue)
        {
            if(is_valid_element)
            {
                // workaround for an int8 ds_write compiler issue: reinterpret the
                // store as an integer store of the same total width
                static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
                               is_same<remove_cvref_t<X>, int8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x2_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
                                  (is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                   is_same<remove_cvref_t<X>, int8x16_t>::value),
                              "wrong! not implemented for this combination, please add "
                              "implementation");

                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                             is_same<remove_cvref_t<X>, int8_t>::value)
                {
                    *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int8_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x2_t>::value)
                {
                    *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int16_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
                {
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
                {
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
                {
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
            }
        }
        else
        {
            if(is_valid_element)
            {
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp = x;

                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
            }
        }
    }
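
    // Usage sketch: the write-side mirror of Get (names hypothetical):
    //
    //   buf.Set<float4_t>(i, i + 4 <= n, v); // single buffer_store on global memory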
    template <typename X,
              typename enable_if<
                  is_same<typename scalar_type<remove_cvref_t<T>>::type,
                          typename scalar_type<remove_cvref_t<X>>::type>::value,
                  bool>::type = false>
    __host__ __device__ void AtomicAdd(IndexType i, bool is_valid_element, const X& x)
    {
        using scalar_t = typename scalar_type<remove_cvref_t<X>>::type;

        // X is a vector of T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");

#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
            is_same_v<remove_cvref_t<scalar_t>, float> ||
            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
            (is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
        bool constexpr use_amd_buffer_addressing =
            sizeof(IndexType) <= sizeof(int32_t) && is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
        bool constexpr use_amd_buffer_addressing =
            sizeof(IndexType) <= sizeof(int32_t) &&
            (is_same_v<remove_cvref_t<scalar_t>, float> ||
             (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0) ||
             (is_same_v<remove_cvref_t<scalar_t>, bhalf_t> && scalar_per_x_vector % 2 == 0));
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

        if constexpr(use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else
        {
            if(is_valid_element)
            {
                atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
            }
        }
    }
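
    // Usage sketch (hypothetical split-K reduction): multiple workgroups accumulate
    // partial results into the same global buffer:
    //
    //   c_buf.AtomicAdd<float2_t>(i, is_valid, partial); // buffer_atomic_add if enabled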
    template <typename X,
              typename enable_if<
                  is_same<typename scalar_type<remove_cvref_t<T>>::type,
                          typename scalar_type<remove_cvref_t<X>>::type>::value,
                  bool>::type = false>
    __host__ __device__ void AtomicMax(IndexType i, bool is_valid_element, const X& x)
    {
        // X is a vector of T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;

        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                      "wrong! X should contain multiple T");

        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");

#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
        using scalar_t = typename scalar_type<remove_cvref_t<X>>::type;

        bool constexpr use_amd_buffer_addressing =
            sizeof(IndexType) <= sizeof(int32_t) && is_same_v<remove_cvref_t<scalar_t>, double>;
#else
        bool constexpr use_amd_buffer_addressing = false;
#endif

        if constexpr(use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;

            amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, element_space_size_);
        }
        else if(is_valid_element)
        {
            atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
        }
    }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
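
// Usage sketch: double-precision max-reduction into global memory (names
// hypothetical):
//
//   max_buf.AtomicMax<double>(i, is_valid, thread_local_max);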
template <AddressSpaceEnum BufferAddressSpace,
          typename T,
          typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}

template <AddressSpaceEnum BufferAddressSpace,
          typename T,
          typename ElementSpaceSize>
__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p,
                                                            ElementSpaceSize element_space_size)
{
    // same as make_dynamic_buffer, but with a 64-bit index type
    return DynamicBuffer<BufferAddressSpace,
                         T,
                         ElementSpaceSize,
                         true,
                         AmdBufferCoherenceEnum::DefaultCoherence,
                         long_index_t>{p, element_space_size};
}

template <AddressSpaceEnum BufferAddressSpace,
          typename T,
          typename ElementSpaceSize,
          typename X,
          typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type =
              false>
__host__ __device__ constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
    return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
        p, element_space_size, invalid_element_value};
}

} // namespace ck
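
Taken together, a typical use of this header looks like the sketch below. It is illustrative only: the kernel, launch geometry, and names are hypothetical; only make_dynamic_buffer, Get, and Set come from this file, and float4_t is assumed to be CK's 4-wide float vector alias from dtype_vector.hpp.

#include "ck/ck.hpp"
#include "ck/utility/dynamic_buffer.hpp"

// Hypothetical kernel: copy n floats, four at a time. Out-of-range reads on the
// source return -1.0f (the customized invalid value) rather than zero.
__global__ void copy4_kernel(const float* p_src, float* p_dst, ck::index_t n)
{
    using float4_t = typename ck::vector_type<float, 4>::type;

    auto src = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
        const_cast<float*>(p_src), n, -1.0f);
    auto dst = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_dst, n);

    const ck::index_t i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
    const bool valid    = i + 4 <= n;

    // Both calls lower to single buffer instructions on global memory with
    // 32-bit offsets.
    auto v = src.Get<float4_t>(i, valid);
    dst.Set<float4_t>(i, valid, v);
}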