21 typename ElementSpaceSize,
22 bool InvalidElementUseNumericalZeroValue,
39 __host__ __device__ constexpr
DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
45 ElementSpaceSize element_space_size,
46 T invalid_element_value)
55 return BufferAddressSpace;
67 __host__ __device__ constexpr
auto Get(
index_t i,
bool is_valid_element)
const
74 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
75 "wrong! X should contain multiple T");
77 #if CK_USE_AMD_BUFFER_LOAD
78 bool constexpr use_amd_buffer_addressing =
true;
80 bool constexpr use_amd_buffer_addressing =
false;
85 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
87 if constexpr(InvalidElementUseNumericalZeroValue)
89 return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
96 return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
110 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
113 __builtin_memcpy(&tmp, &(
p_data_[i]),
sizeof(X));
117 return *c_style_pointer_cast<const X*>(&
p_data_[i]);
122 if constexpr(InvalidElementUseNumericalZeroValue)
139 __host__ __device__
void Update(
index_t i,
bool is_valid_element,
const X& x)
143 this->
template Set<X>(i, is_valid_element, x);
147 this->
template AtomicAdd<X>(i, is_valid_element, x);
151 this->
template AtomicMax<X>(i, is_valid_element, x);
155 auto tmp = this->
template Get<X>(i, is_valid_element);
158 if constexpr(is_same_v<scalar_t, bhalf_t>)
164 type_convert<X>(type_convert<float>(x) + type_convert<float>(tmp));
165 this->
template Set<X>(i, is_valid_element, result);
174 auto result = type_convert<scalar_t>(
175 type_convert<float>(a_vector.template AsType<scalar_t>()[idx]) +
176 type_convert<float>(b_vector.template AsType<scalar_t>()[idx]));
177 this->
template Set<scalar_t>(i + idx, is_valid_element, result);
183 this->
template Set<X>(i, is_valid_element, x + tmp);
188 template <
typename DstBuffer, index_t NumElemsPerThread>
192 bool is_valid_element)
const
196 "Source data must come from a global memory buffer.");
198 "Destination data must be stored in an LDS memory buffer.");
200 amd_direct_load_global_to_lds<T, NumElemsPerThread>(
p_data_,
208 template <
typename X,
211 !is_native_type<X>(),
213 __host__ __device__
void Set(
index_t i,
bool is_valid_element,
const X& x)
220 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
221 "wrong! X should contain multiple T");
223 #if CK_USE_AMD_BUFFER_STORE
224 bool constexpr use_amd_buffer_addressing =
true;
226 bool constexpr use_amd_buffer_addressing =
false;
229 #if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
230 bool constexpr workaround_int8_ds_write_issue =
true;
232 bool constexpr workaround_int8_ds_write_issue =
false;
237 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
239 amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
244 workaround_int8_ds_write_issue)
268 "wrong! not implemented for this combination, please add "
276 *c_style_pointer_cast<int8_t*>(&
p_data_[i]) =
277 *c_style_pointer_cast<const int8_t*>(&x);
284 *c_style_pointer_cast<int16_t*>(&
p_data_[i]) =
285 *c_style_pointer_cast<const int16_t*>(&x);
292 *c_style_pointer_cast<int32_t*>(&
p_data_[i]) =
293 *c_style_pointer_cast<const int32_t*>(&x);
300 *c_style_pointer_cast<int32x2_t*>(&
p_data_[i]) =
301 *c_style_pointer_cast<const int32x2_t*>(&x);
308 *c_style_pointer_cast<int32x4_t*>(&
p_data_[i]) =
309 *c_style_pointer_cast<const int32x4_t*>(&x);
316 *c_style_pointer_cast<int32_t*>(&
p_data_[i]) =
317 *c_style_pointer_cast<const int32_t*>(&x);
324 *c_style_pointer_cast<int32x2_t*>(&
p_data_[i]) =
325 *c_style_pointer_cast<const int32x2_t*>(&x);
332 *c_style_pointer_cast<int32x4_t*>(&
p_data_[i]) =
333 *c_style_pointer_cast<const int32x4_t*>(&x);
341 #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
344 __builtin_memcpy(&(
p_data_[i]), &tmp,
sizeof(X));
346 *c_style_pointer_cast<X*>(&
p_data_[i]) = x;
352 template <
typename X,
365 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
366 "wrong! X should contain multiple T");
370 #if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
371 bool constexpr use_amd_buffer_addressing =
372 is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
373 is_same_v<remove_cvref_t<scalar_t>,
float> ||
374 (is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0) ||
376 #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
377 bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
378 #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
379 bool constexpr use_amd_buffer_addressing =
380 is_same_v<remove_cvref_t<scalar_t>,
float> ||
381 (is_same_v<remove_cvref_t<scalar_t>,
half_t> && scalar_per_x_vector % 2 == 0) ||
384 bool constexpr use_amd_buffer_addressing =
false;
387 if constexpr(use_amd_buffer_addressing)
389 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
391 amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
398 atomic_add<X>(c_style_pointer_cast<X*>(&
p_data_[i]), x);
403 template <
typename X,
414 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
415 "wrong! X should contain multiple T");
419 #if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
421 bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>,
double>;
423 bool constexpr use_amd_buffer_addressing =
false;
426 if constexpr(use_amd_buffer_addressing)
428 constexpr
index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
430 amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
433 else if(is_valid_element)
435 atomic_max<X>(c_style_pointer_cast<X*>(&
p_data_[i]), x);
447 typename ElementSpaceSize>
451 p, element_space_size};
458 typename ElementSpaceSize,
460 typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
bool>::type =
false>
461 __host__ __device__ constexpr
auto
465 p, element_space_size, invalid_element_value};
int8_t int8_t
Definition: int8.hpp:20
AmdBufferCoherenceEnum
Definition: amd_buffer_addressing.hpp:295
InMemoryDataOperationEnum
Definition: ck.hpp:267
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: data_type.hpp:2513
typename vector_type< int8_t, 8 >::type int8x8_t
Definition: data_type.hpp:2515
AddressSpaceEnum
Definition: amd_address_space.hpp:15
_Float16 half_t
Definition: data_type.hpp:25
ushort bhalf_t
Definition: data_type.hpp:24
std::enable_if< B, T > enable_if
Definition: enable_if.hpp:10
typename vector_type< int8_t, 16 >::type int8x16_t
Definition: data_type.hpp:2516
constexpr bool is_same_v
Definition: type.hpp:283
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition: type.hpp:300
int32_t index_t
Definition: ck.hpp:289
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: data_type.hpp:2514
__host__ constexpr __device__ auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition: dynamic_buffer.hpp:448
Definition: dynamic_buffer.hpp:25
__host__ constexpr __device__ DynamicBuffer(T *p_data, ElementSpaceSize element_space_size)
Definition: dynamic_buffer.hpp:39
__host__ static constexpr __device__ bool IsStaticBuffer()
Definition: dynamic_buffer.hpp:439
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X &x)
Definition: dynamic_buffer.hpp:356
__host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X &x)
Definition: dynamic_buffer.hpp:407
__host__ constexpr __device__ auto Get(index_t i, bool is_valid_element) const
Definition: dynamic_buffer.hpp:67
ElementSpaceSize element_space_size_
Definition: dynamic_buffer.hpp:29
T * p_data_
Definition: dynamic_buffer.hpp:28
__host__ static constexpr __device__ AddressSpaceEnum GetAddressSpace()
Definition: dynamic_buffer.hpp:53
__host__ constexpr __device__ const T & operator[](index_t i) const
Definition: dynamic_buffer.hpp:58
__host__ constexpr __device__ T & operator()(index_t i)
Definition: dynamic_buffer.hpp:60
__host__ constexpr __device__ DynamicBuffer(T *p_data, ElementSpaceSize element_space_size, T invalid_element_value)
Definition: dynamic_buffer.hpp:44
T invalid_element_value_
Definition: dynamic_buffer.hpp:30
__host__ __device__ void Set(index_t i, bool is_valid_element, const X &x)
Definition: dynamic_buffer.hpp:213
static constexpr index_t PackedSize
Definition: dynamic_buffer.hpp:32
__host__ __device__ void DirectCopyToLds(DstBuffer &dst_buf, index_t src_offset, index_t dst_offset, bool is_valid_element) const
Definition: dynamic_buffer.hpp:189
__host__ static constexpr __device__ bool IsDynamicBuffer()
Definition: dynamic_buffer.hpp:441
T type
Definition: dynamic_buffer.hpp:26
__host__ __device__ void Update(index_t i, bool is_valid_element, const X &x)
Definition: dynamic_buffer.hpp:139
Definition: data_type.hpp:399
Definition: data_type.hpp:320
Definition: data_type.hpp:394
Definition: functional2.hpp:31
Definition: data_type.hpp:347