template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
{
    BufferResource<T> wave_buffer_resource;

    // wavewise base address (64 bit)
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
    // wavewise range (32 bit)
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
    // wavewise setting (32 bit)
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;

    return wave_buffer_resource.content;
}
template <typename T>
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
{
    BufferResource<T> wave_buffer_resource;

    // wavewise base address (64 bit)
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
    // wavewise range (32 bit): maximum range, validity is handled via the offset
    wave_buffer_resource.range(Number<2>{}) = 0xffffffff;
    // wavewise setting (32 bit)
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;

    return wave_buffer_resource.content;
}
template <typename T>
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(T* p_wave,
                                                                index_t element_space_size)
{
    // wavewise base address (64 bit)
    auto p = const_cast<remove_cv_t<T>*>(p_wave);
    // no structured-buffer stride
    int16_t stride = 0;
    // wavewise range (32 bit)
    int32_t num = element_space_size * sizeof(T);
    // wavewise setting (32 bit)
    int32_t flags = CK_BUFFER_RESOURCE_3RD_DWORD;

    return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}
template <typename T>
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new(T* p_wave)
{
    auto p         = const_cast<remove_cv_t<T>*>(p_wave);
    int16_t stride = 0;
    // maximum range: validity is handled via the offset instead
    int32_t num   = 0xffffffff;
    int32_t flags = CK_BUFFER_RESOURCE_3RD_DWORD;

    return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}
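// Usage sketch (illustrative, not part of this header): wrap a wave-uniform
// global pointer in a 128-bit buffer descriptor. Loads whose byte offset falls
// outside `num` return 0 and stores are dropped, which is what the 0x80000000
// out-of-bounds offset trick used below relies on. `p_data` and `n` are
// hypothetical names.
template <typename T>
__device__ void buffer_resource_example(T* p_data, index_t n)
{
    // legacy descriptor held in four SGPR-sized ints
    int32x4_t rsrc_legacy = make_wave_buffer_resource(p_data, n);
    // compiler-managed descriptor built via __builtin_amdgcn_make_buffer_rsrc
    __amdgpu_buffer_rsrc_t rsrc = make_wave_buffer_resource_new(p_data, n);
    (void)rsrc_legacy;
    (void)rsrc;
}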
// buffer atomic-add fp16x2
__device__ half2_t
llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata,
                                         int32x4_t rsrc,
                                         index_t voffset,
                                         index_t soffset,
                                         index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");

// buffer atomic-add i32
__device__ int32_t
llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata,
                                      int32x4_t rsrc,
                                      index_t voffset,
                                      index_t soffset,
                                      index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");

// buffer atomic-add fp32
__device__ float
llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata,
                                       int32x4_t rsrc,
                                       index_t voffset,
                                       index_t soffset,
                                       index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");

// buffer atomic-max fp64
__device__ double
llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                       int32x4_t rsrc,
                                       int voffset,
                                       int soffset,
                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<int8_t, N>::type
amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                         index_t src_thread_addr_offset,
                         index_t src_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");

    if constexpr(N == 1)
    {
        return __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset,
                                                   static_cast<index_t>(coherence));
    }
    else if constexpr(N == 2)
    {
        int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        return bit_cast<int8x2_t>(tmp);
    }
    else if constexpr(N == 4)
    {
        int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        return bit_cast<int8x4_t>(tmp);
    }
    else if constexpr(N == 8)
    {
        int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
                                                             src_thread_addr_offset,
                                                             src_wave_addr_offset,
                                                             static_cast<index_t>(coherence));
        return bit_cast<int8x8_t>(tmp);
    }
    else if constexpr(N == 16)
    {
        int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                              src_thread_addr_offset,
                                                              src_wave_addr_offset,
                                                              static_cast<index_t>(coherence));
        return bit_cast<int8x16_t>(tmp);
    }
    else if constexpr(N == 32)
    {
        // split into two dwordx4 loads
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));

        vector_type<int32_t, 8> tmp;

        tmp.template AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.template AsType<int32x4_t>()(Number<1>{}) = tmp1;

        return bit_cast<int8x32_t>(tmp);
    }
    else if constexpr(N == 64)
    {
        // split into four dwordx4 loads
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        int32x4_t tmp2 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 8 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        int32x4_t tmp3 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 12 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));

        vector_type<int32_t, 16> tmp;

        tmp.template AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.template AsType<int32x4_t>()(Number<1>{}) = tmp1;
        tmp.template AsType<int32x4_t>()(Number<2>{}) = tmp2;
        tmp.template AsType<int32x4_t>()(Number<3>{}) = tmp3;

        return bit_cast<int8x64_t>(tmp);
    }
}
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<T, N>::type
amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                     index_t src_thread_addr_offset,
                     index_t src_wave_addr_offset)
{
    // sizeof(T) * N must map onto one of the widths handled by amd_buffer_load_impl_raw
    static_assert(sizeof(T) * N == 1 || sizeof(T) * N == 2 || sizeof(T) * N == 4 ||
                      sizeof(T) * N == 8 || sizeof(T) * N == 16 || sizeof(T) * N == 32 ||
                      sizeof(T) * N == 64,
                  "wrong! not implemented");

    using r_t = typename vector_type<T, N>::type;

    auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);

    return bit_cast<r_t>(raw_data);
}
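// Usage sketch (illustrative, not part of this header): a dwordx4 load of four
// floats per lane through the raw byte-load path. Both offsets are in bytes;
// `rsrc` should come from make_wave_buffer_resource_new. Names are
// hypothetical.
__device__ auto load_float4_example(__amdgpu_buffer_rsrc_t rsrc, index_t lane_byte_offset)
{
    // returns vector_type<float, 4>::type (16 bytes per lane)
    return amd_buffer_load_impl<float, 4>(rsrc, lane_byte_offset, 0);
}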
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void
amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread_data,
                          __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
                          index_t dst_thread_addr_offset,
                          index_t dst_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");

    if constexpr(N == 1)
    {
        __builtin_amdgcn_raw_buffer_store_b8(src_thread_data,
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             static_cast<index_t>(coherence));
    }
    else if constexpr(N == 2)
    {
        __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 4)
    {
        __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 8)
    {
        __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 16)
    {
        __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
    }
    else if constexpr(N == 32)
    {
        // split into two dwordx4 stores
        vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};

        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
                                               static_cast<index_t>(coherence));
    }
    else if constexpr(N == 64)
    {
        // split into four dwordx4 stores
        vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};

        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
                                               static_cast<index_t>(coherence));
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<2>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 8,
                                               static_cast<index_t>(coherence));
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<3>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 12,
                                               static_cast<index_t>(coherence));
    }
}
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                      __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
{
    // sizeof(T) * N must map onto one of the widths handled by amd_buffer_store_impl_raw
    static_assert(sizeof(T) * N == 1 || sizeof(T) * N == 2 || sizeof(T) * N == 4 ||
                      sizeof(T) * N == 8 || sizeof(T) * N == 16 || sizeof(T) * N == 32 ||
                      sizeof(T) * N == 64,
                  "wrong! not implemented");

    using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;

    amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
                                                        dst_wave_buffer_resource,
                                                        dst_thread_addr_offset,
                                                        dst_wave_addr_offset);
}
template <typename T, index_t N>
__device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           T* addr)
{
    // packed fp16/bf16 adds only, two elements per instruction
    static_assert((is_same<T, half_t>::value || is_same<T, bhalf_t>::value) && (N % 2 == 0),
                  "wrong! not implemented");

    if constexpr(is_same<T, half_t>::value)
    {
        vector_type<half_t, N> tmp{src_thread_data};
        static_for<0, N / 2, 1>{}([&](auto i) {
            __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
                                                      tmp.template AsType<half2_t>()[i]);
        });
    }
#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
    else if constexpr(is_same<T, bhalf_t>::value)
    {
        vector_type<bhalf_t, N> tmp{src_thread_data};
        static_for<0, N / 2, 1>{}([&](auto i) {
            __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
                                                       tmp.template AsType<bhalf2_t>()[i]);
        });
    }
#endif
}
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                           int32x4_t dst_wave_buffer_resource,
                           index_t dst_thread_addr_offset,
                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<float, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<float, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(float),
                                                   0);
            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.template AsType<float>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(float),
                                                   0);
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
                                                     dst_wave_buffer_resource,
                                                     dst_thread_addr_offset,
                                                     dst_wave_addr_offset,
                                                     0);
        }
        else if constexpr(N == 4)
        {
            vector_type<half_t, 4> tmp{src_thread_data};

            static_for<0, 2, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.template AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset +
                                                             i * sizeof(half2_t),
                                                         0);
            });
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

            static_for<0, 4, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.template AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset +
                                                             i * sizeof(half2_t),
                                                         0);
            });
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
        }
        else if constexpr(N == 2)
        {
            vector_type<int32_t, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
        }
        else if constexpr(N == 4)
        {
            vector_type<int32_t, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<2>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
                                                  0);
            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.template AsType<int32_t>()[Number<3>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
                                                  0);
        }
    }
}
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
                           int32x4_t dst_wave_buffer_resource,
                           index_t dst_thread_addr_offset,
                           index_t dst_wave_addr_offset)
{
    static_assert(is_same<T, double>::value && (N == 1 || N == 2 || N == 4),
                  "wrong! not implemented");

    if constexpr(N == 1)
    {
        llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
    }
    else if constexpr(N == 2)
    {
        vector_type<double, 2> tmp{src_thread_data};

        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<0>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<1>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(double),
                                               0);
    }
    else if constexpr(N == 4)
    {
        vector_type<double, 4> tmp{src_thread_data};

        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<0>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               0);
        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<1>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(double),
                                               0);
        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<2>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + 2 * sizeof(double),
                                               0);
        llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.template AsType<double>()[Number<3>{}],
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + 3 * sizeof(double),
                                               0);
    }
}
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                            index_t src_thread_element_offset,
                                            bool src_thread_element_valid,
                                            index_t src_element_space_size)
{
    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0)};
    return src_thread_element_valid ? tmp : vector_t(0);
#endif
}
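// Usage sketch (illustrative, not part of this header): a vectorized load with
// a per-lane validity predicate. With the offset trick enabled, invalid lanes
// get offset 0x80000000, which lands outside the descriptor's range, so the
// hardware returns 0 without a branch or a select. `p_src`, `tid`, and `n` are
// hypothetical names.
__device__ auto guarded_load_example(const float* p_src, index_t tid, index_t n)
{
    // four floats per lane; lanes reading past n yield zeros
    return amd_buffer_load_invalid_element_return_zero<float, 4>(
        p_src, 4 * tid, 4 * tid + 4 <= n, n);
}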
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0)};

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
{
    const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
        make_wave_buffer_resource_new(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}
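// Usage sketch (illustrative, not part of this header): the store-side mirror
// of the guarded load above. Invalid lanes are dropped either by the
// 0x80000000 offset shift or by the explicit branch, depending on
// CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK. Names are
// hypothetical.
__device__ void guarded_store_example(vector_type_maker<float, 4>::type::type v,
                                      float* p_dst,
                                      index_t tid,
                                      index_t n)
{
    amd_buffer_store<float, 4>(v, p_dst, 4 * tid, 4 * tid + 4 <= n, n);
}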
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    // packed fp16/bf16 additions go through global atomics; other types use buffer atomics
    if constexpr(is_same<T, half_t>::value || is_same<T, bhalf_t>::value)
    {
        if(dst_thread_element_valid)
        {
            amd_global_atomic_add_impl<scalar_t, vector_size>(
                src_thread_data, p_dst_wave + dst_thread_element_offset);
        }
    }
    else
    {
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
        uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;

        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
        if(dst_thread_element_valid)
        {
            amd_buffer_atomic_add_impl<scalar_t, vector_size>(
                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
        }
#endif
    }
}
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;

    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}
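// Usage sketch (illustrative, not part of this header): lane-guarded atomic
// accumulation, e.g. the epilogue of a split-K GEMM where several workgroups
// add partial results into the same destination buffer. Names are
// hypothetical.
__device__ void atomic_add_example(float partial, float* p_acc, index_t tid, index_t n)
{
    // one float per lane; out-of-range lanes contribute nothing
    amd_buffer_atomic_add<float, 1>(partial, p_acc, tid, tid < n, n);
}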
__device__ void
llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                __attribute__((address_space(3))) uint32_t* lds_ptr,
                                index_t size,
                                index_t voffset,
                                index_t soffset,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
#ifndef __HIPCC_RTC__
template <typename T, index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                              const index_t global_offset,
                                              T* lds_base_ptr,
                                              const index_t lds_offset,
                                              const bool is_valid,
                                              const index_t src_element_space_size)
{
    const int32x4_t src_resource =
        make_wave_buffer_resource(global_base_ptr, src_element_space_size);

    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
#if defined(__gfx950__)
    constexpr auto dword_bytes = 4;
    static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
                  bytes_per_thread == dword_bytes * 4);
#elif defined(__gfx942__)
    constexpr auto dword_bytes = 4;
    static_assert(bytes_per_thread == dword_bytes);
#endif

    // shift invalid lanes outside the descriptor's range so the load is dropped
    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
    T* lds_ptr = lds_base_ptr + lds_offset;
#ifndef CK_CODE_GEN_RTC
    auto const lds_ptr_sgpr =
        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
#else
    auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
#endif
    asm volatile("s_mov_b32 m0, %0; \n\t"
                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
                 "v"(global_offset_bytes),
                 "s"(src_resource));
#else
    __attribute__((address_space(3))) uint32_t* lds_ptr =
#ifndef CK_CODE_GEN_RTC
        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
#else
        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
            reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
#endif

    llvm_amdgcn_raw_buffer_load_lds(
        src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#endif
}
#endif // __HIPCC_RTC__
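// Usage sketch (illustrative, not part of this header): each lane of a
// 256-thread workgroup copies one dword from global memory straight into LDS,
// bypassing VGPRs. The wave must agree on the LDS base (readfirstlane is
// applied internally on the inline-asm path). `lds_tile` and the other names
// are hypothetical.
__global__ void direct_load_example(const float* p_src, index_t n)
{
    __shared__ float lds_tile[256];

    const index_t tid = threadIdx.x;

    // one float per lane; lanes past n are dropped via the 0x80000000 offset
    amd_direct_load_global_to_lds<float, 1>(p_src, tid, lds_tile, tid, tid < n, n);

    __syncthreads();
    // ... consume lds_tile ...
}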