31 wave_buffer_resource.
range(
Number<2>{}) = element_space_size *
sizeof(T);
35 return wave_buffer_resource.
content;
50 return wave_buffer_resource.
content;
58 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i8");
64 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i8");
70 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i8");
77 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i16");
83 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i16");
89 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i16");
96 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i32");
102 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i32");
108 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i32");
115 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f16");
121 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f16");
127 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f16");
134 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f32");
140 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f32");
146 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f32");
154 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i8");
161 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i8");
168 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i8");
176 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i16");
183 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i16");
190 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i16");
198 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i32");
205 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i32");
212 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i32");
220 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f16");
227 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f16");
234 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f16");
242 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f32");
249 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f32");
256 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f32");
264 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
272 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.add.i32");
280 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
288 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
290 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
293 index_t src_thread_addr_offset,
296 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
297 "wrong! not implemented");
302 src_thread_addr_offset,
303 src_wave_addr_offset,
304 static_cast<index_t>(coherence));
306 else if constexpr(N == 2)
310 src_thread_addr_offset,
311 src_wave_addr_offset,
312 static_cast<index_t>(coherence));
314 return bit_cast<int8x2_t>(tmp);
316 else if constexpr(N == 4)
319 src_thread_addr_offset,
320 src_wave_addr_offset,
321 static_cast<index_t>(coherence));
323 return bit_cast<int8x4_t>(tmp);
325 else if constexpr(N == 8)
328 src_thread_addr_offset,
329 src_wave_addr_offset,
330 static_cast<index_t>(coherence));
332 return bit_cast<int8x8_t>(tmp);
334 else if constexpr(N == 16)
337 src_thread_addr_offset,
338 src_wave_addr_offset,
339 static_cast<index_t>(coherence));
340 return bit_cast<int8x16_t>(tmp);
342 else if constexpr(N == 32)
345 src_thread_addr_offset,
346 src_wave_addr_offset,
347 static_cast<index_t>(coherence));
350 src_thread_addr_offset,
351 src_wave_addr_offset + 4 *
sizeof(
int32_t),
352 static_cast<index_t>(coherence));
358 return bit_cast<int8x32_t>(tmp);
360 else if constexpr(N == 64)
363 src_thread_addr_offset,
364 src_wave_addr_offset,
365 static_cast<index_t>(coherence));
368 src_thread_addr_offset,
369 src_wave_addr_offset + 4 *
sizeof(
int32_t),
370 static_cast<index_t>(coherence));
373 src_thread_addr_offset,
374 src_wave_addr_offset + 8 *
sizeof(
int32_t),
375 static_cast<index_t>(coherence));
378 src_thread_addr_offset,
379 src_wave_addr_offset + 12 *
sizeof(
int32_t),
380 static_cast<index_t>(coherence));
389 return bit_cast<int8x64_t>(tmp);
393 template <
typename T,
397 index_t src_thread_addr_offset,
412 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
413 "wrong! not implemented");
416 auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
417 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
418 return bit_cast<r_t>(raw_data);
421 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
425 index_t dst_thread_addr_offset,
428 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
429 "wrong! not implemented");
434 dst_wave_buffer_resource,
435 dst_thread_addr_offset,
436 dst_wave_addr_offset,
437 static_cast<index_t>(coherence));
439 else if constexpr(N == 2)
443 dst_wave_buffer_resource,
444 dst_thread_addr_offset,
445 dst_wave_addr_offset,
446 static_cast<index_t>(coherence));
448 else if constexpr(N == 4)
451 dst_wave_buffer_resource,
452 dst_thread_addr_offset,
453 dst_wave_addr_offset,
454 static_cast<index_t>(coherence));
456 else if constexpr(N == 8)
459 dst_wave_buffer_resource,
460 dst_thread_addr_offset,
461 dst_wave_addr_offset,
462 static_cast<index_t>(coherence));
464 else if constexpr(N == 16)
467 dst_wave_buffer_resource,
468 dst_thread_addr_offset,
469 dst_wave_addr_offset,
470 static_cast<index_t>(coherence));
472 else if constexpr(N == 32)
477 dst_wave_buffer_resource,
478 dst_thread_addr_offset,
479 dst_wave_addr_offset,
480 static_cast<index_t>(coherence));
483 dst_wave_buffer_resource,
484 dst_thread_addr_offset,
485 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
486 static_cast<index_t>(coherence));
488 else if constexpr(N == 64)
493 dst_wave_buffer_resource,
494 dst_thread_addr_offset,
495 dst_wave_addr_offset,
496 static_cast<index_t>(coherence));
499 dst_wave_buffer_resource,
500 dst_thread_addr_offset,
501 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
502 static_cast<index_t>(coherence));
505 dst_wave_buffer_resource,
506 dst_thread_addr_offset,
507 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
508 static_cast<index_t>(coherence));
511 dst_wave_buffer_resource,
512 dst_thread_addr_offset,
513 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
514 static_cast<index_t>(coherence));
518 template <
typename T,
523 index_t dst_thread_addr_offset,
535 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
537 "wrong! not implemented");
541 amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
542 dst_wave_buffer_resource,
543 dst_thread_addr_offset,
544 dst_wave_addr_offset);
547 template <
typename T, index_t N>
553 "wrong! not implemented");
559 __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
560 tmp.template AsType<half2_t>()[i]);
563 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
568 __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
569 tmp.template AsType<bhalf2_t>()[i]);
575 template <
typename T, index_t N>
578 index_t dst_thread_addr_offset,
584 "wrong! not implemented");
591 dst_wave_buffer_resource,
592 dst_thread_addr_offset,
593 dst_wave_addr_offset,
596 else if constexpr(N == 2)
601 dst_wave_buffer_resource,
602 dst_thread_addr_offset,
603 dst_wave_addr_offset,
607 dst_wave_buffer_resource,
608 dst_thread_addr_offset,
609 dst_wave_addr_offset +
sizeof(
float),
612 else if constexpr(N == 4)
617 dst_wave_buffer_resource,
618 dst_thread_addr_offset,
619 dst_wave_addr_offset,
623 dst_wave_buffer_resource,
624 dst_thread_addr_offset,
625 dst_wave_addr_offset +
sizeof(
float),
629 dst_wave_buffer_resource,
630 dst_thread_addr_offset,
631 dst_wave_addr_offset + 2 *
sizeof(
float),
635 dst_wave_buffer_resource,
636 dst_thread_addr_offset,
637 dst_wave_addr_offset + 3 *
sizeof(
float),
646 dst_wave_buffer_resource,
647 dst_thread_addr_offset,
648 dst_wave_addr_offset,
651 else if constexpr(N == 4)
657 dst_wave_buffer_resource,
658 dst_thread_addr_offset,
659 dst_wave_addr_offset + i *
sizeof(
half2_t),
663 else if constexpr(N == 8)
669 dst_wave_buffer_resource,
670 dst_thread_addr_offset,
671 dst_wave_addr_offset + i *
sizeof(
half2_t),
681 dst_wave_buffer_resource,
682 dst_thread_addr_offset,
683 dst_wave_addr_offset,
686 else if constexpr(N == 2)
691 dst_wave_buffer_resource,
692 dst_thread_addr_offset,
693 dst_wave_addr_offset,
697 dst_wave_buffer_resource,
698 dst_thread_addr_offset,
699 dst_wave_addr_offset +
sizeof(
int32_t),
702 else if constexpr(N == 4)
707 dst_wave_buffer_resource,
708 dst_thread_addr_offset,
709 dst_wave_addr_offset,
713 dst_wave_buffer_resource,
714 dst_thread_addr_offset,
715 dst_wave_addr_offset +
sizeof(
int32_t),
719 dst_wave_buffer_resource,
720 dst_thread_addr_offset,
721 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
725 dst_wave_buffer_resource,
726 dst_thread_addr_offset,
727 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
733 template <
typename T, index_t N>
736 index_t dst_thread_addr_offset,
740 "wrong! not implemented");
746 dst_wave_buffer_resource,
747 dst_thread_addr_offset,
748 dst_wave_addr_offset,
751 else if constexpr(N == 2)
756 dst_wave_buffer_resource,
757 dst_thread_addr_offset,
758 dst_wave_addr_offset,
762 dst_wave_buffer_resource,
763 dst_thread_addr_offset,
764 dst_wave_addr_offset +
sizeof(
double),
767 else if constexpr(N == 4)
772 dst_wave_buffer_resource,
773 dst_thread_addr_offset,
774 dst_wave_addr_offset,
778 dst_wave_buffer_resource,
779 dst_thread_addr_offset,
780 dst_wave_addr_offset +
sizeof(
double),
784 dst_wave_buffer_resource,
785 dst_thread_addr_offset,
786 dst_wave_addr_offset + 2 *
sizeof(
double),
790 dst_wave_buffer_resource,
791 dst_thread_addr_offset,
792 dst_wave_addr_offset + 3 *
sizeof(
double),
802 template <
typename T,
805 __device__
typename vector_type_maker<T, N>::type::type
807 index_t src_thread_element_offset,
808 bool src_thread_element_valid,
809 index_t src_element_space_size)
811 const int32x4_t src_wave_buffer_resource =
814 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
821 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
822 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
823 return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
824 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
828 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
829 src_wave_buffer_resource, src_thread_addr_offset, 0)};
830 return src_thread_element_valid ? tmp : vector_t(0);
838 template <
typename T,
841 __device__
typename vector_type_maker<T, N>::type::type
843 index_t src_thread_element_offset,
844 bool src_thread_element_valid,
845 index_t src_element_space_size,
848 const int32x4_t src_wave_buffer_resource =
851 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
858 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
859 src_wave_buffer_resource, src_thread_addr_offset, 0)};
861 return src_thread_element_valid ? tmp : vector_t(customized_value);
868 template <
typename T,
873 const index_t dst_thread_element_offset,
874 const bool dst_thread_element_valid,
875 const index_t dst_element_space_size)
877 const int32x4_t dst_wave_buffer_resource =
880 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
886 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
887 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
888 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
889 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
891 if(dst_thread_element_valid)
893 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
894 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
903 template <
typename T, index_t N>
907 const index_t dst_thread_element_offset,
908 const bool dst_thread_element_valid,
909 const index_t dst_element_space_size)
911 const int32x4_t dst_wave_buffer_resource =
914 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
922 if(dst_thread_element_valid)
924 amd_global_atomic_add_impl<scalar_t, vector_size>(
925 src_thread_data, p_dst_wave + dst_thread_element_offset);
930 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
931 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
933 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
934 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
936 if(dst_thread_element_valid)
938 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
939 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
949 template <
typename T, index_t N>
953 const index_t dst_thread_element_offset,
954 const bool dst_thread_element_valid,
955 const index_t dst_element_space_size)
957 const int32x4_t dst_wave_buffer_resource =
960 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
966 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
967 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
969 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
970 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
972 if(dst_thread_element_valid)
974 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
975 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
983 __attribute__((address_space(3)))
uint32_t* lds_ptr,
988 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
990 #ifndef __HIPCC_RTC__
991 template <
typename T, index_t NumElemsPerThread>
997 const index_t src_element_space_size)
1000 constexpr
auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
1001 #if defined(__gfx950__)
1002 constexpr
auto dword_bytes = 4;
1003 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
1004 bytes_per_thread == dword_bytes * 4);
1005 #elif defined(__gfx942__)
1006 constexpr
auto dword_bytes = 4;
1007 static_assert(bytes_per_thread == dword_bytes);
1012 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
1014 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1015 T* lds_ptr = lds_base_ptr + lds_offset;
1016 #ifndef CK_CODE_GEN_RTC
1017 auto const lds_ptr_sgpr =
1018 __builtin_amdgcn_readfirstlane((
reinterpret_cast<uintptr_t>(lds_ptr)));
1020 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((
reinterpret_cast<size_t>(lds_ptr)));
1022 asm volatile(
"s_mov_b32 m0, %0; \n\t"
1023 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
1024 "v"(global_offset_bytes),
1029 __attribute__((address_space(3)))
uint32_t* lds_ptr =
1030 #ifndef CK_CODE_GEN_RTC
1031 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
1032 reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
1034 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
1035 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1039 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:81
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:39
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:871
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:992
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:951
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:521
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition: amd_buffer_coherence.hpp:9
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: dtype_vector.hpp:2162
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:24
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: dtype_vector.hpp:2168
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: dtype_vector.hpp:2177
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:576
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:842
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2146
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
typename vector_type< half_t, 4 >::type half4_t
Definition: dtype_vector.hpp:2155
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:548
_Float16 half_t
Definition: data_type.hpp:31
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
ushort bhalf_t
Definition: data_type.hpp:30
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:396
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: dtype_vector.hpp:2161
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
typename vector_type< float, 4 >::type float4_t
Definition: dtype_vector.hpp:2147
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:905
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
typename vector_type< half_t, 2 >::type half2_t
Definition: dtype_vector.hpp:2154
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:806
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:734
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: dtype_vector.hpp:2169
int32_t index_t
Definition: ck.hpp:301
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: dtype_vector.hpp:2178
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:423
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:292
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
signed short int16_t
Definition: stdint.h:122
_W64 unsigned int uintptr_t
Definition: stdint.h:164
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: integral_constant.hpp:20
Definition: data_type.hpp:39
Definition: functional2.hpp:33
Definition: dtype_vector.hpp:31
Definition: dtype_vector.hpp:11
Definition: amd_buffer_addressing.hpp:12
int32x4_t content
Definition: amd_buffer_addressing.hpp:17
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:20
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing.hpp:13
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:19
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:18