30 wave_buffer_resource.
range(
Number<2>{}) = element_space_size *
sizeof(T);
34 return wave_buffer_resource.
content;
49 return wave_buffer_resource.
content;
57 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i8");
63 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i8");
69 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i8");
76 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i16");
82 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i16");
88 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i16");
95 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i32");
101 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i32");
107 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i32");
114 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f16");
120 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f16");
126 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f16");
133 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f32");
139 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f32");
145 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f32");
153 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i8");
160 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i8");
167 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i8");
175 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i16");
182 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i16");
189 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i16");
197 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i32");
204 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i32");
211 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i32");
219 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f16");
226 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f16");
233 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f16");
241 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f32");
248 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f32");
255 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f32");
263 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
271 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.add.i32");
279 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
287 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
313 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
314 __device__
typename vector_type<int8_t, N>::type
316 index_t src_thread_addr_offset,
319 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
320 "wrong! not implemented");
325 src_thread_addr_offset,
326 src_wave_addr_offset,
327 static_cast<index_t>(coherence));
329 else if constexpr(N == 2)
333 src_thread_addr_offset,
334 src_wave_addr_offset,
335 static_cast<index_t>(coherence));
337 return bit_cast<int8x2_t>(tmp);
339 else if constexpr(N == 4)
342 src_thread_addr_offset,
343 src_wave_addr_offset,
344 static_cast<index_t>(coherence));
346 return bit_cast<int8x4_t>(tmp);
348 else if constexpr(N == 8)
351 src_thread_addr_offset,
352 src_wave_addr_offset,
353 static_cast<index_t>(coherence));
355 return bit_cast<int8x8_t>(tmp);
357 else if constexpr(N == 16)
360 src_thread_addr_offset,
361 src_wave_addr_offset,
362 static_cast<index_t>(coherence));
363 return bit_cast<int8x16_t>(tmp);
365 else if constexpr(N == 32)
368 src_thread_addr_offset,
369 src_wave_addr_offset,
370 static_cast<index_t>(coherence));
373 src_thread_addr_offset,
374 src_wave_addr_offset + 4 *
sizeof(
int32_t),
375 static_cast<index_t>(coherence));
381 return bit_cast<int8x32_t>(tmp);
383 else if constexpr(N == 64)
386 src_thread_addr_offset,
387 src_wave_addr_offset,
388 static_cast<index_t>(coherence));
391 src_thread_addr_offset,
392 src_wave_addr_offset + 4 *
sizeof(
int32_t),
393 static_cast<index_t>(coherence));
396 src_thread_addr_offset,
397 src_wave_addr_offset + 8 *
sizeof(
int32_t),
398 static_cast<index_t>(coherence));
401 src_thread_addr_offset,
402 src_wave_addr_offset + 12 *
sizeof(
int32_t),
403 static_cast<index_t>(coherence));
412 return bit_cast<int8x64_t>(tmp);
416 template <
typename T,
420 index_t src_thread_addr_offset,
435 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
436 "wrong! not implemented");
439 auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
440 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
441 return bit_cast<r_t>(raw_data);
444 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
448 index_t dst_thread_addr_offset,
451 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
452 "wrong! not implemented");
457 dst_wave_buffer_resource,
458 dst_thread_addr_offset,
459 dst_wave_addr_offset,
460 static_cast<index_t>(coherence));
462 else if constexpr(N == 2)
466 dst_wave_buffer_resource,
467 dst_thread_addr_offset,
468 dst_wave_addr_offset,
469 static_cast<index_t>(coherence));
471 else if constexpr(N == 4)
474 dst_wave_buffer_resource,
475 dst_thread_addr_offset,
476 dst_wave_addr_offset,
477 static_cast<index_t>(coherence));
479 else if constexpr(N == 8)
482 dst_wave_buffer_resource,
483 dst_thread_addr_offset,
484 dst_wave_addr_offset,
485 static_cast<index_t>(coherence));
487 else if constexpr(N == 16)
490 dst_wave_buffer_resource,
491 dst_thread_addr_offset,
492 dst_wave_addr_offset,
493 static_cast<index_t>(coherence));
495 else if constexpr(N == 32)
500 dst_wave_buffer_resource,
501 dst_thread_addr_offset,
502 dst_wave_addr_offset,
503 static_cast<index_t>(coherence));
506 dst_wave_buffer_resource,
507 dst_thread_addr_offset,
508 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
509 static_cast<index_t>(coherence));
511 else if constexpr(N == 64)
516 dst_wave_buffer_resource,
517 dst_thread_addr_offset,
518 dst_wave_addr_offset,
519 static_cast<index_t>(coherence));
522 dst_wave_buffer_resource,
523 dst_thread_addr_offset,
524 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
525 static_cast<index_t>(coherence));
528 dst_wave_buffer_resource,
529 dst_thread_addr_offset,
530 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
531 static_cast<index_t>(coherence));
534 dst_wave_buffer_resource,
535 dst_thread_addr_offset,
536 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
537 static_cast<index_t>(coherence));
541 template <
typename T,
546 index_t dst_thread_addr_offset,
558 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
560 "wrong! not implemented");
564 amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
565 dst_wave_buffer_resource,
566 dst_thread_addr_offset,
567 dst_wave_addr_offset);
570 template <
typename T, index_t N>
576 "wrong! not implemented");
582 __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
583 tmp.template AsType<half2_t>()[i]);
586 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
591 __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
592 tmp.template AsType<bhalf2_t>()[i]);
598 template <
typename T, index_t N>
601 index_t dst_thread_addr_offset,
607 "wrong! not implemented");
614 dst_wave_buffer_resource,
615 dst_thread_addr_offset,
616 dst_wave_addr_offset,
619 else if constexpr(N == 2)
624 dst_wave_buffer_resource,
625 dst_thread_addr_offset,
626 dst_wave_addr_offset,
630 dst_wave_buffer_resource,
631 dst_thread_addr_offset,
632 dst_wave_addr_offset +
sizeof(
float),
635 else if constexpr(N == 4)
640 dst_wave_buffer_resource,
641 dst_thread_addr_offset,
642 dst_wave_addr_offset,
646 dst_wave_buffer_resource,
647 dst_thread_addr_offset,
648 dst_wave_addr_offset +
sizeof(
float),
652 dst_wave_buffer_resource,
653 dst_thread_addr_offset,
654 dst_wave_addr_offset + 2 *
sizeof(
float),
658 dst_wave_buffer_resource,
659 dst_thread_addr_offset,
660 dst_wave_addr_offset + 3 *
sizeof(
float),
669 dst_wave_buffer_resource,
670 dst_thread_addr_offset,
671 dst_wave_addr_offset,
674 else if constexpr(N == 4)
680 dst_wave_buffer_resource,
681 dst_thread_addr_offset,
682 dst_wave_addr_offset + i *
sizeof(
half2_t),
686 else if constexpr(N == 8)
692 dst_wave_buffer_resource,
693 dst_thread_addr_offset,
694 dst_wave_addr_offset + i *
sizeof(
half2_t),
704 dst_wave_buffer_resource,
705 dst_thread_addr_offset,
706 dst_wave_addr_offset,
709 else if constexpr(N == 2)
714 dst_wave_buffer_resource,
715 dst_thread_addr_offset,
716 dst_wave_addr_offset,
720 dst_wave_buffer_resource,
721 dst_thread_addr_offset,
722 dst_wave_addr_offset +
sizeof(
int32_t),
725 else if constexpr(N == 4)
730 dst_wave_buffer_resource,
731 dst_thread_addr_offset,
732 dst_wave_addr_offset,
736 dst_wave_buffer_resource,
737 dst_thread_addr_offset,
738 dst_wave_addr_offset +
sizeof(
int32_t),
742 dst_wave_buffer_resource,
743 dst_thread_addr_offset,
744 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
748 dst_wave_buffer_resource,
749 dst_thread_addr_offset,
750 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
756 template <
typename T, index_t N>
759 index_t dst_thread_addr_offset,
763 "wrong! not implemented");
769 dst_wave_buffer_resource,
770 dst_thread_addr_offset,
771 dst_wave_addr_offset,
774 else if constexpr(N == 2)
779 dst_wave_buffer_resource,
780 dst_thread_addr_offset,
781 dst_wave_addr_offset,
785 dst_wave_buffer_resource,
786 dst_thread_addr_offset,
787 dst_wave_addr_offset +
sizeof(
double),
790 else if constexpr(N == 4)
795 dst_wave_buffer_resource,
796 dst_thread_addr_offset,
797 dst_wave_addr_offset,
801 dst_wave_buffer_resource,
802 dst_thread_addr_offset,
803 dst_wave_addr_offset +
sizeof(
double),
807 dst_wave_buffer_resource,
808 dst_thread_addr_offset,
809 dst_wave_addr_offset + 2 *
sizeof(
double),
813 dst_wave_buffer_resource,
814 dst_thread_addr_offset,
815 dst_wave_addr_offset + 3 *
sizeof(
double),
825 template <
typename T,
828 __device__
typename vector_type_maker<T, N>::type::type
830 index_t src_thread_element_offset,
831 bool src_thread_element_valid,
832 index_t src_element_space_size)
834 const int32x4_t src_wave_buffer_resource =
837 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
844 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
845 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
846 return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
847 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
851 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
852 src_wave_buffer_resource, src_thread_addr_offset, 0)};
853 return src_thread_element_valid ? tmp : vector_t(0);
861 template <
typename T,
864 __device__
typename vector_type_maker<T, N>::type::type
866 index_t src_thread_element_offset,
867 bool src_thread_element_valid,
868 index_t src_element_space_size,
871 const int32x4_t src_wave_buffer_resource =
874 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
881 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
882 src_wave_buffer_resource, src_thread_addr_offset, 0)};
884 return src_thread_element_valid ? tmp : vector_t(customized_value);
891 template <
typename T,
896 const index_t dst_thread_element_offset,
897 const bool dst_thread_element_valid,
898 const index_t dst_element_space_size)
900 const int32x4_t dst_wave_buffer_resource =
903 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
909 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
910 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
911 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
912 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
914 if(dst_thread_element_valid)
916 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
917 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
926 template <
typename T, index_t N>
930 const index_t dst_thread_element_offset,
931 const bool dst_thread_element_valid,
932 const index_t dst_element_space_size)
934 const int32x4_t dst_wave_buffer_resource =
937 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
945 if(dst_thread_element_valid)
947 amd_global_atomic_add_impl<scalar_t, vector_size>(
948 src_thread_data, p_dst_wave + dst_thread_element_offset);
953 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
954 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
956 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
957 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
959 if(dst_thread_element_valid)
961 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
962 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
972 template <
typename T, index_t N>
976 const index_t dst_thread_element_offset,
977 const bool dst_thread_element_valid,
978 const index_t dst_element_space_size)
980 const int32x4_t dst_wave_buffer_resource =
983 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
989 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
990 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
992 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
993 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
995 if(dst_thread_element_valid)
997 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
998 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
1006 __attribute__((address_space(3)))
uint32_t* lds_ptr,
1011 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
1013 #ifndef __HIPCC_RTC__
1014 template <
typename T, index_t NumElemsPerThread>
1019 const bool is_valid,
1020 const index_t src_element_space_size)
1023 constexpr
auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
1024 #if defined(__gfx950__)
1025 constexpr
auto dword_bytes = 4;
1026 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
1027 bytes_per_thread == dword_bytes * 4);
1028 #elif defined(__gfx942__)
1029 constexpr
auto dword_bytes = 4;
1030 static_assert(bytes_per_thread == dword_bytes);
1035 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
1037 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1038 T* lds_ptr = lds_base_ptr + lds_offset;
1039 #ifndef CK_CODE_GEN_RTC
1040 auto const lds_ptr_sgpr =
1041 __builtin_amdgcn_readfirstlane((
reinterpret_cast<uintptr_t>(lds_ptr)));
1043 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((
reinterpret_cast<size_t>(lds_ptr)));
1045 asm volatile(
"s_mov_b32 m0, %0; \n\t"
1046 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
1047 "v"(global_offset_bytes),
1052 __attribute__((address_space(3)))
uint32_t* lds_ptr =
1053 #ifndef CK_CODE_GEN_RTC
1054 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
1055 reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
1057 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
1058 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1062 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:79
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:38
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:894
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:1015
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:974
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:544
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition: amd_buffer_addressing.hpp:295
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: dtype_vector.hpp:2147
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:23
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: dtype_vector.hpp:2153
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: dtype_vector.hpp:2162
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:599
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:865
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2131
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
typename vector_type< half_t, 4 >::type half4_t
Definition: dtype_vector.hpp:2140
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:571
_Float16 half_t
Definition: data_type.hpp:30
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
ushort bhalf_t
Definition: data_type.hpp:29
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:419
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: dtype_vector.hpp:2146
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
typename vector_type< float, 4 >::type float4_t
Definition: dtype_vector.hpp:2132
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:928
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
typename vector_type< half_t, 2 >::type half2_t
Definition: dtype_vector.hpp:2139
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:829
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:757
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: dtype_vector.hpp:2154
int32_t index_t
Definition: ck.hpp:298
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: dtype_vector.hpp:2163
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:446
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:315
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
signed short int16_t
Definition: stdint.h:122
_W64 unsigned int uintptr_t
Definition: stdint.h:165
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: integral_constant.hpp:20
Definition: data_type.hpp:38
Definition: functional2.hpp:33
Definition: dtype_vector.hpp:30
Definition: dtype_vector.hpp:10
Definition: amd_buffer_addressing.hpp:11
int32x4_t content
Definition: amd_buffer_addressing.hpp:16
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:19
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing.hpp:12
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:18
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:17