30 wave_buffer_resource.
range(
Number<2>{}) = element_space_size *
sizeof(T);
34 return wave_buffer_resource.
content;
49 return wave_buffer_resource.
content;
57 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i8");
63 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i8");
69 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i8");
76 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i16");
82 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i16");
88 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i16");
95 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.i32");
101 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2i32");
107 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4i32");
114 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f16");
120 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f16");
126 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f16");
133 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.f32");
139 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v2f32");
145 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.load.v4f32");
153 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i8");
160 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i8");
167 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i8");
175 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i16");
182 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i16");
189 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i16");
197 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.i32");
204 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2i32");
211 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4i32");
219 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f16");
226 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f16");
233 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f16");
241 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.f32");
248 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v2f32");
255 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.store.v4f32");
263 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
271 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.add.i32");
279 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
287 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
313 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
314 __device__
typename vector_type<int8_t, N>::type
316 index_t src_thread_addr_offset,
319 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
320 "wrong! not implemented");
325 src_thread_addr_offset,
326 src_wave_addr_offset,
327 static_cast<index_t>(coherence));
329 else if constexpr(N == 2)
333 src_thread_addr_offset,
334 src_wave_addr_offset,
335 static_cast<index_t>(coherence));
337 return bit_cast<int8x2_t>(tmp);
339 else if constexpr(N == 4)
342 src_thread_addr_offset,
343 src_wave_addr_offset,
344 static_cast<index_t>(coherence));
346 return bit_cast<int8x4_t>(tmp);
348 else if constexpr(N == 8)
351 src_thread_addr_offset,
352 src_wave_addr_offset,
353 static_cast<index_t>(coherence));
355 return bit_cast<int8x8_t>(tmp);
357 else if constexpr(N == 16)
360 src_thread_addr_offset,
361 src_wave_addr_offset,
362 static_cast<index_t>(coherence));
363 return bit_cast<int8x16_t>(tmp);
365 else if constexpr(N == 32)
368 src_thread_addr_offset,
369 src_wave_addr_offset,
370 static_cast<index_t>(coherence));
373 src_thread_addr_offset,
374 src_wave_addr_offset + 4 *
sizeof(int32_t),
375 static_cast<index_t>(coherence));
381 return bit_cast<int8x32_t>(tmp);
383 else if constexpr(N == 64)
386 src_thread_addr_offset,
387 src_wave_addr_offset,
388 static_cast<index_t>(coherence));
391 src_thread_addr_offset,
392 src_wave_addr_offset + 4 *
sizeof(int32_t),
393 static_cast<index_t>(coherence));
396 src_thread_addr_offset,
397 src_wave_addr_offset + 8 *
sizeof(int32_t),
398 static_cast<index_t>(coherence));
401 src_thread_addr_offset,
402 src_wave_addr_offset + 12 *
sizeof(int32_t),
403 static_cast<index_t>(coherence));
412 return bit_cast<int8x64_t>(tmp);
416 template <
typename T,
420 index_t src_thread_addr_offset,
434 "wrong! not implemented");
437 auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
438 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
439 return bit_cast<r_t>(raw_data);
442 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
446 index_t dst_thread_addr_offset,
449 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
450 "wrong! not implemented");
455 dst_wave_buffer_resource,
456 dst_thread_addr_offset,
457 dst_wave_addr_offset,
458 static_cast<index_t>(coherence));
460 else if constexpr(N == 2)
464 dst_wave_buffer_resource,
465 dst_thread_addr_offset,
466 dst_wave_addr_offset,
467 static_cast<index_t>(coherence));
469 else if constexpr(N == 4)
472 dst_wave_buffer_resource,
473 dst_thread_addr_offset,
474 dst_wave_addr_offset,
475 static_cast<index_t>(coherence));
477 else if constexpr(N == 8)
480 dst_wave_buffer_resource,
481 dst_thread_addr_offset,
482 dst_wave_addr_offset,
483 static_cast<index_t>(coherence));
485 else if constexpr(N == 16)
488 dst_wave_buffer_resource,
489 dst_thread_addr_offset,
490 dst_wave_addr_offset,
491 static_cast<index_t>(coherence));
493 else if constexpr(N == 32)
498 dst_wave_buffer_resource,
499 dst_thread_addr_offset,
500 dst_wave_addr_offset,
501 static_cast<index_t>(coherence));
504 dst_wave_buffer_resource,
505 dst_thread_addr_offset,
506 dst_wave_addr_offset +
sizeof(int32_t) * 4,
507 static_cast<index_t>(coherence));
509 else if constexpr(N == 64)
514 dst_wave_buffer_resource,
515 dst_thread_addr_offset,
516 dst_wave_addr_offset,
517 static_cast<index_t>(coherence));
520 dst_wave_buffer_resource,
521 dst_thread_addr_offset,
522 dst_wave_addr_offset +
sizeof(int32_t) * 4,
523 static_cast<index_t>(coherence));
526 dst_wave_buffer_resource,
527 dst_thread_addr_offset,
528 dst_wave_addr_offset +
sizeof(int32_t) * 8,
529 static_cast<index_t>(coherence));
532 dst_wave_buffer_resource,
533 dst_thread_addr_offset,
534 dst_wave_addr_offset +
sizeof(int32_t) * 12,
535 static_cast<index_t>(coherence));
539 template <
typename T,
544 index_t dst_thread_addr_offset,
556 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
558 "wrong! not implemented");
562 amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
563 dst_wave_buffer_resource,
564 dst_thread_addr_offset,
565 dst_wave_addr_offset);
568 template <
typename T, index_t N>
574 "wrong! not implemented");
580 __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
581 tmp.template AsType<half2_t>()[i]);
584 #if defined(__gfx942__) || defined(__gfx950__)
589 __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
590 tmp.template AsType<bhalf2_t>()[i]);
596 template <
typename T, index_t N>
599 index_t dst_thread_addr_offset,
605 "wrong! not implemented");
612 dst_wave_buffer_resource,
613 dst_thread_addr_offset,
614 dst_wave_addr_offset,
617 else if constexpr(N == 2)
622 dst_wave_buffer_resource,
623 dst_thread_addr_offset,
624 dst_wave_addr_offset,
628 dst_wave_buffer_resource,
629 dst_thread_addr_offset,
630 dst_wave_addr_offset +
sizeof(
float),
633 else if constexpr(N == 4)
638 dst_wave_buffer_resource,
639 dst_thread_addr_offset,
640 dst_wave_addr_offset,
644 dst_wave_buffer_resource,
645 dst_thread_addr_offset,
646 dst_wave_addr_offset +
sizeof(
float),
650 dst_wave_buffer_resource,
651 dst_thread_addr_offset,
652 dst_wave_addr_offset + 2 *
sizeof(
float),
656 dst_wave_buffer_resource,
657 dst_thread_addr_offset,
658 dst_wave_addr_offset + 3 *
sizeof(
float),
667 dst_wave_buffer_resource,
668 dst_thread_addr_offset,
669 dst_wave_addr_offset,
672 else if constexpr(N == 4)
678 dst_wave_buffer_resource,
679 dst_thread_addr_offset,
680 dst_wave_addr_offset + i *
sizeof(
half2_t),
684 else if constexpr(N == 8)
690 dst_wave_buffer_resource,
691 dst_thread_addr_offset,
692 dst_wave_addr_offset + i *
sizeof(
half2_t),
702 dst_wave_buffer_resource,
703 dst_thread_addr_offset,
704 dst_wave_addr_offset,
707 else if constexpr(N == 2)
712 dst_wave_buffer_resource,
713 dst_thread_addr_offset,
714 dst_wave_addr_offset,
718 dst_wave_buffer_resource,
719 dst_thread_addr_offset,
720 dst_wave_addr_offset +
sizeof(int32_t),
723 else if constexpr(N == 4)
728 dst_wave_buffer_resource,
729 dst_thread_addr_offset,
730 dst_wave_addr_offset,
734 dst_wave_buffer_resource,
735 dst_thread_addr_offset,
736 dst_wave_addr_offset +
sizeof(int32_t),
740 dst_wave_buffer_resource,
741 dst_thread_addr_offset,
742 dst_wave_addr_offset + 2 *
sizeof(int32_t),
746 dst_wave_buffer_resource,
747 dst_thread_addr_offset,
748 dst_wave_addr_offset + 3 *
sizeof(int32_t),
754 template <
typename T, index_t N>
757 index_t dst_thread_addr_offset,
761 "wrong! not implemented");
767 dst_wave_buffer_resource,
768 dst_thread_addr_offset,
769 dst_wave_addr_offset,
772 else if constexpr(N == 2)
777 dst_wave_buffer_resource,
778 dst_thread_addr_offset,
779 dst_wave_addr_offset,
783 dst_wave_buffer_resource,
784 dst_thread_addr_offset,
785 dst_wave_addr_offset +
sizeof(
double),
788 else if constexpr(N == 4)
793 dst_wave_buffer_resource,
794 dst_thread_addr_offset,
795 dst_wave_addr_offset,
799 dst_wave_buffer_resource,
800 dst_thread_addr_offset,
801 dst_wave_addr_offset +
sizeof(
double),
805 dst_wave_buffer_resource,
806 dst_thread_addr_offset,
807 dst_wave_addr_offset + 2 *
sizeof(
double),
811 dst_wave_buffer_resource,
812 dst_thread_addr_offset,
813 dst_wave_addr_offset + 3 *
sizeof(
double),
823 template <
typename T,
826 __device__
typename vector_type_maker<T, N>::type::type
828 index_t src_thread_element_offset,
829 bool src_thread_element_valid,
830 index_t src_element_space_size)
832 const int32x4_t src_wave_buffer_resource =
835 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
842 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
843 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
844 return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
845 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
849 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
850 src_wave_buffer_resource, src_thread_addr_offset, 0)};
851 return src_thread_element_valid ? tmp : vector_t(0);
859 template <
typename T,
862 __device__
typename vector_type_maker<T, N>::type::type
864 index_t src_thread_element_offset,
865 bool src_thread_element_valid,
866 index_t src_element_space_size,
869 const int32x4_t src_wave_buffer_resource =
872 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
879 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
880 src_wave_buffer_resource, src_thread_addr_offset, 0)};
882 return src_thread_element_valid ? tmp : vector_t(customized_value);
889 template <
typename T,
894 const index_t dst_thread_element_offset,
895 const bool dst_thread_element_valid,
896 const index_t dst_element_space_size)
898 const int32x4_t dst_wave_buffer_resource =
901 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
907 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
908 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
909 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
910 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
912 if(dst_thread_element_valid)
914 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
915 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
924 template <
typename T, index_t N>
928 const index_t dst_thread_element_offset,
929 const bool dst_thread_element_valid,
930 const index_t dst_element_space_size)
932 const int32x4_t dst_wave_buffer_resource =
935 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
943 if(dst_thread_element_valid)
945 amd_global_atomic_add_impl<scalar_t, vector_size>(
946 src_thread_data, p_dst_wave + dst_thread_element_offset);
951 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
952 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
954 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
955 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
957 if(dst_thread_element_valid)
959 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
960 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
970 template <
typename T, index_t N>
974 const index_t dst_thread_element_offset,
975 const bool dst_thread_element_valid,
976 const index_t dst_element_space_size)
978 const int32x4_t dst_wave_buffer_resource =
981 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
987 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
988 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
990 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
991 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
993 if(dst_thread_element_valid)
995 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
996 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
1004 __attribute__((address_space(3))) uint32_t* lds_ptr,
1009 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
1011 template <
typename T, index_t NumElemsPerThread>
1016 const bool is_valid,
1017 const index_t src_element_space_size)
1020 constexpr
auto dword_bytes = 4;
1021 constexpr
auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
1022 static_assert(bytes_per_thread == dword_bytes);
1024 #ifndef CK_CODE_GEN_RTC
1025 const uint32_t* global_ptr =
1026 reinterpret_cast<uint32_t*
>(
reinterpret_cast<uintptr_t
>(global_base_ptr));
1028 const uint32_t* global_ptr =
1029 reinterpret_cast<uint32_t*
>(
reinterpret_cast<size_t>(global_base_ptr));
1032 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
1034 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1035 T* lds_ptr = lds_base_ptr + lds_offset;
1036 #ifndef CK_CODE_GEN_RTC
1037 auto const lds_ptr_sgpr =
1038 __builtin_amdgcn_readfirstlane((
reinterpret_cast<uintptr_t
>(lds_ptr)));
1040 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((
reinterpret_cast<size_t>(lds_ptr)));
1042 asm volatile(
"s_mov_b32 m0, %0; \n\t"
1043 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
1044 "v"(global_offset_bytes),
1049 __attribute__((address_space(3))) uint32_t* lds_ptr =
1050 #ifndef CK_CODE_GEN_RTC
1051 reinterpret_cast<__attribute__((address_space(3))) uint32_t*
>(
1052 reinterpret_cast<uintptr_t
>(lds_base_ptr + lds_offset));
1054 reinterpret_cast<__attribute__((address_space(3))) uint32_t*
>(
1055 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1059 src_resource, lds_ptr,
sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:82
int8_t int8_t
Definition: int8.hpp:20
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:38
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:892
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:1012
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:972
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:542
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition: amd_buffer_addressing.hpp:295
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: data_type.hpp:2498
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:23
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: data_type.hpp:2505
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: data_type.hpp:2513
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:597
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
typename vector_type< float, 2 >::type float2_t
Definition: data_type.hpp:2481
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
typename vector_type< half_t, 4 >::type half4_t
Definition: data_type.hpp:2490
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:569
_Float16 half_t
Definition: data_type.hpp:25
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
ushort bhalf_t
Definition: data_type.hpp:24
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:419
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: data_type.hpp:2497
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:863
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
typename vector_type< float, 4 >::type float4_t
Definition: data_type.hpp:2482
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:926
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:827
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
typename vector_type< half_t, 2 >::type half2_t
Definition: data_type.hpp:2489
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:755
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: data_type.hpp:2506
int32_t index_t
Definition: ck.hpp:289
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: data_type.hpp:2514
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:444
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:315
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:298
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
Definition: integral_constant.hpp:10
Definition: data_type.hpp:394
Definition: functional2.hpp:31
Definition: data_type.hpp:367
Definition: data_type.hpp:347
Definition: amd_buffer_addressing.hpp:11
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:19
int32x4_t content
Definition: amd_buffer_addressing.hpp:16
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:18
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing.hpp:12
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:17