26 BufferResource<T> wave_buffer_resource;
29 wave_buffer_resource.address(Number<0>{}) =
const_cast<remove_cv_t<T>*
>(p_wave);
31 wave_buffer_resource.range(Number<2>{}) = element_space_size *
sizeof(T);
35 return wave_buffer_resource.content;
41 BufferResource<T> wave_buffer_resource;
44 wave_buffer_resource.address(Number<0>{}) =
const_cast<remove_cv_t<T>*
>(p_wave);
46 wave_buffer_resource.range(Number<2>{}) = 0xffffffff;
50 return wave_buffer_resource.content;
60 int32_t num = element_space_size *
sizeof(T);
63 return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
75 return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
84 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
92 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.add.i32");
100 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
108 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
110 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
111 __device__
typename vector_type<int8_t, N>::type
113 index_t src_thread_addr_offset,
116 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
117 "wrong! not implemented");
121 return __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
122 src_thread_addr_offset,
123 src_wave_addr_offset,
124 static_cast<index_t>(coherence));
126 else if constexpr(N == 2)
129 int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
130 src_thread_addr_offset,
131 src_wave_addr_offset,
132 static_cast<index_t>(coherence));
134 return bit_cast<int8x2_t>(tmp);
136 else if constexpr(N == 4)
138 int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
139 src_thread_addr_offset,
140 src_wave_addr_offset,
141 static_cast<index_t>(coherence));
143 return bit_cast<int8x4_t>(tmp);
145 else if constexpr(N == 8)
147 int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
148 src_thread_addr_offset,
149 src_wave_addr_offset,
150 static_cast<index_t>(coherence));
152 return bit_cast<int8x8_t>(tmp);
154 else if constexpr(N == 16)
156 int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
157 src_thread_addr_offset,
158 src_wave_addr_offset,
159 static_cast<index_t>(coherence));
160 return bit_cast<int8x16_t>(tmp);
162 else if constexpr(N == 32)
164 int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
165 src_thread_addr_offset,
166 src_wave_addr_offset,
167 static_cast<index_t>(coherence));
169 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
170 src_thread_addr_offset,
171 src_wave_addr_offset + 4 *
sizeof(
int32_t),
172 static_cast<index_t>(coherence));
178 return bit_cast<int8x32_t>(tmp);
180 else if constexpr(N == 64)
182 int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
183 src_thread_addr_offset,
184 src_wave_addr_offset,
185 static_cast<index_t>(coherence));
187 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
188 src_thread_addr_offset,
189 src_wave_addr_offset + 4 *
sizeof(
int32_t),
190 static_cast<index_t>(coherence));
192 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
193 src_thread_addr_offset,
194 src_wave_addr_offset + 8 *
sizeof(
int32_t),
195 static_cast<index_t>(coherence));
197 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
198 src_thread_addr_offset,
199 src_wave_addr_offset + 12 *
sizeof(
int32_t),
200 static_cast<index_t>(coherence));
209 return bit_cast<int8x64_t>(tmp);
213 template <
typename T,
216 __device__
typename vector_type<T, N>::type
218 index_t src_thread_addr_offset,
232 "wrong! not implemented");
235 auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
236 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
237 return bit_cast<r_t>(raw_data);
240 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
243 __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
244 index_t dst_thread_addr_offset,
247 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
248 "wrong! not implemented");
252 __builtin_amdgcn_raw_buffer_store_b8(src_thread_data,
253 dst_wave_buffer_resource,
254 dst_thread_addr_offset,
255 dst_wave_addr_offset,
256 static_cast<index_t>(coherence));
258 else if constexpr(N == 2)
261 __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
262 dst_wave_buffer_resource,
263 dst_thread_addr_offset,
264 dst_wave_addr_offset,
265 static_cast<index_t>(coherence));
267 else if constexpr(N == 4)
269 __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
270 dst_wave_buffer_resource,
271 dst_thread_addr_offset,
272 dst_wave_addr_offset,
273 static_cast<index_t>(coherence));
275 else if constexpr(N == 8)
277 __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
278 dst_wave_buffer_resource,
279 dst_thread_addr_offset,
280 dst_wave_addr_offset,
281 static_cast<index_t>(coherence));
283 else if constexpr(N == 16)
285 __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
286 dst_wave_buffer_resource,
287 dst_thread_addr_offset,
288 dst_wave_addr_offset,
289 static_cast<index_t>(coherence));
291 else if constexpr(N == 32)
295 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<0>{}],
296 dst_wave_buffer_resource,
297 dst_thread_addr_offset,
298 dst_wave_addr_offset,
299 static_cast<index_t>(coherence));
301 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<1>{}],
302 dst_wave_buffer_resource,
303 dst_thread_addr_offset,
304 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
305 static_cast<index_t>(coherence));
307 else if constexpr(N == 64)
311 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<0>{}],
312 dst_wave_buffer_resource,
313 dst_thread_addr_offset,
314 dst_wave_addr_offset,
315 static_cast<index_t>(coherence));
317 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<1>{}],
318 dst_wave_buffer_resource,
319 dst_thread_addr_offset,
320 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
321 static_cast<index_t>(coherence));
323 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<2>{}],
324 dst_wave_buffer_resource,
325 dst_thread_addr_offset,
326 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
327 static_cast<index_t>(coherence));
329 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<3>{}],
330 dst_wave_buffer_resource,
331 dst_thread_addr_offset,
332 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
333 static_cast<index_t>(coherence));
337 template <
typename T,
341 __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
342 index_t dst_thread_addr_offset,
354 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
356 "wrong! not implemented");
360 amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
361 dst_wave_buffer_resource,
362 dst_thread_addr_offset,
363 dst_wave_addr_offset);
366 template <
typename T, index_t N>
372 "wrong! not implemented");
376 vector_type<half_t, N> tmp{src_thread_data};
377 static_for<0, N / 2, 1>{}([&](
auto i) {
378 __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
379 tmp.template AsType<half2_t>()[i]);
382 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
385 vector_type<bhalf_t, N> tmp{src_thread_data};
386 static_for<0, N / 2, 1>{}([&](
auto i) {
387 __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
388 tmp.template AsType<bhalf2_t>()[i]);
394 template <
typename T, index_t N>
397 index_t dst_thread_addr_offset,
403 "wrong! not implemented");
410 dst_wave_buffer_resource,
411 dst_thread_addr_offset,
412 dst_wave_addr_offset,
415 else if constexpr(N == 2)
417 vector_type<float, 2> tmp{src_thread_data};
420 dst_wave_buffer_resource,
421 dst_thread_addr_offset,
422 dst_wave_addr_offset,
426 dst_wave_buffer_resource,
427 dst_thread_addr_offset,
428 dst_wave_addr_offset +
sizeof(
float),
431 else if constexpr(N == 4)
433 vector_type<float, 4> tmp{src_thread_data};
436 dst_wave_buffer_resource,
437 dst_thread_addr_offset,
438 dst_wave_addr_offset,
442 dst_wave_buffer_resource,
443 dst_thread_addr_offset,
444 dst_wave_addr_offset +
sizeof(
float),
448 dst_wave_buffer_resource,
449 dst_thread_addr_offset,
450 dst_wave_addr_offset + 2 *
sizeof(
float),
454 dst_wave_buffer_resource,
455 dst_thread_addr_offset,
456 dst_wave_addr_offset + 3 *
sizeof(
float),
465 dst_wave_buffer_resource,
466 dst_thread_addr_offset,
467 dst_wave_addr_offset,
470 else if constexpr(N == 4)
472 vector_type<half_t, 4> tmp{src_thread_data};
474 static_for<0, 2, 1>{}([&](
auto i) {
476 dst_wave_buffer_resource,
477 dst_thread_addr_offset,
478 dst_wave_addr_offset + i *
sizeof(
half2_t),
482 else if constexpr(N == 8)
484 vector_type<half_t, 8> tmp{src_thread_data};
486 static_for<0, 4, 1>{}([&](
auto i) {
488 dst_wave_buffer_resource,
489 dst_thread_addr_offset,
490 dst_wave_addr_offset + i *
sizeof(
half2_t),
500 dst_wave_buffer_resource,
501 dst_thread_addr_offset,
502 dst_wave_addr_offset,
505 else if constexpr(N == 2)
507 vector_type<int32_t, 2> tmp{src_thread_data};
510 dst_wave_buffer_resource,
511 dst_thread_addr_offset,
512 dst_wave_addr_offset,
516 dst_wave_buffer_resource,
517 dst_thread_addr_offset,
518 dst_wave_addr_offset +
sizeof(
int32_t),
521 else if constexpr(N == 4)
523 vector_type<int32_t, 4> tmp{src_thread_data};
526 dst_wave_buffer_resource,
527 dst_thread_addr_offset,
528 dst_wave_addr_offset,
532 dst_wave_buffer_resource,
533 dst_thread_addr_offset,
534 dst_wave_addr_offset +
sizeof(
int32_t),
538 dst_wave_buffer_resource,
539 dst_thread_addr_offset,
540 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
544 dst_wave_buffer_resource,
545 dst_thread_addr_offset,
546 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
552 template <
typename T, index_t N>
555 index_t dst_thread_addr_offset,
559 "wrong! not implemented");
565 dst_wave_buffer_resource,
566 dst_thread_addr_offset,
567 dst_wave_addr_offset,
570 else if constexpr(N == 2)
572 vector_type<double, 2> tmp{src_thread_data};
575 dst_wave_buffer_resource,
576 dst_thread_addr_offset,
577 dst_wave_addr_offset,
581 dst_wave_buffer_resource,
582 dst_thread_addr_offset,
583 dst_wave_addr_offset +
sizeof(
double),
586 else if constexpr(N == 4)
588 vector_type<double, 4> tmp{src_thread_data};
591 dst_wave_buffer_resource,
592 dst_thread_addr_offset,
593 dst_wave_addr_offset,
597 dst_wave_buffer_resource,
598 dst_thread_addr_offset,
599 dst_wave_addr_offset +
sizeof(
double),
603 dst_wave_buffer_resource,
604 dst_thread_addr_offset,
605 dst_wave_addr_offset + 2 *
sizeof(
double),
609 dst_wave_buffer_resource,
610 dst_thread_addr_offset,
611 dst_wave_addr_offset + 3 *
sizeof(
double),
621 template <
typename T,
624 __device__
typename vector_type_maker<T, N>::type::type
626 index_t src_thread_element_offset,
627 bool src_thread_element_valid,
628 index_t src_element_space_size)
630 const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
633 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
635 using vector_t =
typename vector_type_maker<T, N>::type::type;
636 using scalar_t =
typename scalar_type<vector_t>::type;
638 constexpr
index_t vector_size = scalar_type<vector_t>::vector_size;
640 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
641 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
642 return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
643 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
647 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
648 src_wave_buffer_resource, src_thread_addr_offset, 0)};
649 return src_thread_element_valid ? tmp : vector_t(0);
657 template <
typename T,
660 __device__
typename vector_type_maker<T, N>::type::type
662 index_t src_thread_element_offset,
663 bool src_thread_element_valid,
664 index_t src_element_space_size,
667 const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
670 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
672 using vector_t =
typename vector_type_maker<T, N>::type::type;
673 using scalar_t =
typename scalar_type<vector_t>::type;
675 constexpr
index_t vector_size = scalar_type<vector_t>::vector_size;
677 vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
678 src_wave_buffer_resource, src_thread_addr_offset, 0)};
680 return src_thread_element_valid ? tmp : vector_t(customized_value);
687 template <
typename T,
690 __device__
void amd_buffer_store(
const typename vector_type_maker<T, N>::type::type src_thread_data,
692 const index_t dst_thread_element_offset,
693 const bool dst_thread_element_valid,
694 const index_t dst_element_space_size)
696 const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
699 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
701 using vector_t =
typename vector_type_maker<T, N>::type::type;
702 using scalar_t =
typename scalar_type<vector_t>::type;
703 constexpr
index_t vector_size = scalar_type<vector_t>::vector_size;
705 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
706 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
707 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
708 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
710 if(dst_thread_element_valid)
712 amd_buffer_store_impl<scalar_t, vector_size, coherence>(
713 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
722 template <
typename T, index_t N>
726 const index_t dst_thread_element_offset,
727 const bool dst_thread_element_valid,
728 const index_t dst_element_space_size)
730 const int32x4_t dst_wave_buffer_resource =
733 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
735 using vector_t =
typename vector_type_maker<T, N>::type::type;
736 using scalar_t =
typename scalar_type<vector_t>::type;
737 constexpr
index_t vector_size = scalar_type<vector_t>::vector_size;
741 if(dst_thread_element_valid)
743 amd_global_atomic_add_impl<scalar_t, vector_size>(
744 src_thread_data, p_dst_wave + dst_thread_element_offset);
749 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
750 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
752 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
753 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
755 if(dst_thread_element_valid)
757 amd_buffer_atomic_add_impl<scalar_t, vector_size>(
758 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
768 template <
typename T, index_t N>
772 const index_t dst_thread_element_offset,
773 const bool dst_thread_element_valid,
774 const index_t dst_element_space_size)
776 const int32x4_t dst_wave_buffer_resource =
779 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
781 using vector_t =
typename vector_type_maker<T, N>::type::type;
782 using scalar_t =
typename scalar_type<vector_t>::type;
783 constexpr
index_t vector_size = scalar_type<vector_t>::vector_size;
785 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
786 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
788 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
789 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
791 if(dst_thread_element_valid)
793 amd_buffer_atomic_max_impl<scalar_t, vector_size>(
794 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
802 __attribute__((address_space(3)))
uint32_t* lds_ptr,
807 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
809 #ifndef __HIPCC_RTC__
810 template <
typename T, index_t NumElemsPerThread>
816 const index_t src_element_space_size)
821 constexpr
auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
822 #if defined(__gfx950__)
823 constexpr
auto dword_bytes = 4;
824 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
825 bytes_per_thread == dword_bytes * 4);
826 #elif defined(__gfx942__)
827 constexpr
auto dword_bytes = 4;
828 static_assert(bytes_per_thread == dword_bytes);
833 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
835 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
836 T* lds_ptr = lds_base_ptr + lds_offset;
837 #ifndef CK_CODE_GEN_RTC
838 auto const lds_ptr_sgpr =
839 __builtin_amdgcn_readfirstlane((
reinterpret_cast<uintptr_t>(lds_ptr)));
841 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((
reinterpret_cast<size_t>(lds_ptr)));
843 asm volatile(
"s_mov_b32 m0, %0; \n\t"
844 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
845 "v"(global_offset_bytes),
850 __attribute__((address_space(3)))
uint32_t* lds_ptr =
851 #ifndef CK_CODE_GEN_RTC
852 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
853 reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
855 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
856 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
860 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:81
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:39
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:871
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:992
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:951
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:521
AmdBufferCoherenceEnum
Definition: amd_buffer_coherence.hpp:9
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:24
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: dtype_vector.hpp:2168
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:576
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:842
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:548
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:396
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing_builtins.hpp:54
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:905
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
typename vector_type< half_t, 2 >::type half2_t
Definition: dtype_vector.hpp:2154
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:806
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:734
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: dtype_vector.hpp:2169
int32_t index_t
Definition: ck.hpp:301
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:423
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:292
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new(T *p_wave)
Definition: amd_buffer_addressing_builtins.hpp:67
signed short int16_t
Definition: stdint.h:122
_W64 unsigned int uintptr_t
Definition: stdint.h:164
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: integral_constant.hpp:20
static constexpr bool value
Definition: integral_constant.hpp:21
Definition: dtype_vector.hpp:11
int32x4_t content
Definition: amd_buffer_addressing.hpp:17
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:20
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing_builtins.hpp:13
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:19
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:18