/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp Source File
amd_buffer_addressing.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 #include "data_type.hpp"
6 
7 namespace ck {
8 
9 template <typename T>
11 {
12  __device__ constexpr BufferResource() : content{} {}
13 
14  // 128 bit SGPRs to supply buffer resource in buffer instructions
15  // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
20 };
21 
22 template <typename T>
23 __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
24 {
25  BufferResource<T> wave_buffer_resource;
26 
27  // wavewise base address (64 bit)
28  wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
29  // wavewise range (32 bit)
30  wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
31  // wavewise setting (32 bit)
32  wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
33 
34  return wave_buffer_resource.content;
35 }
36 
37 template <typename T>
39 {
40  BufferResource<T> wave_buffer_resource;
41 
42  // wavewise base address (64 bit)
43  wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
44  // wavewise range (32 bit)
45  wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
46  // wavewise setting (32 bit)
47  wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
48 
49  return wave_buffer_resource.content;
50 }
51 
52 // buffer load i8
53 __device__ int8_t
55  index_t voffset,
56  index_t soffset,
57  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
58 
59 __device__ int8x2_t
61  index_t voffset,
62  index_t soffset,
63  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
64 
65 __device__ int8x4_t
67  index_t voffset,
68  index_t soffset,
69  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
70 
71 // buffer load i16
72 __device__ bhalf_t
74  index_t voffset,
75  index_t soffset,
76  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
77 
78 __device__ bhalf2_t
80  index_t voffset,
81  index_t soffset,
82  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
83 
84 __device__ bhalf4_t
86  index_t voffset,
87  index_t soffset,
88  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
89 
90 // buffer load i32
91 __device__ int32_t
93  index_t voffset,
94  index_t soffset,
95  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
96 
97 __device__ int32x2_t
99  index_t voffset,
100  index_t soffset,
101  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
102 
103 __device__ int32x4_t
105  index_t voffset,
106  index_t soffset,
107  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
108 
109 // buffer load fp16
110 __device__ half_t
112  index_t voffset,
113  index_t soffset,
114  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
115 
116 __device__ half2_t
118  index_t voffset,
119  index_t soffset,
120  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
121 
122 __device__ half4_t
124  index_t voffset,
125  index_t soffset,
126  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
127 
128 // buffer load fp32
129 __device__ float
131  index_t voffset,
132  index_t soffset,
133  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
134 
135 __device__ float2_t
137  index_t voffset,
138  index_t soffset,
139  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
140 
141 __device__ float4_t
143  index_t voffset,
144  index_t soffset,
145  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
146 
147 // buffer store i8
148 __device__ void
150  int32x4_t rsrc,
151  index_t voffset,
152  index_t soffset,
153  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
154 
155 __device__ void
157  int32x4_t rsrc,
158  index_t voffset,
159  index_t soffset,
160  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
161 
162 __device__ void
164  int32x4_t rsrc,
165  index_t voffset,
166  index_t soffset,
167  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
168 
169 // buffer store i16
170 __device__ void
172  int32x4_t rsrc,
173  index_t voffset,
174  index_t soffset,
175  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
176 
177 __device__ void
179  int32x4_t rsrc,
180  index_t voffset,
181  index_t soffset,
182  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
183 
184 __device__ void
186  int32x4_t rsrc,
187  index_t voffset,
188  index_t soffset,
189  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
190 
191 // buffer store i32
192 __device__ void
194  int32x4_t rsrc,
195  index_t voffset,
196  index_t soffset,
197  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
198 
199 __device__ void
201  int32x4_t rsrc,
202  index_t voffset,
203  index_t soffset,
204  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
205 
206 __device__ void
208  int32x4_t rsrc,
209  index_t voffset,
210  index_t soffset,
211  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
212 
213 // buffer store fp16
214 __device__ void
216  int32x4_t rsrc,
217  index_t voffset,
218  index_t soffset,
219  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
220 
221 __device__ void
223  int32x4_t rsrc,
224  index_t voffset,
225  index_t soffset,
226  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
227 
228 __device__ void
230  int32x4_t rsrc,
231  index_t voffset,
232  index_t soffset,
233  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
234 
235 // buffer store fp32
236 __device__ void
238  int32x4_t rsrc,
239  index_t voffset,
240  index_t soffset,
241  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
242 
243 __device__ void
245  int32x4_t rsrc,
246  index_t voffset,
247  index_t soffset,
248  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
249 
250 __device__ void
252  int32x4_t rsrc,
253  index_t voffset,
254  index_t soffset,
255  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
256 
257 // buffer atomic-add fp16
259  half2_t vdata,
260  int32x4_t rsrc,
261  index_t voffset,
262  index_t soffset,
263  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
264 
265 // buffer atomic-add i32
267  int32_t vdata,
268  int32x4_t rsrc,
269  index_t voffset,
270  index_t soffset,
271  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");
272 
273 // buffer atomic-add fp32
275  float vdata,
276  int32x4_t rsrc,
277  index_t voffset,
278  index_t soffset,
279  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
280 
281 // buffer atomic-add fp32
282 __device__ double
284  int32x4_t rsrc, // dst_wave_buffer_resource
285  int voffset, // dst_thread_addr_offset
286  int soffset, // dst_wave_addr_offset
287  int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
288 
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
// NOTE(review): the enum header line was dropped by the scrape; the scoped-enum
// form is grounded by the qualified uses (AmdBufferCoherenceEnum::DefaultCoherence,
// static_cast<index_t>(coherence)) later in this file.
enum struct AmdBufferCoherenceEnum
{
    DefaultCoherence = 0, // default value
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
    WAVE_NT0   = 0,
    WAVE_NT1   = 2,
    GROUP_NT0  = 1,
    GROUP_NT1  = 3,
    DEVICE_NT0 = 8,
    DEVICE_NT1 = 10,
    SYSTEM_NT0 = 9,
    SYSTEM_NT1 = 11,
};
312 
313 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
314 __device__ typename vector_type<int8_t, N>::type
315 amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
316  index_t src_thread_addr_offset,
317  index_t src_wave_addr_offset)
318 {
319  static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
320  "wrong! not implemented");
321 
322  if constexpr(N == 1)
323  {
324  return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
325  src_thread_addr_offset,
326  src_wave_addr_offset,
327  static_cast<index_t>(coherence));
328  }
329  else if constexpr(N == 2)
330  {
331 
332  int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
333  src_thread_addr_offset,
334  src_wave_addr_offset,
335  static_cast<index_t>(coherence));
336 
337  return bit_cast<int8x2_t>(tmp);
338  }
339  else if constexpr(N == 4)
340  {
341  int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
342  src_thread_addr_offset,
343  src_wave_addr_offset,
344  static_cast<index_t>(coherence));
345 
346  return bit_cast<int8x4_t>(tmp);
347  }
348  else if constexpr(N == 8)
349  {
350  int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
351  src_thread_addr_offset,
352  src_wave_addr_offset,
353  static_cast<index_t>(coherence));
354 
355  return bit_cast<int8x8_t>(tmp);
356  }
357  else if constexpr(N == 16)
358  {
359  int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
360  src_thread_addr_offset,
361  src_wave_addr_offset,
362  static_cast<index_t>(coherence));
363  return bit_cast<int8x16_t>(tmp);
364  }
365  else if constexpr(N == 32)
366  {
367  int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
368  src_thread_addr_offset,
369  src_wave_addr_offset,
370  static_cast<index_t>(coherence));
371  int32x4_t tmp1 =
372  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
373  src_thread_addr_offset,
374  src_wave_addr_offset + 4 * sizeof(int32_t),
375  static_cast<index_t>(coherence));
377 
378  tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
379  tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
380 
381  return bit_cast<int8x32_t>(tmp);
382  }
383  else if constexpr(N == 64)
384  {
385  int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
386  src_thread_addr_offset,
387  src_wave_addr_offset,
388  static_cast<index_t>(coherence));
389  int32x4_t tmp1 =
390  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
391  src_thread_addr_offset,
392  src_wave_addr_offset + 4 * sizeof(int32_t),
393  static_cast<index_t>(coherence));
394  int32x4_t tmp2 =
395  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
396  src_thread_addr_offset,
397  src_wave_addr_offset + 8 * sizeof(int32_t),
398  static_cast<index_t>(coherence));
399  int32x4_t tmp3 =
400  llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
401  src_thread_addr_offset,
402  src_wave_addr_offset + 12 * sizeof(int32_t),
403  static_cast<index_t>(coherence));
404 
406 
407  tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
408  tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
409  tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
410  tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;
411 
412  return bit_cast<int8x64_t>(tmp);
413  }
414 }
415 
416 template <typename T,
417  index_t N,
419 __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
420  index_t src_thread_addr_offset,
421  index_t src_wave_addr_offset)
422 {
423  static_assert(
424  (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
425  (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
426  (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
427  (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
428  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
429  (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
430  (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
431  (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
432  (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
433  (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
435  (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
436  "wrong! not implemented");
437 
438  using r_t = typename vector_type<T, N>::type;
439  auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
440  src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
441  return bit_cast<r_t>(raw_data);
442 }
443 
444 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
445 __device__ void
447  int32x4_t dst_wave_buffer_resource,
448  index_t dst_thread_addr_offset,
449  index_t dst_wave_addr_offset)
450 {
451  static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
452  "wrong! not implemented");
453 
454  if constexpr(N == 1)
455  {
456  llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
457  dst_wave_buffer_resource,
458  dst_thread_addr_offset,
459  dst_wave_addr_offset,
460  static_cast<index_t>(coherence));
461  }
462  else if constexpr(N == 2)
463  {
464 
465  llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
466  dst_wave_buffer_resource,
467  dst_thread_addr_offset,
468  dst_wave_addr_offset,
469  static_cast<index_t>(coherence));
470  }
471  else if constexpr(N == 4)
472  {
473  llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
474  dst_wave_buffer_resource,
475  dst_thread_addr_offset,
476  dst_wave_addr_offset,
477  static_cast<index_t>(coherence));
478  }
479  else if constexpr(N == 8)
480  {
481  llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data),
482  dst_wave_buffer_resource,
483  dst_thread_addr_offset,
484  dst_wave_addr_offset,
485  static_cast<index_t>(coherence));
486  }
487  else if constexpr(N == 16)
488  {
489  llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data),
490  dst_wave_buffer_resource,
491  dst_thread_addr_offset,
492  dst_wave_addr_offset,
493  static_cast<index_t>(coherence));
494  }
495  else if constexpr(N == 32)
496  {
497  vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};
498 
499  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
500  dst_wave_buffer_resource,
501  dst_thread_addr_offset,
502  dst_wave_addr_offset,
503  static_cast<index_t>(coherence));
504 
505  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
506  dst_wave_buffer_resource,
507  dst_thread_addr_offset,
508  dst_wave_addr_offset + sizeof(int32_t) * 4,
509  static_cast<index_t>(coherence));
510  }
511  else if constexpr(N == 64)
512  {
513  vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};
514 
515  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
516  dst_wave_buffer_resource,
517  dst_thread_addr_offset,
518  dst_wave_addr_offset,
519  static_cast<index_t>(coherence));
520 
521  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
522  dst_wave_buffer_resource,
523  dst_thread_addr_offset,
524  dst_wave_addr_offset + sizeof(int32_t) * 4,
525  static_cast<index_t>(coherence));
526 
527  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<2>{}],
528  dst_wave_buffer_resource,
529  dst_thread_addr_offset,
530  dst_wave_addr_offset + sizeof(int32_t) * 8,
531  static_cast<index_t>(coherence));
532 
533  llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<3>{}],
534  dst_wave_buffer_resource,
535  dst_thread_addr_offset,
536  dst_wave_addr_offset + sizeof(int32_t) * 12,
537  static_cast<index_t>(coherence));
538  }
539 }
540 
541 template <typename T,
542  index_t N,
544 __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
545  int32x4_t dst_wave_buffer_resource,
546  index_t dst_thread_addr_offset,
547  index_t dst_wave_addr_offset)
548 {
549  static_assert(
550  (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
551  (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
552  (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
553  (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
554  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
555  (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
556  (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
558  (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
559  (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
560  "wrong! not implemented");
561 
562  using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;
563 
564  amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
565  dst_wave_buffer_resource,
566  dst_thread_addr_offset,
567  dst_wave_addr_offset);
568 }
569 
570 template <typename T, index_t N>
571 __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
572  T* addr)
573 {
574  static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
575  (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
576  "wrong! not implemented");
577 
578  if constexpr(is_same<T, half_t>::value)
579  {
580  vector_type<half_t, N> tmp{src_thread_data};
581  static_for<0, N / 2, 1>{}([&](auto i) {
582  __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
583  tmp.template AsType<half2_t>()[i]);
584  });
585  }
586 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
587  else if constexpr(is_same<T, bhalf_t>::value)
588  {
589  vector_type<bhalf_t, N> tmp{src_thread_data};
590  static_for<0, N / 2, 1>{}([&](auto i) {
591  __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
592  tmp.template AsType<bhalf2_t>()[i]);
593  });
594  }
595 #endif
596 }
597 
598 template <typename T, index_t N>
599 __device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
600  int32x4_t dst_wave_buffer_resource,
601  index_t dst_thread_addr_offset,
602  index_t dst_wave_addr_offset)
603 {
604  static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
605  (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
606  (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
607  "wrong! not implemented");
608 
609  if constexpr(is_same<T, float>::value)
610  {
611  if constexpr(N == 1)
612  {
614  dst_wave_buffer_resource,
615  dst_thread_addr_offset,
616  dst_wave_addr_offset,
617  0);
618  }
619  else if constexpr(N == 2)
620  {
621  vector_type<float, 2> tmp{src_thread_data};
622 
623  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
624  dst_wave_buffer_resource,
625  dst_thread_addr_offset,
626  dst_wave_addr_offset,
627  0);
628 
629  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
630  dst_wave_buffer_resource,
631  dst_thread_addr_offset,
632  dst_wave_addr_offset + sizeof(float),
633  0);
634  }
635  else if constexpr(N == 4)
636  {
637  vector_type<float, 4> tmp{src_thread_data};
638 
639  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
640  dst_wave_buffer_resource,
641  dst_thread_addr_offset,
642  dst_wave_addr_offset,
643  0);
644 
645  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
646  dst_wave_buffer_resource,
647  dst_thread_addr_offset,
648  dst_wave_addr_offset + sizeof(float),
649  0);
650 
651  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
652  dst_wave_buffer_resource,
653  dst_thread_addr_offset,
654  dst_wave_addr_offset + 2 * sizeof(float),
655  0);
656 
657  llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
658  dst_wave_buffer_resource,
659  dst_thread_addr_offset,
660  dst_wave_addr_offset + 3 * sizeof(float),
661  0);
662  }
663  }
664  else if constexpr(is_same<T, half_t>::value)
665  {
666  if constexpr(N == 2)
667  {
669  dst_wave_buffer_resource,
670  dst_thread_addr_offset,
671  dst_wave_addr_offset,
672  0);
673  }
674  else if constexpr(N == 4)
675  {
676  vector_type<half_t, 4> tmp{src_thread_data};
677 
678  static_for<0, 2, 1>{}([&](auto i) {
680  dst_wave_buffer_resource,
681  dst_thread_addr_offset,
682  dst_wave_addr_offset + i * sizeof(half2_t),
683  0);
684  });
685  }
686  else if constexpr(N == 8)
687  {
688  vector_type<half_t, 8> tmp{src_thread_data};
689 
690  static_for<0, 4, 1>{}([&](auto i) {
692  dst_wave_buffer_resource,
693  dst_thread_addr_offset,
694  dst_wave_addr_offset + i * sizeof(half2_t),
695  0);
696  });
697  }
698  }
699  else if constexpr(is_same<T, int32_t>::value)
700  {
701  if constexpr(N == 1)
702  {
704  dst_wave_buffer_resource,
705  dst_thread_addr_offset,
706  dst_wave_addr_offset,
707  0);
708  }
709  else if constexpr(N == 2)
710  {
711  vector_type<int32_t, 2> tmp{src_thread_data};
712 
714  dst_wave_buffer_resource,
715  dst_thread_addr_offset,
716  dst_wave_addr_offset,
717  0);
718 
720  dst_wave_buffer_resource,
721  dst_thread_addr_offset,
722  dst_wave_addr_offset + sizeof(int32_t),
723  0);
724  }
725  else if constexpr(N == 4)
726  {
727  vector_type<int32_t, 4> tmp{src_thread_data};
728 
730  dst_wave_buffer_resource,
731  dst_thread_addr_offset,
732  dst_wave_addr_offset,
733  0);
734 
736  dst_wave_buffer_resource,
737  dst_thread_addr_offset,
738  dst_wave_addr_offset + sizeof(int32_t),
739  0);
740 
742  dst_wave_buffer_resource,
743  dst_thread_addr_offset,
744  dst_wave_addr_offset + 2 * sizeof(int32_t),
745  0);
746 
748  dst_wave_buffer_resource,
749  dst_thread_addr_offset,
750  dst_wave_addr_offset + 3 * sizeof(int32_t),
751  0);
752  }
753  }
754 }
755 
756 template <typename T, index_t N>
757 __device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
758  int32x4_t dst_wave_buffer_resource,
759  index_t dst_thread_addr_offset,
760  index_t dst_wave_addr_offset)
761 {
762  static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
763  "wrong! not implemented");
764  if constexpr(is_same<T, double>::value)
765  {
766  if constexpr(N == 1)
767  {
769  dst_wave_buffer_resource,
770  dst_thread_addr_offset,
771  dst_wave_addr_offset,
772  0);
773  }
774  else if constexpr(N == 2)
775  {
776  vector_type<double, 2> tmp{src_thread_data};
777 
778  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
779  dst_wave_buffer_resource,
780  dst_thread_addr_offset,
781  dst_wave_addr_offset,
782  0);
783 
784  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
785  dst_wave_buffer_resource,
786  dst_thread_addr_offset,
787  dst_wave_addr_offset + sizeof(double),
788  0);
789  }
790  else if constexpr(N == 4)
791  {
792  vector_type<double, 4> tmp{src_thread_data};
793 
794  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
795  dst_wave_buffer_resource,
796  dst_thread_addr_offset,
797  dst_wave_addr_offset,
798  0);
799 
800  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
801  dst_wave_buffer_resource,
802  dst_thread_addr_offset,
803  dst_wave_addr_offset + sizeof(double),
804  0);
805 
806  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
807  dst_wave_buffer_resource,
808  dst_thread_addr_offset,
809  dst_wave_addr_offset + 2 * sizeof(double),
810  0);
811 
812  llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
813  dst_wave_buffer_resource,
814  dst_thread_addr_offset,
815  dst_wave_addr_offset + 3 * sizeof(double),
816  0);
817  }
818  }
819 }
820 
821 // buffer_load requires:
822 // 1) p_src_wave must point to global memory space
823 // 2) p_src_wave must be a wavewise pointer.
824 // It is user's responsibility to make sure that is true.
825 template <typename T,
826  index_t N,
828 __device__ typename vector_type_maker<T, N>::type::type
830  index_t src_thread_element_offset,
831  bool src_thread_element_valid,
832  index_t src_element_space_size)
833 {
834  const int32x4_t src_wave_buffer_resource =
835  make_wave_buffer_resource(p_src_wave, src_element_space_size);
836 
837  index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
838 
839  using vector_t = typename vector_type_maker<T, N>::type::type;
840  using scalar_t = typename scalar_type<vector_t>::type;
841 
842  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
843 
844 #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
845  uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
846  return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
847  src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
848 
849 #else
850 
851  vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
852  src_wave_buffer_resource, src_thread_addr_offset, 0)};
853  return src_thread_element_valid ? tmp : vector_t(0);
854 #endif
855 }
856 
857 // buffer_load requires:
858 // 1) p_src_wave must point to global memory space
859 // 2) p_src_wave must be a wavewise pointer.
860 // It is user's responsibility to make sure that is true.
861 template <typename T,
862  index_t N,
864 __device__ typename vector_type_maker<T, N>::type::type
866  index_t src_thread_element_offset,
867  bool src_thread_element_valid,
868  index_t src_element_space_size,
869  T customized_value)
870 {
871  const int32x4_t src_wave_buffer_resource =
872  make_wave_buffer_resource(p_src_wave, src_element_space_size);
873 
874  index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
875 
876  using vector_t = typename vector_type_maker<T, N>::type::type;
877  using scalar_t = typename scalar_type<vector_t>::type;
878 
879  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
880 
881  vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
882  src_wave_buffer_resource, src_thread_addr_offset, 0)};
883 
884  return src_thread_element_valid ? tmp : vector_t(customized_value);
885 }
886 
887 // buffer_store requires:
888 // 1) p_dst_wave must point to global memory
889 // 2) p_dst_wave must be a wavewise pointer.
890 // It is user's responsibility to make sure that is true.
891 template <typename T,
892  index_t N,
894 __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
895  T* p_dst_wave,
896  const index_t dst_thread_element_offset,
897  const bool dst_thread_element_valid,
898  const index_t dst_element_space_size)
899 {
900  const int32x4_t dst_wave_buffer_resource =
901  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
902 
903  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
904 
905  using vector_t = typename vector_type_maker<T, N>::type::type;
906  using scalar_t = typename scalar_type<vector_t>::type;
907  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
908 
909 #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
910  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
911  amd_buffer_store_impl<scalar_t, vector_size, coherence>(
912  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
913 #else
914  if(dst_thread_element_valid)
915  {
916  amd_buffer_store_impl<scalar_t, vector_size, coherence>(
917  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
918  }
919 #endif
920 }
921 
922 // buffer_atomic_add requires:
923 // 1) p_dst_wave must point to global memory
924 // 2) p_dst_wave must be a wavewise pointer.
925 // It is user's responsibility to make sure that is true.
926 template <typename T, index_t N>
927 __device__ void
929  T* p_dst_wave,
930  const index_t dst_thread_element_offset,
931  const bool dst_thread_element_valid,
932  const index_t dst_element_space_size)
933 {
934  const int32x4_t dst_wave_buffer_resource =
935  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
936 
937  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
938 
939  using vector_t = typename vector_type_maker<T, N>::type::type;
940  using scalar_t = typename scalar_type<vector_t>::type;
941  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
942 
943  if constexpr(is_same<T, bhalf_t>::value)
944  {
945  if(dst_thread_element_valid)
946  {
947  amd_global_atomic_add_impl<scalar_t, vector_size>(
948  src_thread_data, p_dst_wave + dst_thread_element_offset);
949  }
950  }
951  else
952  {
953 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
954  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
955 
956  amd_buffer_atomic_add_impl<scalar_t, vector_size>(
957  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
958 #else
959  if(dst_thread_element_valid)
960  {
961  amd_buffer_atomic_add_impl<scalar_t, vector_size>(
962  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
963  }
964 #endif
965  }
966 }
967 
968 // buffer_atomic_max requires:
969 // 1) p_dst_wave must point to global memory
970 // 2) p_dst_wave must be a wavewise pointer.
971 // It is user's responsibility to make sure that is true.
972 template <typename T, index_t N>
973 __device__ void
975  T* p_dst_wave,
976  const index_t dst_thread_element_offset,
977  const bool dst_thread_element_valid,
978  const index_t dst_element_space_size)
979 {
980  const int32x4_t dst_wave_buffer_resource =
981  make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
982 
983  index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
984 
985  using vector_t = typename vector_type_maker<T, N>::type::type;
986  using scalar_t = typename scalar_type<vector_t>::type;
987  constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
988 
989 #if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
990  uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
991 
992  amd_buffer_atomic_max_impl<scalar_t, vector_size>(
993  src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
994 #else
995  if(dst_thread_element_valid)
996  {
997  amd_buffer_atomic_max_impl<scalar_t, vector_size>(
998  src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
999  }
1000 #endif
1001 }
1002 
1003 // Direct loads from global to LDS.
1004 __device__ void
1006  __attribute__((address_space(3))) uint32_t* lds_ptr,
1007  index_t size,
1008  index_t voffset,
1009  index_t soffset,
1010  index_t offset,
1011  index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
1012 
1013 #ifndef __HIPCC_RTC__
1014 template <typename T, index_t NumElemsPerThread>
1015 __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
1016  const index_t global_offset,
1017  T* lds_base_ptr,
1018  const index_t lds_offset,
1019  const bool is_valid,
1020  const index_t src_element_space_size)
1021 {
1022  // Direct loads require that each thread reads and writes exactly a single DWORD.
1023  constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
1024 #if defined(__gfx950__)
1025  constexpr auto dword_bytes = 4;
1026  static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
1027  bytes_per_thread == dword_bytes * 4);
1028 #elif defined(__gfx942__)
1029  constexpr auto dword_bytes = 4;
1030  static_assert(bytes_per_thread == dword_bytes);
1031 #endif
1032 
1033  const int32x4_t src_resource =
1034  make_wave_buffer_resource(global_base_ptr, src_element_space_size);
1035  const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
1036 
1037 #if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1038  T* lds_ptr = lds_base_ptr + lds_offset;
1039 #ifndef CK_CODE_GEN_RTC
1040  auto const lds_ptr_sgpr =
1041  __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
1042 #else
1043  auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
1044 #endif
1045  asm volatile("s_mov_b32 m0, %0; \n\t"
1046  "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
1047  "v"(global_offset_bytes),
1048  "s"(src_resource)
1049  : "memory");
1050 #else
1051  // LDS pointer must be attributed with the LDS address space.
1052  __attribute__((address_space(3))) uint32_t* lds_ptr =
1053 #ifndef CK_CODE_GEN_RTC
1054  reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1055  reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
1056 #else
1057  reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1058  reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1059 #endif
1060 
1062  src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
1063 #endif
1064 }
1065 #endif
1066 
1067 } // namespace ck
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition: ck.hpp:79
Definition: ck.hpp:267
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition: amd_buffer_addressing.hpp:38
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition: statically_indexed_array.hpp:45
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:894
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:1015
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:974
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:544
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition: amd_buffer_addressing.hpp:295
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition: dtype_vector.hpp:2147
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition: amd_buffer_addressing.hpp:23
typename vector_type< int32_t, 2 >::type int32x2_t
Definition: dtype_vector.hpp:2153
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition: dtype_vector.hpp:2162
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:599
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition: amd_buffer_addressing.hpp:865
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
typename vector_type< float, 2 >::type float2_t
Definition: dtype_vector.hpp:2131
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
typename vector_type< half_t, 4 >::type half4_t
Definition: dtype_vector.hpp:2140
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition: amd_buffer_addressing.hpp:571
_Float16 half_t
Definition: data_type.hpp:30
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
ushort bhalf_t
Definition: data_type.hpp:29
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:419
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition: dtype_vector.hpp:2146
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
typename vector_type< float, 4 >::type float4_t
Definition: dtype_vector.hpp:2132
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition: amd_buffer_addressing.hpp:928
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
typename vector_type< half_t, 2 >::type half2_t
Definition: dtype_vector.hpp:2139
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition: amd_buffer_addressing.hpp:829
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:757
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
typename vector_type< int32_t, 4 >::type int32x4_t
Definition: dtype_vector.hpp:2154
int32_t index_t
Definition: ck.hpp:298
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition: dtype_vector.hpp:2163
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:446
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition: amd_buffer_addressing.hpp:315
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition: type.hpp:295
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
signed short int16_t
Definition: stdint.h:122
_W64 unsigned int uintptr_t
Definition: stdint.h:165
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: integral_constant.hpp:20
Definition: type.hpp:177
Definition: data_type.hpp:38
Definition: functional2.hpp:33
Definition: dtype_vector.hpp:30
Definition: dtype_vector.hpp:10
Definition: amd_buffer_addressing.hpp:11
int32x4_t content
Definition: amd_buffer_addressing.hpp:16
StaticallyIndexedArray< int32_t, 4 > config
Definition: amd_buffer_addressing.hpp:19
constexpr __device__ BufferResource()
Definition: amd_buffer_addressing.hpp:12
StaticallyIndexedArray< int32_t, 4 > range
Definition: amd_buffer_addressing.hpp:18
StaticallyIndexedArray< T *, 2 > address
Definition: amd_buffer_addressing.hpp:17