/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/config.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/config.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/core/config.hpp Source File
config.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
6 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) || \
7  defined(__gfx9_4_generic__)
8 #define __gfx9__
9 #endif
10 #if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__)
11 #define __gfx94__
12 #endif
13 #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
14  defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
15  defined(__gfx10_3_generic__)
16 #define __gfx103__
17 #endif
18 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
19  defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \
20  defined(__gfx1152__) || defined(__gfx11_generic__)
21 #define __gfx11__
22 #endif
23 #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
24 #define __gfx12__
25 #endif
26 
27 #include "hip/hip_version.h"
28 #ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
29 #include "hip/hip_runtime.h"
30 #include "hip/hip_fp16.h"
31 #endif
32 
33 #ifdef __HIPCC__
34 #define CK_TILE_HOST inline __host__
35 #define CK_TILE_DEVICE inline __device__
36 #define CK_TILE_HOST_DEVICE inline __host__ __device__
37 #define CK_TILE_DEVICE_EXTERN __device__
38 #define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
39 #else
40 #define CK_TILE_HOST inline
41 #define CK_TILE_DEVICE inline
42 #define CK_TILE_HOST_DEVICE inline
43 #define CK_TILE_DEVICE_EXTERN
44 #define CK_TILE_HOST_DEVICE_EXTERN
45 #endif
46 
47 // implementing the "memory address space" attribute
48 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
49 // WA for https://github.com/ROCm/composable_kernel/issues/1946
50 #if 0
51 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
52 #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
53 #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
54 #define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
55 #else
56 #define CK_TILE_GENERIC_ADDR
57 #define CK_TILE_GLOBAL_ADDR
58 #define CK_TILE_LDS_ADDR
59 #define CK_TILE_BUF_RES_ADDR
60 #endif
61 #ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
62 #define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
63 #endif
64 
65 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD 0
66 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
67 #define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
68 #define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
69 #define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
70 
71 #ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
72 #define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
73 #endif
74 
75 #define CK_TILE_FLOAT_TO_FP8_STANDARD 0
76 #define CK_TILE_FLOAT_TO_FP8_STOCHASTIC 1
77 
78 #ifndef CK_TILE_FLOAT_TO_FP8_DEFAULT
79 #define CK_TILE_FLOAT_TO_FP8_DEFAULT CK_TILE_FLOAT_TO_FP8_STANDARD
80 #endif
81 
82 // in the old rocm period, we have to use tuple array implementation to implement this
83 // so turn on the _USE_TUPLE if meet compiler error, otherwise _USE_ARRAY by default.
84 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_ARRAY 0
85 #define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE 1
86 #ifndef CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT
87 #define CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE
88 #endif
89 
90 #define CK_TILE_THREAD_BUFFER_USE_ARRAY 0
91 #define CK_TILE_THREAD_BUFFER_USE_TUPLE 1
92 #ifndef CK_TILE_THREAD_BUFFER_DEFAULT
93 #define CK_TILE_THREAD_BUFFER_DEFAULT CK_TILE_THREAD_BUFFER_USE_ARRAY
94 #endif
95 
96 #ifndef CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST
97 #if CK_TILE_THREAD_BUFFER_DEFAULT == CK_TILE_THREAD_BUFFER_USE_TUPLE
98 // if using tuple-array as thread_buffer implementation, need to support {} brace init
99 // ... with similiar behavior as array
100 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 1
101 #else
102 #define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 0
103 #endif
104 #endif
105 
106 #ifndef CK_TILE_USE_LAUNCH_BOUNDS
107 #define CK_TILE_USE_LAUNCH_BOUNDS 1
108 #endif
109 
110 #ifndef CK_TILE_TIME_KERNEL
111 #define CK_TILE_TIME_KERNEL 1
112 #endif
113 
114 #define CK_TILE_MAX_THREAD_PER_BLOCK 256
115 #define CK_TILE_MIN_BLOCK_PER_CU 2
116 
117 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
118 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
119 #endif
120 
121 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
122 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
123 #endif
124 
125 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
126 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
127 #endif
128 
129 #ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
130 #define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
131 #endif
132 
133 #ifndef CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
134 #define CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
135 #endif
136 
137 #ifndef CK_TILE_USE_AMD_BUFFER_LOAD
138 #define CK_TILE_USE_AMD_BUFFER_LOAD 1
139 #endif
140 
141 #ifndef CK_TILE_USE_AMD_BUFFER_STORE
142 #define CK_TILE_USE_AMD_BUFFER_STORE 1
143 #endif
144 
145 #ifndef CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER
146 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
147 #endif
148 
149 #ifndef CK_TILE_USE_PK4_LAYOUT_SHUFFLE
150 #define CK_TILE_USE_PK4_LAYOUT_SHUFFLE 1
151 #endif
152 
153 // buffer atomic add: floating point
154 #ifndef __HIP_DEVICE_COMPILE__ // for host code
155 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
156 #elif defined(__gfx9__) || defined(__gfx12__) // for GPU code
157 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
158 #else // for GPU code
159 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
160 #endif
161 
162 #if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
163 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
164 #else
165 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
166 #endif
167 
168 #ifndef CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
169 #define CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
170 #endif
171 
172 #ifndef CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
173 #define CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
174 #endif
175 
176 #ifndef CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
177 #if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 1 && HIP_VERSION_PATCH >= 40091
178 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 1
179 #else
180 #define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 0
181 #endif
182 #endif
183 
184 // workaround for ROCm 6.2 and later
185 #ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
186 #if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
187  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
188  (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
189 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
190 #else
191 #define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
192 #endif
193 #endif
194 
195 // use llvm builtin bf16 data type after ROCm 6.5
196 #ifndef CK_TILE_USE_LLVM_BUILTIN_BF16
197 #if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 5 && HIP_VERSION_PATCH >= 50421) || \
198  (HIP_VERSION_MAJOR >= 7)
199 #define CK_TILE_USE_LLVM_BUILTIN_BF16 1
200 #else
201 #define CK_TILE_USE_LLVM_BUILTIN_BF16 0
202 #endif
203 #endif
204 
205 #ifndef CK_TILE_DEBUG_LOG
206 #define CK_TILE_DEBUG_LOG 0
207 #endif
208 
209 #ifndef __HIP_DEVICE_COMPILE__ // for host code
210 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
211 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
212  defined(__gfx9__) // for GPU code
213 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
214 #elif defined(__gfx103__) // for GPU code
215 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
216 #elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
217 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
218 #endif
219 
220 #ifndef CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
221 #define CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
222 #endif
223 
224 #ifndef CK_TILE_USE_SUBDWORD_TILE_CAST
225 #define CK_TILE_USE_SUBDWORD_TILE_CAST 0
226 #endif
227 
228 #ifndef CK_TILE_USE_PK_FP16_TILE_CAST
229 #define CK_TILE_USE_PK_FP16_TILE_CAST 0
230 #endif
231 
232 // TODO: better solve this inside compiler
233 #ifndef CK_TILE_FMHA_FWD_FAST_EXP2
234 #define CK_TILE_FMHA_FWD_FAST_EXP2 0
235 #endif
236 
237 #ifndef CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN
238 #define CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN 0
239 #endif
240 
241 #ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
242 #define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
243 #endif
244 
245 // workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
246 #ifndef CK_TILE_WORKAROUND_SWDEV_383542
247 #define CK_TILE_WORKAROUND_SWDEV_383542 1
248 #endif
249 
250 #ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
251 #define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
252 #endif
253 
254 #ifndef CK_TILE_USE_OCP_FP8
255 #if defined(__HIP_DEVICE_COMPILE__)
256 #if defined(__gfx950__) || defined(__gfx12__)
257 #define CK_TILE_USE_OCP_FP8 1
258 #else
259 #define CK_TILE_USE_OCP_FP8 0
260 #endif
261 #else
262 #define CK_TILE_USE_OCP_FP8 0
263 #endif
264 #endif
265 
266 #ifndef CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
267 #if __clang_major__ >= 20
268 #define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 1
269 #else
270 #define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 0
271 #endif
272 #endif
273 
274 #ifndef CK_TILE_WA_ISSUE_2028
275 #define CK_TILE_WA_ISSUE_2028 0
276 #endif
277 
278 // Y pointed to R, we don't see a valuable use case.
279 // Will enforce encoding to check Y not pointed to R if set to zero
280 #ifndef CK_TILE_ENC_SUPPORT_Y_TO_R
281 #define CK_TILE_ENC_SUPPORT_Y_TO_R 0
282 #endif