/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/workgroup_synchronization.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/workgroup_synchronization.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/workgroup_synchronization.hpp Source File
workgroup_synchronization.hpp
Go to the documentation of this file.
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
6 
7 namespace ck {
8 
9 // Initialization flag of Barrier object, can be any value except for zero
10 static constexpr int BarrierInitFlag = 0x7856;
11 
12 // 1) only the first thread-block in the synchronizaton group is supposed to call this function. It
13 // is the responsibility of the user to ensure the two integer values in p_control_bits are zeros
14 // before calling gms_init().
15 // 2) Aftercalling gms_reset(), the two integer values in p_control_bits will be zeros, so no
16 // repetitious initialization of p_control_bits buffer is required
17 static __device__ void gms_init(int NumWarps, int* p_control_bits)
18 {
19  union
20  {
21  int two32[2];
22  unsigned long one64;
23  } regs;
24 
25  regs.two32[0] = BarrierInitFlag;
26  regs.two32[1] = NumWarps;
27 
28  if(threadIdx.x == 0)
29  atomicCAS(reinterpret_cast<unsigned long*>(p_control_bits), 0, regs.one64);
30 };
31 
32 // all the workgroups in the synchronization group is supposed to call this function
33 static __device__ void gms_barrier(int* p_control_bits)
34 {
35  constexpr int mask = WarpSize - 1;
36 
37  if((threadIdx.x & mask) == 0)
38  {
39  // ensure the barrier object is initialized
40  do
41  {
42  const int r0 = __atomic_load_n(&p_control_bits[0], __ATOMIC_RELAXED);
43 
44  if(r0 == BarrierInitFlag)
45  break;
46 
47  } while(true);
48 
49  // go ahead toward the barrier line
50  atomicSub(&p_control_bits[1], 1);
51 
52  // wait until all warps have arrived
53  do
54  {
55  const int r1 = __atomic_load_n(&p_control_bits[1], __ATOMIC_RELAXED);
56 
57  if(r1 == 0)
58  break;
59 
60  } while(true);
61  };
62 };
63 
64 // 1) Only the first thread-block in the synchronizaton group is supposed to call this function.
65 // 2) Aftercalling gms_reset(), the two integer values in p_control_bits will be zeros, so no
66 // repetitious initialization of p_control_bits buffer is required
67 static __device__ void gms_reset(int* p_control_bits)
68 {
69  // reset the barrier object
70  if(threadIdx.x == 0)
71  (void)atomicCAS(&p_control_bits[0], BarrierInitFlag, 0);
72 };
73 
74 } // namespace ck
Definition: ck.hpp:267