15 return num_loop % 2 == 0;
20 return (num_loop / 2) > 1;
23 template <
bool HasMainLoop,
26 typename ABlockTransfer,
28 typename ABlockBuffer,
29 typename ABlockTransferStep,
32 typename BBlockTransfer,
34 typename BBlockBuffer,
35 typename BBlockTransferStep,
36 typename BlockwiseGemm,
37 typename CThreadBuffer>
38 __device__
static void Run(
const AGridDesc& a_grid_desc,
39 const ABlockDesc& a_block_desc,
40 ABlockTransfer& a_blockwise_copy,
41 const AGridBuffer& a_grid_buf,
42 ABlockBuffer& a_block_buf,
43 const ABlockTransferStep& a_block_copy_step,
44 const BGridDesc& b_grid_desc,
45 const BBlockDesc& b_block_desc,
46 BBlockTransfer& b_blockwise_copy,
47 const BGridBuffer& b_grid_buf,
48 BBlockBuffer& b_block_buf,
49 const BBlockTransferStep& b_block_copy_step,
50 const BlockwiseGemm& blockwise_gemm,
51 CThreadBuffer& c_thread_buf,
55 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
56 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
59 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
60 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
66 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
68 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
71 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
73 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
76 if constexpr(HasMainLoop)
82 #if CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
89 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
94 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
95 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
98 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
100 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
103 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
105 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
108 }
while(i < (num_loop - 2));
116 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
121 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
122 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
127 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
Definition: ck.hpp:217
int32_t index_t
Definition: ck.hpp:298
__device__ void block_sync_lds()
Definition: synchronization.hpp:10
Definition: gridwise_gemm_pipeline_v2.hpp:11
__host__ static constexpr __device__ bool CalculateHasMainLoop(const index_t num_loop)
Definition: gridwise_gemm_pipeline_v2.hpp:18
__host__ static constexpr __device__ bool IsSupported(const index_t num_loop)
Definition: gridwise_gemm_pipeline_v2.hpp:12
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v2.hpp:38