12 template <index_t NumPrefetch,
bool AEnableLds,
bool BEnableLds>
29 template <
bool HasMainLoop,
32 typename ABlockTransfer,
34 typename ABlockBuffer,
35 typename ABlockTransferStep,
38 typename BBlockTransfer,
40 typename BBlockBuffer,
41 typename BBlockTransferStep,
42 typename BlockwiseGemm,
43 typename CThreadBuffer>
44 __device__
static void Run(
const AGridDesc& a_grid_desc,
45 const ABlockDesc& a_block_desc,
46 ABlockTransfer& a_blockwise_copy,
47 const AGridBuffer& a_grid_buf,
48 ABlockBuffer& a_block_buf,
49 const ABlockTransferStep& a_block_copy_step,
50 const BGridDesc& b_grid_desc,
51 const BBlockDesc& b_block_desc,
52 BBlockTransfer& b_blockwise_copy,
53 const BGridBuffer& b_grid_buf,
54 BBlockBuffer& b_block_buf,
55 const BBlockTransferStep& b_block_copy_step,
56 const BlockwiseGemm& blockwise_gemm,
57 CThreadBuffer& c_thread_buf,
61 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
62 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
64 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
65 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
70 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
71 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
74 if constexpr(HasMainLoop)
80 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
84 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
86 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
90 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
91 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
93 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
94 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
97 }
while(i < (num_loop - 1));
104 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
119 return num_loop % 2 == 0;
124 return (num_loop / 2) > 1;
127 template <
bool HasMainLoop,
130 typename ABlockTransfer,
131 typename AGridBuffer,
132 typename ABlockBuffer,
133 typename ABlockTransferStep,
136 typename BBlockTransfer,
137 typename BGridBuffer,
138 typename BBlockBuffer,
139 typename BBlockTransferStep,
140 typename BlockwiseGemm,
141 typename CThreadBuffer>
142 static __device__
void Run(
const AGridDesc& a_grid_desc,
143 const ABlockDesc& a_block_desc,
144 ABlockTransfer& a_blockwise_copy,
145 const AGridBuffer& a_grid_buf,
146 ABlockBuffer& a_block_buf,
147 const ABlockTransferStep& a_block_copy_step,
148 const BGridDesc& b_grid_desc,
149 const BBlockDesc& b_block_desc,
150 BBlockTransfer& b_blockwise_copy,
151 const BGridBuffer& b_grid_buf,
152 BBlockBuffer& b_block_buf,
153 const BBlockTransferStep& b_block_copy_step,
154 const BlockwiseGemm& blockwise_gemm,
155 CThreadBuffer& c_thread_buf,
161 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
162 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
165 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
166 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
169 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
170 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
174 c_thread_buf.Clear();
177 if constexpr(HasMainLoop)
184 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
185 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
188 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
189 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
192 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
193 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
199 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
205 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
206 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
209 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1);
210 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1);
213 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
214 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
220 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
226 }
while(i < (num_loop - 2));
232 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
233 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
239 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
245 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1);
246 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1);
252 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
270 template <
bool HasMainLoop,
273 typename ABlockTransfer,
274 typename AGridBuffer,
275 typename ABlockBuffer,
276 typename ABlockTransferStep,
279 typename BBlockTransfer,
280 typename BGridBuffer,
281 typename BBlockBuffer,
282 typename BBlockTransferStep,
283 typename BlockwiseGemm,
284 typename CThreadBuffer>
285 __device__
static void Run(
const AGridDesc& a_grid_desc,
286 const ABlockDesc& a_block_desc,
287 ABlockTransfer& a_blockwise_copy,
288 const AGridBuffer& a_grid_buf,
289 ABlockBuffer& a_block_buf,
290 const ABlockTransferStep& a_block_copy_step,
291 const BGridDesc& b_grid_desc,
292 const BBlockDesc& b_block_desc,
293 BBlockTransfer& b_blockwise_copy,
294 const BGridBuffer& b_grid_buf,
295 BBlockBuffer& b_block_buf,
296 const BBlockTransferStep& b_block_copy_step,
297 const BlockwiseGemm& blockwise_gemm,
298 CThreadBuffer& c_thread_buf,
301 constexpr
auto a_block_origin_idx =
make_tuple(I0, I0, I0, I0, I0, I0, I0);
302 auto a_block_buf_switch = a_block_buf;
305 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
306 a_blockwise_copy.Run(
307 a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf);
309 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
310 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
313 c_thread_buf.Clear();
315 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
318 if constexpr(HasMainLoop)
324 a_blockwise_copy.Run(
325 a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf_switch);
329 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
331 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
335 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
336 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
338 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
340 a_block_buf = a_block_buf_switch;
342 }
while(i < (num_loop - 1));
349 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
369 template <
bool HasMainLoop,
372 typename ABlockTransfer,
373 typename AGridBuffer,
374 typename ABlockBuffer,
375 typename ABlockTransferStep,
378 typename BBlockTransfer,
379 typename BGridBuffer,
380 typename BBlockBuffer,
381 typename BBlockTransferStep,
382 typename BlockwiseGemm,
383 typename CThreadBuffer>
384 __device__
static void Run(
const AGridDesc& a_grid_desc,
385 const ABlockDesc& a_block_desc,
386 ABlockTransfer& a_blockwise_copy,
387 const AGridBuffer& a_grid_buf,
388 ABlockBuffer& a_block_buf,
389 const ABlockTransferStep& a_block_copy_step,
390 const BGridDesc& b_grid_desc,
391 const BBlockDesc& b_block_desc,
392 BBlockTransfer& b_blockwise_copy,
393 const BGridBuffer& b_grid_buf,
394 BBlockBuffer& b_block_buf,
395 const BBlockTransferStep& b_block_copy_step,
396 const BlockwiseGemm& blockwise_gemm,
397 CThreadBuffer& c_thread_buf,
400 constexpr
auto b_block_origin_idx =
make_tuple(I0, I0, I0, I0, I0, I0, I0);
401 auto b_block_buf_switch = b_block_buf;
404 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
405 b_blockwise_copy.Run(
406 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf);
408 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
409 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
412 c_thread_buf.Clear();
414 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
417 if constexpr(HasMainLoop)
423 b_blockwise_copy.Run(
424 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch);
428 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
430 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
434 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
435 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
437 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
439 b_block_buf = b_block_buf_switch;
441 }
while(i < (num_loop - 1));
448 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
468 template <
bool HasMainLoop,
471 typename ABlockTransfer,
472 typename AGridBuffer,
473 typename ABlockBuffer,
474 typename ABlockTransferStep,
477 typename BBlockTransfer,
478 typename BGridBuffer,
479 typename BBlockBuffer,
480 typename BBlockTransferStep,
481 typename BlockwiseGemm,
482 typename CThreadBuffer>
483 __device__
static void Run(
const AGridDesc& a_grid_desc,
484 const ABlockDesc& a_block_desc,
485 ABlockTransfer& a_blockwise_copy,
486 const AGridBuffer& a_grid_buf,
487 ABlockBuffer& a_block_buf,
488 const ABlockTransferStep& a_block_copy_step,
489 const BGridDesc& b_grid_desc,
490 const BBlockDesc& b_block_desc,
491 BBlockTransfer& b_blockwise_copy,
492 const BGridBuffer& b_grid_buf,
493 BBlockBuffer& b_block_buf,
494 const BBlockTransferStep& b_block_copy_step,
495 const BlockwiseGemm& blockwise_gemm,
496 CThreadBuffer& c_thread_buf,
499 constexpr
auto b_block_origin_idx =
make_tuple(I0, I0, I0, I0, I0, I0, I0);
500 constexpr
auto a_block_origin_idx =
make_tuple(I0, I0, I0, I0, I0, I0, I0);
501 auto b_block_buf_switch = b_block_buf;
502 auto a_block_buf_switch = a_block_buf;
505 a_blockwise_copy.Run(
506 a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf);
507 b_blockwise_copy.Run(
508 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf);
510 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
511 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
514 c_thread_buf.Clear();
517 if constexpr(HasMainLoop)
523 a_blockwise_copy.Run(
524 a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf_switch);
525 b_blockwise_copy.Run(
526 b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch);
530 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
534 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
535 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
537 a_block_buf = a_block_buf_switch;
538 b_block_buf = b_block_buf_switch;
540 }
while(i < (num_loop - 1));
547 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
554 template <index_t NumPrefetch,
bool AEnableLds,
bool BEnableLds>
570 template <
bool HasMainLoop,
573 typename ABlockTransfer,
574 typename AGridBuffer,
575 typename ABlockBuffer,
576 typename ABlockTransferStep,
579 typename BBlockTransfer,
580 typename BGridBuffer,
581 typename BBlockBuffer,
582 typename BBlockTransferStep,
583 typename ScaleGridDesc,
584 typename ScaleGridBuffer,
585 typename BlockwiseGemm,
586 typename CThreadBuffer>
587 __device__
static void Run(
const AGridDesc& a_grid_desc,
588 const ABlockDesc& a_block_desc,
589 ABlockTransfer& a_blockwise_copy,
590 const AGridBuffer& a_grid_buf,
591 ABlockBuffer& a_block_buf,
592 const ABlockTransferStep& a_block_copy_step,
593 const BGridDesc& b_grid_desc,
594 const BBlockDesc& b_block_desc,
595 BBlockTransfer& b_blockwise_copy,
596 const BGridBuffer& b_grid_buf,
597 BBlockBuffer& b_block_buf,
598 const BBlockTransferStep& b_block_copy_step,
599 const ScaleGridDesc& scale_grid_desc,
600 const ScaleGridBuffer& scale_grid_buf,
601 const BlockwiseGemm& blockwise_gemm,
602 CThreadBuffer& c_thread_buf,
606 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
607 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
609 b_blockwise_copy.RunScaleRead(scale_grid_desc, scale_grid_buf);
611 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
612 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
615 c_thread_buf.Clear();
617 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
619 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
622 if constexpr(HasMainLoop)
628 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
632 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
634 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
638 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
639 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
641 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
642 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
645 }
while(i < (num_loop - 1));
652 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
657 template <index_t NumPrefetch>
670 template <
bool HasMainLoop,
673 typename ABlockTransfer,
674 typename AGridBuffer,
675 typename ABlockBuffer,
676 typename ABlockTransferStep,
679 typename BBlockTransfer,
680 typename BGridBuffer,
681 typename BBlockBuffer,
682 typename BBlockTransferStep,
683 typename BlockwiseGemm,
684 typename CThreadBuffer>
685 static __device__
void Run(
const AGridDesc& a_grid_desc,
686 const ABlockDesc& a_block_desc,
687 ABlockTransfer& a_blockwise_copy,
688 const AGridBuffer& a_grid_buf,
689 ABlockBuffer& a_block_buf,
690 const ABlockTransferStep& a_block_copy_step,
691 const BGridDesc& b_grid_desc,
692 const BBlockDesc& b_block_desc,
693 BBlockTransfer& b_blockwise_copy,
694 const BGridBuffer& b_grid_buf,
695 BBlockBuffer& b_block_buf,
696 const BBlockTransferStep& b_block_copy_step,
697 const BlockwiseGemm& blockwise_gemm,
698 CThreadBuffer& c_thread_buf,
702 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
703 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
705 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
706 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
709 c_thread_buf.Clear();
711 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
712 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
715 if constexpr(HasMainLoop)
721 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
725 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
727 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
731 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
732 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
734 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
735 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
738 }
while(i < (num_loop - 1));
745 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
757 template <index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto GridwiseGemmPipeline_v1_Selector()
Definition: gridwise_gemm_pipeline_v1.hpp:758
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:211
int32_t index_t
Definition: ck.hpp:298
__device__ void block_sync_lds()
Definition: synchronization.hpp:10
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:483
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:463
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:461
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:265
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:263
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:285
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:362
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:384
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:364
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:44
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:22
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:24
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:142
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:122
__host__ static constexpr __device__ bool IsSupported(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:116
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:563
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:565
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const ScaleGridDesc &scale_grid_desc, const ScaleGridBuffer &scale_grid_buf, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:587
Definition: gridwise_gemm_pipeline_v1.hpp:555
Definition: gridwise_gemm_pipeline_v1.hpp:13
__host__ static constexpr __device__ bool IsSupported(index_t)
Definition: gridwise_gemm_pipeline_v1.hpp:663
__host__ static constexpr __device__ bool CalculateHasMainLoop(index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:665
static __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, const BlockwiseGemm &blockwise_gemm, CThreadBuffer &c_thread_buf, index_t num_loop)
Definition: gridwise_gemm_pipeline_v1.hpp:685
Definition: gridwise_gemm_pipeline_v1.hpp:658
Definition: integral_constant.hpp:20