#include <grouped_gemm_kernel.hpp>
|
CK_TILE_DEVICE void | Run (const UniversalGemmKernelArgs<> &kargs, const tuple< index_t, index_t > &block_idx_2d, const index_t block_idx_z) const |
|
CK_TILE_DEVICE index_t | FindGroupId (const GemmTransKernelArg *gemm_desc_ptr, index_t block_id, index_t group_count) const |
|
template<bool U = UsePersistentKernel, typename = std::enable_if_t<!U>> |
CK_TILE_DEVICE void | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, index_t group_count) const |
|
template<bool U = UsePersistentKernel, typename = std::enable_if_t<U>, typename = void> |
CK_TILE_DEVICE void | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, const index_t group_count) const |
|
|
static CK_TILE_HOST const std::string | GetName () |
|
static CK_TILE_HOST auto | GetWorkSpaceSize (const std::vector< GroupedGemmHostArgs > &gemm_descs) -> std::size_t |
|
static CK_TILE_HOST auto | GetWorkSpaceSize (index_t group_count) -> std::size_t |
|
static CK_TILE_HOST auto | BlockSize () -> dim3 |
|
static CK_TILE_HOST auto | MaxOccupancyGridSize (const stream_config &s) -> dim3 |
| Get the maximum occupancy grid size for the persistent kernel on the current device. More...
|
|
static CK_TILE_HOST auto | GridSize (const std::vector< GroupedGemmHostArgs > &gemm_descs) |
|
static CK_TILE_HOST auto | MakeKargs (const std::vector< GroupedGemmHostArgs > &gemm_descs) -> std::vector< GemmTransKernelArg > |
|
static CK_TILE_HOST bool | IsSupportedArgument (const std::vector< GemmTransKernelArg > &kargs) |
|
static constexpr CK_TILE_HOST_DEVICE auto | GetSmemSize () -> index_t |
|
static CK_TILE_DEVICE void | RunGemmWithPipelineSelection (const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, void *smem_ptr_0, const UniversalGemmKernelArgs<> &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| Runs a single GEMM problem cooperatively by the whole workgroup. More...
|
|
static CK_TILE_DEVICE void | RunGemmWithPipelineSelection2LDS (const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const UniversalGemmKernelArgs<> &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| Runs a single GEMM problem cooperatively by the whole workgroup. More...
|
|
◆ ADataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
Specify the data type configurations for A, B, C/E.
◆ ALayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ Base
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ BDataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ BLayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ CDataType
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ CLayout
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ EpiloguePipeline
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GemmPipeline
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ Kernel
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ OffsetTile1DPartitioner
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
ALayout and ADataType are expected to be scalars, not a tuple.
BLayout and BDataType are expected to be scalars, not a tuple.
C/ELayout and C/EDataType are expected to be scalars, not a tuple.
◆ TilePartitioner
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ BlockSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ FindGroupId()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GetName()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GetSmemSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GetWorkSpaceSize() [1/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GetWorkSpaceSize() [2/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ GridSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ IsSupportedArgument()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ MakeKargs()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ MaxOccupancyGridSize()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
Get the maximum occupancy grid size for the persistent kernel on the current device.
- Returns
- The maximum occupancy grid size.
- Note
- This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.
◆ operator()() [1/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<bool U = UsePersistentKernel, typename = std::enable_if_t<U>, typename = void>
◆ operator()() [2/2]
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
template<bool U = UsePersistentKernel, typename = std::enable_if_t<!U>>
◆ Run()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ RunGemmWithPipelineSelection()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
Runs a single GEMM problem cooperatively by the whole workgroup.
- Note
- The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we don't have access to the K dimension on the host.
- Parameters
-
a_ptr | input A pointer |
b_ptr | input B pointer |
c_ptr | output C pointer |
smem_ptr_0 | The start memory pointer of the shared memory block. |
kargs | GEMM kernel arguments |
splitk_batch_offset | Utility structure used to calculate k batch. |
block_idx_m | The GEMM's output M dimension tile index processed by this workgroup. |
block_idx_n | The GEMM's output N dimension tile index processed by this workgroup. |
◆ RunGemmWithPipelineSelection2LDS()
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
static CK_TILE_DEVICE void ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemmWithPipelineSelection2LDS |
( |
const ADataType * |
a_ptr, |
|
|
const BDataType * |
b_ptr, |
|
|
CDataType * |
c_ptr, |
|
|
void *__restrict__ |
smem_ptr_0, |
|
|
void *__restrict__ |
smem_ptr_1, |
|
|
const UniversalGemmKernelArgs<> & |
kargs, |
|
|
const typename Base::SplitKBatchOffset & |
splitk_batch_offset, |
|
|
const index_t |
block_idx_m, |
|
|
const index_t |
block_idx_n |
|
) |
| |
|
inline static |
Runs a single GEMM problem cooperatively by the whole workgroup.
- Note
- The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we don't have access to the K dimension on the host.
- Parameters
-
a_ptr | input A pointer |
b_ptr | input B pointer |
c_ptr | output C pointer |
smem_ptr_0 | The start memory pointer of the shared memory block. |
smem_ptr_1 | The second start memory pointer of the shared memory block. |
kargs | GEMM kernel arguments |
splitk_batch_offset | Utility structure used to calculate k batch. |
block_idx_m | The GEMM's output M dimension tile index processed by this workgroup. |
block_idx_n | The GEMM's output N dimension tile index processed by this workgroup. |
◆ kBlockSize
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
◆ UsePersistentKernel
template<typename TilePartitioner_ , typename GemmPipeline_ , typename EpiloguePipeline_ >
constexpr bool ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::UsePersistentKernel = GemmPipeline::UsePersistentKernel |
|
static constexpr |
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp