UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ > Struct Template Reference#
The Universal GEMM kernel template. More...
#include <universal_gemm_kernel.hpp>
Classes | |
struct | has_persistent_kernel |
struct | has_tile_partitioner_output_offset_impl |
struct | SplitKBatchOffset |
Public Types | |
using | TilePartitioner = remove_cvref_t< TilePartitioner_ > |
using | GemmPipeline = remove_cvref_t< GemmPipeline_ > |
using | EpiloguePipeline = remove_cvref_t< EpiloguePipeline_ > |
using | AsLayout = std::conditional_t< ALayoutIsTuple, remove_cvref_t< typename GemmPipeline::ALayout >, remove_cvref_t< tuple< typename GemmPipeline::ALayout > >> |
using | BsLayout = std::conditional_t< BLayoutIsTuple, remove_cvref_t< typename GemmPipeline::BLayout >, remove_cvref_t< tuple< typename GemmPipeline::BLayout > >> |
using | DsLayout = std::conditional_t< DLayoutIsTuple, remove_cvref_t< typename EpiloguePipeline::DsLayout >, remove_cvref_t< tuple< typename EpiloguePipeline::DsLayout > >> |
using | AsDataType = std::conditional_t< ADataTypeIsTuple, remove_cvref_t< typename GemmPipeline::ADataType >, remove_cvref_t< tuple< typename GemmPipeline::ADataType > >> |
using | BsDataType = std::conditional_t< BDataTypeIsTuple, remove_cvref_t< typename GemmPipeline::BDataType >, remove_cvref_t< tuple< typename GemmPipeline::BDataType > >> |
using | DsDataType = std::conditional_t< DDataTypeIsTuple, remove_cvref_t< typename EpiloguePipeline::DsDataType >, remove_cvref_t< tuple< typename EpiloguePipeline::DsDataType > >> |
using | ELayout = remove_cvref_t< typename GemmPipeline::CLayout > |
using | EDataType = remove_cvref_t< typename EpiloguePipeline::ODataType > |
using | ADataType = remove_cvref_t< std::tuple_element_t< I0, AsDataType > > |
using | BDataType = remove_cvref_t< std::tuple_element_t< I0, BsDataType > > |
using | KernelArgs = UniversalGemmKernelArgs< AsLayout::size(), BsLayout::size(), DsLayout::size()> |
Public Member Functions | |
template<bool U = !PersistentKernel, typename = std::enable_if_t<U>> | |
CK_TILE_DEVICE void | operator() (KernelArgs kargs) const |
template<bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void> | |
CK_TILE_DEVICE void | operator() (KernelArgs kargs) const |
Static Public Member Functions | |
static CK_TILE_HOST const std::string | GetName () |
static constexpr CK_TILE_HOST auto | GridSize (index_t M, index_t N, index_t KBatch) |
static CK_TILE_HOST auto | MaxOccupancyGridSize (const stream_config &s) -> dim3 |
Get the maximum occupancy grid size for the persistent kernel on the current device. More... | |
static CK_TILE_HOST auto | BlockSize () |
static constexpr CK_TILE_HOST KernelArgs | MakeKernelArgs (const UniversalGemmHostArgs< NumATensor, NumBTensor, NumDTensor > &hostArgs) |
static constexpr CK_TILE_HOST_DEVICE index_t | GetSmemSize () |
static CK_TILE_HOST bool | IsSupportedArgument (const KernelArgs &kargs) |
template<memory_operation_enum DstInMemOp = memory_operation_enum::set> | |
static CK_TILE_DEVICE auto | MakeGemmTensorViews (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset) |
template<typename TensorView > | |
static CK_TILE_DEVICE auto | MakeGemmPadViews (const TensorView &views) |
template<typename PadView > | |
static CK_TILE_DEVICE auto | MakeGemmTileWindows (const PadView &views, const index_t i_m, const index_t i_n) |
template<bool UseDefaultScheduler = true> | |
static CK_TILE_DEVICE void | RunGemm (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *smem_ptr_0, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
Runs single GEMM problem cooperatively by whole workgroup. More... | |
static CK_TILE_DEVICE void | RunGemm2LDS (const std::array< const ADataType *, NumATensor > &as_ptr, const std::array< const BDataType *, NumBTensor > &bs_ptr, const std::array< const void *, NumDTensor > &ds_ptr, EDataType *e_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const KernelArgs &kargs, const SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
Runs single GEMM problem cooperatively by whole workgroup. More... | |
Static Public Attributes | |
static constexpr bool | ADataTypeIsTuple |
static constexpr bool | BDataTypeIsTuple |
static constexpr bool | DDataTypeIsTuple |
static constexpr bool | ALayoutIsTuple |
static constexpr bool | BLayoutIsTuple |
static constexpr bool | DLayoutIsTuple |
static constexpr index_t | kBlockSize = GemmPipeline::BlockSize |
static constexpr bool | PersistentKernel = has_persistent_kernel::value |
static constexpr bool | has_tile_partitioner_output_offset |
static constexpr auto | I0 = number<0>() |
static constexpr auto | I1 = number<1>() |
static constexpr auto | I2 = number<2>() |
static constexpr auto | I3 = number<3>{} |
static constexpr index_t | NumATensor = AsDataType::size() |
static constexpr index_t | NumBTensor = BsDataType::size() |
static constexpr index_t | NumDTensor = DsDataType::size() |
Detailed Description
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
struct ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >
The Universal GEMM kernel template.
Overview
This class provides the generic matrix multiplication kernel template. By semantically dividing the GEMM algorithm into the following parts, we achieve a flexible, versatile and robust kernel implementation.
- Prolog - The start of the GEMM kernel implementation in the operator() function call operator, which determines the work scope of each workgroup.
- GemmPipeline - The core part, the "heart", of the matrix multiplication algorithm. This is the place where each workgroup loads data from global memory and carries out dot products.
- Epilogue - The "final" part of matrix multiplication implementation responsible for storing results to global memory. This is also the place where any additional operator fusion may take place.
Additionally, both GemmPipeline and EpiloguePipeline are parameterized with a so-called Policy, which determines all internal details of those functional parts. You can think of it this way: both the GEMM and epilogue pipelines provide the control-flow logic, which is controlled by policies. Moreover, the policy is responsible for defining all necessary data layouts and the distribution of work among threads.
- Template Parameters
-
TilePartitioner_: The type of class providing the mapping of a workgroup index to the output data tile to be calculated. It determines the workgroup-to-data relationship (in other words, which data is processed and calculated by which workgroup).
GemmPipeline_: The type of class which provides the core part of matrix multiplication. This class should provide the implementation of data loading from global memory and of block-wise matrix multiplication. You can think of it as the work done from a single workgroup's point of view.
EpiloguePipeline_: The type of class providing the final part of the matrix multiplication implementation. It is responsible for storing the results calculated by GemmPipeline to the output E tensor in global memory.
Member Typedef Documentation
◆ ADataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ADataType = remove_cvref_t<std::tuple_element_t<I0, AsDataType> > |
◆ AsDataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::AsDataType = std::conditional_t<ADataTypeIsTuple, remove_cvref_t<typename GemmPipeline::ADataType>, remove_cvref_t<tuple<typename GemmPipeline::ADataType> >> |
◆ AsLayout
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::AsLayout = std::conditional_t<ALayoutIsTuple, remove_cvref_t<typename GemmPipeline::ALayout>, remove_cvref_t<tuple<typename GemmPipeline::ALayout> >> |
◆ BDataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BDataType = remove_cvref_t<std::tuple_element_t<I0, BsDataType> > |
◆ BsDataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BsDataType = std::conditional_t<BDataTypeIsTuple, remove_cvref_t<typename GemmPipeline::BDataType>, remove_cvref_t<tuple<typename GemmPipeline::BDataType> >> |
◆ BsLayout
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::BsLayout = std::conditional_t<BLayoutIsTuple, remove_cvref_t<typename GemmPipeline::BLayout>, remove_cvref_t<tuple<typename GemmPipeline::BLayout> >> |
◆ DsDataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsDataType = std::conditional_t<DDataTypeIsTuple, remove_cvref_t<typename EpiloguePipeline::DsDataType>, remove_cvref_t<tuple<typename EpiloguePipeline::DsDataType> >> |
◆ DsLayout
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::DsLayout = std::conditional_t<DLayoutIsTuple, remove_cvref_t<typename EpiloguePipeline::DsLayout>, remove_cvref_t<tuple<typename EpiloguePipeline::DsLayout> >> |
◆ EDataType
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType> |
◆ ELayout
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::ELayout = remove_cvref_t<typename GemmPipeline::CLayout> |
◆ EpiloguePipeline
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::EpiloguePipeline = remove_cvref_t<EpiloguePipeline_> |
◆ GemmPipeline
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::GemmPipeline = remove_cvref_t<GemmPipeline_> |
◆ KernelArgs
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::KernelArgs = UniversalGemmKernelArgs<AsLayout::size(), BsLayout::size(), DsLayout::size()> |
◆ TilePartitioner
using ck_tile::UniversalGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::TilePartitioner = remove_cvref_t<TilePartitioner_> |
Member Function Documentation
◆ BlockSize()
|
inlinestatic |
◆ GetName()
|
inlinestatic |
◆ GetSmemSize()
|
inlinestaticconstexpr |
◆ GridSize()
|
inlinestaticconstexpr |
◆ IsSupportedArgument()
|
inlinestatic |
◆ MakeGemmPadViews()
|
inlinestatic |
◆ MakeGemmTensorViews()
|
inlinestatic |
◆ MakeGemmTileWindows()
|
inlinestatic |
◆ MakeKernelArgs()
|
inlinestaticconstexpr |
◆ MaxOccupancyGridSize()
|
inlinestatic |
Get the maximum occupancy grid size for the persistent kernel on the current device.
- Returns
- The maximum occupancy grid size.
- Note
- This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.
◆ operator()() [1/2]
|
inline |
◆ operator()() [2/2]
|
inline |
◆ RunGemm()
|
inlinestatic |
Runs single GEMM problem cooperatively by whole workgroup.
- Parameters
-
as_ptr: input As pointer
bs_ptr: input Bs pointer
ds_ptr: input Ds pointer
e_ptr: output E pointer
smem_ptr_0: The start memory pointer of the shared memory block.
kargs: GEMM kernel arguments
splitk_batch_offset: Utility structure used to calculate k batch.
block_idx_m: The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n: The GEMM's output N dimension tile index processed by this workgroup.
◆ RunGemm2LDS()
|
inlinestatic |
Runs single GEMM problem cooperatively by whole workgroup.
- Note
- RunGemm2LDS runs with two shared memory buffers, using the ping-pong buffer mechanism.
- Parameters
-
as_ptr: input As pointer
bs_ptr: input Bs pointer
ds_ptr: input Ds pointer
e_ptr: output E pointer
smem_ptr_0: The starting pointer of 1st shared memory block.
smem_ptr_1: The starting pointer of 2nd shared memory block.
kargs: GEMM kernel arguments
splitk_batch_offset: Utility structure used to calculate k batch.
block_idx_m: The GEMM's output M dimension tile index processed by this workgroup.
block_idx_n: The GEMM's output N dimension tile index processed by this workgroup.
Member Data Documentation
◆ ADataTypeIsTuple
|
staticconstexpr |
◆ ALayoutIsTuple
|
staticconstexpr |
◆ BDataTypeIsTuple
|
staticconstexpr |
◆ BLayoutIsTuple
|
staticconstexpr |
◆ DDataTypeIsTuple
|
staticconstexpr |
◆ DLayoutIsTuple
|
staticconstexpr |
◆ has_tile_partitioner_output_offset
|
staticconstexpr |
◆ I0
|
staticconstexpr |
◆ I1
|
staticconstexpr |
◆ I2
|
staticconstexpr |
◆ I3
|
staticconstexpr |
◆ kBlockSize
|
staticconstexpr |
◆ NumATensor
|
staticconstexpr |
◆ NumBTensor
|
staticconstexpr |
◆ NumDTensor
|
staticconstexpr |
◆ PersistentKernel
|
staticconstexpr |
The documentation for this struct was generated from the following file:
- /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp