/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/wrapper/operations/gemm.hpp File Reference#
gemm.hpp File Reference
#include "ck/wrapper/utils/tensor_utils.hpp"
#include "ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
Go to the source code of this file.
Functions | |
template<typename DataType , index_t BlockSize, typename GemmTraits , typename ATensorType , typename BTensorType , typename CTensorType > | |
__device__ void | blockwise_gemm_xdl (const ATensorType &a_local_tile_tensor, const BTensorType &b_local_tile_tensor, CTensorType &c_reg_tensor) |
Perform blockwise gemm xdl on tensors stored in LDS. The result is stored in VGPR registers. The A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) and the B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1). More... | |
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits , typename CTensorType > | |
__host__ constexpr __device__ auto | make_blockwise_gemm_xdl_c_local_partition (CTensorType &c_local_tile_tensor) |
Create local partition per thread for C tensor. More... | |
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits > | |
__host__ constexpr __device__ auto | make_blockwise_gemm_xdl_c_vgpr () |
Create a C tensor in VGPR registers for blockwise gemm. More... | |
Function Documentation
◆ blockwise_gemm_xdl()
template<typename DataType , index_t BlockSize, typename GemmTraits , typename ATensorType , typename BTensorType , typename CTensorType >
__device__ void blockwise_gemm_xdl | ( | const ATensorType & | a_local_tile_tensor, |
const BTensorType & | b_local_tile_tensor, | ||
CTensorType & | c_reg_tensor | ||
) |
Perform blockwise gemm xdl on tensors stored in LDS. The result is stored in VGPR registers. The A data layout must be (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) and the B data layout must be (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1).
- Note
- C output Vgpr register layout (8D):
- MXdlPerWave - The number of MFMA instructions run by a single wave in the M dimension per tile.
- NXdlPerWave - The number of MFMA instructions run by a single wave in the N dimension per tile.
- MWave - Equal to 1 since this is for a single wave.
- NWave - Equal to 1 since this is for a single wave.
- NumGroupsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- NumInputsBlock - MFMA instruction internal layout (depends on the instruction size).
- GroupSize - MFMA instruction internal layout (depends on the instruction size).
- NumThreadsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- Template Parameters
-
DataType Input data types. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.
- Parameters
-
a_local_tile_tensor A tensor in LDS memory for blockwise gemm, with (MPerBlock, KPerBlock) or (K0PerBlock, MPerBlock, K1) layout. b_local_tile_tensor B tensor in LDS memory for blockwise gemm, with (NPerBlock, KPerBlock) or (K0PerBlock, NPerBlock, K1) layout. c_reg_tensor C tensor in VGPR memory for blockwise gemm.
◆ make_blockwise_gemm_xdl_c_local_partition()
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits , typename CTensorType >
|
constexpr |
Create local partition per thread for C tensor.
- Note
- C output global memory layout (8D):
- MXdlPerWave - The number of MFMA instructions run by a single wave in the M dimension.
- NXdlPerWave - The number of MFMA instructions run by a single wave in the N dimension.
- MWave - The number of waves in the M dimension per tile.
- NWave - The number of waves in the N dimension per tile.
- NumGroupsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- NumInputsBlock - MFMA instruction internal layout (depends on the instruction size).
- GroupSize - MFMA instruction internal layout (depends on the instruction size).
- NumThreadsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- Template Parameters
-
DataType Input data types. ATileLayout A tensor layout. BTileLayout B tensor layout. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.
- Parameters
-
c_local_tile_tensor C tensor in LDS memory for blockwise gemm (MPerBlock, NPerBlock) layout.
- Returns
- Partitioned C tensor for blockwise gemm.
◆ make_blockwise_gemm_xdl_c_vgpr()
template<typename DataType , typename ATileLayout , typename BTileLayout , index_t BlockSize, typename GemmTraits >
|
constexpr |
Create a C tensor in VGPR registers for blockwise gemm.
- Note
- C output Vgpr register layout (8D):
- MXdlPerWave - The number of MFMA instructions run by a single wave in the M dimension per tile.
- NXdlPerWave - The number of MFMA instructions run by a single wave in the N dimension per tile.
- MWave - Equal to 1 since this is for a single wave.
- NWave - Equal to 1 since this is for a single wave.
- NumGroupsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- NumInputsBlock - MFMA instruction internal layout (depends on the instruction size).
- GroupSize - MFMA instruction internal layout (depends on the instruction size).
- NumThreadsPerBlock - MFMA instruction internal layout (depends on the instruction size).
- Template Parameters
-
DataType Input data types. ATileLayout A tensor layout. BTileLayout B tensor layout. BlockSize Number of threads in block. GemmTraits Traits of gemm xdl operation.
- Returns
- Vgpr c tensor for blockwise gemm.