/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp Source File
gemm_tile_partitioner.hpp
Go to the documentation of this file.
Definition: cluster_descriptor.hpp:13
constexpr CK_TILE_HOST_DEVICE auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:149
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition: type_traits.hpp:21
constexpr CK_TILE_HOST_DEVICE auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:360
Class mapping 1D block index into 2D output tile space.
Definition: gemm_tile_partitioner.hpp:229
static constexpr index_t MPerBlock
Definition: gemm_tile_partitioner.hpp:232
static CK_TILE_HOST_DEVICE auto GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock !=0 &&NPerBlock !=0)) -> index_t
Calculates GEMM kernel grid size.
Definition: gemm_tile_partitioner.hpp:250
static constexpr index_t KPerBlock
Definition: gemm_tile_partitioner.hpp:234
static CK_TILE_HOST_DEVICE auto GetLoopNum(index_t K) noexcept -> index_t
Calculate number of loop iterations over GEMM's K dimension.
Definition: gemm_tile_partitioner.hpp:263
CK_TILE_HOST_DEVICE GemmSpatiallyLocalTilePartitioner() noexcept=delete
remove_cvref_t< BlockGemmShapeType > BlockGemmShape
Definition: gemm_tile_partitioner.hpp:230
static constexpr index_t NPerBlock
Definition: gemm_tile_partitioner.hpp:233
CK_TILE_DEVICE auto GetOutputTileIndex(index_t block_1d_id) noexcept -> const tuple< index_t, index_t >
Calculate workgroup 1D index mapping into 2D output C-tile space.
Definition: gemm_tile_partitioner.hpp:275
Class providing 1D WGP index mapping into 2D output C-tile space.
Definition: gemm_tile_partitioner.hpp:89
CK_TILE_HOST_DEVICE GemmTile1DPartitioner() noexcept=delete
static CK_TILE_HOST_DEVICE auto GetLoopNum(index_t K) noexcept -> index_t
Calculate number of loop iterations over GEMM's K dimension.
Definition: gemm_tile_partitioner.hpp:130
static CK_TILE_DEVICE auto GetOutputTileIndex(index_t blockIdx) noexcept -> const tuple< index_t, index_t >
Calculate workgroup 1D index mapping into 2D output C-tile space.
Definition: gemm_tile_partitioner.hpp:142
remove_cvref_t< BlockGemmShape_ > BlockGemmShape
Definition: gemm_tile_partitioner.hpp:90
static constexpr index_t MPerBlock
Definition: gemm_tile_partitioner.hpp:92
static constexpr index_t NPerBlock
Definition: gemm_tile_partitioner.hpp:93
static CK_TILE_HOST_DEVICE auto GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock !=0 &&NPerBlock !=0)) -> index_t
Calculates GEMM kernel grid size.
Definition: gemm_tile_partitioner.hpp:117
static constexpr index_t KPerBlock
Definition: gemm_tile_partitioner.hpp:94
Class providing 2D workgroup index mapping into 2D output GEMM C-tile space.
Definition: gemm_tile_partitioner.hpp:22
static CK_TILE_DEVICE auto GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple< index_t, index_t >
The function returns 2D output tile space.
Definition: gemm_tile_partitioner.hpp:74
static CK_TILE_HOST auto GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock !=0 &&NPerBlock !=0)) -> dim3
Calculates GEMM kernel grid size.
Definition: gemm_tile_partitioner.hpp:41
remove_cvref_t< BlockGemmShapeType > BlockGemmShape
Definition: gemm_tile_partitioner.hpp:23
static CK_TILE_HOST_DEVICE auto GetLoopNum(index_t K) noexcept -> index_t
Calculate number of loop iterations over GEMM's K dimension.
Definition: gemm_tile_partitioner.hpp:54
static constexpr index_t NPerBlock
Definition: gemm_tile_partitioner.hpp:26
static constexpr index_t KPerBlock
Definition: gemm_tile_partitioner.hpp:27
static constexpr index_t MPerBlock
Definition: gemm_tile_partitioner.hpp:25
CK_TILE_HOST_DEVICE GemmTile2DPartitioner() noexcept=delete
GemmTile1DPartitioner::GetOutputTileIndex's std::false specialization, checking expression validity i...
Definition: gemm_tile_partitioner.hpp:161
Struct used to calculate offset tile indices.
Definition: gemm_tile_partitioner.hpp:184
static CK_TILE_DEVICE auto GetOffsetedTileIndex(index_t block_start, index_t M, index_t N) noexcept -> const tuple< index_t, index_t >
The function subtracts the block's start (offset) from 1D raw-indexes.
Definition: gemm_tile_partitioner.hpp:192
static CK_TILE_DEVICE auto GetOffsetedTileIndex(index_t block_start, index_t M, index_t N, index_t block_idx) noexcept -> const tuple< index_t, index_t >
The function subtracts the block's start (offset) from a given block index.
Definition: gemm_tile_partitioner.hpp:208
Stream-K tile partitioner that dynamically balances work across workgroups.
Definition: gemm_tile_partitioner.hpp:388
CK_TILE_HOST_DEVICE uint32_t GetTileIntersections(uint32_t tiles_, const mdiv &equiv_tiles_) const noexcept
Get location of intersection of tiles for reduction.
Definition: gemm_tile_partitioner.hpp:710
CK_TILE_HOST_DEVICE uint32_t GetNumTileK() const noexcept
Definition: gemm_tile_partitioner.hpp:802
uint32_t k_iters_per_big_block
Definition: gemm_tile_partitioner.hpp:808
CK_TILE_HOST_DEVICE uint32_t GetSkTotalIters() const noexcept
Get total number of iterations for sk tiles.
Definition: gemm_tile_partitioner.hpp:628
CK_TILE_HOST_DEVICE StreamKTilePartitioner() noexcept=delete
static constexpr uint32_t MPerBlock
Definition: gemm_tile_partitioner.hpp:391
CK_TILE_HOST_DEVICE uint32_t GetNumTileM() const noexcept
Definition: gemm_tile_partitioner.hpp:800
CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromBlock(uint32_t block_idx_) const noexcept
Calculate offset based on block_idx index for big/little streamk blocks.
Definition: gemm_tile_partitioner.hpp:778
CK_TILE_DEVICE void GetTileIdxWithOffset(uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const noexcept
Get index of tile during a specified iteration.
Definition: gemm_tile_partitioner.hpp:673
uint32_t sk_num_blocks
Definition: gemm_tile_partitioner.hpp:804
mdiv equiv_tiles_little
Definition: gemm_tile_partitioner.hpp:812
CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromTile(uint32_t tile_idx_) const noexcept
Calculate offset based on tile index for big/little tiles.
Definition: gemm_tile_partitioner.hpp:750
CK_TILE_HOST_DEVICE uint32_t GetNumTileN() const noexcept
Definition: gemm_tile_partitioner.hpp:801
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSize(uint32_t acc_element_bytes) const noexcept
Calculates the total buffer space needed for accumulation and the semaphore.
Definition: gemm_tile_partitioner.hpp:702
static constexpr uint32_t NPerBlock
Definition: gemm_tile_partitioner.hpp:392
static constexpr uint32_t KPerBlock
Definition: gemm_tile_partitioner.hpp:393
CK_TILE_HOST_DEVICE uint32_t GetTilesCoverSkBlock(uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const noexcept
Calculate the number of tiles needed for the number of sk blocks.
Definition: gemm_tile_partitioner.hpp:723
static CK_TILE_HOST_DEVICE auto GetLoopNum(uint32_t K) noexcept -> uint32_t
Calculate number of loop iterations over K dimension for given work unit.
Definition: gemm_tile_partitioner.hpp:563
mdiv equiv_tiles_big
Definition: gemm_tile_partitioner.hpp:811
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForSemaphore() const noexcept
Calculates the buffer space needed for the semaphore.
Definition: gemm_tile_partitioner.hpp:694
CK_TILE_HOST auto GridSize() const noexcept -> dim3
Calculate optimal grid size for Stream-K.
Definition: gemm_tile_partitioner.hpp:550
CK_TILE_HOST_DEVICE uint32_t GetSkTiles() const noexcept
Get total number of sk tiles.
Definition: gemm_tile_partitioner.hpp:638
CK_TILE_DEVICE auto GetOutputTileIndex(uint32_t tile_idx) const noexcept -> tuple< uint32_t, uint32_t >
Get output tile index for standard 2D mapping (compatibility)
Definition: gemm_tile_partitioner.hpp:572
uint32_t sk_num_big_blocks
Definition: gemm_tile_partitioner.hpp:805
uint32_t dp_start_block_idx
Definition: gemm_tile_partitioner.hpp:806
CK_TILE_DEVICE void GetBlockItr(uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const noexcept
Get work range for a given block ID.
Definition: gemm_tile_partitioner.hpp:603
CK_TILE_DEVICE uint32_t GetCurrentIterLength(uint32_t iter_start, uint32_t iter_end, uint32_t total_iter_length) const noexcept
Get length of loop iterations for stream-k loop.
Definition: gemm_tile_partitioner.hpp:648
mdiv k_iters_per_tile
Definition: gemm_tile_partitioner.hpp:810
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForAcc(uint32_t acc_element_bytes) const noexcept
Calculates the buffer space needed for accumulation.
Definition: gemm_tile_partitioner.hpp:683
CK_TILE_HOST_DEVICE uint32_t GetTotalAccBuffers() const noexcept
Calculate the amount of total accumulation buffers required for stream-k.
Definition: gemm_tile_partitioner.hpp:733
BlockGemmShapeType BlockGemmShape
Definition: gemm_tile_partitioner.hpp:389
CK_TILE_DEVICE uint32_t GetTileIdx(uint32_t iter) const noexcept
Get index of tile during a specified iteration.
Definition: gemm_tile_partitioner.hpp:664
uint32_t reduction_start_block_idx
Definition: gemm_tile_partitioner.hpp:807
Definition: magic_div.hpp:228
CK_TILE_HOST_DEVICE void divmod(uint32_t dividend_, uint32_t divisor_, uint32_t &quotient_, uint32_t &remainder_) const
Definition: magic_div.hpp:250
Definition: magic_div.hpp:186
CK_TILE_HOST_DEVICE void divmod(uint32_t dividend_, uint32_t &quotient_, uint32_t &remainder_) const
Definition: magic_div.hpp:218
CK_TILE_HOST_DEVICE uint32_t div(uint32_t dividend_) const
Definition: magic_div.hpp:212
Definition: tuple.hpp:192