/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp Source File#
block_to_ctile_map.hpp
Go to the documentation of this file.
259 struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
548 __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
Definition: ck.hpp:266
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
__host__ constexpr __device__ auto make_merge_transform(const LowLengths &low_lengths)
Definition: multi_index_transform_helper.hpp:55
__host__ constexpr __device__ auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition: tensor_adaptor.hpp:425
__host__ __device__ bool DefaultValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim)
Definition: block_to_ctile_map.hpp:834
__host__ constexpr __device__ auto make_pass_through_transform(const LowLength &low_length)
Definition: multi_index_transform_helper.hpp:12
__host__ constexpr __device__ auto make_insert_transform(const UpperIndex &up_idx)
Definition: multi_index_transform_helper.hpp:104
__host__ constexpr __device__ auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition: multi_index_transform_helper.hpp:90
__host__ constexpr __device__ auto chain_tensor_adaptors(const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
Definition: tensor_adaptor.hpp:245
Simple tile mapping which creates 3D grid of block of threads.
Definition: block_to_ctile_map.hpp:976
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition: block_to_ctile_map.hpp:1003
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition: block_to_ctile_map.hpp:996
constexpr __device__ auto CalculateBottomIndex(const TopIdx &) const
Definition: block_to_ctile_map.hpp:990
__host__ constexpr __device__ auto CalculateGridSize(index_t M, index_t N, index_t k_split) const
Definition: block_to_ctile_map.hpp:981
__host__ __device__ BlockToCTileMap_3DGrid_KSplit()=default
Definition: block_to_ctile_map.hpp:1419
__host__ __device__ uint32_t get_sk_tiles() const
Definition: block_to_ctile_map.hpp:1591
MDiv k_iters_per_tile
Definition: block_to_ctile_map.hpp:1434
__host__ __device__ uint32_t get_workspace_size(uint32_t acc_element_bytes) const
Definition: block_to_ctile_map.hpp:1699
__host__ __device__ uint32_t get_tile_intersections(uint32_t tiles_, const MDiv &equiv_tiles_) const
Definition: block_to_ctile_map.hpp:1704
MDiv equiv_tiles_little
Definition: block_to_ctile_map.hpp:1436
__device__ uint32_t get_acc_buffer_offset_from_block(uint32_t block_idx_) const
Definition: block_to_ctile_map.hpp:1762
uint32_t dp_start_block_idx
Definition: block_to_ctile_map.hpp:1430
static constexpr uint32_t KPerBlock
Definition: block_to_ctile_map.hpp:1423
static constexpr uint32_t NPerBlock
Definition: block_to_ctile_map.hpp:1422
static constexpr uint32_t min_k_iters_per_sk_block
Definition: block_to_ctile_map.hpp:1420
__host__ __device__ uint32_t get_sk_total_iters() const
Definition: block_to_ctile_map.hpp:1584
__host__ __device__ uint32_t get_total_acc_buffers() const
Definition: block_to_ctile_map.hpp:1721
__host__ __device__ index_t get_grid_dims() const
Definition: block_to_ctile_map.hpp:1598
__device__ uint32_t get_tile_idx(uint32_t iter) const
Definition: block_to_ctile_map.hpp:1649
__host__ __device__ uint32_t get_workspace_size_for_semaphore() const
Definition: block_to_ctile_map.hpp:1694
__device__ void get_block_itr(uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const
Definition: block_to_ctile_map.hpp:1616
uint32_t k_iters_per_big_block
Definition: block_to_ctile_map.hpp:1432
uint32_t sk_num_big_blocks
Definition: block_to_ctile_map.hpp:1429
MDiv equiv_tiles_big
Definition: block_to_ctile_map.hpp:1435
__device__ uint32_t get_acc_buffer_offset_from_tile(uint32_t tile_idx_) const
Definition: block_to_ctile_map.hpp:1736
__device__ auto tile_to_spatial(uint32_t tile_idx, uint32_t m, uint32_t n) const
Definition: block_to_ctile_map.hpp:1657
__host__ static constexpr __device__ index_t CalculateGridSize(index_t M, index_t N)
Definition: block_to_ctile_map.hpp:1577
__host__ __device__ uint32_t get_workspace_size_for_acc(uint32_t acc_element_bytes) const
Definition: block_to_ctile_map.hpp:1686
__device__ uint32_t get_current_iter_length(uint32_t iter_start, uint32_t iter_end, uint32_t total_iter_length) const
Definition: block_to_ctile_map.hpp:1638
static constexpr uint32_t tile_swizzle_sub_m
Definition: block_to_ctile_map.hpp:1424
uint32_t reduction_start_block_idx
Definition: block_to_ctile_map.hpp:1431
__host__ __device__ uint32_t get_tiles_cover_sk_block(uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const
Definition: block_to_ctile_map.hpp:1714
__host__ __device__ BlockToCTileMap_GemmStreamK_v2(uint32_t m, uint32_t n, uint32_t k, uint32_t grid_size=1, uint32_t streamk_sel=1, StreamKReductionStrategy reduction_strategy_=StreamKReductionStrategy::Atomic)
Definition: block_to_ctile_map.hpp:1440
uint32_t sk_num_blocks
Definition: block_to_ctile_map.hpp:1428
__device__ uint32_t get_block_idx() const
Definition: block_to_ctile_map.hpp:1609
__device__ void get_tile_idx_with_offset(uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const
Definition: block_to_ctile_map.hpp:1652
StreamKReductionStrategy reduction_strategy
Definition: block_to_ctile_map.hpp:1437
static constexpr uint32_t MPerBlock
Definition: block_to_ctile_map.hpp:1421
Definition: block_to_ctile_map.hpp:1021
uint32_t k_iters_per_big_block
Definition: block_to_ctile_map.hpp:1035
__host__ __device__ uint32_t get_workspace_size(uint32_t acc_element_bytes) const
Definition: block_to_ctile_map.hpp:1326
__device__ uint32_t get_acc_buffer_offset_from_block(uint32_t block_idx_) const
Definition: block_to_ctile_map.hpp:1389
__host__ __device__ uint32_t get_sk_total_iters() const
Definition: block_to_ctile_map.hpp:1212
__host__ __device__ uint32_t get_tiles_cover_sk_block(uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const
Definition: block_to_ctile_map.hpp:1341
static constexpr uint32_t MPerBlock
Definition: block_to_ctile_map.hpp:1023
uint32_t dp_start_block_idx
Definition: block_to_ctile_map.hpp:1033
__host__ __device__ uint32_t get_sk_tiles() const
Definition: block_to_ctile_map.hpp:1219
static constexpr uint32_t KPerBlock
Definition: block_to_ctile_map.hpp:1025
__host__ __device__ uint32_t get_total_acc_buffers() const
Definition: block_to_ctile_map.hpp:1348
__device__ uint32_t get_current_iter_length(uint32_t iter_start, uint32_t iter_end, uint32_t total_iter_length) const
Definition: block_to_ctile_map.hpp:1265
static constexpr uint32_t NPerBlock
Definition: block_to_ctile_map.hpp:1024
__device__ uint32_t get_acc_buffer_offset_from_tile(uint32_t tile_idx_) const
Definition: block_to_ctile_map.hpp:1363
uint32_t reduction_start_block_idx
Definition: block_to_ctile_map.hpp:1034
__host__ __device__ uint32_t get_workspace_size_for_acc(uint32_t acc_element_bytes) const
Definition: block_to_ctile_map.hpp:1313
MDiv k_iters_per_tile
Definition: block_to_ctile_map.hpp:1037
__device__ void get_tile_idx_with_offset(uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const
Definition: block_to_ctile_map.hpp:1279
static constexpr uint32_t tile_swizzle_sub_m
Definition: block_to_ctile_map.hpp:1027
BlockToCTileMap_GemmStreamK(uint32_t m, uint32_t n, uint32_t k, uint32_t num_cu, uint32_t occupancy, uint32_t sk_blocks=0xffffffff)
Definition: block_to_ctile_map.hpp:1045
static constexpr StreamKReductionStrategy ReductionStrategy
Definition: block_to_ctile_map.hpp:1026
__device__ auto tile_to_spatial(uint32_t tile_idx, uint32_t m, uint32_t n) const
Definition: block_to_ctile_map.hpp:1284
__device__ uint32_t get_tile_idx(uint32_t iter) const
Definition: block_to_ctile_map.hpp:1276
__host__ __device__ uint32_t get_tile_intersections(uint32_t tiles_, const MDiv &eqav_tiles_) const
Definition: block_to_ctile_map.hpp:1331
__device__ uint32_t get_block_idx() const
Definition: block_to_ctile_map.hpp:1236
__device__ void get_block_itr(uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const
Definition: block_to_ctile_map.hpp:1243
MDiv eqav_tiles_little
Definition: block_to_ctile_map.hpp:1039
uint32_t sk_num_blocks
Definition: block_to_ctile_map.hpp:1031
MDiv eqav_tiles_big
Definition: block_to_ctile_map.hpp:1038
static constexpr uint32_t min_k_iters_per_sk_block
Definition: block_to_ctile_map.hpp:1022
uint32_t sk_num_big_blocks
Definition: block_to_ctile_map.hpp:1032
__host__ __device__ dim3 get_grid_dims() const
Definition: block_to_ctile_map.hpp:1226
__host__ __device__ uint32_t get_workspace_size_for_semaphore() const
Definition: block_to_ctile_map.hpp:1321
Definition: block_to_ctile_map.hpp:270
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:297
__host__ static constexpr __device__ index_t CalculateGridSize(index_t M, index_t N)
Definition: block_to_ctile_map.hpp:282
__host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt()=default
static constexpr auto I1
Definition: block_to_ctile_map.hpp:272
__host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition: block_to_ctile_map.hpp:291
static constexpr auto I0
Definition: block_to_ctile_map.hpp:271
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition: block_to_ctile_map.hpp:383
__host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt(index_t M, index_t N, index_t M01=8)
Definition: block_to_ctile_map.hpp:275
Definition: block_to_ctile_map.hpp:719
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:754
static constexpr auto I2
Definition: block_to_ctile_map.hpp:722
static constexpr auto I0
Definition: block_to_ctile_map.hpp:720
__host__ BlockToCTileMap_KSplit_M00_N00_M01_N01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1, index_t N01=1, index_t KSplit=1)
Definition: block_to_ctile_map.hpp:727
__host__ BlockToCTileMap_KSplit_M00_N00_M01_N01()=default
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:772
static constexpr auto I3
Definition: block_to_ctile_map.hpp:723
static constexpr auto I1
Definition: block_to_ctile_map.hpp:721
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition: block_to_ctile_map.hpp:763
__host__ constexpr __device__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:740
Definition: block_to_ctile_map.hpp:540
__host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=8, index_t KSplit=1)
Definition: block_to_ctile_map.hpp:548
__host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt()=default
static constexpr auto I0
Definition: block_to_ctile_map.hpp:541
static constexpr auto I1
Definition: block_to_ctile_map.hpp:542
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition: block_to_ctile_map.hpp:593
constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:555
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:566
static constexpr auto I2
Definition: block_to_ctile_map.hpp:543
static constexpr auto I3
Definition: block_to_ctile_map.hpp:544
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition: block_to_ctile_map.hpp:599
Definition: block_to_ctile_map.hpp:616
__host__ __device__ BlockToCTileMap_M00_N00_M01_N01()=default
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:660
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:645
__host__ __device__ BlockToCTileMap_M00_N00_M01_N01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1, index_t N01=1)
Definition: block_to_ctile_map.hpp:624
constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:631
static constexpr auto I0
Definition: block_to_ctile_map.hpp:617
static constexpr auto I3
Definition: block_to_ctile_map.hpp:620
static constexpr auto I1
Definition: block_to_ctile_map.hpp:618
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition: block_to_ctile_map.hpp:651
static constexpr auto I2
Definition: block_to_ctile_map.hpp:619
Definition: block_to_ctile_map.hpp:122
__host__ constexpr __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition: block_to_ctile_map.hpp:245
__host__ static constexpr __device__ index_t CalculateGridSize(index_t M, index_t N)
Definition: block_to_ctile_map.hpp:157
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt()=default
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt &&)=default
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt & operator=(BlockToCTileMap_M00_N0_M01Adapt &&)=default
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt &)=default
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt & operator=(const BlockToCTileMap_M00_N0_M01Adapt &)=default
static constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n)
Definition: block_to_ctile_map.hpp:166
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=8)
Definition: block_to_ctile_map.hpp:150
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01=8)
Definition: block_to_ctile_map.hpp:138
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:178
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition: block_to_ctile_map.hpp:172
Definition: block_to_ctile_map.hpp:260
Definition: block_to_ctile_map.hpp:24
constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:38
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:66
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1)
Definition: block_to_ctile_map.hpp:32
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:51
__host__ constexpr __device__ BlockToCTileMap_M00_N0_M01()=default
__host__ constexpr __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition: block_to_ctile_map.hpp:57
__host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition: block_to_ctile_map.hpp:450
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt()=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t N01=8)
Definition: block_to_ctile_map.hpp:428
static constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n)
Definition: block_to_ctile_map.hpp:444
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt & operator=(const BlockToCTileMap_N00_M0_N01Adapt &)=default
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition: block_to_ctile_map.hpp:524
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt & operator=(BlockToCTileMap_N00_M0_N01Adapt &&)=default
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:456
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(const BlockToCTileMap_N00_M0_N01Adapt &)=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(BlockToCTileMap_N00_M0_N01Adapt &&)=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(index_t M, index_t N, index_t N01=8)
Definition: block_to_ctile_map.hpp:417
__host__ static constexpr __device__ index_t CalculateGridSize(index_t M, index_t N)
Definition: block_to_ctile_map.hpp:435
Definition: block_to_ctile_map.hpp:398
Definition: magic_division.hpp:208
__host__ __device__ void divmod(uint32_t dividend_, uint32_t divisor_, uint32_t "ient_, uint32_t &remainder_) const
Definition: magic_division.hpp:230
Definition: magic_division.hpp:166
__host__ __device__ void divmod(uint32_t dividend_, uint32_t "ient_, uint32_t &remainder_) const
Definition: magic_division.hpp:198
__host__ __device__ uint32_t div(uint32_t dividend_) const
Definition: magic_division.hpp:192
__host__ static constexpr __device__ T Max()
Definition: numeric_limits.hpp:311
Definition: block_to_ctile_map.hpp:919
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition: block_to_ctile_map.hpp:939
index_t tile_offset_
Definition: block_to_ctile_map.hpp:959
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:946
UnderlyingBlockToCTileMap block_to_ctile_map_
Definition: block_to_ctile_map.hpp:957
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:932
__host__ __device__ OffsettedBlockToCTileMap2(UnderlyingBlockToCTileMap block_to_ctile_map, index_t group_offset, index_t tile_offset)
Definition: block_to_ctile_map.hpp:922
UnderlyingBlockToCTileMap underlying_type
Definition: block_to_ctile_map.hpp:920
index_t group_offset_
Definition: block_to_ctile_map.hpp:958
__device__ void UpdateTileOffset(index_t offset)
Definition: block_to_ctile_map.hpp:956
__host__ constexpr __device__ index_t CalculateGridSize(index_t M, index_t N) const
Definition: block_to_ctile_map.hpp:951
Definition: block_to_ctile_map.hpp:871
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition: block_to_ctile_map.hpp:890
__host__ constexpr __device__ index_t CalculateGridSize(index_t M, index_t N) const
Definition: block_to_ctile_map.hpp:908
constexpr __host__ bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:897
constexpr __host__ index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition: block_to_ctile_map.hpp:903
index_t block_start_
Definition: block_to_ctile_map.hpp:914
__host__ constexpr __device__ auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition: block_to_ctile_map.hpp:883
__host__ __device__ OffsettedBlockToCTileMap()=default
__host__ __device__ OffsettedBlockToCTileMap(UnderlyingBlockToCTileMap block_to_ctile_map, index_t block_start)
Definition: block_to_ctile_map.hpp:875
UnderlyingBlockToCTileMap underlying_type
Definition: block_to_ctile_map.hpp:872
UnderlyingBlockToCTileMap block_to_ctile_map_
Definition: block_to_ctile_map.hpp:913
Definition: sequence.hpp:43
Definition: integral_constant.hpp:20