/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp Source File

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_tile_partitioner.hpp Source File

Go to the documentation of this file.

 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
  
 #pragma once
  
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
  
 namespace ck_tile {
  
 /// Common base of the two StreamKTilePartitioner specializations declared in this file.
 ///
 /// Exposes compile-time tile constants read from BlockGemmShapeType and declares a set of
 /// index_t queries keyed on cta_idx, iteration indices and tile indices. Only declarations
 /// appear here; none of the member functions is defined in this block.
 /// NOTE(review): the definitions are presumably provided by
 /// streamk_gemm_tile_partitioner_impl.hpp, which this header includes at the bottom - confirm.
 ///
 /// @tparam BlockGemmShapeType    Type providing the static members kM, kN and kK.
 /// @tparam ReductionStrategyType Reduction strategy; defaults to
 ///                               StreamKReductionStrategy::Atomic.
 template <typename BlockGemmShapeType,
           StreamKReductionStrategy ReductionStrategyType = StreamKReductionStrategy::Atomic>
 struct StreamKTilePartitionerBase
 {
  
     // Per-block tile extents, copied from BlockGemmShapeType::kM / kN / kK.
     static constexpr index_t MPerBlock                          = BlockGemmShapeType::kM;
     static constexpr index_t NPerBlock                          = BlockGemmShapeType::kN;
     static constexpr index_t KPerBlock                          = BlockGemmShapeType::kK;
     // The reduction strategy this instantiation was given.
     static constexpr StreamKReductionStrategy ReductionStrategy = ReductionStrategyType;
     // atomic_add when the strategy is Atomic; set for every other strategy.
     static constexpr auto MemoryOperation = (ReductionStrategy == StreamKReductionStrategy::Atomic)
                                                 ? memory_operation_enum::atomic_add
                                                 : memory_operation_enum::set;
  
     // Constructs the partitioner from four index_t values.
     // NOTE(review): m, n and k are presumably the GEMM problem dimensions and grid the number
     // of workgroups to partition the work over - confirm against the definition.
     StreamKTilePartitionerBase(index_t m, index_t n, index_t k, index_t grid);
  
     // NOTE(review): presumably the size of the buffer holding partial accumulator results,
     // with acc_element_bytes the size in bytes of one accumulator element - confirm.
     CK_TILE_HOST_DEVICE index_t get_partials_buffer_size(index_t acc_element_bytes) const noexcept;
  
     // NOTE(review): presumably the size of the buffer holding synchronization flags - confirm.
     CK_TILE_HOST_DEVICE index_t get_flags_buffer_size() const noexcept;
  
     // This label has no effect: members of a struct are public by default and no other
     // access specifier precedes it.
     public:
     // NOTE(review): presumably the first iteration assigned to workgroup cta_idx - confirm.
     CK_TILE_DEVICE index_t get_start_iter(index_t cta_idx) const noexcept;
  
     // Writes its results through the iter_start and iter_end reference parameters.
     // NOTE(review): presumably the iteration range owned by workgroup cta_idx; whether
     // iter_end is inclusive is not visible here - confirm.
     CK_TILE_DEVICE void
     get_iter_boundaries(index_t& iter_start, index_t& iter_end, index_t cta_idx) const noexcept;
  
     // NOTE(review): presumably the 1D index of the tile that contains iter_start - confirm.
     CK_TILE_DEVICE index_t get_tile_index(index_t iter_start) const noexcept;
  
     // Writes its results through the tile_iter_start and tile_iter_end reference parameters.
     // NOTE(review): presumably the iteration range covered by tile tile_idx - confirm.
     CK_TILE_DEVICE void get_tile_boundaries(index_t& tile_iter_start,
                                             index_t& tile_iter_end,
                                             index_t tile_idx) const noexcept;
  
     // Static: depends only on its arguments, not on partitioner state.
     // NOTE(review): presumably iter_start expressed relative to tile_iter_start - confirm.
     CK_TILE_DEVICE static index_t get_local_iter(index_t iter_start,
                                                  index_t tile_iter_start) noexcept;
  
     // Static: depends only on its arguments, not on partitioner state.
     // NOTE(review): presumably the tile-relative end of the workgroup's iteration range,
     // limited by the tile's own end - confirm.
     CK_TILE_DEVICE static index_t
     get_local_iter_end(index_t tile_iter_start, index_t iter_end, index_t tile_iter_end) noexcept;
  
     // NOTE(review): presumably the position of workgroup cta_idx among the workgroups that
     // contribute to the tile starting at tile_iter_start - confirm.
     CK_TILE_DEVICE index_t get_tile_local_cta_index(index_t tile_iter_start,
                                                     index_t cta_idx) const noexcept;
  
     // Returns a pair of index_t values for the given 1D tile index.
     // NOTE(review): presumably the 2D (M, N) tile coordinate in the output; the order of
     // the two elements is not visible here - confirm.
     CK_TILE_DEVICE auto
     get_output_tile_index(index_t tile_idx) const noexcept -> tuple<index_t, index_t>;
  
     // NOTE(review): presumably the combined size of the partials and flags buffers for the
     // given accumulator element size - confirm.
     CK_TILE_HOST_DEVICE index_t get_workspace_size(index_t acc_element_bytes) const noexcept;
  
     // The getters below take no arguments and return index_t.
     // NOTE(review): each presumably returns the data member of the matching name
     // (num_tiles_, grid_, dp_tiles_, ...) - confirm against the definitions.
     CK_TILE_HOST_DEVICE index_t get_num_tiles() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_grid() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_dp_tiles() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_sk_tiles() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_sk_ctas() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_total_sk_iters() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_iters_per_tile() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_iters_per_sk_cta() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_extra_iters() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_total_dp_iters() const noexcept;
  
     CK_TILE_HOST_DEVICE index_t get_n() const noexcept;
  
     // Marked CK_TILE_HOST only, unlike the getters above.
     // NOTE(review): presumably an estimate of how many workgroups contribute to one tile -
     // confirm.
     CK_TILE_HOST index_t estimate_num_wgs_per_tile() const noexcept;
  
     // State accessible to derived partitioners. None of these has an in-class initializer.
     protected:
     index_t num_tiles_;
     index_t grid_;
     index_t dp_tiles_;
  
     // State reachable from outside this class only through the member functions above.
     // full_tiles_ is the only member with an in-class default (1); no getter for it is
     // declared in this class.
     private:
     index_t full_tiles_ = 1;
     index_t sk_tiles_;
     index_t sk_ctas_;
     index_t total_sk_iters_;
     index_t iters_per_tile_;
     index_t iters_per_sk_cta_;
     index_t extra_iters_;
     index_t total_dp_iters_;
     index_t n_;
 };
  
 /// Primary template of the Stream-K tile partitioner; declared here without a definition.
 /// This file provides partial specializations for Persistent == true and
 /// Persistent == false.
 ///
 /// @tparam BlockGemmShapeType    Block GEMM shape type.
 /// @tparam ReductionStrategyType Reduction strategy.
 /// @tparam Persistent            Selects which partial specialization is used.
 template <typename BlockGemmShapeType,
           StreamKReductionStrategy ReductionStrategyType,
           bool Persistent>
 struct StreamKTilePartitioner;
  
 /// Partial specialization of StreamKTilePartitioner for Persistent == true.
 ///
 /// Derives from StreamKTilePartitionerBase (public inheritance, the default for a struct)
 /// and adds two data members plus their accessors and a grid_size() query. Only
 /// declarations appear here; the member definitions are not part of this block.
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
 struct StreamKTilePartitioner<BlockGemmShapeType, ReductionStrategyType, true>
     : StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>
 {
     // Same parameter list as the base-class constructor.
     // NOTE(review): presumably forwards m, n, k and grid to the base and then fills in the
     // members declared below - confirm against the definition.
     StreamKTilePartitioner(ck_tile::index_t m,
                            ck_tile::index_t n,
                            ck_tile::index_t k,
                            ck_tile::index_t grid);
  
     // This label has no effect: members of a struct are public by default.
     public:
     // Compile-time tag identifying this specialization as the persistent one.
     static constexpr bool PERSISTENT = true;
     // Marked CK_TILE_HOST; returns a dim3.
     // NOTE(review): presumably the launch grid dimensions for the kernel - confirm.
     CK_TILE_HOST auto grid_size() const noexcept -> dim3;
  
     // NOTE(review): presumably returns dp_tiles_per_cta_ - confirm.
     CK_TILE_HOST_DEVICE index_t get_dp_tiles_per_cta() const noexcept;
  
     // NOTE(review): presumably returns extra_dp_tiles_ - confirm.
     CK_TILE_HOST_DEVICE index_t get_extra_dp_tiles() const noexcept;
  
     // Neither member has an in-class initializer.
     // NOTE(review): names suggest the number of data-parallel tiles per workgroup and the
     // remainder tiles left after an even split - confirm.
     protected:
     index_t dp_tiles_per_cta_;
     index_t extra_dp_tiles_;
 };
  
 /// Partial specialization of StreamKTilePartitioner for Persistent == false.
 ///
 /// Derives from StreamKTilePartitionerBase (public inheritance, the default for a struct)
 /// and adds three data members plus their accessors and a grid_size() query. Only
 /// declarations appear here; the member definitions are not part of this block.
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
 struct StreamKTilePartitioner<BlockGemmShapeType, ReductionStrategyType, false>
     : StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>
 {
     // Same parameter list as the base-class constructor.
     // NOTE(review): presumably forwards m, n, k and grid to the base and then fills in the
     // members declared below - confirm against the definition.
     StreamKTilePartitioner(ck_tile::index_t m,
                            ck_tile::index_t n,
                            ck_tile::index_t k,
                            ck_tile::index_t grid);
  
     // This label has no effect: members of a struct are public by default.
     public:
     // Compile-time tag identifying this specialization as the non-persistent one.
     static constexpr bool PERSISTENT = false;
     // Marked CK_TILE_HOST; returns a dim3.
     // NOTE(review): presumably the launch grid dimensions for the kernel - confirm.
     CK_TILE_HOST auto grid_size() const noexcept -> dim3;
  
     // NOTE(review): presumably returns dp_ctas_ - confirm.
     CK_TILE_HOST_DEVICE index_t get_dp_ctas() const noexcept;
  
     // NOTE(review): presumably returns dp_start_block_idx_ - confirm.
     CK_TILE_HOST_DEVICE index_t get_dp_start_block_idx() const noexcept;
  
     // NOTE(review): presumably returns sk_start_block_idx_ - confirm.
     CK_TILE_HOST_DEVICE index_t get_sk_start_block_idx() const noexcept;
  
     // None of these members has an in-class initializer.
     // NOTE(review): names suggest the count of data-parallel workgroups and the first block
     // index of the data-parallel and Stream-K workgroup ranges - confirm.
     protected:
     index_t dp_ctas_;
     index_t dp_start_block_idx_;
     index_t sk_start_block_idx_;
 };
  
 } // namespace ck_tile
  
 #include "streamk_gemm_tile_partitioner_impl.hpp"