// Composable Kernel: include/ck_tile/core/algorithm/static_encoding_pattern.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
  
 #pragma once
  
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/container/sequence.hpp"
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
  
 namespace ck_tile {
  
 enum struct tile_distribution_pattern
 {
     thread_raked,
     warp_raked,
     block_raked,
 };
  
 // Empty common base of every TileDistributionEncodingPattern2D in this file.
 // It declares no members of its own.
 // NOTE(review): presumably exists so that pattern types can be recognised by
 // their base class - confirm against the call sites.
 struct TileDistributionEncodingPattern
 {
 };
  
 // Primary template of the 2D tile distribution encoding pattern.
 //
 // This generic definition has an empty body; the members that build a tile
 // distribution are provided by the partial specializations on
 // DistributionPattern that follow in this file.
 //
 // Template parameters, as used by those specializations:
 //   BlockSize           - divided by get_warp_size() to obtain the number of warps.
 //   YPerTile, XPerTile  - tile extents along Y and X.
 //   VecSize             - requested per-thread vector length along X; XPerTile
 //                         must be a multiple of it.
 //   DistributionPattern - selects the specialization.
 //   NumWaveGroups       - defaults to 1; among the specializations in this file
 //                         only the thread_raked one reads it.
 template <index_t BlockSize,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize,
           tile_distribution_pattern DistributionPattern,
           index_t NumWaveGroups = 1>
 struct TileDistributionEncodingPattern2D : public TileDistributionEncodingPattern
 {
 };
  
 // Thread raked
 template <index_t BlockSize,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
 struct TileDistributionEncodingPattern2D<BlockSize,
                                          YPerTile,
                                          XPerTile,
                                          VecSize,
                                          tile_distribution_pattern::thread_raked,
                                          NumWaveGroups> : public TileDistributionEncodingPattern
 {
  
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
     static constexpr index_t warp_size  = get_warp_size();
     static constexpr index_t num_warps  = BlockSize / get_warp_size();
     static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
     static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
     static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
  
     // # of rows in Y dim accessed by single wavefront in one iteration
     static constexpr index_t Y1 = warp_size / X0;
     static_assert(X0 * Y1 == warp_size, "X0 * Y1 must cover whole wavefront!");
  
     static constexpr index_t Y0 = num_warps / NumWaveGroups;
     //  YPerWarp = YPerTile / Y0;
     //  Y2 = YPerWarp / Y1;
     static constexpr index_t Y2 = YPerTile / (Y1 * Y0); // # of iters within wavefront
  
     static_assert(X0 * Y1 * Y0 * NumWaveGroups == BlockSize,
                   "X0 * warp_ys * Y0 must cover whole workgroup!");
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
  
     CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
     {
         if constexpr(NumWaveGroups != 1)
         {
             return make_static_tile_distribution(
                 tile_distribution_encoding<sequence<Y0>,
                                            tuple<sequence<Y1, Y2>, sequence<X0, X1>>,
                                            tuple<sequence<0>, sequence<1, 2>>,
                                            tuple<sequence<0>, sequence<0, 0>>, // -> <Y0>, <Y1, X0>
                                            sequence<1, 2>,
                                            sequence<1, 1>>{}); // -> <Y2, X1>
         }
         else
         {
             return make_static_tile_distribution(
                 tile_distribution_encoding<sequence<1>,
                                            tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                            tuple<sequence<1>, sequence<1, 2>>,
                                            tuple<sequence<0>, sequence<1, 0>>, // -> <Y0>, <Y1, X0>
                                            sequence<1, 2>,
                                            sequence<2, 1>>{}); // -> <Y2, X1>
         }
     }
  
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
     {
         if constexpr(NumWaveGroups != 1)
         {
             return make_static_tile_distribution(
                 tile_distribution_encoding<sequence<Y0>,
                                            tuple<sequence<X0, X1>, sequence<Y1, Y2>>,
                                            tuple<sequence<0>, sequence<2, 1>>,
                                            tuple<sequence<0>, sequence<0, 0>>, // -> <Y0>, <Y1, X0>
                                            sequence<1, 2>,
                                            sequence<1, 1>>{}); // -> <X1, Y2>
         }
         else
         {
             return make_static_tile_distribution(
                 tile_distribution_encoding<sequence<1>,
                                            tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
                                            tuple<sequence<2>, sequence<2, 1>>,
                                            tuple<sequence<0>, sequence<1, 0>>, // -> <Y0>, <Y1, X0>
                                            sequence<1, 2>,
                                            sequence<1, 2>>{}); // -> <X1, Y2>
         }
     }
 };
  
 // Warp raked
 template <index_t BlockSize,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
 struct TileDistributionEncodingPattern2D<BlockSize,
                                          YPerTile,
                                          XPerTile,
                                          VecSize,
                                          tile_distribution_pattern::warp_raked,
                                          NumWaveGroups> : public TileDistributionEncodingPattern
 {
  
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
     static constexpr index_t warp_size  = get_warp_size();
     static constexpr index_t num_warps  = BlockSize / get_warp_size();
     static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
     static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
     static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
  
     static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
     static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
  
     static constexpr index_t Y0 = num_warps;
     static_assert(X0 * Y2 * Y0 == BlockSize, "X0 * Y2 * Y1 must cover whole workgroup!");
  
     static constexpr index_t Y1 = YPerTile / (Y2 * Y0); // # of iters within wavefront
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
  
     CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
                                        tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                        tuple<sequence<1>, sequence<1, 2>>,
                                        tuple<sequence<0>, sequence<2, 0>>, // -> <Y0>, <Y2, X0>
                                        sequence<1, 2>,
                                        sequence<1, 1>>{}); // -> <Y1, X1>
     }
  
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
                                        tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
                                        tuple<sequence<2>, sequence<2, 1>>,
                                        tuple<sequence<0>, sequence<2, 0>>, // -> <Y0>, <Y2, X0>
                                        sequence<1, 2>,
                                        sequence<1, 1>>{}); // -> <X1, Y1>
     }
 };
  
 // Block raked
 template <index_t BlockSize,
           index_t YPerTile,
           index_t XPerTile,
           index_t VecSize,
           index_t NumWaveGroups>
 struct TileDistributionEncodingPattern2D<BlockSize,
                                          YPerTile,
                                          XPerTile,
                                          VecSize,
                                          tile_distribution_pattern::block_raked,
                                          NumWaveGroups> : public TileDistributionEncodingPattern
 {
  
     // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
     static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
     static constexpr index_t warp_size  = get_warp_size();
     static constexpr index_t num_warps  = BlockSize / get_warp_size();
     static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
     static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
     static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
     static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
     static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
     static constexpr index_t Y1 = num_warps;
     static_assert(X0 * Y2 * Y1 == BlockSize, "X0 * Y2 * Y1 must cover whole workgroup!");
     static constexpr index_t Y0 = YPerTile / (Y2 * Y1); // # of iters
     static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
  
     CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
                                        tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                        tuple<sequence<1>, sequence<1, 2>>,
                                        tuple<sequence<1>, sequence<2, 0>>, // -> <Y1>, <Y2, X0>
                                        sequence<1, 2>,
                                        sequence<0, 1>>{}); // -> <Y0, X1>
     }
  
     CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
     {
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
                                        tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
                                        tuple<sequence<2>, sequence<2, 1>>,
                                        tuple<sequence<1>, sequence<2, 0>>, // -> <Y1>, <Y2, X0>
                                        sequence<1, 2>,
                                        sequence<1, 0>>{}); // -> <X1, Y0>
     }
 };
  
 } // namespace ck_tile