Composable Kernel: include/ck_tile/core/algorithm/static_encoding_pattern.hpp Source File
static_encoding_pattern.hpp

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/config.hpp"

namespace ck_tile {

/// Enumeration describing static tile distribution patterns.
enum class tile_distribution_pattern
{
    /// Thread raked pattern.
    thread_raked,
    /// Warp raked pattern.
    warp_raked,
    /// Block raked pattern - aka linear.
    block_raked,
};

{
};

/// Class creating 2D static tile distribution with different load/store patterns.
template <index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
          index_t VecSize,
          tile_distribution_pattern DistributionPattern,
          index_t NumWaveGroups = 1>
struct tile_distribution_encoding_pattern_2d
{
};
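
The primary template above is intentionally empty; all of the behaviour lives in the per-pattern partial specializations that follow. As a rough usage sketch (the 256-thread workgroup and the 32 x 64 tile below are hypothetical values chosen only for illustration):

    #include "ck_tile/core/algorithm/static_encoding_pattern.hpp"

    // Pick a distribution pattern for loading a 32 x 64 tile with 8-wide vector accesses.
    using LoadPattern =
        ck_tile::tile_distribution_encoding_pattern_2d<256, // BlockSize
                                                       32,  // YPerTile
                                                       64,  // XPerTile
                                                       8,   // VecSize
                                                       ck_tile::tile_distribution_pattern::thread_raked>;

    // Inside a kernel, the static distribution is then obtained from the chosen specialization:
    // constexpr auto dist = LoadPattern::make_2d_static_tile_distribution();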

// Thread raked
template <index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
          index_t VecSize,
          index_t NumWaveGroups>
struct tile_distribution_encoding_pattern_2d<BlockSize,
                                             YPerTile,
                                             XPerTile,
                                             VecSize,
                                             tile_distribution_pattern::thread_raked,
                                             NumWaveGroups>
{

    // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
    static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
    static constexpr index_t warp_size  = get_warp_size();
    static constexpr index_t num_warps  = BlockSize / get_warp_size();
    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim

    // # of rows in Y dim accessed by single wavefront in one iteration
    static constexpr index_t Y1 = warp_size / X0;
    static_assert(X0 * Y1 == warp_size, "X0 * Y1 must cover whole wavefront!");

    static constexpr index_t Y0 = num_warps / NumWaveGroups;
    // YPerWarp = YPerTile / Y0;
    // Y2 = YPerWarp / Y1;
    static constexpr index_t Y2 = YPerTile / (Y1 * Y0); // # of iters within wavefront

    static_assert(X0 * Y1 * Y0 * NumWaveGroups == BlockSize,
                  "X0 * Y1 * Y0 * NumWaveGroups must cover whole workgroup!");
    static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");

    static constexpr CK_TILE_HOST_DEVICE auto make_2d_static_tile_distribution()
    {
        if constexpr(NumWaveGroups != 1)
        {
            return make_static_tile_distribution(
                tile_distribution_encoding</* ... */
                                           tuple<sequence<0>, sequence<0, 0>>, // -> <Y0>, <Y1, X0>
                                           /* ... */
                                           sequence<1, 1>>{}); // -> <Y2, X1>
        }
        else
        {
            return make_static_tile_distribution(
                tile_distribution_encoding</* ... */
                                           tuple<sequence<0>, sequence<1, 0>>, // -> <Y0>, <Y1, X0>
                                           /* ... */
                                           sequence<2, 1>>{}); // -> <Y2, X1>
        }
    }

    static constexpr CK_TILE_HOST_DEVICE auto make_shuffled_2d_static_tile_distribution()
    {
        if constexpr(NumWaveGroups != 1)
        {
            return make_static_tile_distribution(
                tile_distribution_encoding</* ... */
                                           tuple<sequence<0>, sequence<0, 0>>, // -> <Y0>, <Y1, X0>
                                           /* ... */
                                           sequence<1, 1>>{}); // -> <X1, Y2>
        }
        else
        {
            return make_static_tile_distribution(
                tile_distribution_encoding</* ... */
                                           tuple<sequence<0>, sequence<1, 0>>, // -> <Y0>, <Y1, X0>
                                           /* ... */
                                           sequence<1, 2>>{}); // -> <X1, Y2>
        }
    }
};
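
In this thread-raked specialization the per-thread iteration dimension Y2 is the innermost Y sub-dimension, so each thread walks over Y2 consecutive rows of its warp's slice. A minimal compile-time sketch of the resulting decomposition, assuming get_warp_size() == 64 and a hypothetical 256-thread workgroup with a 64 x 64 tile and 8-wide vectors:

    using ThreadRaked = ck_tile::tile_distribution_encoding_pattern_2d<
        256, 64, 64, 8, ck_tile::tile_distribution_pattern::thread_raked>;

    static_assert(ThreadRaked::X1 == 8, "each thread loads one 8-wide vector");
    static_assert(ThreadRaked::X0 == 8, "8 lanes cover XPerTile = 64");
    static_assert(ThreadRaked::Y1 == 8, "one wavefront covers 8 rows per step");
    static_assert(ThreadRaked::Y0 == 4, "the 4 warps split the Y dimension");
    static_assert(ThreadRaked::Y2 == 2, "each thread owns 2 consecutive rows (the raked dimension)");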

// Warp raked
template <index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
          index_t VecSize,
          index_t NumWaveGroups>
struct tile_distribution_encoding_pattern_2d<BlockSize,
                                             YPerTile,
                                             XPerTile,
                                             VecSize,
                                             tile_distribution_pattern::warp_raked,
                                             NumWaveGroups>
{

    static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
    static constexpr index_t warp_size  = get_warp_size();
    static constexpr index_t num_warps  = BlockSize / get_warp_size();
    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim

    static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
    static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");

    static constexpr index_t Y0 = num_warps;
    static_assert(X0 * Y2 * Y0 == BlockSize, "X0 * Y2 * Y0 must cover whole workgroup!");

    static constexpr index_t Y1 = YPerTile / (Y2 * Y0); // # of iters within wavefront
    static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");

    static constexpr CK_TILE_HOST_DEVICE auto make_2d_static_tile_distribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding</* ... */
                                       tuple<sequence<0>, sequence<2, 0>>, // -> <Y0>, <Y2, X0>
                                       /* ... */
                                       sequence<1, 1>>{}); // -> <Y1, X1>
    }

    static constexpr CK_TILE_HOST_DEVICE auto make_shuffled_2d_static_tile_distribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding</* ... */
                                       tuple<sequence<0>, sequence<2, 0>>, // -> <Y0>, <Y2, X0>
                                       /* ... */
                                       sequence<1, 1>>{}); // -> <X1, Y1>
    }
};
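
Here the iteration dimension is Y1, which sits between the warp index (Y0) and the intra-warp row index (Y2): each warp owns a contiguous band of YPerTile / num_warps rows and sweeps through it one warp-wide row group at a time. Continuing the same hypothetical configuration (64-wide wavefront, 256 threads, 64 x 64 tile, 8-wide vectors):

    using WarpRaked = ck_tile::tile_distribution_encoding_pattern_2d<
        256, 64, 64, 8, ck_tile::tile_distribution_pattern::warp_raked>;

    static_assert(WarpRaked::X0 == 8 && WarpRaked::X1 == 8, "same X split as the thread-raked case");
    static_assert(WarpRaked::Y0 == 4, "4 warps, each owning a contiguous 16-row band");
    static_assert(WarpRaked::Y2 == 8, "one wavefront covers 8 rows per iteration");
    static_assert(WarpRaked::Y1 == 2, "each warp iterates twice to sweep its band");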

// Block raked
template <index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
          index_t VecSize,
          index_t NumWaveGroups>
struct tile_distribution_encoding_pattern_2d<BlockSize,
                                             YPerTile,
                                             XPerTile,
                                             VecSize,
                                             tile_distribution_pattern::block_raked,
                                             NumWaveGroups>
{

    // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
    static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
    static constexpr index_t warp_size  = get_warp_size();
    static constexpr index_t num_warps  = BlockSize / get_warp_size();
    static constexpr index_t LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size);
    static constexpr index_t X1         = VecSize > LargestVec ? LargestVec : VecSize;
    static constexpr index_t X0         = XPerTile / X1; // # of threads in X dim
    static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
    static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
    static constexpr index_t Y1 = num_warps;
    static_assert(X0 * Y2 * Y1 == BlockSize, "X0 * Y2 * Y1 must cover whole workgroup!");
    static constexpr index_t Y0 = YPerTile / (Y2 * Y1); // # of iters
    static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");

    static constexpr CK_TILE_HOST_DEVICE auto make_2d_static_tile_distribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding</* ... */
                                       tuple<sequence<1>, sequence<2, 0>>, // -> <Y1>, <Y2, X0>
                                       /* ... */
                                       sequence<0, 1>>{}); // -> <Y0, X1>
    }

    static constexpr CK_TILE_HOST_DEVICE auto make_shuffled_2d_static_tile_distribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding</* ... */
                                       tuple<sequence<1>, sequence<2, 0>>, // -> <Y1>, <Y2, X0>
                                       /* ... */
                                       sequence<1, 0>>{}); // -> <X1, Y0>
    }
};
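
Block raked (aka linear) moves the iteration dimension to the outermost position Y0: in every step the whole workgroup covers one contiguous slab of Y1 * Y2 rows, then strides on to the next slab. With the same hypothetical configuration as in the previous sketches:

    using BlockRaked = ck_tile::tile_distribution_encoding_pattern_2d<
        256, 64, 64, 8, ck_tile::tile_distribution_pattern::block_raked>;

    static_assert(BlockRaked::Y1 == 4 && BlockRaked::Y2 == 8,
                  "one step: 4 warps x 8 rows = a 32-row slab covered by the whole block");
    static_assert(BlockRaked::Y0 == 2, "the block strides through 2 such slabs to cover YPerTile = 64");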

// Helper function to convert enum to string
constexpr const char* tile_distribution_pattern_to_string(tile_distribution_pattern pattern)
{
    switch(pattern)
    {
    case tile_distribution_pattern::thread_raked: return "thread_raked";
    case tile_distribution_pattern::warp_raked: return "warp_raked";
    case tile_distribution_pattern::block_raked: return "block_raked";
    default: return "unknown";
    }
}
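
Because the helper is constexpr it can be used both at compile time and for host/device-side logging, for example:

    constexpr const char* name =
        ck_tile::tile_distribution_pattern_to_string(ck_tile::tile_distribution_pattern::warp_raked);
    // name now points at the literal "warp_raked".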

template <index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
          index_t VecSize,
          tile_distribution_pattern DistributionPattern,
          index_t NumWaveGroups>
CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding_pattern_2d<BlockSize,
                                                                           YPerTile,
                                                                           XPerTile,
                                                                           VecSize,
                                                                           DistributionPattern,
                                                                           NumWaveGroups>&)
{
    using PatternType = tile_distribution_encoding_pattern_2d<BlockSize,
                                                              YPerTile,
                                                              XPerTile,
                                                              VecSize,
                                                              DistributionPattern,
                                                              NumWaveGroups>;

    printf("tile_distribution_encoding_pattern_2d<BlockSize:%d, YPerTile:%d, XPerTile:%d, "
           "VecSize:%d, %s>: ",
           BlockSize,
           YPerTile,
           XPerTile,
           VecSize,
           tile_distribution_pattern_to_string(DistributionPattern));
    printf("{<Y0, Y1, Y2>: <%d, %d, %d>, <X0, X1>: <%d, %d>}\n",
           PatternType::Y0,
           PatternType::Y1,
           PatternType::Y2,
           PatternType::X0,
           PatternType::X1);
}
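
A small usage sketch, not part of this header: calling print() on the hypothetical thread-raked pattern from the earlier examples (64-wide wavefront assumed) produces output in the format of the two printf calls above. The debug_dump_pattern wrapper below exists only for the sketch.

    inline void debug_dump_pattern()
    {
        // Dump the computed decomposition of a pattern while debugging a configuration.
        ck_tile::print(ck_tile::tile_distribution_encoding_pattern_2d<
                       256, 64, 64, 8, ck_tile::tile_distribution_pattern::thread_raked>{});
        // Prints (one line):
        // tile_distribution_encoding_pattern_2d<BlockSize:256, YPerTile:64, XPerTile:64, VecSize:8, thread_raked>: {<Y0, Y1, Y2>: <4, 8, 2>, <X0, X1>: <8, 8>}
    }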

} // namespace ck_tile