ck_tile Namespace Reference

ck_tile Namespace Reference

Composable Kernel: ck_tile Namespace Reference
ck_tile Namespace Reference

Namespaces

 conv
 
 detail
 
 details
 
 element_wise
 
 impl
 
 internal
 
 literals
 
 ranges
 
 ReduceOp
 
 tensor_layout
 
 util
 

Classes

struct  base_transform
 
struct  pass_through
 
struct  pad
 
struct  left_pad
 
struct  right_pad
 
struct  embed
 
struct  lambda_merge_generate_MagicDivision_calculate_magic_divisor
 
struct  merge_v2_magic_division
 
struct  merge_v3_division_mod
 
struct  unmerge
 
struct  freeze
 
struct  insert
 
struct  replicate
 
struct  slice
 
struct  modulo
 
struct  xor_t
 
struct  offset
 
struct  indexing
 
struct  indexing_adaptor_onshot_cached
 
struct  space_filling_curve
 
struct  TileDistributionEncodingPattern
 
struct  TileDistributionEncodingPattern2D
 Class creating 2D static tile distribution with different load/store patterns. More...
 
struct  TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::thread_raked, NumWaveGroups >
 
struct  TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::warp_raked, NumWaveGroups >
 
struct  TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, tile_distribution_pattern::block_raked, NumWaveGroups >
 
struct  buffer_resource
 
struct  buffer_load
 
struct  buffer_load_if
 
struct  buffer_store
 
struct  buffer_store_if
 
struct  buffer_load< 16, pre_nop >
 
struct  buffer_load< 8, pre_nop >
 
struct  buffer_load< 4, pre_nop >
 
struct  buffer_load< 2, pre_nop >
 
struct  buffer_load< 1, pre_nop >
 
struct  buffer_load_if< 16, pre_nop >
 
struct  buffer_load_if< 8, pre_nop >
 
struct  buffer_load_if< 4, pre_nop >
 
struct  buffer_load_if< 2, pre_nop >
 
struct  buffer_load_if< 1, pre_nop >
 
struct  buffer_store< 16 >
 
struct  buffer_store< 8 >
 
struct  buffer_store< 4 >
 
struct  buffer_store< 2 >
 
struct  buffer_store< 1 >
 
struct  buffer_store_if< 16 >
 
struct  buffer_store_if< 8 >
 
struct  buffer_store_if< 4 >
 
struct  buffer_store_if< 2 >
 
struct  buffer_store_if< 1 >
 
struct  buffer_atomic_add_if
 
struct  buffer_atomic_add_if< bf16_t, 2, pre_nop >
 
struct  buffer_atomic_add
 
struct  buffer_atomic_add< bf16_t, 2, pre_nop >
 
struct  smem_load
 
struct  smem_load< 16 >
 
struct  smem_load< 8 >
 
struct  smem_load< 4 >
 
struct  smem_load< 2 >
 
struct  smem_load< 1 >
 
struct  LaneGroupTransposeTraits
 
struct  LaneGroupTransposeTraits< T, LaneGroupSize, std::enable_if_t< sizeof(T)==2 > >
 
struct  LaneGroupTransposeTraits< T, LaneGroupSize, std::enable_if_t< sizeof(T)==1 > >
 
struct  workgroup_barrier
 
struct  array
 A fixed-size array container similar to std::array with additional utilities. More...
 
struct  array< T, 0 >
 Specialization of array container for zero elements. More...
 
struct  vector_traits
 
struct  vector_traits< array< T, N >, void >
 
struct  map
 
struct  meta_data_buffer
 
struct  sequence
 
struct  sequence_split
 
struct  sequence_reverse
 
struct  sequence_map_inverse
 
struct  is_valid_sequence_map
 
struct  sequence_merge
 
struct  sequence_merge< sequence< Xs... >, sequence< Ys... > >
 
struct  sequence_merge< Seq >
 
struct  sequence_gen
 
struct  arithmetic_sequence_gen
 
struct  arithmetic_sequence_gen< 0, IEnd, 1 >
 
struct  uniform_sequence_gen
 
struct  sequence_reverse_inclusive_scan
 
struct  sequence_reverse_inclusive_scan< sequence< I, Is... >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< sequence< I >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< sequence<>, Reduce, Init >
 
struct  sequence_reverse< sequence< Ns... > >
 
struct  sequence_reduce
 
struct  sequence_reduce< Reduce, sequence< Xs... >, sequence< Ys... > >
 
struct  sequence_reduce< Reduce, Seq >
 
struct  sequence_sort_impl
 
struct  sequence_sort_impl< sequence< ValueX, ValueY >, sequence< IdX, IdY >, Compare >
 
struct  sequence_sort_impl< sequence< Value >, sequence< Id >, Compare >
 
struct  sequence_sort_impl< sequence<>, sequence<>, Compare >
 
struct  sequence_sort
 
struct  sequence_unique_sort
 
struct  sequence_exclusive_scan
 
struct  sequence_exclusive_scan< sequence< Xs... >, sequence< Y, Ys... >, Reduce >
 
struct  sequence_exclusive_scan< sequence< Xs... >, sequence< Y >, Reduce >
 
struct  sequence_exclusive_scan< sequence< Xs... >, sequence<>, Reduce >
 
struct  tuple
 
class  span
 
struct  vector_traits< tuple< T... >, void >
 
struct  tuple_concat
 
struct  tuple_concat< tuple< Xs... >, tuple< Ys... > >
 
struct  numeric
 
struct  numeric< bfloat16_t >
 
struct  numeric_traits< bfloat16_t >
 
struct  e8m0_bexp_t
 Unsigned representation of a conventional biased Float32 exponent. More...
 
struct  numeric_traits< e8m0_t >
 
struct  numeric< e8m0_t >
 
struct  numeric_traits< fp8_t >
 
struct  numeric_traits< bf8_t >
 
struct  numeric< fp8_t >
 
struct  numeric< bf8_t >
 
struct  numeric< half_t >
 
struct  numeric_traits< half_t >
 
struct  numeric< int8_t >
 
struct  constant
 
struct  integral_constant
 
struct  is_constant
 
struct  is_constant< constant< v > >
 
struct  scales_c
 
struct  scales
 
struct  plus
 
struct  plus< void, void >
 
struct  minus
 
struct  minus< void, void >
 
struct  multiplies
 
struct  multiplies< void, void >
 
struct  maximize
 
struct  minimize
 
struct  integer_divide_ceiler
 
struct  equal< void, void >
 
struct  equal< float, float >
 
struct  equal< double, double >
 
struct  less
 
struct  less< void, void >
 
struct  less_equal
 
struct  less_equal< void, void >
 
struct  less_equal< float, float >
 
struct  less_equal< double, double >
 
struct  log2e
 
struct  log2e< double >
 
struct  log2e< float >
 
struct  numeric_utils
 
struct  null_type
 
struct  numeric_traits
 
struct  numeric_traits< float >
 
struct  pk_float4_e2m1_t
 
struct  numeric_traits< pk_fp4_t >
 
struct  numeric< pk_fp4_t >
 
struct  pk_int4_t
 
struct  numeric< pk_int4_t >
 
struct  numeric_traits< pk_int4_t >
 
struct  native_t
 
struct  vector_traits< T, void >
 
struct  buffer_view
 
struct  buffer_view< address_space_enum::generic, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
 
struct  buffer_view< address_space_enum::global, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence >
 
struct  buffer_view< address_space_enum::lds, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
 
struct  buffer_view< address_space_enum::vgpr, T, BufferSizeType, InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum::coherence_default >
 
struct  DefaultTranspose
 
struct  TransposeTileDistrChecker
 
struct  TransposeTileDistributionTraits
 
struct  null_tensor
 
struct  null_tile_window
 
struct  static_distributed_tensor
 
struct  tile_sweeper
 
struct  tensor_adaptor
 
struct  lambda_get_up_dim_num
 
struct  tensor_adaptor_coordinate
 
struct  tensor_coordinate
 
struct  tensor_descriptor
 
struct  tensor_view
 
struct  null_tensor_view
 
struct  tile_distributed_span
 
struct  tile_distributed_index
 
struct  tile_distribution
 
struct  tile_distribution_encoding
 
class  tile_distribution_encoding_shuffle
 
class  tile_distribution_encoding_shuffle< encoding, sequence< shuffle... > >
 
struct  tile_scatter_gather
 This class provides tile (windowed) view and access to the device memory. More...
 
struct  tile_window_with_static_distribution
 This class provides tile (windowed) view and access to the device memory. More...
 
struct  tile_window_with_static_lengths
 This class provides description of tile windowed view on the device memory. More...
 
struct  is_tile_window_with_static_distribution
 Type trait to determine if a type is a tile window with static distribution. More...
 
struct  is_tile_window_with_static_distribution< tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > >
 Specialization for tile_window_with_static_distribution to evaluate to true_type. More...
 
struct  is_tile_window_with_static_lengths
 Type trait to determine if a type is a tile window with static lengths. More...
 
struct  is_tile_window_with_static_lengths< tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > >
 Specialization for tile_window_with_static_lengths to evaluate to true_type. More...
 
struct  tile_window_base
 This class provides description of tile windowed view on the device memory. More...
 
struct  tile_window_with_tile_dstr_base
 
struct  tile_window_linear
 
struct  is_tile_window_linear
 Type trait to determine if a type is a linear tile window. More...
 
struct  is_tile_window_linear< tile_window_linear< BottomTensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > >
 Specialization of is_tile_window_linear for tile_window_linear. More...
 
struct  str_literal
 
struct  thread_buffer
 
struct  CK_PRINTF
 
struct  CK_PRINTF< ConvertTo, str_literal< FMTChars... >, str_literal< PREFIXChars... >, str_literal< SUFFIXChars... > >
 
struct  CK_PRINTF_WARP0
 
struct  static_for
 
struct  static_for< 0, N, 1 >
 
struct  identity
 
struct  static_ford
 
struct  static_uford
 
struct  magic_division32_bit_range
 
struct  magic_division16_bit_range
 
struct  mdiv
 
struct  mdiv2
 
class  philox
 
struct  prand_generator_t
 
struct  prand_generator_t< float, seed_ >
 
struct  prand_generator_t< half_t, seed_ >
 
struct  static_counter
 
struct  transpose_vectors
 
struct  copy_const
 
struct  copy_const< const From, To >
 
struct  nonesuch
 
struct  is_any_of
 
struct  is_any_of< CompareTo, FirstType >
 
struct  is_any_of< CompareTo, FirstType, Rest... >
 
struct  is_specialization_of
 
struct  is_specialization_of< RefTemplate< Args... >, RefTemplate >
 
struct  tuple_element_or_default
 
struct  composes
 
struct  composes< F >
 
struct  saturates
 
class  ArgParser
 
struct  IsCharArray
 
struct  IsCharArray< char[N]>
 
struct  IsCharArray< const char[N]>
 
struct  IsCharArray< char(&)[N]>
 
struct  IsCharArray< const char(&)[N]>
 
struct  DeviceMem
 Manages device memory allocation and host-device data transfers. More...
 
struct  FillUniformDistribution
 
struct  FillUniformDistribution< ck_tile::pk_int4_t >
 
struct  FillUniformDistribution_Unique
 
struct  FillNormalDistribution
 
struct  FillUniformDistributionIntegerValue
 
struct  FillNormalDistributionIntegerValue
 
struct  FillMonotonicSeq
 
struct  FillStepRange
 
struct  FillConstant
 
struct  AdjustToStructuredSparsity
 Transforms given input to fit the 2:4 structured sparsity pattern so that every subgroup of 4 elements contains at most 2 non-zero elements. More...
 
struct  FillTrigValue
 
struct  HostTensorDescriptor
 Descriptor for tensors in host memory. More...
 
struct  ParallelTensorFunctor
 
struct  HostTensor
 
struct  joinable_thread
 
struct  reference_layernorm2d_default_epilogue
 
struct  reference_rmsnorm2d_default_epilogue
 
struct  RotatingMemWrapper
 
struct  stream_config
 
struct  gpu_timer
 
struct  cpu_timer
 
struct  AddRmsnorm2dRdquantFwdHostArgs
 
struct  AddRmsnorm2dRdquantFwd
 
struct  AddRmsnorm2dRdquantFwdPipelineDefaultPolicy
 
struct  AddRmsnorm2dRdquantFwdPipelineOnePass
 
struct  AddRmsnorm2dRdquantFwdPipelineProblem
 
struct  AddRmsnorm2dRdquantFwdPipelineThreePass
 
struct  BatchedTransposeHostArgs
 
struct  BatchedTransposeKernel
 
struct  BatchedTransposeCommonPolicy
 
struct  BatchedTransposeLdsPipeline
 
struct  BatchedTransposeLdsPolicy
 
struct  BatchedTransposeLdsProblem
 
struct  BatchedTransposePipeline
 
struct  BatchedTransposePolicy
 
struct  BatchedTransposeProblem
 
struct  Generic2dBlockShape
 
struct  typeToStr
 
struct  typeToStr< float >
 
struct  typeToStr< fp16_t >
 
struct  typeToStr< bf16_t >
 
struct  typeToStr< fp8_t >
 
struct  typeToStr< bf8_t >
 
struct  typeToStr< int8_t >
 
struct  typeToStr< pk_int4_t >
 
struct  ElementWiseKernel
 
struct  ElementWiseDefaultPolicy
 
struct  ElementWisePipelineProblem
 
struct  ElementWiseShape
 
struct  CShuffleEpilogueProblem
 
struct  CShuffleEpilogue
 
struct  Default2DAndDynamicQuantEpilogueProblem
 
struct  Default2DAndDynamicQuantEpilogue
 
struct  Default2DEpilogueProblem
 
struct  DefaultGemm2DEpilogueProblem
 
struct  Default2DEpilogue
 
struct  DefaultGemm2DEpilogue
 
struct  DynamicQuantEpilogueTraits
 
struct  DynamicQuantEpilogueProblem
 
struct  DynamicQuantEpilogue
 
struct  BlockFlatmmASmemBSmemCRegV1
 
struct  BlockFlatmmASmemBSmemCRegV1CustomPolicy
 
struct  Flatmm_32x512x128_1x4x1_16x16x32_Base
 
struct  Flatmm_32x512x128_1x4x1_16x16x32_BF16
 
struct  Flatmm_32x512x128_1x4x1_16x16x32_FP16
 
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_Base
 
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_BF16
 
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_FP16
 
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl
 
struct  FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl
 
struct  FlatmmHostArgs
 
struct  FlatmmKernelArgs
 
struct  FlatmmKernel
 
struct  BaseFlatmmPipelineAGmemBGmemCRegV1
 
struct  FlatmmPipelineAGmemBGmemCRegV1
 
struct  UniversalFlatmmPipelineAgBgCrPolicy
 
struct  TileFlatmmShape
 
struct  BlockAttentionBiasEnumToStr
 
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::NO_BIAS >
 
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::ELEMENTWISE_BIAS >
 
struct  BlockAttentionBiasEnumToStr< BlockAttentionBiasEnum::ALIBI >
 
struct  NullBlockDropout
 
struct  BlockDropout
 
struct  BlockDropoutBwd
 
struct  BlockDropoutBwd< false, IsWG32_, IsStoreRandval_ >
 
struct  BlockDropoutBwd< true, IsWG32_, IsStoreRandval_ >
 
struct  GenericAttentionMask
 
struct  SimplifiedGenericAttentionMask
 
struct  SimplifiedRatioAttentionMask
 
struct  Alibi
 
struct  EmptyPositionEncoding
 
struct  RotaryEmbeddingEnumToStr
 
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::NONE >
 
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::INTERLEAVED >
 
struct  RotaryEmbeddingEnumToStr< RotaryEmbeddingEnum::HALF_ROTATED >
 
struct  BlockRotaryEmbedding
 
struct  TrivialPageBlockNavigator
 
struct  PageBlockNavigator
 
struct  StandardAttentionParams
 
struct  LogitsSoftCapParams
 
struct  StandardAttention
 
struct  LogitsSoftCap
 
struct  ComposedAttention
 
struct  FmhaBatchPrefillWithPagedKVCacheKernel
 
struct  FmhaBwdDQDKDVKernel
 
struct  FmhaBwdOGradDotOKernel
 
struct  FmhaBwdConvertQGradKernel
 
struct  FmhaFwdAppendKVKernel
 
struct  FmhaFwdAppendKVTilePartitioner
 
struct  FmhaFwdKernel
 
struct  FmhaFwdPagedKVKernel
 
struct  FmhaFwdSplitKVCombineKernel
 
struct  FmhaFwdSplitKVKernel
 
struct  FmhaFwdV3Kernel
 
struct  BlockFmhaBatchPrefillPipelineQRKSVSAsync
 
struct  BlockFmhaBwdConvertQGrad
 
struct  BlockFmhaBwdOGradDotO
 
struct  BlockFmhaBwdDQDKDVPipelineKRKTRVR
 
struct  BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
 
class  BlockFmhaBwdDQDKDVPipelineSelector
 
class  BlockFmhaBwdDQDKDVPipeline
 
struct  BlockFmhaBwdDQDKDVPipelineTrLoadKRKTRVR
 
struct  BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR
 
struct  fmha_bwd_qr_qtr_dor_pipeline
 
struct  fmha_bwd_qr_qtr_dor_pipeline< T, std::void_t< decltype(T::is_qr_qtr_dor_pipeline)> >
 
struct  BlockFmhaBwdPipelineDefaultPolicy
 
struct  BlockFmhaBwdPipelineProblem
 
struct  BlockFmhaBwdOGradDotOPipelineProblem
 
struct  BlockFmhaBwdConvertQGradPipelineProblem
 
struct  BlockFmhaBwdPipelineTrLoadDefaultPolicy
 
struct  BlockFmhaFwdAppendKVPipeline
 
struct  BlockFmhaFwdAppendKVPipelineDefaultPolicy
 
struct  BlockFmhaFwdPagedKVPipelineQRKSVS
 
struct  BlockFmhaFwdPagedKVPipelineQRKSVSDefaultPolicy
 
struct  BlockFmhaFwdSplitKVCombinePipeline
 
struct  BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
 
struct  BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
 
struct  BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
 
struct  BlockFmhaFwdSplitKVPipelineQRKSVS
 
struct  BlockFmhaFwdSplitKVPipelineQRKSVSDefaultPolicy
 
struct  CoreLoopScheduler
 
struct  CoreLoopScheduler< PipelineProblem, true >
 
struct  CoreLoopScheduler< PipelineProblem, false >
 
struct  BlockFmhaFwdV3Pipeline
 
struct  BlockFmhaV3PipelineDefaultPolicy
 
struct  BlockFmhaPipelineEnumToStr
 
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS >
 
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS_ASYNC >
 
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QSKSVS >
 
struct  BlockFmhaPipelineEnumToStr< BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD >
 
struct  BlockFmhaPipelineProblem
 
struct  BlockFmhaFwdPagedKVPipelineProblem
 
struct  BlockFmhaFwdSplitKVPipelineProblem
 
struct  BlockFmhaSplitKVCombinePipelineTileSizes
 
struct  BlockFmhaSplitKVCombinePipelineProblem
 
struct  BlockFmhaFwdAppendKVPipelineProblem
 
struct  BlockFmhaFwdV3PipelineProblem
 
struct  BlockFmhaPipelineQRKSVS
 
struct  BlockFmhaPipelineQRKSVSAsync
 
struct  BlockFmhaPipelineQRKSVSAsyncTrload
 
struct  BlockFmhaPipelineQRKSVSAsyncTrloadDefaultPolicy
 
struct  BlockFmhaPipelineQRKSVSFp8
 
struct  BlockFmhaPipelineQRKSVSWholeKPrefetch
 
struct  BlockFmhaPipelineQRKSVSWholeKPrefetchDefaultPolicy
 
struct  BlockFmhaPipelineQSKSVS
 
struct  BlockFmhaPipelineQSKSVSDefaultPolicy
 
struct  BlockFmhaPipelineQXCustomPolicy
 
struct  BlockFmhaPipelineQXCustomPolicy< true >
 
struct  BlockFmhaPipelineQXCustomPolicy< false >
 
struct  BlockFmhaPipelineQXKSVSCustomPolicy
 
struct  TileFmhaShape
 
struct  TileFmhaBwdShape
 
struct  TileFmhaTraits
 
struct  TileFmhaFwdPagedKVTraits
 
struct  TileFmhaFwdSplitKVTraits
 
struct  TileFmhaFwdSplitKVCombineTraits
 
struct  TileFmhaFwdAppendKVTraits
 
struct  TileFmhaBwdOGradDotOTraits
 
struct  TileFmhaBwdConvertQGradTraits
 
struct  TileFmhaFwdV3Traits
 
struct  FusedMoeGemmHostArgs
 
struct  FusedMoeGemmKernel
 
struct  FusedMoeGemmShape
 
struct  FusedMoeGemmTilePartitioner_Linear
 
struct  MoeSortingHostArgs
 
struct  MoeSortingKernel
 
struct  MoeSortingClearWorkspaceKernel
 
struct  MoeSortingMultiPhaseKernel_P0
 
struct  MoeSortingMultiPhaseKernel_P1
 
struct  MoeSortingMultiPhaseKernel_P2
 
struct  MoeSortingMultiPhaseKernel_P3
 
struct  MoeSortingMultiPhaseKernel_P23
 
struct  MoeSortingProblem
 
struct  MoeSortingProblemEx
 
struct  MoeSortingProblemMp
 
struct  MoeSortingClearWorkspaceProblem
 
struct  FusedMoeGemmPipeline_FlatmmEx
 
struct  FusedMoeGemmPipelineFlatmmPolicy
 
struct  FusedMoeGemmPipeline_FlatmmUk
 
struct  FusedMoeGemmPipelineProblem
 
struct  FusedMoeGemmTraits
 
struct  MoeSortingPolicy
 
struct  BlockGemmARegBGmemCRegV1
 
struct  BlockGemmARegBGmemCRegV1DefaultPolicy
 
struct  BlockGemmARegBRegCRegV1
 
struct  BlockGemmARegBRegCRegV1CustomPolicy
 
struct  BlockGemmARegBRegCRegV1DefaultPolicy
 
struct  BlockGemmARegBRegCRegV2
 
struct  BlockGemmARegBRegCRegV2CustomPolicy
 
struct  BlockGemmARegBSmemCRegOneWarpV1
 
struct  BlockGemmARegBSmemCRegV1
 
struct  BlockGemmARegBSmemCRegV1CustomPolicy
 
struct  BlockGemmARegBSmemCRegV1DefaultPolicy
 
struct  BlockGemmARegBSmemCRegV2
 
struct  BlockGemmARegBSmemCRegV2CustomPolicy
 
struct  BlockGemmARegBSmemCRegV2DefaultPolicy
 
struct  BlockGemmARegBSmemCRegV2R1
 
struct  BlockGemmASmemBRegCRegV1
 
struct  BlockGemmASmemBRegCRegV1CustomPolicy
 
struct  BlockGemmASmemBRegCRegV1DefaultPolicy
 
struct  BlockGemmASmemBSmemCRegV1
 
struct  BlockGemmASmemBSmemCRegV1CustomPolicy
 
struct  BlockGemmASmemBSmemCRegV1DefaultPolicy
 
struct  BlockGemmProblem
 
struct  BlockUniversalGemmAsBsCr
 
struct  BlockWeightPreshuffleASmemBSmemCRegV1
 
struct  BlockWeightPreshuffleASmemBSmemCRegV1CustomPolicy
 
struct  BatchedGemmHostArgs
 The Batched GEMM kernel host arguments. More...
 
struct  BatchedGemmKernel
 
struct  GemmHostArgs
 The GEMM kernel host arguments. More...
 
struct  GemmKernel
 
struct  GemmMultiDHostArgs
 The MultiD GEMM kernel host arguments. More...
 
struct  GemmKernelMultiD
 
struct  GemmTile2DPartitioner
 Class providing 2D workgroup index mapping into 2D output GEMM C-tile space. More...
 
struct  GemmTile1DPartitioner
 Class providing 1D WGP index mapping into 2D output C-tile space. More...
 
struct  HasFnOneArgImpl
 std::false_type specialization for GemmTile1DPartitioner::GetOutputTileIndex, selected when the expression is ill-formed (checked in place for expression validity). More...
 
struct  HasFnOneArgImpl< T, std::void_t< decltype(std::declval< T >().GetOutputTileIndex(1))> >
 std::true_type specialization for GemmTile1DPartitioner::GetOutputTileIndex, selected when the expression is well-formed (checked in place for expression validity). More...
 
struct  OffsettedTile1DPartitioner
 Struct used to calculate offset tile indexes. More...
 
struct  GemmSpatiallyLocalTilePartitioner
 Class mapping 1D block index into 2D output tile space. More...
 
struct  GroupedGemmHostArgs
 The Grouped GEMM kernel host arguments. More...
 
struct  GemmTransKernelArg
 
struct  GroupedGemmKernel
 
struct  StreamKHostArgs
 The Stream K GEMM kernel host arguments. More...
 
struct  StreamKKernel
 
struct  UniversalGemmHostArgs
 The Universal GEMM kernel host arguments. More...
 
struct  UniversalGemmKernelArgs
 The GEMM kernel device arguments. More...
 
struct  UniversalGemmKernel
 The Universal GEMM kernel template. More...
 
struct  GemmPipelineAgBgCrImplBase
 
struct  BaseGemmPipelineAgBgCrCompV3
 
struct  GemmPipelineAgBgCrCompV3
 
struct  BaseGemmPipelineAgBgCrCompV4
 
struct  GemmPipelineAgBgCrCompV4
 Compute optimized pipeline version 4. More...
 
struct  GemmPipelineAgBgCrCompV4DefaultPolicy
 
struct  BaseGemmPipelineAgBgCrCompV5
 
struct  GemmPipelineAgBgCrCompV5
 
struct  GemmPipelineAgBgCrCompV5DefaultPolicy
 
struct  BaseGemmPipelineAgBgCrMem
 
struct  GemmPipelineAgBgCrMem
 
struct  GemmPipelineAGmemBGmemCRegV1
 
struct  GemmPipelineAGmemBGmemCRegV1DefaultPolicy
 
struct  GemmPipelineAGmemBGmemCRegV2
 
struct  GemmPipelineProblemBase
 
struct  UniversalGemmPipelineProblem
 
struct  UniversalGemmBasePolicy
 
struct  UniversalGemmPipelineAgBgCrPolicy
 
struct  TileGemmShape
 
struct  TileGemmTraits
 
struct  TileGemmUniversalTraits
 
struct  UniversalWeightPreshufflePipelineAgBgCrPolicy
 
struct  BaseWeightPreshufflePipelineAGmemBGmemCRegV1
 
struct  WeightPreshufflePipelineAGmemBGmemCRegV1
 
struct  BaseWeightPreshufflePipelineAGmemBGmemCRegV2
 
struct  WeightPreshufflePipelineAGmemBGmemCRegV2
 
struct  WarpGemmAttributeMfma
 
struct  WarpGemmAttributeMfmaIterateK
 
struct  WarpGemmAttributeMfmaTransposedCDistribution
 
struct  WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB
 
struct  WarpGemmAttributeMfmaIterateKAndTransposedCDistribution
 
struct  WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB
 
struct  WarpGemmAttributeMfmaIterateK_SwizzleA
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32
 
struct  WarpGemmAttributeMfmaImplF16F16F32M32N32K8
 
struct  WarpGemmAttributeMfmaImplF16F16F32M16N16K16
 
struct  WarpGemmAttributeMfmaImplF16F16F32M16N16K32
 
struct  WarpGemmAttributeMfmaImplF16F16F32M4N64K4
 
struct  WarpGemmAttributeMfmaImplF16F16F32M64N4K4
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4
 
struct  WarpGemmAttributeMfmaImplF16F16F32M32N32K16
 
struct  WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16
 
struct  WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base
 
struct  WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
 
struct  WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base
 
struct  WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base
 
struct  WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
 
struct  WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
 
struct  WarpGemmAttributeMfmaImpl_i32_16x16x64_i8
 
struct  WarpGemmAttributeMfmaImpl_i32_32x32x32_i8
 
struct  WarpGemmAttributeSmfmac
 Class describing structured sparsity mfma instructions. More...
 
struct  WarpGemmAttributeSmfmacImplF16F16F32M32N32K16
 
struct  WarpGemmAttributeSmfmacImplF16F16F32M16N16K32
 
struct  AWarpDstrEncodingTrait
 
struct  BWarpDstrEncodingTrait
 
struct  CWarpDstrEncodingTrait
 
struct  WarpGemmAttributeWmma
 
struct  WmmaTraits
 
struct  WarpGemmAttributeWmmaImpl
 
struct  has_wmma_traits
 
struct  WmmaTraits< gfx11_t, fp16_t, fp16_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx11_t, bf16_t, bf16_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, fp16_t, fp16_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, bf16_t, bf16_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx11_t, int8_t, int8_t, int32_t, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, int8_t, int8_t, int32_t, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, fp8_t, fp8_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, bf8_t, bf8_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, fp8_t, bf8_t, float, 16, 16, 16 >
 
struct  WmmaTraits< gfx12_t, bf8_t, fp8_t, float, 16, 16, 16 >
 
struct  WmmaTraitsBase
 
struct  WmmaTraitsBase< gfx11_t, ADType, BDType, CDType >
 
struct  WmmaTraitsBase< gfx12_t, ADType, BDType, CDType >
 
struct  WarpGemmImpl
 
struct  WarpGemmSmfmacImpl
 
struct  BlockGemmAQuantBase
 
struct  AQuantBlockUniversalGemmAsBsCr
 
struct  BlockGemmBQuantBase
 
struct  BQuantBlockUniversalGemmAsBsCr
 
struct  AQuantGemmProblem
 
struct  AQuantGemmHostArgs
 
struct  AQuantGemmKernelArgs
 
struct  AQuantGemmKernel
 
struct  BQuantGemmProblem
 
struct  BQuantGemmHostArgs
 
struct  BQuantGemmKernelArgs
 
struct  BQuantGemmKernel
 
struct  GemmAQuantPipelineAgBgCrImplBase
 
struct  GemmAQuantPipelineAgBgCrDefaultPolicy
 
struct  BaseAQuantGemmPipelineAgBgCrCompV3
 
struct  AQuantGemmPipelineAgBgCrCompV3
 
struct  GemmBQuantPipelineAgBgCrImplBase
 
struct  GemmBQuantPipelineAgBgCrDefaultPolicy
 
struct  BaseBQuantGemmPipelineAgBgCrCompV3
 
struct  BQuantGemmPipelineAgBgCrCompV3
 
struct  TileDistributionEncodingPatternAQ
 
struct  TileDistributionEncodingPatternAQTransposedC
 
struct  TileDistributionEncodingPatternBQ
 
struct  GemmAQuantPipelineProblemBase
 
struct  GemmBQuantPipelineProblemBase
 
struct  TileGemmAQuantTraits
 
struct  TileGemmBQuantTraits
 
struct  GroupedConvBwdDataKernelArgs
 The Grouped Convolution kernel device arguments. More...
 
struct  GroupedConvolutionBackwardDataKernel
 The Grouped Convolution Backward Data kernel template. More...
 
struct  GroupedConvBwdWeightKernelArgs
 The Grouped Convolution kernel device arguments. More...
 
struct  GroupedConvolutionBackwardWeightKernel
 The Grouped Convolution Backward Weight kernel template. More...
 
struct  GroupedConvFwdKernelArgs
 The Grouped Convolution kernel device arguments. More...
 
struct  GroupedConvolutionForwardKernel
 The Grouped Convolution Forward kernel template. More...
 
struct  GroupedConvHostArgs
 The Grouped Conv kernel host arguments. More...
 
struct  GroupedConvTraits
 
struct  TransformConvBwdDataToGemm
 
struct  TransformConvBwdWeightToGemm
 
struct  TransformConvFwdToGemm
 
struct  ImageToColumn
 
struct  BlockImageToColumnProblem
 
struct  TileImageToColumnShape
 
struct  Layernorm2dFwdHostArgs
 
struct  Layernorm2dFwd
 
struct  Layernorm2dFwdPipelineDefaultPolicy
 
struct  Layernorm2dFwdPipelineOnePass
 
struct  Layernorm2dFwdPipelineProblem
 
struct  Layernorm2dFwdPipelineTwoPass
 
struct  Layernorm2dXBiasEnumName
 
struct  Layernorm2dXBiasEnumName< Layernorm2dXBiasEnum::NO_BIAS >
 
struct  Layernorm2dXBiasEnumName< Layernorm2dXBiasEnum::ADD_BIAS >
 
struct  Layernorm2dFusedAddEnumName
 
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::NO_ADD >
 
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::PRE_ADD_STORE >
 
struct  Layernorm2dFusedAddEnumName< Layernorm2dFusedAddEnum::PRE_ADD >
 
struct  Layernorm2dFusedQuantEnumName
 
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::NO_SWEEP >
 
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::DYNAMIC_QUANT >
 
struct  Layernorm2dFusedQuantEnumName< Layernorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT >
 
struct  Layernorm2dFwdTraits
 
struct  BlockNormReduce
 
struct  BlockNormReduceSync
 
struct  BlockNormReduceCrossWarpSync
 
struct  BlockNormReduceProblem
 
struct  GenericPermuteHostArgs
 
struct  GenericPermute
 
struct  GenericPermuteProblem
 
struct  BlockReduce2D
 
struct  BlockReduce2d
 
struct  BlockReduce2dSync
 
struct  BlockReduce2dCrossWarpSync
 
struct  BlockReduce2dTreeCrossWarpSync
 
struct  BlockReduce2dProblem
 
struct  Reduce
 
struct  Reduce2dDefaultPolicy
 
struct  Reduce2dProblem
 
struct  Reduce2dShape
 
struct  Rmsnorm2dFwdHostArgs
 
struct  Rmsnorm2dFwd
 
struct  Rmsnorm2dFwdPipelineDefaultPolicy
 
struct  Rmsnorm2dFwdPipelineModelSensitiveT5Pass
 This T5Pass implements the RMSNorm2d forward pipeline as a variant based on Rmsnorm2dFwdPipelineOnePass and Rmsnorm2dFwdPipelineTwoPass using a T5 model-like method. More...
 
struct  Rmsnorm2dFwdPipelineOnePass
 
struct  Rmsnorm2dFwdPipelineProblem
 
struct  Rmsnorm2dFwdPipelineTwoPass
 
struct  Rmsnorm2dFusedAddEnumName
 
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::NO_ADD >
 
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::PRE_ADD_STORE >
 
struct  Rmsnorm2dFusedAddEnumName< Rmsnorm2dFusedAddEnum::PRE_ADD >
 
struct  Rmsnorm2dFusedQuantEnumName
 
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::NO_SWEEP >
 
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::DYNAMIC_QUANT >
 
struct  Rmsnorm2dFusedQuantEnumName< Rmsnorm2dFusedQuantEnum::SMOOTH_DYNAMIC_QUANT >
 
struct  Rmsnorm2dSensitiveEnumName
 
struct  Rmsnorm2dSensitiveEnumName< Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL >
 
struct  Rmsnorm2dSensitiveEnumName< Rmsnorm2dSensitiveEnum::T5_MODEL_LIKE >
 
struct  Rmsnorm2dFwdTraits
 
struct  MoeSmoothquantHostArgs
 
struct  MoeSmoothquant
 
struct  SmoothquantHostArgs
 
struct  Smoothquant
 
struct  SmoothquantPipelineDefaultPolicy
 
struct  SmoothquantPipelineOnePass
 
struct  SmoothquantPipelineProblem
 
struct  SmoothquantPipelineTwoPass
 
struct  BlockSoftmax2D
 
struct  BlockSoftmax2DProblem
 
struct  BlockTopkStream2D
 
struct  BlockTopkStream2DProblem
 
struct  TopkSoftmaxHostArgs
 
struct  TopkSoftmaxKernel
 
struct  TopkSoftmaxWarpPerRowPipeline
 
struct  TopkSoftmaxWarpPerRowPolicy
 
struct  TopkSoftmaxWarpPerRowProblem
 
struct  naive_attention_fwd_args
 
struct  naive_attention_fwd_traits
 
struct  naive_attention_fwd_kernel_traits
 
struct  naive_attention_fwd_kernel
 

Typedefs

template<index_t N>
using multi_index = array< index_t, N >
 
template<index_t N>
using make_index_sequence = typename __make_integer_seq< impl::__integer_sequence, index_t, N >::seq_type
 
template<typename... Seqs>
using sequence_merge_t = typename sequence_merge< Seqs... >::type
 
template<index_t NSize, index_t I>
using uniform_sequence_gen_t = typename uniform_sequence_gen< NSize, I >::type
 
template<typename T , index_t N>
using statically_indexed_array = tuple_array< T, N >
 
template<typename T , index_t N>
using thread_buffer = tuple_array< T, N >
 
template<typename T , index_t N>
using tuple_array = typename impl::tuple_array_impl< T, N >::type
 
template<typename T >
using is_tuple = decltype(std::declval< T & >().IsTuple())
 
using bfloat16_t = ushort
 
using bf16_t = bfloat16_t
 
using bf16_raw_t = uint16_t
 
using e8m0_t = e8m0_bexp_t
 
using e8m0_raw_t = typename e8m0_t::raw_type
 
using fp8_t = _BitInt(8)
 
using fp8_raw_t = uint8_t
 
using bf8_t = unsigned _BitInt(8)
 
using bf8_raw_t = uint8_t
 
using fp16_hip_t = _Float16
 
using fp16_raw_t = uint16_t
 
using fp16_t = _Float16
 
using half_t = _Float16
 
using fp16x2_t = _Float16
 
using int8_t = int8_t
 
using index_t = int32_t
 
using int32_t = int32_t
 
using long_index_t = int64_t
 
template<index_t v>
using number = constant< v >
 
template<long_index_t v>
using long_number = constant< v >
 
template<bool b>
using bool_constant = constant< b >
 
using fp32_t = float
 
using fp32x2_t = float
 
using bf16x2_t = bfloat16_t
 
using pk_fp4_t = pk_float4_e2m1_t
 
using pk_fp4_raw_t = typename pk_fp4_t::raw_type
 
using int8x2_t = int8_t
 
template<typename T , index_t N>
using ext_vector_t = typename impl::ext_vector< T, N >::type
 
template<typename X , typename Y >
using has_same_scalar_type = std::is_same< typename vector_traits< remove_cvref_t< X > >::scalar_type, typename vector_traits< remove_cvref_t< Y > >::scalar_type >
 
using fp64_t = double
 
using fp64x2_t = double
 
using fp64x4_t = double
 
using fp32x4_t = float
 
using fp32x8_t = float
 
using fp32x16_t = float
 
using fp32x32_t = float
 
using fp32x64_t = float
 
using fp16x4_t = _Float16
 
using fp16x8_t = _Float16
 
using fp16x16_t = _Float16
 
using fp16x32_t = _Float16
 
using fp16x64_t = _Float16
 
using bf16x4_t = bfloat16_t
 
using bf16x8_t = bfloat16_t
 
using bf16x16_t = bfloat16_t
 
using bf16x32_t = bfloat16_t
 
using bf16x64_t = bfloat16_t
 
using int32x2_t = int32_t
 
using int32x4_t = int32_t
 
using int32x8_t = int32_t
 
using int32x16_t = int32_t
 
using int32x32_t = int32_t
 
using int32x64_t = int32_t
 
using uint32x2_t = uint32_t
 
using uint32x4_t = uint32_t
 
using uint32x8_t = uint32_t
 
using uint32x16_t = uint32_t
 
using uint32x32_t = uint32_t
 
using uint32x64_t = uint32_t
 
using int16x2_t = int16_t
 
using int16x4_t = int16_t
 
using int16x8_t = int16_t
 
using int16x16_t = int16_t
 
using int16x32_t = int16_t
 
using int16x64_t = int16_t
 
using uint16x2_t = uint16_t
 
using uint16x4_t = uint16_t
 
using uint16x8_t = uint16_t
 
using uint16x16_t = uint16_t
 
using uint16x32_t = uint16_t
 
using uint16x64_t = uint16_t
 
using int8x4_t = int8_t
 
using int8x8_t = int8_t
 
using int8x16_t = int8_t
 
using int8x32_t = int8_t
 
using int8x64_t = int8_t
 
using uint8x2_t = uint8_t
 
using uint8x4_t = uint8_t
 
using uint8x8_t = uint8_t
 
using uint8x16_t = uint8_t
 
using uint8x32_t = uint8_t
 
using uint8x64_t = uint8_t
 
using fp8x2_t = fp8_t
 
using fp8x4_t = fp8_t
 
using fp8x8_t = fp8_t
 
using fp8x16_t = fp8_t
 
using fp8x32_t = fp8_t
 
using fp8x64_t = fp8_t
 
using bf8x2_t = bf8_t
 
using bf8x4_t = bf8_t
 
using bf8x8_t = bf8_t
 
using bf8x16_t = bf8_t
 
using bf8x32_t = bf8_t
 
using bf8x64_t = bf8_t
 
using pk_int4x2_t = int8_t
 
using pk_int4x4_t = int8_t
 
using pk_int4x8_t = int8_t
 
using pk_int4x16_t = int8_t
 
using pk_int4x32_t = int8_t
 
template<typename TileDistributionEncoding_ , typename DataType_ , typename Policy = DefaultTranspose<DataType_>>
using OutputTileDistributionTraits = TransposeTileDistributionTraits< TileDistributionEncoding_, DataType_, Policy, false >
 
template<typename TileDistributionEncoding_ , typename DataType_ , typename Policy = DefaultTranspose<DataType_>>
using InputTileDistributionTraits = TransposeTileDistributionTraits< TileDistributionEncoding_, DataType_, Policy, true >
 
template<typename encoding , typename shuffle >
using tile_distribution_encoding_shuffle_t = typename tile_distribution_encoding_shuffle< encoding, shuffle >::type
 
template<typename TensorView_ >
using default_linear_bottom_dims = typename impl::default_linear_bottom_dims_impl< TensorView_::buffer_view::get_address_space(), TensorView_::get_num_of_dimension()>::type
 
using magic_division = magic_division32_bit_range
 
template<typename T >
using remove_reference_t = typename std::remove_reference< T >::type
 
template<typename T >
using remove_cv_t = typename std::remove_cv< T >::type
 
template<typename T >
using remove_cvref_t = remove_cv_t< std::remove_reference_t< T > >
 
template<typename T >
using remove_pointer_t = typename std::remove_pointer< T >::type
 
template<typename From , typename To >
using copy_const_t = typename copy_const< From, To >::type
 
template<template< class... > class Op, class... Args>
using is_detected = typename detail::detector< nonesuch, void, Op, Args... >::value_t
 
template<typename T >
using is_static = impl::is_static_impl< remove_cvref_t< T > >
 
template<typename T >
using is_known_at_compile_time = is_static< T >
 
template<typename Tuple_ , std::size_t Idx, typename DefaultType >
using tuple_element_or_default_t = typename tuple_element_or_default< Tuple_, Idx, DefaultType >::type
 
using F8 = ck_tile::fp8_t
 8-bit floating point type More...
 
using BF8 = ck_tile::bf8_t
 8-bit brain floating point type More...
 
using F16 = ck_tile::half_t
 16-bit floating point (half precision) type More...
 
using BF16 = ck_tile::bf16_t
 16-bit brain floating point type More...
 
using F32 = float
 32-bit floating point (single precision) type More...
 
using I8 = int8_t
 8-bit signed integer type More...
 
using I32 = int32_t
 32-bit signed integer type More...
 
template<typename T >
using iter_value_t = typename std::iterator_traits< remove_cvref_t< T > >::value_type
 
template<typename T >
using iter_reference_t = decltype(*std::declval< T & >())
 
template<typename T >
using iter_difference_t = typename std::iterator_traits< remove_cvref_t< T > >::difference_type
 
template<bool kPadM_, bool kPadN_, bool UseSmoothInputScale_, bool UseRawStore_ = true, bool UseMax3_ = false>
using Default2DAndDynamicQuantEpilogueTraits = DynamicQuantEpilogueTraits< kPadM_, kPadN_, UseSmoothInputScale_, UseRawStore_, UseMax3_ >
 
using BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy = BlockFmhaPipelineQXKSVSCustomPolicy< true, true, 3, 3 >
 
using BlockFmhaPipelineQRKSVSAsyncDefaultPolicy = BlockFmhaPipelineQXKSVSCustomPolicy< true, true, 3, 3 >
 
using BlockFmhaPipelineQRKSVSDefaultPolicy = BlockFmhaPipelineQXKSVSCustomPolicy< true, false, 1, 1 >
 
using GemmPipelineAGmemBGmemCRegV2DefaultPolicy = GemmPipelineAGmemBGmemCRegV1DefaultPolicy
 
template<typename ADataType_ , typename BDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , typename ComputeDataType_ = ADataType_, bool FixedVectorSize_ = false, index_t VectorSizeA_ = 1, index_t VectorSizeB_ = 1>
using GemmPipelineProblem = GemmPipelineProblemBase< ADataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, ComputeDataType_, FixedVectorSize_, VectorSizeA_, VectorSizeB_ >
 
template<bool kPadM_, bool kPadN_, bool kPadK_, bool DoubleSmemBuffer_, typename ALayout_ , typename BLayout_ , typename CLayout_ , bool TransposeC_ = false, bool UseStructuredSparsity_ = false>
using PersistentTileGemmUniversalTraits = TileGemmUniversalTraits< kPadM_, kPadN_, kPadK_, DoubleSmemBuffer_, ALayout_, BLayout_, CLayout_, TransposeC_, UseStructuredSparsity_, true >
 
using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImplF16F16F32M16N16K16< WGAttrCtlEnum::Default_ > >>
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M16N16K16< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl< WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ >, 1 > >
 
using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl< WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImplF16F16F32M16N16K16< WGAttrCtlEnum::Default_ > >>
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution< WarpGemmAttributeMfmaImplF16F16F32M16N16K16< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
using WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB< WarpGemmAttributeMfmaImplF16F16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M4N64K4< WGAttrCtlEnum::Default_ >, 4 > >
 
using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M64N4K4< WGAttrCtlEnum::Default_ >, 4 > >
 
using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl< WarpGemmAttributeSmfmac< WarpGemmAttributeSmfmacImplF16F16F32M32N32K16< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl< WarpGemmAttributeSmfmac< WarpGemmAttributeSmfmacImplF16F16F32M16N16K32< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16< WGAttrCtlEnum::Default_ > >>
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl< WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ >, 1 > >
 
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl< WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16< WGAttrCtlEnum::Default_ > >>
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16< WGAttrCtlEnum::Default_ >, 2, AttrNumAccess > >
 
using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4< WGAttrCtlEnum::Default_ >, 4 > >
 
using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4< WGAttrCtlEnum::Default_ >, 4 > >
 
using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8< WGAttrCtlEnum::Default_ >, 2 > >
 
using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8< WGAttrCtlEnum::Default_ >, 2 > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_fp8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_fp8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_bf8_fp8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmMfma_f32_32x32x64_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8< WGAttrCtlEnum::Default_ >, AttrNumAccess > >
 
using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8< WGAttrCtlEnum::Default_ > >>
 
template<index_t swizzle_factor = 2>
using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl< WarpGemmAttributeMfmaIterateKAndTransposedCDistribution_SwizzleB< WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< fp8_t, fp8_t, WGAttrCtlEnum::Default_ >, 2, swizzle_factor > >
 
using WarpGemmMfma_i32_32x32x16_i8_i8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_i32_32x32x16_i8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_i32_32x32x16_i8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_i32_16x16x32_i8_i8 = WarpGemmImpl< WarpGemmAttributeMfma< WarpGemmAttributeMfmaImpl_i32_16x16x32_i8< WGAttrCtlEnum::Default_ > >>
 
using WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed = WarpGemmImpl< WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_i32_16x16x32_i8< WGAttrCtlEnum::Default_ > >>
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< fp8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 = WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base< fp8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< fp8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base< bf8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< bf8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base< bf8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8 = WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< fp8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8 = WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< fp8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8 = WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< bf8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base< bf8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base< fp8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base< fp8_t, bf8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base< bf8_t, fp8_t, Ctrl_ >
 
template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base< bf8_t, bf8_t, Ctrl_ >
 
using DeviceIp = remove_cvref_t< decltype(ck_tile::get_device_arch())>
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16 = WarpGemmAttributeWmmaImpl< WmmaTraits< DeviceIp, fp16_t, fp16_t, float, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16 = WarpGemmAttributeWmmaImpl< WmmaTraits< DeviceIp, bf16_t, bf16_t, float, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8 = WarpGemmAttributeWmmaImpl< WmmaTraits< DeviceIp, int8_t, int8_t, int32_t, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8 = WarpGemmAttributeWmmaImpl< WmmaTraits< gfx12_t, fp8_t, fp8_t, float, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8 = WarpGemmAttributeWmmaImpl< WmmaTraits< gfx12_t, bf8_t, bf8_t, float, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8 = WarpGemmAttributeWmmaImpl< WmmaTraits< gfx12_t, fp8_t, bf8_t, float, 16, 16, 16 > >
 
using WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8 = WarpGemmAttributeWmmaImpl< WmmaTraits< gfx12_t, bf8_t, fp8_t, float, 16, 16, 16 > >
 
template<typename AType , typename BType , typename AccType , index_t MPerWave, index_t NPerWave, index_t KPerWave, bool TransposeC, bool SwizzleA = false, bool UseStructuredSparsity = false, WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using WarpGemmDispatcher = typename impl::WarpGemmDispatcher< AType, BType, AccType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity, AttrNumAccess >::Type
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f16_f16 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf16_bf16 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_i32_16x16x16_i8_i8 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f8_f8 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf8_bf8 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_f8_bf8 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8, kTransC > >
 
template<bool kTransC = false>
using WarpGemmWmma_f32_16x16x16_bf8_f8 = WarpGemmImpl< WarpGemmAttributeWmma< WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8, kTransC > >
 
template<typename ADataType_ , typename AQDataType_ , typename BDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , uint32_t QuantGroupSize_, bool TransposeC_, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using GemmAQuantPipelineProblem = GemmAQuantPipelineProblemBase< ADataType_, AQDataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, TransposeC_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_ >
 
template<typename ADataType_ , typename BDataType_ , typename BQDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , uint32_t QuantGroupSize_, typename ComputeDataType_ = ADataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using GemmBQuantPipelineProblem = GemmBQuantPipelineProblemBase< ADataType_, BDataType_, BQDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_ >
 
using GroupedConvFwdHostArgs = GroupedConvHostArgs< const void *, const void *, void * >
 
using GroupedConvBwdWeightHostArgs = GroupedConvHostArgs< const void *, void *, const void * >
 
using GroupedConvBwdDataHostArgs = GroupedConvHostArgs< void *, const void *, const void * >
 

Enumerations

enum class  coord_transform_enum {
  undefined ,
  pass_through ,
  pad ,
  embed ,
  merge ,
  unmerge ,
  replicate ,
  xor_t ,
  offset ,
  indexing
}
 
enum class  tile_distribution_pattern {
  thread_raked ,
  warp_raked ,
  block_raked
}
 Enumeration describing static tile distribution patterns. More...
 
enum class  amd_buffer_coherence_enum {
  coherence_default = 0 ,
  glc = 1 ,
  slc = 2 ,
  glc_slc = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
 
enum class  bf16_rounding_mode {
  standard = 0 ,
  truncate_with_nan ,
  truncate ,
  standard_asm ,
  rta_asm
}
 
enum class  fp8_rounding_mode {
  standard = 0 ,
  stochastic
}
 
enum class  fp8_interpretation {
  E4M3_OCP = 0 ,
  E5M2_OCP = 1 ,
  E4M3_FNUZ = 2 ,
  E5M2_FNUZ = 3
}
 FP8 interpretation used in conversion algorithms. More...
 
enum class  BlockAttentionBiasEnum {
  NO_BIAS = 0 ,
  ELEMENTWISE_BIAS = 1 ,
  ALIBI = 2
}
 
enum class  GenericAttentionMaskEnum {
  NO_MASK = 0 ,
  MASK_FROM_TOP_LEFT = 1 ,
  MASK_FROM_BOTTOM_RIGHT = 2 ,
  MASK_GENERIC
}
 
enum class  PositionEncodingEnum {
  NO = 0 ,
  ALIBI = 1
}
 
enum class  AlibiMode {
  VERTICAL = 0 ,
  FROM_TOP_LEFT = 1 ,
  FROM_BOTTOM_RIGHT = 2
}
 
enum class  RotaryEmbeddingEnum {
  NONE = 0 ,
  INTERLEAVED = 1 ,
  HALF_ROTATED = 2
}
 
enum class  BlockFmhaPipelineEnum {
  QRKSVS = 0 ,
  QRKSVS_ASYNC ,
  QSKSVS ,
  QRKSVS_ASYNC_TRLOAD
}
 
enum class  FusedMoeGemmWeightPermuteEnum {
  no_permute = 0 ,
  b_nr_kr_kw_nw_kv = 1 ,
  b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
}
 
enum class  FusedMoeGemmPipelineSequencerEnum {
  SLD_A = 1 << 0 ,
  SLD_B = 1 << 1 ,
  GLD_A = 1 << 2 ,
  GLD_B = 1 << 3 ,
  SST_A = 1 << 4 ,
  SST_B = 1 << 5 ,
  GST_O = 1 << 6
}
 
enum class  GemmLoopOrder {
  KMN ,
  MNK
}
 
enum  StreamKReductionStrategy : uint32_t {
  Atomic = 0u ,
  Reduction = 1u
}
 
enum class  GemmPipelineScheduler {
  Default ,
  Intrawave ,
  Interwave
}
 
enum class  TailNumber {
  Odd ,
  Even ,
  One ,
  Two ,
  Three ,
  Four ,
  Five ,
  Six ,
  Seven ,
  Empty ,
  Full
}
 
enum class  WGAttrNumAccessEnum {
  Single = 1 ,
  Double = 2 ,
  Quad = 4 ,
  Invalid = -1
}
 
enum class  WGAttrCtlEnum {
  Default_ = 0 ,
  Raw_vvv = 1 ,
  Raw_vaa = 2 ,
  Raw_vav = 3 ,
  Raw_vva = 4 ,
  Raw_avv = 5
}
 
enum class  ConvolutionSpecialization {
  Default ,
  Filter1x1Pad0 ,
  Filter1x1Stride1Pad0 ,
  Filter3x3
}
 
enum class  Layernorm2dXBiasEnum {
  NO_BIAS = 0 ,
  ADD_BIAS = 1
}
 
enum class  Layernorm2dFusedAddEnum {
  NO_ADD = 0 ,
  PRE_ADD_STORE = 1 ,
  PRE_ADD = 2
}
 
enum class  Layernorm2dFusedQuantEnum {
  NO_SWEEP = 0 ,
  SMOOTH_DYNAMIC_QUANT = 1 ,
  DYNAMIC_QUANT = 2
}
 
enum class  Rmsnorm2dFusedAddEnum {
  NO_ADD = 0 ,
  PRE_ADD_STORE = 1 ,
  PRE_ADD = 2
}
 
enum class  Rmsnorm2dFusedQuantEnum {
  NO_SWEEP = 0 ,
  SMOOTH_DYNAMIC_QUANT = 1 ,
  DYNAMIC_QUANT = 2
}
 
enum class  Rmsnorm2dSensitiveEnum {
  NO_SPECIFIC_MODEL = 0 ,
  T5_MODEL_LIKE = 1
}
 
enum class  naive_attention_layout_enum {
  DEFAULT ,
  BSHD ,
  BHSD ,
  BS3HD ,
  PHSD ,
  PHDSX ,
  PHDS ,
  SCALE_HS ,
  SCALE_SH
}
 
enum class  naive_attention_variation_enum {
  FLASH_BATCHED = 0 ,
  FLASH_GROUPED ,
  DECODE_PAGED
}
 
enum class  naive_attention_quant_algo {
  NO = 0 ,
  KV_8BIT_PERHEAD = 1 ,
  KV_8BIT_PERTOKEN = 2
}
 

Functions

template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
constexpr CK_TILE_HOST_DEVICE auto make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::size(), 1 >::type{})
 
template<typename LowLength >
constexpr CK_TILE_HOST_DEVICE auto make_pass_through_transform (const LowLength &low_length)
 
template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
 
template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
 
template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{})
 
template<typename UpLengths , typename Coefficients , typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients)
 
template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto make_merge_transform_v2_magic_division (const LowLengths &low_lengths)
 
template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto make_merge_transform_v3_division_mod (const LowLengths &low_lengths)
 
template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto make_merge_transform (const LowLengths &low_lengths)
 
template<typename UpLengths , bool Use24BitIntegerCalculation = false>
constexpr CK_TILE_HOST_DEVICE auto make_unmerge_transform (const UpLengths &up_lengths, bool_constant< Use24BitIntegerCalculation >=bool_constant< false >{})
 
template<typename LowerIndex >
constexpr CK_TILE_HOST_DEVICE auto make_freeze_transform (const LowerIndex &low_idx)
 
template<typename UpperIndex >
constexpr CK_TILE_HOST_DEVICE auto make_insert_transform (const UpperIndex &up_idx)
 
template<typename UpLengths >
constexpr CK_TILE_HOST_DEVICE auto make_replicate_transform (const UpLengths &up_lengths)
 
template<typename LowLength , typename SliceBegin , typename SliceEnd >
constexpr CK_TILE_HOST_DEVICE auto make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end)
 
template<typename Modulus , typename UpLength >
constexpr CK_TILE_HOST_DEVICE auto make_modulo_transform (const Modulus &modulus, const UpLength &up_length)
 
template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto make_xor_transform (const LowLengths &low_lengths)
 
template<typename LowLength , typename OffsetLength >
constexpr CK_TILE_HOST_DEVICE auto make_offset_transform (const LowLength &low_length, const OffsetLength &offset_length)
 
template<typename UpLength , typename Indices >
constexpr CK_TILE_HOST_DEVICE auto make_indexing_transform (const UpLength &up_lengths, const Indices &indices)
 
template<typename UpLength , typename IndexingAdaptor >
constexpr CK_TILE_HOST_DEVICE auto make_indexing_transform_with_adaptor (const UpLength &up_lengths, const IndexingAdaptor &iadaptor)
 
constexpr const char * tile_distribution_pattern_to_string (tile_distribution_pattern pattern)
 
template<index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize, tile_distribution_pattern DistributionPattern, index_t NumWaveGroups>
CK_TILE_HOST_DEVICE void print (const TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &)
 
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff)
 
CK_TILE_DEVICE void buffer_load_fence (index_t cnt=0)
 
CK_TILE_DEVICE void lds_load_fence (index_t cnt=0)
 
template<typename... T>
CK_TILE_DEVICE void buffer_load_fence (index_t cnt=0, T &... o)
 
CK_TILE_DEVICE void buffer_store_fence (index_t cnt=0)
 
CK_TILE_DEVICE auto async_load_fence_raw (index_t cnt=0)
 
CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
 
CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
 
CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
 
CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
 
CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
 
CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
 
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
 
CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
 
CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
 
CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
 
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
 
CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
 
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
 
CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
 
CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
 
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
 
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
 
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
 
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
 
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
 
template<unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dwordxn_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
 
CK_TILE_DEVICE void async_buffer_load_fence (index_t cnt=0)
 
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< int8_t, N > amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_impl (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
 
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
 
template<typename T , index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
 
template<typename T , index_t N>
CK_TILE_DEVICE void amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t LaneGroupSize, index_t kOuterDistDim0, index_t kOuterDistDim1, index_t kInnerDistDim0, index_t kInnerDistDim1>
constexpr CK_TILE_DEVICE auto make_transposed_distr_encode ()
 
template<typename T , typename ComputeType >
CK_TILE_HOST_DEVICE T add (const T &a, const T &b)
 
CK_TILE_HOST_DEVICE bf16x2_t add_bf16x2_t (const bf16x2_t &a, const bf16x2_t &b)
 
CK_TILE_HOST_DEVICE bf16x4_t add_bf16x4_t (const bf16x4_t &a, const bf16x4_t &b)
 
CK_TILE_HOST_DEVICE fp16x2_t add_f16x2_t (const fp16x2_t &a, const fp16x2_t &b)
 
CK_TILE_HOST_DEVICE fp8x4_t add_fp8x4_t (const fp8x4_t &a, const fp8x4_t &b)
 
CK_TILE_HOST_DEVICE fp8x8_t add_fp8x8_t (const fp8x8_t &a, const fp8x8_t &b)
 
CK_TILE_HOST_DEVICE bf8x4_t add_bf8x4_t (const bf8x4_t &a, const bf8x4_t &b)
 
CK_TILE_HOST_DEVICE bf8x8_t add_bf8x8_t (const bf8x8_t &a, const bf8x8_t &b)
 
template<typename X >
CK_TILE_DEVICE void atomic_add (X *p_dst, const X &x)
 
template<>
CK_TILE_DEVICE void atomic_add< bf16x2_t > (bf16x2_t *p_dst, const bf16x2_t &x)
 
template<>
CK_TILE_DEVICE void atomic_add< bf16x4_t > (bf16x4_t *p_dst, bf16x4_t const &x)
 
template<>
CK_TILE_DEVICE void atomic_add< fp8x4_t > (fp8x4_t *p_dst, const fp8x4_t &x)
 
template<>
CK_TILE_DEVICE void atomic_add< bf8x4_t > (bf8x4_t *p_dst, const bf8x4_t &x)
 
template<>
CK_TILE_DEVICE void atomic_add< fp8x8_t > (fp8x8_t *p_dst, fp8x8_t const &x)
 
template<>
CK_TILE_DEVICE void atomic_add< bf8x8_t > (bf8x8_t *p_dst, bf8x8_t const &x)
 
template<>
CK_TILE_DEVICE void atomic_add< fp16x2_t > (fp16x2_t *p_dst, fp16x2_t const &x)
 
template<typename T , index_t N>
CK_TILE_DEVICE void atomic_add_g (T *p_dst, const thread_buffer< T, N > &x)
 
template<typename T , index_t N>
CK_TILE_DEVICE void atomic_max_g (T *p_dst, const thread_buffer< T, N > &x)
 
CK_TILE_DEVICE void m0_set_with_memory (index_t v)
 
CK_TILE_DEVICE void m0_inc_with_memory (index_t v)
 
template<typename T >
CK_TILE_DEVICE T warp_shuffle_up (const T &v_local, uint32_t lane_delta)
 
template<typename T >
CK_TILE_DEVICE T warp_shuffle_down (const T &v_local, uint32_t lane_delta)
 
template<typename T >
CK_TILE_DEVICE auto warp_shuffle_down_pair (const T &v_local)
 
template<typename T >
CK_TILE_DEVICE T warp_shuffle (const T &v_local, uint32_t src_lane)
 
template<typename T >
CK_TILE_DEVICE auto flag_to_exec (const T &v_flag)
 
template<typename X , typename Y >
CK_TILE_DEVICE auto cmp_lt_to_exec (const X &x, const Y &y)
 
template<typename D = void, typename... Ts>
constexpr CK_TILE_HOST_DEVICE details::return_type< D, Ts... > make_array (Ts &&... ts)
 
template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE auto make_array_with (std::initializer_list< T > ilist)
 
template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE bool operator== (const array< T, Size > &a, const array< T, Size > &b)
 
template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE bool operator!= (const array< T, Size > &a, const array< T, Size > &b)
 
template<typename T , index_t N, typename X >
constexpr CK_TILE_HOST_DEVICE auto to_array (const std::vector< X > &x)
 
template<typename T , index_t N, typename X >
constexpr CK_TILE_HOST_DEVICE auto to_array (const X &x)
 
template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto container_push_back (const array< TData, NSize > &a, const TData &x)
 
template<typename... Ts, typename T >
constexpr CK_TILE_HOST_DEVICE auto container_push_front (const tuple< Ts... > &a, const T &x)
 
template<typename... Ts, typename T >
constexpr CK_TILE_HOST_DEVICE auto container_push_back (const tuple< Ts... > &a, const T &x)
 
template<typename TData , index_t NSize, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_new2old (const array< TData, NSize > &old_array, sequence< IRs... >)
 
template<typename TData , index_t NSize, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_old2new (const array< TData, NSize > &old_array, sequence< IRs... > old2new)
 
template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_new2old (const array< TData, NSize > &old_array, const map< index_t, index_t > &new2old)
 
template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_old2new (const array< TData, NSize > &old_array, const map< index_t, index_t > &old2new)
 
template<typename... Ts, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_new2old (const tuple< Ts... > &old_tuple, sequence< IRs... >)
 
template<typename... Ts, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_old2new (const tuple< Ts... > &old_tuple, sequence< IRs... > old2new)
 
template<index_t... Is, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_new2old (sequence< Is... >, sequence< IRs... >)
 
template<index_t... Is, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto container_reorder_given_old2new (sequence< Is... > old_seq, sequence< IRs... >)
 
template<typename Container , typename Reduce , typename ROld , index_t I, index_t IEnd, index_t IStep>
constexpr CK_TILE_HOST_DEVICE auto container_reduce_impl (const Container &x, Reduce reduce, ROld r_old, number< I > i, number< IEnd >, number< IStep >)
 
template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1>
constexpr CK_TILE_HOST_DEVICE auto container_reduce (const Container &x, Reduce reduce, Init init, number< IBegin > = number< 0 >{}, number< IEnd > = number< Container::size()>{}, number< IStep > = number< 1 >{})
 
template<typename TData , index_t NSize, typename Reduce >
constexpr CK_TILE_HOST_DEVICE auto container_reverse_inclusive_scan (const array< TData, NSize > &x, Reduce f, TData init)
 
template<typename TData , index_t NSize, typename Reduce , typename Init >
constexpr CK_TILE_HOST_DEVICE auto container_reverse_exclusive_scan (const array< TData, NSize > &x, Reduce f, Init init)
 
template<index_t... Is, typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto container_reverse_exclusive_scan (const sequence< Is... > &seq, Reduce f, number< Init >)
 
template<typename... Xs, typename Reduce , index_t I, typename YOld , typename ROld >
constexpr CK_TILE_HOST_DEVICE auto container_reverse_exclusive_scan_impl (const tuple< Xs... > &x, Reduce reduce, number< I > i, YOld y_old, ROld r_old)
 
template<typename... Xs, typename Reduce , typename Init >
constexpr CK_TILE_HOST_DEVICE auto container_reverse_exclusive_scan (const tuple< Xs... > &x, Reduce reduce, Init init)
 
template<typename... Xs, typename Reduce , typename TData >
constexpr CK_TILE_HOST_DEVICE auto container_reverse_inclusive_scan (const tuple< Xs... > &x, Reduce f, TData init)
 
template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto container_concat (const X &x, const Ys &... ys)
 
template<typename T , index_t NX, index_t NY>
constexpr CK_TILE_HOST_DEVICE auto container_concat (const array< T, NX > &ax, const array< T, NY > &ay)
 
template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto container_concat (const tuple< X... > &tx, const tuple< Y... > &ty)
 
template<typename Container >
constexpr CK_TILE_HOST_DEVICE auto container_concat (const Container &x)
 
template<typename T , index_t N, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto get_container_subset (const array< T, N > &arr, sequence< Is... >)
 
template<typename... Ts, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto get_container_subset (const tuple< Ts... > &tup, sequence< Is... >)
 
template<typename T , index_t N, index_t... Is>
constexpr CK_TILE_HOST_DEVICE void set_container_subset (array< T, N > &y, sequence< Is... > picks, const array< T, sizeof...(Is)> &x)
 
template<typename Y , typename X , index_t... Is>
constexpr CK_TILE_HOST_DEVICE void set_container_subset (Y &y, sequence< Is... > picks, const X &x)
 
template<index_t... Is>
constexpr index_t container_find (sequence< Is... > seq, index_t value)
 
template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto sequence_to_tuple_of_number (sequence< Is... >)
 
template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE auto make_multi_index (Xs &&... xs)
 
template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto make_zero_multi_index ()
 
template<typename T >
constexpr CK_TILE_HOST_DEVICE auto to_multi_index (const T &x)
 
template<index_t NSize, typename X >
constexpr CK_TILE_HOST_DEVICE auto operator+= (multi_index< NSize > &y, const X &x)
 
template<index_t NSize, typename X >
constexpr CK_TILE_HOST_DEVICE auto operator-= (multi_index< NSize > &y, const X &x)
 
template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto operator+ (const multi_index< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto operator- (const multi_index< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto operator* (const multi_index< NSize > &a, const T &b)
 
template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto operator* (index_t a, const multi_index< NSize > &x)
 
template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto operator* (const multi_index< NSize > &x, index_t a)
 
template<index_t I, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto sequence_pop_front (sequence< I, Is... >)
 
template<typename Seq >
constexpr CK_TILE_HOST_DEVICE auto sequence_pop_back (Seq)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE bool operator== (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE bool operator!= (sequence< Xs... > x, sequence< Ys... > y)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator+ (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator- (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator* (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator/ (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator% (sequence< Xs... >, sequence< Ys... >)
 
template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto operator+ (sequence< Xs... >, number< Y >)
 
template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto operator- (sequence< Xs... >, number< Y >)
 
template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto operator* (sequence< Xs... >, number< Y >)
 
template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto operator/ (sequence< Xs... >, number< Y >)
 
template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto operator% (sequence< Xs... >, number< Y >)
 
template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto operator+ (number< Y >, sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto operator- (number< Y >, sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto operator* (number< Y >, sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto operator/ (number< Y >, sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto operator% (number< Y >, sequence< Xs... >)
 
template<typename... Seqs>
constexpr CK_TILE_HOST_DEVICE auto merge_sequences (Seqs...)
 
template<typename F , index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto transform_sequences (F f, sequence< Xs... >)
 
template<typename F , index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto transform_sequences (F f, sequence< Xs... >, sequence< Ys... >)
 
template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
constexpr CK_TILE_HOST_DEVICE auto transform_sequences (F f, sequence< Xs... >, sequence< Ys... >, sequence< Zs... >)
 
template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto reverse_inclusive_scan_sequence (Seq, Reduce, number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto reverse_exclusive_scan_sequence (Seq, Reduce, number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto inclusive_scan_sequence (Seq, Reduce, number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
constexpr auto exclusive_scan_sequence (Seq, Reduce, number< Init >)
 
template<typename Seq >
constexpr auto prefix_sum_sequence (Seq)
 
template<typename Seq , index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto pick_sequence_elements_by_ids (Seq, sequence< Is... >)
 
template<typename Seq , typename Mask >
constexpr CK_TILE_HOST_DEVICE auto pick_sequence_elements_by_mask (Seq, Mask)
 
template<typename Seq , typename Values , typename Ids >
constexpr CK_TILE_HOST_DEVICE auto modify_sequence_elements_by_ids (Seq, Values, Ids)
 
template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE index_t reduce_on_sequence (Seq, Reduce f, number< Init >)
 
template<typename Seq , typename F >
constexpr CK_TILE_HOST_DEVICE bool sequence_any_of (Seq, F f)
 
template<typename Seq , typename F >
constexpr CK_TILE_HOST_DEVICE bool sequence_all_of (Seq, F f)
 
template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto make_sequence (number< Is >...)
 
template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto generate_sequence (F, number< N >)
 
template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto generate_sequence_v2 (F &&f, number< N >)
 
template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto to_sequence (tuple< number< Is >... >)
 
template<typename SeqSortedSamples , index_t r, index_t... rs>
constexpr CK_TILE_HOST_DEVICE auto histogram_sorted_sequence (SeqSortedSamples, sequence< r, rs... >)
 
template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto generate_array (F &&f, number< N >)
 
template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{})
 
template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{})
 
template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto make_thread_buffer (Ts &&... ts)
 
template<typename... T>
CK_TILE_HOST_DEVICE void print (const tuple< T... > &t)
 
template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE bool operator== (const tuple< Xs... > &a, const tuple< Xs... > &b)
 
template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE bool operator!= (const tuple< Xs... > &a, const tuple< Xs... > &b)
 
template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE auto make_tuple (Xs &&... xs)
 
template<typename... Args>
constexpr tuple< Args &... > tie (Args &... args) noexcept
 
template<typename F , index_t... ids>
constexpr CK_TILE_HOST_DEVICE auto generate_tuple_for (F &&f, sequence< ids... >)
 
template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto generate_tuple (F &&f, number< N >)
 
template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto generate_tie (F &&f, number< N >)
 
template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto concat_tuple_of_reference (const tuple< X &... > &tx, const tuple< Y &... > &ty)
 
template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto concat_tuple (const tuple< X... > &tx, const tuple< Y... > &ty)
 
template<typename... X>
constexpr CK_TILE_HOST_DEVICE auto concat_tuple (const tuple< X... > &tx)
 
template<typename... X, typename... Tuples>
constexpr CK_TILE_HOST_DEVICE auto concat_tuple (const tuple< X... > &tx, const Tuples &... tuples)
 
template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto transform_tuples (F f, const X &x)
 
template<typename F , typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto transform_tuples (F f, const X &x, const Y &y)
 
template<typename F , typename X , typename Y , typename Z >
constexpr CK_TILE_HOST_DEVICE auto transform_tuples (F f, const X &x, const Y &y, const Z &z)
 
template<typename F , typename Tuple >
constexpr decltype(auto) apply (F &&f, Tuple &&t)
 
template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto embed_tuples (F f, const X &x)
 
template<index_t Depth = 0, index_t MaxDepth = -1>
constexpr CK_TILE_HOST_DEVICE auto unroll_nested_tuple (const tuple<> &t)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
constexpr CK_TILE_HOST_DEVICE auto unroll_nested_tuple (const T &t)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto unroll_nested_tuple (const tuple< Ts... > &t)
 
template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto tuple_reverse (const tuple< Ts... > &t)
 
template<index_t Idx, index_t End, typename F , typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto tuple_reduce (F &&f, const tuple< Ts... > &t)
 
template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto is_nested_tuple (const tuple< Ts... > &)
 
template<index_t depth = 0, typename T >
constexpr CK_TILE_HOST_DEVICE auto tuple_depth (const T &)
 
template<index_t depth = 0, typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto tuple_depth (const tuple< Ts... > &)
 
template<typename... Seqs>
constexpr CK_TILE_HOST_DEVICE auto to_array_of_array (tuple< Seqs... > t_of_s)
 
template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator+= (tuple< Ys... > &y, const X &x)
 
template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator-= (tuple< Ys... > &y, const X &x)
 
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator+ (const tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator+ (const tuple< Xs... > &x, const tuple< Ys... > &y)
 
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator- (const tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator- (const tuple< Xs... > &x, const tuple< Ys... > &y)
 
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator* (const tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator* (Y a, const tuple< Xs... > &x)
 
template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto operator* (const tuple< Xs... > &x, Y a)
 
template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator* (const tuple< Xs... > &x, const tuple< Ys... > &y)
 
template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto operator/ (const tuple< Xs... > &x, const tuple< Ys... > &y)
 
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE uint16_t float_to_bf16_raw (float f, constant< rounding >={})
 
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE uint16_t double_to_bf16_raw (double f, constant< rounding >={})
 
constexpr CK_TILE_HOST_DEVICE float bf16_to_float_raw (uint16_t x)
 
constexpr CK_TILE_HOST_DEVICE double bf16_to_double_raw (uint16_t x)
 
constexpr CK_TILE_HOST_DEVICE uint16_t float_to_bf16_rtn_raw (float f)
 
constexpr CK_TILE_HOST uint16_t float_to_bf16_rtn_asm (float f)
 
CK_TILE_HOST uint16_t float_to_bf16_rta_asm (float f)
 
constexpr CK_TILE_HOST_DEVICE uint16_t float_to_bf16_truc_nan_raw (float f)
 
constexpr CK_TILE_HOST_DEVICE uint16_t float_to_bf16_truc_raw (float f)
 
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE bfloat16_t float_to_bf16 (float f, constant< rounding >={})
 
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE bfloat16_t double_to_bf16 (double f, constant< rounding >={})
 
constexpr CK_TILE_HOST_DEVICE float bf16_to_float (bfloat16_t x)
 
constexpr CK_TILE_HOST_DEVICE double bf16_to_double (bfloat16_t x)
 
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t fp16_to_bf16 (half_t f, constant< rounding >={})
 
constexpr CK_TILE_HOST_DEVICE half_t bf16_to_fp16 (bfloat16_t x)
 
CK_TILE_HOST_DEVICE bfloat16_t abs (const bfloat16_t &x)
 
CK_TILE_HOST_DEVICE bool isnan (const bfloat16_t &x)
 
CK_TILE_DEVICE bfloat16_t sqrt (bfloat16_t x)
 
CK_TILE_DEVICE bfloat16_t exp (bfloat16_t x)
 
CK_TILE_DEVICE bfloat16_t exp2 (bfloat16_t x)
 
CK_TILE_DEVICE bfloat16_t log (bfloat16_t x)
 
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t float_to_fp8_raw (float, constant< rounding >={})
 
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t float_to_bf8_raw (float, constant< rounding >={})
 
CK_TILE_HOST_DEVICE float fp8_to_float_raw (uint8_t)
 
CK_TILE_HOST_DEVICE float bf8_to_float_raw (uint8_t)
 
template<typename SrcT , typename DstT >
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type float_to_fp8_sr_raw (SrcT x)
 Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding. More...
 
template<typename SrcT , typename DstT >
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type float_to_fp8_rtn_raw (SrcT x)
 Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even. More...
 
template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_raw (float x, constant< rounding >)
 
template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_raw (float x, constant< rounding >)
 
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE fp8_t float_to_fp8 (float x, constant< rounding >={})
 
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE bf8_t float_to_bf8 (float x, constant< rounding >={})
 
CK_TILE_HOST_DEVICE float fp8_to_float (fp8_t x)
 
CK_TILE_HOST_DEVICE float bf8_to_float (bf8_t x)
 
template<typename T >
CK_TILE_HOST_DEVICE T abs (const T &x)
 
CK_TILE_HOST_DEVICE bool isnan (const fp8_t &x)
 
CK_TILE_HOST_DEVICE bool isnan (const bf8_t &x)
 
constexpr CK_TILE_HOST_DEVICE float fp16_to_float_hip (const fp16_hip_t &x)
 
constexpr CK_TILE_HOST_DEVICE double fp16_to_double_hip (const fp16_hip_t &x)
 
constexpr CK_TILE_HOST_DEVICE fp16_hip_t float_to_fp16_hip (const float &x)
 
constexpr CK_TILE_HOST_DEVICE fp16_hip_t double_to_fp16_hip (const double &x)
 
constexpr CK_TILE_HOST_DEVICE float fp16_to_float (const half_t &x)
 
constexpr CK_TILE_HOST_DEVICE float fp16_to_double (const half_t &x)
 
constexpr CK_TILE_HOST_DEVICE half_t float_to_fp16 (const float &x)
 
constexpr CK_TILE_HOST_DEVICE half_t double_to_fp16 (const double &x)
 
CK_TILE_HOST fp16x2_t pk_add_f16 (const fp16x2_t &x, const fp16x2_t &y)
 
constexpr CK_TILE_HOST_DEVICE float int8_to_float (const int8_t &x)
 
constexpr CK_TILE_HOST_DEVICE int8_t float_to_int8 (const float &x)
 
template<typename Scale >
__host__ __device__ scales (Scale) -> scales< Scale >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
__host__ __device__ plus () -> plus< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
__host__ __device__ minus () -> minus< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
__host__ __device__ multiplies () -> multiplies< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto integer_divide_floor (X x, Y y)
 
template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto integer_divide_ceil (X x, Y y)
 
template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto integer_least_multiple (X x, Y y)
 
template<typename T >
constexpr CK_TILE_HOST_DEVICE T max (T x)
 
template<typename T >
constexpr CK_TILE_HOST T max (T x, T y)
 
template<typename T >
constexpr CK_TILE_DEVICE T max (T x, T y)
 
template<>
constexpr CK_TILE_DEVICE float max (float x, float y)
 
template<>
constexpr CK_TILE_DEVICE double max (double x, double y)
 
template<index_t X>
constexpr CK_TILE_HOST_DEVICE index_t max (number< X >, index_t y)
 
template<index_t Y>
constexpr CK_TILE_HOST_DEVICE index_t max (index_t x, number< Y >)
 
template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto max (X x, Ys... ys)
 
template<typename T >
constexpr CK_TILE_HOST_DEVICE T min (T x)
 
template<typename T >
constexpr CK_TILE_HOST T min (T x, T y)
 
template<typename T >
constexpr CK_TILE_DEVICE T min (T x, T y)
 
template<>
constexpr CK_TILE_DEVICE float min (float x, float y)
 
template<>
constexpr CK_TILE_DEVICE double min (double x, double y)
 
template<index_t X>
constexpr CK_TILE_HOST_DEVICE index_t min (number< X >, index_t y)
 
template<index_t Y>
constexpr CK_TILE_HOST_DEVICE index_t min (index_t x, number< Y >)
 
template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto min (X x, Ys... ys)
 
template<typename T >
constexpr CK_TILE_HOST_DEVICE T clamp (const T &x, const T &lowerbound, const T &upperbound)
 
CK_TILE_HOST int clz (uint32_t x)
 
constexpr CK_TILE_HOST_DEVICE index_t gcd (index_t x, index_t y)
 
template<index_t X, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto gcd (number< X >, number< Y >)
 
template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto lcm (X x, Y y)
 
__host__ __device__ equal () -> equal< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
__host__ __device__ less () -> less< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
__host__ __device__ less_equal () -> less_equal< void, void >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
constexpr CK_TILE_HOST_DEVICE int32_t next_power_of_two (int32_t x)
 
template<index_t X>
constexpr CK_TILE_HOST_DEVICE auto next_power_of_two ()
 
template<index_t X>
constexpr CK_TILE_HOST_DEVICE auto next_power_of_two (number< X >)
 
constexpr CK_TILE_HOST_DEVICE int32_t integer_log2_floor (int32_t x)
 
constexpr CK_TILE_HOST_DEVICE bool is_power_of_two_integer (int32_t x)
 
CK_TILE_DEVICE float exp2 (float x)
 
CK_TILE_DEVICE uint16_t sad_u16 (uint16_t x, uint16_t y, uint16_t acc)
 
CK_TILE_DEVICE uint32_t sad_u32 (uint32_t x, uint32_t y, uint32_t acc)
 
CK_TILE_HOST float abs (float x)
 
CK_TILE_HOST double abs (double x)
 
CK_TILE_HOST int8_t abs (int8_t x)
 
CK_TILE_HOST int32_t abs (int32_t x)
 
CK_TILE_HOST fp16_t abs (fp16_t x)
 
CK_TILE_HOST bool isnan (float x)
 
CK_TILE_HOST bool isnan (double x)
 
CK_TILE_HOST bool isnan (int8_t x)
 
CK_TILE_HOST bool isnan (int32_t x)
 
CK_TILE_HOST bool isnan (fp16_t x)
 
CK_TILE_HOST fp16_t sqrt (fp16_t x)
 
CK_TILE_HOST float sqrt (float x)
 
CK_TILE_HOST double sqrt (double x)
 
template<typename T >
CK_TILE_HOST T tanh (T x)
 
template<>
CK_TILE_HOST float tanh< float > (float x)
 
template<>
CK_TILE_HOST double tanh< double > (double x)
 
template<typename T >
CK_TILE_HOST T acos (T x)
 
template<>
CK_TILE_HOST float acos< float > (float x)
 
template<>
CK_TILE_HOST double acos< double > (double x)
 
template<typename T >
CK_TILE_HOST T neg (T x)
 
template<>
CK_TILE_HOST float neg< float > (float x)
 
template<>
CK_TILE_HOST double neg< double > (double x)
 
template<>
CK_TILE_HOST int32_t neg< int32_t > (int32_t x)
 
template<>
CK_TILE_HOST int8_t neg< int8_t > (int8_t x)
 
template<typename T >
CK_TILE_HOST T atan (T x)
 
template<>
CK_TILE_HOST float atan< float > (float x)
 
template<>
CK_TILE_HOST double atan< double > (double x)
 
template<typename T >
CK_TILE_HOST T sin (T x)
 
template<>
CK_TILE_HOST float sin< float > (float x)
 
template<>
CK_TILE_HOST double sin< double > (double x)
 
template<typename T >
CK_TILE_HOST T asin (T x)
 
template<>
CK_TILE_HOST float asin< float > (float x)
 
template<>
CK_TILE_HOST double asin< double > (double x)
 
template<typename T >
CK_TILE_HOST T asinh (T x)
 
template<>
CK_TILE_HOST float asinh< float > (float x)
 
template<>
CK_TILE_HOST double asinh< double > (double x)
 
template<typename T >
CK_TILE_HOST T cos (T x)
 
template<>
CK_TILE_HOST float cos< float > (float x)
 
template<>
CK_TILE_HOST double cos< double > (double x)
 
template<typename T >
CK_TILE_HOST T acosh (T x)
 
template<>
CK_TILE_HOST float acosh< float > (float x)
 
template<>
CK_TILE_HOST double acosh< double > (double x)
 
template<typename T >
CK_TILE_HOST T tan (T x)
 
template<>
CK_TILE_HOST float tan< float > (float x)
 
template<>
CK_TILE_HOST double tan< double > (double x)
 
template<typename T >
CK_TILE_HOST T atanh (T x)
 
template<>
CK_TILE_HOST float atanh< float > (float x)
 
template<>
CK_TILE_HOST double atanh< double > (double x)
 
template<typename T >
CK_TILE_HOST T sinh (T x)
 
template<>
CK_TILE_HOST float sinh< float > (float x)
 
template<>
CK_TILE_HOST double sinh< double > (double x)
 
template<typename T >
CK_TILE_HOST T ceil (T x)
 
template<>
CK_TILE_HOST float ceil< float > (float x)
 
template<>
CK_TILE_HOST double ceil< double > (double x)
 
template<typename T >
CK_TILE_HOST T cosh (T x)
 
template<>
CK_TILE_HOST float cosh< float > (float x)
 
template<>
CK_TILE_HOST double cosh< double > (double x)
 
template<typename T >
CK_TILE_HOST T floor (T x)
 
template<>
CK_TILE_HOST float floor< float > (float x)
 
template<>
CK_TILE_HOST double floor< double > (double x)
 
template<typename T >
CK_TILE_HOST T rcp (T x)
 
template<typename T >
CK_TILE_HOST T exp (T x)
 
template<>
CK_TILE_HOST float exp< float > (float x)
 
template<>
CK_TILE_HOST double exp< double > (double x)
 
template<typename T >
CK_TILE_HOST T log (T x)
 
template<>
CK_TILE_HOST float log< float > (float x)
 
template<>
CK_TILE_HOST double log< double > (double x)
 
template<typename T >
CK_TILE_HOST T pow (T x, T gamma)
 
template<>
CK_TILE_HOST float pow< float > (float x, float gamma)
 
template<>
CK_TILE_HOST double pow< double > (double x, double gamma)
 
template<typename T >
CK_TILE_HOST T expm1 (T x)
 
template<>
CK_TILE_HOST float expm1< float > (float x)
 
template<>
CK_TILE_HOST double expm1< double > (double x)
 
template<typename T >
CK_TILE_DEVICE T tanh (T x)
 
template<typename T >
CK_TILE_DEVICE T acos (T x)
 
template<typename T >
CK_TILE_DEVICE T neg (T x)
 
template<>
CK_TILE_DEVICE fp16_t neg< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T atan (T x)
 
template<typename T >
CK_TILE_DEVICE T sin (T x)
 
template<>
CK_TILE_DEVICE fp16_t sin< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T asin (T x)
 
template<typename T >
CK_TILE_DEVICE T asinh (T x)
 
template<typename T >
CK_TILE_DEVICE T acosh (T x)
 
template<typename T >
CK_TILE_DEVICE T tan (T x)
 
template<typename T >
CK_TILE_DEVICE T atanh (T x)
 
template<typename T >
CK_TILE_DEVICE T sinh (T x)
 
template<typename T >
CK_TILE_DEVICE T ceil (T x)
 
template<>
CK_TILE_DEVICE fp16_t ceil< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T cosh (T x)
 
template<typename T >
CK_TILE_DEVICE T floor (T x)
 
template<>
CK_TILE_DEVICE fp16_t floor< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T rcp (T x)
 
template<typename T >
CK_TILE_DEVICE T exp (T x)
 
template<>
CK_TILE_DEVICE fp16_t exp< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T tanh_fast (T x)
 
template<>
CK_TILE_DEVICE float tanh_fast< float > (float x)
 
template<typename T >
CK_TILE_DEVICE T log (T x)
 
template<>
CK_TILE_DEVICE fp16_t log< fp16_t > (fp16_t x)
 
template<typename T >
CK_TILE_DEVICE T pow (T x, T gamma)
 
template<typename T >
CK_TILE_DEVICE T expm1 (T x)
 
template<typename T >
CK_TILE_HOST_DEVICE float convert_to_float (typename T::raw_type data, float scale=1.f)
 
template<typename T >
CK_TILE_HOST_DEVICE T::raw_type convert_to_type (float value, float scale=1.f)
 
constexpr CK_TILE_HOST_DEVICE uint8_t float_to_e2m1 (float x, float scale=1.f)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t float_to_pk_fp4 (const float &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t fp16_to_pk_fp4 (const fp16_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t bf16_to_pk_fp4 (const bf16_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t fp16x2_to_pk_fp4 (const fp16x2_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t bf16x2_to_pk_fp4 (const bf16x2_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE pk_fp4_t fp32x2_to_pk_fp4 (const fp32x2_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE fp32x2_t pk_fp4_to_fp32x2 (const pk_fp4_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE fp16x2_t pk_fp4_to_fp16x2 (const pk_fp4_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE bf16x2_t pk_fp4_to_bf16x2 (const pk_fp4_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE float pk_fp4_to_float (const pk_fp4_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE fp16_t pk_fp4_to_fp16 (const pk_fp4_t &x, float scale)
 
constexpr CK_TILE_HOST_DEVICE bf16_t pk_fp4_to_bf16 (const pk_fp4_t &x, float scale)
 
CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t (const pk_int4_t &x)
 
CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t_signed_conversion (const pk_int4_t &x)
 
CK_TILE_HOST_DEVICE fp16x2_t pk_int4_t_to_halfx2_t (const pk_int4_t &x)
 
CK_TILE_HOST_DEVICE bf16x2_t pk_int4_t_to_bfloat16x2_t (const pk_int4_t &x)
 
CK_TILE_HOST_DEVICE int8x2_t pk_int4_t_to_int8x2_t (const pk_int4_t &x)
 
template<typename Y , typename X , std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool > = false>
constexpr CK_TILE_HOST_DEVICE Y type_convert (X x)
 
template<typename Y , typename X >
constexpr CK_TILE_HOST_DEVICE Y scaled_type_convert (X x, float scale)
 
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType >
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size)
 
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType , typename X , typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size, X invalid_element_value)
 
template<address_space_enum BufferAddressSpace, typename T , typename BufferSizeType , bool InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum Coherence>
CK_TILE_HOST_DEVICE void print (const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > &bv)
 
template<typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile (const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
 
template<typename DistributedTensor_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile (DistributedTensor_ &dst_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
 
template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw (T &tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 Loads a tile of data using inline assembly. More...
 
template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw (T &tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 
template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto async_load_tile (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={})
 
template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto async_load_tile_raw (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 
CK_TILE_DEVICE auto async_load_fence (index_t cnt=0)
 
template<typename WindowLengths >
CK_TILE_DEVICE auto load_tile (const null_tile_window< WindowLengths > &)
 
template<typename T , typename WindowLengths >
CK_TILE_DEVICE auto load_tile_raw (T &, const null_tile_window< WindowLengths > &)
 
constexpr int DS_READ_TR_SIZE ()
 
template<typename InnerEncode , index_t kLeadIterPerWarp, index_t kSecondIterPerWarp, index_t kLeadNumWarps, index_t kSecondNumWarps>
constexpr CK_TILE_HOST_DEVICE auto InputTileDistributionEncoding ()
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>, typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_, typename BottomTensorView_::DataType, Policy>::distr_encoding_valid, Policy>>
CK_TILE_DEVICE auto load_tile_transpose (const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window)
 transpose loads tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. use SFINAE to ensure the tile distribution encoding is valid. More...
 
template<typename T >
constexpr CK_TILE_DEVICE auto is_null_tile_window (const T &)
 
template<typename WindowLengths >
constexpr CK_TILE_DEVICE auto make_null_tile_window (const WindowLengths &window_lengths)
 
template<typename WindowLengths , typename... Ts>
constexpr CK_TILE_DEVICE auto make_tile_window (null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...)
 
template<typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto make_tile_window (const null_tile_window< WindowLengths > &t, const StaticTileDistribution &)
 
template<typename WindowLengths >
CK_TILE_DEVICE void move_tile_window (null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &)
 
template<typename OutTensor , typename InTensor >
CK_TILE_DEVICE void shuffle_tile (OutTensor &out, const InTensor &in)
 
template<typename BottomTensorView_ , typename WindowLengths_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto get_slice_tile (const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
 
template<typename DataType_ , typename StaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto get_slice_tile (const static_distributed_tensor< DataType_, StaticTileDistribution_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
 
template<typename DstDataType_ , typename DstStaticTileDistribution_ , typename SrcDataType_ , typename SrcStaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto set_slice_tile (static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > &dst_tile, const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > &src_tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends)
 
template<typename DataType , typename StaticTileDistribution >
constexpr CK_TILE_HOST_DEVICE auto make_static_distributed_tensor (const StaticTileDistribution &)
 
template<typename DataType , typename StaticTileDistribution , typename ThreadBuffer >
constexpr CK_TILE_HOST_DEVICE auto make_static_distributed_tensor (const StaticTileDistribution &, ThreadBuffer &&thread_buffer_)
 
template<typename StaticTileDistribution , typename DistributedIndices >
constexpr CK_TILE_HOST_DEVICE auto get_x_indices_from_distributed_indices (StaticTileDistribution tile_distribution, DistributedIndices distributed_indices)
 
template<typename DataType , typename StaticTileDistribution , typename XIndicesPredicate >
CK_TILE_HOST_DEVICE void set_tile_if (static_distributed_tensor< DataType, StaticTileDistribution > &out_tensor, DataType value, XIndicesPredicate predicate)
 
template<typename YLengths , index_t XUnpacks>
constexpr CK_TILE_HOST_DEVICE auto get_y_unpacks_from_x_unpacks (YLengths, number< XUnpacks >)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void store_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void store_tile_raw (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ >
CK_TILE_DEVICE void store_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ >
CK_TILE_DEVICE void store_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ >
CK_TILE_DEVICE void store_tile (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ >
CK_TILE_DEVICE void store_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename TileDistributedSpan_ , typename F >
CK_TILE_DEVICE void sweep_tile_span (TileDistributedSpan_, const F &f)
 
template<typename TileDistributedSpan_ , typename F , typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void sweep_tile_uspan (TileDistributedSpan_, const F &f, Unpacks={})
 
template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
constexpr CK_TILE_HOST_DEVICE void sweep_tile (const F &f, UnpacksPerXDim={})
 
template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
constexpr CK_TILE_HOST_DEVICE void sweep_tile (const DistributedTensor &, const F &f, UnpacksPerXDim={})
 
template<typename T , typename F , typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE_EXTERN tile_sweeper (const T &, const F &, U={}) -> tile_sweeper< T, F, U >
 
template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
 
template<typename OldTensorAdaptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto transform_tensor_adaptor (const OldTensorAdaptor &old_tensor_adaptor, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss)
 
template<typename TensorAdaptor0 , typename TensorAdaptor1 >
constexpr CK_TILE_HOST_DEVICE auto chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
 
template<typename Adaptor , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE auto make_tensor_adaptor_coordinate (const Adaptor &adaptor, const TopIndex &idx_top)
 
template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex , typename BottomIndex >
constexpr CK_TILE_HOST_DEVICE void move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top, BottomIndex &idx_diff_bottom)
 
template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE void move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top)
 
template<typename Adaptor , typename AdaptorCoord >
constexpr CK_TILE_HOST_DEVICE bool adaptor_coordinate_is_valid_assuming_top_index_is_valid (const Adaptor &adaptor, const AdaptorCoord &coord)
 
template<typename Adaptor , typename AdpatorCoord >
constexpr CK_TILE_HOST_DEVICE bool adaptor_coordinate_is_valid (const Adaptor &adaptor, const AdpatorCoord &coord)
 
template<typename TensorDesc , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE auto make_tensor_coordinate (const TensorDesc &tensor_desc, const TopIndex &idx_top)
 
template<bool JudgeDoTransforms = true, typename TensorDesc , typename TensorCoord , typename Index >
constexpr CK_TILE_HOST_DEVICE void move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step)
 
template<typename TensorDesc , typename TensorCoord >
constexpr CK_TILE_HOST_DEVICE bool coordinate_has_valid_offset_assuming_top_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord)
 
template<typename TensorDesc , typename TensorCoord >
constexpr CK_TILE_HOST_DEVICE bool coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord)
 
template<typename Adaptor , typename ElementSpaceSize >
constexpr CK_TILE_HOST_DEVICE auto make_tensor_descriptor_from_adaptor (const Adaptor &adaptor, const ElementSpaceSize &element_space_size)
 
template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss)
 
template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength > = number< -1 >{}, number< GuaranteedLastDimensionVectorStride > = number< -1 >{})
 
template<typename... Lengths, typename... Strides, typename offset , index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor_with_offset (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, const offset &os, number< GuaranteedLastDimensionVectorLength > = number< -1 >{}, number< GuaranteedLastDimensionVectorStride > = number< -1 >{})
 
template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor_packed (const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength > = number< -1 >{})
 
template<typename... Lengths, typename... Strides, typename Offset , index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor_packed_with_offset (const tuple< Lengths... > &lengths, const Offset &offset, number< GuaranteedLastDimensionVectorLength > = number< -1 >{})
 
template<typename... Lengths, typename Align >
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_descriptor_aligned (const tuple< Lengths... > &lengths, Align align)
 
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto make_tensor_view (DataType *__restrict__ p, const tensor_descriptor< Ts... > &desc)
 
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_view (DataType *__restrict__ p, const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength > = number< -1 >{}, number< GuaranteedLastDimensionVectorStride > = number< -1 >{})
 
template<address_space_enum BufferAddressSpace = address_space_enum::generic, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
constexpr CK_TILE_HOST_DEVICE auto make_naive_tensor_view_packed (DataType *__restrict__ p, const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength > = number< -1 >{})
 
template<typename OldTensorView , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
constexpr CK_TILE_HOST_DEVICE auto transform_tensor_view (const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
 
template<typename TensorView , typename TileLengths , typename DoPads >
constexpr CK_TILE_HOST_DEVICE auto pad_tensor_view (const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads)
 
template<typename StaticTileDistributionEncoding_ >
constexpr CK_TILE_HOST_DEVICE auto make_static_tile_distribution (StaticTileDistributionEncoding_)
 
template<typename PsYs2XsAdaptor_ , typename Ys2DDescriptor_ , typename StaticTileDistributionEncoding_ , typename TileDistributionDetail_ >
CK_TILE_HOST_DEVICE void print (const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > &distribution)
 
template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ >
CK_TILE_HOST_DEVICE void print (const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail &detail_obj)
 
template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ >
CK_TILE_HOST_DEVICE void print (const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > &encoding)
 
template<typename InOutElementFunc , typename... InOutDstrTensors, typename = std::enable_if_t<std::conjunction_v< std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>>
CK_TILE_DEVICE void tile_elementwise_inout (const InOutElementFunc &inout_element_func, InOutDstrTensors &... inout_dstr_tensors)
 
template<typename InElementFunc , typename... InTensor, typename = std::enable_if_t< std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>>
CK_TILE_DEVICE auto tile_elementwise_in (const InElementFunc &in_element_func, const InTensor &... in_dstr_tensors)
 
template<typename InElementFunc , typename Tuple , size_t... I>
CK_TILE_DEVICE auto tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t, std::index_sequence< I... >)
 Template function that "unpacks" a tuple and applies an element-wise operation. More...
 
template<typename InElementFunc , typename Tuple >
CK_TILE_DEVICE auto tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t)
 Template function that "unpacks" a tuple and applies an element-wise operation. More...
 
template<typename DstrTensors , typename T >
CK_TILE_DEVICE void set_tile (DstrTensors &dstr_tensor, const T &value)
 
template<typename T >
CK_TILE_DEVICE void set_tile (null_tensor &, const T &)
 
template<typename DstrTensors , index_t v, bool skip_subdword_opt = false>
CK_TILE_DEVICE void set_tile (DstrTensors &dstr_tensor, number< v >, bool_constant< skip_subdword_opt >={})
 
template<index_t v>
CK_TILE_DEVICE void set_tile (null_tensor &, number< v >)
 
template<typename DstrTensors >
CK_TILE_DEVICE void clear_tile (DstrTensors &dstr_tensor)
 
template<typename DstType , typename SrcTensor >
CK_TILE_DEVICE auto cast_tile (const SrcTensor &src_tensor)
 
template<typename InOutElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE void tile_elementwise_inout (const InOutElementFunc &, MaybeNullTensor &&...)
 
template<typename InElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE auto tile_elementwise_in (const InElementFunc &, MaybeNullTensor &&...)
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, number< HsGatherDim >={}, number< NumCoord >={})
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={})
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , typename StaticValidArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, const StaticValidArray_ &valids, number< HsGatherDim >={}, number< NumCoord >={})
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={})
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1>
CK_TILE_DEVICE auto make_tile_window_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord>
CK_TILE_DEVICE void move_tile_window (tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &window, const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &step)
 
template<typename TensorView_ , typename WindowLengths_ >
constexpr CK_TILE_DEVICE auto make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin)
 
template<typename TensorView , typename WindowLengths >
constexpr CK_TILE_DEVICE auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin)
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution)
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution)
 
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto make_tile_window_raw (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution)
 
template<typename TensorView_ , typename WindowLengths_ >
CK_TILE_DEVICE void move_tile_window (tile_window_with_static_lengths< TensorView_, WindowLengths_ > &window, const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex &step)
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
constexpr CK_TILE_DEVICE auto make_tile_window_linear (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
 
template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
constexpr CK_TILE_DEVICE auto make_tile_window_linear (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE auto make_tile_window_linear_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
 
template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
constexpr CK_TILE_DEVICE auto make_tile_window_linear_raw (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={})
 
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ >
CK_TILE_DEVICE void move_tile_window (tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > &window, const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex &step)
 
template<typename TileWindow_ >
CK_TILE_DEVICE void move_tile_window (TileWindow_ &window, const typename TileWindow_::BottomTensorIndex &step)
 
template<typename LdsTileWindow_ >
CK_TILE_DEVICE auto get_async_store_smem_info (LdsTileWindow_ &&lds_tile)
 
template<typename OutTensor , typename InTensor >
CK_TILE_DEVICE void transpose_tile2d (OutTensor &out, const InTensor &in)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void update_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor)
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void update_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={})
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void update_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto update_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={})
 
template<typename Y , typename X >
constexpr CK_TILE_HOST_DEVICE Y bit_cast (const X &x)
 
template<auto... val>
constexpr void CK_PRINT ()
 
template<size_t... Idx>
constexpr std::tuple< std::integral_constant< size_t, Idx >... > makeTuple (std::index_sequence< Idx... >) noexcept
 
constexpr size_t constexpr_strlen (const char *c)
 
template<typename... Args>
void CK_TILE_ERROR (Args &&... args) noexcept
 
template<class EnvVar >
const std::string & EnvGetString (EnvVar)
 
template<class EnvVar >
bool EnvIsEnabled (EnvVar)
 
template<class EnvVar >
bool EnvIsDisabled (EnvVar)
 
template<class EnvVar >
uint64_t EnvValue (EnvVar)
 
template<class EnvVar >
bool EnvIsUnset (EnvVar)
 
template<class EnvVar >
void EnvUnset (EnvVar)
 
template<typename EnvVar , typename ValueType >
void UpdateEnvVar (EnvVar, const ValueType &val)
 Updates the cached value of an environment variable. More...
 
template<typename EnvVar >
void UpdateEnvVar (EnvVar, const std::string_view &val)
 
template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto unpack (F &&f, X &&x)
 
template<typename F , typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto unpack2 (F &&f, X &&x, Y &&y)
 
template<bool predicate, typename X , typename Y >
constexpr auto conditional_expr (X &&x, Y &&y)
 
template<typename T >
CK_TILE_HOST_DEVICE void print (const T &)
 
template<>
CK_TILE_HOST_DEVICE void print (const int &value)
 Specialization for int. More...
 
template<>
CK_TILE_HOST_DEVICE void print (const float &value)
 Specialization for float. More...
 
template<>
CK_TILE_HOST_DEVICE void print (const double &value)
 Specialization for double. More...
 
template<>
CK_TILE_HOST_DEVICE void print (const long &value)
 Specialization for long. More...
 
template<>
CK_TILE_HOST_DEVICE void print (const unsigned int &value)
 Specialization for unsigned int. More...
 
template<>
CK_TILE_HOST_DEVICE void print (const char &value)
 Specialization for char. More...
 
template<typename T , size_t N>
CK_TILE_HOST_DEVICE void print (const T(&value)[N])
 Specialization for array. More...
 
template<typename PY , typename PX , typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type = false>
CK_TILE_HOST_DEVICE PY c_style_pointer_cast (PX p_x)
 
template<typename... Ts>
__host__ __device__ composes (Ts &&...) -> composes< remove_cvref_t< Ts >... >
 FIXME: create macro to replace 'host device' and nothing more. More...
 
template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_relative_threshold (const int number_of_accumulations=1)
 Calculate relative error threshold for numerical comparisons. More...
 
template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_absolute_threshold (const double max_possible_num, const int number_of_accumulations=1)
 Calculate absolute error threshold for numerical comparisons. More...
 
template<typename T >
std::ostream & operator<< (std::ostream &os, const std::vector< T > &v)
 Stream operator overload for vector output. More...
 
template<typename Range , typename RefRange >
CK_TILE_HOST bool check_size_mismatch (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!")
 Check for size mismatch between output and reference ranges. More...
 
CK_TILE_HOST void report_error_stats (int err_count, double max_err, std::size_t total_size)
 Report error statistics for numerical comparisons. More...
 
template<typename Range , typename RefRange >
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_floating_point_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-5, double atol=3e-6, bool allow_infinity_ref=false)
 Check errors between floating point ranges using the specified tolerances. More...
 
template<typename Range , typename RefRange >
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf16_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between floating point ranges using the specified tolerances. More...
 
template<typename Range , typename RefRange >
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between half precision floating point ranges. More...
 
template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_integral_v< ranges::range_value_t< Range >> &&!std::is_same_v< ranges::range_value_t< Range >, bf16_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double=0, double atol=0)
 Check errors between integer ranges. More...
 
template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, fp8_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", unsigned max_rounding_point_distance=1, double atol=1e-1, bool allow_infinity_ref=false)
 Check errors between FP8 ranges. More...
 
template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, bf8_t >), bool > CK_TILE_HOST check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false)
 Check errors between BF8 ranges. More...
 
template<typename... Ts>
auto concat (const Ts &... xs) -> std::enable_if_t<!AllConvertibleToStringView< Ts... >, std::string >
 
template<std::size_t N>
constexpr std::size_t getSize (char(&)[N]) noexcept
 
template<std::size_t N>
constexpr std::size_t getSize (const char(&)[N]) noexcept
 
constexpr std::size_t getSize (const char *s) noexcept
 
constexpr std::size_t getSize (const char &) noexcept
 
std::size_t getSize (const std::string &s) noexcept
 
constexpr std::size_t getSize (const std::string_view &s) noexcept
 
template<typename... Ts>
auto concatInto (std::string &result, const Ts &... xs) -> std::enable_if_t< AllConvertibleToStringView< Ts... >, void >
 
template<typename Sep , typename First , typename... Rest>
auto concat (Sep sep, const First &first, const Rest &... rest) -> std::enable_if_t< AllConvertibleToStringView< First, Rest... >, std::string >
 
template<typename T >
__global__ void set_buffer_value (T *p, T x, uint64_t buffer_element_size)
 
constexpr unsigned int fnv1a_hash (std::string_view str, unsigned int h=2166136261u)
 
std::string get_device_name ()
 
bool is_gfx11_supported ()
 
bool is_gfx12_supported ()
 
bool is_load_tr_supported ()
 
CK_TILE_HOST void hip_check_error (hipError_t x)
 
template<typename Range >
CK_TILE_HOST std::ostream & LogRange (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
 
template<typename T , typename Range >
CK_TILE_HOST std::ostream & LogRangeAsType (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0)
 
template<typename F , typename T , std::size_t... Is>
CK_TILE_HOST auto call_f_unpack_args_impl (F f, T args, std::index_sequence< Is... >)
 
template<typename F , typename T >
CK_TILE_HOST auto call_f_unpack_args (F f, T args)
 
template<typename F , typename T , std::size_t... Is>
CK_TILE_HOST auto construct_f_unpack_args_impl (T args, std::index_sequence< Is... >)
 
template<typename F , typename T >
CK_TILE_HOST auto construct_f_unpack_args (F, T args)
 
template<typename New2Old >
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old (const HostTensorDescriptor &a, const New2Old &new2old)
 
template<typename F , typename... Xs>
CK_TILE_HOST auto make_ParallelTensorFunctor (F f, Xs... xs)
 
template<bool is_row_major>
auto host_tensor_descriptor (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
 Creates a host tensor descriptor with specified dimensions and layout. More...
 
template<bool is_row_major>
auto get_default_stride (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >)
 
template<int MinBlockPerCu, typename Kernel , typename... Args>
__global__ void kentry (Args... args)
 
template<int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename KernelImpl , typename... Args>
CK_TILE_HOST auto make_kernel (KernelImpl, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
 
template<typename... Callables>
CK_TILE_HOST void launch_and_check (const stream_config &sc, Callables &&... callables)
 
template<typename TimerType , typename PreprocessFunc >
CK_TILE_HOST double preprocess_profiling_impl (TimerType timer, const stream_config &s, PreprocessFunc preprocess)
 
template<typename TimerType , typename CallablesFunc , typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl (TimerType timer, const stream_config &s, CallablesFunc &&callables_func, PreprocessFunc preprocess=nullptr)
 
template<typename... Callables>
CK_TILE_HOST float launch_kernel (const stream_config &s, Callables &&... callables)
 
template<typename PreprocessFunc , typename... Callables>
CK_TILE_HOST float launch_kernel_time_mask (const stream_config &s, PreprocessFunc preprocess, Callables &&... callables)
 
template<typename DataType , typename RandValOutputDataType >
CK_TILE_HOST void reference_batched_dropout (HostTensor< DataType > &in_out_b_m_n, const HostTensor< RandValOutputDataType > &randval_b_m_n, const uint8_t &p_undrop_in_uint8_t, const float scale)
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void reference_batched_elementwise (const HostTensor< ADataType > &a_b_m_n, const HostTensor< BDataType > &b_b_m_n, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const BinaryElementOp &binary_element_op={})
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_gemm (const HostTensor< ADataType > &a_b_m_k, const HostTensor< BDataType > &b_b_n_k, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
 
template<typename CDataType , typename MaskingType >
CK_TILE_HOST void reference_batched_masking (HostTensor< CDataType > &c_b_m_n, const MaskingType &mask)
 
template<typename DataType , typename ComputeDataType = float>
CK_TILE_HOST void reference_batched_rotary_position_embedding (const HostTensor< DataType > &input_bsd, const HostTensor< DataType > &cos_sd, const HostTensor< DataType > &sin_sd, bool interleaved, HostTensor< DataType > &output_bsd, bool use_1_row_sin_cos=false)
 
template<typename ADataType , typename CompDataType , typename BDataType , typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax (const HostTensor< ADataType > &a_b_m_n, HostTensor< BDataType > &b_b_m_n, const CompElementOp &comp_element_op={}, std::optional< std::reference_wrapper< HostTensor< CompDataType >>> lse_b_m=std::nullopt)
 
template<typename Type >
CK_TILE_HOST void reference_batched_transpose (const HostTensor< Type > &x, HostTensor< Type > &y, std::string layout_in="NCHW", std::string layout_out="NHWC")
 
template<typename ADataType , typename BDataType , typename ComputeDataType , typename ElementOp >
CK_TILE_HOST void reference_unary_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b, ElementOp element_op)
 
template<typename ADataType , typename BDataType , typename CDataType , typename ComputeDataType , typename ElementOp >
CK_TILE_HOST void reference_binary_elementwise (const HostTensor< ADataType > &a, const HostTensor< BDataType > &b, HostTensor< CDataType > &c, ElementOp element_op)
 
template<typename AccDataType , typename Activation , typename ADataType , typename GDataType , typename DDataType , typename ODataType , typename AScaleDataType , typename GScaleDataType , typename DScaleDataType , typename YSmoothScaleDataType , typename TopkWeightDataType , typename IndexDataType >
void reference_fused_moe (const ck_tile::HostTensor< ADataType > &a_host, const ck_tile::HostTensor< GDataType > &g_host, const ck_tile::HostTensor< DDataType > &d_host, const ck_tile::HostTensor< AScaleDataType > &sa_host, const ck_tile::HostTensor< GScaleDataType > &sg_host, const ck_tile::HostTensor< DScaleDataType > &sd_host, const ck_tile::HostTensor< YSmoothScaleDataType > &sy_host, ck_tile::HostTensor< ODataType > &o_host, const ck_tile::HostTensor< IndexDataType > &sorted_token_ids_host, const ck_tile::HostTensor< TopkWeightDataType > &sorted_weight_host, const ck_tile::HostTensor< IndexDataType > &sorted_expert_ids_host, const ck_tile::HostTensor< IndexDataType > &num_sorted_tiles_host, const ck_tile::HostTensor< IndexDataType > &token_ids_host, ck_tile::index_t block_m, ck_tile::index_t tokens, ck_tile::index_t experts, ck_tile::index_t hidden_size, ck_tile::index_t intermediate_size, ck_tile::index_t topk, ck_tile::index_t gate_only)
 
template<typename ADataType , typename QDataType , typename BDataType , typename AccDataType , typename CDataType , uint32_t QuantGroupSize, bool aquant, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_quant (const HostTensor< ADataType > &a_m_k, const HostTensor< QDataType > &q, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={})
 
template<typename ADataType , typename BDataType , typename DsDataType , typename AccDataType , typename CDataType , typename ACCElementOp , typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void reference_gemm_multiple_d (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, const std::array< HostTensor< DDataType >, DsDataType::size()> &ds_m_n, HostTensor< CDataType > &c_m_n, const ACCElementOp &acc_element_op={})
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
__global__ void naive_gemm_kernel (ADataType *A, BDataType *B, CDataType *C, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC)
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
void reference_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c)
 
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
void reference_batched_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t batch_stride_A, index_t batch_stride_B, index_t batch_stride_C, index_t batch_count)
 
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void reference_grouped_conv_bwd_data (HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >)
 
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void reference_grouped_conv_bwd_weight (const HostTensor< InDataType > &input, HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >)
 
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void reference_grouped_conv_fwd (const HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >)
 
template<typename InDataType , typename OutDataType , index_t NDimSpatial>
CK_TILE_HOST void reference_im2col (const HostTensor< InDataType > &in_host, HostTensor< OutDataType > &out_host, const ck_tile::conv::ConvParam &conv_params)
 
template<typename XDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename YDataType , typename MeanDataType , typename InvStdDataType , typename Epilogue = reference_layernorm2d_default_epilogue>
void reference_layernorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, const HostTensor< BetaDataType > &beta_n, HostTensor< YDataType > &y_m_n, HostTensor< MeanDataType > &mean_m, HostTensor< InvStdDataType > &invStd_m, ComputeDataType epsilon, Epilogue epilogue_functor={})
 
template<typename WeightType , typename IndexType = index_t>
CK_TILE_HOST void reference_moe_sorting (const HostTensor< IndexType > &topk_ids, const HostTensor< WeightType > &weights, const HostTensor< IndexType > &local_expert_mask, HostTensor< IndexType > &p_sorted_token_ids, HostTensor< WeightType > &sorted_weight, HostTensor< IndexType > &sorted_expert_ids, index_t &unit_cnt, const index_t experts, const index_t unit_size, const index_t tokens, bool local_expert_masking, bool skip_experts_with_zero_token=true)
 
template<typename DataType >
CK_TILE_HOST void reference_permute (const HostTensor< DataType > &x, HostTensor< DataType > &y, std::vector< index_t > perm)
 
template<typename DataType >
CK_TILE_HOST auto reference_permute (const HostTensor< DataType > &x, std::vector< index_t > perm)
 
template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp >
CK_TILE_HOST void reference_reduce (const HostTensor< XDataType > &x_m_n, HostTensor< YDataType > &y_m, ReduceOp reduce_op)
 
template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp , typename KeptDim , typename ReduceDims >
CK_TILE_HOST void reference_reduce (const HostTensor< XDataType > &x_tensor, HostTensor< YDataType > &y_tensor, ReduceOp reduce_op, KeptDim kept_dim, ReduceDims reduce_dims)
 
template<typename XDataType , typename GammaDataType , typename ComputeDataType , typename YDataType , typename InvRmsDataType , typename UnquantYDataType , typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, HostTensor< YDataType > &y_m_n, HostTensor< InvRmsDataType > &invRms_m, HostTensor< UnquantYDataType > &unquant_y_m_n, ComputeDataType epsilon, Epilogue epilogue_functor={})
 
template<typename XDataType , typename ScaleDataType , typename QXDataType >
CK_TILE_HOST void reference_rowwise_quantization2d (const HostTensor< XDataType > &x_m_n, const HostTensor< ScaleDataType > &scale_m, HostTensor< QXDataType > &qx_m_n)
 
template<typename InputType , typename ComputeType , typename OutputType = ComputeType>
CK_TILE_HOST void reference_softmax (const HostTensor< InputType > &x, HostTensor< OutputType > &y, index_t dim=-1)
 
template<typename InputType , typename ComputeType , typename OutputType = ComputeType>
CK_TILE_HOST auto reference_softmax (const HostTensor< InputType > &x, index_t dim=-1)
 
template<typename DataType , typename IndexType = index_t>
CK_TILE_HOST void reference_topk (const HostTensor< DataType > &x, HostTensor< DataType > &y_values, HostTensor< IndexType > &y_indices, index_t k, index_t dim=-1, bool largest=true, bool sorted=true)
 
template<typename DataType , typename IndexType = index_t>
CK_TILE_HOST auto reference_topk (const HostTensor< DataType > &x, index_t k, index_t dim=-1, bool largest=true, bool sorted=true)
 
template<typename ADataType , typename BDataType >
void reference_transpose_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b)
 
void flush_icache ()
 
template<typename ADataType_ , typename BDataType_ >
std::string gemm_prec_str ()
 
constexpr CK_TILE_HOST_DEVICE auto make_generic_attention_mask_coordinates_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true)
 
template<typename MaskType >
constexpr CK_TILE_HOST_DEVICE auto make_generic_attention_mask_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true)
 
template<typename DataType , bool RowMajor = true, unsigned LogMaxSadOprndSize = 16>
CK_TILE_HOST_DEVICE auto make_alibi_from_lr_mask (DataType slope, index_t window_left_size, index_t window_right_size, index_t y_total, index_t x_total, GenericAttentionMaskEnum mask_enum)
 
template<typename DataType >
CK_TILE_HOST std::vector< DataType > get_alibi_slopes (ck_tile::index_t nheads)
 
template<typename TensorView >
CK_TILE_HOST_DEVICE auto make_page_block_navigator (const TensorView &tensor_view)
 
template<typename DataType , index_t VirtualDim, typename TensorView >
CK_TILE_HOST_DEVICE auto make_page_block_navigator (copy_const_t< DataType, void > *physical_blocks, long_index_t block_stride, long_index_t fixed_offset, const int32_t *physical_block_indices, index_t num_blocks, index_t page_block_size, const TensorView &complete_view, const TensorView &last_view)
 
constexpr CK_TILE_HOST auto moe_sorting_get_smem_row_col (int tokens_, int num_experts_)
 
CK_TILE_HOST index_t moe_sorting_get_sub_token (int tokens_, int num_experts_)
 
CK_TILE_HOST bool moe_sorting_is_oneshot (int tokens_, int num_experts_)
 
CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size (int tokens_, int num_experts_, int topk_)
 
CK_TILE_HOST index_t moe_sorting_get_workspace_size (int tokens_, int num_experts_, int topk_, int dispatch_policy_)
 
template<typename ADataType , typename BDataType , typename AccDataType , index_t M_Warp_Tile, index_t N_Warp_Tile, index_t K_Warp_Tile>
CK_TILE_HOST bool check_wmma_supported ()
 
CK_TILE_HOST std::string getConvSpecializationString (const ConvolutionSpecialization &s)
 
template<typename BlockShape >
constexpr CK_TILE_DEVICE index_t block_tile_welford_calculate_max_count (int row_size)
 
template<typename VarDistributedTensor_ , bool FastFdiv_ = false>
constexpr CK_TILE_DEVICE void block_tile_welford_post_scale_var (VarDistributedTensor_ &var_tensor, int count, bool_constant< FastFdiv_ >={})
 
template<typename T , bool kFastFDiv = false>
CK_TILE_DEVICE void welford_update (T &mean, T &var, T x, int count, bool_constant< kFastFDiv >={})
 
template<typename AccDistributedTensor_ , typename ReduceFunc , bool WithBroadcast = true, bool CrossWarp = true>
CK_TILE_DEVICE void block_tile_reduce_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func, bool_constant< WithBroadcast >={}, bool_constant< CrossWarp >={})
 
template<typename AccDistributedTensor_ , typename ReduceFunc >
CK_TILE_DEVICE void block_tile_reduce_xor_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func)
 
template<typename AccDistributedTensor_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc >
CK_TILE_DEVICE void block_tile_reduce (AccDistributedTensor_ &acc_tensor, const InDistributedTensor_ &in_tensor, sequence< InReduceDims... >, const ReduceFunc &reduce_func)
 
template<typename AccDataType_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc , typename InDataType_ >
CK_TILE_DEVICE auto block_tile_reduce (const InDistributedTensor_ &in_tensor, sequence< InReduceDims... > in_reduce_dims, const ReduceFunc &reduce_func, const InDataType_ &reduce_init)
 
template<typename T >
CK_TILE_HOST_DEVICE_EXTERN BlockReduce2D (const T &, const typename T::DataType &) -> BlockReduce2D< T >
 
CK_TILE_HOST float naive_attention_fwd (naive_attention_fwd_traits t, naive_attention_fwd_args a, ck_tile::stream_config s)
 

Variables

template<typename T >
constexpr bool is_constant_v = is_constant<T>::value
 
 Right
 
template<typename T = double>
constexpr T log2e_v = log2e<T>::value
 
template<typename T = double>
constexpr T log2e_rcp_v = 1. / log2e<T>::value
 
template<typename T >
constexpr bool is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value
 
template<typename T >
constexpr bool is_tile_window_with_static_distribution_v
 Helper variable template to check if a type is a tile window with static distribution. More...
 
template<typename T >
constexpr bool is_tile_window_with_static_lengths_v
 Helper variable template to check if a type is a tile window with static lengths. More...
 
template<typename T >
constexpr bool is_tile_window_linear_v = is_tile_window_linear<T>::value
 Helper variable template to check if a type is a linear tile window. More...
 
constexpr detail::ignore_t ignore
 
template<typename T >
constexpr bool is_static_v = is_static<T>::value
 
constexpr int ERROR_DETAIL_LIMIT = 5
 Maximum number of error values to display when checking errors. More...
 
template<typename... Ts>
constexpr bool AllConvertibleToStringView
 
constexpr uint32_t CUSTOM_MASK = 1U
 
constexpr uint32_t SLIDING_WINDOW = 2U
 
constexpr uint32_t LOGITS_SOFT_CAP = 4U
 
constexpr uint32_t ALIBI = 8U
 
template<typename Arch , typename AType , typename BType , typename CType , index_t warp_m, index_t warp_n, index_t warp_k>
constexpr bool has_wmma_traits_v
 

Typedef Documentation

◆ BF16

using ck_tile::BF16 = typedef ck_tile::bf16_t

16-bit brain floating point type

◆ bf16_raw_t

using ck_tile::bf16_raw_t = typedef uint16_t

◆ bf16_t

using ck_tile::bf16_t = typedef bfloat16_t

◆ bf16x16_t

using ck_tile::bf16x16_t = typedef bfloat16_t

◆ bf16x2_t

◆ bf16x32_t

using ck_tile::bf16x32_t = typedef bfloat16_t

◆ bf16x4_t

using ck_tile::bf16x4_t = typedef bfloat16_t

◆ bf16x64_t

using ck_tile::bf16x64_t = typedef bfloat16_t

◆ bf16x8_t

using ck_tile::bf16x8_t = typedef bfloat16_t

◆ BF8

using ck_tile::BF8 = typedef ck_tile::bf8_t

8-bit brain floating point type

◆ bf8_raw_t

using ck_tile::bf8_raw_t = typedef uint8_t

◆ bf8_t

using ck_tile::bf8_t = typedef unsigned _BitInt(8)

◆ bf8x16_t

using ck_tile::bf8x16_t = typedef bf8_t

◆ bf8x2_t

using ck_tile::bf8x2_t = typedef bf8_t

◆ bf8x32_t

using ck_tile::bf8x32_t = typedef bf8_t

◆ bf8x4_t

using ck_tile::bf8x4_t = typedef bf8_t

◆ bf8x64_t

using ck_tile::bf8x64_t = typedef bf8_t

◆ bf8x8_t

using ck_tile::bf8x8_t = typedef bf8_t

◆ bfloat16_t

using ck_tile::bfloat16_t = typedef ushort

◆ BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy

◆ BlockFmhaPipelineQRKSVSAsyncDefaultPolicy

◆ BlockFmhaPipelineQRKSVSDefaultPolicy

◆ bool_constant

template<bool b>
using ck_tile::bool_constant = typedef constant<b>

◆ copy_const_t

template<typename From , typename To >
using ck_tile::copy_const_t = typedef typename copy_const<From, To>::type

◆ Default2DAndDynamicQuantEpilogueTraits

template<bool kPadM_, bool kPadN_, bool UseSmoothInputScale_, bool UseRawStore_ = true, bool UseMax3_ = false>
using ck_tile::Default2DAndDynamicQuantEpilogueTraits = typedef DynamicQuantEpilogueTraits<kPadM_, kPadN_, UseSmoothInputScale_, UseRawStore_, UseMax3_>

◆ default_linear_bottom_dims

template<typename TensorView_ >
using ck_tile::default_linear_bottom_dims = typedef typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(), TensorView_::get_num_of_dimension()>::type

◆ DeviceIp

using ck_tile::DeviceIp = typedef remove_cvref_t<decltype(ck_tile::get_device_arch())>

◆ e8m0_raw_t

using ck_tile::e8m0_raw_t = typedef typename e8m0_t::raw_type

◆ e8m0_t

using ck_tile::e8m0_t = typedef e8m0_bexp_t

◆ ext_vector_t

template<typename T , index_t N>
using ck_tile::ext_vector_t = typedef typename impl::ext_vector<T, N>::type

◆ F16

using ck_tile::F16 = typedef ck_tile::half_t

16-bit floating point (half precision) type

◆ F32

using ck_tile::F32 = typedef float

32-bit floating point (single precision) type

◆ F8

using ck_tile::F8 = typedef ck_tile::fp8_t

8-bit floating point type

◆ fp16_hip_t

using ck_tile::fp16_hip_t = typedef _Float16

◆ fp16_raw_t

typedef ushort ck_tile::fp16_raw_t

◆ fp16_t

using ck_tile::fp16_t = typedef _Float16

◆ fp16x16_t

using ck_tile::fp16x16_t = typedef _Float16

◆ fp16x2_t

typedef _Float16 ck_tile::fp16x2_t

◆ fp16x32_t

using ck_tile::fp16x32_t = typedef _Float16

◆ fp16x4_t

using ck_tile::fp16x4_t = typedef _Float16

◆ fp16x64_t

using ck_tile::fp16x64_t = typedef _Float16

◆ fp16x8_t

using ck_tile::fp16x8_t = typedef _Float16

◆ fp32_t

typedef float ck_tile::fp32_t

◆ fp32x16_t

using ck_tile::fp32x16_t = typedef float

◆ fp32x2_t

typedef float ck_tile::fp32x2_t

◆ fp32x32_t

using ck_tile::fp32x32_t = typedef float

◆ fp32x4_t

using ck_tile::fp32x4_t = typedef float

◆ fp32x64_t

using ck_tile::fp32x64_t = typedef float

◆ fp32x8_t

using ck_tile::fp32x8_t = typedef float

◆ fp64_t

using ck_tile::fp64_t = typedef double

◆ fp64x2_t

using ck_tile::fp64x2_t = typedef double

◆ fp64x4_t

using ck_tile::fp64x4_t = typedef double

◆ fp8_raw_t

using ck_tile::fp8_raw_t = typedef uint8_t

◆ fp8_t

using ck_tile::fp8_t = typedef _BitInt(8)

◆ fp8x16_t

using ck_tile::fp8x16_t = typedef fp8_t

◆ fp8x2_t

using ck_tile::fp8x2_t = typedef fp8_t

◆ fp8x32_t

using ck_tile::fp8x32_t = typedef fp8_t

◆ fp8x4_t

using ck_tile::fp8x4_t = typedef fp8_t

◆ fp8x64_t

using ck_tile::fp8x64_t = typedef fp8_t

◆ fp8x8_t

using ck_tile::fp8x8_t = typedef fp8_t

◆ GemmAQuantPipelineProblem

template<typename ADataType_ , typename AQDataType_ , typename BDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , uint32_t QuantGroupSize_, bool TransposeC_, typename ComputeDataType_ = BDataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using ck_tile::GemmAQuantPipelineProblem = typedef GemmAQuantPipelineProblemBase<ADataType_, AQDataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, TransposeC_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_>

◆ GemmBQuantPipelineProblem

template<typename ADataType_ , typename BDataType_ , typename BQDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , uint32_t QuantGroupSize_, typename ComputeDataType_ = ADataType_, GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave, bool HasHotLoop_ = true, TailNumber TailNum_ = TailNumber::Full>
using ck_tile::GemmBQuantPipelineProblem = typedef GemmBQuantPipelineProblemBase<ADataType_, BDataType_, BQDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_>

◆ GemmPipelineAGmemBGmemCRegV2DefaultPolicy

◆ GemmPipelineProblem

template<typename ADataType_ , typename BDataType_ , typename CDataType_ , typename BlockGemmShape_ , typename Traits_ , typename ComputeDataType_ = ADataType_, bool FixedVectorSize_ = false, index_t VectorSizeA_ = 1, index_t VectorSizeB_ = 1>
using ck_tile::GemmPipelineProblem = typedef GemmPipelineProblemBase<ADataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, ComputeDataType_, FixedVectorSize_, VectorSizeA_, VectorSizeB_>

◆ GroupedConvBwdDataHostArgs

using ck_tile::GroupedConvBwdDataHostArgs = typedef GroupedConvHostArgs<void*, const void*, const void*>

◆ GroupedConvBwdWeightHostArgs

using ck_tile::GroupedConvBwdWeightHostArgs = typedef GroupedConvHostArgs<const void*, void*, const void*>

◆ GroupedConvFwdHostArgs

using ck_tile::GroupedConvFwdHostArgs = typedef GroupedConvHostArgs<const void*, const void*, void*>

◆ half_t

using ck_tile::half_t = typedef _Float16

◆ has_same_scalar_type

template<typename X , typename Y >
using ck_tile::has_same_scalar_type = typedef std::is_same<typename vector_traits<remove_cvref_t<X> >::scalar_type, typename vector_traits<remove_cvref_t<Y> >::scalar_type>

◆ I32

using ck_tile::I32 = typedef int32_t

32-bit signed integer type

◆ I8

using ck_tile::I8 = typedef int8_t

8-bit signed integer type

◆ index_t

using ck_tile::index_t = typedef int32_t

◆ InputTileDistributionTraits

template<typename TileDistributionEncoding_ , typename DataType_ , typename Policy = DefaultTranspose<DataType_>>
using ck_tile::InputTileDistributionTraits = typedef TransposeTileDistributionTraits<TileDistributionEncoding_, DataType_, Policy, true>

◆ int16x16_t

using ck_tile::int16x16_t = typedef int16_t

◆ int16x2_t

using ck_tile::int16x2_t = typedef int16_t

◆ int16x32_t

using ck_tile::int16x32_t = typedef int16_t

◆ int16x4_t

using ck_tile::int16x4_t = typedef int16_t

◆ int16x64_t

using ck_tile::int16x64_t = typedef int16_t

◆ int16x8_t

using ck_tile::int16x8_t = typedef int16_t

◆ int32_t

◆ int32x16_t

using ck_tile::int32x16_t = typedef int32_t

◆ int32x2_t

using ck_tile::int32x2_t = typedef int32_t

◆ int32x32_t

using ck_tile::int32x32_t = typedef int32_t

◆ int32x4_t

using ck_tile::int32x4_t = typedef int32_t

◆ int32x64_t

using ck_tile::int32x64_t = typedef int32_t

◆ int32x8_t

using ck_tile::int32x8_t = typedef int32_t

◆ int8_t

◆ int8x16_t

using ck_tile::int8x16_t = typedef int8_t

◆ int8x2_t

◆ int8x32_t

using ck_tile::int8x32_t = typedef int8_t

◆ int8x4_t

using ck_tile::int8x4_t = typedef int8_t

◆ int8x64_t

using ck_tile::int8x64_t = typedef int8_t

◆ int8x8_t

using ck_tile::int8x8_t = typedef int8_t

◆ is_detected

template<template< class... > class Op, class... Args>
using ck_tile::is_detected = typedef typename detail::detector<nonesuch, void, Op, Args...>::value_t

◆ is_known_at_compile_time

template<typename T >
using ck_tile::is_known_at_compile_time = typedef is_static<T>

◆ is_static

template<typename T >
using ck_tile::is_static = typedef impl::is_static_impl<remove_cvref_t<T> >

◆ is_tuple

template<typename T >
using ck_tile::is_tuple = typedef decltype(std::declval<T&>().IsTuple())

◆ iter_difference_t

template<typename T >
using ck_tile::iter_difference_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::difference_type

◆ iter_reference_t

template<typename T >
using ck_tile::iter_reference_t = typedef decltype(*std::declval<T&>())

◆ iter_value_t

template<typename T >
using ck_tile::iter_value_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::value_type

◆ long_index_t

using ck_tile::long_index_t = typedef int64_t

◆ long_number

template<long_index_t v>
using ck_tile::long_number = typedef constant<v>

◆ magic_division

◆ make_index_sequence

template<index_t N>
using ck_tile::make_index_sequence = typedef typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type

◆ multi_index

template<index_t N>
using ck_tile::multi_index = typedef array<index_t, N>

◆ number

template<index_t v>
using ck_tile::number = typedef constant<v>

◆ OutputTileDistributionTraits

template<typename TileDistributionEncoding_ , typename DataType_ , typename Policy = DefaultTranspose<DataType_>>
using ck_tile::OutputTileDistributionTraits = typedef TransposeTileDistributionTraits<TileDistributionEncoding_, DataType_, Policy, false>

◆ PersistentTileGemmUniversalTraits

template<bool kPadM_, bool kPadN_, bool kPadK_, bool DoubleSmemBuffer_, typename ALayout_ , typename BLayout_ , typename CLayout_ , bool TransposeC_ = false, bool UseStructuredSparsity_ = false>
using ck_tile::PersistentTileGemmUniversalTraits = typedef TileGemmUniversalTraits<kPadM_, kPadN_, kPadK_, DoubleSmemBuffer_, ALayout_, BLayout_, CLayout_, TransposeC_, UseStructuredSparsity_, true>

◆ pk_fp4_raw_t

using ck_tile::pk_fp4_raw_t = typedef typename pk_fp4_t::raw_type

◆ pk_fp4_t

◆ pk_int4x16_t

using ck_tile::pk_int4x16_t = typedef int8_t

◆ pk_int4x2_t

using ck_tile::pk_int4x2_t = typedef int8_t

◆ pk_int4x32_t

using ck_tile::pk_int4x32_t = typedef int8_t

◆ pk_int4x4_t

using ck_tile::pk_int4x4_t = typedef int8_t

◆ pk_int4x8_t

using ck_tile::pk_int4x8_t = typedef int8_t

◆ remove_cv_t

template<typename T >
using ck_tile::remove_cv_t = typedef typename std::remove_cv<T>::type

◆ remove_cvref_t

template<typename T >
using ck_tile::remove_cvref_t = typedef remove_cv_t<std::remove_reference_t<T> >

◆ remove_pointer_t

template<typename T >
using ck_tile::remove_pointer_t = typedef typename std::remove_pointer<T>::type

◆ remove_reference_t

template<typename T >
using ck_tile::remove_reference_t = typedef typename std::remove_reference<T>::type

◆ sequence_merge_t

template<typename... Seqs>
using ck_tile::sequence_merge_t = typedef typename sequence_merge<Seqs...>::type

◆ statically_indexed_array

template<typename T , index_t N>
using ck_tile::statically_indexed_array = typedef tuple_array<T, N>

◆ thread_buffer

template<typename T , index_t N>
using ck_tile::thread_buffer = typedef tuple_array<T, N>

◆ tile_distribution_encoding_shuffle_t

template<typename encoding , typename shuffle >
using ck_tile::tile_distribution_encoding_shuffle_t = typedef typename tile_distribution_encoding_shuffle<encoding, shuffle>::type

◆ tuple_array

template<typename T , index_t N>
using ck_tile::tuple_array = typedef typename impl::tuple_array_impl<T, N>::type

◆ tuple_element_or_default_t

template<typename Tuple_ , std::size_t Idx, typename DefaultType >
using ck_tile::tuple_element_or_default_t = typedef typename tuple_element_or_default<Tuple_, Idx, DefaultType>::type

◆ uint16x16_t

using ck_tile::uint16x16_t = typedef uint16_t

◆ uint16x2_t

using ck_tile::uint16x2_t = typedef uint16_t

◆ uint16x32_t

using ck_tile::uint16x32_t = typedef uint16_t

◆ uint16x4_t

using ck_tile::uint16x4_t = typedef uint16_t

◆ uint16x64_t

using ck_tile::uint16x64_t = typedef uint16_t

◆ uint16x8_t

using ck_tile::uint16x8_t = typedef uint16_t

◆ uint32x16_t

using ck_tile::uint32x16_t = typedef uint32_t

◆ uint32x2_t

using ck_tile::uint32x2_t = typedef uint32_t

◆ uint32x32_t

using ck_tile::uint32x32_t = typedef uint32_t

◆ uint32x4_t

using ck_tile::uint32x4_t = typedef uint32_t

◆ uint32x64_t

using ck_tile::uint32x64_t = typedef uint32_t

◆ uint32x8_t

using ck_tile::uint32x8_t = typedef uint32_t

◆ uint8x16_t

using ck_tile::uint8x16_t = typedef uint8_t

◆ uint8x2_t

using ck_tile::uint8x2_t = typedef uint8_t

◆ uint8x32_t

using ck_tile::uint8x32_t = typedef uint8_t

◆ uint8x4_t

using ck_tile::uint8x4_t = typedef uint8_t

◆ uint8x64_t

using ck_tile::uint8x64_t = typedef uint8_t

◆ uint8x8_t

using ck_tile::uint8x8_t = typedef uint8_t

◆ uniform_sequence_gen_t

template<index_t NSize, index_t I>
using ck_tile::uniform_sequence_gen_t = typedef typename uniform_sequence_gen<NSize, I>::type

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<bf8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<fp8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, bf8_t, Ctrl_>

◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8

template<WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, fp8_t, Ctrl_>

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8

◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8

◆ WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8

◆ WarpGemmDispatcher

template<typename AType , typename BType , typename AccType , index_t MPerWave, index_t NPerWave, index_t KPerWave, bool TransposeC, bool SwizzleA = false, bool UseStructuredSparsity = false, WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmDispatcher = typedef typename impl::WarpGemmDispatcher<AType, BType, AccType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity, AttrNumAccess>::Type

◆ WarpGemmMfma_f32_16x16x128_bf8_bf8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_16x16x128_bf8_fp8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_16x16x128_fp8_bf8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_16x16x128_fp8_fp8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_16x16x32_bf8_bf8

◆ WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed

◆ WarpGemmMfma_f32_16x16x32_fp8_fp8

◆ WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed

◆ WarpGemmMfma_f32_16x16x64_bf8_bf8

◆ WarpGemmMfma_f32_16x16x64_fp8_fp8

◆ WarpGemmMfma_f32_32x32x16_bf8_bf8

◆ WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed

◆ WarpGemmMfma_f32_32x32x16_bf8_fp8

◆ WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed

◆ WarpGemmMfma_f32_32x32x16_fp8_bf8

◆ WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed

◆ WarpGemmMfma_f32_32x32x16_fp8_fp8

◆ WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed

◆ WarpGemmMfma_f32_32x32x32_bf8_bf8

◆ WarpGemmMfma_f32_32x32x32_fp8_fp8

◆ WarpGemmMfma_f32_32x32x64_bf8_bf8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_32x32x64_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_32x32x64_bf8_fp8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_32x32x64_bf8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_32x32x64_fp8_bf8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_32x32x64_fp8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_f32_32x32x64_fp8_fp8

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfma_f32_32x32x64_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> >

◆ WarpGemmMfma_i32_16x16x32_i8_i8

◆ WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed

◆ WarpGemmMfma_i32_32x32x16_i8_i8

◆ WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed

◆ WarpGemmMfmaBf16Bf16F32M16N16K16

◆ WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M16N16K32

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaBf16Bf16F32M16N16K32 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> >

◆ WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K16

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> >

◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA

◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K8

◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA

◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution

◆ WarpGemmMfmaBf16Bf16F32M4N64K16

◆ WarpGemmMfmaBf16Bf16F32M64N4K16

◆ WarpGemmMfmaF16F16F32M16N16K16

◆ WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M16N16K32

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaF16F16F32M16N16K32 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> >

◆ WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K16

template<WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
using ck_tile::WarpGemmMfmaF16F16F32M32N32K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> >

◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleA

◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K8

◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleA

◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution

◆ WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution

◆ WarpGemmMfmaF16F16F32M4N64K16

◆ WarpGemmMfmaF16F16F32M64N4K16

◆ WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution

◆ WarpGemmSmfmacF16F16F32M16N16K32

◆ WarpGemmSmfmacF16F16F32M32N32K16

◆ WarpGemmWmma_f32_16x16x16_bf16_bf16

◆ WarpGemmWmma_f32_16x16x16_bf8_bf8

◆ WarpGemmWmma_f32_16x16x16_bf8_f8

◆ WarpGemmWmma_f32_16x16x16_f16_f16

◆ WarpGemmWmma_f32_16x16x16_f8_bf8

◆ WarpGemmWmma_f32_16x16x16_f8_f8

◆ WarpGemmWmma_i32_16x16x16_i8_i8

Enumeration Type Documentation

◆ AlibiMode

enum ck_tile::AlibiMode
strong
Enumerator
VERTICAL 
FROM_TOP_LEFT 
FROM_BOTTOM_RIGHT 

◆ amd_buffer_coherence_enum

Enumerator
coherence_default 
glc 
slc 
glc_slc 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ bf16_rounding_mode

Enumerator
standard 
truncate_with_nan 
truncate 
standard_asm 
rta_asm 

◆ BlockAttentionBiasEnum

Enumerator
NO_BIAS 
ELEMENTWISE_BIAS 
ALIBI 

◆ BlockFmhaPipelineEnum

Enumerator
QRKSVS 
QRKSVS_ASYNC 
QSKSVS 
QRKSVS_ASYNC_TRLOAD 

◆ ConvolutionSpecialization

Enumerator
Default 
Filter1x1Pad0 
Filter1x1Stride1Pad0 
Filter3x3 

◆ coord_transform_enum

Enumerator
undefined 
pass_through 
pad 
embed 
merge 
unmerge 
replicate 
xor_t 
offset 
indexing 

◆ fp8_interpretation

FP8 interpretation used in conversion algorithms.

Enumerator
E4M3_OCP 
E5M2_OCP 
E4M3_FNUZ 
E5M2_FNUZ 

◆ fp8_rounding_mode

Enumerator
standard 
stochastic 

◆ FusedMoeGemmPipelineSequencerEnum

Enumerator
SLD_A 
SLD_B 
GLD_A 
GLD_B 
SST_A 
SST_B 
GST_O 

◆ FusedMoeGemmWeightPermuteEnum

Enumerator
no_permute 
b_nr_kr_kw_nw_kv 
b_nr_kr_waveflatten 

◆ GemmLoopOrder

Enumerator
KMN 
MNK 

◆ GemmPipelineScheduler

Enumerator
Default 
Intrawave 
Interwave 

◆ GenericAttentionMaskEnum

Enumerator
NO_MASK 
MASK_FROM_TOP_LEFT 
MASK_FROM_BOTTOM_RIGHT 
MASK_GENERIC 

◆ Layernorm2dFusedAddEnum

Enumerator
NO_ADD 
PRE_ADD_STORE 
PRE_ADD 

◆ Layernorm2dFusedQuantEnum

Enumerator
NO_SWEEP 
SMOOTH_DYNAMIC_QUANT 
DYNAMIC_QUANT 

◆ Layernorm2dXBiasEnum

Enumerator
NO_BIAS 
ADD_BIAS 

◆ naive_attention_layout_enum

Enumerator
DEFAULT 
BSHD 
BHSD 
BS3HD 
PHSD 
PHDSX 
PHDS 
SCALE_HS 
SCALE_SH 

◆ naive_attention_quant_algo

Enumerator
NO 
KV_8BIT_PERHEAD 
KV_8BIT_PERTOKEN 

◆ naive_attention_variation_enum

Enumerator
FLASH_BATCHED 
FLASH_GROUPED 
DECODE_PAGED 

◆ PositionEncodingEnum

Enumerator
NO 
ALIBI 

◆ Rmsnorm2dFusedAddEnum

Enumerator
NO_ADD 
PRE_ADD_STORE 
PRE_ADD 

◆ Rmsnorm2dFusedQuantEnum

Enumerator
NO_SWEEP 
SMOOTH_DYNAMIC_QUANT 
DYNAMIC_QUANT 

◆ Rmsnorm2dSensitiveEnum

Enumerator
NO_SPECIFIC_MODEL 
T5_MODEL_LIKE 

◆ RotaryEmbeddingEnum

Enumerator
NONE 
INTERLEAVED 
HALF_ROTATED 

◆ StreamKReductionStrategy

Enumerator
Atomic 

Workgroups atomically add their results to the C tensor.

Reduction 

For a given tile in the C tensor, one workgroup accumulates results of other contributing workgroups.

◆ TailNumber

enum ck_tile::TailNumber
strong
Enumerator
Odd 
Even 
One 
Two 
Three 
Four 
Five 
Six 
Seven 
Empty 
Full 

◆ tile_distribution_pattern

Enumeration describing static tile distribution patterns.

Enumerator
thread_raked 

Thread raked pattern.

warp_raked 

Warp raked pattern.

block_raked 

Block raked pattern - aka linear.

◆ WGAttrCtlEnum

Enumerator
Default_ 
Raw_vvv 
Raw_vaa 
Raw_vav 
Raw_vva 
Raw_avv 

◆ WGAttrNumAccessEnum

Enumerator
Single 
Double 
Quad 
Invalid 

Function Documentation

◆ abs() [1/7]

◆ abs() [2/7]

template<typename T >
CK_TILE_HOST_DEVICE T ck_tile::abs ( const T &  x)

◆ abs() [3/7]

CK_TILE_DEVICE double ck_tile::abs ( double  x)

◆ abs() [4/7]

CK_TILE_DEVICE float ck_tile::abs ( float  x)

◆ abs() [5/7]

CK_TILE_DEVICE fp16_t ck_tile::abs ( fp16_t  x)

◆ abs() [6/7]

CK_TILE_DEVICE int32_t ck_tile::abs ( int32_t  x)

◆ abs() [7/7]

CK_TILE_DEVICE int8_t ck_tile::abs ( int8_t  x)

◆ acos() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::acos ( x)

◆ acos() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::acos ( x)

◆ acos< double >()

template<>
CK_TILE_DEVICE double ck_tile::acos< double > ( double  x)

◆ acos< float >()

template<>
CK_TILE_DEVICE float ck_tile::acos< float > ( float  x)

◆ acosh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::acosh ( x)

◆ acosh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::acosh ( x)

◆ acosh< double >()

template<>
CK_TILE_DEVICE double ck_tile::acosh< double > ( double  x)

◆ acosh< float >()

template<>
CK_TILE_DEVICE float ck_tile::acosh< float > ( float  x)

◆ adaptor_coordinate_is_valid()

template<typename Adaptor , typename AdpatorCoord >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::adaptor_coordinate_is_valid ( const Adaptor &  adaptor,
const AdpatorCoord &  coord 
)
constexpr

◆ adaptor_coordinate_is_valid_assuming_top_index_is_valid()

template<typename Adaptor , typename AdaptorCoord >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::adaptor_coordinate_is_valid_assuming_top_index_is_valid ( const Adaptor &  adaptor,
const AdaptorCoord &  coord 
)
constexpr

◆ add()

template<typename T , typename ComputeType >
CK_TILE_HOST_DEVICE T ck_tile::add ( const T &  a,
const T &  b 
)

◆ add_bf16x2_t()

CK_TILE_HOST_DEVICE bf16x2_t ck_tile::add_bf16x2_t ( const bf16x2_t a,
const bf16x2_t b 
)

◆ add_bf16x4_t()

CK_TILE_HOST_DEVICE bf16x4_t ck_tile::add_bf16x4_t ( const bf16x4_t a,
const bf16x4_t b 
)

◆ add_bf8x4_t()

CK_TILE_HOST_DEVICE bf8x4_t ck_tile::add_bf8x4_t ( const bf8x4_t a,
const bf8x4_t b 
)

◆ add_bf8x8_t()

CK_TILE_HOST_DEVICE bf8x8_t ck_tile::add_bf8x8_t ( const bf8x8_t a,
const bf8x8_t b 
)

◆ add_f16x2_t()

CK_TILE_HOST_DEVICE fp16x2_t ck_tile::add_f16x2_t ( const fp16x2_t a,
const fp16x2_t b 
)

◆ add_fp8x4_t()

CK_TILE_HOST_DEVICE fp8x4_t ck_tile::add_fp8x4_t ( const fp8x4_t a,
const fp8x4_t b 
)

◆ add_fp8x8_t()

CK_TILE_HOST_DEVICE fp8x8_t ck_tile::add_fp8x8_t ( const fp8x8_t a,
const fp8x8_t b 
)

◆ amd_async_buffer_load()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load ( CK_TILE_LDS_ADDR T *  smem,
int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset,
index_t  src_immediate_addr_offset = 0,
index_t  flag = 0,
bool_constant< oob_conditional_check >  = {} 
)

◆ amd_async_buffer_load_impl()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_impl ( CK_TILE_LDS_ADDR T *  smem,
int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset,
index_t  src_immediate_addr_offset = 0,
bool_constant< pre_nop >  = {} 
)

◆ amd_async_buffer_load_with_oob()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob ( CK_TILE_LDS_ADDR T *  smem,
const int32x4_t  src_wave_buffer_resource,
index_t  src_thread_element_offset,
index_t  src_linear_element_offset,
bool  is_valid_element,
bool_constant< oob_conditional_check >  = {} 
)

◆ amd_async_buffer_load_with_oob_raw() [1/2]

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw ( T *  smem,
const int32x4_t  src_wave_buffer_resource,
index_t  src_thread_element_offset,
index_t  src_linear_element_offset,
bool_constant< pre_nop >  = {} 
)

◆ amd_async_buffer_load_with_oob_raw() [2/2]

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw ( T *  smem,
const T *  p_src_wave,
index_t  src_thread_element_offset,
index_t  src_linear_element_offset,
index_t  src_element_space_size,
bool_constant< pre_nop >  = {} 
)

◆ amd_buffer_atomic_add()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add ( const thread_buffer< T, N > &  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_add_impl()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_impl ( const thread_buffer< T, N > &  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_atomic_add_raw()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_raw ( const thread_buffer< T, N > &  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const index_t  dst_linear_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size,
bool_constant< pre_nop >  = {} 
)

◆ amd_buffer_atomic_max()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max ( const thread_buffer< T, N > &  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_max_impl()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max_impl ( const thread_buffer< T, N >  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_load_impl()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_impl ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl_with_bytes()

template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N> ck_tile::amd_buffer_load_impl_with_bytes ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_invalid_element_return_customized_value()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_invalid_element_return_customized_value ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size,
T  customized_value 
)

◆ amd_buffer_load_invalid_element_return_zero()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_invalid_element_return_zero ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size 
)

◆ amd_buffer_load_raw() [1/2]

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw ( thread_buffer< T, N > &  dst,
const int32x4_t  src_wave_buffer_resource,
index_t  src_thread_element_offset,
index_t  src_linear_element_offset,
index_t  is_valid_element = 0,
bool_constant< pre_nop >  = {} 
)

◆ amd_buffer_load_raw() [2/2]

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw ( thread_buffer< T, N > &  dst,
const T *  p_src_wave,
index_t  src_thread_element_offset,
index_t  src_linear_element_offset,
index_t  src_element_space_size,
index_t  is_valid_element = 0,
bool_constant< pre_nop >  = {} 
)

◆ amd_buffer_load_raw_impl()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw_impl ( thread_buffer< T, N > &  dst,
int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset,
index_t  src_linear_addr_offset,
index_t  flag = 0,
bool_constant< pre_nop >  = {} 
)

◆ amd_buffer_store()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store ( const thread_buffer< T, N > &  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_store_impl()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl ( const thread_buffer< T, N >  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl_with_bytes()

template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl_with_bytes ( const thread_buffer< int8_t, N >  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_raw()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw ( const thread_buffer< T, N > &  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const index_t  dst_linear_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_store_raw_impl()

template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw_impl ( const thread_buffer< T, N > &  dst_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset,
index_t  dst_linear_addr_offset,
index_t  is_valid_element = 1 
)

◆ apply()

template<typename F , typename Tuple >
constexpr decltype(auto) ck_tile::apply ( F &&  f,
Tuple &&  t 
)
constexpr

◆ asin() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::asin ( x)

◆ asin() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::asin ( x)

◆ asin< double >()

template<>
CK_TILE_DEVICE double ck_tile::asin< double > ( double  x)

◆ asin< float >()

template<>
CK_TILE_DEVICE float ck_tile::asin< float > ( float  x)

◆ asinh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::asinh ( x)

◆ asinh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::asinh ( x)

◆ asinh< double >()

template<>
CK_TILE_DEVICE double ck_tile::asinh< double > ( double  x)

◆ asinh< float >()

template<>
CK_TILE_DEVICE float ck_tile::asinh< float > ( float  x)

◆ async_buffer_load_dwordxn_v()

template<unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::async_buffer_load_dwordxn_v ( void *  smem,
int32x4_t  rsrc,
index_t  voffset,
index_t  ,
index_t  ioffset,
index_t  = 0,
bool_constant< pre_nop >  = {} 
)

◆ async_buffer_load_fence()

CK_TILE_DEVICE void ck_tile::async_buffer_load_fence ( index_t  cnt = 0)

◆ async_load_fence()

CK_TILE_DEVICE auto ck_tile::async_load_fence ( index_t  cnt = 0)

◆ async_load_fence_raw()

CK_TILE_DEVICE auto ck_tile::async_load_fence_raw ( index_t  cnt = 0)

◆ async_load_tile()

template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::async_load_tile ( LdsTileWindow_ &&  lds_tile,
const TileWindow_ &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {} 
)

◆ async_load_tile_raw()

template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::async_load_tile_raw ( LdsTileWindow_ &&  lds_tile,
const TileWindow_ &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {},
bool_constant< pre_nop >  = {} 
)

◆ atan() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::atan ( x)

◆ atan() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::atan ( x)

◆ atan< double >()

template<>
CK_TILE_DEVICE double ck_tile::atan< double > ( double  x)

◆ atan< float >()

template<>
CK_TILE_DEVICE float ck_tile::atan< float > ( float  x)

◆ atanh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::atanh ( x)

◆ atanh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::atanh ( x)

◆ atanh< double >()

template<>
CK_TILE_DEVICE double ck_tile::atanh< double > ( double  x)

◆ atanh< float >()

template<>
CK_TILE_DEVICE float ck_tile::atanh< float > ( float  x)

◆ atomic_add()

template<typename X >
CK_TILE_DEVICE void ck_tile::atomic_add ( X *  p_dst,
const X &  x 
)

◆ atomic_add< bf16x2_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x2_t > ( bf16x2_t p_dst,
const bf16x2_t x 
)

◆ atomic_add< bf16x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x4_t > ( bf16x4_t p_dst,
bf16x4_t const &  x 
)

◆ atomic_add< bf8x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x4_t > ( bf8x4_t p_dst,
const bf8x4_t x 
)

◆ atomic_add< bf8x8_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x8_t > ( bf8x8_t p_dst,
bf8x8_t const &  x 
)

◆ atomic_add< fp16x2_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp16x2_t > ( fp16x2_t p_dst,
fp16x2_t const &  x 
)

◆ atomic_add< fp8x4_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x4_t > ( fp8x4_t p_dst,
const fp8x4_t x 
)

◆ atomic_add< fp8x8_t >()

template<>
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x8_t > ( fp8x8_t p_dst,
fp8x8_t const &  x 
)

◆ atomic_add_g()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::atomic_add_g ( T *  p_dst,
const thread_buffer< T, N > &  x 
)

◆ atomic_max_g()

template<typename T , index_t N>
CK_TILE_DEVICE void ck_tile::atomic_max_g ( T *  p_dst,
const thread_buffer< T, N > &  x 
)

◆ bf16_to_double()

constexpr CK_TILE_HOST_DEVICE double ck_tile::bf16_to_double ( bfloat16_t  x)
constexpr

◆ bf16_to_double_raw()

constexpr CK_TILE_HOST_DEVICE double ck_tile::bf16_to_double_raw ( uint16_t  x)
constexpr

◆ bf16_to_float()

constexpr CK_TILE_HOST_DEVICE float ck_tile::bf16_to_float ( bfloat16_t  x)
constexpr

◆ bf16_to_float_raw()

constexpr CK_TILE_HOST_DEVICE float ck_tile::bf16_to_float_raw ( uint16_t  x)
constexpr

◆ bf16_to_fp16()

constexpr CK_TILE_HOST_DEVICE half_t ck_tile::bf16_to_fp16 ( bfloat16_t  x)
constexpr

◆ bf16_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::bf16_to_pk_fp4 ( const bf16_t x,
float  scale 
)
constexpr

◆ bf16x2_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::bf16x2_to_pk_fp4 ( const bf16x2_t x,
float  scale 
)
constexpr

◆ bf8_to_float()

CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float ( bf8_t  x)

◆ bf8_to_float_raw()

CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float_raw ( uint8_t  x)

◆ bit_cast()

template<typename Y , typename X >
constexpr CK_TILE_HOST_DEVICE Y ck_tile::bit_cast ( const X &  x)
constexpr

◆ block_tile_reduce() [1/2]

template<typename AccDistributedTensor_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc >
CK_TILE_DEVICE void ck_tile::block_tile_reduce ( AccDistributedTensor_ &  acc_tensor,
const InDistributedTensor_ &  in_tensor,
sequence< InReduceDims... >  ,
const ReduceFunc &  reduce_func 
)

◆ block_tile_reduce() [2/2]

template<typename AccDataType_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc , typename InDataType_ >
CK_TILE_DEVICE auto ck_tile::block_tile_reduce ( const InDistributedTensor_ &  in_tensor,
sequence< InReduceDims... >  in_reduce_dims,
const ReduceFunc &  reduce_func,
const InDataType_ &  reduce_init 
)

◆ block_tile_reduce_sync()

template<typename AccDistributedTensor_ , typename ReduceFunc , bool WithBroadcast = true, bool CrossWarp = true>
CK_TILE_DEVICE void ck_tile::block_tile_reduce_sync ( AccDistributedTensor_ &  acc_tensor,
const ReduceFunc &  reduce_func,
bool_constant< WithBroadcast >  = {},
bool_constant< CrossWarp >  = {} 
)

◆ block_tile_reduce_xor_sync()

template<typename AccDistributedTensor_ , typename ReduceFunc >
CK_TILE_DEVICE void ck_tile::block_tile_reduce_xor_sync ( AccDistributedTensor_ &  acc_tensor,
const ReduceFunc &  reduce_func 
)

◆ block_tile_welford_calculate_max_count()

template<typename BlockShape >
constexpr CK_TILE_DEVICE index_t ck_tile::block_tile_welford_calculate_max_count ( int  row_size)
constexpr

◆ block_tile_welford_post_scale_var()

template<typename VarDistributedTensor_ , bool FastFdiv_ = false>
constexpr CK_TILE_DEVICE void ck_tile::block_tile_welford_post_scale_var ( VarDistributedTensor_ &  var_tensor,
int  count,
bool_constant< FastFdiv_ >  = {} 
)
constexpr

◆ BlockReduce2D()

template<typename T >
CK_TILE_HOST_DEVICE_EXTERN ck_tile::BlockReduce2D ( const T &  ,
const typename T::DataType &   
) -> BlockReduce2D< T >

◆ buffer_load_fence() [1/2]

CK_TILE_DEVICE void ck_tile::buffer_load_fence ( index_t  cnt = 0)

◆ buffer_load_fence() [2/2]

template<typename... T>
CK_TILE_DEVICE void ck_tile::buffer_load_fence ( index_t  cnt = 0,
T &...  o 
)

◆ buffer_store_fence()

CK_TILE_DEVICE void ck_tile::buffer_store_fence ( index_t  cnt = 0)

◆ c_style_pointer_cast()

template<typename PY , typename PX , typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type = false>
CK_TILE_HOST_DEVICE PY ck_tile::c_style_pointer_cast ( PX  p_x)

◆ call_f_unpack_args()

template<typename F , typename T >
CK_TILE_HOST auto ck_tile::call_f_unpack_args ( f,
args 
)

◆ call_f_unpack_args_impl()

template<typename F , typename T , std::size_t... Is>
CK_TILE_HOST auto ck_tile::call_f_unpack_args_impl ( f,
args,
std::index_sequence< Is... >   
)

◆ cast_tile()

template<typename DstType , typename SrcTensor >
CK_TILE_DEVICE auto ck_tile::cast_tile ( const SrcTensor &  src_tensor)

◆ ceil() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::ceil ( x)

◆ ceil() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::ceil ( x)

◆ ceil< double >()

template<>
CK_TILE_DEVICE double ck_tile::ceil< double > ( double  x)

◆ ceil< float >()

template<>
CK_TILE_DEVICE float ck_tile::ceil< float > ( float  x)

◆ ceil< fp16_t >()

◆ chain_tensor_adaptors()

template<typename TensorAdaptor0 , typename TensorAdaptor1 >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::chain_tensor_adaptors ( const TensorAdaptor0 &  adaptor0,
const TensorAdaptor1 &  adaptor1 
)
constexpr

◆ check_err() [1/6]

template<typename Range , typename RefRange >
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_same_v<ranges::range_value_t<Range>, bf16_t>, bool>::type CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
double  rtol = 1e-3,
double  atol = 1e-3,
bool  allow_infinity_ref = false 
)

Check errors between floating point ranges using the specified tolerances.

Compares two ranges of brain floating point values within specified relative and absolute tolerances.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
rtol — Relative tolerance
atol — Absolute tolerance
allow_infinity_ref — Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [2/6]

template<typename Range , typename RefRange >
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_same_v<ranges::range_value_t<Range>, half_t>, bool>::type CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
double  rtol = 1e-3,
double  atol = 1e-3,
bool  allow_infinity_ref = false 
)

Check errors between half precision floating point ranges.

Compares two ranges of half precision floating point values within specified tolerances. This specialization handles the specific requirements and characteristics of half precision floating point comparisons.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
rtol — Relative tolerance
atol — Absolute tolerance
allow_infinity_ref — Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [3/6]

template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_same_v<ranges::range_value_t<Range>, bf8_t>), bool> CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
double  rtol = 1e-3,
double  atol = 1e-3,
bool  allow_infinity_ref = false 
)

Check errors between BF8 ranges.

Specialized comparison for 8-bit brain floating point values that considers the specific numerical properties and error characteristics of the BF8 format.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
rtol — Relative tolerance
atol — Absolute tolerance
allow_infinity_ref — Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [4/6]

template<typename Range , typename RefRange >
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_floating_point_v<ranges::range_value_t<Range> > && !std::is_same_v<ranges::range_value_t<Range>, half_t>, bool>::type CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
double  rtol = 1e-5,
double  atol = 3e-6,
bool  allow_infinity_ref = false 
)

Check errors between floating point ranges using the specified tolerances.

Compares two ranges of floating point values within specified relative and absolute tolerances. This overload handles standard floating point types except half precision floating point.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
rtol — Relative tolerance
atol — Absolute tolerance
allow_infinity_ref — Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_err() [5/6]

template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_integral_v<ranges::range_value_t<Range>> && !std::is_same_v<ranges::range_value_t<Range>, bf16_t>) , bool> CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
double  = 0,
double  atol = 0 
)

Check errors between integer ranges.

Compares two ranges of integer values with an absolute tolerance. This specialization handles integer types and optionally int4_t when the experimental bit int extension is enabled.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
atol — Absolute tolerance
Returns
True if check passes, false otherwise

◆ check_err() [6/6]

template<typename Range , typename RefRange >
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_same_v<ranges::range_value_t<Range>, fp8_t>), bool> CK_TILE_HOST ck_tile::check_err ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!",
unsigned  max_rounding_point_distance = 1,
double  atol = 1e-1,
bool  allow_infinity_ref = false 
)

Check errors between FP8 ranges.

Specialized comparison for 8-bit floating point values that takes into account the unique characteristics and limitations of FP8 arithmetic, including rounding point distances and special handling of infinity values.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if check fails
max_rounding_point_distance — Maximum allowed distance between rounding points
atol — Absolute tolerance
allow_infinity_ref — Whether to allow infinity in reference values
Returns
True if check passes, false otherwise

◆ check_size_mismatch()

template<typename Range , typename RefRange >
CK_TILE_HOST bool ck_tile::check_size_mismatch ( const Range &  out,
const RefRange &  ref,
const std::string &  msg = "Error: Incorrect results!" 
)

Check for size mismatch between output and reference ranges.

Verifies that the output and reference ranges are the same size.

Template Parameters
Range — Type of output range
RefRange — Type of reference range
Parameters
out — Output range to check
ref — Reference range to check against
msg — Error message to display if sizes mismatch
Returns
True if sizes mismatch, false otherwise

◆ check_wmma_supported()

template<typename ADataType , typename BDataType , typename AccDataType , index_t M_Warp_Tile, index_t N_Warp_Tile, index_t K_Warp_Tile>
CK_TILE_HOST bool ck_tile::check_wmma_supported ( )

◆ CK_PRINT()

template<auto... val>
constexpr void ck_tile::CK_PRINT ( )
inlineconstexpr

◆ CK_TILE_ERROR()

template<typename... Args>
void ck_tile::CK_TILE_ERROR ( Args &&...  args)
noexcept

◆ clamp()

template<typename T >
constexpr CK_TILE_HOST_DEVICE T ck_tile::clamp ( const T &  x,
const T &  lowerbound,
const T &  upperbound 
)
constexpr

◆ clear_tile()

template<typename DstrTensors >
CK_TILE_DEVICE void ck_tile::clear_tile ( DstrTensors &  dstr_tensor)

◆ clz()

CK_TILE_DEVICE int ck_tile::clz ( uint32_t  x)

◆ cmp_lt_to_exec()

template<typename X , typename Y >
CK_TILE_DEVICE auto ck_tile::cmp_lt_to_exec ( const X &  x,
const Y &  y 
)

◆ composes()

template<typename... Ts>
__host__ __device__ ck_tile::composes ( Ts &&  ...) -> composes< remove_cvref_t< Ts >... >

FIXME: create macro to replace 'host device' and nothing more.

◆ concat() [1/2]

template<typename... Ts>
auto ck_tile::concat ( const Ts &...  xs) -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>

◆ concat() [2/2]

template<typename Sep , typename First , typename... Rest>
auto ck_tile::concat ( Sep  sep,
const First &  first,
const Rest &...  rest 
) -> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>

◆ concat_tuple() [1/3]

template<typename... X>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::concat_tuple ( const tuple< X... > &  tx)
constexpr

◆ concat_tuple() [2/3]

template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::concat_tuple ( const tuple< X... > &  tx,
const tuple< Y... > &  ty 
)
constexpr

◆ concat_tuple() [3/3]

template<typename... X, typename... Tuples>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::concat_tuple ( const tuple< X... > &  tx,
const Tuples &...  tuples 
)
constexpr

◆ concat_tuple_of_reference()

template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::concat_tuple_of_reference ( const tuple< X &... > &  tx,
const tuple< Y &... > &  ty 
)
constexpr

◆ concatInto()

template<typename... Ts>
auto ck_tile::concatInto ( std::string &  result,
const Ts &...  xs 
) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>

◆ conditional_expr()

template<bool predicate, typename X , typename Y >
constexpr auto ck_tile::conditional_expr ( X &&  x,
Y &&  y 
)
constexpr

◆ constexpr_strlen()

constexpr size_t ck_tile::constexpr_strlen ( const char *  c)
constexpr

◆ construct_f_unpack_args()

template<typename F , typename T >
CK_TILE_HOST auto ck_tile::construct_f_unpack_args ( ,
args 
)

◆ construct_f_unpack_args_impl()

template<typename F , typename T , std::size_t... Is>
CK_TILE_HOST auto ck_tile::construct_f_unpack_args_impl ( args,
std::index_sequence< Is... >   
)

◆ container_concat() [1/4]

template<typename T , index_t NX, index_t NY>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_concat ( const array< T, NX > &  ax,
const array< T, NY > &  ay 
)
constexpr

◆ container_concat() [2/4]

template<typename Container >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_concat ( const Container &  x)
constexpr

◆ container_concat() [3/4]

template<typename... X, typename... Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_concat ( const tuple< X... > &  tx,
const tuple< Y... > &  ty 
)
constexpr

◆ container_concat() [4/4]

template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_concat ( const X &  x,
const Ys &...  ys 
)
constexpr

◆ container_find()

template<index_t... Is>
constexpr index_t ck_tile::container_find ( sequence< Is... >  seq,
index_t  value 
)
constexpr

◆ container_push_back() [1/2]

template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_push_back ( const array< TData, NSize > &  a,
const TData &  x 
)
constexpr

◆ container_push_back() [2/2]

template<typename... Ts, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_push_back ( const tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_push_front()

template<typename... Ts, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_push_front ( const tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_reduce()

template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reduce ( const Container &  x,
Reduce  reduce,
Init  init,
number< IBegin >  = number<0>{},
number< IEnd >  = number<Container::size()>{},
number< IStep >  = number<1>{} 
)
constexpr

◆ container_reduce_impl()

template<typename Container , typename Reduce , typename ROld , index_t I, index_t IEnd, index_t IStep>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reduce_impl ( const Container &  x,
Reduce  reduce,
ROld  r_old,
number< I >  i,
number< IEnd >  ,
number< IStep >   
)
constexpr

◆ container_reorder_given_new2old() [1/4]

template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_new2old ( const array< TData, NSize > &  old_array,
const map< index_t, index_t > &  new2old 
)
constexpr

◆ container_reorder_given_new2old() [2/4]

template<typename TData , index_t NSize, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_new2old ( const array< TData, NSize > &  old_array,
sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [3/4]

template<typename... Ts, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_new2old ( const tuple< Ts... > &  old_tuple,
sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [4/4]

template<index_t... Is, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_new2old ( sequence< Is... >  ,
sequence< IRs... >   
)
constexpr

◆ container_reorder_given_old2new() [1/4]

template<typename TData , index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_old2new ( const array< TData, NSize > &  old_array,
const map< index_t, index_t > &  old2new 
)
constexpr

◆ container_reorder_given_old2new() [2/4]

template<typename TData , index_t NSize, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_old2new ( const array< TData, NSize > &  old_array,
sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [3/4]

template<typename... Ts, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_old2new ( const tuple< Ts... > &  old_tuple,
sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [4/4]

template<index_t... Is, index_t... IRs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reorder_given_old2new ( sequence< Is... >  old_seq,
sequence< IRs... >   
)
constexpr

◆ container_reverse_exclusive_scan() [1/3]

template<typename TData , index_t NSize, typename Reduce , typename Init >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_exclusive_scan ( const array< TData, NSize > &  x,
Reduce  f,
Init  init 
)
constexpr

◆ container_reverse_exclusive_scan() [2/3]

template<index_t... Is, typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_exclusive_scan ( const sequence< Is... > &  seq,
Reduce  f,
number< Init >   
)
constexpr

◆ container_reverse_exclusive_scan() [3/3]

template<typename... Xs, typename Reduce , typename Init >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_exclusive_scan ( const tuple< Xs... > &  x,
Reduce  reduce,
Init  init 
)
constexpr

◆ container_reverse_exclusive_scan_impl()

template<typename... Xs, typename Reduce , index_t I, typename YOld , typename ROld >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_exclusive_scan_impl ( const tuple< Xs... > &  x,
Reduce  reduce,
number< I >  i,
YOld  y_old,
ROld  r_old 
)
constexpr

◆ container_reverse_inclusive_scan() [1/2]

template<typename TData , index_t NSize, typename Reduce >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_inclusive_scan ( const array< TData, NSize > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ container_reverse_inclusive_scan() [2/2]

template<typename... Xs, typename Reduce , typename TData >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::container_reverse_inclusive_scan ( const tuple< Xs... > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ convert_to_float()

template<typename T >
CK_TILE_HOST_DEVICE float ck_tile::convert_to_float ( typename T::raw_type  data,
float  scale = 1.f 
)

◆ convert_to_type()

template<typename T >
CK_TILE_HOST_DEVICE T::raw_type ck_tile::convert_to_type ( float  value,
float  scale = 1.f 
)

◆ coordinate_has_valid_offset()

template<typename TensorDesc , typename TensorCoord >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::coordinate_has_valid_offset ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ coordinate_has_valid_offset_assuming_top_index_is_valid()

template<typename TensorDesc , typename TensorCoord >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::coordinate_has_valid_offset_assuming_top_index_is_valid ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ cos()

◆ cos< double >()

template<>
CK_TILE_HOST double ck_tile::cos< double > ( double  x)

◆ cos< float >()

template<>
CK_TILE_HOST float ck_tile::cos< float > ( float  x)

◆ cosh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::cosh ( x)

◆ cosh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::cosh ( x)

◆ cosh< double >()

template<>
CK_TILE_DEVICE double ck_tile::cosh< double > ( double  x)

◆ cosh< float >()

template<>
CK_TILE_DEVICE float ck_tile::cosh< float > ( float  x)

◆ double_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE bfloat16_t ck_tile::double_to_bf16 ( double  f,
constant< rounding >  = {} 
)
constexpr

◆ double_to_bf16_raw()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE uint16_t ck_tile::double_to_bf16_raw ( double  f,
constant< rounding >  = {} 
)
constexpr

◆ double_to_fp16()

constexpr CK_TILE_HOST_DEVICE half_t ck_tile::double_to_fp16 ( const double &  x)
constexpr

◆ double_to_fp16_hip()

constexpr CK_TILE_HOST_DEVICE fp16_hip_t ck_tile::double_to_fp16_hip ( const double &  x)
constexpr

◆ DS_READ_TR_SIZE()

constexpr int ck_tile::DS_READ_TR_SIZE ( )
constexpr

◆ embed_tuples()

template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::embed_tuples ( f,
const X &  x 
)
constexpr

◆ EnvGetString()

template<class EnvVar >
const std::string& ck_tile::EnvGetString ( EnvVar  )
inline

◆ EnvIsDisabled()

template<class EnvVar >
bool ck_tile::EnvIsDisabled ( EnvVar  )
inline

◆ EnvIsEnabled()

template<class EnvVar >
bool ck_tile::EnvIsEnabled ( EnvVar  )
inline

◆ EnvIsUnset()

template<class EnvVar >
bool ck_tile::EnvIsUnset ( EnvVar  )
inline

◆ EnvUnset()

template<class EnvVar >
void ck_tile::EnvUnset ( EnvVar  )

◆ EnvValue()

template<class EnvVar >
uint64_t ck_tile::EnvValue ( EnvVar  )
inline

◆ equal()

__host__ __device__ ck_tile::equal ( ) -> equal< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ exclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
constexpr auto ck_tile::exclusive_scan_sequence ( Seq  ,
Reduce  ,
number< Init >   
)
constexpr

◆ exp() [1/3]

CK_TILE_DEVICE bfloat16_t ck_tile::exp ( bfloat16_t  x)

◆ exp() [2/3]

template<typename T >
CK_TILE_HOST T ck_tile::exp ( x)

◆ exp() [3/3]

template<typename T >
CK_TILE_DEVICE T ck_tile::exp ( x)

◆ exp2() [1/2]

CK_TILE_DEVICE bfloat16_t ck_tile::exp2 ( bfloat16_t  x)

◆ exp2() [2/2]

CK_TILE_HOST float ck_tile::exp2 ( float  x)

◆ exp< double >()

template<>
CK_TILE_DEVICE double ck_tile::exp< double > ( double  x)

◆ exp< float >()

template<>
CK_TILE_DEVICE float ck_tile::exp< float > ( float  x)

◆ exp< fp16_t >()

◆ expm1() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::expm1 ( x)

◆ expm1() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::expm1 ( x)

◆ expm1< double >()

template<>
CK_TILE_DEVICE double ck_tile::expm1< double > ( double  x)

◆ expm1< float >()

template<>
CK_TILE_DEVICE float ck_tile::expm1< float > ( float  x)

◆ flag_to_exec()

template<typename T >
CK_TILE_DEVICE auto ck_tile::flag_to_exec ( const T &  v_flag)

◆ float_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE bfloat16_t ck_tile::float_to_bf16 ( float  f,
constant< rounding >  = {} 
)
constexpr

◆ float_to_bf16_raw()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
constexpr CK_TILE_HOST_DEVICE uint16_t ck_tile::float_to_bf16_raw ( float  f,
constant< rounding >  = {} 
)
constexpr

◆ float_to_bf16_rta_asm()

CK_TILE_DEVICE uint16_t ck_tile::float_to_bf16_rta_asm ( float  f)

◆ float_to_bf16_rtn_asm()

CK_TILE_DEVICE uint16_t ck_tile::float_to_bf16_rtn_asm ( float  f)
constexpr

◆ float_to_bf16_rtn_raw()

constexpr CK_TILE_HOST_DEVICE uint16_t ck_tile::float_to_bf16_rtn_raw ( float  f)
constexpr

◆ float_to_bf16_truc_nan_raw()

constexpr CK_TILE_HOST_DEVICE uint16_t ck_tile::float_to_bf16_truc_nan_raw ( float  f)
constexpr

◆ float_to_bf16_truc_raw()

constexpr CK_TILE_HOST_DEVICE uint16_t ck_tile::float_to_bf16_truc_raw ( float  f)
constexpr

◆ float_to_bf8()

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE bf8_t ck_tile::float_to_bf8 ( float  x,
constant< rounding >  = {} 
)

◆ float_to_bf8_raw() [1/2]

template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE bf8_raw_t ck_tile::float_to_bf8_raw ( float  x,
constant< rounding >   
)

◆ float_to_bf8_raw() [2/2]

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_bf8_raw ( float  x,
constant< rounding >  = {} 
)

◆ float_to_e2m1()

constexpr CK_TILE_HOST_DEVICE pk_fp4_raw_t ck_tile::float_to_e2m1 ( float  x,
float  scale = 1.f 
)
constexpr

◆ float_to_fp16()

constexpr CK_TILE_HOST_DEVICE half_t ck_tile::float_to_fp16 ( const float &  x)
constexpr

◆ float_to_fp16_hip()

constexpr CK_TILE_HOST_DEVICE fp16_hip_t ck_tile::float_to_fp16_hip ( const float &  x)
constexpr

◆ float_to_fp8()

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE fp8_t ck_tile::float_to_fp8 ( float  x,
constant< rounding >  = {} 
)

◆ float_to_fp8_raw() [1/2]

template<fp8_rounding_mode rounding>
CK_TILE_HOST_DEVICE fp8_raw_t ck_tile::float_to_fp8_raw ( float  x,
constant< rounding >   
)

◆ float_to_fp8_raw() [2/2]

template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_fp8_raw ( float  x,
constant< rounding >  = {} 
)

◆ float_to_fp8_rtn_raw()

template<typename SrcT , typename DstT >
CK_TILE_HOST_DEVICE numeric_traits<DstT>::bitwise_type ck_tile::float_to_fp8_rtn_raw ( SrcT  x)

Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even.

This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping.

Template Parameters
DstT — The destination type (fp8_t or bf8_t).
SrcT — The source type (float or half_t) to be converted.
Parameters
x — The floating-point value to be converted.
Returns
The 8-bit floating-point representation of the input value.

◆ float_to_fp8_sr_raw()

template<typename SrcT , typename DstT >
CK_TILE_HOST_DEVICE numeric_traits<DstT>::bitwise_type ck_tile::float_to_fp8_sr_raw ( SrcT  x)

Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding.

This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping and uses a pseudo-random number generator for the stochastic rounding.

Template Parameters
DstT — The destination type (fp8_t or bf8_t).
SrcT — The source type (float or half_t) to be converted.
Parameters
x — The floating-point value to be converted.
Returns
The 8-bit floating-point representation of the input value.

◆ float_to_int8()

constexpr CK_TILE_HOST_DEVICE int8_t ck_tile::float_to_int8 ( const float &  x)
constexpr

◆ float_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::float_to_pk_fp4 ( const float &  x,
float  scale 
)
constexpr

◆ floor() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::floor ( x)

◆ floor() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::floor ( x)

◆ floor< double >()

template<>
CK_TILE_DEVICE double ck_tile::floor< double > ( double  x)

◆ floor< float >()

template<>
CK_TILE_DEVICE float ck_tile::floor< float > ( float  x)

◆ floor< fp16_t >()

◆ flush_icache()

void ck_tile::flush_icache ( )
inline

◆ fnv1a_hash()

constexpr unsigned int ck_tile::fnv1a_hash ( std::string_view  str,
unsigned int  h = 2166136261u 
)
constexpr

◆ fp16_to_bf16()

template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
CK_TILE_HOST_DEVICE constexpr bfloat16_t ck_tile::fp16_to_bf16 ( half_t  f,
constant< rounding >  = {} 
)
constexpr

◆ fp16_to_double()

constexpr CK_TILE_HOST_DEVICE float ck_tile::fp16_to_double ( const half_t x)
constexpr

◆ fp16_to_double_hip()

constexpr CK_TILE_HOST_DEVICE double ck_tile::fp16_to_double_hip ( const fp16_hip_t x)
constexpr

◆ fp16_to_float()

constexpr CK_TILE_HOST_DEVICE float ck_tile::fp16_to_float ( const half_t x)
constexpr

◆ fp16_to_float_hip()

constexpr CK_TILE_HOST_DEVICE float ck_tile::fp16_to_float_hip ( const fp16_hip_t x)
constexpr

◆ fp16_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::fp16_to_pk_fp4 ( const fp16_t x,
float  scale 
)
constexpr

◆ fp16x2_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::fp16x2_to_pk_fp4 ( const fp16x2_t x,
float  scale 
)
constexpr

◆ fp32x2_to_pk_fp4()

constexpr CK_TILE_HOST_DEVICE pk_fp4_t ck_tile::fp32x2_to_pk_fp4 ( const fp32x2_t x,
float  scale 
)
constexpr

◆ fp8_to_float()

CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float ( fp8_t  x)

◆ fp8_to_float_raw()

CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float_raw ( uint8_t  x)

◆ gcd() [1/2]

constexpr CK_TILE_HOST_DEVICE index_t ck_tile::gcd ( index_t  x,
index_t  y 
)
constexpr

◆ gcd() [2/2]

template<index_t X, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::gcd ( number< X >  ,
number< Y >   
)
constexpr

◆ gemm_prec_str()

template<typename ADataType_ , typename BDataType_ >
std::string ck_tile::gemm_prec_str ( )

◆ generate_array()

template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_array ( F &&  f,
number< N >   
)
constexpr

◆ generate_sequence()

template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_sequence ( ,
number< N >   
)
constexpr

◆ generate_sequence_v2()

template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_sequence_v2 ( F &&  f,
number< N >   
)
constexpr

◆ generate_tie()

template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_tie ( F &&  f,
number< N >   
)
constexpr

◆ generate_tuple()

template<typename F , index_t N>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_tuple ( F &&  f,
number< N >   
)
constexpr

◆ generate_tuple_for()

template<typename F , index_t... ids>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::generate_tuple_for ( F &&  f,
sequence< ids... >   
)
constexpr

◆ get_absolute_threshold()

template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType>
CK_TILE_HOST double ck_tile::get_absolute_threshold ( const double  max_possible_num,
const int  number_of_accumulations = 1 
)

Calculate absolute error threshold for numerical comparisons.

Calculates the absolute error threshold based on the maximum possible value and the characteristics of the data types involved in the computation.

Template Parameters
ComputeDataType — Type used for computation
OutDataType — Type used for output
AccDataType — Type used for accumulation (defaults to ComputeDataType)
Parameters
max_possible_num — Maximum possible value in the computation
number_of_accumulations — Number of accumulation operations performed
Returns
Absolute error threshold based on data type characteristics and maximum value

◆ get_alibi_slopes()

template<typename DataType >
CK_TILE_HOST std::vector<DataType> ck_tile::get_alibi_slopes ( ck_tile::index_t  nheads)

◆ get_async_store_smem_info()

template<typename LdsTileWindow_ >
CK_TILE_DEVICE auto ck_tile::get_async_store_smem_info ( LdsTileWindow_ &&  lds_tile)

◆ get_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::get_container_subset ( const array< T, N > &  arr,
sequence< Is... >   
)
constexpr

◆ get_container_subset() [2/2]

template<typename... Ts, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::get_container_subset ( const tuple< Ts... > &  tup,
sequence< Is... >   
)
constexpr

◆ get_default_stride()

template<bool is_row_major>
auto ck_tile::get_default_stride ( std::size_t  row,
std::size_t  col,
std::size_t  stride,
bool_constant< is_row_major >   
)

◆ get_device_name()

std::string ck_tile::get_device_name ( )
inline

◆ get_relative_threshold()

template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType>
CK_TILE_HOST double ck_tile::get_relative_threshold ( const int  number_of_accumulations = 1)

Calculate relative error threshold for numerical comparisons.

Calculates the relative error threshold based on the mantissa bits and characteristics of the data types involved in the computation.

Template Parameters
ComputeDataType — Type used for computation
OutDataType — Type used for output
AccDataType — Type used for accumulation (defaults to ComputeDataType)
Parameters
number_of_accumulations — Number of accumulation operations performed
Returns
Relative error threshold based on data type characteristics

◆ get_slice_tile() [1/2]

template<typename DataType_ , typename StaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto ck_tile::get_slice_tile ( const static_distributed_tensor< DataType_, StaticTileDistribution_ > &  tile,
sequence< SliceBegins... >  slice_begins,
sequence< SliceEnds... >  slice_ends 
)
constexpr

◆ get_slice_tile() [2/2]

template<typename BottomTensorView_ , typename WindowLengths_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto ck_tile::get_slice_tile ( const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &  tile,
sequence< SliceBegins... >  slice_begins,
sequence< SliceEnds... >  slice_ends 
)
constexpr

◆ get_x_indices_from_distributed_indices()

template<typename StaticTileDistribution , typename DistributedIndices >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::get_x_indices_from_distributed_indices ( StaticTileDistribution  tile_distribution,
DistributedIndices  distributed_indices 
)
constexpr

◆ get_y_unpacks_from_x_unpacks()

template<typename YLengths , index_t XUnpacks>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::get_y_unpacks_from_x_unpacks ( YLengths  ,
number< XUnpacks >   
)
constexpr

◆ getConvSpecializationString()

CK_TILE_HOST std::string ck_tile::getConvSpecializationString ( const ConvolutionSpecialization s)

◆ getSize() [1/6]

template<std::size_t N>
constexpr std::size_t ck_tile::getSize ( char(&)  [N])
inlineconstexprnoexcept

◆ getSize() [2/6]

constexpr std::size_t ck_tile::getSize ( const char &  )
inlineconstexprnoexcept

◆ getSize() [3/6]

constexpr std::size_t ck_tile::getSize ( const char *  s)
inlineconstexprnoexcept

◆ getSize() [4/6]

template<std::size_t N>
constexpr std::size_t ck_tile::getSize ( const   char(&)[N])
inlineconstexprnoexcept

◆ getSize() [5/6]

std::size_t ck_tile::getSize ( const std::string &  s)
inlinenoexcept

◆ getSize() [6/6]

constexpr std::size_t ck_tile::getSize ( const std::string_view &  s)
inlineconstexprnoexcept

◆ hip_check_error()

CK_TILE_HOST void ck_tile::hip_check_error ( hipError_t  x)

◆ histogram_sorted_sequence()

template<typename SeqSortedSamples , index_t r, index_t... rs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::histogram_sorted_sequence ( SeqSortedSamples  ,
sequence< r, rs... >   
)
constexpr

◆ host_tensor_descriptor()

template<bool is_row_major>
auto ck_tile::host_tensor_descriptor ( std::size_t  row,
std::size_t  col,
std::size_t  stride,
bool_constant< is_row_major >   
)

Creates a host tensor descriptor with specified dimensions and layout.

Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor layout is row-major or column-major. This is determined via the compile-time template parameter is_row_major.

Template Parameters
is_row_major — Compile-time flag indicating if the layout is row-major (true) or column-major (false)
Parameters
row — Number of rows in the tensor
col — Number of columns in the tensor
stride — Stride between adjacent rows (for row-major) or columns (for column-major)
Returns
HostTensorDescriptor with shape {row, col} and strides:
  • For row-major: {stride, 1}
  • For column-major: {1, stride}

◆ inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::inclusive_scan_sequence ( Seq  ,
Reduce  ,
number< Init >   
)
constexpr

◆ InputTileDistributionEncoding()

template<typename InnerEncode , index_t kLeadIterPerWarp, index_t kSecondIterPerWarp, index_t kLeadNumWarps, index_t kSecondNumWarps>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::InputTileDistributionEncoding ( )
constexpr

◆ int8_to_float()

constexpr CK_TILE_HOST_DEVICE float ck_tile::int8_to_float ( const int8_t x)
constexpr

◆ integer_divide_ceil()

template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::integer_divide_ceil ( x,
y 
)
constexpr

◆ integer_divide_floor()

template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::integer_divide_floor ( x,
y 
)
constexpr

◆ integer_least_multiple()

template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::integer_least_multiple ( x,
y 
)
constexpr

◆ integer_log2_floor()

constexpr CK_TILE_HOST_DEVICE int32_t ck_tile::integer_log2_floor ( int32_t  x)
constexpr

◆ is_gfx11_supported()

bool ck_tile::is_gfx11_supported ( )
inline

◆ is_gfx12_supported()

bool ck_tile::is_gfx12_supported ( )
inline

◆ is_load_tr_supported()

bool ck_tile::is_load_tr_supported ( )
inline

◆ is_nested_tuple()

template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::is_nested_tuple ( const tuple< Ts... > &  )
constexpr

◆ is_null_tile_window()

template<typename T >
constexpr CK_TILE_DEVICE auto ck_tile::is_null_tile_window ( const T &  )
constexpr

◆ is_power_of_two_integer()

constexpr CK_TILE_HOST_DEVICE bool ck_tile::is_power_of_two_integer ( int32_t  x)
constexpr

◆ isnan() [1/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const bf8_t x)

◆ isnan() [2/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const bfloat16_t x)

◆ isnan() [3/8]

CK_TILE_HOST_DEVICE bool ck_tile::isnan ( const fp8_t x)

◆ isnan() [4/8]

CK_TILE_DEVICE bool ck_tile::isnan ( double  x)

◆ isnan() [5/8]

CK_TILE_DEVICE bool ck_tile::isnan ( float  x)

◆ isnan() [6/8]

CK_TILE_DEVICE bool ck_tile::isnan ( fp16_t  x)

◆ isnan() [7/8]

CK_TILE_DEVICE bool ck_tile::isnan ( int32_t  x)

◆ isnan() [8/8]

CK_TILE_DEVICE bool ck_tile::isnan ( int8_t  x)

◆ kentry()

template<int MinBlockPerCu, typename Kernel , typename... Args>
__global__ void ck_tile::kentry ( Args...  args)

◆ launch_and_check()

template<typename... Callables>
CK_TILE_HOST void ck_tile::launch_and_check ( const stream_config sc,
Callables &&...  callables 
)

◆ launch_kernel()

template<typename... Callables>
CK_TILE_HOST float ck_tile::launch_kernel ( const stream_config s,
Callables &&...  callables 
)

◆ launch_kernel_time_mask()

template<typename PreprocessFunc , typename... Callables>
CK_TILE_HOST float ck_tile::launch_kernel_time_mask ( const stream_config s,
PreprocessFunc  preprocess,
Callables &&...  callables 
)

◆ lcm()

template<typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::lcm ( x,
y 
)
constexpr

◆ lds_load_fence()

CK_TILE_DEVICE void ck_tile::lds_load_fence ( index_t  cnt = 0)

◆ less()

__host__ __device__ ck_tile::less ( ) -> less< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ less_equal()

__host__ __device__ ck_tile::less_equal ( ) -> less_equal< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()

CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 ( fp16x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()

CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_i32()

CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()

CK_TILE_DEVICE_EXTERN double ck_tile::llvm_amdgcn_raw_buffer_atomic_max_fp64 ( double  vdata,
int32x4_t  rsrc,
int  voffset,
int  soffset,
int  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16()

CK_TILE_DEVICE_EXTERN _Float16 ck_tile::llvm_amdgcn_raw_buffer_load_fp16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x2()

CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x4()

CK_TILE_DEVICE_EXTERN fp16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32()

CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_load_fp32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32x2()

CK_TILE_DEVICE_EXTERN fp32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32x4()

CK_TILE_DEVICE_EXTERN fp32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16()

CK_TILE_DEVICE_EXTERN int16_t ck_tile::llvm_amdgcn_raw_buffer_load_i16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x2()

CK_TILE_DEVICE_EXTERN int16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x4()

CK_TILE_DEVICE_EXTERN int16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32()

CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_load_i32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x2()

CK_TILE_DEVICE_EXTERN int32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x4()

CK_TILE_DEVICE_EXTERN int32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8()

CK_TILE_DEVICE_EXTERN int8_t ck_tile::llvm_amdgcn_raw_buffer_load_i8 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x2()

CK_TILE_DEVICE_EXTERN int8x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x4()

CK_TILE_DEVICE_EXTERN int8x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_lds()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_load_lds ( int32x4_t  rsrc,
as3_uint32_ptr  lds_ptr,
index_t  size,
index_t  voffset,
index_t  soffset,
index_t  offset,
index_t  aux 
)

◆ llvm_amdgcn_raw_buffer_store_fp16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16 ( _Float16  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x2 ( fp16x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x4 ( fp16x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x2 ( fp32x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x4 ( fp32x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16 ( int16_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x2 ( int16x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x4 ( int16x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x2 ( int32x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x4 ( int32x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8 ( int8_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x2 ( int8x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x4 ( int8x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_ui16()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16 ( uint16_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_ui16x2()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x2 ( uint16x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_ui16x4()

CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x4 ( uint16x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ load_tile() [1/3]

template<typename WindowLengths >
CK_TILE_DEVICE auto ck_tile::load_tile ( const null_tile_window< WindowLengths > &  )

◆ load_tile() [2/3]

template<typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::load_tile ( const TileWindow_ &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {} 
)

◆ load_tile() [3/3]

template<typename DistributedTensor_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto ck_tile::load_tile ( DistributedTensor_ &  dst_tile,
const TileWindow_ &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {} 
)

◆ load_tile_raw() [1/3]

template<typename T , typename WindowLengths >
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T &  ,
const null_tile_window< WindowLengths > &   
)

◆ load_tile_raw() [2/3]

template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T &  tile,
const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {},
bool_constant< pre_nop >  = {} 
)

◆ load_tile_raw() [3/3]

template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::load_tile_raw ( T &  tile,
const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {},
bool_constant< pre_nop >  = {} 
)

Loads a tile of data using inline assembly.

Note
Bear in mind that when loading data this way, you have to manually initialize your thread buffer and synchronize the load afterwards in order to make sure it's done before using the loaded data from registers
See also
tile_window_with_static_distribution::init_raw() and buffer_view.hpp
buffer_load_fence()

◆ load_tile_transpose()

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>, typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_, typename BottomTensorView_::DataType, Policy>::distr_encoding_valid, Policy>>
CK_TILE_DEVICE auto ck_tile::load_tile_transpose ( const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window)

Transpose-loads a tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. SFINAE is used to ensure the tile distribution encoding is valid.

This function is intended for use with statically distributed tensor tiles, where the input and output tile distributions differ due to the transpose operation. It ensures that the element space size and vector length remain consistent between the input and output distributions.

Template Parameters
BottomTensorView_The type of the bottom tensor view.
WindowLengths_The type representing the window lengths.
TileDistribution_The type representing the tile distribution.
NumCoordThe number of coordinates (dimensions).
PolicyThe transpose policy to use (defaults to DefaultTranspose). The last (unnamed) template parameter uses SFINAE to ensure the tile distribution encoding is valid.
Parameters
tile_windowThe tile window with static distribution to load and transpose.
Returns
A statically distributed tensor containing the transposed tile data.
Note
  • The function uses compile-time checks to ensure the input and output tile distributions are compatible in terms of element space size and vector length.
  • The transpose operation is performed according to the specified Policy.

◆ log() [1/3]

CK_TILE_DEVICE bfloat16_t ck_tile::log ( bfloat16_t  x)

◆ log() [2/3]

template<typename T >
CK_TILE_HOST T ck_tile::log ( x)

◆ log() [3/3]

template<typename T >
CK_TILE_DEVICE T ck_tile::log ( x)

◆ log< double >()

template<>
CK_TILE_DEVICE double ck_tile::log< double > ( double  x)

◆ log< float >()

template<>
CK_TILE_DEVICE float ck_tile::log< float > ( float  x)

◆ log< fp16_t >()

template<>
CK_TILE_DEVICE fp16_t ck_tile::log< fp16_t > ( fp16_t  x)

◆ LogRange()

template<typename Range >
CK_TILE_HOST std::ostream& ck_tile::LogRange ( std::ostream &  os,
Range &&  range,
std::string  delim,
int  precision = std::cout.precision(),
int  width = 0 
)

◆ LogRangeAsType()

template<typename T , typename Range >
CK_TILE_HOST std::ostream& ck_tile::LogRangeAsType ( std::ostream &  os,
Range &&  range,
std::string  delim,
int  precision = std::cout.precision(),
int  width = 0 
)

◆ m0_inc_with_memory()

CK_TILE_DEVICE void ck_tile::m0_inc_with_memory ( index_t  v)

◆ m0_set_with_memory()

CK_TILE_DEVICE void ck_tile::m0_set_with_memory ( index_t  v)

◆ make_alibi_from_lr_mask()

template<typename DataType , bool RowMajor = true, unsigned LogMaxSadOprndSize = 16>
CK_TILE_HOST_DEVICE auto ck_tile::make_alibi_from_lr_mask ( DataType  slope,
index_t  window_left_size,
index_t  window_right_size,
index_t  y_total,
index_t  x_total,
GenericAttentionMaskEnum  mask_enum 
)

◆ make_array()

template<typename D = void, typename... Ts>
constexpr CK_TILE_HOST_DEVICE details::return_type<D, Ts...> ck_tile::make_array ( Ts &&...  ts)
constexpr

◆ make_array_with()

template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_array_with ( std::initializer_list< T >  ilist)
constexpr

◆ make_buffer_view() [1/2]

template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_buffer_view ( T *__restrict__  p,
BufferSizeType  buffer_size 
)
constexpr

◆ make_buffer_view() [2/2]

template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType , typename X , typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_buffer_view ( T *__restrict__  p,
BufferSizeType  buffer_size,
X  invalid_element_value 
)
constexpr

◆ make_cluster_descriptor()

template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_cluster_descriptor ( const Lengths &  lengths,
ArrangeOrder  order = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type{} 
)
constexpr

◆ make_embed_transform()

template<typename UpLengths , typename Coefficients , typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_embed_transform ( const UpLengths &  up_lengths,
const Coefficients &  coefficients 
)
constexpr

◆ make_freeze_transform()

template<typename LowerIndex >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_freeze_transform ( const LowerIndex &  low_idx)
constexpr

◆ make_generic_attention_mask_coordinates_from_lr_window()

constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_generic_attention_mask_coordinates_from_lr_window ( index_t  left_size,
index_t  right_size,
index_t  y_total,
index_t  x_total,
bool  is_top_left = true 
)
constexpr

◆ make_generic_attention_mask_from_lr_window()

template<typename MaskType >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_generic_attention_mask_from_lr_window ( index_t  left_size,
index_t  right_size,
index_t  y_total,
index_t  x_total,
bool  is_top_left = true 
)
constexpr

◆ make_indexing_transform()

template<typename UpLength , typename Indices >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_indexing_transform ( const UpLength &  up_lengths,
const Indices &  indices 
)
constexpr

◆ make_indexing_transform_with_adaptor()

template<typename UpLength , typename IndexingAdaptor >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_indexing_transform_with_adaptor ( const UpLength &  up_lengths,
const IndexingAdaptor &  iadaptor 
)
constexpr

◆ make_insert_transform()

template<typename UpperIndex >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_insert_transform ( const UpperIndex &  up_idx)
constexpr

◆ make_kernel()

template<int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename KernelImpl , typename... Args>
CK_TILE_HOST auto ck_tile::make_kernel ( KernelImpl  ,
dim3  grid_dim,
dim3  block_dim,
std::size_t  lds_byte,
Args...  args 
)

◆ make_left_pad_transform()

template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_left_pad_transform ( const LowLength &  low_length,
const LeftPadLength &  left_pad_,
bool_constant< SkipIsValidCheck >  = bool_constant<false>{} 
)
constexpr

◆ make_merge_transform()

template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_merge_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v2_magic_division()

template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_merge_transform_v2_magic_division ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v3_division_mod()

template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_merge_transform_v3_division_mod ( const LowLengths &  low_lengths)
constexpr

◆ make_modulo_transform()

template<typename Modulus , typename UpLength >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_modulo_transform ( const Modulus &  modulus,
const UpLength &  up_length 
)
constexpr

◆ make_multi_index()

template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_multi_index ( Xs &&...  xs)
constexpr

◆ make_naive_tensor_descriptor()

template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_descriptor ( const tuple< Lengths... > &  lengths,
const tuple< Strides... > &  strides,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{},
number< GuaranteedLastDimensionVectorStride >  = number<-1>{} 
)
constexpr

◆ make_naive_tensor_descriptor_aligned()

template<typename... Lengths, typename Align >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_descriptor_aligned ( const tuple< Lengths... > &  lengths,
Align  align 
)
constexpr

◆ make_naive_tensor_descriptor_packed()

template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_descriptor_packed ( const tuple< Lengths... > &  lengths,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{} 
)
constexpr

◆ make_naive_tensor_descriptor_packed_with_offset()

template<typename... Lengths, typename... Strides, typename Offset , index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_descriptor_packed_with_offset ( const tuple< Lengths... > &  lengths,
const Offset &  offset,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{} 
)
constexpr

◆ make_naive_tensor_descriptor_with_offset()

template<typename... Lengths, typename... Strides, typename offset , index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_descriptor_with_offset ( const tuple< Lengths... > &  lengths,
const tuple< Strides... > &  strides,
const offset &  os,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{},
number< GuaranteedLastDimensionVectorStride >  = number<-1>{} 
)
constexpr

◆ make_naive_tensor_view()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_view ( DataType *__restrict__  p,
const tuple< Lengths... > &  lengths,
const tuple< Strides... > &  strides,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{},
number< GuaranteedLastDimensionVectorStride >  = number<-1>{} 
)
constexpr

◆ make_naive_tensor_view_packed()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_naive_tensor_view_packed ( DataType *__restrict__  p,
const tuple< Lengths... > &  lengths,
number< GuaranteedLastDimensionVectorLength >  = number<-1>{} 
)
constexpr

◆ make_null_tile_window()

template<typename WindowLengths >
constexpr CK_TILE_DEVICE auto ck_tile::make_null_tile_window ( const WindowLengths &  window_lengths)
constexpr

◆ make_offset_transform()

template<typename LowLength , typename OffsetLength >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_offset_transform ( const LowLength &  low_length,
const OffsetLength &  offset_length 
)
constexpr

◆ make_pad_transform()

template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_pad_transform ( const LowLength &  low_length,
const LeftPad &  left_pad,
const RightPad &  right_pad,
bool_constant< SkipIsValidCheck >  = bool_constant<false>{} 
)
constexpr

◆ make_page_block_navigator() [1/2]

template<typename TensorView >
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator ( const TensorView &  tensor_view)

◆ make_page_block_navigator() [2/2]

template<typename DataType , index_t VirtualDim, typename TensorView >
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator ( copy_const_t< DataType, void > *  physical_blocks,
long_index_t  block_stride,
long_index_t  fixed_offset,
const int32_t *  physical_block_indices,
index_t  num_blocks,
index_t  page_block_size,
const TensorView &  complete_view,
const TensorView &  last_view 
)

◆ make_ParallelTensorFunctor()

template<typename F , typename... Xs>
CK_TILE_HOST auto ck_tile::make_ParallelTensorFunctor ( f,
Xs...  xs 
)

◆ make_pass_through_transform()

template<typename LowLength >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_pass_through_transform ( const LowLength &  low_length)
constexpr

◆ make_replicate_transform()

template<typename UpLengths >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_replicate_transform ( const UpLengths &  up_lengths)
constexpr

◆ make_right_pad_transform()

template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_right_pad_transform ( const LowLength &  low_length,
const RightPadLength &  right_pad_,
bool_constant< SkipIsValidCheck >  = bool_constant<false>{} 
)
constexpr

◆ make_sequence()

template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_sequence ( number< Is >  ...)
constexpr

◆ make_single_stage_tensor_adaptor()

template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_single_stage_tensor_adaptor ( const Transforms &  transforms,
LowerDimensionOldTopIdss  ,
UpperDimensionNewTopIdss   
)
constexpr

◆ make_slice_transform()

template<typename LowLength , typename SliceBegin , typename SliceEnd >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_slice_transform ( const LowLength &  low_length,
const SliceBegin &  slice_begin,
const SliceEnd &  slice_end 
)
constexpr

◆ make_static_distributed_tensor() [1/2]

template<typename DataType , typename StaticTileDistribution >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_static_distributed_tensor ( const StaticTileDistribution &  )
constexpr

◆ make_static_distributed_tensor() [2/2]

template<typename DataType , typename StaticTileDistribution , typename ThreadBuffer >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_static_distributed_tensor ( const StaticTileDistribution &  ,
ThreadBuffer &&  thread_buffer_ 
)
constexpr

◆ make_static_tile_distribution()

template<typename StaticTileDistributionEncoding_ >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_static_tile_distribution ( StaticTileDistributionEncoding_  )
constexpr

◆ make_tensor_adaptor_coordinate()

template<typename Adaptor , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_tensor_adaptor_coordinate ( const Adaptor &  adaptor,
const TopIndex &  idx_top 
)
constexpr

◆ make_tensor_coordinate()

template<typename TensorDesc , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_tensor_coordinate ( const TensorDesc &  tensor_desc,
const TopIndex &  idx_top 
)
constexpr

◆ make_tensor_descriptor_from_adaptor()

template<typename Adaptor , typename ElementSpaceSize >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_tensor_descriptor_from_adaptor ( const Adaptor &  adaptor,
const ElementSpaceSize &  element_space_size 
)
constexpr

◆ make_tensor_view()

template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_tensor_view ( DataType *__restrict__  p,
const tensor_descriptor< Ts... > &  desc 
)
constexpr

◆ make_thread_buffer()

template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_thread_buffer ( Ts &&...  ts)
constexpr

◆ make_tile_scatter_gather() [1/6]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , typename StaticValidArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
const StaticPageIndexArray_ &  page_idx,
const StaticValidArray_ &  valids,
number< HsGatherDim >  = {},
number< NumCoord >  = {} 
)
constexpr

◆ make_tile_scatter_gather() [2/6]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
const StaticPageIndexArray_ &  page_idx,
number< HsGatherDim >  = {},
number< NumCoord >  = {} 
)
constexpr

◆ make_tile_scatter_gather() [3/6]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const multi_index< TensorView::get_num_of_dimension()> &  origin,
const StaticTileDistribution &  tile_distribution,
const StaticPageIndexArray &  page_idx,
const StaticValidArray &  valids,
number< HsGatherDim >  = {} 
)
constexpr

◆ make_tile_scatter_gather() [4/6]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const multi_index< TensorView::get_num_of_dimension()> &  origin,
const StaticTileDistribution &  tile_distribution,
const StaticPageIndexArray &  page_idx,
number< HsGatherDim >  = {} 
)
constexpr

◆ make_tile_scatter_gather() [5/6]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const StaticTileDistribution &  tile_distribution,
const StaticPageIndexArray &  page_idx,
const StaticValidArray &  valids,
number< HsGatherDim >  = {} 
)
constexpr

◆ make_tile_scatter_gather() [6/6]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_scatter_gather ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const StaticTileDistribution &  tile_distribution,
const StaticPageIndexArray &  page_idx,
number< HsGatherDim >  = {} 
)
constexpr

◆ make_tile_window() [1/7]

template<typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const null_tile_window< WindowLengths > &  t,
const StaticTileDistribution &   
)
constexpr

◆ make_tile_window() [2/7]

template<typename TensorView_ , typename WindowLengths_ >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin 
)
constexpr

◆ make_tile_window() [3/7]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
number< NumCoord >  = {} 
)
constexpr

◆ make_tile_window() [4/7]

template<typename TensorView , typename WindowLengths >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const multi_index< TensorView::get_num_of_dimension()> &  origin 
)
constexpr

◆ make_tile_window() [5/7]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const multi_index< TensorView::get_num_of_dimension()> &  origin,
const StaticTileDistribution &  tile_distribution 
)
constexpr

◆ make_tile_window() [6/7]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const StaticTileDistribution &  tile_distribution 
)
constexpr

◆ make_tile_window() [7/7]

template<typename WindowLengths , typename... Ts>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window ( null_tensor_view  ,
const WindowLengths &  window_lengths,
const multi_index< WindowLengths::size()> &  ,
Ts &&  ... 
)
constexpr

◆ make_tile_window_linear() [1/2]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window_linear ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
LinearBottomDims_  = {} 
)
constexpr

◆ make_tile_window_linear() [2/2]

template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window_linear ( const TileWindow_ &  tile_window,
const StaticTileDistribution_ &  tile_distribution,
LinearBottomDims_  = {} 
)
constexpr

◆ make_tile_window_linear_raw() [1/2]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE auto ck_tile::make_tile_window_linear_raw ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
LinearBottomDims_  = {} 
)

◆ make_tile_window_linear_raw() [2/2]

template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window_linear_raw ( const TileWindow_ &  tile_window,
const StaticTileDistribution_ &  tile_distribution,
LinearBottomDims_  = {} 
)
constexpr

◆ make_tile_window_raw() [1/2]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1>
CK_TILE_DEVICE auto ck_tile::make_tile_window_raw ( const TensorView_ &  tensor_view,
const WindowLengths_ &  window_lengths,
const multi_index< TensorView_::get_num_of_dimension()> &  origin,
const StaticTileDistribution_ &  tile_distribution,
number< NumCoord >  = {} 
)

◆ make_tile_window_raw() [2/2]

template<typename TensorView , typename WindowLengths , typename StaticTileDistribution >
constexpr CK_TILE_DEVICE auto ck_tile::make_tile_window_raw ( const tile_window_with_static_lengths< TensorView, WindowLengths > &  tile_window,
const StaticTileDistribution &  tile_distribution 
)
constexpr

◆ make_transposed_distr_encode()

template<typename T , index_t LaneGroupSize, index_t kOuterDistDim0, index_t kOuterDistDim1, index_t kInnerDistDim0, index_t kInnerDistDim1>
constexpr CK_TILE_DEVICE auto ck_tile::make_transposed_distr_encode ( )
constexpr

◆ make_tuple()

template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_tuple ( Xs &&...  xs)
constexpr

◆ make_unmerge_transform()

template<typename UpLengths , bool Use24BitIntegerCalculation = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_unmerge_transform ( const UpLengths &  up_lengths,
bool_constant< Use24BitIntegerCalculation >  = bool_constant<false>{} 
)
constexpr

◆ make_wave_buffer_resource()

CK_TILE_DEVICE int32x4_t ck_tile::make_wave_buffer_resource ( const void *  ptr,
uint32_t  size = 0xffffffff 
)

◆ make_xor_transform()

template<typename LowLengths >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_xor_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_zero_multi_index()

template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::make_zero_multi_index ( )
constexpr

◆ makeTuple()

template<size_t... Idx>
constexpr std::tuple<std::integral_constant<size_t, Idx>...> ck_tile::makeTuple ( std::index_sequence< Idx... >  )
constexpr noexcept

◆ max() [1/8]

template<>
constexpr CK_TILE_DEVICE double ck_tile::max ( double  x,
double  y 
)
constexpr

◆ max() [2/8]

template<>
constexpr CK_TILE_DEVICE float ck_tile::max ( float  x,
float  y 
)
constexpr

◆ max() [3/8]

template<index_t Y>
constexpr CK_TILE_HOST_DEVICE index_t ck_tile::max ( index_t  x,
number< Y >   
)
constexpr

◆ max() [4/8]

template<index_t X>
constexpr CK_TILE_HOST_DEVICE index_t ck_tile::max ( number< X >  ,
index_t  y 
)
constexpr

◆ max() [5/8]

template<typename T >
constexpr CK_TILE_HOST_DEVICE T ck_tile::max ( x)
constexpr

◆ max() [6/8]

template<typename T >
constexpr CK_TILE_HOST T ck_tile::max ( x,
y 
)
constexpr

◆ max() [7/8]

template<typename T >
constexpr CK_TILE_DEVICE T ck_tile::max ( x,
y 
)
constexpr

◆ max() [8/8]

template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::max ( x,
Ys...  ys 
)
constexpr

◆ merge_sequences()

template<typename... Seqs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::merge_sequences ( Seqs...  )
constexpr

◆ min() [1/8]

template<>
constexpr CK_TILE_DEVICE double ck_tile::min ( double  x,
double  y 
)
constexpr

◆ min() [2/8]

template<>
constexpr CK_TILE_DEVICE float ck_tile::min ( float  x,
float  y 
)
constexpr

◆ min() [3/8]

template<index_t Y>
constexpr CK_TILE_HOST_DEVICE index_t ck_tile::min ( index_t  x,
number< Y >   
)
constexpr

◆ min() [4/8]

template<index_t X>
constexpr CK_TILE_HOST_DEVICE index_t ck_tile::min ( number< X >  ,
index_t  y 
)
constexpr

◆ min() [5/8]

template<typename T >
constexpr CK_TILE_HOST_DEVICE T ck_tile::min ( x)
constexpr

◆ min() [6/8]

template<typename T >
constexpr CK_TILE_HOST T ck_tile::min ( x,
y 
)
constexpr

◆ min() [7/8]

template<typename T >
constexpr CK_TILE_DEVICE T ck_tile::min ( x,
y 
)
constexpr

◆ min() [8/8]

template<typename X , typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::min ( x,
Ys...  ys 
)
constexpr

◆ minus()

__host__ __device__ ck_tile::minus ( ) -> minus< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ modify_sequence_elements_by_ids()

template<typename Seq , typename Values , typename Ids >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::modify_sequence_elements_by_ids ( Seq  ,
Values  ,
Ids   
)
constexpr

◆ moe_sorting_get_smem_row_col()

constexpr CK_TILE_HOST auto ck_tile::moe_sorting_get_smem_row_col ( int  tokens_,
int  num_experts_ 
)
constexpr

◆ moe_sorting_get_sub_token()

CK_TILE_HOST index_t ck_tile::moe_sorting_get_sub_token ( int  tokens_,
int  num_experts_ 
)

◆ moe_sorting_get_workspace_size()

CK_TILE_HOST index_t ck_tile::moe_sorting_get_workspace_size ( int  tokens_,
int  num_experts_,
int  topk_,
int  dispatch_policy_ 
)

◆ moe_sorting_is_oneshot()

CK_TILE_HOST bool ck_tile::moe_sorting_is_oneshot ( int  tokens_,
int  num_experts_ 
)

◆ moe_sorting_mp_get_workspace_size()

CK_TILE_HOST index_t ck_tile::moe_sorting_mp_get_workspace_size ( int  tokens_,
int  num_experts_,
int  topk_ 
)

◆ move_tensor_adaptor_coordinate() [1/2]

template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex >
constexpr CK_TILE_HOST_DEVICE void ck_tile::move_tensor_adaptor_coordinate ( const Adaptor &  adaptor,
AdaptorCoord &  coord,
const TopIndex &  idx_diff_top 
)
constexpr

◆ move_tensor_adaptor_coordinate() [2/2]

template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex , typename BottomIndex >
constexpr CK_TILE_HOST_DEVICE void ck_tile::move_tensor_adaptor_coordinate ( const Adaptor &  adaptor,
AdaptorCoord &  coord,
const TopIndex &  idx_diff_top,
BottomIndex &  idx_diff_bottom 
)
constexpr

◆ move_tensor_coordinate()

template<bool JudgeDoTransforms = true, typename TensorDesc , typename TensorCoord , typename Index >
constexpr CK_TILE_HOST_DEVICE void ck_tile::move_tensor_coordinate ( const TensorDesc &  tensor_desc,
TensorCoord &  coord,
const Index &  coord_step 
)
constexpr

◆ move_tile_window() [1/5]

template<typename WindowLengths >
CK_TILE_DEVICE void ck_tile::move_tile_window ( null_tile_window< WindowLengths > &  ,
const typename null_tile_window< WindowLengths >::BottomTensorIndex &   
)

◆ move_tile_window() [2/5]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ >
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > &  window,
const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex &  step 
)

◆ move_tile_window() [3/5]

template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord>
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &  window,
const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &  step 
)

◆ move_tile_window() [4/5]

template<typename TensorView_ , typename WindowLengths_ >
CK_TILE_DEVICE void ck_tile::move_tile_window ( tile_window_with_static_lengths< TensorView_, WindowLengths_ > &  window,
const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex &  step 
)

◆ move_tile_window() [5/5]

template<typename TileWindow_ >
CK_TILE_DEVICE void ck_tile::move_tile_window ( TileWindow_ &  window,
const typename TileWindow_::BottomTensorIndex &  step 
)

◆ multiplies()

__host__ __device__ ck_tile::multiplies ( ) -> multiplies< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ naive_attention_fwd()

CK_TILE_HOST float ck_tile::naive_attention_fwd ( naive_attention_fwd_traits  t,
naive_attention_fwd_args  a,
ck_tile::stream_config  s 
)

◆ naive_gemm_kernel()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
__global__ void ck_tile::naive_gemm_kernel ( ADataType *  A,
BDataType *  B,
CDataType *  C,
ck_tile::index_t  M,
ck_tile::index_t  N,
ck_tile::index_t  K,
ck_tile::index_t  strideA,
ck_tile::index_t  strideB,
ck_tile::index_t  strideC 
)

◆ neg() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::neg ( x)

◆ neg() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::neg ( x)

◆ neg< double >()

template<>
CK_TILE_DEVICE double ck_tile::neg< double > ( double  x)

◆ neg< float >()

template<>
CK_TILE_DEVICE float ck_tile::neg< float > ( float  x)

◆ neg< fp16_t >()

◆ neg< int32_t >()

◆ neg< int8_t >()

◆ next_power_of_two() [1/3]

template<index_t X>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::next_power_of_two ( )
constexpr

◆ next_power_of_two() [2/3]

constexpr CK_TILE_HOST_DEVICE int32_t ck_tile::next_power_of_two ( int32_t  x)
constexpr

◆ next_power_of_two() [3/3]

template<index_t X>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::next_power_of_two ( number< X >  )
constexpr

◆ operator!=() [1/3]

template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator!= ( const array< T, Size > &  a,
const array< T, Size > &  b 
)
constexpr

◆ operator!=() [2/3]

template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator!= ( const tuple< Xs... > &  a,
const tuple< Xs... > &  b 
)
constexpr

◆ operator!=() [3/3]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator!= ( sequence< Xs... >  x,
sequence< Ys... >  y 
)
constexpr

◆ operator%() [1/3]

template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator% ( number< Y >  ,
sequence< Xs... >   
)
constexpr

◆ operator%() [2/3]

template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator% ( sequence< Xs... >  ,
number< Y >   
)
constexpr

◆ operator%() [3/3]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator% ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ operator*() [1/10]

template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( const multi_index< NSize > &  a,
const T &  b 
)
constexpr

◆ operator*() [2/10]

template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( const multi_index< NSize > &  x,
index_t  a 
)
constexpr

◆ operator*() [3/10]

template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( const tuple< Xs... > &  x,
const tuple< Ys... > &  y 
)
constexpr

◆ operator*() [4/10]

template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( const tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator*() [5/10]

template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( const tuple< Xs... > &  x,
a 
)
constexpr

◆ operator*() [6/10]

template<index_t NSize>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( index_t  a,
const multi_index< NSize > &  x 
)
constexpr

◆ operator*() [7/10]

template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( number< Y >  ,
sequence< Xs... >   
)
constexpr

◆ operator*() [8/10]

template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( sequence< Xs... >  ,
number< Y >   
)
constexpr

◆ operator*() [9/10]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ operator*() [10/10]

template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator* ( a,
const tuple< Xs... > &  x 
)
constexpr

◆ operator+() [1/6]

template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( const multi_index< NSize > &  a,
const T &  b 
)
constexpr

◆ operator+() [2/6]

template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( const tuple< Xs... > &  x,
const tuple< Ys... > &  y 
)
constexpr

◆ operator+() [3/6]

template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( const tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator+() [4/6]

template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( number< Y >  ,
sequence< Xs... >   
)
constexpr

◆ operator+() [5/6]

template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( sequence< Xs... >  ,
number< Y >   
)
constexpr

◆ operator+() [6/6]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+ ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ operator+=() [1/2]

template<index_t NSize, typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+= ( multi_index< NSize > &  y,
const X &  x 
)
constexpr

◆ operator+=() [2/2]

template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator+= ( tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator-() [1/6]

template<index_t NSize, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( const multi_index< NSize > &  a,
const T &  b 
)
constexpr

◆ operator-() [2/6]

template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( const tuple< Xs... > &  x,
const tuple< Ys... > &  y 
)
constexpr

◆ operator-() [3/6]

template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( const tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator-() [4/6]

template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( number< Y >  ,
sequence< Xs... >   
)
constexpr

◆ operator-() [5/6]

template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( sequence< Xs... >  ,
number< Y >   
)
constexpr

◆ operator-() [6/6]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator- ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ operator-=() [1/2]

template<index_t NSize, typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator-= ( multi_index< NSize > &  y,
const X &  x 
)
constexpr

◆ operator-=() [2/2]

template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator-= ( tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator/() [1/4]

template<typename... Xs, typename... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator/ ( const tuple< Xs... > &  x,
const tuple< Ys... > &  y 
)
constexpr

◆ operator/() [2/4]

template<index_t Y, index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator/ ( number< Y >  ,
sequence< Xs... >   
)
constexpr

◆ operator/() [3/4]

template<index_t... Xs, index_t Y>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator/ ( sequence< Xs... >  ,
number< Y >   
)
constexpr

◆ operator/() [4/4]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::operator/ ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ operator<<()

template<typename T >
std::ostream& ck_tile::operator<< ( std::ostream &  os,
const std::vector< T > &  v 
)

Stream operator overload for vector output.

Provides a formatted string representation of a vector, useful for debugging and logging.

Template Parameters
TType of vector elements
Parameters
osOutput stream
vVector to output
Returns
Reference to the output stream

◆ operator==() [1/3]

template<typename T , index_t Size>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator== ( const array< T, Size > &  a,
const array< T, Size > &  b 
)
constexpr

◆ operator==() [2/3]

template<typename... Xs>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator== ( const tuple< Xs... > &  a,
const tuple< Xs... > &  b 
)
constexpr

◆ operator==() [3/3]

template<index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE bool ck_tile::operator== ( sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ pad_tensor_view()

template<typename TensorView , typename TileLengths , typename DoPads >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::pad_tensor_view ( const TensorView &  tensor_view,
const TileLengths &  tile_lengths,
DoPads   
)
constexpr

◆ pick_sequence_elements_by_ids()

template<typename Seq , index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::pick_sequence_elements_by_ids ( Seq  ,
sequence< Is... >   
)
constexpr

◆ pick_sequence_elements_by_mask()

template<typename Seq , typename Mask >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::pick_sequence_elements_by_mask ( Seq  ,
Mask   
)
constexpr

◆ pk_add_f16()

CK_TILE_DEVICE fp16x2_t ck_tile::pk_add_f16 ( const fp16x2_t x,
const fp16x2_t y 
)

◆ pk_fp4_to_bf16()

constexpr CK_TILE_HOST_DEVICE bf16_t ck_tile::pk_fp4_to_bf16 ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_fp4_to_bf16x2()

constexpr CK_TILE_HOST_DEVICE bf16x2_t ck_tile::pk_fp4_to_bf16x2 ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_fp4_to_float()

constexpr CK_TILE_HOST_DEVICE float ck_tile::pk_fp4_to_float ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_fp4_to_fp16()

constexpr CK_TILE_HOST_DEVICE fp16_t ck_tile::pk_fp4_to_fp16 ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_fp4_to_fp16x2()

constexpr CK_TILE_HOST_DEVICE fp16x2_t ck_tile::pk_fp4_to_fp16x2 ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_fp4_to_fp32x2()

constexpr CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_fp4_to_fp32x2 ( const pk_fp4_t x,
float  scale 
)
constexpr

◆ pk_int4_t_to_bfloat16x2_t()

CK_TILE_HOST_DEVICE bf16x2_t ck_tile::pk_int4_t_to_bfloat16x2_t ( const pk_int4_t x)

◆ pk_int4_t_to_fp32x2_t()

CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t ( const pk_int4_t x)

◆ pk_int4_t_to_fp32x2_t_signed_conversion()

CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t_signed_conversion ( const pk_int4_t x)

◆ pk_int4_t_to_halfx2_t()

CK_TILE_HOST_DEVICE fp16x2_t ck_tile::pk_int4_t_to_halfx2_t ( const pk_int4_t x)

◆ pk_int4_t_to_int8x2_t()

CK_TILE_HOST_DEVICE int8x2_t ck_tile::pk_int4_t_to_int8x2_t ( const pk_int4_t x)

◆ plus()

__host__ __device__ ck_tile::plus ( ) -> plus< void, void >

FIXME: create macro to replace 'host device' and nothing more.

◆ pow() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::pow ( x,
gamma 
)

◆ pow() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::pow ( x,
gamma 
)

◆ pow< double >()

template<>
CK_TILE_DEVICE double ck_tile::pow< double > ( double  x,
double  gamma 
)

◆ pow< float >()

template<>
CK_TILE_DEVICE float ck_tile::pow< float > ( float  x,
float  gamma 
)

◆ prefix_sum_sequence()

template<typename Seq >
constexpr auto ck_tile::prefix_sum_sequence ( Seq  )
constexpr

◆ preprocess_profiling_impl()

template<typename TimerType , typename PreprocessFunc >
CK_TILE_HOST double ck_tile::preprocess_profiling_impl ( TimerType  timer,
const stream_config s,
PreprocessFunc  preprocess 
)

◆ print() [1/14]

template<address_space_enum BufferAddressSpace, typename T , typename BufferSizeType , bool InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum Coherence>
CK_TILE_HOST_DEVICE void ck_tile::print ( const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > &  bv)

◆ print() [2/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const char &  value)

Specialization for char.

◆ print() [3/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const double &  value)

Specialization for double.

◆ print() [4/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const float &  value)

Specialization for float.

◆ print() [5/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const int &  value)

Specialization for int.

◆ print() [6/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const long &  value)

Specialization for long.

◆ print() [7/14]

template<typename T >
CK_TILE_HOST_DEVICE void ck_tile::print ( const T &  )

Declare a ck_tile::print() interface that gets specialized in each header file for types that can be printed.

◆ print() [8/14]

template<typename T , size_t N>
CK_TILE_HOST_DEVICE void ck_tile::print ( const T(&)  value[N])

Specialization for array.

◆ print() [9/14]

template<typename PsYs2XsAdaptor_ , typename Ys2DDescriptor_ , typename StaticTileDistributionEncoding_ , typename TileDistributionDetail_ >
CK_TILE_HOST_DEVICE void ck_tile::print ( const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > &  distribution)

◆ print() [10/14]

template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ >
CK_TILE_HOST_DEVICE void ck_tile::print ( const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > &  encoding)

◆ print() [11/14]

template<index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize, tile_distribution_pattern DistributionPattern, index_t NumWaveGroups>
CK_TILE_HOST_DEVICE void ck_tile::print ( const TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &  )

◆ print() [12/14]

template<typename... T>
CK_TILE_HOST_DEVICE void ck_tile::print ( const tuple< T... > &  t)

◆ print() [13/14]

template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ >
CK_TILE_HOST_DEVICE void ck_tile::print ( const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail &  detail_obj)

◆ print() [14/14]

template<>
CK_TILE_HOST_DEVICE void ck_tile::print ( const unsigned int &  value)

Specialization for unsigned int.

◆ rcp() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::rcp ( x)

◆ rcp() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::rcp ( x)

◆ reduce_on_sequence()

template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE index_t ck_tile::reduce_on_sequence ( Seq  ,
Reduce  f,
number< Init >   
)
constexpr

◆ reference_batched_dropout()

template<typename DataType , typename RandValOutputDataType >
CK_TILE_HOST void ck_tile::reference_batched_dropout ( HostTensor< DataType > &  in_out_b_m_n,
const HostTensor< RandValOutputDataType > &  randval_b_m_n,
const uint8_t &  p_undrop_in_uint8_t,
const float  scale 
)

◆ reference_batched_elementwise()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void ck_tile::reference_batched_elementwise ( const HostTensor< ADataType > &  a_b_m_n,
const HostTensor< BDataType > &  b_b_m_n,
HostTensor< CDataType > &  c_b_m_n,
const AElementOp &  a_element_op = {},
const BElementOp &  b_element_op = {},
const BinaryElementOp &  binary_element_op = {} 
)

◆ reference_batched_gemm()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_batched_gemm ( const HostTensor< ADataType > &  a_b_m_k,
const HostTensor< BDataType > &  b_b_n_k,
HostTensor< CDataType > &  c_b_m_n,
const AElementOp &  a_element_op = {},
const BElementOp &  b_element_op = {},
const ACCElementOp &  acc_element_op = {} 
)

◆ reference_batched_gemm_gpu()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
void ck_tile::reference_batched_gemm_gpu ( ADataType *  a_ptr,
BDataType *  b_ptr,
CDataType *  c_ptr,
index_t  M,
index_t  N,
index_t  K,
index_t  stride_a,
index_t  stride_b,
index_t  stride_c,
index_t  batch_stride_A,
index_t  batch_stride_B,
index_t  batch_stride_C,
index_t  batch_count 
)

◆ reference_batched_masking()

template<typename CDataType , typename MaskingType >
CK_TILE_HOST void ck_tile::reference_batched_masking ( HostTensor< CDataType > &  c_b_m_n,
const MaskingType &  mask 
)

◆ reference_batched_rotary_position_embedding()

template<typename DataType , typename ComputeDataType = float>
CK_TILE_HOST void ck_tile::reference_batched_rotary_position_embedding ( const HostTensor< DataType > &  input_bsd,
const HostTensor< DataType > &  cos_sd,
const HostTensor< DataType > &  sin_sd,
bool  interleaved,
HostTensor< DataType > &  output_bsd,
bool  use_1_row_sin_cos = false 
)

◆ reference_batched_softmax()

template<typename ADataType , typename CompDataType , typename BDataType , typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_batched_softmax ( const HostTensor< ADataType > &  a_b_m_n,
HostTensor< BDataType > &  b_b_m_n,
const CompElementOp &  comp_element_op = {},
std::optional< std::reference_wrapper< HostTensor< CompDataType >>>  lse_b_m = std::nullopt 
)

◆ reference_batched_transpose()

template<typename Type >
CK_TILE_HOST void ck_tile::reference_batched_transpose ( const HostTensor< Type > &  x,
HostTensor< Type > &  y,
std::string  layout_in = "NCHW",
std::string  layout_out = "NHWC" 
)

◆ reference_binary_elementwise()

template<typename ADataType , typename BDataType , typename CDataType , typename ComputeDataType , typename ElementOp >
CK_TILE_HOST void ck_tile::reference_binary_elementwise ( const HostTensor< ADataType > &  a,
const HostTensor< BDataType > &  b,
HostTensor< CDataType > &  c,
ElementOp  element_op 
)

◆ reference_fused_moe()

template<typename AccDataType , typename Activation , typename ADataType , typename GDataType , typename DDataType , typename ODataType , typename AScaleDataType , typename GScaleDataType , typename DScaleDataType , typename YSmoothScaleDataType , typename TopkWeightDataType , typename IndexDataType >
void ck_tile::reference_fused_moe ( const ck_tile::HostTensor< ADataType > &  a_host,
const ck_tile::HostTensor< GDataType > &  g_host,
const ck_tile::HostTensor< DDataType > &  d_host,
const ck_tile::HostTensor< AScaleDataType > &  sa_host,
const ck_tile::HostTensor< GScaleDataType > &  sg_host,
const ck_tile::HostTensor< DScaleDataType > &  sd_host,
const ck_tile::HostTensor< YSmoothScaleDataType > &  sy_host,
ck_tile::HostTensor< ODataType > &  o_host,
const ck_tile::HostTensor< IndexDataType > &  sorted_token_ids_host,
const ck_tile::HostTensor< TopkWeightDataType > &  sorted_weight_host,
const ck_tile::HostTensor< IndexDataType > &  sorted_expert_ids_host,
const ck_tile::HostTensor< IndexDataType > &  num_sorted_tiles_host,
const ck_tile::HostTensor< IndexDataType > &  token_ids_host,
ck_tile::index_t  block_m,
ck_tile::index_t  tokens,
ck_tile::index_t  experts,
ck_tile::index_t  hidden_size,
ck_tile::index_t  intermediate_size,
ck_tile::index_t  topk,
ck_tile::index_t  gate_only 
)

◆ reference_gemm()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm ( const HostTensor< ADataType > &  a_m_k,
const HostTensor< BDataType > &  b_k_n,
HostTensor< CDataType > &  c_m_n,
const AElementOp &  a_element_op = {},
const BElementOp &  b_element_op = {},
const ACCElementOp &  acc_element_op = {} 
)

◆ reference_gemm_gpu()

template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC >
void ck_tile::reference_gemm_gpu ( ADataType *  a_ptr,
BDataType *  b_ptr,
CDataType *  c_ptr,
index_t  M,
index_t  N,
index_t  K,
index_t  stride_a,
index_t  stride_b,
index_t  stride_c 
)

◆ reference_gemm_multiple_d()

template<typename ADataType , typename BDataType , typename DsDataType , typename AccDataType , typename CDataType , typename ACCElementOp , typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void ck_tile::reference_gemm_multiple_d ( const HostTensor< ADataType > &  a_m_k,
const HostTensor< BDataType > &  b_k_n,
const std::array< HostTensor< DDataType >, DsDataType::size()> &  ds_m_n,
HostTensor< CDataType > &  c_m_n,
const ACCElementOp &  acc_element_op = {} 
)

◆ reference_gemm_quant()

template<typename ADataType , typename QDataType , typename BDataType , typename AccDataType , typename CDataType , uint32_t QuantGroupSize, bool aquant, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void ck_tile::reference_gemm_quant ( const HostTensor< ADataType > &  a_m_k,
const HostTensor< QDataType > &  q,
const HostTensor< BDataType > &  b_k_n,
HostTensor< CDataType > &  c_m_n,
const AElementOp &  a_element_op = {},
const BElementOp &  b_element_op = {},
const ACCElementOp &  acc_element_op = {} 
)

◆ reference_grouped_conv_bwd_data()

template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_data ( HostTensor< InDataType > &  input,
const HostTensor< WeiDataType > &  weight,
const HostTensor< OutDataType > &  output,
std::vector< ck_tile::long_index_t >  conv_strides,
std::vector< ck_tile::long_index_t >  conv_dilations,
std::vector< ck_tile::long_index_t >  in_left_pads,
std::vector< ck_tile::long_index_t >   
)

◆ reference_grouped_conv_bwd_weight()

template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_weight ( const HostTensor< InDataType > &  input,
HostTensor< WeiDataType > &  weight,
const HostTensor< OutDataType > &  output,
std::vector< ck_tile::long_index_t >  conv_strides,
std::vector< ck_tile::long_index_t >  conv_dilations,
std::vector< ck_tile::long_index_t >  in_left_pads,
std::vector< ck_tile::long_index_t >   
)

◆ reference_grouped_conv_fwd()

template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType >
CK_TILE_HOST void ck_tile::reference_grouped_conv_fwd ( const HostTensor< InDataType > &  input,
const HostTensor< WeiDataType > &  weight,
HostTensor< OutDataType > &  output,
std::vector< ck_tile::long_index_t >  conv_strides,
std::vector< ck_tile::long_index_t >  conv_dilations,
std::vector< ck_tile::long_index_t >  in_left_pads,
std::vector< ck_tile::long_index_t >   
)

◆ reference_im2col()

template<typename InDataType , typename OutDataType , index_t NDimSpatial>
CK_TILE_HOST void ck_tile::reference_im2col ( const HostTensor< InDataType > &  in_host,
HostTensor< OutDataType > &  out_host,
const ck_tile::conv::ConvParam &  conv_params 
)

◆ reference_layernorm2d_fwd()

template<typename XDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename YDataType , typename MeanDataType , typename InvStdDataType , typename Epilogue = reference_layernorm2d_default_epilogue>
void ck_tile::reference_layernorm2d_fwd ( const HostTensor< XDataType > &  x_m_n,
const HostTensor< GammaDataType > &  gamma_n,
const HostTensor< BetaDataType > &  beta_n,
HostTensor< YDataType > &  y_m_n,
HostTensor< MeanDataType > &  mean_m,
HostTensor< InvStdDataType > &  invStd_m,
ComputeDataType  epsilon,
Epilogue  epilogue_functor = {} 
)

◆ reference_moe_sorting()

template<typename WeightType , typename IndexType = index_t>
CK_TILE_HOST void ck_tile::reference_moe_sorting ( const HostTensor< IndexType > &  topk_ids,
const HostTensor< WeightType > &  weights,
const HostTensor< IndexType > &  local_expert_mask,
HostTensor< IndexType > &  p_sorted_token_ids,
HostTensor< WeightType > &  sorted_weight,
HostTensor< IndexType > &  sorted_expert_ids,
index_t unit_cnt,
const index_t  experts,
const index_t  unit_size,
const index_t  tokens,
bool  local_expert_masking,
bool  skip_experts_with_zero_token = true 
)

◆ reference_permute() [1/2]

template<typename DataType >
CK_TILE_HOST void ck_tile::reference_permute ( const HostTensor< DataType > &  x,
HostTensor< DataType > &  y,
std::vector< index_t >  perm 
)

◆ reference_permute() [2/2]

template<typename DataType >
CK_TILE_HOST auto ck_tile::reference_permute ( const HostTensor< DataType > &  x,
std::vector< index_t >  perm 
)

◆ reference_reduce() [1/2]

template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp >
CK_TILE_HOST void ck_tile::reference_reduce ( const HostTensor< XDataType > &  x_m_n,
HostTensor< YDataType > &  y_m,
ReduceOp  reduce_op 
)

◆ reference_reduce() [2/2]

template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp , typename KeptDim , typename ReduceDims >
CK_TILE_HOST void ck_tile::reference_reduce ( const HostTensor< XDataType > &  x_tensor,
HostTensor< YDataType > &  y_tensor,
ReduceOp  reduce_op,
KeptDim  kept_dim,
ReduceDims  reduce_dims 
)

◆ reference_rmsnorm2d_fwd()

template<typename XDataType , typename GammaDataType , typename ComputeDataType , typename YDataType , typename InvRmsDataType , typename UnquantYDataType , typename Epilogue = reference_rmsnorm2d_default_epilogue>
void ck_tile::reference_rmsnorm2d_fwd ( const HostTensor< XDataType > &  x_m_n,
const HostTensor< GammaDataType > &  gamma_n,
HostTensor< YDataType > &  y_m_n,
HostTensor< InvRmsDataType > &  invRms_m,
HostTensor< UnquantYDataType > &  unquant_y_m_n,
ComputeDataType  epsilon,
Epilogue  epilogue_functor = {} 
)

◆ reference_rowwise_quantization2d()

template<typename XDataType , typename ScaleDataType , typename QXDataType >
CK_TILE_HOST void ck_tile::reference_rowwise_quantization2d ( const HostTensor< XDataType > &  x_m_n,
const HostTensor< ScaleDataType > &  scale_m,
HostTensor< QXDataType > &  qx_m_n 
)

◆ reference_softmax() [1/2]

template<typename InputType , typename ComputeType , typename OutputType = ComputeType>
CK_TILE_HOST void ck_tile::reference_softmax ( const HostTensor< InputType > &  x,
HostTensor< OutputType > &  y,
index_t  dim = -1 
)

◆ reference_softmax() [2/2]

template<typename InputType , typename ComputeType , typename OutputType = ComputeType>
CK_TILE_HOST auto ck_tile::reference_softmax ( const HostTensor< InputType > &  x,
index_t  dim = -1 
)

◆ reference_topk() [1/2]

template<typename DataType , typename IndexType = index_t>
CK_TILE_HOST void ck_tile::reference_topk ( const HostTensor< DataType > &  x,
HostTensor< DataType > &  y_values,
HostTensor< IndexType > &  y_indices,
index_t  k,
index_t  dim = -1,
bool  largest = true,
bool  sorted = true 
)

◆ reference_topk() [2/2]

template<typename DataType , typename IndexType = index_t>
CK_TILE_HOST auto ck_tile::reference_topk ( const HostTensor< DataType > &  x,
index_t  k,
index_t  dim = -1,
bool  largest = true,
bool  sorted = true 
)

◆ reference_transpose_elementwise()

template<typename ADataType , typename BDataType >
void ck_tile::reference_transpose_elementwise ( const HostTensor< ADataType > &  a,
HostTensor< BDataType > &  b 
)

◆ reference_unary_elementwise()

template<typename ADataType , typename BDataType , typename ComputeDataType , typename ElementOp >
CK_TILE_HOST void ck_tile::reference_unary_elementwise ( const HostTensor< ADataType > &  a,
HostTensor< BDataType > &  b,
ElementOp  element_op 
)

◆ report_error_stats()

CK_TILE_HOST void ck_tile::report_error_stats ( int  err_count,
double  max_err,
std::size_t  total_size 
)

Report error statistics for numerical comparisons.

Outputs statistics about numerical comparison errors including count and maximum error.

Parameters
err_countNumber of errors found
max_errMaximum error value encountered
total_sizeTotal number of elements compared

◆ reverse_exclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::reverse_exclusive_scan_sequence ( Seq  ,
Reduce  ,
number< Init >   
)
constexpr

◆ reverse_inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::reverse_inclusive_scan_sequence ( Seq  ,
Reduce  ,
number< Init >   
)
constexpr

◆ reverse_slice_sequence()

template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto ck_tile::reverse_slice_sequence ( Seq  ,
number< SliceSize >  ,
Mask  = typename uniform_sequence_gen<Seq::size(), 1>::type{} 
)
constexpr

◆ sad_u16()

CK_TILE_DEVICE uint16_t ck_tile::sad_u16 ( uint16_t  x,
uint16_t  y,
uint16_t  acc 
)

◆ sad_u32()

CK_TILE_HOST uint32_t ck_tile::sad_u32 ( uint32_t  x,
uint32_t  y,
uint32_t  acc 
)

TODO: replace inline asm when intrinsic is available

◆ scaled_type_convert()

template<typename Y , typename X >
constexpr CK_TILE_HOST_DEVICE Y ck_tile::scaled_type_convert ( X  x,
float  scale 
)
constexpr

◆ scales()

template<typename Scale >
__host__ __device__ ck_tile::scales ( Scale  ) -> scales< Scale >

FIXME: create macro to replace 'host device' and nothing more.

◆ sequence_all_of()

template<typename Seq , typename F >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::sequence_all_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_any_of()

template<typename Seq , typename F >
constexpr CK_TILE_HOST_DEVICE bool ck_tile::sequence_any_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_pop_back()

template<typename Seq >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::sequence_pop_back ( Seq  )
constexpr

◆ sequence_pop_front()

template<index_t I, index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::sequence_pop_front ( sequence< I, Is... >  )
constexpr

◆ sequence_to_tuple_of_number()

template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::sequence_to_tuple_of_number ( sequence< Is... >  )
constexpr

◆ set_buffer_value()

template<typename T >
__global__ void ck_tile::set_buffer_value ( T *  p,
T  x,
uint64_t  buffer_element_size 
)

◆ set_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
constexpr CK_TILE_HOST_DEVICE void ck_tile::set_container_subset ( array< T, N > &  y,
sequence< Is... >  picks,
const array< T, sizeof...(Is)> &  x 
)
constexpr

◆ set_container_subset() [2/2]

template<typename Y , typename X , index_t... Is>
constexpr CK_TILE_HOST_DEVICE void ck_tile::set_container_subset ( Y &  y,
sequence< Is... >  picks,
const X &  x 
)
constexpr

◆ set_slice_tile()

template<typename DstDataType_ , typename DstStaticTileDistribution_ , typename SrcDataType_ , typename SrcStaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds>
constexpr CK_TILE_DEVICE auto ck_tile::set_slice_tile ( static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > &  dst_tile,
const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > &  src_tile,
sequence< SliceBegins... >  slice_begins,
sequence< SliceEnds... >  slice_ends 
)
constexpr

◆ set_tile() [1/4]

template<typename DstrTensors , typename T >
CK_TILE_DEVICE void ck_tile::set_tile ( DstrTensors &  dstr_tensor,
const T &  value 
)

◆ set_tile() [2/4]

template<typename DstrTensors , index_t v, bool skip_subdword_opt = false>
CK_TILE_DEVICE void ck_tile::set_tile ( DstrTensors &  dstr_tensor,
number< v >  ,
bool_constant< skip_subdword_opt >  = {} 
)

◆ set_tile() [3/4]

template<typename T >
CK_TILE_DEVICE void ck_tile::set_tile ( null_tensor ,
const T &   
)

◆ set_tile() [4/4]

template<index_t v>
CK_TILE_DEVICE void ck_tile::set_tile ( null_tensor ,
number< v >   
)

◆ set_tile_if()

template<typename DataType , typename StaticTileDistribution , typename XIndicesPredicate >
CK_TILE_HOST_DEVICE void ck_tile::set_tile_if ( static_distributed_tensor< DataType, StaticTileDistribution > &  out_tensor,
DataType  value,
XIndicesPredicate  predicate 
)

◆ shuffle_tile()

template<typename OutTensor , typename InTensor >
CK_TILE_DEVICE void ck_tile::shuffle_tile ( OutTensor &  out,
const InTensor &  in 
)

◆ sin() [1/2]

◆ sin() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::sin ( T  x)

◆ sin< double >()

template<>
CK_TILE_DEVICE double ck_tile::sin< double > ( double  x)

◆ sin< float >()

template<>
CK_TILE_DEVICE float ck_tile::sin< float > ( float  x)

◆ sin< fp16_t >()

◆ sinh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::sinh ( T  x)

◆ sinh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::sinh ( T  x)

◆ sinh< double >()

template<>
CK_TILE_DEVICE double ck_tile::sinh< double > ( double  x)

◆ sinh< float >()

template<>
CK_TILE_DEVICE float ck_tile::sinh< float > ( float  x)

◆ slice_sequence()

template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto ck_tile::slice_sequence ( Seq  ,
number< SliceSize >  ,
Mask  = typename uniform_sequence_gen<Seq::size(), 1>::type{} 
)
constexpr

◆ sqrt() [1/4]

CK_TILE_DEVICE bfloat16_t ck_tile::sqrt ( bfloat16_t  x)

◆ sqrt() [2/4]

CK_TILE_DEVICE double ck_tile::sqrt ( double  x)

◆ sqrt() [3/4]

CK_TILE_DEVICE float ck_tile::sqrt ( float  x)

◆ sqrt() [4/4]

CK_TILE_DEVICE fp16_t ck_tile::sqrt ( fp16_t  x)

◆ store_tile() [1/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ store_tile() [2/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ store_tile() [3/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &  tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ store_tile_raw() [1/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ store_tile_raw() [2/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ store_tile_raw() [3/3]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void ck_tile::store_tile_raw ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &  tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ sweep_tile() [1/2]

template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
constexpr CK_TILE_HOST_DEVICE void ck_tile::sweep_tile ( const DistributedTensor &  ,
const F &  f,
UnpacksPerXDim  = {} 
)
constexpr

◆ sweep_tile() [2/2]

template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
constexpr CK_TILE_HOST_DEVICE void ck_tile::sweep_tile ( const F &  f,
UnpacksPerXDim  = {} 
)
constexpr

◆ sweep_tile_span()

template<typename TileDistributedSpan_ , typename F >
CK_TILE_DEVICE void ck_tile::sweep_tile_span ( TileDistributedSpan_  ,
const F &  f 
)

◆ sweep_tile_uspan()

template<typename TileDistributedSpan_ , typename F , typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void ck_tile::sweep_tile_uspan ( TileDistributedSpan_  ,
const F &  f,
Unpacks  = {} 
)

◆ tan() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::tan ( T  x)

◆ tan() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::tan ( T  x)

◆ tan< double >()

template<>
CK_TILE_DEVICE double ck_tile::tan< double > ( double  x)

◆ tan< float >()

template<>
CK_TILE_DEVICE float ck_tile::tan< float > ( float  x)

◆ tanh() [1/2]

template<typename T >
CK_TILE_HOST T ck_tile::tanh ( T  x)

◆ tanh() [2/2]

template<typename T >
CK_TILE_DEVICE T ck_tile::tanh ( T  x)

◆ tanh< double >()

template<>
CK_TILE_DEVICE double ck_tile::tanh< double > ( double  x)

◆ tanh< float >()

template<>
CK_TILE_DEVICE float ck_tile::tanh< float > ( float  x)

◆ tanh_fast()

template<typename T >
CK_TILE_DEVICE T ck_tile::tanh_fast ( T  x)

◆ tanh_fast< float >()

template<>
CK_TILE_DEVICE float ck_tile::tanh_fast< float > ( float  x)

◆ tie()

template<typename... Args>
constexpr tuple<Args&...> ck_tile::tie ( Args &...  args)
constexprnoexcept

◆ tile_distribution_pattern_to_string()

constexpr const char* ck_tile::tile_distribution_pattern_to_string ( tile_distribution_pattern  pattern)
constexpr

◆ tile_elementwise_in() [1/2]

template<typename InElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in ( const InElementFunc &  ,
MaybeNullTensor &&  ... 
)

◆ tile_elementwise_in() [2/2]

template<typename InElementFunc , typename... InTensor, typename = std::enable_if_t< std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in ( const InElementFunc &  in_element_func,
const InTensor &...  in_dstr_tensors 
)

◆ tile_elementwise_inout() [1/2]

template<typename InOutElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout ( const InOutElementFunc &  ,
MaybeNullTensor &&  ... 
)

◆ tile_elementwise_inout() [2/2]

template<typename InOutElementFunc , typename... InOutDstrTensors, typename = std::enable_if_t<std::conjunction_v< std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>>
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout ( const InOutElementFunc &  inout_element_func,
InOutDstrTensors &...  inout_dstr_tensors 
)

◆ tile_elementwise_inout_unpack() [1/2]

template<typename InElementFunc , typename Tuple >
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack ( const InElementFunc &  in_element_func,
const Tuple &  t 
)

Template function that "unpacks" a tuple and applies an element-wise operation.

Parameters
in_element_funcFunction to apply element-wise.
tAny container containing elements to process, with known size and tuple-like semantic.
Returns
Calls the overloaded function, passing an index sequence.

◆ tile_elementwise_inout_unpack() [2/2]

template<typename InElementFunc , typename Tuple , size_t... I>
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack ( const InElementFunc &  in_element_func,
const Tuple &  t,
std::index_sequence< I... >   
)

Template function that "unpacks" a tuple and applies an element-wise operation.

Parameters
in_element_funcFunction to apply element-wise.
tAny container containing elements to process, with known size and tuple-like semantic.
Returns
Calls tile_elementwise_inout with unpacked tuple elements.

◆ tile_sweeper()

template<typename T , typename F , typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE_EXTERN ck_tile::tile_sweeper ( const T &  ,
const F &  ,
U  = {} 
) -> tile_sweeper< T, F, U >

◆ timing_loop_impl()

template<typename TimerType , typename CallablesFunc , typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double ck_tile::timing_loop_impl ( TimerType  timer,
const stream_config &  s,
CallablesFunc &&  callables_func,
PreprocessFunc  preprocess = nullptr 
)

◆ to_array() [1/2]

template<typename T , index_t N, typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::to_array ( const std::vector< X > &  x)
constexpr

◆ to_array() [2/2]

template<typename T , index_t N, typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::to_array ( const X &  x)
constexpr

◆ to_array_of_array()

template<typename... Seqs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::to_array_of_array ( tuple< Seqs... >  t_of_s)
constexpr

◆ to_multi_index()

template<typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::to_multi_index ( const T &  x)
constexpr

◆ to_sequence()

template<index_t... Is>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::to_sequence ( tuple< number< Is >... >  )
constexpr

◆ transform_sequences() [1/3]

template<typename F , index_t... Xs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_sequences ( F  f,
sequence< Xs... >   
)
constexpr

◆ transform_sequences() [2/3]

template<typename F , index_t... Xs, index_t... Ys>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_sequences ( F  f,
sequence< Xs... >  ,
sequence< Ys... >   
)
constexpr

◆ transform_sequences() [3/3]

template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_sequences ( F  f,
sequence< Xs... >  ,
sequence< Ys... >  ,
sequence< Zs... >   
)
constexpr

◆ transform_tensor_adaptor()

template<typename OldTensorAdaptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tensor_adaptor ( const OldTensorAdaptor &  old_tensor_adaptor,
const NewTransforms &  new_transforms,
NewLowerDimensionOldTopIdss  ,
NewUpperDimensionNewTopIdss   
)
constexpr

◆ transform_tensor_descriptor()

template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tensor_descriptor ( const OldTensorDescriptor &  old_tensor_desc,
const NewTransforms &  new_transforms,
NewLowerDimensionOldTopIdss  ,
NewUpperDimensionNewTopIdss   
)
constexpr

◆ transform_tensor_view()

template<typename OldTensorView , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tensor_view ( const OldTensorView &  old_tensor_view,
const NewTransforms &  new_transforms,
NewLowerDimensionOldVisibleIdss  ,
NewUpperDimensionNewVisibleIdss   
)
constexpr

◆ transform_tuples() [1/3]

template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tuples ( F  f,
const X &  x 
)
constexpr

◆ transform_tuples() [2/3]

template<typename F , typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tuples ( F  f,
const X &  x,
const Y &  y 
)
constexpr

◆ transform_tuples() [3/3]

template<typename F , typename X , typename Y , typename Z >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::transform_tuples ( F  f,
const X &  x,
const Y &  y,
const Z &  z 
)
constexpr

◆ transpose_host_tensor_descriptor_given_new2old()

template<typename New2Old >
CK_TILE_HOST HostTensorDescriptor ck_tile::transpose_host_tensor_descriptor_given_new2old ( const HostTensorDescriptor &  a,
const New2Old &  new2old 
)

◆ transpose_tile2d()

template<typename OutTensor , typename InTensor >
CK_TILE_DEVICE void ck_tile::transpose_tile2d ( OutTensor &  out,
const InTensor &  in 
)

◆ tuple_depth() [1/2]

template<index_t depth = 0, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::tuple_depth ( const T &  )
constexpr

◆ tuple_depth() [2/2]

template<index_t depth = 0, typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::tuple_depth ( const tuple< Ts... > &  )
constexpr

◆ tuple_reduce()

template<index_t Idx, index_t End, typename F , typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::tuple_reduce ( F &&  f,
const tuple< Ts... > &  t 
)
constexpr

◆ tuple_reverse()

template<typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::tuple_reverse ( const tuple< Ts... > &  t)
constexpr

◆ type_convert()

template<typename Y , typename X , std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool > = false>
constexpr CK_TILE_HOST_DEVICE Y ck_tile::type_convert ( X  x)
constexpr

◆ unpack()

template<typename F , typename X >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::unpack ( F &&  f,
X &&  x 
)
constexpr

◆ unpack2()

template<typename F , typename X , typename Y >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::unpack2 ( F &&  f,
X &&  x,
Y &&  y 
)
constexpr

◆ unroll_nested_tuple() [1/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
constexpr CK_TILE_HOST_DEVICE auto ck_tile::unroll_nested_tuple ( const T &  t)
constexpr

◆ unroll_nested_tuple() [2/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::unroll_nested_tuple ( const tuple< Ts... > &  t)
constexpr

◆ unroll_nested_tuple() [3/3]

template<index_t Depth = 0, index_t MaxDepth = -1>
constexpr CK_TILE_HOST_DEVICE auto ck_tile::unroll_nested_tuple ( const tuple<> &  t)
constexpr

◆ update_tile() [1/2]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::update_tile ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {} 
)

◆ update_tile() [2/2]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ >
CK_TILE_DEVICE void ck_tile::update_tile ( tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &  tile_window_tmp,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor 
)

◆ update_tile_raw() [1/2]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE auto ck_tile::update_tile_raw ( tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {},
bool_constant< pre_nop >  = {} 
)

◆ update_tile_raw() [2/2]

template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::update_tile_raw ( tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &  tile_window,
const static_distributed_tensor< DataType_, TileDistribution_ > &  dstr_tensor,
number< i_access >  = {},
bool_constant< oob_conditional_check >  = {},
bool_constant< pre_nop >  = {} 
)

◆ UpdateEnvVar() [1/2]

template<typename EnvVar >
void ck_tile::UpdateEnvVar ( EnvVar  ,
const std::string_view &  val 
)

◆ UpdateEnvVar() [2/2]

template<typename EnvVar , typename ValueType >
void ck_tile::UpdateEnvVar ( EnvVar  ,
const ValueType &  val 
)

Updates the cached value of an environment variable.

◆ warp_shuffle()

template<typename T >
CK_TILE_DEVICE T ck_tile::warp_shuffle ( const T &  v_local,
uint32_t  src_lane 
)

◆ warp_shuffle_down()

template<typename T >
CK_TILE_DEVICE T ck_tile::warp_shuffle_down ( const T &  v_local,
uint32_t  lane_delta 
)

◆ warp_shuffle_down_pair()

template<typename T >
CK_TILE_DEVICE auto ck_tile::warp_shuffle_down_pair ( const T &  v_local)

◆ warp_shuffle_up()

template<typename T >
CK_TILE_DEVICE T ck_tile::warp_shuffle_up ( const T &  v_local,
uint32_t  lane_delta 
)

◆ welford_update()

template<typename T , bool kFastFDiv = false>
CK_TILE_DEVICE void ck_tile::welford_update ( T &  mean,
T &  var,
T  x,
int  count,
bool_constant< kFastFDiv >  = {} 
)

Variable Documentation

◆ ALIBI

constexpr uint32_t ck_tile::ALIBI = 8U
constexpr

◆ AllConvertibleToStringView

template<typename... Ts>
constexpr bool ck_tile::AllConvertibleToStringView
inlineconstexpr
Initial value:
=
((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
std::is_same_v<Ts, char>) &&
...)

◆ CUSTOM_MASK

constexpr uint32_t ck_tile::CUSTOM_MASK = 1U
constexpr

◆ ERROR_DETAIL_LIMIT

constexpr int ck_tile::ERROR_DETAIL_LIMIT = 5
constexpr

Maximum number of error values to display when checking errors.

◆ has_wmma_traits_v

template<typename Arch , typename AType , typename BType , typename CType , index_t warp_m, index_t warp_n, index_t warp_k>
constexpr bool ck_tile::has_wmma_traits_v
constexpr
Initial value:
=
has_wmma_traits<Arch, AType, BType, CType, warp_m, warp_n, warp_k>::value

◆ ignore

constexpr detail::ignore_t ck_tile::ignore
inlineconstexpr

◆ is_constant_v

template<typename T >
constexpr bool ck_tile::is_constant_v = is_constant<T>::value
inlineconstexpr

◆ is_null_tile_window_v

template<typename T >
constexpr bool ck_tile::is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value
constexpr

◆ is_static_v

template<typename T >
constexpr bool ck_tile::is_static_v = is_static<T>::value
inlineconstexpr

◆ is_tile_window_linear_v

template<typename T >
constexpr bool ck_tile::is_tile_window_linear_v = is_tile_window_linear<T>::value
inlineconstexpr

Helper variable template to check if a type is a linear tile window.

Equivalent to is_tile_window_linear<T>::value.

Template Parameters
TThe type to check.

◆ is_tile_window_with_static_distribution_v

template<typename T >
constexpr bool ck_tile::is_tile_window_with_static_distribution_v
inlineconstexpr
Initial value:
=
is_tile_window_with_static_distribution<T>::value

Helper variable template to check if a type is a tile window with static distribution.

Equivalent to is_tile_window_with_static_distribution<T>::value.

Template Parameters
TThe type to check.

◆ is_tile_window_with_static_lengths_v

template<typename T >
constexpr bool ck_tile::is_tile_window_with_static_lengths_v
inlineconstexpr
Initial value:
=
is_tile_window_with_static_lengths<T>::value

Helper variable template to check if a type is a tile window with static lengths.

Equivalent to is_tile_window_with_static_lengths<T>::value.

Template Parameters
TThe type to check.

◆ log2e_rcp_v

template<typename T = double>
constexpr T ck_tile::log2e_rcp_v = 1. / log2e<T>::value
constexpr

◆ log2e_v

template<typename T = double>
constexpr T ck_tile::log2e_v = log2e<T>::value
constexpr

◆ LOGITS_SOFT_CAP

constexpr uint32_t ck_tile::LOGITS_SOFT_CAP = 4U
constexpr

◆ Right

ck_tile::Right
Initial value:
= Left>
struct equal
{
CK_TILE_HOST_DEVICE constexpr auto operator()(const Left& lhs,
const Right& rhs) const -> decltype(lhs == rhs)
{
return lhs == rhs;
}
}
#define CK_TILE_HOST_DEVICE
Definition: config.hpp:41
Right
Definition: math.hpp:327
__host__ __device__ equal() -> equal< void, void >
FIXME: create macro to replace 'host device' and nothing more.

◆ SLIDING_WINDOW

constexpr uint32_t ck_tile::SLIDING_WINDOW = 2U
constexpr