ck Namespace Reference

Namespaces

 conv_tensor_rearrange_op
 
 debug
 
 detail
 
 details
 
 dpp8
 
 fp8_impl
 
 host_common
 
 impl
 
 internal
 
 lds_utils
 
 literals
 
 math
 
 mathext
 
 ranges
 
 reduce
 
 tensor_layout
 
 tensor_operation
 
 util
 
 utility
 
 utils
 

Classes

struct  InMemoryDataOperationEnumSequence
 
struct  StaticTensor
 
struct  StaticTensorTupleOfVectorBuffer
 
struct  PassThrough
 
struct  Pad
 
struct  LeftPad
 
struct  RightPad
 
struct  Embed
 
struct  Merge_v1_carry_check
 
struct  lambda_merge_generate_MagicDivision_calculate_magic_multiplier
 
struct  lambda_merge_generate_MagicDivision_calculate_magic_shift
 
struct  Merge_v2_magic_division
 
struct  Merge_v2r2_magic_division
 
struct  Merge_v3_division_mod
 
struct  UnMerge
 
struct  Freeze
 
struct  Insert
 
struct  Vectorize
 
struct  Slice
 
struct  Modulo
 
struct  Xor
 
struct  TensorAdaptor
 
struct  TensorCoordinate
 
struct  TensorCoordinateStep
 
struct  TensorDescriptor
 
struct  lambda_get_up_dim_num
 
struct  SpaceFillingCurve
 
struct  BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
 
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
 
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
 
struct  BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
 
struct  BlockwiseGemmXdlops_mx_pipeline_base
 
struct  BlockwiseGemmWmmaops_pipeline_hotloop_inst
 
struct  BlockwiseGemmWmmaops_pipeline_base
 
struct  BlockwiseGemmWmmaops_pipeline_v1
 
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmWmmaops_pipeline_v3
 
struct  BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_hotloop_inst
 
struct  BlockwiseGemmXdlops_pipeline_v4
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_base
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3
 
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3
 
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1
 
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v1_mx
 
struct  BlockwiseGemmXdlops_pipeline_v1_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2
 
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3
 
struct  BlockwiseGemmXdlops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale
 
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle
 
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale
 
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_pipeline_v5
 
struct  BlockwiseGemmXdlops_pipeline_v5< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
 
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
struct  BlockwiseGemmWMMA
 
struct  BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
 
struct  BlockwiseGemmXdlops_v2
 Blockwise gemm. More...
 
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
 
struct  BlockwiseSoftmax
 Blockwise softmax. More...
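 
For reference, this implements the standard numerically stabilized softmax along the reduced dimension; as a mathematical sketch (not the CK API):
 
    \mathrm{softmax}(x)_i = \frac{\exp(x_i - \max_j x_j)}{\sum_k \exp(x_k - \max_j x_j)}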
 
struct  BlockwiseTensorSliceTransfer_v5r1
 
struct  BlockwiseWelford
 
struct  PartitionedBlockwiseReduction
 
struct  PartitionedBlockwiseReduction_v2
 
struct  PartitionedBlockwiseReductionWithIndex
 
struct  ThreadGroupTensorSliceTransfer_DirectLoad
 
struct  ThreadGroupTensorSliceTransfer_Gather_DirectLoad
 
struct  ThreadGroupTensorSliceTransfer_v4r1
 Blockwise data transfer. More...
 
struct  ThreadGroupTensorSliceTransfer_v4r1_dequant
 Blockwise data transfer with dequantization. More...
 
struct  ThreadGroupTensorSliceTransfer_v4r1_gather
 Blockwise data transfer. More...
 
struct  ThreadGroupTensorSliceTransfer_v4r2
 Blockwise data transfer. More...
 
struct  ThreadGroupTensorSliceTransfer_v6r1
 
struct  ThreadGroupTensorSliceTransfer_v6r1r2
 
struct  ThreadGroupTensorSliceTransfer_v6r2
 
struct  ThreadGroupTensorSliceTransfer_v6r3
 
struct  ThreadGroupTensorSliceTransfer_v7
 
struct  ThreadGroupTensorSliceTransfer_v7r2
 
struct  ThreadGroupTensorSliceTransfer_v7r3
 
struct  ThreadGroupTensorSliceTransfer_v7r3_scatter
 
struct  reduce_binary_operator
 
struct  reduce_binary_operator< ReduceTensorOp::ADD >
 
struct  reduce_binary_operator< ReduceTensorOp::MUL >
 
struct  reduce_binary_operator< ReduceTensorOp::MIN >
 
struct  reduce_binary_operator< ReduceTensorOp::MAX >
 
struct  reduce_binary_operator< ReduceTensorOp::AMAX >
 
struct  reduce_binary_operator< ReduceTensorOp::AVG >
 
struct  reduce_binary_operator< ReduceTensorOp::NORM1 >
 
struct  reduce_binary_operator< ReduceTensorOp::NORM2 >
 
struct  reduce_unary_operator
 
struct  reduce_unary_operator< ReduceTensorOp::AVG, IsFirstReduce, true >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM1, true, IsLastReduce >
 
struct  reduce_unary_operator< ReduceTensorOp::AMAX, true, IsLastReduce >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, false >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, true >
 
struct  reduce_unary_operator< ReduceTensorOp::NORM2, false, true >
 
struct  GridwiseMultiblockBatchNormForward
 
struct  GridwiseReduceSecondHalfBatchNormBackwardFinal
 
struct  GridwiseMultiblockWelfordFirstHalf
 
struct  GridwiseWelfordSecondHalfBatchNormForwardFinal
 
struct  GridwiseWelfordSecondHalfReduceFirstHalf
 
struct  BlockToCTileMap_M00_N0_M01
 
struct  BlockToCTileMap_M00_N0_M01Adapt
 
struct  BlockToCTileMap_M00_N0_M01Adapt< MPerBlock, NPerBlock, void >
 
struct  BlockToCTileMap_Grouped_M00_N0_M01Adapt
 
struct  BlockToCTileMap_N00_M0_N01Adapt
 
struct  BlockToCTileMap_N00_M0_N01Adapt< MPerBlock, NPerBlock, void >
 
struct  BlockToCTileMap_KSplit_M00_N0_M01Adapt
 
struct  BlockToCTileMap_M00_N00_M01_N01
 
struct  BlockToCTileMap_KSplit_M00_N00_M01_N01
 
struct  OffsettedBlockToCTileMap
 
struct  OffsettedBlockToCTileMap2
 
struct  BlockToCTileMap_3DGrid_KSplit
 Simple tile mapping which creates 3D grid of block of threads. More...
 
struct  BlockToCTileMap_GemmStreamK
 
struct  BlockToCTileMap_GemmStreamK_v2
 
struct  GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
 
struct  GridwiseWelfordSecondHalfLayernorm2d
 
struct  GridwiseMultipleReduction_mk_to_m_multiblock
 
struct  GridwiseMultipleReduction_mk_to_m_threadwise
 
struct  GridwiseReduction_mk_to_m_multiblock
 
struct  GridwiseReduction_mk_to_m_threadwise
 
struct  GridwiseReduction_mk_to_m_threadwise_multi_d
 
struct  GridwiseBatchedGemmGemm_Xdl_CShuffle
 
struct  GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 
struct  GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
 
struct  GridwiseBatchedGemmSoftmaxGemm_Wmma
 
struct  GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
 Gridwise gemm + softmax + gemm fusion. More...
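 
This fusion corresponds to the attention-style pattern; as a sketch (the symbols Q, K, and V are illustrative stand-ins, not CK parameter names):
 
    O = \mathrm{softmax}(Q K^{T}) \, V
 
where the first GEMM produces Q K^T, the softmax is applied row-wise, and the second GEMM multiplies by V.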
 
struct  GridwiseBatchNormBackwardWithBlockwiseWelford
 
struct  GridwiseBatchNormForwardWithBlockwiseWelford
 
struct  GridwiseElementwise_1D
 
struct  GridwiseElementwise
 
struct  GridwiseElementwiseLayernormWelfordVariance_mk_to_mk
 
struct  GridwiseFpAintBGemm_Wmma
 
struct  GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmDlMultipleD_km_kn_mn
 
struct  GridwiseGemmDl_km_kn_mn_v1r3
 
struct  GridwiseGemmDl_bkm_bkn_mn_v1r3
 
struct  GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp
 
struct  GridwiseGemmMultipleABD_xdl_cshuffle
 
struct  GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmMultipleD_Wmma
 
struct  GridwiseGemmMultipleD_xdl_cshuffle
 
struct  GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
 
struct  GridwiseGemmMultipleD_xdl_splitk_cshuffle
 
struct  GridwiseGemmPipeline_v1
 
struct  GridwiseGemmPipeline_v1< 1, true, true >
 
struct  GridwiseGemmPipeline_v1< 2, true, true >
 
struct  GridwiseGemmPipeline_v1< 1, false, true >
 
struct  GridwiseGemmPipeline_v1< 1, true, false >
 
struct  GridwiseGemmPipeline_v1< 1, false, false >
 
struct  GridwiseGemmPipeline_v1_WeightOnly
 
struct  GridwiseGemmPipeline_v1_WeightOnly< 1, true, true >
 
struct  GridwiseGemmPipelineInterwave_v1
 
struct  GridwiseGemmPipelineInterwave_v1< 1 >
 
struct  GridwiseGemmPipelineInterwave_v1< 2 >
 
struct  GridwiseGemmPipeline_v2
 
struct  GridwiseGemmPipeline_v3
 
struct  GridwiseGemmPipeline_v4
 
struct  GridwiseGemmPipeline_v4< 1 >
 
struct  GridwiseGemmPipeline_v4< 2 >
 
struct  GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemmSplitKMultipleD_xdl_cshuffle
 
struct  GridwiseGemmLoadWave
 
struct  GridwiseGemmLoadWave< TileLoadThreadGroup, 1 >
 
struct  GridwiseGemmMathWave
 
struct  GridwiseGemmMathWave< TileMathThreadGroup, 1 >
 
struct  GridwiseGemm_Wmma
 
struct  GridwiseGemm_wmma_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
 
struct  GridwiseGemm_wmma_cshuffle_v3_b_scale
 
struct  GridwiseGemm_wmma_cshuffle_v3_base
 
struct  GridwiseGemm_xdl_cshuffle_conv_v3
 
struct  GridwiseGemm_xdl_cshuffle_streamk_v3
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemm_xdl_cshuffle_v2
 
struct  GridwiseGemm_xdl_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
 
struct  GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMultiD_xdl_cshuffle_v3
 
struct  GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
 
struct  GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
 
struct  GridwiseGemmMX_xdl_cshuffle_v3
 
struct  GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
 
struct  GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
 
struct  Merge_v4_no_carry
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
 
struct  GridwiseGemm_xdlops_splitk_lds_direct_load
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
 
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
 
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
 
struct  GridwiseMoeGemm
 
struct  GridwiseMoeGemmBlockScale
 
struct  GridwiseMoeGemmMX
 
struct  GridwiseMoeGemmMXBNS
 
struct  GridwiseMoeGemmMX_BPreshuffle
 
struct  GridwisePermute
 
struct  GridwisePutElement_1D
 
struct  GridwiseSoftmax_mk_to_mk
 
struct  GridwiseSparseEmbeddingsForwardLayernorm
 
struct  GridwiseTensorRearrange
 
struct  GridwiseNormalizationBwdData_mk_to_mk
 
struct  GridwiseNormalizationBwdGammaBeta_mk_to_k
 
struct  GridwiseNormalizationNaiveVariance_mk_to_mk
 
struct  GridwiseNormalizationSplitK1st
 
struct  GridwiseNormalizationSplitK2nd
 
struct  GridwiseNormalizationWelfordVariance_mk_to_mk
 
struct  ThreadwiseReduction
 
struct  ThreadwiseReductionWithIndex
 
struct  ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
 
struct  ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
 
struct  ThreadwiseGemmDlops_km_kn_mn_v3
 
struct  ThreadwiseTensorSliceSet_v1
 
struct  ThreadwiseTensorSliceTransfer_v1r3
 
struct  ThreadwiseTensorSliceTransfer_v2
 Helper structure that facilitates transfer of source (grid) data to destination threads. More...
 
struct  ThreadwiseTensorSliceTransfer_v2_gather
 
struct  ThreadwiseTensorSliceTransfer_v3
 
struct  ThreadwiseTensorSliceTransfer_v4
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic
 Threadwise data transfer. More...
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
 
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
 
struct  ThreadwiseTensorSliceTransfer_v3r1
 
struct  ThreadwiseTensorSliceTransfer_v3r1_dequant
 
struct  ThreadwiseTensorSliceTransfer_v3r1_gather
 
struct  ThreadwiseTensorSliceTransfer_v3r2
 
struct  ThreadwiseTensorSliceTransfer_v4r1
 
struct  ThreadwiseTensorSliceTransfer_v5r1
 
struct  ThreadwiseTensorSliceTransfer_v6r1
 
struct  ThreadwiseTensorSliceTransfer_v6r1r2
 
struct  ThreadwiseTensorSliceTransfer_v6r2
 
struct  ThreadwiseTensorSliceTransfer_v6r3
 
struct  ThreadwiseTensorSliceTransfer_v7
 
struct  ThreadwiseTensorSliceTransfer_v7r2
 
struct  ThreadwiseTensorSliceTransfer_v7r3
 
struct  ThreadwiseTensorSliceTransfer_v7r3_scatter
 
struct  ThreadwiseWelford
 
struct  ThreadwiseWelfordMerge
 
struct  dpp_type
 
struct  dpp_type< DppInstr::dpp8_f16_32x8x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_8x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_8x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_16x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_4x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_4x16x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_1x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_2x32x2 >
 
struct  dpp_type< DppInstr::dpp8_f16_2x16x2 >
 
struct  DppSelector
 
struct  DppGemm
 
struct  smfmac_type
 
struct  smfmac< SmfmacInstr::smfmac_f32_16x16x32f16 >
 
struct  smfmac< SmfmacInstr::smfmac_f32_32x32x16f16 >
 
struct  smfmac< SmfmacInstr::smfmac_f32_16x16x32bf16 >
 
struct  smfmac< SmfmacInstr::smfmac_f32_32x32x16bf16 >
 
struct  SmfmacSelector
 
struct  SparseXdlopsGemm
 
struct  wmma_type
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f16_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_bf16_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
 
struct  WmmaSelector
 
struct  WmmaGemm
 
struct  mfma_type
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x1xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x2xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x1xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_4x4x1xf32 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_4x4x4f16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8bf16_1k >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16bf16_1k >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4bf16 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x8bf16 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x8i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x16i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x16i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x32i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_32x32x32i8 >
 
struct  mfma_type< MfmaInstr::mfma_i32_16x16x64i8 >
 
struct  mfma_type< MfmaInstr::mfma_f64_16x16x4f64 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8bf8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8f8 >
 
struct  mfma_type< MfmaInstr::mfma_f32_32x32x64f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_f32_16x16x128f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_scale_f32_32x32x64f8f6f4 >
 
struct  mfma_type< MfmaInstr::mfma_scale_f32_16x16x128f8f6f4 >
 
struct  mfma_type_gfx11_base
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16 >
 
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8 >
 
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx11 >
 
struct  mfma_type_gfx12_base
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12 >
 
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx12 >
 
struct  MfmaSelector
 
struct  XdlopsGemm
 
union  BufferResource
 
struct  f8_ocp_t
 
struct  bf8_ocp_t
 
struct  intrin_smfmac_f32_16x16x32f16
 
struct  intrin_smfmac_f32_16x16x32f16< 16, 16 >
 
struct  intrin_smfmac_f32_16x16x32bf16
 
struct  intrin_smfmac_f32_16x16x32bf16< 16, 16 >
 
struct  intrin_smfmac_f32_32x32x16f16
 
struct  intrin_smfmac_f32_32x32x16f16< 32, 32 >
 
struct  intrin_smfmac_f32_32x32x16bf16
 
struct  intrin_smfmac_f32_32x32x16bf16< 32, 32 >
 
struct  intrin_wmma_f32_16x16x16_f16_w32
 
struct  intrin_wmma_f32_16x16x16_f16_w32< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w32
 
struct  intrin_wmma_f32_16x16x16_bf16_w32< 16, 16 >
 
struct  intrin_wmma_f16_16x16x16_f16_w32
 
struct  intrin_wmma_f16_16x16x16_f16_w32< 16, 16, Opsel >
 
struct  intrin_wmma_bf16_16x16x16_bf16_w32
 
struct  intrin_wmma_bf16_16x16x16_bf16_w32< 16, 16, Opsel >
 
struct  intrin_wmma_i32_16x16x16_iu8_w32
 
struct  intrin_wmma_i32_16x16x16_iu8_w32< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f16_w64
 
struct  intrin_wmma_f32_16x16x16_f16_w64< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w64
 
struct  intrin_wmma_f32_16x16x16_bf16_w64< 16, 16 >
 
struct  intrin_wmma_f16_16x16x16_f16_w64
 
struct  intrin_wmma_f16_16x16x16_f16_w64< 16, 16, Opsel >
 
struct  intrin_wmma_bf16_16x16x16_bf16_w64
 
struct  intrin_wmma_bf16_16x16x16_bf16_w64< 16, 16, Opsel >
 
struct  intrin_wmma_i32_16x16x16_iu8_w64
 
struct  intrin_wmma_i32_16x16x16_iu8_w64< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12
 
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12< 16, 16, neg_a, neg_b, clamp >
 
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12< 16, 16 >
 
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12
 
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12< 16, 16 >
 
struct  intrin_mfma_f32_32x32x1f32
 
struct  intrin_mfma_f32_32x32x1f32< 64, 64 >
 
struct  intrin_mfma_f32_32x32x1f32< 32, 64 >
 
struct  intrin_mfma_f32_32x32x2f32
 
struct  intrin_mfma_f32_32x32x2f32< 32, 32 >
 
struct  intrin_mfma_f32_16x16x4f32
 
struct  intrin_mfma_f32_16x16x4f32< 16, 16 >
 
struct  intrin_mfma_f32_16x16x1f32
 
struct  intrin_mfma_f32_16x16x1f32< 16, 64 >
 
struct  intrin_mfma_f32_4x4x1f32
 
struct  intrin_mfma_f32_4x4x1f32< 4, 64 >
 
struct  intrin_mfma_f32_4x4x1f32< 8, 64 >
 
struct  intrin_mfma_f32_32x32x4f16
 
struct  intrin_mfma_f32_32x32x4f16< 64, 64 >
 
struct  intrin_mfma_f32_32x32x4f16< 32, 64 >
 
struct  intrin_mfma_f32_32x32x16f16
 
struct  intrin_mfma_f32_32x32x16f16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f16
 
struct  intrin_mfma_f32_16x16x32f16< 16, 16 >
 
struct  intrin_mfma_f32_32x32x8f16
 
struct  intrin_mfma_f32_32x32x8f16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x16f16
 
struct  intrin_mfma_f32_16x16x16f16< 16, 16 >
 
struct  intrin_mfma_f32_16x16x4f16
 
struct  intrin_mfma_f32_16x16x4f16< 16, 64 >
 
struct  intrin_mfma_f32_4x4x4f16
 
struct  intrin_mfma_f32_4x4x4f16< 4, 64 >
 
struct  intrin_mfma_f32_4x4x4f16< 8, 64 >
 
struct  intrin_mfma_f32_32x32x16bf16
 
struct  intrin_mfma_f32_32x32x16bf16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf16
 
struct  intrin_mfma_f32_16x16x32bf16< 16, 16 >
 
struct  intrin_mfma_f32_32x32x8bf16_1k
 
struct  intrin_mfma_f32_32x32x8bf16_1k< 32, 32 >
 
struct  intrin_mfma_f32_16x16x16bf16_1k
 
struct  intrin_mfma_f32_16x16x16bf16_1k< 16, 16 >
 
struct  intrin_mfma_f32_32x32x4bf16
 
struct  intrin_mfma_f32_32x32x4bf16< 32, 32 >
 
struct  intrin_mfma_f32_16x16x8bf16
 
struct  intrin_mfma_f32_16x16x8bf16< 16, 16 >
 
struct  intrin_mfma_i32_32x32x8i8
 
struct  intrin_mfma_i32_32x32x8i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x16i8
 
struct  intrin_mfma_i32_16x16x16i8< 16, 16 >
 
struct  intrin_mfma_i32_32x32x32i8
 
struct  intrin_mfma_i32_32x32x32i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x64i8
 
struct  intrin_mfma_i32_16x16x64i8< 16, 16 >
 
struct  intrin_mfma_i32_32x32x16i8
 
struct  intrin_mfma_i32_32x32x16i8< 32, 32 >
 
struct  intrin_mfma_i32_16x16x32i8
 
struct  intrin_mfma_i32_16x16x32i8< 16, 16 >
 
struct  intrin_mfma_f64_16x16x4f64
 
struct  intrin_mfma_f64_16x16x4f64< 16, 16 >
 
struct  intrin_mfma_f32_32x32x64f8f6f4
 
struct  intrin_mfma_f32_32x32x64f8f6f4< 32, 32 >
 Performs a matrix fused multiply-accumulate operation on 32x32x64 submatrices for f8, f6, and f4 data types. More...
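 
Per invocation this accumulates a 32x32 output tile over a K-extent of 64; as a sketch of the tile shapes:
 
    D_{32 \times 32} \mathrel{+}= A_{32 \times 64} \cdot B_{64 \times 32}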
 
struct  intrin_mfma_scale_f32_32x32x64f8f6f4
 
struct  intrin_mfma_scale_f32_32x32x64f8f6f4< 32, 32, OpselA, OpselB >
 
struct  intrin_mfma_scale_f32_16x16x128f8f6f4
 
struct  intrin_mfma_scale_f32_16x16x128f8f6f4< 16, 16, OpselA, OpselB >
 
struct  intrin_mfma_f32_16x16x128f8f6f4
 
struct  intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >
 Performs a matrix fused multiply-accumulate operation on 16x16x128 submatrices for f8f6f4 data types. More...
 
struct  intrin_mfma_f32_32x32x16f8f8
 
struct  intrin_mfma_f32_32x32x16f8f8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f8f8
 
struct  intrin_mfma_f32_16x16x32f8f8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16bf8bf8
 
struct  intrin_mfma_f32_32x32x16bf8bf8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf8bf8
 
struct  intrin_mfma_f32_16x16x32bf8bf8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16f8bf8
 
struct  intrin_mfma_f32_32x32x16f8bf8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32f8bf8
 
struct  intrin_mfma_f32_16x16x32f8bf8< 16, 16 >
 
struct  intrin_mfma_f32_32x32x16bf8f8
 
struct  intrin_mfma_f32_32x32x16bf8f8< 32, 32 >
 
struct  intrin_mfma_f32_16x16x32bf8f8
 
struct  intrin_mfma_f32_16x16x32bf8f8< 16, 16 >
 
struct  Array
 
struct  Array< TData, 0 >
 
struct  ContainerElementPicker
 
struct  ConstantContainerElementPicker
 
struct  scalar_type
 
struct  f4x2_pk_t
 
struct  f6_pk_t
 
struct  pk_i4_t
 
struct  is_scalar_type
 
struct  scalar_type< T >
 
struct  scalar_type< double >
 
struct  scalar_type< float >
 
struct  scalar_type< half_t >
 
struct  scalar_type< bhalf_t >
 
struct  scalar_type< int32_t >
 
struct  scalar_type< int8_t >
 
struct  scalar_type< uint8_t >
 
struct  scalar_type< pk_i4_t >
 
struct  scalar_type< f8_fnuz_t >
 
struct  scalar_type< bf8_fnuz_t >
 
struct  scalar_type< f8_ocp_t >
 
struct  scalar_type< bf8_ocp_t >
 
struct  scalar_type< e8m0_bexp_t >
 
struct  scalar_type< f4x2_pk_t >
 
struct  scalar_type< f6x32_pk_t >
 
struct  scalar_type< bf6x32_pk_t >
 
struct  scalar_type< f6x16_pk_t >
 
struct  scalar_type< bf6x16_pk_t >
 
struct  scalar_type< bool >
 
struct  packed_type_info
 
struct  packed_type_maker
 
struct  vector_type
 
struct  vector_type_maker
 
struct  scalar_type< vector_type< T, N > >
 
struct  vector_type_maker< T, N0 >
 
struct  vector_type_maker< vector_type< T, N1 >, N0 >
 
struct  non_native_vector_base
 
struct  nnvb_data_t_selector
 
struct  nnvb_data_t_selector< f8_ocp_t >
 
struct  nnvb_data_t_selector< bf8_ocp_t >
 
struct  nnvb_data_t_selector< e8m0_bexp_t >
 
struct  nnvb_data_t_selector< f6x16_pk_t >
 
struct  nnvb_data_t_selector< f6x32_pk_t >
 
struct  nnvb_data_t_selector< bf6x16_pk_t >
 
struct  nnvb_data_t_selector< bf6x32_pk_t >
 
struct  nnvb_data_t_selector< pk_i4_t >
 
struct  nnvb_data_t_selector< f4x2_pk_t >
 
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > >
 
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > >
 
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > > >
 
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > > >
 
struct  DynamicBuffer
 
struct  e8m0_bexp_t
 Unsigned representation of a conventional biased Float32 exponent. More...
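 
Since e8m0 stores only a biased exponent, decoding yields a power of two. A minimal standalone sketch, assuming the conventional Float32 bias of 127 (the helper decode_e8m0 is illustrative, not part of CK):
 
    #include <cmath>
    #include <cstdint>
 
    // Decode an 8-bit biased exponent into the scale it represents: 2^(e - 127).
    inline float decode_e8m0(std::uint8_t e) { return std::ldexp(1.0f, int(e) - 127); }
 
    // decode_e8m0(127) == 1.0f, decode_e8m0(128) == 2.0f, decode_e8m0(126) == 0.5f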
 
struct  forwarder
 
struct  swallow
 
struct  logical_and
 
struct  logical_or
 
struct  logical_not
 
struct  static_if
 
struct  static_if< true >
 
struct  static_if< false >
 
struct  conditional
 
struct  conditional< true, X, Y >
 
struct  conditional< false, X, Y >
 
struct  static_for
 
struct  static_for< 0, N, 1 >
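 
A typical use unrolls a compile-time loop, passing each index to the callable as an integral_constant; a minimal sketch, assuming the usual CK call pattern:
 
    // Invokes the lambda with i = Number<0>, Number<1>, Number<2>, Number<3>.
    ck::static_for<0, 4, 1>{}([&](auto i) {
        constexpr ck::index_t I = i.value; // usable as a compile-time constant
        // ... per-iteration body ...
    });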
 
struct  static_for_range
 
struct  static_for_product
 
struct  static_for_product< Tuple< Is... > >
 
struct  static_for_product< Tuple< Is... >, Rest... >
 
struct  identity
 
struct  static_ford
 
struct  ford
 
struct  constant
 
struct  integral_constant
 
struct  nonesuch
 
struct  is_known_at_compile_time
 
struct  is_known_at_compile_time< index_t >
 
struct  is_known_at_compile_time< unsigned int >
 
struct  is_known_at_compile_time< long_index_t >
 
struct  is_known_at_compile_time< integral_constant< T, X > >
 
struct  is_known_at_compile_time< Sequence< Is... > >
 
struct  is_known_at_compile_time< Tuple< Ts... > >
 
struct  MagicDivision
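 
Magic division replaces a runtime integer divide with a multiply and a shift, precomputing a (multiplier, shift) pair per divisor. A self-contained sketch of the underlying trick (not the CK interface), for divisor 3:
 
    #include <cstdint>
 
    // For d = 3, the magic pair (m, s) = (0xAAAAAAAB, 33) satisfies
    // n / 3 == (uint64_t(n) * m) >> s for every 32-bit n.
    inline std::uint32_t div3(std::uint32_t n) {
        return static_cast<std::uint32_t>((std::uint64_t{n} * 0xAAAAAAABull) >> 33);
    }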
 
struct  MDiv
 
struct  MDiv2
 
struct  NumericLimits
 
struct  NumericLimits< half_t >
 
struct  NumericLimits< f8_fnuz_t >
 
struct  NumericLimits< bf8_fnuz_t >
 
struct  NumericLimits< f8_ocp_t >
 
struct  NumericLimits< bf8_ocp_t >
 
struct  NumericLimits< f4_t >
 
struct  NumericLimits< f6_t >
 
struct  NumericLimits< bf6_t >
 
struct  NumericLimits< e8m0_bexp_t >
 
struct  NumericUtils
 
struct  NumericUtils< e8m0_bexp_t >
 
struct  NumericUtils< float >
 
struct  NumericUtils< half_t >
 
struct  NumericUtils< bhalf_t >
 
struct  NumericUtils< f8_fnuz_t >
 
struct  NumericUtils< bf8_fnuz_t >
 
struct  NumericUtils< f8_ocp_t >
 
struct  NumericUtils< bf8_ocp_t >
 
struct  NumericUtils< f4_t >
 
struct  NumericUtils< f6_t >
 
struct  NumericUtils< bf6_t >
 
struct  float_equal_one
 
struct  float_equal_zero
 
struct  Sequence
 
struct  sequence_split
 
struct  sequence_reverse
 
struct  sequence_map_inverse
 
struct  is_valid_sequence_map
 
struct  sequence_merge
 
struct  sequence_merge< Sequence< Xs... >, Sequence< Ys... > >
 
struct  sequence_merge< Seq >
 
struct  sequence_gen
 
struct  arithmetic_sequence_gen
 
struct  arithmetic_sequence_gen< 0, IEnd, 1 >
 
struct  uniform_sequence_gen
 
struct  sequence_reverse_inclusive_scan
 
struct  sequence_reverse_inclusive_scan< Sequence< I, Is... >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< Sequence< I >, Reduce, Init >
 
struct  sequence_reverse_inclusive_scan< Sequence<>, Reduce, Init >
 
struct  sequence_reverse< Sequence< I > >
 
struct  sequence_reverse< Sequence< I0, I1 > >
 
struct  sequence_reduce
 
struct  sequence_reduce< Reduce, Sequence< Xs... >, Sequence< Ys... > >
 
struct  sequence_reduce< Reduce, Seq >
 
struct  sequence_sort_impl
 
struct  sequence_sort_impl< Sequence< ValueX, ValueY >, Sequence< IdX, IdY >, Compare >
 
struct  sequence_sort_impl< Sequence< Value >, Sequence< Id >, Compare >
 
struct  sequence_sort_impl< Sequence<>, Sequence<>, Compare >
 
struct  sequence_sort
 
struct  sequence_unique_sort
 
class  span
 
struct  StaticBuffer
 
struct  StaticBufferTupleOfVector
 
struct  StaticallyIndexedArray_v2
 
struct  ThisThreadBlock
 
struct  transpose_vectors
 
struct  transpose_vectors< half_t, NX, NY >
 
struct  transpose_vectors< int8_t, NX, NY >
 
struct  transpose_vectors< f8_t, NX, NY >
 
struct  Tuple
 
struct  Tuple<>
 
struct  tuple_element
 
struct  is_same
 
struct  is_same< X, X >
 
struct  is_floating_point
 
struct  is_floating_point< float >
 
struct  is_floating_point< double >
 
struct  is_floating_point< long double >
 
struct  is_integral
 
struct  is_integral< int >
 
struct  is_integral< unsigned int >
 
struct  is_integral< long >
 
struct  is_integral< unsigned long >
 
struct  is_integral< short >
 
struct  is_integral< unsigned short >
 
struct  is_integral< long long >
 
struct  is_integral< unsigned long long >
 
struct  is_integral< char >
 
struct  is_integral< signed char >
 
struct  is_integral< unsigned char >
 
struct  is_integral< wchar_t >
 
struct  is_integral< char16_t >
 
struct  is_integral< char32_t >
 
struct  is_integral< bool >
 
struct  workgroup_barrier
 

Typedefs

using index_t = int32_t
 
using long_index_t = int64_t
 
template<typename T >
using iter_value_t = typename std::iterator_traits< remove_cvref_t< T > >::value_type
 
template<typename T >
using iter_reference_t = decltype(*std::declval< T & >())
 
template<typename T >
using iter_difference_t = typename std::iterator_traits< remove_cvref_t< T > >::difference_type
 
template<typename TensorDesc >
using TensorCoordinate_t = decltype(make_tensor_coordinate(TensorDesc{}, MultiIndex< remove_cvref_t< TensorDesc >::GetNumOfDimension()>{}))
 
template<typename TensorDesc >
using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(TensorDesc{}, MultiIndex< remove_cvref_t< TensorDesc >::GetNumOfDimension()>{}))
 
using f8_fnuz_t = _BitInt(8)
 
using bf8_fnuz_t = unsigned _BitInt(8)
 
typedef unsigned char fp8_storage_t
 
using f8_t = f8_fnuz_t
 
using bf8_t = bf8_fnuz_t
 
template<index_t N>
using MultiIndex = Array< index_t, N >
 
using bhalf_t = ushort
 
using half_t = _Float16
 
using int4_t = _BitInt(4)
 
using f4_t = unsigned _BitInt(4)
 
using f6_t = _BitInt(6)
 
using bf6_t = unsigned _BitInt(6)
 
using f6x16_pk_t = f6_pk_t< f6_t, 16 >
 
using f6x32_pk_t = f6_pk_t< f6_t, 32 >
 
using bf6x16_pk_t = f6_pk_t< bf6_t, 16 >
 
using bf6x32_pk_t = f6_pk_t< bf6_t, 32 >
 
template<typename X , typename Y >
using has_same_scalar_type = is_same< typename scalar_type< remove_cvref_t< X > >::type, typename scalar_type< remove_cvref_t< Y > >::type >
 
template<typename T >
using element_type_t = typename packed_type_info< T >::element_type
 
template<typename T , index_t N = 0>
using packed_type_t = typename packed_type_maker< T, N >::packed_type
 
using int64_t = long
 
using double2_t = typename vector_type< double, 2 >::type
 
using double4_t = typename vector_type< double, 4 >::type
 
template<typename T , index_t N>
using vector_type_maker_t = typename vector_type_maker< T, N >::type
 
using float2_t = typename vector_type< float, 2 >::type
 
using float4_t = typename vector_type< float, 4 >::type
 
using float8_t = typename vector_type< float, 8 >::type
 
using float16_t = typename vector_type< float, 16 >::type
 
using float32_t = typename vector_type< float, 32 >::type
 
using float64_t = typename vector_type< float, 64 >::type
 
using half2_t = typename vector_type< half_t, 2 >::type
 
using half4_t = typename vector_type< half_t, 4 >::type
 
using half8_t = typename vector_type< half_t, 8 >::type
 
using half16_t = typename vector_type< half_t, 16 >::type
 
using half32_t = typename vector_type< half_t, 32 >::type
 
using bhalf2_t = typename vector_type< bhalf_t, 2 >::type
 
using bhalf4_t = typename vector_type< bhalf_t, 4 >::type
 
using bhalf8_t = typename vector_type< bhalf_t, 8 >::type
 
using bhalf16_t = typename vector_type< bhalf_t, 16 >::type
 
using bhalf32_t = typename vector_type< bhalf_t, 32 >::type
 
using int32x2_t = typename vector_type< int32_t, 2 >::type
 
using int32x4_t = typename vector_type< int32_t, 4 >::type
 
using int32x6_t = typename vector_type< int32_t, 6 >::type
 
using int32x8_t = typename vector_type< int32_t, 8 >::type
 
using int32x16_t = typename vector_type< int32_t, 16 >::type
 
using int32x32_t = typename vector_type< int32_t, 32 >::type
 
using int32x64_t = typename vector_type< int32_t, 64 >::type
 
using int8x2_t = typename vector_type< int8_t, 2 >::type
 
using int8x4_t = typename vector_type< int8_t, 4 >::type
 
using int8x8_t = typename vector_type< int8_t, 8 >::type
 
using int8x16_t = typename vector_type< int8_t, 16 >::type
 
using int8x32_t = typename vector_type< int8_t, 32 >::type
 
using int8x64_t = typename vector_type< int8_t, 64 >::type
 
using f8x2_fnuz_t = typename vector_type< f8_fnuz_t, 2 >::type
 
using f8x4_fnuz_t = typename vector_type< f8_fnuz_t, 4 >::type
 
using f8x8_fnuz_t = typename vector_type< f8_fnuz_t, 8 >::type
 
using f8x16_fnuz_t = typename vector_type< f8_fnuz_t, 16 >::type
 
using f8x32_fnuz_t = typename vector_type< f8_fnuz_t, 32 >::type
 
using f8x64_fnuz_t = typename vector_type< f8_fnuz_t, 64 >::type
 
using bf8x2_fnuz_t = typename vector_type< bf8_fnuz_t, 2 >::type
 
using bf8x4_fnuz_t = typename vector_type< bf8_fnuz_t, 4 >::type
 
using bf8x8_fnuz_t = typename vector_type< bf8_fnuz_t, 8 >::type
 
using bf8x16_fnuz_t = typename vector_type< bf8_fnuz_t, 16 >::type
 
using bf8x32_fnuz_t = typename vector_type< bf8_fnuz_t, 32 >::type
 
using bf8x64_fnuz_t = typename vector_type< bf8_fnuz_t, 64 >::type
 
using f8x2_ocp_t = typename vector_type< f8_ocp_t, 2 >::type
 
using f8x4_ocp_t = typename vector_type< f8_ocp_t, 4 >::type
 
using f8x8_ocp_t = typename vector_type< f8_ocp_t, 8 >::type
 
using f8x16_ocp_t = typename vector_type< f8_ocp_t, 16 >::type
 
using f8x32_ocp_t = typename vector_type< f8_ocp_t, 32 >::type
 
using f8x64_ocp_t = typename vector_type< f8_ocp_t, 64 >::type
 
using bf8x2_ocp_t = typename vector_type< bf8_ocp_t, 2 >::type
 
using bf8x4_ocp_t = typename vector_type< bf8_ocp_t, 4 >::type
 
using bf8x8_ocp_t = typename vector_type< bf8_ocp_t, 8 >::type
 
using bf8x16_ocp_t = typename vector_type< bf8_ocp_t, 16 >::type
 
using bf8x32_ocp_t = typename vector_type< bf8_ocp_t, 32 >::type
 
using bf8x64_ocp_t = typename vector_type< bf8_ocp_t, 64 >::type
 
using uint8x2_t = typename vector_type< uint8_t, 2 >::type
 
using uint8x4_t = typename vector_type< uint8_t, 4 >::type
 
using uint8x8_t = typename vector_type< uint8_t, 8 >::type
 
using uint8x16_t = typename vector_type< uint8_t, 16 >::type
 
using uint8x32_t = typename vector_type< uint8_t, 32 >::type
 
using uint8x64_t = typename vector_type< uint8_t, 64 >::type
 
using f4x2_t = typename vector_type< f4x2_pk_t, 1 >::type
 
using f4x4_t = typename vector_type< f4x2_pk_t, 2 >::type
 
using f4x8_t = typename vector_type< f4x2_pk_t, 4 >::type
 
using f4x16_t = typename vector_type< f4x2_pk_t, 8 >::type
 
using f4x32_t = typename vector_type< f4x2_pk_t, 16 >::type
 
using f4x64_t = typename vector_type< f4x2_pk_t, 32 >::type
 
using f6x16_t = typename vector_type< f6x16_pk_t, 1 >::type
 
using f6x16x2_t = typename vector_type< f6x16_pk_t, 2 >::type
 
using f6x32_t = typename vector_type< f6x32_pk_t, 1 >::type
 
using bf6x16_t = typename vector_type< bf6x16_pk_t, 1 >::type
 
using bf6x16x2_t = typename vector_type< bf6x16_pk_t, 2 >::type
 
using bf6x32_t = typename vector_type< bf6x32_pk_t, 1 >::type
 
using e8m0x4_bexp_t = typename vector_type< e8m0_bexp_t, 4 >::type
 
using pk_i4x2_t = typename vector_type< pk_i4_t, 2 >::type
 
using pk_i4x4_t = typename vector_type< pk_i4_t, 4 >::type
 
using pk_i4x8_t = typename vector_type< pk_i4_t, 8 >::type
 
template<bool B, typename T = void>
using enable_if = std::enable_if< B, T >
 
template<bool B, typename T = void>
using enable_if_t = typename std::enable_if< B, T >::type
 
template<bool predicate, class X , class Y >
using conditional_t = typename conditional< predicate, X, Y >::type
 
template<bool B>
using bool_constant = integral_constant< bool, B >
 
using true_type = bool_constant< true >
 
using false_type = bool_constant< false >
 
template<template< class... > class Op, class... Args>
using is_detected = typename detail::detector< nonesuch, void, Op, Args... >::value_t
 
template<typename T >
using is_pack2_invocable_t = decltype(ck::declval< T & >().is_pack2_invocable)
 
template<typename T >
using is_pack4_invocable_t = decltype(ck::declval< T & >().is_pack4_invocable)
 
template<typename T >
using is_pack8_invocable_t = decltype(ck::declval< T & >().is_pack8_invocable)
 
template<index_t N>
using Number = integral_constant< index_t, N >
 
template<index_t N>
using LongNumber = integral_constant< long_index_t, N >
 
template<index_t N>
using make_index_sequence = typename __make_integer_seq< impl::__integer_sequence, index_t, N >::seq_type
 
template<typename Sx , typename Sy >
using sequence_merge_t = typename sequence_merge< Sx, Sy >::type
 
template<index_t NSize, index_t I>
using uniform_sequence_gen_t = typename uniform_sequence_gen< NSize, I >::type
 
template<typename T , index_t N>
using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl< T, N >::type
 
template<index_t I, typename TTuple >
using tuple_element_t = typename tuple_element< I, TTuple >::type
 
template<typename T >
using is_tuple = decltype(ck::declval< T & >().IsTuple())
 
template<typename T >
using remove_reference_t = typename remove_reference< T >::type
 
template<typename T >
using remove_cv_t = typename remove_cv< T >::type
 
template<typename T >
using remove_cvref_t = remove_cv_t< remove_reference_t< T > >
 
template<typename T >
using remove_pointer_t = typename remove_pointer< T >::type
 

Enumerations

enum class  InMemoryDataOperationEnum {
  Set ,
  AtomicAdd ,
  AtomicMax ,
  Add
}
 
enum  StreamKReductionStrategy {
  Atomic = 0 ,
  Reduction
}
 
enum class  PipelineVersion {
  v1 ,
  v2 ,
  v4 ,
  weight_only
}
 
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
 
enum class  DppInstr {
  dpp8_f16_1x32x2 = 0 ,
  dpp8_f16_2x16x2 ,
  dpp8_f16_2x32x2 ,
  dpp8_f16_4x16x2 ,
  dpp8_f16_4x32x2 ,
  dpp8_f16_8x16x2 ,
  dpp8_f16_8x32x2 ,
  dpp8_f16_16x16x2 ,
  dpp8_f16_32x8x2
}
 
enum class  SmfmacInstr {
  smfmac_f32_16x16x32f16 = 0 ,
  smfmac_f32_32x32x16f16 ,
  smfmac_f32_16x16x32bf16 ,
  smfmac_f32_32x32x16bf16
}
 
enum class  WmmaInstr {
  wmma_f32_16x16x16_f16 = 0 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_f16_16x16x16_f16 ,
  wmma_bf16_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_i32_16x16x16_iu4 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12
}
 
enum class  MfmaInstr {
  mfma_f32_32x32x1xf32 = 0 ,
  mfma_f32_16x16x1xf32 ,
  mfma_f32_4x4x1xf32 ,
  mfma_f32_32x32x2xf32 ,
  mfma_f32_16x16x4xf32 ,
  mfma_f32_32x32x4f16 ,
  mfma_f32_16x16x4f16 ,
  mfma_f32_4x4x4f16 ,
  mfma_f32_32x32x8f16 ,
  mfma_f32_16x16x16f16 ,
  mfma_f32_32x32x8bf16_1k ,
  mfma_f32_16x16x16bf16_1k ,
  mfma_f32_32x32x4bf16 ,
  mfma_f32_16x16x8bf16 ,
  mfma_i32_32x32x8i8 ,
  mfma_i32_16x16x16i8 ,
  mfma_i32_32x32x16i8 ,
  mfma_i32_16x16x32i8 ,
  mfma_f64_16x16x4f64 ,
  mfma_f32_32x32x16f8f8 ,
  mfma_f32_16x16x32f8f8 ,
  mfma_f32_32x32x16bf8bf8 ,
  mfma_f32_16x16x32bf8bf8 ,
  mfma_f32_32x32x16f8bf8 ,
  mfma_f32_16x16x32f8bf8 ,
  mfma_f32_32x32x16bf8f8 ,
  mfma_f32_16x16x32bf8f8 ,
  mfma_f32_32x32x16f16 ,
  mfma_f32_16x16x32f16 ,
  mfma_f32_32x32x16bf16 ,
  mfma_f32_16x16x32bf16 ,
  mfma_i32_32x32x32i8 ,
  mfma_i32_16x16x64i8 ,
  mfma_f32_32x32x64f8f6f4 ,
  mfma_f32_16x16x128f8f6f4 ,
  mfma_scale_f32_32x32x64f8f6f4 ,
  mfma_scale_f32_16x16x128f8f6f4 ,
  wmma_f32_16x16x16_f16 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_unsupport_16x16_gfx11 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12 ,
  wmma_unsupport_16x16_gfx12
}
 
enum class  AddressSpaceEnum {
  Generic ,
  Global ,
  Lds ,
  Sgpr ,
  Vgpr
}
 
enum class  AmdBufferCoherenceEnum {
  DefaultCoherence = 0 ,
  GLC = 1 ,
  SLC = 2 ,
  GLC_SLC = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
 
enum class  ck_fp8_interpretation_t {
  CK_E4M3_OCP = 0 ,
  CK_E5M2_OCP = 1 ,
  CK_E4M3_FNUZ = 2 ,
  CK_E5M2_FNUZ = 3
}
 Describes FP8 interpretation.
 
enum class  ck_saturation_t {
  CK_NOSAT = 0 ,
  CK_SATFINITE = 1
}
 Describes saturation behavior.
 
enum class  BlockGemmPipelineVersion {
  v1 ,
  v2 ,
  v3 ,
  v4 ,
  v5
}
 
enum class  BlockGemmPipelineScheduler {
  Intrawave ,
  Interwave
}
 
enum class  TailNumber {
  Odd ,
  Even ,
  One ,
  Two ,
  Three ,
  Four ,
  Five ,
  Six ,
  Seven ,
  Empty ,
  Full
}
 
enum  SchedulerGroup : uint32_t {
  SCHED_GROUP_MFMA = 0x008 ,
  SCHED_GROUP_VMEM = 0x020 ,
  SCHED_GROUP_LDS_READ = 0x100 ,
  SCHED_GROUP_LDS_WRITE = 0x200
}
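
The SCHED_GROUP_* values are bit masks for instruction classes (MFMA, VMEM, LDS read/write) that CK feeds to the AMDGPU scheduling intrinsic to interleave instruction groups in GEMM hot loops. A hedged sketch, assuming a ROCm compiler that provides __builtin_amdgcn_sched_group_barrier:

    __device__ void interleave_mfma_and_vmem()
    {
        // request one MFMA, then one VMEM instruction, in sync group 0
        __builtin_amdgcn_sched_group_barrier(ck::SCHED_GROUP_MFMA, 1, 0);
        __builtin_amdgcn_sched_group_barrier(ck::SCHED_GROUP_VMEM, 1, 0);
    }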
 
enum class  f8_rounding_mode {
  standard ,
  stochastic
}
 
enum class  LoopScheduler {
  Default ,
  Interwave
}
 
enum class  ReduceTensorOp {
  ADD = 0 ,
  MUL = 1 ,
  MIN = 2 ,
  MAX = 3 ,
  AMAX = 4 ,
  AVG = 5 ,
  NORM1 = 6 ,
  NORM2 = 7
}
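
ReduceTensorOp names the reduction kind; the enums that follow qualify how NaNs and argmin/argmax indices are handled. As an illustration only (not CK's implementation), the per-element accumulation step for the simple ops is:

    // Illustrative only: a binary accumulation step for a few ops.
    template <ck::ReduceTensorOp Op>
    float reduce_step(float acc, float v)
    {
        if constexpr(Op == ck::ReduceTensorOp::ADD) return acc + v;
        else if constexpr(Op == ck::ReduceTensorOp::MUL) return acc * v;
        else if constexpr(Op == ck::ReduceTensorOp::MAX) return v > acc ? v : acc;
        else return acc; // AVG, AMAX, NORM1, NORM2 need extra pre/post steps
    }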
 
enum class  NanPropagation {
  NOT_PROPAGATE_NAN = 0 ,
  PROPAGATE_NAN = 1
}
 
enum class  ReduceTensorIndices {
  NO_INDICES = 0 ,
  FLATTENED_INDICES = 1
}
 
enum class  IndicesType {
  INDICES_32BIT = 0 ,
  INDICES_64BIT = 1 ,
  INDICES_16BIT = 2 ,
  INDICES_8BIT = 3
}
 

Functions

constexpr unsigned int fnv1a_hash (std::string_view str, unsigned int h=2166136261u)
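
Because fnv1a_hash is constexpr (FNV-1a with the standard 2166136261u offset basis), it can serve as a compile-time string switch, e.g. for dispatching on get_device_name(). The device names below are illustrative:

    #include <string_view>

    // Sketch: constexpr hashing enables switch-on-string dispatch.
    void dispatch(std::string_view device)
    {
        switch(ck::fnv1a_hash(device))
        {
        case ck::fnv1a_hash("gfx90a"): /* MI200-class path */ break;
        case ck::fnv1a_hash("gfx942"): /* MI300-class path */ break;
        default: /* generic path */ break;
        }
    }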
 
std::string get_device_name ()
 
bool is_gfx12_supported ()
 
bool is_gfx11_supported ()
 
bool is_xdl_supported ()
 
bool is_lds_direct_load_supported ()
 
bool is_bf16_atomic_supported ()
 
bool is_gfx101_supported ()
 
bool is_gfx103_supported ()
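
These host-side queries gate instance selection by architecture. A plausible usage sketch (the backend names here are purely illustrative):

    #include <string>

    std::string pick_gemm_backend()
    {
        if(ck::is_gfx12_supported()) return "wmma_gfx12";
        if(ck::is_gfx11_supported()) return "wmma_gfx11";
        if(ck::is_xdl_supported())   return "xdl";
        return "dl"; // generic fallback
    }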
 
template<typename T , typename ForwardIterator , typename Size , typename BinaryOperation >
auto accumulate_n (ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op))
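
accumulate_n folds the first count elements, i.e. it is std::accumulate over [first, first + count). For example, computing an element count from a length vector:

    #include <functional>
    #include <vector>

    long element_count(const std::vector<int>& lengths)
    {
        // 1L * lengths[0] * ... * lengths[count - 1]
        return ck::accumulate_n(lengths.begin(), lengths.size(), 1L,
                                std::multiplies<>{});
    }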
 
unsigned int get_available_cpu_cores ()
 
template<typename... In, typename... Wei, typename... Out, typename ConvStrides , typename ConvDilations , typename InLeftPads , typename InRightPads , index_t GemmK1Value>
__host__ constexpr __device__ auto transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad (const TensorDescriptor< In... > &in_grid_desc_n_di_hi_wi_c, const TensorDescriptor< Wei... > &wei_k_z_y_x_c_grid_desc, const TensorDescriptor< Out... > &out_n_do_ho_wo_k_grid_desc, const ConvStrides &conv_strides, const ConvDilations &conv_dilations, const InLeftPads &in_left_pads, const InRightPads &in_right_pads, Number< GemmK1Value >)
 
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ constexpr __device__ auto make_static_tensor (TensorDesc)
 
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename X , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto make_static_tensor (TensorDesc, X invalid_element_value)
 
template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ constexpr __device__ auto make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
 
template<typename LowLength >
__host__ constexpr __device__ auto make_pass_through_transform (const LowLength &low_length)
 
template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
 
template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
 
template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
 
template<typename UpLengths , typename Coefficients , typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ constexpr __device__ auto make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v1_carry_check (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v2_magic_division (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v3_division_mod (const LowLengths &low_lengths)
 
template<typename UpLengths , bool Use24BitIntegerCalculation = false>
__host__ constexpr __device__ auto make_unmerge_transform (const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
 
template<typename LowerIndex >
__host__ constexpr __device__ auto make_freeze_transform (const LowerIndex &low_idx)
 
template<typename UpperIndex >
__host__ constexpr __device__ auto make_insert_transform (const UpperIndex &up_idx)
 
template<typename LowLength , typename SliceBegin , typename SliceEnd >
__host__ constexpr __device__ auto make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end)
 
template<typename VectorSize , typename UpLength >
__host__ constexpr __device__ auto make_vectorize_transform (const VectorSize &vector_size, const UpLength &up_length)
 
template<typename Modulus , typename UpLength >
__host__ constexpr __device__ auto make_modulo_transform (const Modulus &modulus, const UpLength &up_length)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_xor_with_modulo_transform (const LowLengths &low_lengths)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_xor_transform (const LowLengths &low_lengths)
 
template<typename TensorAdaptor0 , typename TensorAdaptor1 >
__host__ constexpr __device__ auto chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
 
template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
__host__ constexpr __device__ auto make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
 
template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
__host__ constexpr __device__ auto transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
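
transform_tensor_descriptor rewrites a descriptor through a tuple of the transforms above, pairing each transform with the lower dimension ids it consumes and the upper ids it produces. A hedged sketch (ck::Sequence, ck::make_tuple, and ck::Number are assumed from the broader CK utility headers), flattening a 4x8 descriptor into 32 elements:

    using namespace ck;

    // packed 4x8 descriptor (make_naive_tensor_descriptor_packed is listed below)
    constexpr auto desc_4x8 =
        make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<8>{}));

    // merge lower dims {0, 1} into a single length-32 upper dim 0
    constexpr auto desc_32 = transform_tensor_descriptor(
        desc_4x8,
        make_tuple(make_merge_transform(make_tuple(Number<4>{}, Number<8>{}))),
        make_tuple(Sequence<0, 1>{}),
        make_tuple(Sequence<0>{}));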
 
template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto make_tensor_coordinate (const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
 
template<typename TensorDesc , typename VisibleIndex , typename UpdateLowerIndexHack >
__host__ constexpr __device__ auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
 
template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible)
 
template<typename TensorDesc , typename TensorCoord , typename TensorCoordStep >
__host__ constexpr __device__ void move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
 
template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool coordinate_has_valid_offset_assuming_visible_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord)
 
template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord)
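
The coordinate helpers turn a multi-index into an incrementally maintained offset: make_tensor_coordinate resolves an index once, make_tensor_coordinate_step precomputes an index delta, and move_tensor_coordinate applies it. A sketch under the same assumptions as above (ck::make_multi_index from the CK utilities):

    __device__ void coordinate_walk()
    {
        using namespace ck;
        constexpr auto desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<8>{}));

        auto coord      = make_tensor_coordinate(desc, make_multi_index(1, 2));
        const auto step = make_tensor_coordinate_step(desc, make_multi_index(0, 1));

        move_tensor_coordinate(desc, coord, step); // now at (1, 3)
        const bool ok = coordinate_has_valid_offset(desc, coord);
        (void)ok;
    }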
 
template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ constexpr __device__ auto make_naive_tensor_descriptor (const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
 
template<typename... Lengths>
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed (const Tuple< Lengths... > &lengths)
 
template<typename... Lengths, typename Align >
__host__ constexpr __device__ auto make_naive_tensor_descriptor_aligned (const Tuple< Lengths... > &lengths, Align align)
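
The three factories differ only in how strides are obtained: given explicitly, derived packed (right to left from the lengths), or derived with padding to an alignment. Two equivalent row-major 4x8 descriptors:

    using namespace ck;

    // explicit strides {8, 1}
    constexpr auto desc_a =
        make_naive_tensor_descriptor(make_tuple(Number<4>{}, Number<8>{}),
                                     make_tuple(Number<8>{}, Number<1>{}));

    // packed: strides computed from the lengths, giving the same {8, 1}
    constexpr auto desc_b =
        make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<8>{}));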
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeTypeA , typename ComputeTypeB , typename AccDataType , typename AWmmaTileDesc , typename BWmmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmABScalePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXNBSPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXPipeline_Selector ()
 
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXPipeline_Selector ()
 
template<index_t BlockSize, typename FloatA , typename FloatB , typename FloatAcc , typename AK0MK1BlockDesc , typename BK0NK1BlockDesc , index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ()
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3 (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds (BatchedGemmArg karg)
 
template<typename GridwiseGemm , typename AsPointer , typename BsPointer , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AsGridDesc_AK0_M_AK1 , typename BsGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_abd_xdl_cshuffle (AsPointer p_as_grid, BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1, const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseElementwiseReduction , typename InDataTypePointerTuple , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename AccDataType , typename XElementwiseOperation , typename YElementwiseOperation , typename InGrid2dDescTuple , typename GridDesc_M_K >
__global__ void kernel_elementwise_layernorm (const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op)
 
template<typename GridwiseGemm , typename ABDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename DsGridDesc_M0_M10_M11_N0_N10_N11 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_multiple_d (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemmWelford , typename ABDataType , typename DsPointer , typename EMeanVarDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock , typename CountGridDescriptor_MBlock_MPerBlock_NBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EMeanVarDataType *__restrict__ p_e_grid, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock, const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock, const Block2ETileMap block_2_etile_map, index_t NRaw)
 
template<typename GridwiseWelfordLayernorm , typename EMeanVarDataType , typename HDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename EHGridDesc_M_N , typename LayernormMeanVarGridDesc_M_NBlock , typename LayernormCountGridDesc_M_NBlock , typename GammaBetaGridDesc_N , typename HElementwiseOperation >
__global__ void kernel_welford_layernorm2d_second_half (const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename FloatRsPointer , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename QsElementwiseOperation , typename RsElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename RsGridDescriptor_MBlock_MPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_multiple_r_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, FloatRsPointer p_rs_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const QsElementwiseOperation qs_element_op, const RsElementwiseOperation rs_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ABDataType , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename EElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_waveletmodel_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const EElementwiseOperation e_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<typename GridwiseGemm , typename ContractionMultiDKernelArg , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , bool HasMainKBlockLoop>
__global__ void kernel_grouped_contraction_multiple_d_xdl_cshuffle (const void CK_CONSTANT_ADDRESS_SPACE *contraction_args, const index_t group_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op)
 
template<typename GridwiseWelford , typename XDataType , typename WorkspaceMeanVarDataType , typename ComputeDataType , typename XGridDesc_M_K , typename MeanVarGridDesc_M_KBlock >
__global__ void kernel_normalizationSplitK1st (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, WorkspaceMeanVarDataType *const __restrict__ p_welford_mean, WorkspaceMeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count)
 
template<typename GridwiseWelfordNormalization , typename WorkspaceMeanVarDataType , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename MeanVarGridDesc_M_KBlock , typename CountGridDesc_M_KBlock , typename XYGammaBetaGridDesc_M_K , typename SaveMeanInvStdGridDesc_M >
__global__ void kernel_normalizationSplitK2nd (const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, const CountGridDesc_M_KBlock count_grid_desc_m_kblock, const XYGammaBetaGridDesc_M_K x_grid_desc_m_k, const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k, const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k, const XYGammaBetaGridDesc_M_K y_grid_desc_m_k, const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m, const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m, index_t num_k_mean_var_count_iteration, index_t num_k_block_tile_iteration, index_t k_grid_size, ComputeDataType epsilon, const WorkspaceMeanVarDataType *const p_mean_global, const WorkspaceMeanVarDataType *const p_variance_global, const int32_t *const p_welford_count_global, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AKB_AK0_M_AK1 , typename BGridDesc_BKB_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
 
__device__ half4_t i4_to_half4 (int q)
 
__device__ half4_t i4_to_half4_scale (int q, const ck::half2_t &scale)
 
__device__ f8x4_t i4_to_f8x4 (int q)
 
__device__ f8x8_t i4_to_fp8x8 (int q)
 
__device__ bhalf4_t i4_to_bhalf4 (int q)
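
These device helpers unpack 4-bit integers from a packed int into half, f8, or bf16 vectors; i4_to_half4_scale additionally multiplies by a half2 scale for dequantization. Usage is a single call on the packed word; the packing layout is whatever the CK kernels that call these helpers use:

    // Sketch: dequantize one packed word of int4 weights to fp16.
    __device__ ck::half4_t dequant(int packed, ck::half2_t scale)
    {
        return ck::i4_to_half4_scale(packed, scale);
    }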
 
template<typename GridwiseMultiblockBatchNormForward_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_multiblock_batchnorm_forward (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, MeanVarDataType *const __restrict__ p_welford_mean, MeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count, int32_t *const __restrict__ p_control, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_ , typename XDataType , typename DyDataType , typename DxDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename DscaleDbiasGridDesc_M_K , typename MeanVarGridDesc_M , typename ScaleBiasGridDesc_M >
__global__ void kernel_reduce_second_half_batchnorm_backward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
 
template<typename GridwiseMultiblockWelfordFirstHalf_ , typename XDataType , typename MeanVarDataType , typename XGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_multiblock_welford_first_half (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x, MeanVarDataType *const p_welford_mean, MeanVarDataType *const p_welford_variance, int32_t *const p_welford_count)
 
template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M >
__global__ void kernel_welford_second_half_batchnorm_forward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, AccDataType epsilon, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseWelfordSecondHalfReduceFirstHalf_ , typename XDataType , typename DyDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename MeanVarGridDesc_M , typename MeanVarCountGridDesc_M_K , typename DscaleDbiasGridDesc_M_G >
__global__ void kernel_welford_second_half_reduce_first_half (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, index_t num_mean_var_count_k_block_tile_iteration, AccDataType epsilon, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const DyElementwiseOp dy_elementwise_op, MeanVarDataType *const __restrict__ p_out_welford_mean, MeanVarDataType *const __restrict__ p_out_welford_inv_variance, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, DscaleDbiasDataType *const __restrict__ p_reduce_dscale, DscaleDbiasDataType *const __restrict__ p_reduce_dbias)
 
template<typename CTileIdx , typename CTileDim >
__host__ __device__ bool DefaultValidCTileIndex (const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim)
 
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void kernel_multiple_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, index_t block_group_size, index_t num_k_block_tile_iteration, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
 
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void kernel_multiple_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
 
template<typename GridwiseReduction , bool OutputIndex, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void kernel_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
 
template<typename GridwiseReduction , bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void kernel_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
 
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename InGridDesc_M_K , typename DsGridDesc_M , typename OutGridDesc_M , typename InElementwiseOperation , typename OutElementwiseOperation , typename DsGridPointer >
__global__ void kernel_reduce_threadwise_multi_d (const InGridDesc_M_K in_grid_desc_m_k, const DsGridDesc_M ds_grid_desc_m, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const OutElementwiseOperation out_elementwise_op, const InDataType *const __restrict__ p_in_value_global, const DsGridPointer p_ds_value_global, OutDataType *const __restrict__ p_out_value_global)
 
template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_ , typename XDataType , typename DyDataType , typename DxDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_batchnorm_backward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, long_index_t reduce_size, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
 
template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void kernel_batchnorm_forward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
 
template<typename GridwiseElementwise1dFunctor , typename InGrid1dDescTuple , typename OutGrid1dDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename ElementwiseOperation , typename UnaryOperation , typename Scale >
__global__ void kernel_elementwise_1d (const InGrid1dDescTuple in_grid_1d_desc_tuple, const OutGrid1dDescTuple out_grid_1d_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const ElementwiseOperation elementwise_op, const UnaryOperation unary_op, const Scale scale_op)
 
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation >
__global__ void kernel_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op)
 
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation >
__global__ void kernel_elementwise_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size)
 
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation , index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void kernel_elementwise_batched_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b)
 
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation , index_t NumInputs, index_t NumOutputs>
__global__ void kernel_batched_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename ScaleDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename ScaleGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_fpAintB_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, const ScaleDataType *__restrict__ p_scale_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const ScaleGridDesc scale_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename FloatC1 , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename C1ElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_bias_grid, const FloatC1 *__restrict__ p_d0_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const C1ElementwiseOperation c1_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock, const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_v1r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_dpp (const typename GridwiseGemm::Argument karg)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2CTileMap , typename ComputePtrOffsetOfBatch , bool HasMainKBlockLoop>
__global__ void kernel_grouped_conv_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename ComputePtrOffsetOfBatch , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map)
 
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_mupltipe_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
 
template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
constexpr auto GridwiseGemmPipeline_Selector ()
 
template<index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto GridwiseGemmPipeline_v1_Selector ()
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, typename GridwiseGemm::Problem problem)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void kernel_gemm_xdl_cshuffle_v2 (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v2 (const FloatA *p_a_grid, const FloatB *p_b_grid, FloatC *p_c_grid, typename GridwiseGemm::Problem problem)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
 
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
 
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename AElementwiseOperation , typename BElementwiseOperation , typename AccElementwiseOperation , typename CElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_NBlock_NPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_layernorm_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const AccElementwiseOperation acc_element_op, const CElementwiseOperation c_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, const Block2CTileMap block_2_ctile_map)
 
template<typename LowLengths >
__host__ constexpr __device__ auto make_merge_transform_v4_no_carry (const LowLengths &low_lengths)
 
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , typename AGridDesc_B_K0_M_K1 , typename BGridDesc_B_K0_N_K1 , typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_bwd_weight (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 , typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_skip_b_lds_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void kernel_gemm_xdlops_splitk_lds_direct_load (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
 
template<typename GridwiseGemm >
__global__ void kernel_gemm_xdlops_streamk (const typename GridwiseGemm::FloatAB *p_a_grid, const typename GridwiseGemm::FloatAB *p_b_grid, typename GridwiseGemm::FloatC *p_c_grid, void *p_workspace, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, typename GridwiseGemm::Block2CTileMap block_mapping)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ABK0MK1GridDesc , typename BBK0NK1GridDesc , typename CM0N0M1N1M2M3M4N2GridDesc , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r4 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void kernel_gemm_xdlops_v2r4r2_simplified (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_v3r1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r2 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const FloatC *__restrict__ p_c1_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm_2lds (typename GridwiseGemm::Argument karg)
 
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm (typename GridwiseGemm::Argument karg)
 
template<typename GridwisePermute , typename InGridDesc , typename OutGridDesc , typename InDataType , typename OutDataType , typename ElementwiseOperation , typename Block2TileMap >
__global__ void kernel_nd_permute (const InGridDesc in_grid_desc, const OutGridDesc out_grid_desc, const InDataType *p_in_global, OutDataType *p_out_global, const ElementwiseOperation elementwise_op, const Block2TileMap block_2_tile_map)
 
template<typename GridwisePutElementwise1dFunctor , typename InGrid1dDesc , typename InDataType , typename IndexDataType , typename OutDataType , typename ElementwiseOperation >
__global__ void kernel_put_element_1d (const InGrid1dDesc in_grid_1d_desc, const InDataType *__restrict__ p_in_global, const IndexDataType *__restrict__ p_indices_global, OutDataType *__restrict__ p_out_global, const ElementwiseOperation elementwise_op)
 
template<index_t BlockSize, typename DataType , typename Grid1dBufferDescType >
__global__ void kernel_buffer_set_value (const Grid1dBufferDescType grid_1d_buffer_desc, DataType *const __restrict__ p_global, DataType value)
 
template<typename Grid1dBufferDescTuple , index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple , typename DataTypeTuple >
__global__ void kernel_multiple_buffer_set_value (const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, DataTypePointerTuple p_global_tuple, DataTypeTuple value_tuple)
 
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K >
__global__ void kernel_softmax (const GridDesc_M_K in_grid_desc_m_k, const GridDesc_M_K out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global)
 
template<typename GridwiseSparseEmbedding , typename EmbType , typename IndexType , typename GammaDataType , typename BetaDataType , typename AccDataType , typename OutType , typename OutGridDesc , typename EmbElementwiseOperation , ck::index_t NumEmbeddings>
__global__ void kernel_sparse_embeddings_forward_layernorm (OutType *p_out, const ck::Array< EmbType *, NumEmbeddings > p_embs, const ck::Array< IndexType *, NumEmbeddings > p_indexes, const GammaDataType *p_gamma, const BetaDataType *p_beta, const OutGridDesc out_grid_desc, const AccDataType epsilon, const EmbElementwiseOperation emb_elementwise_op)
 
template<typename InputGridDesc , typename InputDataType , typename OutputGridDesc , typename OutputDataType , typename Block2ETileMap , typename ComputePtrOffsetOfStridedBatch , typename GridwiseTensorRearrangeKernel >
__global__ void kernel_tensor_rearrange (const InputGridDesc in_grid_desc, const InputDataType *__restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType *__restrict__ p_out_global, const index_t batch_count, const Block2ETileMap block_2_tile_map, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 
template<typename GridwiseReduction , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M >
__global__ void kernel_normalization (const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, const GridDesc_M save_mean_grid_desc_m, const GridDesc_M save_inv_std_grid_desc_m, index_t num_k_block_tile_iteration, ComputeDataType epsilon, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
 
template<typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto NormalizationKernelSelector (bool isSweepOnce)
 
template<typename T >
__device__ T * cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p)
 
template<typename T >
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space (T *p)
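
A minimal sketch of round-tripping a pointer through the two casts above, assuming it is compiled as HIP device code with the ck headers available; the helper name load_via_constant is illustrative only.

    #include "ck/utility/common_header.hpp" // path assumed

    // Park a generic pointer in the constant address space, then recover a
    // generic pointer so ordinary loads can be used on it.
    template <typename T>
    __device__ T load_via_constant(T* p)
    {
        auto p_const   = ck::cast_pointer_to_constant_address_space(p);
        auto p_generic = ck::cast_pointer_to_generic_address_space(p_const);
        return *p_generic;
    }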
 
template<typename T >
__device__ int32x4_t make_wave_buffer_resource (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ int32x4_t make_wave_buffer_resource_with_default_range (T *p_wave)
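
A sketch pairing make_wave_buffer_resource with one of the raw buffer-load intrinsics listed below; the kernel name and launch shape are illustrative, and the header path is an assumption. The resource descriptor covers element_space_size elements, so lanes whose offset falls outside it read zero rather than faulting.

    #include "ck/utility/common_header.hpp" // path assumed

    __global__ void copy_f32(const float* p_src, float* p_dst, ck::index_t n)
    {
        // Build the 128-bit buffer resource covering n floats at p_src.
        // The wrapper takes a mutable pointer, hence the const_cast.
        ck::int32x4_t rsrc =
            ck::make_wave_buffer_resource(const_cast<float*>(p_src), n);

        const ck::index_t i = ck::get_thread_global_1d_id();
        // voffset/soffset are byte offsets; the last argument is the
        // glc/slc cache-policy flag (0 = default).
        float v = ck::llvm_amdgcn_raw_buffer_load_fp32(
            rsrc, i * static_cast<ck::index_t>(sizeof(float)), 0, 0);
        if(i < n)
            p_dst[i] = v;
    }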
 
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
 
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
 
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
 
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
 
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
 
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
 
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
 
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
 
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
 
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
 
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
 
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
 
__device__ float llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
 
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
 
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
 
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
 
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
 
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
 
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
 
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
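
amd_buffer_store pairs a vector store with a per-lane validity flag, so out-of-range lanes become no-ops instead of faults. A sketch with store width N = 1 (names and header path assumed):

    #include "ck/utility/common_header.hpp" // path assumed

    __global__ void fill_f32(float* p_dst, ck::index_t n, float value)
    {
        const ck::index_t i = ck::get_thread_global_1d_id();
        // Lanes with i >= n are declared invalid and store nothing.
        ck::amd_buffer_store<float, 1>(value, p_dst, i, i < n, n);
    }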
 
__device__ void llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
 
template<typename T , index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new (T *p_wave)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (f8_ocp_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (bf8_ocp_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (f8_fnuz_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_nan (bf8_fnuz_t a)
 
template<>
__host__ constexpr __device__ bool fp8_is_inf (bf8_ocp_t a)
 
__device__ int amd_assembly_and_b32 (int a, int b)
 
__device__ int amd_assembly_and_or_b32 (int a, int b, int d)
 
__device__ half2_t amd_assembly_pk_fma_f16 (half2_t a, half2_t b, half2_t c)
 
__device__ half2_t amd_assembly_pk_add_f16 (half2_t a, half2_t b)
 
__device__ float amd_assemble_cvt_f32_i4 (int b)
 
__device__ f8x4_t amd_assembly_cvt_f8_to_f32 (float b0, float b1, float b2, float b3)
 
__device__ f8x8_t amd_assembly_i4_to_fp8x8 (int a)
 
__device__ void amd_assembly_outer_product_1x2 (float a, float b0, float b1, float &c0, float &c1)
 
__device__ void amd_assembly_outer_product_1x4 (float a, float b0, float b1, float b2, float b3, float &c0, float &c1, float &c2, float &c3)
 
__device__ void amd_assembly_outer_product_1x2 (half2_t a, half2_t b0, half2_t b1, float &c0, float &c1)
 
__device__ void amd_assembly_outer_product_1x4 (half2_t a, half2_t b0, half2_t b1, half2_t b2, half2_t b3, float &c0, float &c1, float &c2, float &c3)
 
__device__ void amd_assembly_outer_product_1x2 (int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t &c0, int32_t &c1)
 
__device__ void amd_assembly_outer_product_1x4 (int8x4_t a, int8x4_t b0, int8x4_t b1, int8x4_t b2, int8x4_t b3, int32_t &c0, int32_t &c1, int32_t &c2, int32_t &c3)
 
__device__ uint32_t amd_wave_read_first_lane (uint32_t value)
 
__device__ int32_t amd_wave_read_first_lane (int32_t value)
 
__device__ int64_t amd_wave_read_first_lane (int64_t value)
 
template<typename Object , typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto amd_wave_read_first_lane (const Object &obj)
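
amd_wave_read_first_lane broadcasts lane 0's value across the wave, which lets the compiler keep the result in scalar registers. A minimal sketch (function name illustrative):

    #include "ck/utility/common_header.hpp" // path assumed

    __device__ ck::index_t make_wave_uniform(ck::index_t v)
    {
        // All lanes return lane 0's v; useful for wave-uniform loop bounds.
        return ck::amd_wave_read_first_lane(v);
    }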
 
template<typename X , typename... Xs>
__host__ constexpr __device__ auto make_array (X &&x, Xs &&... xs)
 
template<typename X >
__host__ constexpr __device__ auto make_array ()
 
template<typename... Xs>
__host__ constexpr __device__ auto make_multi_index (Xs &&... xs)
 
template<index_t NSize>
__host__ constexpr __device__ auto make_zero_multi_index ()
 
template<typename T >
__host__ constexpr __device__ auto to_multi_index (const T &x)
 
template<index_t NSize, typename X >
__host__ constexpr __device__ auto operator+= (MultiIndex< NSize > &y, const X &x)
 
template<index_t NSize, typename X >
__host__ constexpr __device__ auto operator-= (MultiIndex< NSize > &y, const X &x)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator+ (const MultiIndex< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator- (const MultiIndex< NSize > &a, const T &b)
 
template<index_t NSize, typename T >
__host__ constexpr __device__ auto operator* (const MultiIndex< NSize > &a, const T &b)
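
MultiIndex values built with make_multi_index support the element-wise operators above on both host and device. A host-side sketch (umbrella header path assumed):

    #include "ck/utility/common_header.hpp" // path assumed

    int main()
    {
        auto idx = ck::make_multi_index(1, 2, 3);
        idx += ck::make_multi_index(10, 10, 10); // element-wise: {11, 12, 13}
        const auto next = idx - ck::make_multi_index(0, 0, 1);
        (void)next;
        return 0;
    }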
 
template<typename PY , typename PX , typename enable_if< is_pointer_v< PY > &&is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY c_style_pointer_cast (PX p_x)
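
c_style_pointer_cast is a reinterpret-style cast constrained by enable_if so that both sides must be pointer types. A sketch:

    #include "ck/utility/common_header.hpp" // path assumed

    __device__ void inspect_bits(float* p)
    {
        // Reinterpret float* as uint32_t*; non-pointer targets would not compile.
        uint32_t* bits = ck::c_style_pointer_cast<uint32_t*>(p);
        (void)bits;
    }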
 
template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto operator+= (ContainerElementPicker< Arr, Picks > &y, const X &x)
 
template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto operator-= (ContainerElementPicker< Arr, Picks > &y, const X &x)
 
template<typename Arr , typename Picks >
__host__ constexpr __device__ auto pick_container_element (Arr &a, Picks)
 
template<typename Arr , typename Picks >
__host__ constexpr __device__ auto pick_container_element (const Arr &a, Picks)
 
template<typename TData , index_t NSize>
__host__ constexpr __device__ auto container_push_back (const Array< TData, NSize > &a, const TData &x)
 
template<typename... Ts, typename T >
__host__ constexpr __device__ auto container_push_front (const Tuple< Ts... > &a, const T &x)
 
template<typename... Ts, typename T >
__host__ constexpr __device__ auto container_push_back (const Tuple< Ts... > &a, const T &x)
 
template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (const Array< TData, NSize > &old_array, Sequence< IRs... >)
 
template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
 
template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (const Tuple< Ts... > &old_tuple, Sequence< IRs... >)
 
template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (const Tuple< Ts... > &old_tuple, Sequence< IRs... > old2new)
 
template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_new2old (Sequence< Is... >, Sequence< IRs... >)
 
template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto container_reorder_given_old2new (Sequence< Is... > old_seq, Sequence< IRs... >)
 
template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ constexpr __device__ auto container_reduce (const Container &x, Reduce reduce, Init init, Number< IBegin > = Number< 0 >{}, Number< IEnd > = Number< Container::Size()>{}, Number< IStep > = Number< 1 >{})
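
container_reduce folds a container with a binary functor; the Number<> bounds default to the full range. A host-side sketch computing the element count of a tile (header path assumed):

    #include "ck/utility/common_header.hpp" // path assumed

    constexpr auto lengths = ck::make_array(2, 3, 4);
    // Product of the lengths: 24 elements in a 2x3x4 tile.
    constexpr auto n_elems =
        ck::container_reduce(lengths, [](auto a, auto b) { return a * b; }, 1);
    static_assert(n_elems == 24, "");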
 
template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto container_reverse_inclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
 
template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
 
template<index_t... Is, typename Reduce , index_t Init>
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Sequence< Is... > &seq, Reduce f, Number< Init >)
 
template<typename... Xs, typename Reduce , typename Init >
__host__ constexpr __device__ auto container_reverse_exclusive_scan (const Tuple< Xs... > &x, Reduce reduce, Init init)
 
template<typename... Xs, typename Reduce , typename TData >
__host__ constexpr __device__ auto container_reverse_inclusive_scan (const Tuple< Xs... > &x, Reduce f, TData init)
 
template<typename X , typename... Ys>
__host__ constexpr __device__ auto container_concat (const X &x, const Ys &... ys)
 
template<typename T , index_t NX, index_t NY>
__host__ constexpr __device__ auto container_concat (const Array< T, NX > &ax, const Array< T, NY > &ay)
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto container_concat (const Tuple< X... > &tx, const Tuple< Y... > &ty)
 
template<typename Container >
__host__ constexpr __device__ auto container_concat (const Container &x)
 
template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ auto get_container_subset (const Array< T, N > &arr, Sequence< Is... >)
 
template<typename... Ts, index_t... Is>
__host__ constexpr __device__ auto get_container_subset (const Tuple< Ts... > &tup, Sequence< Is... >)
 
template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ void set_container_subset (Array< T, N > &y, Sequence< Is... > picks, const Array< T, sizeof...(Is)> &x)
 
template<typename... Ys, index_t... Is, typename... Xs>
__host__ constexpr __device__ void set_container_subset (Tuple< Ys... > &y, Sequence< Is... > picks, const Tuple< Xs... > &x)
 
template<index_t... Is>
__host__ constexpr __device__ auto sequence_to_tuple_of_number (Sequence< Is... >)
 
constexpr auto next_pow2 (uint32_t x)
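
next_pow2 rounds an unsigned value up to a power of two, handy when padding scratch-buffer sizes. A one-line sketch (expected value assumes round-up-to-at-least-x semantics):

    #include "ck/utility/common_header.hpp" // path assumed

    // e.g. 150 -> 256 under round-up semantics.
    constexpr uint32_t padded = ck::next_pow2(150u);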
 
template<typename T >
constexpr bool is_native_type ()
 
template<typename T , index_t N>
__host__ constexpr __device__ auto make_vector_type (Number< N >)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto make_long_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
 
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize , typename X , typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size, X invalid_element_value)
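
make_dynamic_buffer wraps a raw pointer with address-space-aware, bounds-checked access; the three-argument overload substitutes a custom value for invalid elements. A device-side construction sketch (names and header path assumed):

    #include "ck/utility/common_header.hpp" // path assumed

    __global__ void wrap_buffer(const float* p, ck::index_t n)
    {
        // Invalid/out-of-range reads through this buffer yield 0.f.
        auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
            const_cast<float*>(p), n, 0.f);
        (void)buf;
    }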
 
template<class EnvVar >
const std::string & EnvGetString (EnvVar)
 
template<class EnvVar >
bool EnvIsEnabled (EnvVar)
 
template<class EnvVar >
bool EnvIsDisabled (EnvVar)
 
template<class EnvVar >
uint64_t EnvValue (EnvVar)
 
template<class EnvVar >
bool EnvIsUnset (EnvVar)
 
template<class EnvVar >
void EnvUnset (EnvVar)
 
template<typename EnvVar , typename ValueType >
void UpdateEnvVar (EnvVar, const ValueType &val)
 updates the cached value of an environment variable
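
UpdateEnvVar changes the cached value only; whether the process environment itself is touched is implementation-defined, so treat the cache as the source of truth after calling it.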
 
template<typename EnvVar >
void UpdateEnvVar (EnvVar, const std::string_view &val)
 
__host__ int clz (uint32_t x)
 
template<bool predicate, typename X , typename Y >
constexpr auto conditional_expr (X &&x, Y &&y)
 
template<typename F , typename X >
__host__ constexpr __device__ auto unpack (F &&f, X &&x)
 
template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto unpack2 (F &&f, X &&x, Y &&y)
 
template<typename X >
__device__ X atomic_add (X *p_dst, const X &x)
 
template<>
__device__ int32_t atomic_add< int32_t > (int32_t *p_dst, const int32_t &x)
 
template<>
__device__ uint32_t atomic_add< uint32_t > (uint32_t *p_dst, const uint32_t &x)
 
template<>
__device__ float atomic_add< float > (float *p_dst, const float &x)
 
template<>
__device__ unsigned short atomic_add< unsigned short > (unsigned short *p_dst, const unsigned short &x)
 
template<>
__device__ _Float16 atomic_add< _Float16 > (_Float16 *p_dst, const _Float16 &x)
 
template<>
__device__ double atomic_add< double > (double *p_dst, const double &x)
 
template<>
__device__ float2_t atomic_add< float2_t > (float2_t *p_dst, const float2_t &x)
 
template<>
__device__ double2_t atomic_add< double2_t > (double2_t *p_dst, const double2_t &x)
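
atomic_add dispatches to the per-type specializations above and returns the old value. A sketch of accumulating partial sums (kernel name illustrative):

    #include "ck/utility/common_header.hpp" // path assumed

    __global__ void accumulate(float* p_sum, const float* p_x, ck::index_t n)
    {
        const ck::index_t i = ck::get_thread_global_1d_id();
        if(i < n)
            ck::atomic_add(p_sum, p_x[i]); // float specialization
    }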
 
template<typename X >
__device__ X atomic_max (X *p_dst, const X &x)
 
template<>
__device__ int32_t atomic_max< int32_t > (int32_t *p_dst, const int32_t &x)
 
template<>
__device__ uint32_t atomic_max< uint32_t > (uint32_t *p_dst, const uint32_t &x)
 
template<>
__device__ float atomic_max< float > (float *p_dst, const float &x)
 
template<>
__device__ double atomic_max< double > (double *p_dst, const double &x)
 
template<>
__device__ float2_t atomic_max< float2_t > (float2_t *p_dst, const float2_t &x)
 
__host__ constexpr __device__ index_t get_warp_size ()
 
__device__ index_t get_thread_local_1d_id ()
 
__device__ index_t get_thread_global_1d_id ()
 
__device__ index_t get_warp_local_1d_id ()
 
__device__ index_t get_block_1d_id ()
 
__device__ index_t get_grid_size ()
 
__device__ index_t get_block_size ()
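
The launch-geometry helpers mirror the HIP built-ins but return ck::index_t; a sketch printing them from one thread (compiled as HIP device code):

    #include "ck/utility/common_header.hpp" // path assumed

    __global__ void report_geometry()
    {
        if(ck::get_thread_global_1d_id() == 0)
            printf("grid_size %d, block_size %d, warp_size %d\n",
                   static_cast<int>(ck::get_grid_size()),
                   static_cast<int>(ck::get_block_size()),
                   static_cast<int>(ck::get_warp_size()));
    }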
 
template<>
constexpr __device__ index_t get_shift< 1 > ()
 
template<typename TA , typename TB , typename TC >
__device__ void inner_product (const TA &a, const TB &b, TC &c)
 
template<>
__device__ void inner_product< float, float, float > (const float &a, const float &b, float &c)
 
template<>
__device__ void inner_product< float2_t, float2_t, float > (const float2_t &a, const float2_t &b, float &c)
 
template<>
__device__ void inner_product< float4_t, float4_t, float > (const float4_t &a, const float4_t &b, float &c)
 
template<>
__device__ void inner_product< bhalf_t, bhalf_t, float > (const bhalf_t &a, const bhalf_t &b, float &c)
 
template<>
__device__ void inner_product< half_t, half_t, float > (const half_t &a, const half_t &b, float &c)
 
template<>
__device__ void inner_product< half2_t, half2_t, float > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inner_product< half4_t, half4_t, float > (const half4_t &a, const half4_t &b, float &c)
 
template<>
__device__ void inner_product< half8_t, half8_t, float > (const half8_t &a, const half8_t &b, float &c)
 
template<>
__device__ void inner_product< int8_t, int8_t, int32_t > (const int8_t &a, const int8_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x2_t, int8x2_t, int32_t > (const int8x2_t &a, const int8x2_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x4_t, int8x4_t, int32_t > (const int8x4_t &a, const int8x4_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x8_t, int8x8_t, int32_t > (const int8x8_t &a, const int8x8_t &b, int32_t &c)
 
template<>
__device__ void inner_product< int8x16_t, int8x16_t, int32_t > (const int8x16_t &a, const int8x16_t &b, int32_t &c)
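
inner_product accumulates a short dot product into c, dispatching to packed instructions per the specializations above. A sketch for four fp16 lanes (function name illustrative):

    #include "ck/utility/common_header.hpp" // path assumed

    __device__ float dot4_f16(ck::half4_t a, ck::half4_t b)
    {
        float acc = 0.f;
        // acc += a[0]*b[0] + ... + a[3]*b[3], via the half4_t specialization.
        ck::inner_product<ck::half4_t, ck::half4_t, float>(a, b, acc);
        return acc;
    }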
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator+ (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator- (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator* (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator/ (integral_constant< TX, X >, integral_constant< TY, Y >)
 
template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto operator% (integral_constant< TX, X >, integral_constant< TY, Y >)
 
constexpr LoopScheduler make_default_loop_scheduler ()
 
template<typename Y , typename X >
__host__ constexpr __device__ Y mxf8_convert_sr (X x, float scale)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y mxf8_convert_rne (X x, float scale)
 
template<>
__host__ __device__ f8_ocp_t mxf8_convert_rne< f8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_rne< bf8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_rne< f8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_rne< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_rne< f8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_rne< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ f8_ocp_t mxf8_convert_sr< f8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_sr< bf8_ocp_t, float > (float x, float scale)
 
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
 
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_sr< f8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_sr< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
 
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_sr< f8x32_ocp_t, float32_t > (float32_t x, float scale)
 
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_sr< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
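
The MX-format converters quantize floats to OCP FP8/BF8 with round-to-nearest-even (rne) or stochastic rounding (sr), applying the given scale before quantizing. A host-side sketch with scale = 1 (header path assumed):

    #include "ck/utility/common_header.hpp" // path assumed

    int main()
    {
        const float x = 3.14159f;
        // With scale = 1.0f this is a plain RNE float -> FP8 conversion.
        const ck::f8_ocp_t q = ck::mxf8_convert_rne<ck::f8_ocp_t>(x, 1.0f);
        (void)q;
        return 0;
    }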
 
template<typename T , uint32_t seed_t, ck::enable_if_t< std::is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t prand_generator (index_t id, T val, uint32_t seed=seed_t)
 
template<typename T , uint32_t seed_t, ck::enable_if_t<!(std::is_same< float, T >{}||std::is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t prand_generator (int id, T val, uint32_t seed=seed_t)
 
template<typename Y , typename X >
constexpr __host__ Y scaled_type_convert (e8m0_bexp_t scale, X x)
 
template<>
__host__ float scaled_type_convert< float, f8_ocp_t > (e8m0_bexp_t scale, f8_ocp_t x)
 
template<>
__host__ float scaled_type_convert< float, bf8_ocp_t > (e8m0_bexp_t scale, bf8_ocp_t x)
 
template<>
__host__ float2_t scaled_type_convert< float2_t, f8x2_ocp_t > (e8m0_bexp_t scale, f8x2_ocp_t x)
 
template<>
__host__ float2_t scaled_type_convert< float2_t, bf8x2_ocp_t > (e8m0_bexp_t scale, bf8x2_ocp_t x)
 
template<>
__host__ float16_t scaled_type_convert< float16_t, f8x16_ocp_t > (e8m0_bexp_t scale, f8x16_ocp_t x)
 
template<>
__host__ float16_t scaled_type_convert< float16_t, bf8x16_ocp_t > (e8m0_bexp_t scale, bf8x16_ocp_t x)
 
template<>
__host__ float32_t scaled_type_convert< float32_t, f8x32_ocp_t > (e8m0_bexp_t scale, f8x32_ocp_t x)
 
template<>
__host__ float32_t scaled_type_convert< float32_t, bf8x32_ocp_t > (e8m0_bexp_t scale, bf8x32_ocp_t x)
 
template<>
__host__ f8_ocp_t scaled_type_convert< f8_ocp_t, float > (e8m0_bexp_t scale, float x)
 
template<>
__host__ bf8_ocp_t scaled_type_convert< bf8_ocp_t, float > (e8m0_bexp_t scale, float x)
 
template<>
__host__ f8x2_ocp_t scaled_type_convert< f8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
 
template<>
__host__ bf8x2_ocp_t scaled_type_convert< bf8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
 
template<>
__host__ f8x16_ocp_t scaled_type_convert< f8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
 
template<>
__host__ bf8x16_ocp_t scaled_type_convert< bf8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
 
template<>
__host__ f8x32_ocp_t scaled_type_convert< f8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
 
template<>
__host__ bf8x32_ocp_t scaled_type_convert< bf8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
 
template<index_t I, index_t... Is>
__host__ constexpr __device__ auto sequence_pop_front (Sequence< I, Is... >)
 
template<typename Seq >
__host__ constexpr __device__ auto sequence_pop_back (Seq)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ bool operator== (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator+ (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator- (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator* (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator/ (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto operator% (Sequence< Xs... >, Sequence< Ys... >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator+ (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator- (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator* (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator/ (Sequence< Xs... >, Number< Y >)
 
template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto operator% (Sequence< Xs... >, Number< Y >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator+ (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator- (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator* (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator/ (Number< Y >, Sequence< Xs... >)
 
template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto operator% (Number< Y >, Sequence< Xs... >)
 
template<typename... Seqs>
__host__ constexpr __device__ auto merge_sequences (Seqs...)
 
template<typename F , index_t... Xs>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >)
 
template<typename F , index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >)
 
template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
__host__ constexpr __device__ auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >, Sequence< Zs... >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto reverse_inclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto reverse_exclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto inclusive_scan_sequence (Seq, Reduce, Number< Init >)
 
template<typename Seq , index_t... Is>
__host__ constexpr __device__ auto pick_sequence_elements_by_ids (Seq, Sequence< Is... >)
 
template<typename Seq , typename Mask >
__host__ constexpr __device__ auto pick_sequence_elements_by_mask (Seq, Mask)
 
template<typename Seq , typename Values , typename Ids >
__host__ constexpr __device__ auto modify_sequence_elements_by_ids (Seq, Values, Ids)
 
template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ index_t reduce_on_sequence (Seq, Reduce f, Number< Init >)
 
template<typename Seq , typename F >
__host__ constexpr __device__ bool sequence_any_of (Seq, F f)
 
template<typename Seq , typename F >
__host__ constexpr __device__ bool sequence_all_of (Seq, F f)
 
template<index_t... Is>
__host__ constexpr __device__ auto make_sequence (Number< Is >...)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_sequence (F, Number< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_sequence_v2 (F &&f, Number< N >)
 
template<index_t... Is>
__host__ constexpr __device__ auto to_sequence (Tuple< Number< Is >... >)
 
template<AddressSpaceEnum AddressSpace, typename T , index_t N>
__host__ constexpr __device__ auto make_static_buffer (Number< N >)
 
template<AddressSpaceEnum AddressSpace, typename T , long_index_t N>
__host__ constexpr __device__ auto make_static_buffer (LongNumber< N >)
 
template<typename X , typename... Xs>
__host__ constexpr __device__ auto make_statically_indexed_array (const X &x, const Xs &... xs)
 
template<typename X >
__host__ constexpr __device__ auto make_statically_indexed_array ()
 
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto operator+= (Tuple< Ys... > &y, const X &x)
 
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto operator-= (Tuple< Ys... > &y, const X &x)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator+ (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator- (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (const Tuple< Xs... > &x, const Y &y)
 
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (Y a, const Tuple< Xs... > &x)
 
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto operator* (const Tuple< Xs... > &x, Y a)
 
template<typename... Xs>
__host__ __device__ void print_multi_index (const Tuple< Xs... > &x)
 
__device__ void block_sync_lds ()
 
__device__ void block_sync_lds_direct_load ()
 
__device__ void s_nop ()
 
__device__ void transpose_fp16_2x2 (const half2_t &x0, const half2_t &x1, half2_t &y0, half2_t &y1)
 
__device__ void transpose_int8_4x4 (const int8x4_t &x0, const int8x4_t &x1, const int8x4_t &x2, const int8x4_t &x3, int8x4_t &y0, int8x4_t &y1, int8x4_t &y2, int8x4_t &y3)
 
__device__ void transpose_f8_4x4 (const f8x4_t &x0, const f8x4_t &x1, const f8x4_t &x2, const f8x4_t &x3, f8x4_t &y0, f8x4_t &y1, f8x4_t &y2, f8x4_t &y3)
 
template<typename... Xs>
__host__ constexpr __device__ auto make_tuple (Xs &&... xs)
 
template<typename... Args>
constexpr Tuple< Args &... > tie (Args &... args) noexcept
 
template<typename F , index_t... ids>
__host__ constexpr __device__ auto generate_tuple_for (F &&f, Sequence< ids... >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tuple (F &&f, Number< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tuple (F &&f, LongNumber< N >)
 
template<typename F , index_t N>
__host__ constexpr __device__ auto generate_tie (F &&f, Number< N >)
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto concat_tuple_of_reference (const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
 
template<typename... X, typename... Y>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx, const Tuple< Y... > &ty)
 
template<typename... X>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx)
 
template<typename... X, typename... Tuples>
__host__ constexpr __device__ auto concat_tuple (const Tuple< X... > &tx, const Tuples &... tuples)
 
template<typename F , typename X >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x)
 
template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x, const Y &y)
 
template<typename F , typename X , typename Y , typename Z >
__host__ constexpr __device__ auto transform_tuples (F f, const X &x, const Y &y, const Z &z)
 
template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ constexpr __device__ auto UnrollNestedTuple (const Tuple<> &element)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
__host__ constexpr __device__ auto UnrollNestedTuple (const T &element)
 
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ constexpr __device__ auto UnrollNestedTuple (const Tuple< Ts... > &tuple)
 
template<typename... Ts>
__host__ constexpr __device__ auto TupleReverse (const Tuple< Ts... > &tuple)
 
template<index_t Idx, index_t End, typename F , typename... Ts>
__host__ constexpr __device__ auto TupleReduce (F &&f, const Tuple< Ts... > &tuple)
 
template<typename... Ts>
__host__ constexpr __device__ auto IsNestedTuple (const Tuple< Ts... > &)
 
template<index_t depth = 0, typename T >
__host__ constexpr __device__ auto TupleDepth (const T &)
 
template<index_t depth = 0, typename... Ts>
__host__ constexpr __device__ auto TupleDepth (const Tuple< Ts... > &)
 
template<index_t from, index_t to, typename... Ts>
__host__ constexpr __device__ auto TupleSlice (const Tuple< Ts... > &tuple)
 
template<typename Y , typename X , typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ constexpr __device__ Y bit_cast (const X &x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y bf16_convert_rtn (X x)
 
template<>
__host__ constexpr __device__ bhalf_t bf16_convert_rtn< bhalf_t, float > (float x)
 
template<>
__host__ constexpr __device__ bhalf_t bf16_convert_rtn< bhalf_t, half_t > (half_t x)
 
template<typename Y , typename X , ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ constexpr __device__ Y type_convert (X x)
 
template<>
__host__ constexpr __device__ float type_convert< float, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, float > (float x)
 
template<>
__host__ constexpr __device__ half_t type_convert< half_t, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, half_t > (half_t x)
 
template<>
__host__ constexpr __device__ int8_t type_convert< int8_t, bhalf_t > (bhalf_t x)
 
template<>
__host__ constexpr __device__ bhalf_t type_convert< bhalf_t, int8_t > (int8_t x)
 
template<>
__host__ constexpr __device__ f8_ocp_t type_convert< f8_ocp_t, int > (int x)
 
template<>
__host__ constexpr __device__ bf8_ocp_t type_convert< bf8_ocp_t, int > (int x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y type_convert_sp (X x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, float > (float x)
 
template<>
__host__ constexpr __device__ float type_convert_sp< float, int > (int x)
 
template<>
__host__ constexpr __device__ int type_convert_sp< int, half_t > (half_t x)
 
template<>
__host__ constexpr __device__ half_t type_convert_sp< half_t, int > (int x)
 
template<typename Y , typename X >
__host__ constexpr __device__ Y f8_convert_sr (X x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
 
template<typename Y , typename X >
__host__ constexpr __device__ Y f8_convert_rne (X x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using round-to-nearest-even. More...
 
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ float type_convert< float, f8_fnuz_t > (f8_fnuz_t x)
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_fnuz_t > (f8x2_fnuz_t x)
 
template<>
__host__ __device__ float type_convert< float, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a float value. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 float values. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a half_t value. More...
 
template<>
__host__ __device__ half2_t type_convert< half2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values. More...
 
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a bhalf_t value. More...
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values. More...
 
template<>
__host__ __device__ float type_convert< float, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a float value. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a half_t value. More...
 
template<>
__host__ __device__ half2_t type_convert< half2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values. More...
 
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a bhalf_t value. More...
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values. More...
 
template<>
__host__ __device__ float2_t type_convert< float2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ half2_t type_convert< half2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, pk_i4_t > (pk_i4_t x)
 
template<>
__host__ __device__ half2_t type_convert< half2_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ half_t type_convert< half_t, f8_fnuz_t > (f8_fnuz_t x)
 
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, float > (float x)
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, float > (float x)
 Converts a float value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, float > (float x)
 Converts a float value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag. More...
 
template<>
__host__ __device__ float type_convert< float, bf8_fnuz_t > (bf8_fnuz_t x)
 
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, half_t > (half_t x)
 
template<>
__host__ __device__ half_t type_convert< half_t, bf8_fnuz_t > (bf8_fnuz_t x)
 
__host__ __device__ f4_t f4_convert_rne (float x, float scale=1.0f)
 
__host__ __device__ f4x2_t f4_convert_rne (float2_t x, float scale=1.0f)
 
__host__ __device__ f4_t f4_convert_sr (float x, float scale=1.0f)
 
__host__ __device__ f4x2_t f4_convert_sr (float2_t x, float scale=1.0f)
 
template<>
__host__ __device__ f4_t type_convert< f4_t, float > (float x)
 
template<>
__host__ __device__ f4x2_t type_convert< f4x2_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f4x2_pk_t type_convert< f4x2_pk_t, float2_t > (float2_t x)
 
template<>
__host__ __device__ f4x32_t type_convert< f4x32_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ float type_convert< float, f4_t > (f4_t x)
 
template<>
__host__ __device__ float2_t type_convert< float2_t, f4x2_t > (f4x2_t x)
 
template<>
__host__ __device__ float32_t type_convert< float32_t, f4x32_t > (f4x32_t x)
 
__host__ __device__ f6_t f6_convert_rne (float x, float scale=1.0f)
 Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even. More...
 
__host__ __device__ f6x32_t f6_convert_rne (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation. More...
 
__host__ __device__ f6_t f6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding. More...
 
__host__ __device__ f6x32_t f6_convert_sr (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation. More...
 
template<>
__host__ __device__ f6_t type_convert< f6_t, float > (float x)
 Specializes the type conversion template for converting a float into the 6-bit float type (f6_t). More...
 
template<>
__host__ __device__ f6x32_t type_convert< f6x32_t, float32_t > (float32_t x)
 Specializes the type conversion template for converting a vector of 32 floats into the vector of 32 6-bit float types (f6x32_t). More...
 
template<>
__host__ __device__ f6x32_pk_t type_convert< f6x32_pk_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ f6x16_t type_convert< f6x16_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ f6x16_pk_t type_convert< f6x16_pk_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ float type_convert< float, f6_t > (f6_t x)
 Specializes the type conversion template for converting the 6-bit float type (f6_t) to float. More...
 
template<>
__host__ __device__ float32_t type_convert< float32_t, f6x32_t > (f6x32_t x)
 Specializes the type conversion template for converting the vector of 32 6-bit float types (f6x32_t) to a vector of 32 floats. More...
 
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_t > (f6x16_t x)
 
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_pk_t > (f6x16_pk_t x)
 
__host__ __device__ bf6_t bf6_convert_rne (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using round-to-nearest-even. More...
 
__host__ __device__ bf6x32_t bf6_convert_rne (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even. More...
 
__host__ __device__ bf6_t bf6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using stochastic rounding. More...
 
__host__ __device__ bf6x32_t bf6_convert_sr (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding. More...
 
template<>
__host__ __device__ bf6_t type_convert< bf6_t, float > (float x)
 Specializes float-to-bf6_t conversion. More...
 
template<>
__host__ __device__ bf6x32_t type_convert< bf6x32_t, float32_t > (float32_t x)
 Specializes the conversion of a vector of 32 floats to a vector of 32 bf6_t values (bf6x32_t). More...
 
template<>
__host__ __device__ bf6x32_pk_t type_convert< bf6x32_pk_t, float32_t > (float32_t x)
 
template<>
__host__ __device__ bf6x16_t type_convert< bf6x16_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ bf6x16_pk_t type_convert< bf6x16_pk_t, float16_t > (float16_t x)
 
template<>
__host__ __device__ float type_convert< float, bf6_t > (bf6_t x)
 Specializes the type conversion template for converting a bf6_t value to float. More...
 
template<>
__host__ __device__ float32_t type_convert< float32_t, bf6x32_t > (bf6x32_t x)
 Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats. More...
 
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_t > (bf6x16_t x)
 
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_pk_t > (bf6x16_pk_t x)
 
template<typename Y , typename X , size_t NumElems>
__host__ __device__ void array_convert (std::array< Y, NumElems > &y, const std::array< X, NumElems > &x)
 
template<typename Y , typename X , index_t NumElems>
__host__ __device__ void array_convert (Array< Y, NumElems > &y, const Array< X, NumElems > &x)
 

Variables

template<typename T >
constexpr index_t packed_size_v = packed_type_info<T>::packed_size
 
template<typename T >
constexpr bool is_packed_type_v = packed_size_v<T> > 1
 
constexpr detail::ignore_t ignore
 
template<typename X , typename Y >
constexpr bool is_same_v = is_same<X, Y>::value
 
template<typename X , typename Y >
constexpr bool is_base_of_v = is_base_of<X, Y>::value
 
template<typename T >
constexpr bool is_unsigned_v = is_unsigned<T>::value
 
template<typename T >
constexpr bool is_pointer_v = is_pointer<T>::value
 

Detailed Description

Definitions from <cstdint>, <cmath> conflict with /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.

Typedef Documentation

◆ bf6_t

using ck::bf6_t = typedef unsigned _BitInt(6)

◆ bf6x16_pk_t

using ck::bf6x16_pk_t = typedef f6_pk_t<bf6_t, 16>

◆ bf6x16_t

using ck::bf6x16_t = typedef typename vector_type<bf6x16_pk_t, 1>::type

◆ bf6x16x2_t

using ck::bf6x16x2_t = typedef typename vector_type<bf6x16_pk_t, 2>::type

◆ bf6x32_pk_t

using ck::bf6x32_pk_t = typedef f6_pk_t<bf6_t, 32>

◆ bf6x32_t

using ck::bf6x32_t = typedef typename vector_type<bf6x32_pk_t, 1>::type

◆ bf8_fnuz_t

using ck::bf8_fnuz_t = typedef unsigned _BitInt(8)

◆ bf8_t

using ck::bf8_t = typedef bf8_fnuz_t

◆ bf8x16_fnuz_t

using ck::bf8x16_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 16>::type

◆ bf8x16_ocp_t

using ck::bf8x16_ocp_t = typedef typename vector_type<bf8_ocp_t, 16>::type

◆ bf8x2_fnuz_t

using ck::bf8x2_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 2>::type

◆ bf8x2_ocp_t

using ck::bf8x2_ocp_t = typedef typename vector_type<bf8_ocp_t, 2>::type

◆ bf8x32_fnuz_t

using ck::bf8x32_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 32>::type

◆ bf8x32_ocp_t

using ck::bf8x32_ocp_t = typedef typename vector_type<bf8_ocp_t, 32>::type

◆ bf8x4_fnuz_t

using ck::bf8x4_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 4>::type

◆ bf8x4_ocp_t

using ck::bf8x4_ocp_t = typedef typename vector_type<bf8_ocp_t, 4>::type

◆ bf8x64_fnuz_t

using ck::bf8x64_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 64>::type

◆ bf8x64_ocp_t

using ck::bf8x64_ocp_t = typedef typename vector_type<bf8_ocp_t, 64>::type

◆ bf8x8_fnuz_t

using ck::bf8x8_fnuz_t = typedef typename vector_type<bf8_fnuz_t, 8>::type

◆ bf8x8_ocp_t

using ck::bf8x8_ocp_t = typedef typename vector_type<bf8_ocp_t, 8>::type

◆ bhalf16_t

using ck::bhalf16_t = typedef typename vector_type<bhalf_t, 16>::type

◆ bhalf2_t

using ck::bhalf2_t = typedef typename vector_type<bhalf_t, 2>::type

◆ bhalf32_t

using ck::bhalf32_t = typedef typename vector_type<bhalf_t, 32>::type

◆ bhalf4_t

using ck::bhalf4_t = typedef typename vector_type<bhalf_t, 4>::type

◆ bhalf8_t

using ck::bhalf8_t = typedef typename vector_type<bhalf_t, 8>::type

◆ bhalf_t

using ck::bhalf_t = typedef ushort

◆ bool_constant

template<bool B>
using ck::bool_constant = typedef integral_constant<bool, B>

◆ conditional_t

template<bool predicate, class X , class Y >
using ck::conditional_t = typedef typename conditional<predicate, X, Y>::type

◆ double2_t

using ck::double2_t = typedef typename vector_type<double, 2>::type

◆ double4_t

using ck::double4_t = typedef typename vector_type<double, 4>::type

◆ e8m0x4_bexp_t

using ck::e8m0x4_bexp_t = typedef typename vector_type<e8m0_bexp_t, 4>::type

◆ element_type_t

template<typename T >
using ck::element_type_t = typedef typename packed_type_info<T>::element_type

◆ enable_if

template<bool B, typename T = void>
using ck::enable_if = typedef std::enable_if<B, T>

◆ enable_if_t

template<bool B, typename T = void>
using ck::enable_if_t = typedef typename std::enable_if<B, T>::type

◆ f4_t

using ck::f4_t = typedef unsigned _BitInt(4)

◆ f4x16_t

using ck::f4x16_t = typedef typename vector_type<f4x2_pk_t, 8>::type

◆ f4x2_t

using ck::f4x2_t = typedef typename vector_type<f4x2_pk_t, 1>::type

◆ f4x32_t

using ck::f4x32_t = typedef typename vector_type<f4x2_pk_t, 16>::type

◆ f4x4_t

using ck::f4x4_t = typedef typename vector_type<f4x2_pk_t, 2>::type

◆ f4x64_t

using ck::f4x64_t = typedef typename vector_type<f4x2_pk_t, 32>::type

◆ f4x8_t

using ck::f4x8_t = typedef typename vector_type<f4x2_pk_t, 4>::type

◆ f6_t

using ck::f6_t = typedef _BitInt(6)

◆ f6x16_pk_t

using ck::f6x16_pk_t = typedef f6_pk_t<f6_t, 16>

◆ f6x16_t

using ck::f6x16_t = typedef typename vector_type<f6x16_pk_t, 1>::type

◆ f6x16x2_t

using ck::f6x16x2_t = typedef typename vector_type<f6x16_pk_t, 2>::type

◆ f6x32_pk_t

using ck::f6x32_pk_t = typedef f6_pk_t<f6_t, 32>

◆ f6x32_t

using ck::f6x32_t = typedef typename vector_type<f6x32_pk_t, 1>::type

◆ f8_fnuz_t

using ck::f8_fnuz_t = typedef _BitInt(8)

◆ f8_t

using ck::f8_t = typedef f8_fnuz_t

◆ f8x16_fnuz_t

using ck::f8x16_fnuz_t = typedef typename vector_type<f8_fnuz_t, 16>::type

◆ f8x16_ocp_t

using ck::f8x16_ocp_t = typedef typename vector_type<f8_ocp_t, 16>::type

◆ f8x2_fnuz_t

using ck::f8x2_fnuz_t = typedef typename vector_type<f8_fnuz_t, 2>::type

◆ f8x2_ocp_t

using ck::f8x2_ocp_t = typedef typename vector_type<f8_ocp_t, 2>::type

◆ f8x32_fnuz_t

using ck::f8x32_fnuz_t = typedef typename vector_type<f8_fnuz_t, 32>::type

◆ f8x32_ocp_t

using ck::f8x32_ocp_t = typedef typename vector_type<f8_ocp_t, 32>::type

◆ f8x4_fnuz_t

using ck::f8x4_fnuz_t = typedef typename vector_type<f8_fnuz_t, 4>::type

◆ f8x4_ocp_t

using ck::f8x4_ocp_t = typedef typename vector_type<f8_ocp_t, 4>::type

◆ f8x64_fnuz_t

using ck::f8x64_fnuz_t = typedef typename vector_type<f8_fnuz_t, 64>::type

◆ f8x64_ocp_t

using ck::f8x64_ocp_t = typedef typename vector_type<f8_ocp_t, 64>::type

◆ f8x8_fnuz_t

using ck::f8x8_fnuz_t = typedef typename vector_type<f8_fnuz_t, 8>::type

◆ f8x8_ocp_t

using ck::f8x8_ocp_t = typedef typename vector_type<f8_ocp_t, 8>::type

◆ false_type

using ck::false_type = typedef bool_constant<false>

◆ float16_t

using ck::float16_t = typedef typename vector_type<float, 16>::type

◆ float2_t

using ck::float2_t = typedef typename vector_type<float, 2>::type

◆ float32_t

using ck::float32_t = typedef typename vector_type<float, 32>::type

◆ float4_t

using ck::float4_t = typedef typename vector_type<float, 4>::type

◆ float64_t

using ck::float64_t = typedef typename vector_type<float, 64>::type

◆ float8_t

using ck::float8_t = typedef typename vector_type<float, 8>::type

◆ fp8_storage_t

typedef unsigned char ck::fp8_storage_t

◆ half16_t

using ck::half16_t = typedef typename vector_type<half_t, 16>::type

◆ half2_t

using ck::half2_t = typedef typename vector_type<half_t, 2>::type

◆ half32_t

using ck::half32_t = typedef typename vector_type<half_t, 32>::type

◆ half4_t

using ck::half4_t = typedef typename vector_type<half_t, 4>::type

◆ half8_t

using ck::half8_t = typedef typename vector_type<half_t, 8>::type

◆ half_t

using ck::half_t = typedef _Float16

◆ has_same_scalar_type

template<typename X , typename Y >
using ck::has_same_scalar_type = typedef is_same<typename scalar_type<remove_cvref_t<X> >::type, typename scalar_type<remove_cvref_t<Y> >::type>

◆ index_t

using ck::index_t = typedef int32_t

◆ int32x16_t

using ck::int32x16_t = typedef typename vector_type<int32_t, 16>::type

◆ int32x2_t

using ck::int32x2_t = typedef typename vector_type<int32_t, 2>::type

◆ int32x32_t

using ck::int32x32_t = typedef typename vector_type<int32_t, 32>::type

◆ int32x4_t

using ck::int32x4_t = typedef typename vector_type<int32_t, 4>::type

◆ int32x64_t

using ck::int32x64_t = typedef typename vector_type<int32_t, 64>::type

◆ int32x6_t

using ck::int32x6_t = typedef typename vector_type<int32_t, 6>::type

◆ int32x8_t

using ck::int32x8_t = typedef typename vector_type<int32_t, 8>::type

◆ int4_t

using ck::int4_t = typedef _BitInt(4)

◆ int64_t

using ck::int64_t = typedef long

◆ int8x16_t

using ck::int8x16_t = typedef typename vector_type<int8_t, 16>::type

◆ int8x2_t

using ck::int8x2_t = typedef typename vector_type<int8_t, 2>::type

◆ int8x32_t

using ck::int8x32_t = typedef typename vector_type<int8_t, 32>::type

◆ int8x4_t

using ck::int8x4_t = typedef typename vector_type<int8_t, 4>::type

◆ int8x64_t

using ck::int8x64_t = typedef typename vector_type<int8_t, 64>::type

◆ int8x8_t

using ck::int8x8_t = typedef typename vector_type<int8_t, 8>::type

◆ is_detected

template<template< class... > class Op, class... Args>
using ck::is_detected = typedef typename detail::detector<nonesuch, void, Op, Args...>::value_t

◆ is_pack2_invocable_t

template<typename T >
using ck::is_pack2_invocable_t = typedef decltype(ck::declval<T&>().is_pack2_invocable)

◆ is_pack4_invocable_t

template<typename T >
using ck::is_pack4_invocable_t = typedef decltype(ck::declval<T&>().is_pack4_invocable)

◆ is_pack8_invocable_t

template<typename T >
using ck::is_pack8_invocable_t = typedef decltype(ck::declval<T&>().is_pack8_invocable)

◆ is_tuple

template<typename T >
using ck::is_tuple = typedef decltype(ck::declval<T&>().IsTuple())

◆ iter_difference_t

template<typename T >
using ck::iter_difference_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::difference_type

◆ iter_reference_t

template<typename T >
using ck::iter_reference_t = typedef decltype(*std::declval<T&>())

◆ iter_value_t

template<typename T >
using ck::iter_value_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::value_type

◆ long_index_t

using ck::long_index_t = typedef int64_t

◆ LongNumber

template<index_t N>
using ck::LongNumber = typedef integral_constant<long_index_t, N>

◆ make_index_sequence

template<index_t N>
using ck::make_index_sequence = typedef typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type

◆ MultiIndex

template<index_t N>
using ck::MultiIndex = typedef StaticallyIndexedArray<index_t, N>

◆ Number

template<index_t N>
using ck::Number = typedef integral_constant<index_t, N>

◆ packed_type_t

template<typename T , index_t N = 0>
using ck::packed_type_t = typedef typename packed_type_maker<T, N>::packed_type

◆ pk_i4x2_t

using ck::pk_i4x2_t = typedef typename vector_type<pk_i4_t, 2>::type

◆ pk_i4x4_t

using ck::pk_i4x4_t = typedef typename vector_type<pk_i4_t, 4>::type

◆ pk_i4x8_t

using ck::pk_i4x8_t = typedef typename vector_type<pk_i4_t, 8>::type

◆ remove_cv_t

template<typename T >
using ck::remove_cv_t = typedef typename remove_cv<T>::type

◆ remove_cvref_t

template<typename T >
using ck::remove_cvref_t = typedef remove_cv_t<remove_reference_t<T> >

◆ remove_pointer_t

template<typename T >
using ck::remove_pointer_t = typedef typename remove_pointer<T>::type

◆ remove_reference_t

template<typename T >
using ck::remove_reference_t = typedef typename remove_reference<T>::type

◆ sequence_merge_t

template<typename Sx , typename Sy >
using ck::sequence_merge_t = typedef typename sequence_merge<Sx, Sy>::type

◆ StaticallyIndexedArray

template<typename T , index_t N>
using ck::StaticallyIndexedArray = typedef typename detail::StaticallyIndexedArrayImpl<T, N>::type

◆ TensorCoordinate_t

template<typename TensorDesc >
using ck::TensorCoordinate_t = typedef decltype(make_tensor_coordinate( TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}))

◆ TensorCoordinateStep_t

template<typename TensorDesc >
using ck::TensorCoordinateStep_t = typedef decltype(make_tensor_coordinate_step( TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}))

◆ true_type

using ck::true_type = typedef bool_constant<true>

◆ tuple_element_t

template<index_t I, typename TTuple >
using ck::tuple_element_t = typedef typename tuple_element<I, TTuple>::type

◆ uint8x16_t

using ck::uint8x16_t = typedef typename vector_type<uint8_t, 16>::type

◆ uint8x2_t

using ck::uint8x2_t = typedef typename vector_type<uint8_t, 2>::type

◆ uint8x32_t

using ck::uint8x32_t = typedef typename vector_type<uint8_t, 32>::type

◆ uint8x4_t

using ck::uint8x4_t = typedef typename vector_type<uint8_t, 4>::type

◆ uint8x64_t

using ck::uint8x64_t = typedef typename vector_type<uint8_t, 64>::type

◆ uint8x8_t

using ck::uint8x8_t = typedef typename vector_type<uint8_t, 8>::type

◆ uniform_sequence_gen_t

template<index_t NSize, index_t I>
using ck::uniform_sequence_gen_t = typedef typename uniform_sequence_gen<NSize, I>::type

◆ vector_type_maker_t

template<typename T , index_t N>
using ck::vector_type_maker_t = typedef typename vector_type_maker<T, N>::type

Enumeration Type Documentation

◆ Activation [1/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [2/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [3/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [4/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ Activation [5/5]

Enumerator
gelu_and_mul 
silu_and_mul 

◆ AddressSpaceEnum

enum ck::AddressSpaceEnum
strong
Enumerator
Generic 
Global 
Lds 
Sgpr 
Vgpr 

◆ AmdBufferCoherenceEnum [1/2]

Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ AmdBufferCoherenceEnum [2/2]

Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ BlockGemmPipelineScheduler

Enumerator
Intrawave 
Interwave 

◆ BlockGemmPipelineVersion

Enumerator
v1 
v2 
v3 
v4 
v5 

◆ ck_fp8_interpretation_t

Describes FP8 interpretation.

Enumerator
CK_E4M3_OCP 
CK_E5M2_OCP 
CK_E4M3_FNUZ 
CK_E5M2_FNUZ 

◆ ck_saturation_t

enum ck::ck_saturation_t
strong

Describes saturation behavior.

Enumerator
CK_NOSAT 
CK_SATFINITE 

◆ DppInstr

enum ck::DppInstr
strong
Enumerator
dpp8_f16_1x32x2 
dpp8_f16_2x16x2 
dpp8_f16_2x32x2 
dpp8_f16_4x16x2 
dpp8_f16_4x32x2 
dpp8_f16_8x16x2 
dpp8_f16_8x32x2 
dpp8_f16_16x16x2 
dpp8_f16_32x8x2 

◆ f8_rounding_mode

enum ck::f8_rounding_mode
strong
Enumerator
standard 
stochastic 

◆ IndicesType

enum ck::IndicesType
strong
Enumerator
INDICES_32BIT 
INDICES_64BIT 
INDICES_16BIT 
INDICES_8BIT 

◆ InMemoryDataOperationEnum

Enumerator
Set 
AtomicAdd 
AtomicMax 
Add 

◆ LoopScheduler

enum ck::LoopScheduler
strong
Enumerator
Default 
Interwave 

◆ MfmaInstr

enum ck::MfmaInstr
strong
Enumerator
mfma_f32_32x32x1xf32 
mfma_f32_16x16x1xf32 
mfma_f32_4x4x1xf32 
mfma_f32_32x32x2xf32 
mfma_f32_16x16x4xf32 
mfma_f32_32x32x4f16 
mfma_f32_16x16x4f16 
mfma_f32_4x4x4f16 
mfma_f32_32x32x8f16 
mfma_f32_16x16x16f16 
mfma_f32_32x32x8bf16_1k 
mfma_f32_16x16x16bf16_1k 
mfma_f32_32x32x4bf16 
mfma_f32_16x16x8bf16 
mfma_i32_32x32x8i8 
mfma_i32_16x16x16i8 
mfma_i32_32x32x16i8 
mfma_i32_16x16x32i8 
mfma_f64_16x16x4f64 
mfma_f32_32x32x16f8f8 
mfma_f32_16x16x32f8f8 
mfma_f32_32x32x16bf8bf8 
mfma_f32_16x16x32bf8bf8 
mfma_f32_32x32x16f8bf8 
mfma_f32_16x16x32f8bf8 
mfma_f32_32x32x16bf8f8 
mfma_f32_16x16x32bf8f8 
mfma_f32_32x32x16f16 
mfma_f32_16x16x32f16 
mfma_f32_32x32x16bf16 
mfma_f32_16x16x32bf16 
mfma_i32_32x32x32i8 
mfma_i32_16x16x64i8 
mfma_f32_32x32x64f8f6f4 
mfma_f32_16x16x128f8f6f4 
mfma_scale_f32_32x32x64f8f6f4 
mfma_scale_f32_16x16x128f8f6f4 
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_unsupport_16x16_gfx11 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 
wmma_unsupport_16x16_gfx12 

◆ NanPropagation

enum ck::NanPropagation
strong
Enumerator
NOT_PROPAGATE_NAN 
PROPAGATE_NAN 

◆ PipelineVersion

enum ck::PipelineVersion
strong
Enumerator
v1 
v2 
v4 
weight_only 

◆ ReduceTensorIndices

Enumerator
NO_INDICES 
FLATTENED_INDICES 

◆ ReduceTensorOp

enum ck::ReduceTensorOp
strong
Enumerator
ADD 
MUL 
MIN 
MAX 
AMAX 
AVG 
NORM1 
NORM2 

◆ SchedulerGroup

enum ck::SchedulerGroup : uint32_t
Enumerator
SCHED_GROUP_MFMA 
SCHED_GROUP_VMEM 
SCHED_GROUP_LDS_READ 
SCHED_GROUP_LDS_WRITE 

◆ SmfmacInstr

enum ck::SmfmacInstr
strong
Enumerator
smfmac_f32_16x16x32f16 
smfmac_f32_32x32x16f16 
smfmac_f32_16x16x32bf16 
smfmac_f32_32x32x16bf16 

◆ StreamKReductionStrategy

Enumerator
Atomic 
Reduction 

◆ TailNumber

enum ck::TailNumber
strong
Enumerator
Odd 
Even 
One 
Two 
Three 
Four 
Five 
Six 
Seven 
Empty 
Full 

◆ WmmaInstr

enum ck::WmmaInstr
strong
Enumerator
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_f16_16x16x16_f16 
wmma_bf16_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_i32_16x16x16_iu4 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 

Function Documentation

◆ accumulate_n()

template<typename T , typename ForwardIterator , typename Size , typename BinaryOperation >
auto ck::accumulate_n ( ForwardIterator  first,
Size  count,
T  init,
BinaryOperation  op 
) -> decltype(std::accumulate(first, std::next(first, count), init, op))
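
A minimal host-side usage sketch (the container and values are illustrative, and the relevant ck utility header is assumed to be included): as the trailing return type shows, the call forwards to std::accumulate over the first count elements.

#include <functional>
#include <vector>

std::vector<int> v{1, 2, 3, 4};
// Accumulate the first 3 elements starting from init = 0 with operator+: yields 6.
auto sum = ck::accumulate_n(v.begin(), 3, 0, std::plus<>{});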

◆ amd_assemble_cvt_f32_i4()

__device__ float ck::amd_assemble_cvt_f32_i4 ( int  b)
inline

◆ amd_assembly_and_b32()

__device__ int ck::amd_assembly_and_b32 ( int  a,
int  b 
)
inline

◆ amd_assembly_and_or_b32()

__device__ int ck::amd_assembly_and_or_b32 ( int  a,
int  b,
int  d 
)
inline

◆ amd_assembly_cvt_f8_to_f32()

__device__ f8x4_t ck::amd_assembly_cvt_f8_to_f32 ( float  b0,
float  b1,
float  b2,
float  b3 
)
inline

◆ amd_assembly_i4_to_fp8x8()

__device__ f8x8_t ck::amd_assembly_i4_to_fp8x8 ( int  a)
inline

◆ amd_assembly_outer_product_1x2() [1/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( float  a,
float  b0,
float  b1,
float &  c0,
float &  c1 
)

◆ amd_assembly_outer_product_1x2() [2/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( half2_t  a,
half2_t  b0,
half2_t  b1,
float &  c0,
float &  c1 
)

◆ amd_assembly_outer_product_1x2() [3/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( int8x4_t  a,
int8x4_t  b0,
int8x4_t  b1,
int32_t &  c0,
int32_t &  c1 
)

◆ amd_assembly_outer_product_1x4() [1/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( float  a,
float  b0,
float  b1,
float  b2,
float  b3,
float &  c0,
float &  c1,
float &  c2,
float &  c3 
)

◆ amd_assembly_outer_product_1x4() [2/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( half2_t  a,
half2_t  b0,
half2_t  b1,
half2_t  b2,
half2_t  b3,
float &  c0,
float &  c1,
float &  c2,
float &  c3 
)

◆ amd_assembly_outer_product_1x4() [3/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( int8x4_t  a,
int8x4_t  b0,
int8x4_t  b1,
int8x4_t  b2,
int8x4_t  b3,
int32_t &  c0,
int32_t &  c1,
int32_t &  c2,
int32_t &  c3 
)

◆ amd_assembly_pk_add_f16()

__device__ half2_t ck::amd_assembly_pk_add_f16 ( half2_t  a,
half2_t  b 
)
inline

◆ amd_assembly_pk_fma_f16()

__device__ half2_t ck::amd_assembly_pk_fma_f16 ( half2_t  a,
half2_t  b,
half2_t  c 
)
inline

◆ amd_buffer_atomic_add()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_add_impl()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_atomic_max()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_atomic_max_impl()

template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_load_impl() [1/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<T, N>::type ck::amd_buffer_load_impl ( __amdgpu_buffer_rsrc_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl() [2/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<T, N>::type ck::amd_buffer_load_impl ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<int8_t, N>::type ck::amd_buffer_load_impl_raw ( __amdgpu_buffer_rsrc_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type<int8_t, N>::type ck::amd_buffer_load_impl_raw ( int32x4_t  src_wave_buffer_resource,
index_t  src_thread_addr_offset,
index_t  src_wave_addr_offset 
)

◆ amd_buffer_load_invalid_element_return_customized_value()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_customized_value ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size,
T  customized_value 
)

◆ amd_buffer_load_invalid_element_return_zero()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_zero ( const T *  p_src_wave,
index_t  src_thread_element_offset,
bool  src_thread_element_valid,
index_t  src_element_space_size 
)

◆ amd_buffer_store()

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store ( const typename vector_type_maker< T, N >::type::type  src_thread_data,
T *  p_dst_wave,
const index_t  dst_thread_element_offset,
const bool  dst_thread_element_valid,
const index_t  dst_element_space_size 
)

◆ amd_buffer_store_impl() [1/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type  src_thread_data,
__amdgpu_buffer_rsrc_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl() [2/2]

template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type  src_thread_data,
__amdgpu_buffer_rsrc_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_buffer_store_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type  src_thread_data,
int32x4_t  dst_wave_buffer_resource,
index_t  dst_thread_addr_offset,
index_t  dst_wave_addr_offset 
)

◆ amd_direct_load_global_to_lds()

template<typename T , index_t NumElemsPerThread>
__device__ void ck::amd_direct_load_global_to_lds ( const T *  global_base_ptr,
const index_t  global_offset,
T *  lds_base_ptr,
const index_t  lds_offset,
const bool  is_valid,
const index_t  src_element_space_size 
)

◆ amd_global_atomic_add_impl()

template<typename T , index_t N>
__device__ void ck::amd_global_atomic_add_impl ( const typename vector_type< T, N >::type  src_thread_data,
T *  addr 
)

◆ amd_wave_read_first_lane() [1/4]

template<typename Object , typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto ck::amd_wave_read_first_lane ( const Object &  obj)

NOTE: Implicitly starts the object's lifetime. std::start_lifetime_at() would be preferable in this scenario.
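
A device-side sketch of the scalar overloads (the kernel context and helper name are assumed for illustration): broadcasting through the first lane makes a per-lane value wave-uniform, which allows the compiler to keep it in scalar registers.

// Every lane receives lane 0's copy of the value, so the result is
// uniform across the wavefront (this matches the int32_t overload below).
__device__ ck::index_t make_wave_uniform(ck::index_t per_lane_value)
{
    return ck::amd_wave_read_first_lane(per_lane_value);
}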

◆ amd_wave_read_first_lane() [2/4]

__device__ int32_t ck::amd_wave_read_first_lane ( int32_t  value)
inline

◆ amd_wave_read_first_lane() [3/4]

__device__ int64_t ck::amd_wave_read_first_lane ( int64_t  value)
inline

◆ amd_wave_read_first_lane() [4/4]

__device__ uint32_t ck::amd_wave_read_first_lane ( uint32_t  value)
inline

◆ array_convert() [1/2]

template<typename Y , typename X , index_t NumElems>
__host__ __device__ void ck::array_convert ( Array< Y, NumElems > &  y,
const Array< X, NumElems > &  x 
)
inline

◆ array_convert() [2/2]

template<typename Y , typename X , size_t NumElems>
__host__ __device__ void ck::array_convert ( std::array< Y, NumElems > &  y,
const std::array< X, NumElems > &  x 
)
inline

◆ atomic_add()

template<typename X >
__device__ X ck::atomic_add ( X *  p_dst,
const X &  x 
)

◆ atomic_add< _Float16 >()

template<>
__device__ _Float16 ck::atomic_add< _Float16 > ( _Float16 *  p_dst,
const _Float16 &  x 
)

◆ atomic_add< double >()

template<>
__device__ double ck::atomic_add< double > ( double *  p_dst,
const double &  x 
)

◆ atomic_add< double2_t >()

template<>
__device__ double2_t ck::atomic_add< double2_t > ( double2_t *  p_dst,
const double2_t &  x 
)

◆ atomic_add< float >()

template<>
__device__ float ck::atomic_add< float > ( float *  p_dst,
const float &  x 
)

◆ atomic_add< float2_t >()

template<>
__device__ float2_t ck::atomic_add< float2_t > ( float2_t *  p_dst,
const float2_t &  x 
)

◆ atomic_add< int32_t >()

template<>
__device__ int32_t ck::atomic_add< int32_t > ( int32_t *  p_dst,
const int32_t &  x 
)

◆ atomic_add< uint32_t >()

template<>
__device__ uint32_t ck::atomic_add< uint32_t > ( uint32_t *  p_dst,
const uint32_t &  x 
)

◆ atomic_add< unsigned short >()

template<>
__device__ unsigned short ck::atomic_add< unsigned short > ( unsigned short *  p_dst,
const unsigned short &  x 
)

◆ atomic_max()

template<typename X >
__device__ X ck::atomic_max ( X *  p_dst,
const X &  x 
)

◆ atomic_max< double >()

template<>
__device__ double ck::atomic_max< double > ( double *  p_dst,
const double &  x 
)

◆ atomic_max< float >()

template<>
__device__ float ck::atomic_max< float > ( float *  p_dst,
const float &  x 
)

◆ atomic_max< float2_t >()

template<>
__device__ float2_t ck::atomic_max< float2_t > ( float2_t *  p_dst,
const float2_t &  x 
)

◆ atomic_max< int32_t >()

template<>
__device__ int32_t ck::atomic_max< int32_t > ( int32_t *  p_dst,
const int32_t &  x 
)

◆ atomic_max< uint32_t >()

template<>
__device__ uint32_t ck::atomic_max< uint32_t > ( uint32_t *  p_dst,
const uint32_t &  x 
)

◆ bf16_convert_rtn()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::bf16_convert_rtn ( X  x)
constexpr

◆ bf16_convert_rtn< bhalf_t, float >()

template<>
__host__ constexpr __device__ bhalf_t ck::bf16_convert_rtn< bhalf_t, float > ( float  x)
inline constexpr

◆ bf16_convert_rtn< bhalf_t, half_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::bf16_convert_rtn< bhalf_t, half_t > ( half_t  x)
inline constexpr

◆ bf6_convert_rne() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit BF6 type using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
xThe float value to be converted.
scaleThe scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.
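
A short host-side sketch of the scale semantics (values are illustrative; the round trip uses the type_convert<float, bf6_t> specialization listed on this page): the bf6_t payload encodes x / scale, so dequantization needs a matching multiply.

float x     = 1.75f;
float scale = 2.0f;                                // payload encodes x / scale
ck::bf6_t q = ck::bf6_convert_rne(x, scale);
float back  = ck::type_convert<float>(q) * scale;  // approximate round trip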

◆ bf6_convert_rne() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_rne ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
xThe float vector to be converted.
scaleThe scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.

◆ bf6_convert_sr() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit BF6 type using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
xThe float value to be converted.
scaleThe scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.

◆ bf6_convert_sr() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_sr ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
xThe float vector to be converted.
scaleThe scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.
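
For contrast with the round-to-nearest-even variants above, a small sketch (illustrative values): stochastic rounding picks the upper or lower representable value with probability proportional to proximity, so the quantization error is zero-mean over many conversions, at the cost of run-to-run nondeterminism.

float x = 0.3f;
ck::bf6_t q_rne = ck::bf6_convert_rne(x);  // deterministic result
ck::bf6_t q_sr  = ck::bf6_convert_sr(x);   // may differ from call to call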

◆ bit_cast()

template<typename Y , typename X , typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ constexpr __device__ Y ck::bit_cast ( const X &  x)
constexpr
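
A minimal sketch (the constant shown assumes IEEE-754 binary32): bit_cast reinterprets the object representation without any value conversion, and the sizeof(X)==sizeof(Y) constraint visible in the signature rejects mismatched sizes at compile time.

#include <cstdint>

float    f    = 1.0f;
uint32_t bits = ck::bit_cast<uint32_t>(f);  // 0x3F800000 on IEEE-754 platforms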

◆ block_sync_lds()

__device__ void ck::block_sync_lds ( )

◆ block_sync_lds_direct_load()

__device__ void ck::block_sync_lds_direct_load ( )

◆ BlockGemmABScalePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmABScalePipeline_Selector ( )
constexpr

◆ BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBlockScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmBlockScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXNBSPipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXNBSPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeTypeA , typename ComputeTypeB , typename AccDataType , typename AWmmaTileDesc , typename BWmmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()

template<index_t BlockSize, typename FloatA , typename FloatB , typename FloatAcc , typename AK0MK1BlockDesc , typename BK0NK1BlockDesc , index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
constexpr auto ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ( )
constexpr

◆ c_style_pointer_cast()

template<typename PY , typename PX , typename enable_if< is_pointer_v< PY > &&is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY ck::c_style_pointer_cast ( PX  p_x)

◆ cast_pointer_to_constant_address_space()

template<typename T >
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* ck::cast_pointer_to_constant_address_space ( T *  p)

◆ cast_pointer_to_generic_address_space()

template<typename T >
__device__ T* ck::cast_pointer_to_generic_address_space ( T CK_CONSTANT_ADDRESS_SPACE *  p)

◆ chain_tensor_adaptors()

template<typename TensorAdaptor0 , typename TensorAdaptor1 >
__host__ constexpr __device__ auto ck::chain_tensor_adaptors ( const TensorAdaptor0 &  adaptor0,
const TensorAdaptor1 &  adaptor1 
)
constexpr

◆ clz()

__device__ int ck::clz ( uint32_t  x)
inline

◆ concat_tuple() [1/3]

template<typename... X>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx)
constexpr

◆ concat_tuple() [2/3]

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx,
const Tuple< Y... > &  ty 
)
constexpr

◆ concat_tuple() [3/3]

template<typename... X, typename... Tuples>
__host__ constexpr __device__ auto ck::concat_tuple ( const Tuple< X... > &  tx,
const Tuples &...  tuples 
)
constexpr
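
Example (a minimal sketch; make_tuple is assumed to be the ck tuple factory used throughout the library):

auto t0 = ck::make_tuple(1, 2);
auto t1 = ck::make_tuple(3.0f);
auto t2 = ck::concat_tuple(t0, t1); // Tuple<int, int, float> holding (1, 2, 3.0f)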

◆ concat_tuple_of_reference()

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::concat_tuple_of_reference ( const Tuple< X &... > &  tx,
const Tuple< Y &... > &  ty 
)
constexpr

◆ conditional_expr()

template<bool predicate, typename X , typename Y >
constexpr auto ck::conditional_expr ( X &&  x,
Y &&  y 
)
constexpr
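
Example (a sketch, assuming the usual compile-time select semantics in which the predicate picks the first or second argument):

auto a = ck::conditional_expr<true>(1, 2.0f);  // yields the first argument, 1
auto b = ck::conditional_expr<false>(1, 2.0f); // yields the second argument, 2.0f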

◆ container_concat() [1/4]

template<typename T , index_t NX, index_t NY>
__host__ constexpr __device__ auto ck::container_concat ( const Array< T, NX > &  ax,
const Array< T, NY > &  ay 
)
constexpr

◆ container_concat() [2/4]

template<typename Container >
__host__ constexpr __device__ auto ck::container_concat ( const Container &  x)
constexpr

◆ container_concat() [3/4]

template<typename... X, typename... Y>
__host__ constexpr __device__ auto ck::container_concat ( const Tuple< X... > &  tx,
const Tuple< Y... > &  ty 
)
constexpr

◆ container_concat() [4/4]

template<typename X , typename... Ys>
__host__ constexpr __device__ auto ck::container_concat ( const X &  x,
const Ys &...  ys 
)
constexpr

◆ container_push_back() [1/2]

template<typename TData , index_t NSize>
__host__ constexpr __device__ auto ck::container_push_back ( const Array< TData, NSize > &  a,
const TData &  x 
)
constexpr

◆ container_push_back() [2/2]

template<typename... Ts, typename T >
__host__ constexpr __device__ auto ck::container_push_back ( const Tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_push_front()

template<typename... Ts, typename T >
__host__ constexpr __device__ auto ck::container_push_front ( const Tuple< Ts... > &  a,
const T &  x 
)
constexpr

◆ container_reduce()

template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ constexpr __device__ auto ck::container_reduce ( const Container &  x,
Reduce  reduce,
Init  init,
Number< IBegin >  = Number<0>{},
Number< IEnd >  = Number<Container::Size()>{},
Number< IStep >  = Number<1>{} 
)
constexpr
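
Example (a sketch; the trailing Number<...> arguments default so that the whole container is reduced):

auto sum = ck::container_reduce(ck::make_tuple(1, 2, 3),
                                [](auto a, auto b) { return a + b; },
                                0); // 1 + 2 + 3 + 0 = 6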

◆ container_reorder_given_new2old() [1/3]

template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( const Array< TData, NSize > &  old_array,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [2/3]

template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( const Tuple< Ts... > &  old_tuple,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_new2old() [3/3]

template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_new2old ( Sequence< Is... >  ,
Sequence< IRs... >   
)
constexpr

◆ container_reorder_given_old2new() [1/3]

template<typename TData , index_t NSize, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( const Array< TData, NSize > &  old_array,
Sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [2/3]

template<typename... Ts, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( const Tuple< Ts... > &  old_tuple,
Sequence< IRs... >  old2new 
)
constexpr

◆ container_reorder_given_old2new() [3/3]

template<index_t... Is, index_t... IRs>
__host__ constexpr __device__ auto ck::container_reorder_given_old2new ( Sequence< Is... >  old_seq,
Sequence< IRs... >   
)
constexpr
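
Example (a sketch of the two index conventions; new2old maps each new position to the old position it reads from, and old2new is its inverse):

auto t = ck::make_tuple(10, 20, 30);
auto a = ck::container_reorder_given_new2old(t, ck::Sequence<2, 0, 1>{}); // (30, 10, 20)
auto b = ck::container_reorder_given_old2new(t, ck::Sequence<2, 0, 1>{}); // (20, 30, 10)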

◆ container_reverse_exclusive_scan() [1/3]

template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Array< TData, NSize > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ container_reverse_exclusive_scan() [2/3]

template<index_t... Is, typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Sequence< Is... > &  seq,
Reduce  f,
Number< Init >   
)
constexpr

◆ container_reverse_exclusive_scan() [3/3]

template<typename... Xs, typename Reduce , typename Init >
__host__ constexpr __device__ auto ck::container_reverse_exclusive_scan ( const Tuple< Xs... > &  x,
Reduce  reduce,
Init  init 
)
constexpr

◆ container_reverse_inclusive_scan() [1/2]

template<typename TData , index_t NSize, typename Reduce >
__host__ constexpr __device__ auto ck::container_reverse_inclusive_scan ( const Array< TData, NSize > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ container_reverse_inclusive_scan() [2/2]

template<typename... Xs, typename Reduce , typename TData >
__host__ constexpr __device__ auto ck::container_reverse_inclusive_scan ( const Tuple< Xs... > &  x,
Reduce  f,
TData  init 
)
constexpr

◆ coordinate_has_valid_offset()

template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool ck::coordinate_has_valid_offset ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ coordinate_has_valid_offset_assuming_visible_index_is_valid()

template<typename TensorDesc , typename TensorCoord >
__host__ constexpr __device__ bool ck::coordinate_has_valid_offset_assuming_visible_index_is_valid ( const TensorDesc &  tensor_desc,
const TensorCoord &  coord 
)
constexpr

◆ DefaultValidCTileIndex()

template<typename CTileIdx , typename CTileDim >
__host__ __device__ bool ck::DefaultValidCTileIndex ( const CTileIdx &  c_tile_idx,
const CTileDim &  c_tile_dim 
)

◆ EnvGetString()

template<class EnvVar >
const std::string& ck::EnvGetString ( EnvVar  )
inline

◆ EnvIsDisabled()

template<class EnvVar >
bool ck::EnvIsDisabled ( EnvVar  )
inline

◆ EnvIsEnabled()

template<class EnvVar >
bool ck::EnvIsEnabled ( EnvVar  )
inline

◆ EnvIsUnset()

template<class EnvVar >
bool ck::EnvIsUnset ( EnvVar  )
inline

◆ EnvUnset()

template<class EnvVar >
void ck::EnvUnset ( EnvVar  )

◆ EnvValue()

template<class EnvVar >
uint64_t ck::EnvValue ( EnvVar  )
inline
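
Example (a sketch; CK_HYPOTHETICAL_FLAG stands in for an environment-variable tag type declared with the library's env-var macro and is not a real CK variable):

if(ck::EnvIsEnabled(CK_HYPOTHETICAL_FLAG{}))
{
    // opt-in code path, taken when the variable is set to an enabling value
}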

◆ f4_convert_rne() [1/2]

__host__ __device__ f4_t ck::f4_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_rne() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_rne ( float2_t  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_sr() [1/2]

__host__ __device__ f4_t ck::f4_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

◆ f4_convert_sr() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_sr ( float2_t  x,
float  scale = 1.0f 
)
inline

◆ f6_convert_rne() [1/2]

__host__ __device__ f6_t ck::f6_convert_rne ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to the 6-bit floating-point format (f6_t).

Parameters
x - The input float value.
scale - A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_rne() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_rne ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then converts each element with round-to-nearest-even, packing it into 6 bits of precision.

Parameters
x - A vector of 32 floats stored in float32_t.
scale - A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.
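
Example (a sketch of the scalar and packed-vector RNE paths; values are illustrative):

ck::f6_t one = ck::f6_convert_rne(1.0f); // scale defaults to 1.0f
ck::float32_t v;
for(int i = 0; i < 32; ++i)
    v[i] = 0.5f;
ck::f6x32_t packed = ck::f6_convert_rne(v, 2.0f); // each element converted from v[i] / 2.0f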

◆ f6_convert_sr() [1/2]

__host__ __device__ f6_t ck::f6_convert_sr ( float  x,
float  scale = 1.0f 
)
inline

Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.

Divides the input by the specified scale, then performs saturation and conversion to f6_t based on a pseudo-randomly generated seed.

Parameters
x - The input float value.
scale - A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_sr() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_sr ( float32_t  x,
float  scale = 1.0f 
)
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then performs conversion with stochastic rounding to pack each element into 6 bits of precision.

Parameters
x - A vector of 32 floats stored in float32_t.
scale - A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.
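
Example (a sketch; stochastic rounding keeps the quantized value unbiased in expectation, which is why it is often preferred when accumulated rounding error must not drift):

ck::f6_t q = ck::f6_convert_sr(0.3f); // rounds up or down with probability tied to proximity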

◆ f8_convert_rne()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::f8_convert_rne ( X  x)
constexpr

◆ f8_convert_rne< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_rne< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_rne< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (bf8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_rne< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_rne< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (f8_ocp_t) using round-to-nearest-even.

Parameters
x - The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.
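
Example (a sketch of the RNE fp8 conversions; per the OCP FP8 convention, f8_ocp_t is the e4m3 format and bf8_ocp_t the e5m2 format):

ck::f8_ocp_t   e4m3 = ck::f8_convert_rne<ck::f8_ocp_t>(1.5f);
ck::bf8_ocp_t  e5m2 = ck::f8_convert_rne<ck::bf8_ocp_t>(1.5f);
ck::f8x2_ocp_t pair = ck::f8_convert_rne<ck::f8x2_ocp_t>(ck::float2_t{1.0f, -2.0f});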

◆ f8_convert_sr()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::f8_convert_sr ( X  x)
constexpr

◆ f8_convert_sr< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_sr< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_sr< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (bf8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, float > ( float  x)
inline

◆ f8_convert_sr< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ f8_convert_sr< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x - The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, float > ( float  x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x - The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x - The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, bhalf2_t > ( bhalf2_t  x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t  x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, half2_t > ( half2_t  x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float values (f8_ocp_t) using stochastic rounding.

Parameters
x - The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.
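
Example (a sketch; unlike the RNE overloads above, repeated SR conversions of the same value may yield different codes):

ck::half_t h = ck::type_convert<ck::half_t>(0.1f);
ck::bf8x2_ocp_t q = ck::f8_convert_sr<ck::bf8x2_ocp_t>(ck::half2_t{h, h});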

◆ fnv1a_hash()

constexpr unsigned int ck::fnv1a_hash ( std::string_view  str,
unsigned int  h = 2166136261u 
)
constexpr
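
Example (a sketch; the default h is the 32-bit FNV offset basis, and constexpr evaluation allows string dispatch in switch statements):

switch(ck::fnv1a_hash(ck::get_device_name()))
{
case ck::fnv1a_hash("gfx90a"): /* MI200-specific path */ break;
case ck::fnv1a_hash("gfx942"): /* MI300-specific path */ break;
default: break;
}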

◆ fp8_is_inf()

template<>
__host__ constexpr __device__ bool ck::fp8_is_inf ( bf8_ocp_t  a)
inline constexpr

◆ fp8_is_nan() [1/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( bf8_fnuz_t  a)
inline constexpr

◆ fp8_is_nan() [2/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( bf8_ocp_t  a)
inline constexpr

◆ fp8_is_nan() [3/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( f8_fnuz_t  a)
inline constexpr

◆ fp8_is_nan() [4/4]

template<>
__host__ constexpr __device__ bool ck::fp8_is_nan ( f8_ocp_t  a)
inline constexpr

◆ generate_sequence()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_sequence ( F  ,
Number< N >   
)
constexpr

◆ generate_sequence_v2()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_sequence_v2 ( F &&  f,
Number< N >   
)
constexpr

◆ generate_tie()

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tie ( F &&  f,
Number< N >   
)
constexpr

◆ generate_tuple() [1/2]

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tuple ( F &&  f,
LongNumber< N >   
)
constexpr

◆ generate_tuple() [2/2]

template<typename F , index_t N>
__host__ constexpr __device__ auto ck::generate_tuple ( F &&  f,
Number< N >   
)
constexpr

◆ generate_tuple_for()

template<typename F , index_t... ids>
__host__ constexpr __device__ auto ck::generate_tuple_for ( F &&  f,
Sequence< ids... >   
)
constexpr

◆ get_available_cpu_cores()

unsigned int ck::get_available_cpu_cores ( )
inline

◆ get_block_1d_id()

__device__ index_t ck::get_block_1d_id ( )

◆ get_block_size()

__device__ index_t ck::get_block_size ( )

◆ get_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ auto ck::get_container_subset ( const Array< T, N > &  arr,
Sequence< Is... >   
)
constexpr

◆ get_container_subset() [2/2]

template<typename... Ts, index_t... Is>
__host__ constexpr __device__ auto ck::get_container_subset ( const Tuple< Ts... > &  tup,
Sequence< Is... >   
)
constexpr

◆ get_device_name()

std::string ck::get_device_name ( )
inline

◆ get_grid_size()

__device__ index_t ck::get_grid_size ( )

◆ get_shift< 1 >()

template<>
constexpr __device__ index_t ck::get_shift< 1 > ( )
constexpr

◆ get_thread_global_1d_id()

__device__ index_t ck::get_thread_global_1d_id ( )

◆ get_thread_local_1d_id()

__device__ index_t ck::get_thread_local_1d_id ( )

◆ get_warp_local_1d_id()

__device__ index_t ck::get_warp_local_1d_id ( )

◆ get_warp_size()

__host__ constexpr __device__ index_t ck::get_warp_size ( )
constexpr

◆ GridwiseGemmPipeline_Selector()

template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
constexpr auto ck::GridwiseGemmPipeline_Selector ( )
constexpr

◆ GridwiseGemmPipeline_v1_Selector()

template<index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto ck::GridwiseGemmPipeline_v1_Selector ( )
constexpr

◆ i4_to_bhalf4()

__device__ bhalf4_t ck::i4_to_bhalf4 ( int  q)
inline

◆ i4_to_f8x4()

__device__ f8x4_t ck::i4_to_f8x4 ( int  q)
inline

◆ i4_to_fp8x8()

__device__ f8x8_t ck::i4_to_fp8x8 ( int  q)
inline

◆ i4_to_half4()

__device__ half4_t ck::i4_to_half4 ( int  q)
inline

◆ i4_to_half4_scale()

__device__ half4_t ck::i4_to_half4_scale ( int  q,
const ck::half2_t &  scale 
)
inline

◆ inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::inclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ inner_product()

template<typename TA , typename TB , typename TC >
__device__ void ck::inner_product ( const TA &  a,
const TB &  b,
TC &  c 
)
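
Example (a device-side sketch; each specialization accumulates the dot product of a and b into c, mapping to packed-math or dot instructions where available):

float acc = 0.0f;
ck::half_t h = ck::type_convert<ck::half_t>(0.5f);
ck::half2_t a2{h, h}, b2{h, h};
ck::inner_product<ck::half2_t, ck::half2_t, float>(a2, b2, acc); // acc += 0.25f + 0.25f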

◆ inner_product< bhalf_t, bhalf_t, float >()

template<>
__device__ void ck::inner_product< bhalf_t, bhalf_t, float > ( const bhalf_t &  a,
const bhalf_t &  b,
float &  c 
)

◆ inner_product< float, float, float >()

template<>
__device__ void ck::inner_product< float, float, float > ( const float &  a,
const float &  b,
float &  c 
)

◆ inner_product< float2_t, float2_t, float >()

template<>
__device__ void ck::inner_product< float2_t, float2_t, float > ( const float2_t &  a,
const float2_t &  b,
float &  c 
)

◆ inner_product< float4_t, float4_t, float >()

template<>
__device__ void ck::inner_product< float4_t, float4_t, float > ( const float4_t &  a,
const float4_t &  b,
float &  c 
)

◆ inner_product< half2_t, half2_t, float >()

template<>
__device__ void ck::inner_product< half2_t, half2_t, float > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inner_product< half4_t, half4_t, float >()

template<>
__device__ void ck::inner_product< half4_t, half4_t, float > ( const half4_t &  a,
const half4_t &  b,
float &  c 
)

◆ inner_product< half8_t, half8_t, float >()

template<>
__device__ void ck::inner_product< half8_t, half8_t, float > ( const half8_t &  a,
const half8_t &  b,
float &  c 
)

◆ inner_product< half_t, half_t, float >()

template<>
__device__ void ck::inner_product< half_t, half_t, float > ( const half_t &  a,
const half_t &  b,
float &  c 
)

◆ inner_product< int8_t, int8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8_t, int8_t, int32_t > ( const int8_t &  a,
const int8_t &  b,
int32_t &  c 
)

◆ inner_product< int8x16_t, int8x16_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x16_t, int8x16_t, int32_t > ( const int8x16_t &  a,
const int8x16_t &  b,
int32_t &  c 
)

◆ inner_product< int8x2_t, int8x2_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x2_t, int8x2_t, int32_t > ( const int8x2_t &  a,
const int8x2_t &  b,
int32_t &  c 
)

◆ inner_product< int8x4_t, int8x4_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x4_t, int8x4_t, int32_t > ( const int8x4_t &  a,
const int8x4_t &  b,
int32_t &  c 
)

◆ inner_product< int8x8_t, int8x8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x8_t, int8x8_t, int32_t > ( const int8x8_t &  a,
const int8x8_t &  b,
int32_t &  c 
)

◆ is_bf16_atomic_supported()

bool ck::is_bf16_atomic_supported ( )
inline

◆ is_gfx101_supported()

bool ck::is_gfx101_supported ( )
inline

◆ is_gfx103_supported()

bool ck::is_gfx103_supported ( )
inline

◆ is_gfx11_supported()

bool ck::is_gfx11_supported ( )
inline

◆ is_gfx12_supported()

bool ck::is_gfx12_supported ( )
inline

◆ is_lds_direct_load_supported()

bool ck::is_lds_direct_load_supported ( )
inline

◆ is_native_type()

template<typename T >
constexpr bool ck::is_native_type ( )
inline constexpr

◆ is_xdl_supported()

bool ck::is_xdl_supported ( )
inline

◆ IsNestedTuple()

template<typename... Ts>
__host__ constexpr __device__ auto ck::IsNestedTuple ( const Tuple< Ts... > &  )
constexpr

◆ kernel_batched_elementwise()

template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation , index_t NumInputs, index_t NumOutputs>
__global__ void ck::kernel_batched_elementwise ( const InGridDescTuple  in_grid_desc_tuple,
const OutGridDescTuple  out_grid_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const Block2TileMap  block_2_tile_map,
const ElementwiseOperation  elementwise_op,
const index_t  batch_count,
const std::array< index_t, NumInputs >  input_batch_strides,
const std::array< index_t, NumOutputs >  output_batch_strides 
)

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3 ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d ( BatchedGemmArg  karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds ( BatchedGemmArg  karg)

◆ kernel_batchnorm_backward_with_blockwise_welford()

template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_ , typename XDataType , typename DyDataType , typename DxDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_batchnorm_backward_with_blockwise_welford ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const XYGridDesc_M_K  dx_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  dscale_dbias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
long_index_t  reduce_size,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
const ScaleDataType *const __restrict__  p_scale,
bool  haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__  p_savedMean,
const MeanVarDataType *const __restrict__  p_savedInvVar,
const DyElementwiseOp  dy_elementwise_op,
DxDataType *const __restrict__  p_dx,
DscaleDbiasDataType *const __restrict__  p_dscale,
DscaleDbiasDataType *const __restrict__  p_dbias 
)

◆ kernel_batchnorm_forward_with_blockwise_welford()

template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_batchnorm_forward_with_blockwise_welford ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_buffer_set_value()

template<index_t BlockSize, typename DataType , typename Grid1dBufferDescType >
__global__ void ck::kernel_buffer_set_value ( const Grid1dBufferDescType  grid_1d_buffer_desc,
DataType *const __restrict__  p_global,
DataType  value 
)

◆ kernel_contraction_multiple_abd_xdl_cshuffle()

template<typename GridwiseGemm , typename AsPointer , typename BsPointer , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AsGridDesc_AK0_M_AK1 , typename BsGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_abd_xdl_cshuffle ( AsPointer  p_as_grid,
BsPointer  p_bs_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AsGridDesc_AK0_M_AK1  as_grid_desc_ak0_m_ak1,
const BsGridDesc_BK0_N_BK1  bs_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename ComputePtrOffsetOfBatch , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const index_t  batch_count,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2CTileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [1/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [2/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const index_t  batch_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_contraction_multiple_d_xdl_cshuffle() [3/3]

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AKB_AK0_M_AK1 , typename BGridDesc_BKB_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
const index_t  batch_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AKB_AK0_M_AK1  a_grid_desc_akb_ak0_m_ak1,
const BGridDesc_BKB_BK0_N_BK1  b_grid_desc_bkb_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_elementwise()

template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation >
__global__ void ck::kernel_elementwise ( const InGridDescTuple  in_grid_desc_tuple,
const OutGridDescTuple  out_grid_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const Block2TileMap  block_2_tile_map,
const ElementwiseOperation  elementwise_op 
)

◆ kernel_elementwise_1d()

template<typename GridwiseElementwise1dFunctor , typename InGrid1dDescTuple , typename OutGrid1dDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename ElementwiseOperation , typename UnaryOperation , typename Scale >
__global__ void ck::kernel_elementwise_1d ( const InGrid1dDescTuple  in_grid_1d_desc_tuple,
const OutGrid1dDescTuple  out_grid_1d_desc_tuple,
const InDataTypePointerTuple  p_in_global_tuple,
const OutDataTypePointerTuple  p_out_global_tuple,
const ElementwiseOperation  elementwise_op,
const UnaryOperation  unary_op,
const Scale  scale_op 
)

◆ kernel_elementwise_batched_dual()

template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation , index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void ck::kernel_elementwise_batched_dual ( const InAGridDescTuple  in_grid_desc_tuple_a,
const InBGridDescTuple  in_grid_desc_tuple_b,
const OutAGridDescTuple  out_grid_desc_tuple_a,
const OutBGridDescTuple  out_grid_desc_tuple_b,
const InADataTypePointerTuple  p_in_global_tuple_a,
const InBDataTypePointerTuple  p_in_global_tuple_b,
const OutADataTypePointerTuple  p_out_global_tuple_a,
const OutBDataTypePointerTuple  p_out_global_tuple_b,
const Block2TileMapA  block_2_tile_map_a,
const Block2TileMapB  block_2_tile_map_b,
const ElementwiseOperation  elementwise_op,
const index_t  a_grid_size,
const index_t  batch_count_a,
const index_t  batch_count_b,
const std::array< index_t, NumInputsA >  input_batch_strides_a,
const std::array< index_t, NumInputsB >  input_batch_strides_b,
const std::array< index_t, NumOutputsA >  output_batch_strides_a,
const std::array< index_t, NumOutputsB >  output_batch_strides_b 
)

◆ kernel_elementwise_dual()

template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation >
__global__ void ck::kernel_elementwise_dual ( const InAGridDescTuple  in_grid_desc_tuple_a,
const InBGridDescTuple  in_grid_desc_tuple_b,
const OutAGridDescTuple  out_grid_desc_tuple_a,
const OutBGridDescTuple  out_grid_desc_tuple_b,
const InADataTypePointerTuple  p_in_global_tuple_a,
const InBDataTypePointerTuple  p_in_global_tuple_b,
const OutADataTypePointerTuple  p_out_global_tuple_a,
const OutBDataTypePointerTuple  p_out_global_tuple_b,
const Block2TileMapA  block_2_tile_map_a,
const Block2TileMapB  block_2_tile_map_b,
const ElementwiseOperation  elementwise_op,
const index_t  a_grid_size 
)

◆ kernel_elementwise_layernorm()

template<typename GridwiseElementwiseReduction , typename InDataTypePointerTuple , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename AccDataType , typename XElementwiseOperation , typename YElementwiseOperation , typename InGrid2dDescTuple , typename GridDesc_M_K >
__global__ void ck::kernel_elementwise_layernorm ( const InGrid2dDescTuple  in_grid_2d_desc_tuple,
const GridDesc_M_K  x_grid_desc_m_k,
const GridDesc_M_K  gamma_grid_desc_m_k,
const GridDesc_M_K  beta_grid_desc_m_k,
const GridDesc_M_K  y_grid_desc_m_k,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const InDataTypePointerTuple  p_in_global_tuple,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
const XElementwiseOperation  x_elementwise_op,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_fpAintB_gemm_wmma()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename ScaleDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename ScaleGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_fpAintB_gemm_wmma ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
const ScaleDataType *__restrict__  p_scale_grid,
CDataType *__restrict__  p_c_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const ScaleGridDesc  scale_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_bias_add_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename FloatC1 , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename C1ElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC0 *__restrict__  p_bias_grid,
const FloatC1 *__restrict__  p_d0_grid,
ReducePtrsGlobal  p_reduces_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const C1ElementwiseOperation  c1_element_op,
const ReduceInElementwiseOperations  reduce_in_element_ops,
const ReduceAccElementwiseOperations  reduce_out_element_ops,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c0_grid_desc_mblock_mperblock_nblock_nperblock,
const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c1_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock  reduce_grid_desc_mblock_mperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dl_multiple_d()

template<typename GridwiseGemm , typename ABDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename DsGridDesc_M0_M10_M11_N0_N10_N11 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_multiple_d ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_K0_M0_M1_K1  a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1  b_grid_desc_k0_n0_n1_k1,
const DsGridDesc_M0_M10_M11_N0_N10_N11  ds_grid_desc_m0_m10_m11_n0_n10_n11,
const CGridDesc_M0_M10_M11_N0_N10_N11  e_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dl_v1r3()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_v1r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M0_M1_K1  a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1  b_grid_desc_k0_n0_n1_k1,
const CGridDesc_M0_M10_M11_N0_N10_N11  c_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_dpp()

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_dpp ( const typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_layernorm_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename AElementwiseOperation , typename BElementwiseOperation , typename AccElementwiseOperation , typename CElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_NBlock_NPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_layernorm_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC0 *__restrict__  p_c0_bias_grid,
const FloatC0 *__restrict__  p_c0_add_grid,
const FloatC0 *__restrict__  p_c0_gamma_grid,
const FloatC0 *__restrict__  p_c0_beta_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const AccElementwiseOperation  acc_element_op,
const CElementwiseOperation  c_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_NBlock_NPerBlock  c0_grid_desc_nblock_nperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_multiple_d_multiple_r_xdl_cshuffle()

template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename FloatRsPointer , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename QsElementwiseOperation , typename RsElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename RsGridDescriptor_MBlock_MPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_multiple_r_xdl_cshuffle ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatDsPointer  p_ds_grid,
FloatE *__restrict__  p_e_grid,
FloatRsPointer  p_rs_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const QsElementwiseOperation  qs_element_op,
const RsElementwiseOperation  rs_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const RsGridDescriptor_MBlock_MPerBlock  rs_grid_desc_mblock_mperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle()

template<typename GridwiseGemmWelford , typename ABDataType , typename DsPointer , typename EMeanVarDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock , typename CountGridDescriptor_MBlock_MPerBlock_NBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EMeanVarDataType *__restrict__  p_e_grid,
EMeanVarDataType *__restrict__  p_welford_mean_grid,
EMeanVarDataType *__restrict__  p_welford_var_grid,
int32_t *__restrict__  p_welford_count_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock  mean_var_grid_desc_mblock_mperblock_nblock,
const CountGridDescriptor_MBlock_MPerBlock_NBlock  count_grid_desc_mblock_mperblock_nblock,
const Block2ETileMap  block_2_etile_map,
index_t  NRaw 
)

◆ kernel_gemm_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_mupltipe_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_mupltipe_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
ReducePtrsGlobal  p_reduces_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const ReduceInElementwiseOperations  reduce_in_element_ops,
const ReduceAccElementwiseOperations  reduce_out_element_ops,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock  reduce_grid_desc_mblock_mperblock,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_wmma()

template<typename GridwiseGemm , typename ADataType , typename BDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_wmma ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
CDataType *__restrict__  p_c_grid,
const AGridDesc  a_grid_desc,
const BGridDesc  b_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_wmma_cshuffle_v3()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_wmma_cshuffle_v3 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v1() [1/2]

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( const FloatA *__restrict__  p_a_grid,
const FloatB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
typename GridwiseGemm::Problem  problem 
)

◆ kernel_gemm_xdl_cshuffle_v1() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v2() [1/2]

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( const FloatA *  p_a_grid,
const FloatB *  p_b_grid,
FloatC *  p_c_grid,
typename GridwiseGemm::Problem  problem 
)

◆ kernel_gemm_xdl_cshuffle_v2() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3 ( typename GridwiseGemm::Argument  karg)
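
A minimal host-side launch sketch for the Argument-taking kernels above and below. GridwiseGemm, karg, grid_size, block_size, and stream are placeholders for values produced by a concrete device op's setup code, not part of this API:

// Hypothetical sketch: launching an Argument-style kernel with HIP.
// Template arguments follow the declaration above; MinimumOccupancy and
// TailNum keep their defaults.
hipLaunchKernelGGL((ck::kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
                                                    true, // HasMainKBlockLoop
                                                    ck::InMemoryDataOperationEnum::Set>),
                   dim3(grid_size), dim3(block_size), 0, stream, karg);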

◆ kernel_gemm_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [1/2]

template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [2/2]

template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdl_waveletmodel_cshuffle()

template<typename GridwiseGemm , typename ABDataType , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename EElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_waveletmodel_cshuffle ( const ABDataType *__restrict__  p_a_grid,
const ABDataType *__restrict__  p_b_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const EElementwiseOperation  e_element_op,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap  block_2_etile_map 
)

◆ kernel_gemm_xdlops_bwd_weight()

template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , typename AGridDesc_B_K0_M_K1 , typename BGridDesc_B_K0_N_K1 , typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_bwd_weight ( const FloatA *__restrict__  p_a_grid,
const FloatB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_B_K0_M_K1  a_b_k0_m_k1_grid_desc,
const BGridDesc_B_K0_N_K1  b_b_k0_n_k1_grid_desc,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock  c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const CBlockClusterAdaptor  c_block_cluster_adaptor 
)

◆ kernel_gemm_xdlops_skip_b_lds_v1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 , typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_skip_b_lds_v1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3  b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3,
const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2  c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_splitk_lds_direct_load()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void ck::kernel_gemm_xdlops_splitk_lds_direct_load ( typename GridwiseGemm::Argument  karg,
const Block2CTileMap &  b2c_map,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op 
)

◆ kernel_gemm_xdlops_streamk()

template<typename GridwiseGemm >
__global__ void ck::kernel_gemm_xdlops_streamk ( const typename GridwiseGemm::FloatAB *  p_a_grid,
const typename GridwiseGemm::FloatAB *  p_b_grid,
typename GridwiseGemm::FloatC *  p_c_grid,
void *  p_workspace,
index_t  M,
index_t  N,
index_t  K,
index_t  StrideA,
index_t  StrideB,
index_t  StrideC,
typename GridwiseGemm::Block2CTileMap  block_mapping 
)

◆ kernel_gemm_xdlops_v2r3() [1/2]

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDesc_M_N  c_grid_desc_m_n 
)

◆ kernel_gemm_xdlops_v2r3() [2/2]

template<typename GridwiseGemm , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const typename GridwiseGemm::Argument  karg)

◆ kernel_gemm_xdlops_v2r4()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ABK0MK1GridDesc , typename BBK0NK1GridDesc , typename CM0N0M1N1M2M3M4N2GridDesc , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r4 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const ABK0MK1GridDesc  a_b_k0_m_k1_grid_desc,
const BBK0NK1GridDesc  b_b_k0_n_k1_grid_desc,
const CM0N0M1N1M2M3M4N2GridDesc  c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const CBlockClusterAdaptor  c_block_cluster_adaptor 
)

◆ kernel_gemm_xdlops_v2r4r2_simplified()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation >
__global__ void ck::kernel_gemm_xdlops_v2r4r2_simplified ( typename GridwiseGemm::Argument  karg,
const Block2CTileMap &  b2c_map,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op 
)

◆ kernel_gemm_xdlops_v3r1()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r1 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const AGridDesc_AK0_M_AK1  a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1  b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_v3r2()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r2 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC *__restrict__  p_c0_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_gemm_xdlops_v3r3()

template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r3 ( const FloatAB *__restrict__  p_a_grid,
const FloatAB *__restrict__  p_b_grid,
FloatC *__restrict__  p_c_grid,
const FloatC *__restrict__  p_c0_grid,
const FloatC *__restrict__  p_c1_grid,
const AGridDesc_K0_M_K1  a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1  b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl  c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CElementwiseOperation  c_element_op,
const Block2CTileMap  block_2_ctile_map 
)

◆ kernel_grouped_contraction_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm , typename ContractionMultiDKernelArg , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_contraction_multiple_d_xdl_cshuffle ( const void CK_CONSTANT_ADDRESS_SPACE *  contraction_args,
const index_t  group_count,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op 
)

◆ kernel_grouped_conv_multiple_d_wmma_cshuffle()

template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2CTileMap , typename ComputePtrOffsetOfBatch , bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_conv_multiple_d_wmma_cshuffle ( const ADataType *__restrict__  p_a_grid,
const BDataType *__restrict__  p_b_grid,
DsPointer  p_ds_grid,
EDataType *__restrict__  p_e_grid,
const AElementwiseOperation  a_element_op,
const BElementwiseOperation  b_element_op,
const CDEElementwiseOperation  cde_element_op,
const index_t  batch_count,
const AGridDesc_AK0_M_AK1  a_grid_desc,
const BGridDesc_BK0_N_BK1  b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock  ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock  e_grid_desc_mblock_mperblock_nblock_nperblock_,
const Block2CTileMap  block_2_ctile_map,
const ComputePtrOffsetOfBatch  compute_ptr_offset_of_batch 
)

◆ kernel_moe_gemm()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_gemm_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_mxgemm()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm ( typename GridwiseGemm::Argument  karg)

◆ kernel_moe_mxgemm_2lds()

template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm_2lds ( typename GridwiseGemm::Argument  karg)

◆ kernel_multiblock_batchnorm_forward()

template<typename GridwiseMultiblockBatchNormForward_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_multiblock_batchnorm_forward ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const MeanVarCountGridDesc_M_G  mean_var_count_grid_desc_m_g,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
AccDataType  epsilon,
const XDataType *const __restrict__  p_x,
MeanVarDataType *const __restrict__  p_welford_mean,
MeanVarDataType *const __restrict__  p_welford_variance,
int32_t *const __restrict__  p_welford_count,
int32_t *const __restrict__  p_control,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_multiblock_welford_first_half()

template<typename GridwiseMultiblockWelfordFirstHalf_ , typename XDataType , typename MeanVarDataType , typename XGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename GetReduceCountPerThreadFunctor >
__global__ void ck::kernel_multiblock_welford_first_half ( const XGridDesc_M_K  x_grid_desc_m_k,
const MeanVarCountGridDesc_M_G  mean_var_count_grid_desc_m_g,
const GetReduceCountPerThreadFunctor  get_reduce_count_per_thread,
index_t  num_k_block_tile_iteration,
const XDataType *const __restrict__  p_x,
MeanVarDataType *const  p_welford_mean,
MeanVarDataType *const  p_welford_variance,
int32_t *const  p_welford_count 
)

◆ kernel_multiple_buffer_set_value()

template<typename Grid1dBufferDescTuple , index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple , typename DataTypeTuple >
__global__ void ck::kernel_multiple_buffer_set_value ( const Grid1dBufferDescTuple  grid_1d_buffer_desc_tuple,
DataTypePointerTuple  p_global_tuple,
DataTypeTuple  value_tuple 
)

◆ kernel_multiple_reduce_multiblock()

template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void ck::kernel_multiple_reduce_multiblock ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M_Tuple  out_grid_desc_m_tuple,
const InElementwiseOperationTuple  in_elementwise_op_tuple,
const AccElementwiseOperationTuple  acc_elementwise_op_tuple,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
Array< AccDataType, NumReduction >  alpha_values,
const InDataType *const __restrict__  p_in_value_global,
Array< AccDataType, NumReduction >  beta_values,
OutDataTypePointerTuple  p_out_value_global_tuple 
)

◆ kernel_multiple_reduce_threadwise()

template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple >
__global__ void ck::kernel_multiple_reduce_threadwise ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M_Tuple  out_grid_desc_m_tuple,
const InElementwiseOperationTuple  in_elementwise_op_tuple,
const AccElementwiseOperationTuple  acc_elementwise_op_tuple,
Array< AccDataType, NumReduction >  alpha_values,
const InDataType *const __restrict__  p_in_value_global,
Array< AccDataType, NumReduction >  beta_values,
OutDataTypePointerTuple  p_out_value_global_tuple 
)

◆ kernel_nd_permute()

template<typename GridwisePermute , typename InGridDesc , typename OutGridDesc , typename InDataType , typename OutDataType , typename ElementwiseOperation , typename Block2TileMap >
__global__ void ck::kernel_nd_permute ( const InGridDesc  in_grid_desc,
const OutGridDesc  out_grid_desc,
const InDataType *  p_in_global,
OutDataType *  p_out_global,
const ElementwiseOperation  elementwise_op,
const Block2TileMap  block_2_tile_map 
)

◆ kernel_normalization()

template<typename GridwiseReduction , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M >
__global__ void ck::kernel_normalization ( const GridDesc_M_K  x_grid_desc_m_k,
const GridDesc_M_K  gamma_grid_desc_m_k,
const GridDesc_M_K  beta_grid_desc_m_k,
const GridDesc_M_K  y_grid_desc_m_k,
const GridDesc_M  save_mean_grid_desc_m,
const GridDesc_M  save_inv_std_grid_desc_m,
index_t  num_k_block_tile_iteration,
ComputeDataType  epsilon,
const XDataType *const __restrict__  p_x_global,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
SaveMeanInvStdDataType *const __restrict__  p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__  p_save_inv_std_global,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_normalizationSplitK1st()

template<typename GridwiseWelford , typename XDataType , typename WorkspaceMeanVarDataType , typename ComputeDataType , typename XGridDesc_M_K , typename MeanVarGridDesc_M_KBlock >
__global__ void ck::kernel_normalizationSplitK1st ( const XGridDesc_M_K  x_grid_desc_m_k,
const MeanVarGridDesc_M_KBlock  mean_var_grid_desc_m_kblock,
index_t  num_k_block_tile_iteration,
const XDataType *const __restrict__  p_x_global,
WorkspaceMeanVarDataType *const __restrict__  p_welford_mean,
WorkspaceMeanVarDataType *const __restrict__  p_welford_variance,
int32_t *const __restrict__  p_welford_count 
)

◆ kernel_normalizationSplitK2nd()

template<typename GridwiseWelfordNormalization , typename WorkspaceMeanVarDataType , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename MeanVarGridDesc_M_KBlock , typename CountGridDesc_M_KBlock , typename XYGammaBetaGridDesc_M_K , typename SaveMeanInvStdGridDesc_M >
__global__ void ck::kernel_normalizationSplitK2nd ( const MeanVarGridDesc_M_KBlock  mean_var_grid_desc_m_kblock,
const CountGridDesc_M_KBlock  count_grid_desc_m_kblock,
const XYGammaBetaGridDesc_M_K  x_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  gamma_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  beta_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K  y_grid_desc_m_k,
const SaveMeanInvStdGridDesc_M  save_mean_grid_desc_m,
const SaveMeanInvStdGridDesc_M  save_inv_std_grid_desc_m,
index_t  num_k_mean_var_count_iteration,
index_t  num_k_block_tile_iteration,
index_t  k_grid_size,
ComputeDataType  epsilon,
const WorkspaceMeanVarDataType *const  p_mean_global,
const WorkspaceMeanVarDataType *const  p_variance_global,
const int32_t *const  p_welford_count_global,
const XDataType *const __restrict__  p_x_global,
const GammaDataType *const __restrict__  p_gamma_global,
const BetaDataType *const __restrict__  p_beta_global,
YDataType *const __restrict__  p_y_global,
SaveMeanInvStdDataType *const __restrict__  p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__  p_save_inv_std_global,
const YElementwiseOperation  y_elementwise_op 
)

◆ kernel_put_element_1d()

template<typename GridwisePutElementwise1dFunctor , typename InGrid1dDesc , typename InDataType , typename IndexDataType , typename OutDataType , typename ElementwiseOperation >
__global__ void ck::kernel_put_element_1d ( const InGrid1dDesc  in_grid_1d_desc,
const InDataType *__restrict__  p_in_global,
const IndexDataType *__restrict__  p_indices_global,
OutDataType *__restrict__  p_out_global,
const ElementwiseOperation  elementwise_op 
)

◆ kernel_reduce_multiblock()

template<typename GridwiseReduction , bool OutputIndex, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void ck::kernel_reduce_multiblock ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const AccElementwiseOperation  acc_elementwise_op,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
const IndexDataType *const __restrict__  p_in_index_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global,
IndexDataType *const __restrict__  p_out_index_global 
)

◆ kernel_reduce_second_half_batchnorm_backward_final()

template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_ , typename XDataType , typename DyDataType , typename DxDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename DscaleDbiasGridDesc_M_K , typename MeanVarGridDesc_M , typename ScaleBiasGridDesc_M >
__global__ void ck::kernel_reduce_second_half_batchnorm_backward_final ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const XYGridDesc_M_K  dx_grid_desc_m_k,
const DscaleDbiasGridDesc_M_K  dscale_dbias_grid_desc_m_k,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
index_t  blkgroup_size,
long_index_t  reduce_size,
index_t  num_xy_k_block_tile_iteration,
index_t  num_dscale_dbias_k_block_tile_iteration,
const DscaleDbiasDataType *const __restrict__  p_reduce_dscale,
const DscaleDbiasDataType *const __restrict__  p_reduce_dbias,
const MeanVarDataType *const __restrict__  p_mean,
const MeanVarDataType *const __restrict__  p_inv_var,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
const ScaleDataType *const __restrict__  p_scale,
const DyElementwiseOp  dy_elementwise_op,
DxDataType *const __restrict__  p_dx,
DscaleDbiasDataType *const __restrict__  p_dscale,
DscaleDbiasDataType *const __restrict__  p_dbias 
)

◆ kernel_reduce_threadwise()

template<typename GridwiseReduction , bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation >
__global__ void ck::kernel_reduce_threadwise ( const InGridDesc_M_K  in_grid_desc_m_k,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const AccElementwiseOperation  acc_elementwise_op,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
const IndexDataType *const __restrict__  p_in_index_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global,
IndexDataType *const __restrict__  p_out_index_global 
)

◆ kernel_reduce_threadwise_multi_d()

template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename InGridDesc_M_K , typename DsGridDesc_M , typename OutGridDesc_M , typename InElementwiseOperation , typename OutElementwiseOperation , typename DsGridPointer >
__global__ void ck::kernel_reduce_threadwise_multi_d ( const InGridDesc_M_K  in_grid_desc_m_k,
const DsGridDesc_M  ds_grid_desc_m,
const OutGridDesc_M  out_grid_desc_m,
const InElementwiseOperation  in_elementwise_op,
const OutElementwiseOperation  out_elementwise_op,
const InDataType *const __restrict__  p_in_value_global,
const DsGridPointer  p_ds_value_global,
OutDataType *const __restrict__  p_out_value_global 
)

◆ kernel_softmax()

template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K >
__global__ void ck::kernel_softmax ( const GridDesc_M_K  in_grid_desc_m_k,
const GridDesc_M_K  out_grid_desc_m_k,
index_t  block_group_size,
index_t  num_k_block_tile_iteration,
AccDataType  alpha,
const InDataType *const __restrict__  p_in_value_global,
AccDataType  beta,
OutDataType *const __restrict__  p_out_value_global 
)

◆ kernel_sparse_embeddings_forward_layernorm()

template<typename GridwiseSparseEmbedding , typename EmbType , typename IndexType , typename GammaDataType , typename BetaDataType , typename AccDataType , typename OutType , typename OutGridDesc , typename EmbElementwiseOperation , ck::index_t NumEmbeddings>
__global__ void ck::kernel_sparse_embeddings_forward_layernorm ( OutType *  p_out,
const ck::Array< EmbType *, NumEmbeddings >  p_embs,
const ck::Array< IndexType *, NumEmbeddings >  p_indexes,
const GammaDataType *  p_gamma,
const BetaDataType *  p_beta,
const OutGridDesc  out_grid_desc,
const AccDataType  epsilon,
const EmbElementwiseOperation  emb_elementwise_op 
)

◆ kernel_tensor_rearrange()

template<typename InputGridDesc , typename InputDataType , typename OutputGridDesc , typename OutputDataType , typename Block2ETileMap , typename ComputePtrOffsetOfStridedBatch , typename GridwiseTensorRearrangeKernel >
__global__ void ck::kernel_tensor_rearrange ( const InputGridDesc  in_grid_desc,
const InputDataType *__restrict__  p_in_global,
const OutputGridDesc  out_grid_desc,
OutputDataType *__restrict__  p_out_global,
const index_t  batch_count,
const Block2ETileMap  block_2_tile_map,
const ComputePtrOffsetOfStridedBatch  compute_ptr_offset_of_batch 
)

◆ kernel_welford_layernorm2d_second_half()

template<typename GridwiseWelfordLayernorm , typename EMeanVarDataType , typename HDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename EHGridDesc_M_N , typename LayernormMeanVarGridDesc_M_NBlock , typename LayernormCountGridDesc_M_NBlock , typename GammaBetaGridDesc_N , typename HElementwiseOperation >
__global__ void ck::kernel_welford_layernorm2d_second_half ( const EMeanVarDataType *__restrict__  p_e_grid,
const EMeanVarDataType *__restrict__  p_in_welford_mean_grid,
const EMeanVarDataType *__restrict__  p_in_welford_var_grid,
const int32_t *__restrict__  p_in_welford_count_grid,
const GammaDataType *__restrict__  p_gamma_grid,
const BetaDataType *__restrict__  p_beta_grid,
HDataType *__restrict__  p_h_grid,
const EHGridDesc_M_N  e_grid_desc_m_n,
const EHGridDesc_M_N  h_grid_desc_m_n,
const LayernormMeanVarGridDesc_M_NBlock  mean_var_grid_desc_m_nblock,
const LayernormCountGridDesc_M_NBlock  count_grid_desc_m_nblock,
const GammaBetaGridDesc_N  gamma_grid_desc_n,
const GammaBetaGridDesc_N  beta_grid_desc_n,
index_t  numMeanVarCountBlockTileIteration_N,
index_t  NBlockClusterLength,
ComputeDataType  epsilon,
HElementwiseOperation  h_element_op 
)

◆ kernel_welford_second_half_batchnorm_forward_final()

template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M >
__global__ void ck::kernel_welford_second_half_batchnorm_forward_final ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  y_grid_desc_m_k,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M  scale_grid_desc_m,
const ScaleBiasGridDesc_M  bias_grid_desc_m,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
index_t  blkgroup_size,
index_t  num_xy_k_block_tile_iteration,
AccDataType  epsilon,
const MeanVarDataType *const __restrict__  p_in_welford_mean,
const MeanVarDataType *const __restrict__  p_in_welford_variance,
const int32_t *const __restrict__  p_in_welford_count,
const XDataType *const __restrict__  p_x,
const ScaleDataType *const __restrict__  p_scale,
const BiasDataType *const __restrict__  p_bias,
const YElementwiseOp  y_elementwise_op,
YDataType *const __restrict__  p_y,
bool  updateMovingAverage,
AccDataType  averageFactor,
MeanVarDataType *const __restrict__  resultRunningMean,
MeanVarDataType *const __restrict__  resultRunningVariance,
bool  saveMeanInvVariance,
MeanVarDataType *const __restrict__  resultSaveMean,
MeanVarDataType *const __restrict__  resultSaveInvVariance 
)

◆ kernel_welford_second_half_reduce_first_half()

template<typename GridwiseWelfordSecondHalfReduceFirstHalf_ , typename XDataType , typename DyDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename MeanVarGridDesc_M , typename MeanVarCountGridDesc_M_K , typename DscaleDbiasGridDesc_M_G >
__global__ void ck::kernel_welford_second_half_reduce_first_half ( const XYGridDesc_M_K  x_grid_desc_m_k,
const XYGridDesc_M_K  dy_grid_desc_m_k,
const MeanVarGridDesc_M  mean_var_grid_desc_m,
const MeanVarCountGridDesc_M_K  mean_var_count_grid_desc_m_k,
const DscaleDbiasGridDesc_M_G  dscale_dbias_grid_desc_m_g,
index_t  blkgroup_size,
index_t  num_xy_k_block_tile_iteration,
index_t  num_mean_var_count_k_block_tile_iteration,
AccDataType  epsilon,
bool  haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__  p_savedMean,
const MeanVarDataType *const __restrict__  p_savedInvVar,
const MeanVarDataType *const __restrict__  p_in_welford_mean,
const MeanVarDataType *const __restrict__  p_in_welford_variance,
const int32_t *const __restrict__  p_in_welford_count,
const DyElementwiseOp  dy_elementwise_op,
MeanVarDataType *const __restrict__  p_out_welford_mean,
MeanVarDataType *const __restrict__  p_out_welford_inv_variance,
const XDataType *const __restrict__  p_x,
const DyDataType *const __restrict__  p_dy,
DscaleDbiasDataType *const __restrict__  p_reduce_dscale,
DscaleDbiasDataType *const __restrict__  p_reduce_dbias 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 ( half2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_atomic_add_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_add_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_atomic_add_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()

__device__ double ck::llvm_amdgcn_raw_buffer_atomic_max_fp64 ( double  vdata,
int32x4_t  rsrc,
int  voffset,
int  soffset,
int  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16()

__device__ half_t ck::llvm_amdgcn_raw_buffer_load_fp16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_load_fp16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp16x4()

__device__ half4_t ck::llvm_amdgcn_raw_buffer_load_fp16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_load_fp32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)
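
All llvm_amdgcn_raw_buffer_* wrappers share one calling convention: a 128-bit buffer resource, a per-lane byte offset (voffset), a wave-uniform byte offset (soffset), and cache-control bits (glc_slc). A minimal device-side sketch, assuming only the signatures listed on this page (make_wave_buffer_resource is documented further below):

// Copy one float per lane through a buffer load/store pair.
// Offsets are in bytes; glc_slc = 0 selects default cache behavior.
__device__ void copy_one_float(const float* p_src, float* p_dst, ck::index_t n)
{
    const ck::int32x4_t src_rsrc  = ck::make_wave_buffer_resource(p_src, n);
    const ck::int32x4_t dst_rsrc  = ck::make_wave_buffer_resource(p_dst, n);
    const ck::index_t byte_offset = threadIdx.x * sizeof(float);
    const float v = ck::llvm_amdgcn_raw_buffer_load_fp32(src_rsrc, byte_offset, 0, 0);
    ck::llvm_amdgcn_raw_buffer_store_fp32(v, dst_rsrc, byte_offset, 0, 0);
}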

◆ llvm_amdgcn_raw_buffer_load_fp32x2()

__device__ float2_t ck::llvm_amdgcn_raw_buffer_load_fp32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_fp32x4()

__device__ float4_t ck::llvm_amdgcn_raw_buffer_load_fp32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16()

__device__ bhalf_t ck::llvm_amdgcn_raw_buffer_load_i16 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x2()

__device__ bhalf2_t ck::llvm_amdgcn_raw_buffer_load_i16x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i16x4()

__device__ bhalf4_t ck::llvm_amdgcn_raw_buffer_load_i16x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_load_i32 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x2()

__device__ int32x2_t ck::llvm_amdgcn_raw_buffer_load_i32x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i32x4()

__device__ int32x4_t ck::llvm_amdgcn_raw_buffer_load_i32x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8()

__device__ int8_t ck::llvm_amdgcn_raw_buffer_load_i8 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x2()

__device__ int8x2_t ck::llvm_amdgcn_raw_buffer_load_i8x2 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_i8x4()

__device__ int8x4_t ck::llvm_amdgcn_raw_buffer_load_i8x4 ( int32x4_t  srsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_load_lds()

__device__ void ck::llvm_amdgcn_raw_buffer_load_lds ( int32x4_t  rsrc,
uint32_t *  lds_ptr,
index_t  size,
index_t  voffset,
index_t  soffset,
index_t  offset,
index_t  aux 
)

◆ llvm_amdgcn_raw_buffer_store_fp16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16 ( half_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x2 ( half2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x4 ( half4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32 ( float  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x2 ( float2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_fp32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x4 ( float4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16 ( bhalf_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x2 ( bhalf2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x4 ( bhalf4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32 ( int32_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x2 ( int32x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x4 ( int32x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8 ( int8_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x2 ( int8x2_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ llvm_amdgcn_raw_buffer_store_i8x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x4 ( int8x4_t  vdata,
int32x4_t  rsrc,
index_t  voffset,
index_t  soffset,
index_t  glc_slc 
)

◆ make_array() [1/2]

template<typename X >
__host__ constexpr __device__ auto ck::make_array ( )
constexpr

◆ make_array() [2/2]

template<typename X , typename... Xs>
__host__ constexpr __device__ auto ck::make_array ( X &&  x,
Xs &&...  xs 
)
constexpr
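
Both overloads deduce the Array element type and size from the arguments; the nullary overload yields an empty Array. For example:

// constexpr-friendly on host and device; deduces ck::Array<int, 3>.
constexpr auto lens = ck::make_array(4, 8, 16);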

◆ make_cluster_descriptor()

template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ constexpr __device__ auto ck::make_cluster_descriptor ( const Lengths &  lengths,
ArrangeOrder  order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{} 
)
constexpr

◆ make_default_loop_scheduler()

constexpr LoopScheduler ck::make_default_loop_scheduler ( )
constexpr

◆ make_dynamic_buffer() [1/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto ck::make_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size 
)
constexpr
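
A minimal sketch for a global-memory pointer; the returned buffer tags the pointer with its address space so later accesses pick the matching load/store path:

// element_space_size is in elements, per the signature above.
__device__ void wrap_global(float* p_global, ck::index_t n)
{
    auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_global, n);
    // buf can now be handed to blockwise/threadwise transfer primitives.
}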

◆ make_dynamic_buffer() [2/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize , typename X , typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto ck::make_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size,
X  invalid_element_value 
)
constexpr

◆ make_embed_transform()

template<typename UpLengths , typename Coefficients , typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ constexpr __device__ auto ck::make_embed_transform ( const UpLengths &  up_lengths,
const Coefficients &  coefficients 
)
constexpr

◆ make_freeze_transform()

template<typename LowerIndex >
__host__ constexpr __device__ auto ck::make_freeze_transform ( const LowerIndex &  low_idx)
constexpr

◆ make_insert_transform()

template<typename UpperIndex >
__host__ constexpr __device__ auto ck::make_insert_transform ( const UpperIndex &  up_idx)
constexpr

◆ make_left_pad_transform()

template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_left_pad_transform ( const LowLength &  low_length,
const LeftPadLength &  left_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_long_dynamic_buffer()

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize >
__host__ constexpr __device__ auto ck::make_long_dynamic_buffer ( T *  p,
ElementSpaceSize  element_space_size 
)
constexpr

◆ make_merge_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform ( const LowLengths &  low_lengths)
constexpr
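
A merge transform folds several lower dimensions into one upper dimension (the _v1/_v2/_v3/_v4 variants below trade index-math strategies: carry check, magic division, plain div/mod, no carry). A minimal sketch, assuming the companion helper ck::transform_tensor_descriptor:

// Fold a (4, 8) pair of dimensions into a single length-32 dimension.
constexpr auto desc_m0_m1 = ck::make_naive_tensor_descriptor_packed(
    ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}));
constexpr auto desc_m = ck::transform_tensor_descriptor(
    desc_m0_m1,
    ck::make_tuple(ck::make_merge_transform(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}))),
    ck::make_tuple(ck::Sequence<0, 1>{}), // lower (old) dimension ids
    ck::make_tuple(ck::Sequence<0>{}));   // upper (new) dimension ids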

◆ make_merge_transform_v1_carry_check()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v1_carry_check ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v2_magic_division()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v2_magic_division ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v3_division_mod()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v3_division_mod ( const LowLengths &  low_lengths)
constexpr

◆ make_merge_transform_v4_no_carry()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_merge_transform_v4_no_carry ( const LowLengths &  low_lengths)
constexpr

◆ make_modulo_transform()

template<typename Modulus , typename UpLength >
__host__ constexpr __device__ auto ck::make_modulo_transform ( const Modulus &  modulus,
const UpLength &  up_length 
)
constexpr

◆ make_multi_index()

template<typename... Xs>
__host__ constexpr __device__ auto ck::make_multi_index ( Xs &&...  xs)
constexpr
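
For example:

// A rank-3 multi-index; make_zero_multi_index (below) gives the all-zero one.
constexpr auto idx  = ck::make_multi_index(0, 1, 2);
constexpr auto zero = ck::make_zero_multi_index<3>();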

◆ make_naive_tensor_descriptor()

template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor ( const Tuple< Lengths... > &  lengths,
const Tuple< Strides... > &  strides 
)
constexpr

◆ make_naive_tensor_descriptor_aligned()

template<typename... Lengths, typename Align >
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor_aligned ( const Tuple< Lengths... > &  lengths,
Align  align 
)
constexpr

◆ make_naive_tensor_descriptor_packed()

template<typename... Lengths>
__host__ constexpr __device__ auto ck::make_naive_tensor_descriptor_packed ( const Tuple< Lengths... > &  lengths)
constexpr
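
The three make_naive_tensor_descriptor* factories describe the same logical shape with different stride policies: explicit strides, strides padded to an alignment, or fully packed. A minimal sketch for a 4x8 row-major tensor:

constexpr auto strided = ck::make_naive_tensor_descriptor(
    ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}),  // lengths
    ck::make_tuple(ck::Number<8>{}, ck::Number<1>{})); // strides
constexpr auto packed = ck::make_naive_tensor_descriptor_packed(
    ck::make_tuple(ck::Number<4>{}, ck::Number<8>{})); // strides derived as packed
constexpr auto aligned = ck::make_naive_tensor_descriptor_aligned(
    ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}),
    ck::Number<16>{}); // row stride rounded up to a multiple of 16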

◆ make_pad_transform()

template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_pad_transform ( const LowLength &  low_length,
const LeftPad &  left_pad,
const RightPad &  right_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_pass_through_transform()

template<typename LowLength >
__host__ constexpr __device__ auto ck::make_pass_through_transform ( const LowLength &  low_length)
constexpr

◆ make_right_pad_transform()

template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false>
__host__ constexpr __device__ auto ck::make_right_pad_transform ( const LowLength &  low_length,
const RightPadLength &  right_pad,
integral_constant< bool, SkipIsValidCheck >  = integral_constant<bool, false>{} 
)
constexpr
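
make_left_pad_transform, make_pad_transform, and make_right_pad_transform extend a dimension on the left, right, or both sides, e.g. to round a length up to a tile-size multiple; SkipIsValidCheck disables the validity test for the padded region. A minimal sketch, assuming ck::transform_tensor_descriptor:

// Extend a length-30 dimension to 32 with two elements of right padding.
constexpr auto padded = ck::transform_tensor_descriptor(
    ck::make_naive_tensor_descriptor_packed(ck::make_tuple(ck::Number<30>{})),
    ck::make_tuple(ck::make_right_pad_transform(ck::Number<30>{}, ck::Number<2>{})),
    ck::make_tuple(ck::Sequence<0>{}),
    ck::make_tuple(ck::Sequence<0>{}));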

◆ make_sequence()

template<index_t... Is>
__host__ constexpr __device__ auto ck::make_sequence ( Number< Is >  ...)
constexpr
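
For example:

constexpr auto seq = ck::make_sequence(ck::Number<0>{}, ck::Number<2>{}, ck::Number<1>{});
// seq is ck::Sequence<0, 2, 1>{}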

◆ make_single_stage_tensor_adaptor()

template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss >
__host__ constexpr __device__ auto ck::make_single_stage_tensor_adaptor ( const Transforms &  transforms,
LowerDimensionOldTopIdss  ,
UpperDimensionNewTopIdss   
)
constexpr

◆ make_slice_transform()

template<typename LowLength , typename SliceBegin , typename SliceEnd >
__host__ constexpr __device__ auto ck::make_slice_transform ( const LowLength &  low_length,
const SliceBegin &  slice_begin,
const SliceEnd &  slice_end 
)
constexpr

◆ make_static_buffer() [1/2]

template<AddressSpaceEnum AddressSpace, typename T , long_index_t N>
__host__ constexpr __device__ auto ck::make_static_buffer ( LongNumber< N >  )
constexpr

◆ make_static_buffer() [2/2]

template<AddressSpaceEnum AddressSpace, typename T , index_t N>
__host__ constexpr __device__ auto ck::make_static_buffer ( Number< N >  )
constexpr

◆ make_static_tensor() [1/2]

template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ constexpr __device__ auto ck::make_static_tensor ( TensorDesc  )
constexpr

◆ make_static_tensor() [2/2]

template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename X , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false>
__host__ constexpr __device__ auto ck::make_static_tensor ( TensorDesc  ,
X  invalid_element_value 
)
constexpr

◆ make_statically_indexed_array() [1/2]

template<typename X >
__host__ constexpr __device__ auto ck::make_statically_indexed_array ( )
constexpr

◆ make_statically_indexed_array() [2/2]

template<typename X , typename... Xs>
__host__ constexpr __device__ auto ck::make_statically_indexed_array ( const X &  x,
const Xs &...  xs 
)
constexpr

◆ make_tensor_coordinate()

template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto ck::make_tensor_coordinate ( const TensorDesc &  tensor_desc,
const VisibleIndex &  idx_visible 
)
constexpr

◆ make_tensor_coordinate_step() [1/2]

template<typename TensorDesc , typename VisibleIndex >
__host__ constexpr __device__ auto ck::make_tensor_coordinate_step ( const TensorDesc &  ,
const VisibleIndex &  idx_diff_visible 
)
constexpr

◆ make_tensor_coordinate_step() [2/2]

template<typename TensorDesc , typename VisibleIndex , typename UpdateLowerIndexHack >
__host__ constexpr __device__ auto ck::make_tensor_coordinate_step ( const TensorDesc &  ,
const VisibleIndex &  idx_diff_visible,
UpdateLowerIndexHack   
)
constexpr

◆ make_tuple()

template<typename... Xs>
__host__ constexpr __device__ auto ck::make_tuple ( Xs &&...  xs)
constexpr

◆ make_unmerge_transform()

template<typename UpLengths , bool Use24BitIntegerCalculation = false>
__host__ constexpr __device__ auto ck::make_unmerge_transform ( const UpLengths &  up_lengths,
integral_constant< bool, Use24BitIntegerCalculation >  = integral_constant<bool, false>{} 
)
constexpr

◆ make_vector_type()

template<typename T , index_t N>
__host__ constexpr __device__ auto ck::make_vector_type ( Number< N >  )
constexpr

◆ make_vectorize_transform()

template<typename VectorSize , typename UpLength >
__host__ constexpr __device__ auto ck::make_vectorize_transform ( const VectorSize &  vector_size,
const UpLength &  up_length 
)
constexpr

◆ make_wave_buffer_resource()

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource ( T *  p_wave,
index_t  element_space_size 
)

◆ make_wave_buffer_resource_new()

template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_new ( T *  p_wave,
index_t  element_space_size 
)

◆ make_wave_buffer_resource_with_default_range()

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource_with_default_range ( T *  p_wave)

◆ make_wave_buffer_resource_with_default_range_new()

template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_with_default_range_new ( T *  p_wave)
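
All four helpers build the 128-bit buffer resource descriptor (V#) consumed by AMD buffer load/store instructions; the *_new variants return the compiler's opaque __amdgpu_buffer_rsrc_t instead of a raw int32x4_t, and the *_with_default_range variants omit the explicit element count. A device-side sketch (illustrative only; the descriptor is normally passed on to CK's buffer-load intrinsics):

    template <typename T>
    __device__ void make_rsrc_example(T* p_global, ck::index_t n)
    {
        // Descriptor covering n elements of T starting at p_global.
        ck::int32x4_t rsrc = ck::make_wave_buffer_resource(p_global, n);
        (void)rsrc;
    }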

◆ make_xor_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_xor_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_xor_with_modulo_transform()

template<typename LowLengths >
__host__ constexpr __device__ auto ck::make_xor_with_modulo_transform ( const LowLengths &  low_lengths)
constexpr

◆ make_zero_multi_index()

template<index_t NSize>
__host__ constexpr __device__ auto ck::make_zero_multi_index ( )
constexpr

◆ merge_sequences()

template<typename... Seqs>
__host__ constexpr __device__ auto ck::merge_sequences ( Seqs...  )
constexpr

◆ modify_sequence_elements_by_ids()

template<typename Seq , typename Values , typename Ids >
__host__ constexpr __device__ auto ck::modify_sequence_elements_by_ids ( Seq  ,
Values  ,
Ids   
)
constexpr

◆ move_tensor_coordinate()

template<typename TensorDesc , typename TensorCoord , typename TensorCoordStep >
__host__ constexpr __device__ void ck::move_tensor_coordinate ( const TensorDesc &  tensor_desc,
TensorCoord &  coord,
const TensorCoordStep &  coord_step 
)
constexpr
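
Together with make_tensor_coordinate and make_tensor_coordinate_step above, this enables incremental offset computation instead of re-evaluating the full index map on every access. A sketch, assuming a packed 16x8 descriptor:

    using namespace ck;
    constexpr auto desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<16>{}, Number<8>{}));
    auto coord      = make_tensor_coordinate(desc, make_multi_index(3, 0));
    const auto step = make_tensor_coordinate_step(desc, make_multi_index(0, 2));
    move_tensor_coordinate(desc, coord, step);   // coord now refers to (3, 2)
    const index_t offset = coord.GetOffset();    // 3 * 8 + 2 == 26 for this layout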

◆ mxf8_convert_rne()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::mxf8_convert_rne ( X  x,
float  scale 
)
constexpr
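
The specializations below cover the OCP FP8/BF8 scalar and vector types; mxf8_convert_sr further down is the stochastic-rounding counterpart with the same interface. A host-side usage sketch (treating scale as the MX block scale applied during conversion):

    using namespace ck;
    float x     = 0.75f;
    float scale = 2.0f;
    f8_ocp_t q_rne = mxf8_convert_rne<f8_ocp_t, float>(x, scale); // round-to-nearest-even
    f8_ocp_t q_sr  = mxf8_convert_sr<f8_ocp_t, float>(x, scale);  // stochastic rounding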

◆ mxf8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_rne< bf8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_rne< bf8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_rne< bf8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_rne< f8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_rne< f8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_rne< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_rne< f8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::mxf8_convert_sr ( X  x,
float  scale 
)
constexpr

◆ mxf8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_sr< bf8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_sr< bf8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_sr< bf8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_sr< f8_ocp_t, float > ( float  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_sr< f8x16_ocp_t, float16_t > ( float16_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t  x,
float  scale 
)
inline

◆ mxf8_convert_sr< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_sr< f8x32_ocp_t, float32_t > ( float32_t  x,
float  scale 
)
inline

◆ next_pow2()

constexpr auto ck::next_pow2 ( uint32_t  x)
inline constexpr
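
No description is attached, so as a behavioral reference only: a round-up-to-power-of-two helper is conventionally equivalent to the following bit-twiddling sketch (hypothetical reference implementation, not CK's code):

    #include <cstdint>
    constexpr std::uint32_t next_pow2_ref(std::uint32_t x)
    {
        // Smallest power of two >= x; next_pow2_ref(5) == 8, next_pow2_ref(8) == 8.
        x = (x <= 1u) ? 0u : x - 1u;
        x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
        return x + 1u;
    }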

◆ NormalizationKernelSelector()

template<typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto ck::NormalizationKernelSelector ( bool  isSweepOnce)

◆ operator%() [1/4]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator% ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator%() [2/4]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator% ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator%() [3/4]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator% ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator%() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator% ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator*() [1/8]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator* ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator*() [2/8]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator*() [3/8]

template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( const Tuple< Xs... > &  x,
Y  a 
)
constexpr

◆ operator*() [4/8]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator* ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator*() [5/8]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator* ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator*() [6/8]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator* ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator*() [7/8]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator* ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator*() [8/8]

template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator* ( Y  a,
const Tuple< Xs... > &  x 
)
constexpr

◆ operator+() [1/6]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator+ ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator+() [2/6]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator+ ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator+() [3/6]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator+ ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator+() [4/6]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator+ ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator+() [5/6]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator+ ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator+() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator+ ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator+=() [1/3]

template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto ck::operator+= ( ContainerElementPicker< Arr, Picks > &  y,
const X &  x 
)
constexpr

◆ operator+=() [2/3]

template<index_t NSize, typename X >
__host__ constexpr __device__ auto ck::operator+= ( MultiIndex< NSize > &  y,
const X &  x 
)
constexpr

◆ operator+=() [3/3]

template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator+= ( Tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator-() [1/6]

template<index_t NSize, typename T >
__host__ constexpr __device__ auto ck::operator- ( const MultiIndex< NSize > &  a,
const T &  b 
)
constexpr

◆ operator-() [2/6]

template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator- ( const Tuple< Xs... > &  x,
const Y &  y 
)
constexpr

◆ operator-() [3/6]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator- ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator-() [4/6]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator- ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator-() [5/6]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator- ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator-() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator- ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator-=() [1/3]

template<typename Arr , typename Picks , typename X >
__host__ constexpr __device__ auto ck::operator-= ( ContainerElementPicker< Arr, Picks > &  y,
const X &  x 
)
constexpr

◆ operator-=() [2/3]

template<index_t NSize, typename X >
__host__ constexpr __device__ auto ck::operator-= ( MultiIndex< NSize > &  y,
const X &  x 
)
constexpr

◆ operator-=() [3/3]

template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ constexpr __device__ auto ck::operator-= ( Tuple< Ys... > &  y,
const X &  x 
)
constexpr

◆ operator/() [1/4]

template<typename TX , TX X, typename TY , TY Y>
__host__ constexpr __device__ auto ck::operator/ ( integral_constant< TX, X >  ,
integral_constant< TY, Y >   
)
constexpr

◆ operator/() [2/4]

template<index_t Y, index_t... Xs>
__host__ constexpr __device__ auto ck::operator/ ( Number< Y >  ,
Sequence< Xs... >   
)
constexpr

◆ operator/() [3/4]

template<index_t... Xs, index_t Y>
__host__ constexpr __device__ auto ck::operator/ ( Sequence< Xs... >  ,
Number< Y >   
)
constexpr

◆ operator/() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::operator/ ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ operator==()

template<index_t... Xs, index_t... Ys>
__host__ constexpr __device__ bool ck::operator== ( Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ pick_container_element() [1/2]

template<typename Arr , typename Picks >
__host__ constexpr __device__ auto ck::pick_container_element ( Arr &  a,
Picks   
)
constexpr

◆ pick_container_element() [2/2]

template<typename Arr , typename Picks >
__host__ constexpr __device__ auto ck::pick_container_element ( const Arr &  a,
Picks   
)
constexpr

◆ pick_sequence_elements_by_ids()

template<typename Seq , index_t... Is>
__host__ constexpr __device__ auto ck::pick_sequence_elements_by_ids ( Seq  ,
Sequence< Is... >   
)
constexpr

◆ pick_sequence_elements_by_mask()

template<typename Seq , typename Mask >
__host__ constexpr __device__ auto ck::pick_sequence_elements_by_mask ( Seq  ,
Mask   
)
constexpr

◆ prand_generator() [1/2]

template<typename T , uint32_t seed_t, ck::enable_if_t< std::is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( index_t  id,
T  val,
uint32_t  seed = seed_t 
)

◆ prand_generator() [2/2]

template<typename T , uint32_t seed_t, ck::enable_if_t<!(std::is_same< float, T >{}||std::is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( int  id,
T  val,
uint32_t  seed = seed_t 
)
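
Both overloads hash an element id and its value into a pseudo-random 32-bit word; this kind of generator backs the stochastic-rounding conversions elsewhere on this page. A usage sketch with an arbitrary compile-time seed (the seed value below is illustrative):

    using namespace ck;
    uint32_t r = prand_generator<float, 0x9E3779B9u>(/*id=*/42, /*val=*/1.25f);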

◆ print_multi_index()

template<typename... Xs>
__host__ __device__ void ck::print_multi_index ( const Tuple< Xs... > &  x)

◆ reduce_on_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ index_t ck::reduce_on_sequence ( Seq  ,
Reduce  f,
Number< Init >   
)
constexpr
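
A compile-time fold over a Sequence. For example, multiplying all elements (sketch; math::multiplies is assumed to be the usual function object from the math namespace listed above):

    using namespace ck;
    constexpr index_t n =
        reduce_on_sequence(Sequence<2, 3, 4>{}, math::multiplies{}, Number<1>{});
    static_assert(n == 24, "2 * 3 * 4");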

◆ reverse_exclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::reverse_exclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ reverse_inclusive_scan_sequence()

template<typename Seq , typename Reduce , index_t Init>
__host__ constexpr __device__ auto ck::reverse_inclusive_scan_sequence ( Seq  ,
Reduce  ,
Number< Init >   
)
constexpr

◆ s_nop()

__device__ void ck::s_nop ( )

◆ scaled_type_convert()

template<typename Y , typename X >
constexpr __host__ Y ck::scaled_type_convert ( e8m0_bexp_t  scale,
X  x 
)
constexpr
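
The scale parameter is an e8m0_bexp_t, i.e. a biased power-of-two block exponent as used by the MX formats. A host-side sketch, assuming e8m0_bexp_t is constructible from a float scale value:

    using namespace ck;
    e8m0_bexp_t scale{2.0f};  // block scale of 2^1 (constructor assumed)
    f8_ocp_t q = scaled_type_convert<f8_ocp_t, float>(scale, 1.5f);
    float back = scaled_type_convert<float, f8_ocp_t>(scale, q);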

◆ scaled_type_convert< bf8_ocp_t, float >()

template<>
__host__ bf8_ocp_t ck::scaled_type_convert< bf8_ocp_t, float > ( e8m0_bexp_t  scale,
float  x 
)
inline

◆ scaled_type_convert< bf8x16_ocp_t, float16_t >()

template<>
__host__ bf8x16_ocp_t ck::scaled_type_convert< bf8x16_ocp_t, float16_t > ( e8m0_bexp_t  scale,
float16_t  x 
)
inline

◆ scaled_type_convert< bf8x2_ocp_t, float2_t >()

template<>
__host__ bf8x2_ocp_t ck::scaled_type_convert< bf8x2_ocp_t, float2_t > ( e8m0_bexp_t  scale,
float2_t  x 
)
inline

◆ scaled_type_convert< bf8x32_ocp_t, float32_t >()

template<>
__host__ bf8x32_ocp_t ck::scaled_type_convert< bf8x32_ocp_t, float32_t > ( e8m0_bexp_t  scale,
float32_t  x 
)
inline

◆ scaled_type_convert< f8_ocp_t, float >()

template<>
__host__ f8_ocp_t ck::scaled_type_convert< f8_ocp_t, float > ( e8m0_bexp_t  scale,
float  x 
)
inline

◆ scaled_type_convert< f8x16_ocp_t, float16_t >()

template<>
__host__ f8x16_ocp_t ck::scaled_type_convert< f8x16_ocp_t, float16_t > ( e8m0_bexp_t  scale,
float16_t  x 
)
inline

◆ scaled_type_convert< f8x2_ocp_t, float2_t >()

template<>
__host__ f8x2_ocp_t ck::scaled_type_convert< f8x2_ocp_t, float2_t > ( e8m0_bexp_t  scale,
float2_t  x 
)
inline

◆ scaled_type_convert< f8x32_ocp_t, float32_t >()

template<>
__host__ f8x32_ocp_t ck::scaled_type_convert< f8x32_ocp_t, float32_t > ( e8m0_bexp_t  scale,
float32_t  x 
)
inline

◆ scaled_type_convert< float, bf8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, bf8_ocp_t > ( e8m0_bexp_t  scale,
bf8_ocp_t  x 
)
inline

◆ scaled_type_convert< float, f8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, f8_ocp_t > ( e8m0_bexp_t  scale,
f8_ocp_t  x 
)
inline

◆ scaled_type_convert< float16_t, bf8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, bf8x16_ocp_t > ( e8m0_bexp_t  scale,
bf8x16_ocp_t  x 
)
inline

◆ scaled_type_convert< float16_t, f8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, f8x16_ocp_t > ( e8m0_bexp_t  scale,
f8x16_ocp_t  x 
)
inline

◆ scaled_type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, bf8x2_ocp_t > ( e8m0_bexp_t  scale,
bf8x2_ocp_t  x 
)
inline

◆ scaled_type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, f8x2_ocp_t > ( e8m0_bexp_t  scale,
f8x2_ocp_t  x 
)
inline

◆ scaled_type_convert< float32_t, bf8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, bf8x32_ocp_t > ( e8m0_bexp_t  scale,
bf8x32_ocp_t  x 
)
inline

◆ scaled_type_convert< float32_t, f8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, f8x32_ocp_t > ( e8m0_bexp_t  scale,
f8x32_ocp_t  x 
)
inline

◆ sequence_all_of()

template<typename Seq , typename F >
__host__ constexpr __device__ bool ck::sequence_all_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_any_of()

template<typename Seq , typename F >
__host__ constexpr __device__ bool ck::sequence_any_of ( Seq  ,
F  f 
)
constexpr

◆ sequence_pop_back()

template<typename Seq >
__host__ constexpr __device__ auto ck::sequence_pop_back ( Seq  )
constexpr

◆ sequence_pop_front()

template<index_t I, index_t... Is>
__host__ constexpr __device__ auto ck::sequence_pop_front ( Sequence< I, Is... >  )
constexpr

◆ sequence_to_tuple_of_number()

template<index_t... Is>
__host__ constexpr __device__ auto ck::sequence_to_tuple_of_number ( Sequence< Is... >  )
constexpr

◆ set_container_subset() [1/2]

template<typename T , index_t N, index_t... Is>
__host__ constexpr __device__ void ck::set_container_subset ( Array< T, N > &  y,
Sequence< Is... >  picks,
const Array< T, sizeof...(Is)> &  x 
)
constexpr

◆ set_container_subset() [2/2]

template<typename... Ys, index_t... Is, typename... Xs>
__host__ constexpr __device__ void ck::set_container_subset ( Tuple< Ys... > &  y,
Sequence< Is... >  picks,
const Tuple< Xs... > &  x 
)
constexpr

◆ tie()

template<typename... Args>
constexpr Tuple<Args&...> ck::tie ( Args &...  args)
constexpr noexcept
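
Analogous to std::tie, but yielding a ck::Tuple of lvalue references. Sketch:

    using namespace ck;
    index_t m = 0, n = 0;
    auto refs = tie(m, n);        // Tuple<index_t&, index_t&>
    refs.At(Number<0>{}) = 128;   // writes through to m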

◆ to_multi_index()

template<typename T >
__host__ constexpr __device__ auto ck::to_multi_index ( const T &  x)
constexpr

◆ to_sequence()

template<index_t... Is>
__host__ constexpr __device__ auto ck::to_sequence ( Tuple< Number< Is >... >  )
constexpr

◆ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad()

template<typename... In, typename... Wei, typename... Out, typename ConvStrides , typename ConvDilations , typename InLeftPads , typename InRightPads , index_t GemmK1Value>
__host__ constexpr __device__ auto ck::transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad ( const TensorDescriptor< In... > &  in_grid_desc_n_di_hi_wi_c,
const TensorDescriptor< Wei... > &  wei_k_z_y_x_c_grid_desc,
const TensorDescriptor< Out... > &  out_n_do_ho_wo_k_grid_desc,
const ConvStrides &  conv_strides,
const ConvDilations &  conv_dilations,
const InLeftPads &  in_left_pads,
const InRightPads &  in_right_pads,
Number< GemmK1Value >   
)
constexpr

◆ transform_sequences() [1/3]

template<typename F , index_t... Xs>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >   
)
constexpr

◆ transform_sequences() [2/3]

template<typename F , index_t... Xs, index_t... Ys>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >  ,
Sequence< Ys... >   
)
constexpr

◆ transform_sequences() [3/3]

template<typename F , index_t... Xs, index_t... Ys, index_t... Zs>
__host__ constexpr __device__ auto ck::transform_sequences ( F  f,
Sequence< Xs... >  ,
Sequence< Ys... >  ,
Sequence< Zs... >   
)
constexpr

◆ transform_tensor_descriptor()

template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss >
__host__ constexpr __device__ auto ck::transform_tensor_descriptor ( const OldTensorDescriptor &  old_tensor_desc,
const NewTransforms &  new_transforms,
NewLowerDimensionOldVisibleIdss  ,
NewUpperDimensionNewVisibleIdss   
)
constexpr
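
This is the composition point for the make_*_transform factories documented above. A sketch that right-pads the second dimension of a packed 16x30 descriptor up to 32:

    using namespace ck;
    constexpr auto desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<16>{}, Number<30>{}));
    constexpr auto padded = transform_tensor_descriptor(
        desc,
        make_tuple(make_pass_through_transform(Number<16>{}),
                   make_right_pad_transform(Number<30>{}, Number<2>{})),
        make_tuple(Sequence<0>{}, Sequence<1>{}),   // lower (old) dimension ids
        make_tuple(Sequence<0>{}, Sequence<1>{}));  // upper (new) dimension ids
    // padded.GetLength(Number<1>{}) == 32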

◆ transform_tuples() [1/3]

template<typename F , typename X >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x 
)
constexpr

◆ transform_tuples() [2/3]

template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x,
const Y &  y 
)
constexpr

◆ transform_tuples() [3/3]

template<typename F , typename X , typename Y , typename Z >
__host__ constexpr __device__ auto ck::transform_tuples ( F  f,
const X &  x,
const Y &  y,
const Z &  z 
)
constexpr

◆ transpose_f8_4x4()

__device__ void ck::transpose_f8_4x4 ( const f8x4_t &  x0,
const f8x4_t &  x1,
const f8x4_t &  x2,
const f8x4_t &  x3,
f8x4_t &  y0,
f8x4_t &  y1,
f8x4_t &  y2,
f8x4_t &  y3 
)

◆ transpose_fp16_2x2()

__device__ void ck::transpose_fp16_2x2 ( const half2_t &  x0,
const half2_t &  x1,
half2_t &  y0,
half2_t &  y1 
)

◆ transpose_int8_4x4()

__device__ void ck::transpose_int8_4x4 ( const int8x4_t &  x0,
const int8x4_t &  x1,
const int8x4_t &  x2,
const int8x4_t &  x3,
int8x4_t &  y0,
int8x4_t &  y1,
int8x4_t &  y2,
int8x4_t &  y3 
)

◆ TupleDepth() [1/2]

template<index_t depth = 0, typename T >
__host__ constexpr __device__ auto ck::TupleDepth ( const T &  )
constexpr

◆ TupleDepth() [2/2]

template<index_t depth = 0, typename... Ts>
__host__ constexpr __device__ auto ck::TupleDepth ( const Tuple< Ts... > &  )
constexpr

◆ TupleReduce()

template<index_t Idx, index_t End, typename F , typename... Ts>
__host__ constexpr __device__ auto ck::TupleReduce ( F &&  f,
const Tuple< Ts... > &  tuple 
)
constexpr

◆ TupleReverse()

template<typename... Ts>
__host__ constexpr __device__ auto ck::TupleReverse ( const Tuple< Ts... > &  tuple)
constexpr

◆ TupleSlice()

template<index_t from, index_t to, typename... Ts>
__host__ constexpr __device__ auto ck::TupleSlice ( const Tuple< Ts... > &  tuple)
constexpr

◆ type_convert()

template<typename Y , typename X , ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ constexpr __device__ Y ck::type_convert ( X  x)
constexpr
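
The unconstrained overload dispatches on the (Y, X) pair; the specializations below handle the narrow float formats. Basic usage sketch:

    using namespace ck;
    half_t  h = type_convert<half_t>(1.5f);   // float -> half
    bhalf_t b = type_convert<bhalf_t>(1.5f);  // float -> bfloat16
    float   f = type_convert<float>(b);       // back to float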

◆ type_convert< bf6_t, float >()

template<>
__host__ __device__ bf6_t ck::type_convert< bf6_t, float > ( float  x)
inline

Specializes float-to-bf6_t conversion.

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x - Input float value to convert.
Returns
Converted bf6_t value.

◆ type_convert< bf6x16_pk_t, float16_t >()

template<>
__host__ __device__ bf6x16_pk_t ck::type_convert< bf6x16_pk_t, float16_t > ( float16_t  x)
inline

◆ type_convert< bf6x16_t, float16_t >()

template<>
__host__ __device__ bf6x16_t ck::type_convert< bf6x16_t, float16_t > ( float16_t  x)
inline

◆ type_convert< bf6x32_pk_t, float32_t >()

template<>
__host__ __device__ bf6x32_pk_t ck::type_convert< bf6x32_pk_t, float32_t > ( float32_t  x)
inline

◆ type_convert< bf6x32_t, float32_t >()

template<>
__host__ __device__ bf6x32_t ck::type_convert< bf6x32_t, float32_t > ( float32_t  x)
inline

Specializes the conversion of a vector of 32 floats into a vector of 32 bf6_t values.

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x - Input float vector to convert.
Returns
Converted bf6x32_t vector.

◆ type_convert< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, float > ( float  x)
inline

◆ type_convert< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, half_t > ( half_t  x)
inline

◆ type_convert< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x - The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, float > ( float  x)
inline

Converts a float value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x - The input float value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x - The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, int >()

template<>
__host__ constexpr __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, int > ( int  x)
inline constexpr

◆ type_convert< bhalf2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x - The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, f8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x - The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, pk_i4_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< bhalf_t, bf8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a bhalf_t value.

Parameters
x - The input bf8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, f8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts an f8_ocp_t value to a bhalf_t value.

Parameters
x - The input f8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, float >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, float > ( float  x)
inline constexpr

◆ type_convert< bhalf_t, half_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, half_t > ( half_t  x)
inline constexpr

◆ type_convert< bhalf_t, int8_t >()

template<>
__host__ constexpr __device__ bhalf_t ck::type_convert< bhalf_t, int8_t > ( int8_t  x)
inline constexpr

◆ type_convert< f4_t, float >()

template<>
__host__ __device__ f4_t ck::type_convert< f4_t, float > ( float  x)
inline

◆ type_convert< f4x2_pk_t, float2_t >()

template<>
__host__ __device__ f4x2_pk_t ck::type_convert< f4x2_pk_t, float2_t > ( float2_t  x)
inline

◆ type_convert< f4x2_t, float2_t >()

template<>
__host__ __device__ f4x2_t ck::type_convert< f4x2_t, float2_t > ( float2_t  x)
inline

◆ type_convert< f4x32_t, float32_t >()

template<>
__host__ __device__ f4x32_t ck::type_convert< f4x32_t, float32_t > ( float32_t  x)
inline

◆ type_convert< f6_t, float >()

template<>
__host__ __device__ f6_t ck::type_convert< f6_t, float > ( float  x)
inline

Specializes the type conversion template for converting a float into the 6-bit float type (f6_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x - Input float value to be converted.
Returns
The converted f6_t value.

◆ type_convert< f6x16_pk_t, float16_t >()

template<>
__host__ __device__ f6x16_pk_t ck::type_convert< f6x16_pk_t, float16_t > ( float16_t  x)
inline

◆ type_convert< f6x16_t, float16_t >()

template<>
__host__ __device__ f6x16_t ck::type_convert< f6x16_t, float16_t > ( float16_t  x)
inline

◆ type_convert< f6x32_pk_t, float32_t >()

template<>
__host__ __device__ f6x32_pk_t ck::type_convert< f6x32_pk_t, float32_t > ( float32_t  x)
inline

◆ type_convert< f6x32_t, float32_t >()

template<>
__host__ __device__ f6x32_t ck::type_convert< f6x32_t, float32_t > ( float32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 floats into a vector of 32 6-bit float values (f6x32_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x - Input float vector to be converted.
Returns
The converted f6x32_t vector.

◆ type_convert< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, float > ( float  x)
inline

◆ type_convert< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, half_t > ( half_t  x)
inline

◆ type_convert< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, bhalf_t > ( bhalf_t  x)
inline

Converts a bhalf_t value to an f8_ocp_t value with rounding determined by a flag.

Parameters
x - The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, float > ( float  x)
inline

Converts a float value to an f8_ocp_t value with rounding determined by a flag.

Parameters
x - The input float value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, half_t > ( half_t  x)
inline

Converts a half_t value to an f8_ocp_t value with rounding determined by a flag.

Parameters
x - The input half_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, int >()

template<>
__host__ constexpr __device__ f8_ocp_t ck::type_convert< f8_ocp_t, int > ( int  x)
inline constexpr

◆ type_convert< float, bf6_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf6_t > ( bf6_t  x)
inline

Specializes the type conversion template for converting a bf6_t value to float.

Interprets the bf6_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x - The bf6_t value to convert.
Returns
The float representation of the given bf6_t value.

◆ type_convert< float, bf8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_fnuz_t > ( bf8_fnuz_t  x)
inline

◆ type_convert< float, bf8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a float value.

Parameters
x - The input bf8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float, bhalf_t >()

template<>
__host__ constexpr __device__ float ck::type_convert< float, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert< float, f4_t >()

template<>
__host__ __device__ float ck::type_convert< float, f4_t > ( f4_t  x)
inline

◆ type_convert< float, f6_t >()

template<>
__host__ __device__ float ck::type_convert< float, f6_t > ( f6_t  x)
inline

Specializes the type conversion template for converting the 6-bit float type (f6_t) to float.

Interprets an f6_t value as a float using the default scale factor of 1.

Parameters
x - The 6-bit float (f6_t) value to be converted.
Returns
The corresponding float representation.

◆ type_convert< float, f8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_fnuz_t > ( f8_fnuz_t  x)
inline

◆ type_convert< float, f8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts an f8_ocp_t value to a float value.

Parameters
x - The input f8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float16_t, bf6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_pk_t > ( bf6x16_pk_t  x)
inline

◆ type_convert< float16_t, bf6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_t > ( bf6x16_t  x)
inline

◆ type_convert< float16_t, f6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_pk_t > ( f6x16_pk_t  x)
inline

◆ type_convert< float16_t, f6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_t > ( f6x16_t  x)
inline

◆ type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.

Parameters
x - The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, f4x2_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f4x2_t > ( f4x2_t  x)
inline

◆ type_convert< float2_t, f8x2_fnuz_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_fnuz_t > ( f8x2_fnuz_t  x)
inline

◆ type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.

Parameters
x - The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, pk_i4_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< float32_t, bf6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, bf6x32_t > ( bf6x32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats.

Interprets the bf6x32_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x - The bf6x32_t value to convert.
Returns
The float representation of the given vector.

◆ type_convert< float32_t, f4x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f4x32_t > ( f4x32_t  x)
inline

◆ type_convert< float32_t, f6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f6x32_t > ( f6x32_t  x)
inline

Specializes the type conversion template for converting a vector of 32 6-bit float values (f6x32_t) to a vector of 32 floats.

Interprets the f6_t values as floats using the default scale factor of 1.

Parameters
x - The vector of 32 6-bit float (f6x32_t) values to be converted.
Returns
The corresponding float representation.

◆ type_convert< half2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, bf8x2_ocp_t > ( bf8x2_ocp_t  x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.

Parameters
x - The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, f8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, f8x2_ocp_t > ( f8x2_ocp_t  x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.

Parameters
x - The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, float2_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, float2_t > ( float2_t  x)
inline

◆ type_convert< half2_t, pk_i4_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, pk_i4_t > ( pk_i4_t  x)
inline

◆ type_convert< half_t, bf8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_fnuz_t > ( bf8_fnuz_t  x)
inline

◆ type_convert< half_t, bf8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_ocp_t > ( bf8_ocp_t  x)
inline

Converts a bf8_ocp_t value to a half_t value.

Parameters
x - The input bf8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< half_t, bhalf_t >()

template<>
__host__ constexpr __device__ half_t ck::type_convert< half_t, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert< half_t, f8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_fnuz_t > ( f8_fnuz_t  x)
inline

◆ type_convert< half_t, f8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_ocp_t > ( f8_ocp_t  x)
inline

Converts an f8_ocp_t value to a half_t value.

Parameters
x - The input f8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< int8_t, bhalf_t >()

template<>
__host__ constexpr __device__ int8_t ck::type_convert< int8_t, bhalf_t > ( bhalf_t  x)
inline constexpr

◆ type_convert_sp()

template<typename Y , typename X >
__host__ constexpr __device__ Y ck::type_convert_sp ( X  x)
constexpr

◆ type_convert_sp< float, int >()

template<>
__host__ constexpr __device__ float ck::type_convert_sp< float, int > ( int  x)
inline constexpr

◆ type_convert_sp< half_t, int >()

template<>
__host__ constexpr __device__ half_t ck::type_convert_sp< half_t, int > ( int  x)
inline constexpr

◆ type_convert_sp< int, float >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, float > ( float  x)
inline constexpr

◆ type_convert_sp< int, half_t >()

template<>
__host__ constexpr __device__ int ck::type_convert_sp< int, half_t > ( half_t  x)
inline constexpr

◆ unpack()

template<typename F , typename X >
__host__ constexpr __device__ auto ck::unpack ( F &&  f,
X &&  x 
)
constexpr

◆ unpack2()

template<typename F , typename X , typename Y >
__host__ constexpr __device__ auto ck::unpack2 ( F &&  f,
X &&  x,
Y &&  y 
)
constexpr
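
unpack applies f to the elements of one Tuple; unpack2 applies f to the concatenated elements of two Tuples. Sketch:

    using namespace ck;
    auto sum3 = [](auto a, auto b, auto c) { return a + b + c; };
    auto r1 = unpack(sum3, make_tuple(1, 2, 3));               // sum3(1, 2, 3) == 6
    auto r2 = unpack2(sum3, make_tuple(1, 2), make_tuple(3));  // sum3(1, 2, 3) == 6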

◆ UnrollNestedTuple() [1/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename T >
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const T &  element)
constexpr

◆ UnrollNestedTuple() [2/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const Tuple< Ts... > &  tuple)
constexpr

◆ UnrollNestedTuple() [3/3]

template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ constexpr __device__ auto ck::UnrollNestedTuple ( const Tuple<> &  element)
constexpr
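
Flattens nested Tuples (up to MaxDepth levels, or fully by default). Sketch:

    using namespace ck;
    auto nested = make_tuple(1, make_tuple(2, 3), 4);
    auto flat   = UnrollNestedTuple(nested);  // Tuple holding (1, 2, 3, 4)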

◆ UpdateEnvVar() [1/2]

template<typename EnvVar >
void ck::UpdateEnvVar ( EnvVar  ,
const std::string_view &  val 
)

◆ UpdateEnvVar() [2/2]

template<typename EnvVar , typename ValueType >
void ck::UpdateEnvVar ( EnvVar  ,
const ValueType &  val 
)

Updates the cached value of an environment variable.
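
A sketch of the intended use; EnvVarTag stands in for a tag type generated by CK's environment-variable declaration macros (the name below is hypothetical):

    // EnvVarTag must come from CK's env-var declaration macros (hypothetical here).
    ck::UpdateEnvVar(EnvVarTag{}, std::string_view{"1"});  // string_view overload
    ck::UpdateEnvVar(EnvVarTag{}, true);                   // typed overload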

Variable Documentation

◆ ignore

constexpr detail::ignore_t ck::ignore
inline constexpr

◆ is_base_of_v

template<typename X , typename Y >
constexpr bool ck::is_base_of_v = is_base_of<X, Y>::value
inline constexpr

◆ is_packed_type_v

template<typename T >
constexpr bool ck::is_packed_type_v = packed_size_v<T> > 1
inline constexpr

◆ is_pointer_v

template<typename T >
constexpr bool ck::is_pointer_v = is_pointer<T>::value
inline constexpr

◆ is_same_v

template<typename X , typename Y >
constexpr bool ck::is_same_v = is_same<X, Y>::value
inline constexpr

◆ is_unsigned_v

template<typename T >
constexpr bool ck::is_unsigned_v = is_unsigned<T>::value
inline constexpr

◆ packed_size_v

template<typename T >
constexpr index_t ck::packed_size_v = packed_type_info<T>::packed_size
inline constexpr