|
constexpr unsigned int | fnv1a_hash (std::string_view str, unsigned int h=2166136261u) |
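
Below is a self-contained sketch of how a constexpr FNV-1a hash with this signature behaves; the body is a plausible re-implementation for illustration, not necessarily the library's exact definition. Equal strings fold to equal compile-time constants, which makes the hash useful for switch-style dispatch on device-name strings.

```cpp
#include <string_view>

// Illustrative constexpr FNV-1a: fold in one byte at a time.
// 2166136261u is the 32-bit FNV offset basis; 16777619u is the FNV prime.
constexpr unsigned int fnv1a_hash(std::string_view str,
                                  unsigned int h = 2166136261u)
{
    return str.empty()
               ? h
               : fnv1a_hash(str.substr(1),
                            (h ^ static_cast<unsigned char>(str[0])) * 16777619u);
}

static_assert(fnv1a_hash("gfx90a") == fnv1a_hash("gfx90a"));
static_assert(fnv1a_hash("gfx90a") != fnv1a_hash("gfx942"));
```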
|
std::string | get_device_name () |
|
bool | is_gfx12_supported () |
|
bool | is_gfx11_supported () |
|
bool | is_xdl_supported () |
|
bool | is_lds_direct_load_supported () |
|
bool | is_bf16_atomic_supported () |
|
bool | is_gfx101_supported () |
|
bool | is_gfx103_supported () |
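
The device queries above are typically used together to pick a kernel family at runtime. A minimal dispatch sketch, assuming these helpers live in namespace ck as elsewhere in the library; the kernel-family names in the comments are illustrative, not prescribed by the API:

```cpp
#include <iostream>
// The appropriate CK device-utility header is assumed to be included.

void launch_best_gemm()
{
    if(ck::is_xdl_supported())
    {
        // XDL/MFMA matrix cores available (CDNA-class devices).
        std::cout << "XDL pipeline on " << ck::get_device_name() << '\n';
    }
    else if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
    {
        // WMMA path for RDNA3/RDNA4-class devices.
        std::cout << "WMMA pipeline\n";
    }
    else
    {
        // Fallback path that relies on neither instruction set.
        std::cout << "generic pipeline\n";
    }
}
```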
|
template<typename T , typename ForwardIterator , typename Size , typename BinaryOperation > |
auto | accumulate_n (ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op)) |
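
A minimal usage sketch, assuming the function lives in namespace ck: accumulate_n reduces exactly count elements starting at first, i.e. it is equivalent to std::accumulate over [first, std::next(first, count)); only the element type T needs to be spelled out, the rest is deduced.

```cpp
#include <functional>
#include <vector>

int sum_first_three()
{
    std::vector<int> v{1, 2, 3, 4, 5};
    // 1 + 2 + 3 = 6; iterator, size, and operation types are deduced.
    return ck::accumulate_n<int>(v.begin(), 3, 0, std::plus<int>{});
}
```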
|
unsigned int | get_available_cpu_cores () |
|
template<typename... In, typename... Wei, typename... Out, typename ConvStrides , typename ConvDilations , typename InLeftPads , typename InRightPads , index_t GemmK1Value> |
__host__ constexpr __device__ auto | transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad (const TensorDescriptor< In... > &in_grid_desc_n_di_hi_wi_c, const TensorDescriptor< Wei... > &wei_k_z_y_x_c_grid_desc, const TensorDescriptor< Out... > &out_n_do_ho_wo_k_grid_desc, const ConvStrides &conv_strides, const ConvDilations &conv_dilations, const InLeftPads &in_left_pads, const InRightPads &in_right_pads, Number< GemmK1Value >) |
|
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false> |
__host__ constexpr __device__ auto | make_static_tensor (TensorDesc) |
|
template<AddressSpaceEnum AddressSpace, typename T , typename TensorDesc , typename X , typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false> |
__host__ constexpr __device__ auto | make_static_tensor (TensorDesc, X invalid_element_value) |
|
template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type> |
__host__ constexpr __device__ auto | make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{}) |
|
template<typename LowLength > |
__host__ constexpr __device__ auto | make_pass_through_transform (const LowLength &low_length) |
|
template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false> |
__host__ constexpr __device__ auto | make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, integral_constant< bool, SkipIsValidCheck > = integral_constant< bool, false >{})
|
template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false> |
__host__ constexpr __device__ auto | make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad, integral_constant< bool, SkipIsValidCheck > = integral_constant< bool, false >{})
|
template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false> |
__host__ constexpr __device__ auto | make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck > = integral_constant< bool, false >{})
|
template<typename UpLengths , typename Coefficients , typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false> |
__host__ constexpr __device__ auto | make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_merge_transform (const LowLengths &low_lengths) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_merge_transform_v1_carry_check (const LowLengths &low_lengths) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_merge_transform_v2_magic_division (const LowLengths &low_lengths) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_merge_transform_v3_division_mod (const LowLengths &low_lengths) |
|
template<typename UpLengths , bool Use24BitIntegerCalculation = false> |
__host__ constexpr __device__ auto | make_unmerge_transform (const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation > = integral_constant< bool, false >{})
|
template<typename LowerIndex > |
__host__ constexpr __device__ auto | make_freeze_transform (const LowerIndex &low_idx) |
|
template<typename UpperIndex > |
__host__ constexpr __device__ auto | make_insert_transform (const UpperIndex &up_idx) |
|
template<typename LowLength , typename SliceBegin , typename SliceEnd > |
__host__ constexpr __device__ auto | make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end) |
|
template<typename VectorSize , typename UpLength > |
__host__ constexpr __device__ auto | make_vectorize_transform (const VectorSize &vector_size, const UpLength &up_length) |
|
template<typename Modulus , typename UpLength > |
__host__ constexpr __device__ auto | make_modulo_transform (const Modulus &modulus, const UpLength &up_length) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_xor_with_modulo_transform (const LowLengths &low_lengths) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_xor_transform (const LowLengths &low_lengths) |
|
template<typename TensorAdaptor0 , typename TensorAdaptor1 > |
__host__ constexpr __device__ auto | chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1) |
|
template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss > |
__host__ constexpr __device__ auto | make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss) |
|
template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss > |
__host__ constexpr __device__ auto | transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss) |
|
template<typename TensorDesc , typename VisibleIndex > |
__host__ constexpr __device__ auto | make_tensor_coordinate (const TensorDesc &tensor_desc, const VisibleIndex &idx_visible) |
|
template<typename TensorDesc , typename VisibleIndex , typename UpdateLowerIndexHack > |
__host__ constexpr __device__ auto | make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack) |
|
template<typename TensorDesc , typename VisibleIndex > |
__host__ constexpr __device__ auto | make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible) |
|
template<typename TensorDesc , typename TensorCoord , typename TensorCoordStep > |
__host__ constexpr __device__ void | move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step) |
|
template<typename TensorDesc , typename TensorCoord > |
__host__ constexpr __device__ bool | coordinate_has_valid_offset_assuming_visible_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord) |
|
template<typename TensorDesc , typename TensorCoord > |
__host__ constexpr __device__ bool | coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord) |
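
A hedged sketch of the coordinate API in device code: build a compile-time descriptor (make_naive_tensor_descriptor_packed is declared further down this index), create a coordinate, advance it by a fixed step, and validate the resulting offset before use.

```cpp
using namespace ck;

__device__ void walk_one_element()
{
    // 4 x 8 row-major (packed) view, fully known at compile time.
    constexpr auto desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<8>{}));

    auto coord = make_tensor_coordinate(desc, make_multi_index(0, 0));

    // Step of +1 along the fastest-varying dimension.
    const auto step = make_tensor_coordinate_step(desc, make_multi_index(0, 1));
    move_tensor_coordinate(desc, coord, step);

    if(coordinate_has_valid_offset(desc, coord))
    {
        // coord now addresses element (0, 1); its linear offset can be used
        // for a buffer access.
    }
}
```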
|
template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> |
__host__ constexpr __device__ auto | make_naive_tensor_descriptor (const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides) |
|
template<typename... Lengths> |
__host__ constexpr __device__ auto | make_naive_tensor_descriptor_packed (const Tuple< Lengths... > &lengths) |
|
template<typename... Lengths, typename Align > |
__host__ constexpr __device__ auto | make_naive_tensor_descriptor_aligned (const Tuple< Lengths... > &lengths, Align align) |
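
The descriptor builders combine naturally with the transforms listed above. A hedged sketch that views a packed row-major M x K matrix and right-pads K up to a multiple of KPerBlock, yielding an (M, KPad) descriptor ready for tiled iteration; the function and parameter names are illustrative:

```cpp
using namespace ck;

template <index_t M, index_t K, index_t KPerBlock>
__host__ __device__ constexpr auto make_padded_a_desc()
{
    constexpr auto a_desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<M>{}, Number<K>{}));

    constexpr index_t KPad = ((K + KPerBlock - 1) / KPerBlock) * KPerBlock;

    // Dimension 0 (M) passes through; dimension 1 (K) is right-padded to KPad.
    return transform_tensor_descriptor(
        a_desc,
        make_tuple(make_pass_through_transform(Number<M>{}),
                   make_right_pad_transform(Number<K>{}, Number<KPad - K>{})),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));
}
```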
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeTypeA , typename ComputeTypeB , typename AccDataType , typename AWmmaTileDesc , typename BWmmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmPipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmABScalePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
constexpr auto | BlockGemmMXBPreshufflePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
constexpr auto | BlockGemmBPreshufflePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmPipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmBlockScaleBPreshufflePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType , typename BDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
constexpr auto | BlockGemmBlockMoeScaleBPreshufflePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmMXBPreshufflePipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
constexpr auto | BlockGemmMXNBSPipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
constexpr auto | BlockGemmMXPipeline_Selector () |
|
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType , typename AScaleDataType , typename BDataType , typename BScaleDataType , typename ComputeDataType , typename AccDataType , typename ATileDesc , typename BTileDesc , typename AMmaTileDesc , typename BMmaTileDesc , index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
constexpr auto | BlockGemmMXPipeline_Selector () |
|
template<index_t BlockSize, typename FloatA , typename FloatB , typename FloatAcc , typename AK0MK1BlockDesc , typename BK0NK1BlockDesc , index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB> |
constexpr auto | BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector () |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_batched_gemm_xdl_cshuffle_v3_multi_d (BatchedGemmArg karg) |
|
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds (BatchedGemmArg karg) |
|
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_batched_gemm_b_scale_xdl_cshuffle_v3 (BatchedGemmArg karg) |
|
template<typename GridwiseGemm , typename BatchedGemmArg , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds (BatchedGemmArg karg) |
|
template<typename GridwiseGemm , typename AsPointer , typename BsPointer , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AsGridDesc_AK0_M_AK1 , typename BsGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_contraction_multiple_abd_xdl_cshuffle (AsPointer p_as_grid, BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1, const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseElementwiseReduction , typename InDataTypePointerTuple , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename AccDataType , typename XElementwiseOperation , typename YElementwiseOperation , typename InGrid2dDescTuple , typename GridDesc_M_K > |
__global__ void | kernel_elementwise_layernorm (const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op) |
|
template<typename GridwiseGemm , typename ABDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename DsGridDesc_M0_M10_M11_N0_N10_N11 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> |
__global__ void | kernel_gemm_dl_multiple_d (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemmWelford , typename ABDataType , typename DsPointer , typename EMeanVarDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock , typename CountGridDescriptor_MBlock_MPerBlock_NBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EMeanVarDataType *__restrict__ p_e_grid, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock, const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock, const Block2ETileMap block_2_etile_map, index_t NRaw) |
|
template<typename GridwiseWelfordLayernorm , typename EMeanVarDataType , typename HDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename EHGridDesc_M_N , typename LayernormMeanVarGridDesc_M_NBlock , typename LayernormCountGridDesc_M_NBlock , typename GammaBetaGridDesc_N , typename HElementwiseOperation > |
__global__ void | kernel_welford_layernorm2d_second_half (const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename FloatRsPointer , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename QsElementwiseOperation , typename RsElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename RsGridDescriptor_MBlock_MPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_multiple_d_multiple_r_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, FloatRsPointer p_rs_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const QsElementwiseOperation qs_element_op, const RsElementwiseOperation rs_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_multiple_d_xdl_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseGemm , typename ABDataType , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename EElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdl_waveletmodel_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const EElementwiseOperation e_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
|
template<typename GridwiseGemm , typename ContractionMultiDKernelArg , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , bool HasMainKBlockLoop> |
__global__ void | kernel_grouped_contraction_multiple_d_xdl_cshuffle (const void CK_CONSTANT_ADDRESS_SPACE *contraction_args, const index_t group_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op) |
|
template<typename GridwiseWelford , typename XDataType , typename WorkspaceMeanVarDataType , typename ComputeDataType , typename XGridDesc_M_K , typename MeanVarGridDesc_M_KBlock > |
__global__ void | kernel_normalizationSplitK1st (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, WorkspaceMeanVarDataType *const __restrict__ p_welford_mean, WorkspaceMeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count) |
|
template<typename GridwiseWelfordNormalization , typename WorkspaceMeanVarDataType , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename MeanVarGridDesc_M_KBlock , typename CountGridDesc_M_KBlock , typename XYGammaBetaGridDesc_M_K , typename SaveMeanInvStdGridDesc_M > |
__global__ void | kernel_normalizationSplitK2nd (const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, const CountGridDesc_M_KBlock count_grid_desc_m_kblock, const XYGammaBetaGridDesc_M_K x_grid_desc_m_k, const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k, const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k, const XYGammaBetaGridDesc_M_K y_grid_desc_m_k, const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m, const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m, index_t num_k_mean_var_count_iteration, index_t num_k_block_tile_iteration, index_t k_grid_size, ComputeDataType epsilon, const WorkspaceMeanVarDataType *const p_mean_global, const WorkspaceMeanVarDataType *const p_variance_global, const int32_t *const p_welford_count_global, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatDsPointer , typename FloatE , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AKB_AK0_M_AK1 , typename BGridDesc_BKB_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ComputePtrOffsetOfBatch , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map) |
|
__device__ half4_t | i4_to_half4 (int q) |
|
__device__ half4_t | i4_to_half4_scale (int q, const ck::half2_t &scale) |
|
__device__ f8x4_t | i4_to_f8x4 (int q) |
|
__device__ f8x8_t | i4_to_fp8x8 (int q) |
|
__device__ bhalf4_t | i4_to_bhalf4 (int q) |
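
These helpers widen 4-bit integers packed into an int to small vectors of the target type, as used in int4 weight-dequantization GEMMs. A hedged sketch of calling them from device code; the nibble packing convention is the library's and is not assumed here:

```cpp
__device__ void widen_q4(int q, const ck::half2_t& scale)
{
    ck::half4_t  h  = ck::i4_to_half4(q);              // four halfs
    ck::half4_t  hs = ck::i4_to_half4_scale(q, scale); // four halfs, scaled
    ck::bhalf4_t b  = ck::i4_to_bhalf4(q);             // four bfloat16
    ck::f8x4_t   f  = ck::i4_to_f8x4(q);               // four fp8 values
    (void)h; (void)hs; (void)b; (void)f;               // silence unused warnings
}
```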
|
template<typename GridwiseMultiblockBatchNormForward_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor > |
__global__ void | kernel_multiblock_batchnorm_forward (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, MeanVarDataType *const __restrict__ p_welford_mean, MeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count, int32_t *const __restrict__ p_control, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
|
template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_ , typename XDataType , typename DyDataType , typename DxDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename DscaleDbiasGridDesc_M_K , typename MeanVarGridDesc_M , typename ScaleBiasGridDesc_M > |
__global__ void | kernel_reduce_second_half_batchnorm_backward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias) |
|
template<typename GridwiseMultiblockWelfordFirstHalf_ , typename XDataType , typename MeanVarDataType , typename XGridDesc_M_K , typename MeanVarCountGridDesc_M_G , typename GetReduceCountPerThreadFunctor > |
__global__ void | kernel_multiblock_welford_first_half (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x, MeanVarDataType *const p_welford_mean, MeanVarDataType *const p_welford_variance, int32_t *const p_welford_count) |
|
template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename MeanVarCountGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M > |
__global__ void | kernel_welford_second_half_batchnorm_forward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, AccDataType epsilon, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
|
template<typename GridwiseWelfordSecondHalfReduceFirstHalf_ , typename XDataType , typename DyDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename MeanVarGridDesc_M , typename MeanVarCountGridDesc_M_K , typename DscaleDbiasGridDesc_M_G > |
__global__ void | kernel_welford_second_half_reduce_first_half (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, index_t num_mean_var_count_k_block_tile_iteration, AccDataType epsilon, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const DyElementwiseOp dy_elementwise_op, MeanVarDataType *const __restrict__ p_out_welford_mean, MeanVarDataType *const __restrict__ p_out_welford_inv_variance, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, DscaleDbiasDataType *const __restrict__ p_reduce_dscale, DscaleDbiasDataType *const __restrict__ p_reduce_dbias) |
|
template<typename CTileIdx , typename CTileDim > |
__host__ __device__ bool | DefaultValidCTileIndex (const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) |
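
Inside the grid-wise kernels above, this predicate is the usual early-out for thread blocks whose computed C-tile index falls outside the C tile grid (for example when the launch grid is padded). A fragment-level sketch; block_2_ctile_map and c_tile_dim stand in for the corresponding kernel arguments:

```cpp
// Map the flat block id to a C-tile index, then bail out if it is invalid.
const auto c_tile_idx =
    block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

if(!DefaultValidCTileIndex(c_tile_idx, c_tile_dim))
{
    return;
}
```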
|
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple > |
__global__ void | kernel_multiple_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, index_t block_group_size, index_t num_k_block_tile_iteration, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple) |
|
template<typename GridwiseMultipleReduction , index_t NumReduction, typename InDataType , typename OutDataTypePointerTuple , typename AccDataType , typename InGridDesc_M_K , typename OutGridDesc_M_Tuple , typename InElementwiseOperationTuple , typename AccElementwiseOperationTuple > |
__global__ void | kernel_multiple_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple) |
|
template<typename GridwiseReduction , bool OutputIndex, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation > |
__global__ void | kernel_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global) |
|
template<typename GridwiseReduction , bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType , typename OutDataType , typename AccDataType , typename IndexDataType , typename InGridDesc_M_K , typename OutGridDesc_M , typename InElementwiseOperation , typename AccElementwiseOperation > |
__global__ void | kernel_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global) |
|
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename InGridDesc_M_K , typename DsGridDesc_M , typename OutGridDesc_M , typename InElementwiseOperation , typename OutElementwiseOperation , typename DsGridPointer > |
__global__ void | kernel_reduce_threadwise_multi_d (const InGridDesc_M_K in_grid_desc_m_k, const DsGridDesc_M ds_grid_desc_m, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const OutElementwiseOperation out_elementwise_op, const InDataType *const __restrict__ p_in_value_global, const DsGridPointer p_ds_value_global, OutDataType *const __restrict__ p_out_value_global) |
|
template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_ , typename XDataType , typename DyDataType , typename DxDataType , typename AccDataType , typename ScaleDataType , typename DscaleDbiasDataType , typename MeanVarDataType , typename DyElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor > |
__global__ void | kernel_batchnorm_backward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, long_index_t reduce_size, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias) |
|
template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_ , typename XDataType , typename YDataType , typename AccDataType , typename ScaleDataType , typename BiasDataType , typename MeanVarDataType , typename YElementwiseOp , typename XYGridDesc_M_K , typename ScaleBiasGridDesc_M , typename MeanVarGridDesc_M , typename GetReduceCountPerThreadFunctor > |
__global__ void | kernel_batchnorm_forward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
|
template<typename GridwiseElementwise1dFunctor , typename InGrid1dDescTuple , typename OutGrid1dDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename ElementwiseOperation , typename UnaryOperation , typename Scale > |
__global__ void | kernel_elementwise_1d (const InGrid1dDescTuple in_grid_1d_desc_tuple, const OutGrid1dDescTuple out_grid_1d_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const ElementwiseOperation elementwise_op, const UnaryOperation unary_op, const Scale scale_op) |
|
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation > |
__global__ void | kernel_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op) |
|
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation > |
__global__ void | kernel_elementwise_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size) |
|
template<typename GridwiseElementwiseFunctorA , typename GridwiseElementwiseFunctorB , typename InAGridDescTuple , typename InBGridDescTuple , typename OutAGridDescTuple , typename OutBGridDescTuple , typename InADataTypePointerTuple , typename InBDataTypePointerTuple , typename OutADataTypePointerTuple , typename OutBDataTypePointerTuple , typename Block2TileMapA , typename Block2TileMapB , typename ElementwiseOperation , index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB> |
__global__ void | kernel_elementwise_batched_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b) |
|
template<typename GridwiseElementwiseFunctor , typename InGridDescTuple , typename OutGridDescTuple , typename InDataTypePointerTuple , typename OutDataTypePointerTuple , typename Block2TileMap , typename ElementwiseOperation , index_t NumInputs, index_t NumOutputs> |
__global__ void | kernel_batched_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides) |
|
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename ScaleDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename ScaleGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_fpAintB_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, const ScaleDataType *__restrict__ p_scale_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const ScaleGridDesc scale_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename FloatC1 , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename C1ElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_bias_grid, const FloatC1 *__restrict__ p_d0_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const C1ElementwiseOperation c1_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock, const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M0_M1_K1 , typename BGridDesc_K0_N0_N1_K1 , typename CGridDesc_M0_M10_M11_N0_N10_N11 , typename Block2CTileMap , bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> |
__global__ void | kernel_gemm_dl_v1r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_dpp (const typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2CTileMap , typename ComputePtrOffsetOfBatch , bool HasMainKBlockLoop> |
__global__ void | kernel_grouped_conv_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) |
|
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename ComputePtrOffsetOfBatch , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_contraction_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map) |
|
template<typename GridwiseOp , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AGridDesc , typename BGridDesc , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_mupltipe_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename DsPointer , typename EDataType , typename AElementwiseOperation , typename BElementwiseOperation , typename CDEElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename Block2ETileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
|
template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true> |
constexpr auto | GridwiseGemmPipeline_Selector () |
|
template<index_t NumPrefetch, LoopScheduler LoopSched> |
constexpr auto | GridwiseGemmPipeline_v1_Selector () |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ReducePtrsGlobal , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename ReduceInElementwiseOperations , typename ReduceAccElementwiseOperations , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename ReduceGridDescriptor_MBlock_MPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename ADataType , typename BDataType , typename CDataType , typename AGridDesc , typename BGridDesc , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_gemm_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_gemm_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdl_cshuffle_v1 (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdl_cshuffle_v1 (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, typename GridwiseGemm::Problem problem) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, index_t TailNum = 3> |
__global__ void | kernel_gemm_xdl_cshuffle_v2 (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdl_cshuffle_v2 (const FloatA *p_a_grid, const FloatB *p_b_grid, FloatC *p_c_grid, typename GridwiseGemm::Problem problem) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_b_preshuffle (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
|
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ enable_if_t<!Use2LDS, void > | kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg) |
|
template<bool Use2LDS, typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
__global__ enable_if_t< Use2LDS, void > | kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename FloatC0 , typename AElementwiseOperation , typename BElementwiseOperation , typename AccElementwiseOperation , typename CElementwiseOperation , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock , typename C0GridDescriptor_NBlock_NPerBlock , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_layernorm_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const AccElementwiseOperation acc_element_op, const CElementwiseOperation c_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, const Block2CTileMap block_2_ctile_map) |
|
template<typename LowLengths > |
__host__ constexpr __device__ auto | make_merge_transform_v4_no_carry (const LowLengths &low_lengths) |
|
template<typename GridwiseGemm , typename FloatA , typename FloatB , typename FloatC , typename AGridDesc_B_K0_M_K1 , typename BGridDesc_B_K0_N_K1 , typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_bwd_weight (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 , typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop> |
__global__ void | kernel_gemm_xdlops_skip_b_lds_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation > |
__global__ void | kernel_gemm_xdlops_splitk_lds_direct_load (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op) |
|
template<typename GridwiseGemm > |
__global__ void | kernel_gemm_xdlops_streamk (const typename GridwiseGemm::FloatAB *p_a_grid, const typename GridwiseGemm::FloatAB *p_b_grid, typename GridwiseGemm::FloatC *p_c_grid, void *p_workspace, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, typename GridwiseGemm::Block2CTileMap block_mapping) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDesc_M_N , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_v2r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_v2r3 (const typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename ABK0MK1GridDesc , typename BBK0NK1GridDesc , typename CM0N0M1N1M2M3M4N2GridDesc , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename CBlockClusterAdaptor , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_v2r4 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation > |
__global__ void | kernel_gemm_xdlops_v2r4r2_simplified (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_AK0_M_AK1 , typename BGridDesc_BK0_N_BK1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainK0BlockLoop> |
__global__ void | kernel_gemm_xdlops_v3r1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_v3r2 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , typename FloatAB , typename FloatC , typename AGridDesc_K0_M_K1 , typename BGridDesc_K0_N_K1 , typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl , typename AElementwiseOperation , typename BElementwiseOperation , typename CElementwiseOperation , typename Block2CTileMap , bool HasMainKBlockLoop> |
__global__ void | kernel_gemm_xdlops_v3r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const FloatC *__restrict__ p_c1_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_moe_gemm (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_moe_gemm_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_moe_mxgemm_2lds (typename GridwiseGemm::Argument karg) |
|
template<typename GridwiseGemm , bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
__global__ void | kernel_moe_mxgemm (typename GridwiseGemm::Argument karg) |
|
template<typename GridwisePermute , typename InGridDesc , typename OutGridDesc , typename InDataType , typename OutDataType , typename ElementwiseOperation , typename Block2TileMap > |
__global__ void | kernel_nd_permute (const InGridDesc in_grid_desc, const OutGridDesc out_grid_desc, const InDataType *p_in_global, OutDataType *p_out_global, const ElementwiseOperation elementwise_op, const Block2TileMap block_2_tile_map) |
|
template<typename GridwisePutElementwise1dFunctor , typename InGrid1dDesc , typename InDataType , typename IndexDataType , typename OutDataType , typename ElementwiseOperation > |
__global__ void | kernel_put_element_1d (const InGrid1dDesc in_grid_1d_desc, const InDataType *__restrict__ p_in_global, const IndexDataType *__restrict__ p_indices_global, OutDataType *__restrict__ p_out_global, const ElementwiseOperation elementwise_op) |
|
template<index_t BlockSize, typename DataType , typename Grid1dBufferDescType > |
__global__ void | kernel_buffer_set_value (const Grid1dBufferDescType grid_1d_buffer_desc, DataType *const __restrict__ p_global, DataType value) |
|
template<typename Grid1dBufferDescTuple , index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple , typename DataTypeTuple > |
__global__ void | kernel_multiple_buffer_set_value (const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, DataTypePointerTuple p_global_tuple, DataTypeTuple value_tuple) |
|
template<typename GridwiseReduction , typename InDataType , typename OutDataType , typename AccDataType , typename GridDesc_M_K > |
__global__ void | kernel_softmax (const GridDesc_M_K in_grid_desc_m_k, const GridDesc_M_K out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global) |
|
template<typename GridwiseSparseEmbedding , typename EmbType , typename IndexType , typename GammaDataType , typename BetaDataType , typename AccDataType , typename OutType , typename OutGridDesc , typename EmbElementwiseOperation , ck::index_t NumEmbeddings> |
__global__ void | kernel_sparse_embeddings_forward_layernorm (OutType *p_out, const ck::Array< EmbType *, NumEmbeddings > p_embs, const ck::Array< IndexType *, NumEmbeddings > p_indexes, const GammaDataType *p_gamma, const BetaDataType *p_beta, const OutGridDesc out_grid_desc, const AccDataType epsilon, const EmbElementwiseOperation emb_elementwise_op) |
|
template<typename InputGridDesc , typename InputDataType , typename OutputGridDesc , typename OutputDataType , typename Block2ETileMap , typename ComputePtrOffsetOfStridedBatch , typename GridwiseTensorRearrangeKernel > |
__global__ void | kernel_tensor_rearrange (const InputGridDesc in_grid_desc, const InputDataType *__restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType *__restrict__ p_out_global, const index_t batch_count, const Block2ETileMap block_2_tile_map, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch) |
|
template<typename GridwiseReduction , typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M > |
__global__ void | kernel_normalization (const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, const GridDesc_M save_mean_grid_desc_m, const GridDesc_M save_inv_std_grid_desc_m, index_t num_k_block_tile_iteration, ComputeDataType epsilon, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op) |
|
template<typename XDataType , typename GammaDataType , typename BetaDataType , typename YDataType , typename SaveMeanInvStdDataType , typename ComputeDataType , typename YElementwiseOperation , typename GridDesc_M_K , typename GridDesc_M , index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford> |
auto | NormalizationKernelSelector (bool isSweepOnce) |
|
template<typename T > |
__device__ T * | cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p) |
|
template<typename T > |
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * | cast_pointer_to_constant_address_space (T *p) |
|
template<typename T > |
__device__ int32x4_t | make_wave_buffer_resource (T *p_wave, index_t element_space_size) |
|
template<typename T > |
__device__ int32x4_t | make_wave_buffer_resource_with_default_range (T *p_wave) |
|
__device__ int8_t | llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8") |
|
__device__ int8x2_t | llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8") |
|
__device__ int8x4_t | llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8") |
|
__device__ bhalf_t | llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16") |
|
__device__ bhalf2_t | llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16") |
|
__device__ bhalf4_t | llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16") |
|
__device__ int32_t | llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32") |
|
__device__ int32x2_t | llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32") |
|
__device__ int32x4_t | llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32") |
|
__device__ half_t | llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16") |
|
__device__ half2_t | llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16") |
|
__device__ half4_t | llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16") |
|
__device__ float | llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32") |
|
__device__ float2_t | llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32") |
|
__device__ float4_t | llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32") |
|
__device__ void | llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32") |
|
__device__ half2_t | llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16") |
|
__device__ int32_t | llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32") |
|
__device__ float | llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32") |
|
__device__ double | llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64") |
|
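The `llvm_amdgcn_raw_buffer_load_*`/`store_*`/`atomic_*` declarations above bind the compiler's AMDGPU raw-buffer intrinsics to typed C++ signatures; the `amd_buffer_*` helpers below dispatch to them by element type and vector width. A minimal sketch of a per-lane load through the raw intrinsic (treating `voffset`/`soffset` as byte offsets and `glc_slc = 0` as the default cache behavior is an assumption of this sketch):

```cpp
// Sketch only: load one float per lane through the raw buffer-load intrinsic.
// Assumes the ck utility headers are included in a HIP translation unit.
__device__ float load_lane_value(const float* p_wave,
                                 ck::index_t element_space_size,
                                 ck::index_t lane_byte_offset)
{
    // Pack {base, range, flags} into the 128-bit buffer resource descriptor.
    ck::int32x4_t rsrc = ck::make_wave_buffer_resource(p_wave, element_space_size);
    return ck::llvm_amdgcn_raw_buffer_load_fp32(rsrc, lane_byte_offset,
                                                /*soffset=*/0, /*glc_slc=*/0);
}
```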
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type< int8_t, N >::type | amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type< T, N >::type | amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
|
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ void | amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ void | amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<typename T , index_t N> |
__device__ void | amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr) |
|
template<typename T , index_t N> |
__device__ void | amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<typename T , index_t N> |
__device__ void | amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type_maker< T, N >::type::type | amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type_maker< T, N >::type::type | amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ void | amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
|
template<typename T , index_t N> |
__device__ void | amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
|
template<typename T , index_t N> |
__device__ void | amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
|
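The `amd_buffer_load_*`/`amd_buffer_store`/`amd_buffer_atomic_*` wrappers add bounds handling on top of the raw intrinsics: loads of invalid elements return zero or a caller-supplied value, and stores/atomics are predicated on `dst_thread_element_valid`. A hedged sketch of a guarded vector store (element rather than byte offsets are assumed here):

```cpp
// Sketch: store a 4-wide float vector per lane, masked by a validity flag.
using float_vec4 = ck::vector_type_maker<float, 4>::type::type;

__device__ void store_guarded(const float_vec4& v,
                              float* p_dst_wave,
                              ck::index_t element_offset,
                              bool valid,
                              ck::index_t element_space_size)
{
    ck::amd_buffer_store<float, 4>(v, p_dst_wave, element_offset, valid,
                                   element_space_size);
}
```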
__device__ void | llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds") |
|
template<typename T , index_t NumElemsPerThread> |
__device__ void | amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size) |
|
template<typename T > |
__device__ __amdgpu_buffer_rsrc_t | make_wave_buffer_resource_new (T *p_wave, index_t element_space_size) |
|
template<typename T > |
__device__ __amdgpu_buffer_rsrc_t | make_wave_buffer_resource_with_default_range_new (T *p_wave) |
|
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type< int8_t, N >::type | amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ vector_type< T, N >::type | amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
|
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ void | amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
__device__ void | amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
|
template<> |
__host__ constexpr __device__ bool | fp8_is_nan (f8_ocp_t a) |
|
template<> |
__host__ constexpr __device__ bool | fp8_is_nan (bf8_ocp_t a) |
|
template<> |
__host__ constexpr __device__ bool | fp8_is_nan (f8_fnuz_t a) |
|
template<> |
__host__ constexpr __device__ bool | fp8_is_nan (bf8_fnuz_t a) |
|
template<> |
__host__ constexpr __device__ bool | fp8_is_inf (bf8_ocp_t a) |
|
__device__ int | amd_assembly_and_b32 (int a, int b) |
|
__device__ int | amd_assembly_and_or_b32 (int a, int b, int d) |
|
__device__ half2_t | amd_assembly_pk_fma_f16 (half2_t a, half2_t b, half2_t c) |
|
__device__ half2_t | amd_assembly_pk_add_f16 (half2_t a, half2_t b) |
|
__device__ float | amd_assemble_cvt_f32_i4 (int b) |
|
__device__ f8x4_t | amd_assembly_cvt_f8_to_f32 (float b0, float b1, float b2, float b3) |
|
__device__ f8x8_t | amd_assembly_i4_to_fp8x8 (int a) |
|
__device__ void | amd_assembly_outer_product_1x2 (float a, float b0, float b1, float &c0, float &c1) |
|
__device__ void | amd_assembly_outer_product_1x4 (float a, float b0, float b1, float b2, float b3, float &c0, float &c1, float &c2, float &c3) |
|
__device__ void | amd_assembly_outer_product_1x2 (half2_t a, half2_t b0, half2_t b1, float &c0, float &c1) |
|
__device__ void | amd_assembly_outer_product_1x4 (half2_t a, half2_t b0, half2_t b1, half2_t b2, half2_t b3, float &c0, float &c1, float &c2, float &c3) |
|
__device__ void | amd_assembly_outer_product_1x2 (int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t &c0, int32_t &c1) |
|
__device__ void | amd_assembly_outer_product_1x4 (int8x4_t a, int8x4_t b0, int8x4_t b1, int8x4_t b2, int8x4_t b3, int32_t &c0, int32_t &c1, int32_t &c2, int32_t &c3) |
|
__device__ uint32_t | amd_wave_read_first_lane (uint32_t value) |
|
__device__ int32_t | amd_wave_read_first_lane (int32_t value) |
|
__device__ int64_t | amd_wave_read_first_lane (int64_t value) |
|
template<typename Object , typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>> |
__device__ auto | amd_wave_read_first_lane (const Object &obj) |
|
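`amd_wave_read_first_lane` broadcasts lane 0's value to every lane of the wavefront; the object overload does this member by member for trivially copyable class types. A small illustrative sketch:

```cpp
// Sketch: make a per-wave uniform copy of a small POD, which lets the
// compiler keep it in scalar (SGPR) registers.
struct TilePos
{
    ck::index_t m, n;
};

__device__ TilePos make_wave_uniform(TilePos p)
{
    return ck::amd_wave_read_first_lane(p); // all lanes get lane 0's {m, n}
}
```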
template<typename X , typename... Xs> |
__host__ constexpr __device__ auto | make_array (X &&x, Xs &&... xs) |
|
template<typename X > |
__host__ constexpr __device__ auto | make_array () |
|
template<typename... Xs> |
__host__ constexpr __device__ auto | make_multi_index (Xs &&... xs) |
|
template<index_t NSize> |
__host__ constexpr __device__ auto | make_zero_multi_index () |
|
template<typename T > |
__host__ constexpr __device__ auto | to_multi_index (const T &x) |
|
template<index_t NSize, typename X > |
__host__ constexpr __device__ auto | operator+= (MultiIndex< NSize > &y, const X &x) |
|
template<index_t NSize, typename X > |
__host__ constexpr __device__ auto | operator-= (MultiIndex< NSize > &y, const X &x) |
|
template<index_t NSize, typename T > |
__host__ constexpr __device__ auto | operator+ (const MultiIndex< NSize > &a, const T &b) |
|
template<index_t NSize, typename T > |
__host__ constexpr __device__ auto | operator- (const MultiIndex< NSize > &a, const T &b) |
|
template<index_t NSize, typename T > |
__host__ constexpr __device__ auto | operator* (const MultiIndex< NSize > &a, const T &b) |
|
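`make_multi_index` builds a `MultiIndex<N>` from its arguments, and the operators above give it arithmetic; elementwise semantics are the natural reading of the declarations and are taken as an assumption in this sketch:

```cpp
// Sketch: elementwise coordinate arithmetic.
__host__ __device__ void multi_index_demo()
{
    auto idx = ck::make_multi_index(2, 3, 4);       // MultiIndex<3>
    idx += ck::make_multi_index(1, 0, 0);           // {3, 3, 4}
    auto nxt = idx + ck::make_multi_index(0, 1, 0); // {3, 4, 4}
    (void)nxt;
}
```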
template<typename PY , typename PX , typename enable_if< is_pointer_v< PY > &&is_pointer_v< PX >, bool >::type = false> |
__host__ __device__ PY | c_style_pointer_cast (PX p_x) |
|
template<typename Arr , typename Picks , typename X > |
__host__ constexpr __device__ auto | operator+= (ContainerElementPicker< Arr, Picks > &y, const X &x) |
|
template<typename Arr , typename Picks , typename X > |
__host__ constexpr __device__ auto | operator-= (ContainerElementPicker< Arr, Picks > &y, const X &x) |
|
template<typename Arr , typename Picks > |
__host__ constexpr __device__ auto | pick_container_element (Arr &a, Picks) |
|
template<typename Arr , typename Picks > |
__host__ constexpr __device__ auto | pick_container_element (const Arr &a, Picks) |
|
template<typename TData , index_t NSize> |
__host__ constexpr __device__ auto | container_push_back (const Array< TData, NSize > &a, const TData &x) |
|
template<typename... Ts, typename T > |
__host__ constexpr __device__ auto | container_push_front (const Tuple< Ts... > &a, const T &x) |
|
template<typename... Ts, typename T > |
__host__ constexpr __device__ auto | container_push_back (const Tuple< Ts... > &a, const T &x) |
|
template<typename TData , index_t NSize, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_new2old (const Array< TData, NSize > &old_array, Sequence< IRs... >) |
|
template<typename TData , index_t NSize, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_old2new (const Array< TData, NSize > &old_array, Sequence< IRs... > old2new) |
|
template<typename... Ts, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_new2old (const Tuple< Ts... > &old_tuple, Sequence< IRs... >) |
|
template<typename... Ts, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_old2new (const Tuple< Ts... > &old_tuple, Sequence< IRs... > old2new) |
|
template<index_t... Is, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_new2old (Sequence< Is... >, Sequence< IRs... >) |
|
template<index_t... Is, index_t... IRs> |
__host__ constexpr __device__ auto | container_reorder_given_old2new (Sequence< Is... > old_seq, Sequence< IRs... >) |
|
template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1> |
__host__ constexpr __device__ auto | container_reduce (const Container &x, Reduce reduce, Init init, Number< IBegin >=Number< 0 >{}, Number< IEnd >=Number< Container::Size()>{}, Number< IStep >=Number< 1 >{}) |
|
template<typename TData , index_t NSize, typename Reduce > |
__host__ constexpr __device__ auto | container_reverse_inclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init) |
|
template<typename TData , index_t NSize, typename Reduce > |
__host__ constexpr __device__ auto | container_reverse_exclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init) |
|
template<index_t... Is, typename Reduce , index_t Init> |
__host__ constexpr __device__ auto | container_reverse_exclusive_scan (const Sequence< Is... > &seq, Reduce f, Number< Init >) |
|
template<typename... Xs, typename Reduce , typename Init > |
__host__ constexpr __device__ auto | container_reverse_exclusive_scan (const Tuple< Xs... > &x, Reduce reduce, Init init) |
|
template<typename... Xs, typename Reduce , typename TData > |
__host__ constexpr __device__ auto | container_reverse_inclusive_scan (const Tuple< Xs... > &x, Reduce f, TData init) |
|
template<typename X , typename... Ys> |
__host__ constexpr __device__ auto | container_concat (const X &x, const Ys &... ys) |
|
template<typename T , index_t NX, index_t NY> |
__host__ constexpr __device__ auto | container_concat (const Array< T, NX > &ax, const Array< T, NY > &ay) |
|
template<typename... X, typename... Y> |
__host__ constexpr __device__ auto | container_concat (const Tuple< X... > &tx, const Tuple< Y... > &ty) |
|
template<typename Container > |
__host__ constexpr __device__ auto | container_concat (const Container &x) |
|
template<typename T , index_t N, index_t... Is> |
__host__ constexpr __device__ auto | get_container_subset (const Array< T, N > &arr, Sequence< Is... >) |
|
template<typename... Ts, index_t... Is> |
__host__ constexpr __device__ auto | get_container_subset (const Tuple< Ts... > &tup, Sequence< Is... >) |
|
template<typename T , index_t N, index_t... Is> |
__host__ constexpr __device__ void | set_container_subset (Array< T, N > &y, Sequence< Is... > picks, const Array< T, sizeof...(Is)> &x) |
|
template<typename... Ys, index_t... Is, typename... Xs> |
__host__ constexpr __device__ void | set_container_subset (Tuple< Ys... > &y, Sequence< Is... > picks, const Tuple< Xs... > &x) |
|
template<index_t... Is> |
__host__ constexpr __device__ auto | sequence_to_tuple_of_number (Sequence< Is... >) |
|
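The container helpers above operate uniformly on `Array`, `Tuple`, and `Sequence`: reorderings, scans, concatenation, and subset get/set. A hedged sketch with `Array` (the `new2old` convention shown, result element `i` = old element `new2old[i]`, is an assumption for this example):

```cpp
// Sketch: reorder then concatenate compile-time-sized arrays.
__host__ __device__ void container_demo()
{
    auto a = ck::make_array(10, 20, 30); // Array<int, 3>
    auto r = ck::container_reorder_given_new2old(a, ck::Sequence<2, 0, 1>{});
    // r == {30, 10, 20} under the assumed convention.
    auto c = ck::container_concat(a, r); // Array<int, 6>
    (void)c;
}
```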
constexpr auto | next_pow2 (uint32_t x) |
|
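`next_pow2` rounds its argument up to a power of two and, being `constexpr`, can be checked at compile time (the convention that powers of two map to themselves is assumed in this sketch):

```cpp
static_assert(ck::next_pow2(5) == 8, "");
static_assert(ck::next_pow2(16) == 16, "");
```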
template<typename T > |
constexpr bool | is_native_type () |
|
template<typename T , index_t N> |
__host__ constexpr __device__ auto | make_vector_type (Number< N >) |
|
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize > |
__host__ constexpr __device__ auto | make_dynamic_buffer (T *p, ElementSpaceSize element_space_size) |
|
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize > |
__host__ constexpr __device__ auto | make_long_dynamic_buffer (T *p, ElementSpaceSize element_space_size) |
|
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T , typename ElementSpaceSize , typename X , typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false> |
__host__ constexpr __device__ auto | make_dynamic_buffer (T *p, ElementSpaceSize element_space_size, X invalid_element_value) |
|
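`make_dynamic_buffer` wraps a raw pointer and an element count into a buffer view tagged with its address space and coherence mode, which the buffer load/store paths use to select addressing instructions. A minimal sketch:

```cpp
// Sketch: view a global-memory pointer as a dynamic buffer.
__device__ void dynamic_buffer_demo(float* p_global, ck::index_t n)
{
    auto buf =
        ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p_global, n);
    (void)buf;
}
```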
template<class EnvVar > |
const std::string & | EnvGetString (EnvVar) |
|
template<class EnvVar > |
bool | EnvIsEnabled (EnvVar) |
|
template<class EnvVar > |
bool | EnvIsDisabled (EnvVar) |
|
template<class EnvVar > |
uint64_t | EnvValue (EnvVar) |
|
template<class EnvVar > |
bool | EnvIsUnset (EnvVar) |
|
template<class EnvVar > |
void | EnvUnset (EnvVar) |
|
template<typename EnvVar , typename ValueType > |
void | UpdateEnvVar (EnvVar, const ValueType &val) |
| Updates the cached value of an environment variable.
|
template<typename EnvVar > |
void | UpdateEnvVar (EnvVar, const std::string_view &val) |
|
__host__ int | clz (uint32_t x) |
|
template<bool predicate, typename X , typename Y > |
constexpr auto | conditional_expr (X &&x, Y &&y) |
|
template<typename F , typename X > |
__host__ constexpr __device__ auto | unpack (F &&f, X &&x) |
|
template<typename F , typename X , typename Y > |
__host__ constexpr __device__ auto | unpack2 (F &&f, X &&x, Y &&y) |
|
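`unpack` applies a callable to the elements of a tuple-like container, and `unpack2` to the concatenated elements of two containers. A small host-side sketch:

```cpp
// Sketch: sum a Tuple's elements by unpacking them into a lambda.
int unpack_demo()
{
    auto t = ck::make_tuple(1, 2, 3);
    return ck::unpack([](int a, int b, int c) { return a + b + c; }, t); // 6
}
```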
template<typename X > |
__device__ X | atomic_add (X *p_dst, const X &x) |
|
template<> |
__device__ int32_t | atomic_add< int32_t > (int32_t *p_dst, const int32_t &x) |
|
template<> |
__device__ uint32_t | atomic_add< uint32_t > (uint32_t *p_dst, const uint32_t &x) |
|
template<> |
__device__ float | atomic_add< float > (float *p_dst, const float &x) |
|
template<> |
__device__ unsigned short | atomic_add< unsigned short > (unsigned short *p_dst, const unsigned short &x) |
|
template<> |
__device__ _Float16 | atomic_add< _Float16 > (_Float16 *p_dst, const _Float16 &x) |
|
template<> |
__device__ double | atomic_add< double > (double *p_dst, const double &x) |
|
template<> |
__device__ float2_t | atomic_add< float2_t > (float2_t *p_dst, const float2_t &x) |
|
template<> |
__device__ double2_t | atomic_add< double2_t > (double2_t *p_dst, const double2_t &x) |
|
template<typename X > |
__device__ X | atomic_max (X *p_dst, const X &x) |
|
template<> |
__device__ int32_t | atomic_max< int32_t > (int32_t *p_dst, const int32_t &x) |
|
template<> |
__device__ uint32_t | atomic_max< uint32_t > (uint32_t *p_dst, const uint32_t &x) |
|
template<> |
__device__ float | atomic_max< float > (float *p_dst, const float &x) |
|
template<> |
__device__ double | atomic_max< double > (double *p_dst, const double &x) |
|
template<> |
__device__ float2_t | atomic_max< float2_t > (float2_t *p_dst, const float2_t &x) |
|
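`atomic_add` and `atomic_max` are typed wrappers over the hardware atomics, specialized for the scalar and two-element vector types listed above; per the declared signatures they return a value of the operand type (presumably the previous contents, as is conventional). Sketch:

```cpp
// Sketch: each thread folds a partial result into a global accumulator.
__device__ void accumulate_partial(float* p_dst, float partial)
{
    (void)ck::atomic_add<float>(p_dst, partial);
}
```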
__host__ constexpr __device__ index_t | get_warp_size () |
|
__device__ index_t | get_thread_local_1d_id () |
|
__device__ index_t | get_thread_global_1d_id () |
|
__device__ index_t | get_warp_local_1d_id () |
|
__device__ index_t | get_block_1d_id () |
|
__device__ index_t | get_grid_size () |
|
__device__ index_t | get_block_size () |
|
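The index helpers above wrap the HIP builtins behind `index_t`-typed accessors. A common composition is a grid-stride loop:

```cpp
// Sketch: grid-stride traversal built from the accessors above.
__device__ void scale_all(float* p, ck::index_t n, float s)
{
    const ck::index_t stride = ck::get_grid_size() * ck::get_block_size();
    for(ck::index_t i = ck::get_thread_global_1d_id(); i < n; i += stride)
        p[i] *= s;
}
```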
template<> |
constexpr __device__ index_t | get_shift< 1 > () |
|
template<typename TA , typename TB , typename TC > |
__device__ void | inner_product (const TA &a, const TB &b, TC &c) |
|
template<> |
__device__ void | inner_product< float, float, float > (const float &a, const float &b, float &c) |
|
template<> |
__device__ void | inner_product< float2_t, float2_t, float > (const float2_t &a, const float2_t &b, float &c) |
|
template<> |
__device__ void | inner_product< float4_t, float4_t, float > (const float4_t &a, const float4_t &b, float &c) |
|
template<> |
__device__ void | inner_product< bhalf_t, bhalf_t, float > (const bhalf_t &a, const bhalf_t &b, float &c) |
|
template<> |
__device__ void | inner_product< half_t, half_t, float > (const half_t &a, const half_t &b, float &c) |
|
template<> |
__device__ void | inner_product< half2_t, half2_t, float > (const half2_t &a, const half2_t &b, float &c) |
|
template<> |
__device__ void | inner_product< half4_t, half4_t, float > (const half4_t &a, const half4_t &b, float &c) |
|
template<> |
__device__ void | inner_product< half8_t, half8_t, float > (const half8_t &a, const half8_t &b, float &c) |
|
template<> |
__device__ void | inner_product< int8_t, int8_t, int32_t > (const int8_t &a, const int8_t &b, int32_t &c) |
|
template<> |
__device__ void | inner_product< int8x2_t, int8x2_t, int32_t > (const int8x2_t &a, const int8x2_t &b, int32_t &c) |
|
template<> |
__device__ void | inner_product< int8x4_t, int8x4_t, int32_t > (const int8x4_t &a, const int8x4_t &b, int32_t &c) |
|
template<> |
__device__ void | inner_product< int8x8_t, int8x8_t, int32_t > (const int8x8_t &a, const int8x8_t &b, int32_t &c) |
|
template<> |
__device__ void | inner_product< int8x16_t, int8x16_t, int32_t > (const int8x16_t &a, const int8x16_t &b, int32_t &c) |
|
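`inner_product` accumulates the dot product of two short vectors into a scalar; the specializations map onto packed hardware instructions where available (e.g. the `int8x4_t` form onto a 4-way int8 dot product). Sketch:

```cpp
// Sketch: accumulate packed-half dot products into an fp32 accumulator.
__device__ void dot_acc(const ck::half2_t& a, const ck::half2_t& b, float& acc)
{
    ck::inner_product<ck::half2_t, ck::half2_t, float>(a, b, acc); // acc += a.b
}
```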
template<typename TX , TX X, typename TY , TY Y> |
__host__ constexpr __device__ auto | operator+ (integral_constant< TX, X >, integral_constant< TY, Y >) |
|
template<typename TX , TX X, typename TY , TY Y> |
__host__ constexpr __device__ auto | operator- (integral_constant< TX, X >, integral_constant< TY, Y >) |
|
template<typename TX , TX X, typename TY , TY Y> |
__host__ constexpr __device__ auto | operator* (integral_constant< TX, X >, integral_constant< TY, Y >) |
|
template<typename TX , TX X, typename TY , TY Y> |
__host__ constexpr __device__ auto | operator/ (integral_constant< TX, X >, integral_constant< TY, Y >) |
|
template<typename TX , TX X, typename TY , TY Y> |
__host__ constexpr __device__ auto | operator% (integral_constant< TX, X >, integral_constant< TY, Y >) |
|
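These operators combine `integral_constant`s, and hence `Number<>`s, arithmetically while the results remain compile-time constants:

```cpp
// Sketch: compile-time arithmetic on Number<>.
constexpr auto six = ck::Number<2>{} * ck::Number<3>{};
static_assert(six.value == 6, "");
static_assert((ck::Number<7>{} % ck::Number<4>{}).value == 3, "");
```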
constexpr LoopScheduler | make_default_loop_scheduler () |
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | mxf8_convert_sr (X x, float scale) |
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | mxf8_convert_rne (X x, float scale) |
|
template<> |
__host__ __device__ f8_ocp_t | mxf8_convert_rne< f8_ocp_t, float > (float x, float scale) |
|
template<> |
__host__ __device__ bf8_ocp_t | mxf8_convert_rne< bf8_ocp_t, float > (float x, float scale) |
|
template<> |
__host__ __device__ f8x2_ocp_t | mxf8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x, float scale) |
|
template<> |
__host__ __device__ bf8x2_ocp_t | mxf8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x, float scale) |
|
template<> |
__host__ __device__ f8x16_ocp_t | mxf8_convert_rne< f8x16_ocp_t, float16_t > (float16_t x, float scale) |
|
template<> |
__host__ __device__ bf8x16_ocp_t | mxf8_convert_rne< bf8x16_ocp_t, float16_t > (float16_t x, float scale) |
|
template<> |
__host__ __device__ f8x32_ocp_t | mxf8_convert_rne< f8x32_ocp_t, float32_t > (float32_t x, float scale) |
|
template<> |
__host__ __device__ bf8x32_ocp_t | mxf8_convert_rne< bf8x32_ocp_t, float32_t > (float32_t x, float scale) |
|
template<> |
__host__ __device__ f8_ocp_t | mxf8_convert_sr< f8_ocp_t, float > (float x, float scale) |
|
template<> |
__host__ __device__ bf8_ocp_t | mxf8_convert_sr< bf8_ocp_t, float > (float x, float scale) |
|
template<> |
__host__ __device__ f8x2_ocp_t | mxf8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x, float scale) |
|
template<> |
__host__ __device__ bf8x2_ocp_t | mxf8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x, float scale) |
|
template<> |
__host__ __device__ f8x16_ocp_t | mxf8_convert_sr< f8x16_ocp_t, float16_t > (float16_t x, float scale) |
|
template<> |
__host__ __device__ bf8x16_ocp_t | mxf8_convert_sr< bf8x16_ocp_t, float16_t > (float16_t x, float scale) |
|
template<> |
__host__ __device__ f8x32_ocp_t | mxf8_convert_sr< f8x32_ocp_t, float32_t > (float32_t x, float scale) |
|
template<> |
__host__ __device__ bf8x32_ocp_t | mxf8_convert_sr< bf8x32_ocp_t, float32_t > (float32_t x, float scale) |
|
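`mxf8_convert_rne` and `mxf8_convert_sr` quantize float values to the OCP FP8 formats under a power-of-two scale, using round-to-nearest-even and stochastic rounding respectively (the usual MX convention of dividing by the scale before quantization is assumed here). Sketch:

```cpp
// Sketch: RNE quantization of one float to e4m3 (f8_ocp_t) under a scale.
__host__ __device__ ck::f8_ocp_t quantize_rne(float x, float scale)
{
    return ck::mxf8_convert_rne<ck::f8_ocp_t>(x, scale);
}
```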
template<typename T , uint32_t seed_t, ck::enable_if_t< std::is_same< float, T >{}, bool > = false> |
__host__ __device__ uint32_t | prand_generator (index_t id, T val, uint32_t seed=seed_t) |
|
template<typename T , uint32_t seed_t, ck::enable_if_t<!(std::is_same< float, T >{}||std::is_same< _Float16, T >{}), bool > = false> |
__host__ __device__ uint32_t | prand_generator (int id, T val, uint32_t seed=seed_t) |
|
template<typename Y , typename X > |
constexpr __host__ Y | scaled_type_convert (e8m0_bexp_t scale, X x) |
|
template<> |
__host__ float | scaled_type_convert< float, f8_ocp_t > (e8m0_bexp_t scale, f8_ocp_t x) |
|
template<> |
__host__ float | scaled_type_convert< float, bf8_ocp_t > (e8m0_bexp_t scale, bf8_ocp_t x) |
|
template<> |
__host__ float2_t | scaled_type_convert< float2_t, f8x2_ocp_t > (e8m0_bexp_t scale, f8x2_ocp_t x) |
|
template<> |
__host__ float2_t | scaled_type_convert< float2_t, bf8x2_ocp_t > (e8m0_bexp_t scale, bf8x2_ocp_t x) |
|
template<> |
__host__ float16_t | scaled_type_convert< float16_t, f8x16_ocp_t > (e8m0_bexp_t scale, f8x16_ocp_t x) |
|
template<> |
__host__ float16_t | scaled_type_convert< float16_t, bf8x16_ocp_t > (e8m0_bexp_t scale, bf8x16_ocp_t x) |
|
template<> |
__host__ float32_t | scaled_type_convert< float32_t, f8x32_ocp_t > (e8m0_bexp_t scale, f8x32_ocp_t x) |
|
template<> |
__host__ float32_t | scaled_type_convert< float32_t, bf8x32_ocp_t > (e8m0_bexp_t scale, bf8x32_ocp_t x) |
|
template<> |
__host__ f8_ocp_t | scaled_type_convert< f8_ocp_t, float > (e8m0_bexp_t scale, float x) |
|
template<> |
__host__ bf8_ocp_t | scaled_type_convert< bf8_ocp_t, float > (e8m0_bexp_t scale, float x) |
|
template<> |
__host__ f8x2_ocp_t | scaled_type_convert< f8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x) |
|
template<> |
__host__ bf8x2_ocp_t | scaled_type_convert< bf8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x) |
|
template<> |
__host__ f8x16_ocp_t | scaled_type_convert< f8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x) |
|
template<> |
__host__ bf8x16_ocp_t | scaled_type_convert< bf8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x) |
|
template<> |
__host__ f8x32_ocp_t | scaled_type_convert< f8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x) |
|
template<> |
__host__ bf8x32_ocp_t | scaled_type_convert< bf8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x) |
|
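`scaled_type_convert` combines an `e8m0_bexp_t` block scale (a biased power-of-two exponent, as in the MX formats) with an FP8 payload in either direction. A sketch, assuming `e8m0_bexp_t` is constructible from a float scale value:

```cpp
// Sketch: decode one f8 element with its block scale (host-side).
__host__ float decode_scaled(ck::f8_ocp_t x, float block_scale)
{
    return ck::scaled_type_convert<float>(ck::e8m0_bexp_t{block_scale}, x);
}
```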
template<index_t I, index_t... Is> |
__host__ constexpr __device__ auto | sequence_pop_front (Sequence< I, Is... >) |
|
template<typename Seq > |
__host__ constexpr __device__ auto | sequence_pop_back (Seq) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ bool | operator== (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | operator+ (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | operator- (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | operator* (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | operator/ (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | operator% (Sequence< Xs... >, Sequence< Ys... >) |
|
template<index_t... Xs, index_t Y> |
__host__ constexpr __device__ auto | operator+ (Sequence< Xs... >, Number< Y >) |
|
template<index_t... Xs, index_t Y> |
__host__ constexpr __device__ auto | operator- (Sequence< Xs... >, Number< Y >) |
|
template<index_t... Xs, index_t Y> |
__host__ constexpr __device__ auto | operator* (Sequence< Xs... >, Number< Y >) |
|
template<index_t... Xs, index_t Y> |
__host__ constexpr __device__ auto | operator/ (Sequence< Xs... >, Number< Y >) |
|
template<index_t... Xs, index_t Y> |
__host__ constexpr __device__ auto | operator% (Sequence< Xs... >, Number< Y >) |
|
template<index_t Y, index_t... Xs> |
__host__ constexpr __device__ auto | operator+ (Number< Y >, Sequence< Xs... >) |
|
template<index_t Y, index_t... Xs> |
__host__ constexpr __device__ auto | operator- (Number< Y >, Sequence< Xs... >) |
|
template<index_t Y, index_t... Xs> |
__host__ constexpr __device__ auto | operator* (Number< Y >, Sequence< Xs... >) |
|
template<index_t Y, index_t... Xs> |
__host__ constexpr __device__ auto | operator/ (Number< Y >, Sequence< Xs... >) |
|
template<index_t Y, index_t... Xs> |
__host__ constexpr __device__ auto | operator% (Number< Y >, Sequence< Xs... >) |
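
These operators combine compile-time Sequences element-wise; a Number<Y> operand broadcasts to every element. A minimal sketch (header path is an assumption):

```cpp
#include "ck/utility/sequence.hpp" // assumed header path

using ck::Number;
using ck::Sequence;

constexpr auto a = Sequence<1, 2, 3>{};
constexpr auto b = Sequence<4, 5, 6>{};

static_assert(a + b == Sequence<5, 7, 9>{}, "element-wise add");
static_assert(a * Number<10>{} == Sequence<10, 20, 30>{}, "broadcast multiply");
```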
|
template<typename... Seqs> |
__host__ constexpr __device__ auto | merge_sequences (Seqs...) |
|
template<typename F , index_t... Xs> |
__host__ constexpr __device__ auto | transform_sequences (F f, Sequence< Xs... >) |
|
template<typename F , index_t... Xs, index_t... Ys> |
__host__ constexpr __device__ auto | transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >) |
|
template<typename F , index_t... Xs, index_t... Ys, index_t... Zs> |
__host__ constexpr __device__ auto | transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >, Sequence< Zs... >) |
|
template<typename Seq , typename Reduce , index_t Init> |
__host__ constexpr __device__ auto | reverse_inclusive_scan_sequence (Seq, Reduce, Number< Init >) |
|
template<typename Seq , typename Reduce , index_t Init> |
__host__ constexpr __device__ auto | reverse_exclusive_scan_sequence (Seq, Reduce, Number< Init >) |
|
template<typename Seq , typename Reduce , index_t Init> |
__host__ constexpr __device__ auto | inclusive_scan_sequence (Seq, Reduce, Number< Init >) |
|
template<typename Seq , index_t... Is> |
__host__ constexpr __device__ auto | pick_sequence_elements_by_ids (Seq, Sequence< Is... >) |
|
template<typename Seq , typename Mask > |
__host__ constexpr __device__ auto | pick_sequence_elements_by_mask (Seq, Mask) |
|
template<typename Seq , typename Values , typename Ids > |
__host__ constexpr __device__ auto | modify_sequence_elements_by_ids (Seq, Values, Ids) |
|
template<typename Seq , typename Reduce , index_t Init> |
__host__ constexpr __device__ index_t | reduce_on_sequence (Seq, Reduce f, Number< Init >) |
|
template<typename Seq , typename F > |
__host__ constexpr __device__ bool | sequence_any_of (Seq, F f) |
|
template<typename Seq , typename F > |
__host__ constexpr __device__ bool | sequence_all_of (Seq, F f) |
|
template<index_t... Is> |
__host__ constexpr __device__ auto | make_sequence (Number< Is >...) |
|
template<typename F , index_t N> |
__host__ constexpr __device__ auto | generate_sequence (F, Number< N >) |
|
template<typename F , index_t N> |
__host__ constexpr __device__ auto | generate_sequence_v2 (F &&f, Number< N >) |
|
template<index_t... Is> |
__host__ constexpr __device__ auto | to_sequence (Tuple< Number< Is >... >) |
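
A sketch of the fold and generator helpers above. The Plus functor is defined locally rather than taken from CK's own math functors, and generate_sequence_v2 is assumed to map f over Number<0>..Number<N-1>:

```cpp
#include "ck/utility/sequence.hpp" // assumed header path

struct Plus
{
    template <typename A, typename B>
    constexpr auto operator()(A a, B b) const
    {
        return a + b;
    }
};

constexpr auto s = ck::Sequence<2, 3, 4>{};
static_assert(ck::reduce_on_sequence(s, Plus{}, ck::Number<0>{}) == 9, "2+3+4");

constexpr auto evens =
    ck::generate_sequence_v2([](auto i) { return i * ck::Number<2>{}; }, ck::Number<3>{});
static_assert(evens == ck::Sequence<0, 2, 4>{}, "f(i) = 2 * i");
```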
|
template<AddressSpaceEnum AddressSpace, typename T , index_t N> |
__host__ constexpr __device__ auto | make_static_buffer (Number< N >) |
|
template<AddressSpaceEnum AddressSpace, typename T , long_index_t N> |
__host__ constexpr __device__ auto | make_static_buffer (LongNumber< N >) |
|
template<typename X , typename... Xs> |
__host__ constexpr __device__ auto | make_statically_indexed_array (const X &x, const Xs &... xs) |
|
template<typename X > |
__host__ constexpr __device__ auto | make_statically_indexed_array () |
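
make_statically_indexed_array builds a tuple-like array whose elements are addressed with compile-time Number indices (make_static_buffer is the address-space-aware analogue for register or LDS data). A small sketch; the header path and the operator[] access pattern are assumptions:

```cpp
#include "ck/utility/statically_indexed_array.hpp" // assumed header path

float demo_static_array()
{
    auto arr = ck::make_statically_indexed_array(1.0f, 2.0f, 3.0f);
    return arr[ck::Number<1>{}]; // 2.0f, element selected at compile time
}
```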
|
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false> |
__host__ constexpr __device__ auto | operator+= (Tuple< Ys... > &y, const X &x) |
|
template<typename... Ys, typename X , enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false> |
__host__ constexpr __device__ auto | operator-= (Tuple< Ys... > &y, const X &x) |
|
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
__host__ constexpr __device__ auto | operator+ (const Tuple< Xs... > &x, const Y &y) |
|
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
__host__ constexpr __device__ auto | operator- (const Tuple< Xs... > &x, const Y &y) |
|
template<typename... Xs, typename Y , enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
__host__ constexpr __device__ auto | operator* (const Tuple< Xs... > &x, const Y &y) |
|
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false> |
__host__ constexpr __device__ auto | operator* (Y a, const Tuple< Xs... > &x) |
|
template<typename... Xs, typename Y , enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false> |
__host__ constexpr __device__ auto | operator* (const Tuple< Xs... > &x, Y a) |
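
These overloads give Tuples element-wise arithmetic, with scalar factors broadcast by the mixed operator* overloads. Sketch (header path is an assumption):

```cpp
#include "ck/utility/tuple_helper.hpp" // assumed header path

void demo_tuple_math()
{
    auto x = ck::make_tuple(1, 2, 3);
    auto y = ck::make_tuple(10, 20, 30);

    auto sum    = x + y; // element-wise: {11, 22, 33}
    auto scaled = 2 * y; // scalar broadcast: {20, 40, 60}
    x += y;              // in-place element-wise add
    (void)sum;
    (void)scaled;
}
```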
|
template<typename... Xs> |
__host__ __device__ void | print_multi_index (const Tuple< Xs... > &x) |
|
__device__ void | block_sync_lds () |
|
__device__ void | block_sync_lds_direct_load () |
|
__device__ void | s_nop () |
|
__device__ void | transpose_fp16_2x2 (const half2_t &x0, const half2_t &x1, half2_t &y0, half2_t &y1) |
|
__device__ void | transpose_int8_4x4 (const int8x4_t &x0, const int8x4_t &x1, const int8x4_t &x2, const int8x4_t &x3, int8x4_t &y0, int8x4_t &y1, int8x4_t &y2, int8x4_t &y3) |
|
__device__ void | transpose_f8_4x4 (const f8x4_t &x0, const f8x4_t &x1, const f8x4_t &x2, const f8x4_t &x3, f8x4_t &y0, f8x4_t &y1, f8x4_t &y2, f8x4_t &y3) |
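
block_sync_lds issues the LDS barrier and the transpose_* helpers reshuffle small packed register tiles; all are device-only, so the sketch lives in a __device__ function. The 2x2 row/column interpretation is an assumption from the function name:

```cpp
// Compile with hipcc; device-only code.
__device__ void demo_transpose(const ck::half2_t& row0, const ck::half2_t& row1,
                               ck::half2_t& col0, ck::half2_t& col1)
{
    // Treat {row0, row1} as a 2x2 fp16 tile in two packed registers and
    // write its transpose into {col0, col1}.
    ck::transpose_fp16_2x2(row0, row1, col0, col1);
}
```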
|
template<typename... Xs> |
__host__ constexpr __device__ auto | make_tuple (Xs &&... xs) |
|
template<typename... Args> |
constexpr Tuple< Args &... > | tie (Args &... args) noexcept |
|
template<typename F , index_t... ids> |
__host__ constexpr __device__ auto | generate_tuple_for (F &&f, Sequence< ids... >) |
|
template<typename F , index_t N> |
__host__ constexpr __device__ auto | generate_tuple (F &&f, Number< N >) |
|
template<typename F , index_t N> |
__host__ constexpr __device__ auto | generate_tuple (F &&f, LongNumber< N >) |
|
template<typename F , index_t N> |
__host__ constexpr __device__ auto | generate_tie (F &&f, Number< N >) |
|
template<typename... X, typename... Y> |
__host__ constexpr __device__ auto | concat_tuple_of_reference (const Tuple< X &... > &tx, const Tuple< Y &... > &ty) |
|
template<typename... X, typename... Y> |
__host__ constexpr __device__ auto | concat_tuple (const Tuple< X... > &tx, const Tuple< Y... > &ty) |
|
template<typename... X> |
__host__ constexpr __device__ auto | concat_tuple (const Tuple< X... > &tx) |
|
template<typename... X, typename... Tuples> |
__host__ constexpr __device__ auto | concat_tuple (const Tuple< X... > &tx, const Tuples &... tuples) |
|
template<typename F , typename X > |
__host__ constexpr __device__ auto | transform_tuples (F f, const X &x) |
|
template<typename F , typename X , typename Y > |
__host__ constexpr __device__ auto | transform_tuples (F f, const X &x, const Y &y) |
|
template<typename F , typename X , typename Y , typename Z > |
__host__ constexpr __device__ auto | transform_tuples (F f, const X &x, const Y &y, const Z &z) |
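
generate_tuple invokes f on Number<0>..Number<N-1>; concat_tuple and transform_tuples compose and map. Sketch (header path is an assumption):

```cpp
#include "ck/utility/tuple_helper.hpp" // assumed header path

void demo_tuple_helpers()
{
    auto squares = ck::generate_tuple([](auto i) { return i * i; }, ck::Number<4>{});
    auto joined  = ck::concat_tuple(ck::make_tuple(1, 2), ck::make_tuple(3));  // {1, 2, 3}
    auto doubled = ck::transform_tuples([](auto v) { return v + v; }, joined); // {2, 4, 6}
    (void)squares;
    (void)doubled;
}
```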
|
template<index_t Depth = 0, index_t MaxDepth = -1> |
__host__ constexpr __device__ auto | UnrollNestedTuple (const Tuple<> &element) |
|
template<index_t Depth = 0, index_t MaxDepth = -1, typename T > |
__host__ constexpr __device__ auto | UnrollNestedTuple (const T &element) |
|
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts> |
__host__ constexpr __device__ auto | UnrollNestedTuple (const Tuple< Ts... > &tuple) |
|
template<typename... Ts> |
__host__ constexpr __device__ auto | TupleReverse (const Tuple< Ts... > &tuple) |
|
template<index_t Idx, index_t End, typename F , typename... Ts> |
__host__ constexpr __device__ auto | TupleReduce (F &&f, const Tuple< Ts... > &tuple) |
|
template<typename... Ts> |
__host__ constexpr __device__ auto | IsNestedTuple (const Tuple< Ts... > &) |
|
template<index_t depth = 0, typename T > |
__host__ constexpr __device__ auto | TupleDepth (const T &) |
|
template<index_t depth = 0, typename... Ts> |
__host__ constexpr __device__ auto | TupleDepth (const Tuple< Ts... > &) |
|
template<index_t from, index_t to, typename... Ts> |
__host__ constexpr __device__ auto | TupleSlice (const Tuple< Ts... > &tuple) |
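
The UnrollNestedTuple family flattens, reverses, folds, and slices Tuples at compile time. Sketch (header path is an assumption):

```cpp
#include "ck/utility/tuple_helper.hpp" // assumed header path

void demo_nested_tuples()
{
    auto nested = ck::make_tuple(ck::make_tuple(1, 2), ck::make_tuple(3, 4));

    auto flat = ck::UnrollNestedTuple(nested); // {1, 2, 3, 4}
    auto rev  = ck::TupleReverse(flat);        // {4, 3, 2, 1}
    auto head = ck::TupleSlice<0, 2>(flat);    // {1, 2}
    (void)rev;
    (void)head;
}
```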
|
template<typename Y , typename X , typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false> |
__host__ constexpr __device__ Y | bit_cast (const X &x) |
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | bf16_convert_rtn (X x) |
|
template<> |
__host__ constexpr __device__ bhalf_t | bf16_convert_rtn< bhalf_t, float > (float x) |
|
template<> |
__host__ constexpr __device__ bhalf_t | bf16_convert_rtn< bhalf_t, half_t > (half_t x) |
|
template<typename Y , typename X , ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false> |
__host__ constexpr __device__ Y | type_convert (X x) |
|
template<> |
__host__ constexpr __device__ float | type_convert< float, bhalf_t > (bhalf_t x) |
|
template<> |
__host__ constexpr __device__ bhalf_t | type_convert< bhalf_t, float > (float x) |
|
template<> |
__host__ constexpr __device__ half_t | type_convert< half_t, bhalf_t > (bhalf_t x) |
|
template<> |
__host__ constexpr __device__ bhalf_t | type_convert< bhalf_t, half_t > (half_t x) |
|
template<> |
__host__ constexpr __device__ int8_t | type_convert< int8_t, bhalf_t > (bhalf_t x) |
|
template<> |
__host__ constexpr __device__ bhalf_t | type_convert< bhalf_t, int8_t > (int8_t x) |
|
template<> |
__host__ constexpr __device__ f8_ocp_t | type_convert< f8_ocp_t, int > (int x) |
|
template<> |
__host__ constexpr __device__ bf8_ocp_t | type_convert< bf8_ocp_t, int > (int x) |
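
bit_cast reinterprets equal-sized objects, bf16_convert_rtn rounds to nearest when narrowing to bf16, and type_convert is the general entry point whose specializations follow. Sketch (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

#include <cstdint>

void demo_basic_convert()
{
    float f = 3.14159f;

    auto bits     = ck::bit_cast<uint32_t>(f);            // size-checked reinterpretation
    ck::bhalf_t b = ck::bf16_convert_rtn<ck::bhalf_t>(f); // round-to-nearest bf16
    float widened = ck::type_convert<float>(b);           // widen back to float
    (void)bits;
    (void)widened;
}
```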
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | type_convert_sp (X x) |
|
template<> |
__host__ constexpr __device__ int | type_convert_sp< int, float > (float x) |
|
template<> |
__host__ constexpr __device__ float | type_convert_sp< float, int > (int x) |
|
template<> |
__host__ constexpr __device__ int | type_convert_sp< int, half_t > (half_t x) |
|
template<> |
__host__ constexpr __device__ half_t | type_convert_sp< half_t, int > (int x) |
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | f8_convert_sr (X x) |
|
template<> |
__host__ __device__ f8_fnuz_t | f8_convert_sr< f8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ f8_fnuz_t | f8_convert_sr< f8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | f8_convert_sr< bf8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | f8_convert_sr< bf8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, float > (float x) |
| Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x) |
| Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, float > (float x) |
| Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x) |
| Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, half_t > (half_t x) |
| Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, half2_t > (half2_t x) |
| Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, half_t > (half_t x) |
| Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, half2_t > (half2_t x) |
| Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding. More...
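
Stochastic rounding randomizes the quantization direction so that rounding error averages out across many conversions. Scalar and packed sketch (f8_convert_sr is __host__ __device__, so host code can call it; header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

void demo_f8_sr()
{
    ck::f8_ocp_t q = ck::f8_convert_sr<ck::f8_ocp_t>(1.7f);

    ck::float2_t x = {1.7f, -0.3f};
    ck::bf8x2_ocp_t q2 = ck::f8_convert_sr<ck::bf8x2_ocp_t>(x);
    (void)q;
    (void)q2;
}
```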
|
|
template<typename Y , typename X > |
__host__ constexpr __device__ Y | f8_convert_rne (X x) |
|
template<> |
__host__ __device__ f8_fnuz_t | f8_convert_rne< f8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ f8_fnuz_t | f8_convert_rne< f8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | f8_convert_rne< bf8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | f8_convert_rne< bf8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, float > (float x) |
| Converts a float to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x) |
| Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, float > (float x) |
| Converts a float to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x) |
| Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, half_t > (half_t x) |
| Converts a half_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, half2_t > (half2_t x) |
| Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, half_t > (half_t x) |
| Converts a half_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, half2_t > (half2_t x) |
| Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even. More...
|
|
template<> |
__host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even. More...
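
f8_convert_rne is the deterministic counterpart: ties round to the even mantissa, so repeated conversions of the same value agree. Sketch (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

void demo_f8_rne()
{
    ck::f8_ocp_t q = ck::f8_convert_rne<ck::f8_ocp_t>(0.4375f); // round-to-nearest-even
    (void)q;
}
```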
|
|
template<> |
__host__ __device__ f8_fnuz_t | type_convert< f8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ float | type_convert< float, f8_fnuz_t > (f8_fnuz_t x) |
|
template<> |
__host__ __device__ float2_t | type_convert< float2_t, f8x2_fnuz_t > (f8x2_fnuz_t x) |
|
template<> |
__host__ __device__ float | type_convert< float, f8_ocp_t > (f8_ocp_t x) |
| Converts an f8_ocp_t value to a float value. More...
|
|
template<> |
__host__ __device__ float2_t | type_convert< float2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| Converts a vector of 2 f8_ocp_t values to a vector of 2 float values. More...
|
|
template<> |
__host__ __device__ half_t | type_convert< half_t, f8_ocp_t > (f8_ocp_t x) |
| Converts an f8_ocp_t value to a half_t value. More...
|
|
template<> |
__host__ __device__ half2_t | type_convert< half2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values. More...
|
|
template<> |
__host__ __device__ bhalf_t | type_convert< bhalf_t, f8_ocp_t > (f8_ocp_t x) |
| Converts an f8_ocp_t value to a bhalf_t value. More...
|
|
template<> |
__host__ __device__ bhalf2_t | type_convert< bhalf2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values. More...
|
|
template<> |
__host__ __device__ float | type_convert< float, bf8_ocp_t > (bf8_ocp_t x) |
| Converts a bf8_ocp_t value to a float value. More...
|
|
template<> |
__host__ __device__ float2_t | type_convert< float2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values. More...
|
|
template<> |
__host__ __device__ half_t | type_convert< half_t, bf8_ocp_t > (bf8_ocp_t x) |
| Converts a bf8_ocp_t value to a half_t value. More...
|
|
template<> |
__host__ __device__ half2_t | type_convert< half2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values. More...
|
|
template<> |
__host__ __device__ bhalf_t | type_convert< bhalf_t, bf8_ocp_t > (bf8_ocp_t x) |
| Converts a bf8_ocp_t value to a bhalf_t value. More...
|
|
template<> |
__host__ __device__ bhalf2_t | type_convert< bhalf2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values. More...
|
|
template<> |
__host__ __device__ float2_t | type_convert< float2_t, pk_i4_t > (pk_i4_t x) |
|
template<> |
__host__ __device__ half2_t | type_convert< half2_t, pk_i4_t > (pk_i4_t x) |
|
template<> |
__host__ __device__ bhalf2_t | type_convert< bhalf2_t, pk_i4_t > (pk_i4_t x) |
|
template<> |
__host__ __device__ half2_t | type_convert< half2_t, float2_t > (float2_t x) |
|
template<> |
__host__ __device__ f8_fnuz_t | type_convert< f8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, half_t > (half_t x) |
| Converts a half_t value to an f8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, half_t > (half_t x) |
| Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ half_t | type_convert< half_t, f8_fnuz_t > (f8_fnuz_t x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | type_convert< bf8_fnuz_t, float > (float x) |
|
template<> |
__host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, float > (float x) |
| Converts a float value to an f8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, float > (float x) |
| Converts a float value to a bf8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t value to an f8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag. More...
|
|
template<> |
__host__ __device__ float | type_convert< float, bf8_fnuz_t > (bf8_fnuz_t x) |
|
template<> |
__host__ __device__ bf8_fnuz_t | type_convert< bf8_fnuz_t, half_t > (half_t x) |
|
template<> |
__host__ __device__ half_t | type_convert< half_t, bf8_fnuz_t > (bf8_fnuz_t x) |
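
For the float8 types, type_convert dispatches to either the nearest-even or the stochastic path depending on a library flag (per the "rounding determined by a flag" descriptions above), while the widening direction is exact. Round-trip sketch (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

float demo_f8_roundtrip(float x)
{
    ck::f8_ocp_t q = ck::type_convert<ck::f8_ocp_t>(x); // rounding picked by library flag
    return ck::type_convert<float>(q);                  // exact widening
}
```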
|
__host__ __device__ f4_t | f4_convert_rne (float x, float scale=1.0f) |
|
__host__ __device__ f4x2_t | f4_convert_rne (float2_t x, float scale=1.0f) |
|
__host__ __device__ f4_t | f4_convert_sr (float x, float scale=1.0f) |
|
__host__ __device__ f4x2_t | f4_convert_sr (float2_t x, float scale=1.0f) |
|
template<> |
__host__ __device__ f4_t | type_convert< f4_t, float > (float x) |
|
template<> |
__host__ __device__ f4x2_t | type_convert< f4x2_t, float2_t > (float2_t x) |
|
template<> |
__host__ __device__ f4x2_pk_t | type_convert< f4x2_pk_t, float2_t > (float2_t x) |
|
template<> |
__host__ __device__ f4x32_t | type_convert< f4x32_t, float32_t > (float32_t x) |
|
template<> |
__host__ __device__ float | type_convert< float, f4_t > (f4_t x) |
|
template<> |
__host__ __device__ float2_t | type_convert< float2_t, f4x2_t > (f4x2_t x) |
|
template<> |
__host__ __device__ float32_t | type_convert< float32_t, f4x32_t > (f4x32_t x) |
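
The f4 converters take an optional scale alongside the value; whether the scale multiplies or divides the input is not stated in this listing, so the sketch leaves that uncommented (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

void demo_f4()
{
    ck::f4_t v = ck::f4_convert_rne(0.75f); // scale defaults to 1.0f

    ck::float2_t x = {0.75f, -1.5f};
    ck::f4x2_t v2 = ck::f4_convert_sr(x, 2.0f); // stochastic rounding with explicit scale
    (void)v;
    (void)v2;
}
```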
|
__host__ __device__ f6_t | f6_convert_rne (float x, float scale=1.0f) |
| Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even. More...
|
|
__host__ __device__ f6x32_t | f6_convert_rne (float32_t x, float scale=1.0f) |
| Converts a 32-element single-precision float array into a packed 6-bit representation using round-to-nearest-even. More...
|
|
__host__ __device__ f6_t | f6_convert_sr (float x, float scale=1.0f) |
| Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding. More...
|
|
__host__ __device__ f6x32_t | f6_convert_sr (float32_t x, float scale=1.0f) |
| Converts a 32-element single-precision float array into a packed 6-bit representation using stochastic rounding. More...
|
|
template<> |
__host__ __device__ f6_t | type_convert< f6_t, float > (float x) |
| Specializes the type conversion template for converting a float into the 6-bit float type (f6_t). More...
|
|
template<> |
__host__ __device__ f6x32_t | type_convert< f6x32_t, float32_t > (float32_t x) |
| Specializes the type conversion template for converting a vector of 32 floats into the vector of 32 6-bit float types (f6x32_t). More...
|
|
template<> |
__host__ __device__ f6x32_pk_t | type_convert< f6x32_pk_t, float32_t > (float32_t x) |
|
template<> |
__host__ __device__ f6x16_t | type_convert< f6x16_t, float16_t > (float16_t x) |
|
template<> |
__host__ __device__ f6x16_pk_t | type_convert< f6x16_pk_t, float16_t > (float16_t x) |
|
template<> |
__host__ __device__ float | type_convert< float, f6_t > (f6_t x) |
| Specializes the type conversion template for converting the 6-bit float type (f6_t) to float. More...
|
|
template<> |
__host__ __device__ float32_t | type_convert< float32_t, f6x32_t > (f6x32_t x) |
| Specializes the type conversion template for converting the vector of 32 6-bit float types (f6x32_t) to a vector of 32 floats. More...
|
|
template<> |
__host__ __device__ float16_t | type_convert< float16_t, f6x16_t > (f6x16_t x) |
|
template<> |
__host__ __device__ float16_t | type_convert< float16_t, f6x16_pk_t > (f6x16_pk_t x) |
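
Packing 32 floats into f6x32_t stores 32 x 6 = 192 bits (24 bytes) of payload; type_convert<float32_t> unpacks it again. Sketch (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

void demo_f6_pack()
{
    ck::float32_t xs; // 32-wide float vector
    for(int i = 0; i < 32; ++i)
        xs[i] = 0.1f * static_cast<float>(i);

    ck::f6x32_t packed = ck::f6_convert_rne(xs);                  // 32 values, 24 bytes
    ck::float32_t back = ck::type_convert<ck::float32_t>(packed); // unpack to floats
    (void)back;
}
```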
|
__host__ __device__ bf6_t | bf6_convert_rne (float x, float scale=1.0f) |
| Converts a float to the 6-bit BF6 type using round-to-nearest-even. More...
|
|
__host__ __device__ bf6x32_t | bf6_convert_rne (float32_t x, float scale=1.0f) |
| Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even. More...
|
|
__host__ __device__ bf6_t | bf6_convert_sr (float x, float scale=1.0f) |
| Converts a float to the 6-bit BF6 type using stochastic rounding. More...
|
|
__host__ __device__ bf6x32_t | bf6_convert_sr (float32_t x, float scale=1.0f) |
| Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding. More...
|
|
template<> |
__host__ __device__ bf6_t | type_convert< bf6_t, float > (float x) |
| Specializes float-to-bf6_t conversion. More...
|
|
template<> |
__host__ __device__ bf6x32_t | type_convert< bf6x32_t, float32_t > (float32_t x) |
| Specializes the conversion of a vector of 32 floats to a vector of 32 bf6_t types (bf6x32_t). More...
|
|
template<> |
__host__ __device__ bf6x32_pk_t | type_convert< bf6x32_pk_t, float32_t > (float32_t x) |
|
template<> |
__host__ __device__ bf6x16_t | type_convert< bf6x16_t, float16_t > (float16_t x) |
|
template<> |
__host__ __device__ bf6x16_pk_t | type_convert< bf6x16_pk_t, float16_t > (float16_t x) |
|
template<> |
__host__ __device__ float | type_convert< float, bf6_t > (bf6_t x) |
| Specializes the type conversion template for converting a bf6_t value to float. More...
|
|
template<> |
__host__ __device__ float32_t | type_convert< float32_t, bf6x32_t > (bf6x32_t x) |
| Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats. More...
|
|
template<> |
__host__ __device__ float16_t | type_convert< float16_t, bf6x16_t > (bf6x16_t x) |
|
template<> |
__host__ __device__ float16_t | type_convert< float16_t, bf6x16_pk_t > (bf6x16_pk_t x) |
|
template<typename Y , typename X , size_t NumElems> |
__host__ __device__ void | array_convert (std::array< Y, NumElems > &y, const std::array< X, NumElems > &x) |
|
template<typename Y , typename X , index_t NumElems> |
__host__ __device__ void | array_convert (Array< Y, NumElems > &y, const Array< X, NumElems > &x) |
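
array_convert applies the element conversion across a whole std::array or ck::Array of matching length; element-wise type_convert semantics are an assumption read off the signatures. Sketch (header path is an assumption):

```cpp
#include "ck/utility/type_convert.hpp" // assumed header path

#include <array>

void demo_array_convert()
{
    std::array<float, 4> src{0.1f, 0.2f, 0.3f, 0.4f};
    std::array<ck::half_t, 4> dst;
    ck::array_convert(dst, src); // convert each element (assumed semantics)
}
```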
|