|
| template<typename F , typename X , index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples_impl (F f, const X &x, sequence< Is... >) |
| |
| template<typename F , typename X , typename Y , index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples_impl (F f, const X &x, const Y &y, sequence< Is... >) |
| |
| template<typename F , typename X , typename Y , typename Z , index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | transform_tuples_impl (F f, const X &x, const Y &y, const Z &z, sequence< Is... >) |
| |
| template<typename F , typename Tuple , index_t... Is> |
| constexpr decltype(auto) | apply_impl (F &&f, Tuple &&t, sequence< Is... >) |
| |
| template<typename F , typename X , index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | embed_tuples_impl (F f, const X &x, sequence< Is... >) |
| |
| constexpr CK_TILE_HOST_DEVICE _Float16 | lane0 (const fp16x2_t &v) |
| |
| constexpr CK_TILE_HOST_DEVICE _Float16 | lane1 (const fp16x2_t &v) |
| |
| constexpr CK_TILE_HOST_DEVICE bfloat16_t | lane0 (const bf16x2_t &v) |
| |
| constexpr CK_TILE_HOST_DEVICE bfloat16_t | lane1 (const bf16x2_t &v) |
| |
| constexpr CK_TILE_HOST_DEVICE float | lane0 (const fp32x2_t &v) |
| |
| constexpr CK_TILE_HOST_DEVICE float | lane1 (const fp32x2_t &v) |
| |
| template<typename OutTensor , typename InTensor > |
| CK_TILE_DEVICE void | shuffle_tile_impl_in_thread (OutTensor &out_tensor, const InTensor &in_tensor) |
| |
| template<typename Lengths , typename Strides , index_t I, typename AccOld > |
| constexpr CK_TILE_HOST_DEVICE auto | calculate_element_space_size_impl (const Lengths &lengths, const Strides &strides, number< I > i, AccOld acc_old) |
| |
| template<index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | make_tile_distributed_span (sequence< Is... >) |
| |
| template<index_t... Is> |
| constexpr CK_TILE_HOST_DEVICE auto | make_tile_distributed_index (sequence< Is... >) |
| |
| template<index_t NDimMax> |
| constexpr CK_TILE_HOST_DEVICE auto | make_sequential_index (index_t ibegin, index_t iend) |
| |
| template<typename StaticTileDistributionEncoding_ > |
| constexpr CK_TILE_HOST_DEVICE auto | make_adaptor_encoding_for_tile_distribution (StaticTileDistributionEncoding_) |
| |
| template<typename Distribution , index_t... XSliceBegins, index_t... XSliceEnds> |
| constexpr CK_TILE_HOST_DEVICE auto | slice_distribution_from_x (Distribution, sequence< XSliceBegins... > x_slice_begins, sequence< XSliceEnds... > x_slice_ends) |
| |
| template<typename OuterDstr , typename InnerDstr > |
| constexpr CK_TILE_HOST_DEVICE auto | make_embed_tile_distribution_encoding (OuterDstr, InnerDstr) |
| |
| template<typename InDstr , index_t... InReduceDimXs> |
| constexpr CK_TILE_HOST_DEVICE auto | make_reduce_tile_distribution_encoding_impl (InDstr, sequence< InReduceDimXs... > reduce_dim_xs_in) |
| |
| template<typename InDstr , index_t... InReduceDimXs> |
| constexpr CK_TILE_HOST_DEVICE auto | make_reduce_tile_distribution_encoding (InDstr, sequence< InReduceDimXs... > reduce_dim_xs_in) |
| |
| template<typename OutTensor , typename InTensor > |
| CK_TILE_DEVICE void | transpose_tile2d_impl_in_thread (OutTensor &out_tensor, const InTensor &in_tensor) |
| |
| CK_TILE_DEVICE float | fma_impl_vsv (float a, float b, float c) |
| |
| CK_TILE_DEVICE float | add_impl_vv (float lhs, float rhs) |
| |
| CK_TILE_DEVICE float | mul_impl_vv (float lhs, float rhs) |
| |
| CK_TILE_DEVICE fp16x2_t | cvt_pk_fp16_f32 (float a, float b) |
| |
| CK_TILE_DEVICE bf16x2_t | cvt_pk_bf16_f32 (float a, float b) |
| |
| CK_TILE_DEVICE fp32x2_t | pk_mul_f32 (fp32x2_t lhs, fp32x2_t rhs) |
| |
| template<index_t NDimSpatial> |
| __device__ long_index_t | calculate_input_index (index_t n, index_t g, index_t c, const std::array< index_t, NDimSpatial > &spatial_idx, const std::array< long_index_t, NDimSpatial+3 > &strides) |
| |
| template<index_t NDimSpatial> |
| __device__ long_index_t | calculate_weight_index (index_t g, index_t k, index_t c, const std::array< index_t, NDimSpatial > &spatial_idx, const std::array< long_index_t, NDimSpatial+3 > &strides) |
| |
| template<index_t NDimSpatial> |
| __device__ long_index_t | calculate_output_index (index_t n, index_t g, index_t k, const std::array< index_t, NDimSpatial > &spatial_idx, const std::array< long_index_t, NDimSpatial+3 > &strides) |
| |