/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp Source File
gridwise_normalization_splitk_1st.hpp
Go to the documentation of this file.
__host__ constexpr __device__ auto integer_divide_ceil(X x, Y y)
Definition: math.hpp:72
__host__ constexpr __device__ T clamp(const T &x, const T &lowerbound, const T &upperbound)
Definition: math.hpp:148
Definition: ck.hpp:267
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
__host__ constexpr __device__ auto generate_tuple(F &&f, Number< N >)
Definition: tuple_helper.hpp:21
__host__ constexpr __device__ auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition: tensor_descriptor_helper.hpp:101
__host__ constexpr __device__ auto make_cluster_descriptor(const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
Definition: cluster_descriptor.hpp:13
static __device__ void Run(T &mean_value, T &var_value, CountDataType &count)
Definition: blockwise_welford.hpp:51
Definition: gridwise_normalization_splitk_1st.hpp:28
Sequence< MThreadSliceSize, 1 > ThreadBufferLengths_M_1
Definition: gridwise_normalization_splitk_1st.hpp:54
static constexpr auto I1
Definition: gridwise_normalization_splitk_1st.hpp:36
static constexpr auto ThreadBufferNumber
Definition: gridwise_normalization_splitk_1st.hpp:78
static constexpr index_t K_BlockTileSize
Definition: gridwise_normalization_splitk_1st.hpp:75
tensor_operation::element_wise::PassThrough PassThroughOp
Definition: gridwise_normalization_splitk_1st.hpp:72
static constexpr auto thread_buffer_desc_m_1
Definition: gridwise_normalization_splitk_1st.hpp:55
static __device__ void Run(const XGridDesc_M_K &x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock &mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, MeanVarDataType *const p_mean_global, MeanVarDataType *const p_variance_global, int32_t *const p_welford_count_global)
Definition: gridwise_normalization_splitk_1st.hpp:115
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}))) ThreadReduceDstDesc_M
Definition: gridwise_normalization_splitk_1st.hpp:61
ThreadwiseWelford< ComputeDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M > ThreadwiseWelford
Definition: gridwise_normalization_splitk_1st.hpp:64
static constexpr bool reorder_thread_cluster
Definition: gridwise_normalization_splitk_1st.hpp:33
static constexpr auto I0
Definition: gridwise_normalization_splitk_1st.hpp:35
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number< MThreadSliceSize >{}, Number< XSrcVectorSize >{}))) ThreadReduceSrcDesc_M_K
Definition: gridwise_normalization_splitk_1st.hpp:59
static __device__ int GetKPerThread(int k, int kRaw, int kGridSize, int block_k_cluster_id, int thread_k_cluster_id)
Definition: gridwise_normalization_splitk_1st.hpp:81
static constexpr index_t M_BlockTileSize
Definition: gridwise_normalization_splitk_1st.hpp:74
static constexpr index_t K_BlockTileStepSize
Definition: gridwise_normalization_splitk_1st.hpp:76
BlockwiseWelford< ComputeDataType, BlockSize, ThreadClusterLengths_M_K, ThreadClusterArrangeOrder, false > BlockwiseWelford
Definition: gridwise_normalization_splitk_1st.hpp:70
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadClusterArrangeOrder
Definition: gridwise_normalization_splitk_1st.hpp:45
typename conditional< reorder_thread_cluster, Sequence< 1, 0 >, Sequence< 0, 1 > >::type ThreadBufferDimAccessOrder
Definition: gridwise_normalization_splitk_1st.hpp:42
static constexpr auto I2
Definition: gridwise_normalization_splitk_1st.hpp:37
static constexpr auto thread_buffer_desc_m_k
Definition: gridwise_normalization_splitk_1st.hpp:51
Sequence< MThreadSliceSize, XSrcVectorSize > ThreadBufferLengths_M_K
Definition: gridwise_normalization_splitk_1st.hpp:50
static constexpr auto thread_cluster_desc
Definition: gridwise_normalization_splitk_1st.hpp:47
Sequence< MThreadClusterSize, KThreadClusterSize > ThreadClusterLengths_M_K
Definition: gridwise_normalization_splitk_1st.hpp:39
Definition: sequence.hpp:43
Definition: static_buffer.hpp:16
Definition: threadwise_tensor_slice_transfer.hpp:39
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition: threadwise_tensor_slice_transfer.hpp:234
Definition: functional.hpp:100
Definition: integral_constant.hpp:20
Definition: functional2.hpp:33
Definition: unary_element_wise_operation.hpp:334