ck_tile Namespace Reference#
Namespaces | |
conv | |
detail | |
details | |
element_wise | |
impl | |
internal | |
literals | |
ranges | |
ReduceOp | |
tensor_layout | |
util | |
Functions | |
template<typename Lengths , typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type> | |
constexpr CK_TILE_HOST_DEVICE auto | make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::size(), 1 >::type{}) |
template<typename LowLength > | |
constexpr CK_TILE_HOST_DEVICE auto | make_pass_through_transform (const LowLength &low_length) |
template<typename LowLength , typename LeftPad , typename RightPad , bool SkipIsValidCheck = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) |
template<typename LowLength , typename LeftPadLength , bool SkipIsValidCheck = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) |
template<typename LowLength , typename RightPadLength , bool SkipIsValidCheck = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad_, bool_constant< SkipIsValidCheck >=bool_constant< false >{}) |
template<typename UpLengths , typename Coefficients , typename std::enable_if< UpLengths::size()==Coefficients::size(), bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients) |
template<typename LowLengths > | |
constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform_v2_magic_division (const LowLengths &low_lengths) |
template<typename LowLengths > | |
constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform_v3_division_mod (const LowLengths &low_lengths) |
template<typename LowLengths > | |
constexpr CK_TILE_HOST_DEVICE auto | make_merge_transform (const LowLengths &low_lengths) |
template<typename UpLengths , bool Use24BitIntegerCalculation = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_unmerge_transform (const UpLengths &up_lengths, bool_constant< Use24BitIntegerCalculation >=bool_constant< false >{}) |
template<typename LowerIndex > | |
constexpr CK_TILE_HOST_DEVICE auto | make_freeze_transform (const LowerIndex &low_idx) |
template<typename UpperIndex > | |
constexpr CK_TILE_HOST_DEVICE auto | make_insert_transform (const UpperIndex &up_idx) |
template<typename UpLengths > | |
constexpr CK_TILE_HOST_DEVICE auto | make_replicate_transform (const UpLengths &up_lengths) |
template<typename LowLength , typename SliceBegin , typename SliceEnd > | |
constexpr CK_TILE_HOST_DEVICE auto | make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end) |
template<typename Modulus , typename UpLength > | |
constexpr CK_TILE_HOST_DEVICE auto | make_modulo_transform (const Modulus &modulus, const UpLength &up_length) |
template<typename LowLengths > | |
constexpr CK_TILE_HOST_DEVICE auto | make_xor_transform (const LowLengths &low_lengths) |
template<typename LowLength , typename OffsetLength > | |
constexpr CK_TILE_HOST_DEVICE auto | make_offset_transform (const LowLength &low_length, const OffsetLength &offset_length) |
template<typename UpLength , typename Indices > | |
constexpr CK_TILE_HOST_DEVICE auto | make_indexing_transform (const UpLength &up_lengths, const Indices &indices) |
template<typename UpLength , typename IndexingAdaptor > | |
constexpr CK_TILE_HOST_DEVICE auto | make_indexing_transform_with_adaptor (const UpLength &up_lengths, const IndexingAdaptor &iadaptor) |
constexpr const char * | tile_distribution_pattern_to_string (tile_distribution_pattern pattern) |
template<index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize, tile_distribution_pattern DistributionPattern, index_t NumWaveGroups> | |
CK_TILE_HOST_DEVICE void | print (const TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > &) |
CK_TILE_DEVICE int32x4_t | make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff) |
CK_TILE_DEVICE void | buffer_load_fence (index_t cnt=0) |
CK_TILE_DEVICE void | lds_load_fence (index_t cnt=0) |
template<typename... T> | |
CK_TILE_DEVICE void | buffer_load_fence (index_t cnt=0, T &... o) |
CK_TILE_DEVICE void | buffer_store_fence (index_t cnt=0) |
CK_TILE_DEVICE auto | async_load_fence_raw (index_t cnt=0) |
CK_TILE_DEVICE_EXTERN int8_t | llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8") |
CK_TILE_DEVICE_EXTERN int8x2_t | llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8") |
CK_TILE_DEVICE_EXTERN int8x4_t | llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8") |
CK_TILE_DEVICE_EXTERN int16_t | llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16") |
CK_TILE_DEVICE_EXTERN int16x2_t | llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16") |
CK_TILE_DEVICE_EXTERN int16x4_t | llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16") |
CK_TILE_DEVICE_EXTERN int32_t | llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32") |
CK_TILE_DEVICE_EXTERN int32x2_t | llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32") |
CK_TILE_DEVICE_EXTERN int32x4_t | llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32") |
CK_TILE_DEVICE_EXTERN _Float16 | llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16") |
CK_TILE_DEVICE_EXTERN fp16x2_t | llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16") |
CK_TILE_DEVICE_EXTERN fp16x4_t | llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16") |
CK_TILE_DEVICE_EXTERN float | llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32") |
CK_TILE_DEVICE_EXTERN fp32x2_t | llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32") |
CK_TILE_DEVICE_EXTERN fp32x4_t | llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32") |
CK_TILE_DEVICE_EXTERN fp16x2_t | llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16") |
CK_TILE_DEVICE_EXTERN int32_t | llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32") |
CK_TILE_DEVICE_EXTERN float | llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32") |
CK_TILE_DEVICE_EXTERN double | llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64") |
CK_TILE_DEVICE_EXTERN void | llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds") |
template<unsigned num_dwords, bool pre_nop = false> | |
CK_TILE_DEVICE void | async_buffer_load_dwordxn_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={}) |
CK_TILE_DEVICE void | async_buffer_load_fence (index_t cnt=0) |
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
CK_TILE_DEVICE thread_buffer< int8_t, N > | amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_async_buffer_load_impl (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE void | amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={}) |
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
CK_TILE_DEVICE void | amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> | |
CK_TILE_DEVICE void | amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE void | amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE thread_buffer< T, N > | amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false> | |
CK_TILE_DEVICE void | amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={}) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE void | amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true> | |
CK_TILE_DEVICE void | amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
template<typename T , index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE void | amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={}) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
template<typename T , index_t LaneGroupSize, index_t kOuterDistDim0, index_t kOuterDistDim1, index_t kInnerDistDim0, index_t kInnerDistDim1> | |
constexpr CK_TILE_DEVICE auto | make_transposed_distr_encode () |
template<typename T , typename ComputeType > | |
CK_TILE_HOST_DEVICE T | add (const T &a, const T &b) |
CK_TILE_HOST_DEVICE bf16x2_t | add_bf16x2_t (const bf16x2_t &a, const bf16x2_t &b) |
CK_TILE_HOST_DEVICE bf16x4_t | add_bf16x4_t (const bf16x4_t &a, const bf16x4_t &b) |
CK_TILE_HOST_DEVICE fp16x2_t | add_f16x2_t (const fp16x2_t &a, const fp16x2_t &b) |
CK_TILE_HOST_DEVICE fp8x4_t | add_fp8x4_t (const fp8x4_t &a, const fp8x4_t &b) |
CK_TILE_HOST_DEVICE fp8x8_t | add_fp8x8_t (const fp8x8_t &a, const fp8x8_t &b) |
CK_TILE_HOST_DEVICE bf8x4_t | add_bf8x4_t (const bf8x4_t &a, const bf8x4_t &b) |
CK_TILE_HOST_DEVICE bf8x8_t | add_bf8x8_t (const bf8x8_t &a, const bf8x8_t &b) |
template<typename X > | |
CK_TILE_DEVICE void | atomic_add (X *p_dst, const X &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< bf16x2_t > (bf16x2_t *p_dst, const bf16x2_t &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< bf16x4_t > (bf16x4_t *p_dst, bf16x4_t const &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< fp8x4_t > (fp8x4_t *p_dst, const fp8x4_t &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< bf8x4_t > (bf8x4_t *p_dst, const bf8x4_t &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< fp8x8_t > (fp8x8_t *p_dst, fp8x8_t const &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< bf8x8_t > (bf8x8_t *p_dst, bf8x8_t const &x) |
template<> | |
CK_TILE_DEVICE void | atomic_add< fp16x2_t > (fp16x2_t *p_dst, fp16x2_t const &x) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | atomic_add_g (T *p_dst, const thread_buffer< T, N > &x) |
template<typename T , index_t N> | |
CK_TILE_DEVICE void | atomic_max_g (T *p_dst, const thread_buffer< T, N > &x) |
CK_TILE_DEVICE void | m0_set_with_memory (index_t v) |
CK_TILE_DEVICE void | m0_inc_with_memory (index_t v) |
template<typename T > | |
CK_TILE_DEVICE T | warp_shuffle_up (const T &v_local, uint32_t lane_delta) |
template<typename T > | |
CK_TILE_DEVICE T | warp_shuffle_down (const T &v_local, uint32_t lane_delta) |
template<typename T > | |
CK_TILE_DEVICE auto | warp_shuffle_down_pair (const T &v_local) |
template<typename T > | |
CK_TILE_DEVICE T | warp_shuffle (const T &v_local, uint32_t src_lane) |
template<typename T > | |
CK_TILE_DEVICE auto | flag_to_exec (const T &v_flag) |
template<typename X , typename Y > | |
CK_TILE_DEVICE auto | cmp_lt_to_exec (const X &x, const Y &y) |
template<typename D = void, typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE details::return_type< D, Ts... > | make_array (Ts &&... ts) |
template<typename T , index_t Size> | |
constexpr CK_TILE_HOST_DEVICE auto | make_array_with (std::initializer_list< T > ilist) |
template<typename T , index_t Size> | |
constexpr CK_TILE_HOST_DEVICE bool | operator== (const array< T, Size > &a, const array< T, Size > &b) |
template<typename T , index_t Size> | |
constexpr CK_TILE_HOST_DEVICE bool | operator!= (const array< T, Size > &a, const array< T, Size > &b) |
template<typename T , index_t N, typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | to_array (const std::vector< X > &x) |
template<typename T , index_t N, typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | to_array (const X &x) |
template<typename TData , index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | container_push_back (const array< TData, NSize > &a, const TData &x) |
template<typename... Ts, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | container_push_front (const tuple< Ts... > &a, const T &x) |
template<typename... Ts, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | container_push_back (const tuple< Ts... > &a, const T &x) |
template<typename TData , index_t NSize, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const array< TData, NSize > &old_array, sequence< IRs... >) |
template<typename TData , index_t NSize, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const array< TData, NSize > &old_array, sequence< IRs... > old2new) |
template<typename TData , index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const array< TData, NSize > &old_array, const map< index_t, index_t > &new2old) |
template<typename TData , index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const array< TData, NSize > &old_array, const map< index_t, index_t > &old2new) |
template<typename... Ts, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (const tuple< Ts... > &old_tuple, sequence< IRs... >) |
template<typename... Ts, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (const tuple< Ts... > &old_tuple, sequence< IRs... > old2new) |
template<index_t... Is, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_new2old (sequence< Is... >, sequence< IRs... >) |
template<index_t... Is, index_t... IRs> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reorder_given_old2new (sequence< Is... > old_seq, sequence< IRs... >) |
template<typename Container , typename Reduce , typename ROld , index_t I, index_t IEnd, index_t IStep> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reduce_impl (const Container &x, Reduce reduce, ROld r_old, number< I > i, number< IEnd >, number< IStep >) |
template<typename Container , typename Reduce , typename Init , index_t IBegin = 0, index_t IEnd = Container::size(), index_t IStep = 1> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reduce (const Container &x, Reduce reduce, Init init, number< IBegin >=number< 0 >{}, number< IEnd >=number< Container::size()>{}, number< IStep >=number< 1 >{}) |
template<typename TData , index_t NSize, typename Reduce > | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_inclusive_scan (const array< TData, NSize > &x, Reduce f, TData init) |
template<typename TData , index_t NSize, typename Reduce , typename Init > | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const array< TData, NSize > &x, Reduce f, Init init) |
template<index_t... Is, typename Reduce , index_t Init> | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const sequence< Is... > &seq, Reduce f, number< Init >) |
template<typename... Xs, typename Reduce , index_t I, typename YOld , typename ROld > | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan_impl (const tuple< Xs... > &x, Reduce reduce, number< I > i, YOld y_old, ROld r_old) |
template<typename... Xs, typename Reduce , typename Init > | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_exclusive_scan (const tuple< Xs... > &x, Reduce reduce, Init init) |
template<typename... Xs, typename Reduce , typename TData > | |
constexpr CK_TILE_HOST_DEVICE auto | container_reverse_inclusive_scan (const tuple< Xs... > &x, Reduce f, TData init) |
template<typename X , typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | container_concat (const X &x, const Ys &... ys) |
template<typename T , index_t NX, index_t NY> | |
constexpr CK_TILE_HOST_DEVICE auto | container_concat (const array< T, NX > &ax, const array< T, NY > &ay) |
template<typename... X, typename... Y> | |
constexpr CK_TILE_HOST_DEVICE auto | container_concat (const tuple< X... > &tx, const tuple< Y... > &ty) |
template<typename Container > | |
constexpr CK_TILE_HOST_DEVICE auto | container_concat (const Container &x) |
template<typename T , index_t N, index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | get_container_subset (const array< T, N > &arr, sequence< Is... >) |
template<typename... Ts, index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | get_container_subset (const tuple< Ts... > &tup, sequence< Is... >) |
template<typename T , index_t N, index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE void | set_container_subset (array< T, N > &y, sequence< Is... > picks, const array< T, sizeof...(Is)> &x) |
template<typename Y , typename X , index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE void | set_container_subset (Y &y, sequence< Is... > picks, const X &x) |
template<index_t... Is> | |
constexpr index_t | container_find (sequence< Is... > seq, index_t value) |
template<index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | sequence_to_tuple_of_number (sequence< Is... >) |
template<typename... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | make_multi_index (Xs &&... xs) |
template<index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | make_zero_multi_index () |
template<typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | to_multi_index (const T &x) |
template<index_t NSize, typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | operator+= (multi_index< NSize > &y, const X &x) |
template<index_t NSize, typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | operator-= (multi_index< NSize > &y, const X &x) |
template<index_t NSize, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (const multi_index< NSize > &a, const T &b) |
template<index_t NSize, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (const multi_index< NSize > &a, const T &b) |
template<index_t NSize, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (const multi_index< NSize > &a, const T &b) |
template<index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (index_t a, const multi_index< NSize > &x) |
template<index_t NSize> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (const multi_index< NSize > &x, index_t a) |
template<index_t I, index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | sequence_pop_front (sequence< I, Is... >) |
template<typename Seq > | |
constexpr CK_TILE_HOST_DEVICE auto | sequence_pop_back (Seq) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE bool | operator== (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE bool | operator!= (sequence< Xs... > x, sequence< Ys... > y) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator/ (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator% (sequence< Xs... >, sequence< Ys... >) |
template<index_t... Xs, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (sequence< Xs... >, number< Y >) |
template<index_t... Xs, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (sequence< Xs... >, number< Y >) |
template<index_t... Xs, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (sequence< Xs... >, number< Y >) |
template<index_t... Xs, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | operator/ (sequence< Xs... >, number< Y >) |
template<index_t... Xs, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | operator% (sequence< Xs... >, number< Y >) |
template<index_t Y, index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (number< Y >, sequence< Xs... >) |
template<index_t Y, index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (number< Y >, sequence< Xs... >) |
template<index_t Y, index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (number< Y >, sequence< Xs... >) |
template<index_t Y, index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | operator/ (number< Y >, sequence< Xs... >) |
template<index_t Y, index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | operator% (number< Y >, sequence< Xs... >) |
template<typename... Seqs> | |
constexpr CK_TILE_HOST_DEVICE auto | merge_sequences (Seqs...) |
template<typename F , index_t... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >) |
template<typename F , index_t... Xs, index_t... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >, sequence< Ys... >) |
template<typename F , index_t... Xs, index_t... Ys, index_t... Zs> | |
constexpr CK_TILE_HOST_DEVICE auto | transform_sequences (F f, sequence< Xs... >, sequence< Ys... >, sequence< Zs... >) |
template<typename Seq , typename Reduce , index_t Init> | |
constexpr CK_TILE_HOST_DEVICE auto | reverse_inclusive_scan_sequence (Seq, Reduce, number< Init >) |
template<typename Seq , typename Reduce , index_t Init> | |
constexpr CK_TILE_HOST_DEVICE auto | reverse_exclusive_scan_sequence (Seq, Reduce, number< Init >) |
template<typename Seq , typename Reduce , index_t Init> | |
constexpr CK_TILE_HOST_DEVICE auto | inclusive_scan_sequence (Seq, Reduce, number< Init >) |
template<typename Seq , typename Reduce , index_t Init> | |
constexpr auto | exclusive_scan_sequence (Seq, Reduce, number< Init >) |
template<typename Seq > | |
constexpr auto | prefix_sum_sequence (Seq) |
template<typename Seq , index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | pick_sequence_elements_by_ids (Seq, sequence< Is... >) |
template<typename Seq , typename Mask > | |
constexpr CK_TILE_HOST_DEVICE auto | pick_sequence_elements_by_mask (Seq, Mask) |
template<typename Seq , typename Values , typename Ids > | |
constexpr CK_TILE_HOST_DEVICE auto | modify_sequence_elements_by_ids (Seq, Values, Ids) |
template<typename Seq , typename Reduce , index_t Init> | |
constexpr CK_TILE_HOST_DEVICE index_t | reduce_on_sequence (Seq, Reduce f, number< Init >) |
template<typename Seq , typename F > | |
constexpr CK_TILE_HOST_DEVICE bool | sequence_any_of (Seq, F f) |
template<typename Seq , typename F > | |
constexpr CK_TILE_HOST_DEVICE bool | sequence_all_of (Seq, F f) |
template<index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | make_sequence (number< Is >...) |
template<typename F , index_t N> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_sequence (F, number< N >) |
template<typename F , index_t N> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_sequence_v2 (F &&f, number< N >) |
template<index_t... Is> | |
constexpr CK_TILE_HOST_DEVICE auto | to_sequence (tuple< number< Is >... >) |
template<typename SeqSortedSamples , index_t r, index_t... rs> | |
constexpr CK_TILE_HOST_DEVICE auto | histogram_sorted_sequence (SeqSortedSamples, sequence< r, rs... >) |
template<typename F , index_t N> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_array (F &&f, number< N >) |
template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type> | |
constexpr auto | reverse_slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{}) |
template<typename Seq , index_t SliceSize, typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type> | |
constexpr auto | slice_sequence (Seq, number< SliceSize >, Mask=typename uniform_sequence_gen< Seq::size(), 1 >::type{}) |
template<typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | make_thread_buffer (Ts &&... ts) |
template<typename... T> | |
CK_TILE_HOST_DEVICE void | print (const tuple< T... > &t) |
template<typename... Xs> | |
constexpr CK_TILE_HOST_DEVICE bool | operator== (const tuple< Xs... > &a, const tuple< Xs... > &b) |
template<typename... Xs> | |
constexpr CK_TILE_HOST_DEVICE bool | operator!= (const tuple< Xs... > &a, const tuple< Xs... > &b) |
template<typename... Xs> | |
constexpr CK_TILE_HOST_DEVICE auto | make_tuple (Xs &&... xs) |
template<typename... Args> | |
constexpr tuple< Args &... > | tie (Args &... args) noexcept |
template<typename F , index_t... ids> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_tuple_for (F &&f, sequence< ids... >) |
template<typename F , index_t N> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_tuple (F &&f, number< N >) |
template<typename F , index_t N> | |
constexpr CK_TILE_HOST_DEVICE auto | generate_tie (F &&f, number< N >) |
template<typename... X, typename... Y> | |
constexpr CK_TILE_HOST_DEVICE auto | concat_tuple_of_reference (const tuple< X &... > &tx, const tuple< Y &... > &ty) |
template<typename... X, typename... Y> | |
constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx, const tuple< Y... > &ty) |
template<typename... X> | |
constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx) |
template<typename... X, typename... Tuples> | |
constexpr CK_TILE_HOST_DEVICE auto | concat_tuple (const tuple< X... > &tx, const Tuples &... tuples) |
template<typename F , typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x) |
template<typename F , typename X , typename Y > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x, const Y &y) |
template<typename F , typename X , typename Y , typename Z > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tuples (F f, const X &x, const Y &y, const Z &z) |
template<typename F , typename Tuple > | |
constexpr decltype(auto) | apply (F &&f, Tuple &&t) |
template<typename F , typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | embed_tuples (F f, const X &x) |
template<index_t Depth = 0, index_t MaxDepth = -1> | |
constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const tuple<> &t) |
template<index_t Depth = 0, index_t MaxDepth = -1, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const T &t) |
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | unroll_nested_tuple (const tuple< Ts... > &t) |
template<typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | tuple_reverse (const tuple< Ts... > &t) |
template<index_t Idx, index_t End, typename F , typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | tuple_reduce (F &&f, const tuple< Ts... > &t) |
template<typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | is_nested_tuple (const tuple< Ts... > &) |
template<index_t depth = 0, typename T > | |
constexpr CK_TILE_HOST_DEVICE auto | tuple_depth (const T &) |
template<index_t depth = 0, typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | tuple_depth (const tuple< Ts... > &) |
template<typename... Seqs> | |
constexpr CK_TILE_HOST_DEVICE auto | to_array_of_array (tuple< Seqs... > t_of_s) |
template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+= (tuple< Ys... > &y, const X &x) |
template<typename... Ys, typename X , std::enable_if_t<!std::is_integral< X >::value &&!std::is_floating_point< X >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator-= (tuple< Ys... > &y, const X &x) |
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (const tuple< Xs... > &x, const Y &y) |
template<typename... Xs, typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator+ (const tuple< Xs... > &x, const tuple< Ys... > &y) |
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (const tuple< Xs... > &x, const Y &y) |
template<typename... Xs, typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator- (const tuple< Xs... > &x, const tuple< Ys... > &y) |
template<typename... Xs, typename Y , std::enable_if_t<!std::is_integral< Y >::value &&!std::is_floating_point< Y >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, const Y &y) |
template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (Y a, const tuple< Xs... > &x) |
template<typename... Xs, typename Y , std::enable_if_t< std::is_integral< Y >::value||std::is_floating_point< Y >::value, bool > = false> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, Y a) |
template<typename... Xs, typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator* (const tuple< Xs... > &x, const tuple< Ys... > &y) |
template<typename... Xs, typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | operator/ (const tuple< Xs... > &x, const tuple< Ys... > &y) |
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | |
constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_raw (float f, constant< rounding >={}) |
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | |
constexpr CK_TILE_HOST_DEVICE uint16_t | double_to_bf16_raw (double f, constant< rounding >={}) |
constexpr CK_TILE_HOST_DEVICE float | bf16_to_float_raw (uint16_t x) |
constexpr CK_TILE_HOST_DEVICE double | bf16_to_double_raw (uint16_t x) |
constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_rtn_raw (float f) |
constexpr CK_TILE_HOST uint16_t | float_to_bf16_rtn_asm (float f) |
CK_TILE_HOST uint16_t | float_to_bf16_rta_asm (float f) |
constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_truc_nan_raw (float f) |
constexpr CK_TILE_HOST_DEVICE uint16_t | float_to_bf16_truc_raw (float f) |
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | |
constexpr CK_TILE_HOST_DEVICE bfloat16_t | float_to_bf16 (float f, constant< rounding >={}) |
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | |
constexpr CK_TILE_HOST_DEVICE bfloat16_t | double_to_bf16 (double f, constant< rounding >={}) |
constexpr CK_TILE_HOST_DEVICE float | bf16_to_float (bfloat16_t x) |
constexpr CK_TILE_HOST_DEVICE double | bf16_to_double (bfloat16_t x) |
template<bf16_rounding_mode rounding = static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)> | |
CK_TILE_HOST_DEVICE constexpr bfloat16_t | fp16_to_bf16 (half_t f, constant< rounding >={}) |
constexpr CK_TILE_HOST_DEVICE half_t | bf16_to_fp16 (bfloat16_t x) |
CK_TILE_HOST_DEVICE bfloat16_t | abs (const bfloat16_t &x) |
CK_TILE_HOST_DEVICE bool | isnan (const bfloat16_t &x) |
CK_TILE_DEVICE bfloat16_t | sqrt (bfloat16_t x) |
CK_TILE_DEVICE bfloat16_t | exp (bfloat16_t x) |
CK_TILE_DEVICE bfloat16_t | exp2 (bfloat16_t x) |
CK_TILE_DEVICE bfloat16_t | log (bfloat16_t x) |
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | |
CK_TILE_HOST_DEVICE uint8_t | float_to_fp8_raw (float, constant< rounding >={}) |
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | |
CK_TILE_HOST_DEVICE uint8_t | float_to_bf8_raw (float, constant< rounding >={}) |
CK_TILE_HOST_DEVICE float | fp8_to_float_raw (uint8_t) |
CK_TILE_HOST_DEVICE float | bf8_to_float_raw (uint8_t) |
template<typename SrcT , typename DstT > | |
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type | float_to_fp8_sr_raw (SrcT x) |
Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding. More... | |
template<typename SrcT , typename DstT > | |
CK_TILE_HOST_DEVICE numeric_traits< DstT >::bitwise_type | float_to_fp8_rtn_raw (SrcT x) |
Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even. More... | |
template<fp8_rounding_mode rounding> | |
CK_TILE_HOST_DEVICE fp8_raw_t | float_to_fp8_raw (float x, constant< rounding >) |
template<fp8_rounding_mode rounding> | |
CK_TILE_HOST_DEVICE bf8_raw_t | float_to_bf8_raw (float x, constant< rounding >) |
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | |
CK_TILE_HOST_DEVICE fp8_t | float_to_fp8 (float x, constant< rounding >={}) |
template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)> | |
CK_TILE_HOST_DEVICE bf8_t | float_to_bf8 (float x, constant< rounding >={}) |
CK_TILE_HOST_DEVICE float | fp8_to_float (fp8_t x) |
CK_TILE_HOST_DEVICE float | bf8_to_float (bf8_t x) |
template<typename T > | |
CK_TILE_HOST_DEVICE T | abs (const T &x) |
CK_TILE_HOST_DEVICE bool | isnan (const fp8_t &x) |
CK_TILE_HOST_DEVICE bool | isnan (const bf8_t &x) |
constexpr CK_TILE_HOST_DEVICE float | fp16_to_float_hip (const fp16_hip_t &x) |
constexpr CK_TILE_HOST_DEVICE double | fp16_to_double_hip (const fp16_hip_t &x) |
constexpr CK_TILE_HOST_DEVICE fp16_hip_t | float_to_fp16_hip (const float &x) |
constexpr CK_TILE_HOST_DEVICE fp16_hip_t | double_to_fp16_hip (const double &x) |
constexpr CK_TILE_HOST_DEVICE float | fp16_to_float (const half_t &x) |
constexpr CK_TILE_HOST_DEVICE float | fp16_to_double (const half_t &x) |
constexpr CK_TILE_HOST_DEVICE half_t | float_to_fp16 (const float &x) |
constexpr CK_TILE_HOST_DEVICE half_t | double_to_fp16 (const double &x) |
CK_TILE_HOST fp16x2_t | pk_add_f16 (const fp16x2_t &x, const fp16x2_t &y) |
constexpr CK_TILE_HOST_DEVICE float | int8_to_float (const int8_t &x) |
constexpr CK_TILE_HOST_DEVICE int8_t | float_to_int8 (const float &x) |
template<typename Scale > | |
__host__ __device__ | scales (Scale) -> scales< Scale > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
__host__ __device__ | plus () -> plus< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
__host__ __device__ | minus () -> minus< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
__host__ __device__ | multiplies () -> multiplies< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
template<typename X , typename Y > | |
constexpr CK_TILE_HOST_DEVICE auto | integer_divide_floor (X x, Y y) |
template<typename X , typename Y > | |
constexpr CK_TILE_HOST_DEVICE auto | integer_divide_ceil (X x, Y y) |
template<typename X , typename Y > | |
constexpr CK_TILE_HOST_DEVICE auto | integer_least_multiple (X x, Y y) |
template<typename T > | |
constexpr CK_TILE_HOST_DEVICE T | max (T x) |
template<typename T > | |
constexpr CK_TILE_HOST T | max (T x, T y) |
template<typename T > | |
constexpr CK_TILE_DEVICE T | max (T x, T y) |
template<> | |
constexpr CK_TILE_DEVICE float | max (float x, float y) |
template<> | |
constexpr CK_TILE_DEVICE double | max (double x, double y) |
template<index_t X> | |
constexpr CK_TILE_HOST_DEVICE index_t | max (number< X >, index_t y) |
template<index_t Y> | |
constexpr CK_TILE_HOST_DEVICE index_t | max (index_t x, number< Y >) |
template<typename X , typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | max (X x, Ys... ys) |
template<typename T > | |
constexpr CK_TILE_HOST_DEVICE T | min (T x) |
template<typename T > | |
constexpr CK_TILE_HOST T | min (T x, T y) |
template<typename T > | |
constexpr CK_TILE_DEVICE T | min (T x, T y) |
template<> | |
constexpr CK_TILE_DEVICE float | min (float x, float y) |
template<> | |
constexpr CK_TILE_DEVICE double | min (double x, double y) |
template<index_t X> | |
constexpr CK_TILE_HOST_DEVICE index_t | min (number< X >, index_t y) |
template<index_t Y> | |
constexpr CK_TILE_HOST_DEVICE index_t | min (index_t x, number< Y >) |
template<typename X , typename... Ys> | |
constexpr CK_TILE_HOST_DEVICE auto | min (X x, Ys... ys) |
template<typename T > | |
constexpr CK_TILE_HOST_DEVICE T | clamp (const T &x, const T &lowerbound, const T &upperbound) |
CK_TILE_HOST int | clz (uint32_t x) |
constexpr CK_TILE_HOST_DEVICE index_t | gcd (index_t x, index_t y) |
template<index_t X, index_t Y> | |
constexpr CK_TILE_HOST_DEVICE auto | gcd (number< X >, number< Y >) |
Y constexpr CK_TILE_HOST_DEVICE auto | lcm (X x, Y y) |
__host__ __device__ | equal () -> equal< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
__host__ __device__ | less () -> less< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
__host__ __device__ | less_equal () -> less_equal< void, void > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
constexpr CK_TILE_HOST_DEVICE int32_t | next_power_of_two (int32_t x) |
template<index_t X> | |
constexpr CK_TILE_HOST_DEVICE auto | next_power_of_two () |
template<index_t X> | |
constexpr CK_TILE_HOST_DEVICE auto | next_power_of_two (number< X >) |
constexpr CK_TILE_HOST_DEVICE int32_t | integer_log2_floor (int32_t x) |
constexpr CK_TILE_HOST_DEVICE bool | is_power_of_two_integer (int32_t x) |
CK_TILE_DEVICE float | exp2 (float x) |
CK_TILE_DEVICE uint16_t | sad_u16 (uint16_t x, uint16_t y, uint16_t acc) |
CK_TILE_DEVICE uint32_t | sad_u32 (uint32_t x, uint32_t y, uint32_t acc) |
CK_TILE_HOST float | abs (float x) |
CK_TILE_HOST double | abs (double x) |
CK_TILE_HOST int8_t | abs (int8_t x) |
CK_TILE_HOST int32_t | abs (int32_t x) |
CK_TILE_HOST fp16_t | abs (fp16_t x) |
CK_TILE_HOST bool | isnan (float x) |
CK_TILE_HOST bool | isnan (double x) |
CK_TILE_HOST bool | isnan (int8_t x) |
CK_TILE_HOST bool | isnan (int32_t x) |
CK_TILE_HOST bool | isnan (fp16_t x) |
CK_TILE_HOST fp16_t | sqrt (fp16_t x) |
CK_TILE_HOST float | sqrt (float x) |
CK_TILE_HOST double | sqrt (double x) |
template<typename T > | |
CK_TILE_HOST T | tanh (T x) |
template<> | |
CK_TILE_HOST float | tanh< float > (float x) |
template<> | |
CK_TILE_HOST double | tanh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | acos (T x) |
template<> | |
CK_TILE_HOST float | acos< float > (float x) |
template<> | |
CK_TILE_HOST double | acos< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | neg (T x) |
template<> | |
CK_TILE_HOST float | neg< float > (float x) |
template<> | |
CK_TILE_HOST double | neg< double > (double x) |
template<> | |
CK_TILE_HOST int32_t | neg< int32_t > (int32_t x) |
template<> | |
CK_TILE_HOST int8_t | neg< int8_t > (int8_t x) |
template<typename T > | |
CK_TILE_HOST T | atan (T x) |
template<> | |
CK_TILE_HOST float | atan< float > (float x) |
template<> | |
CK_TILE_HOST double | atan< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | sin (T x) |
template<> | |
CK_TILE_HOST float | sin< float > (float x) |
template<> | |
CK_TILE_HOST double | sin< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | asin (T x) |
template<> | |
CK_TILE_HOST float | asin< float > (float x) |
template<> | |
CK_TILE_HOST double | asin< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | asinh (T x) |
template<> | |
CK_TILE_HOST float | asinh< float > (float x) |
template<> | |
CK_TILE_HOST double | asinh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | cos (T x) |
template<> | |
CK_TILE_HOST float | cos< float > (float x) |
template<> | |
CK_TILE_HOST double | cos< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | acosh (T x) |
template<> | |
CK_TILE_HOST float | acosh< float > (float x) |
template<> | |
CK_TILE_HOST double | acosh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | tan (T x) |
template<> | |
CK_TILE_HOST float | tan< float > (float x) |
template<> | |
CK_TILE_HOST double | tan< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | atanh (T x) |
template<> | |
CK_TILE_HOST float | atanh< float > (float x) |
template<> | |
CK_TILE_HOST double | atanh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | sinh (T x) |
template<> | |
CK_TILE_HOST float | sinh< float > (float x) |
template<> | |
CK_TILE_HOST double | sinh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | ceil (T x) |
template<> | |
CK_TILE_HOST float | ceil< float > (float x) |
template<> | |
CK_TILE_HOST double | ceil< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | cosh (T x) |
template<> | |
CK_TILE_HOST float | cosh< float > (float x) |
template<> | |
CK_TILE_HOST double | cosh< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | floor (T x) |
template<> | |
CK_TILE_HOST float | floor< float > (float x) |
template<> | |
CK_TILE_HOST double | floor< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | rcp (T x) |
template<typename T > | |
CK_TILE_HOST T | exp (T x) |
template<> | |
CK_TILE_HOST float | exp< float > (float x) |
template<> | |
CK_TILE_HOST double | exp< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | log (T x) |
template<> | |
CK_TILE_HOST float | log< float > (float x) |
template<> | |
CK_TILE_HOST double | log< double > (double x) |
template<typename T > | |
CK_TILE_HOST T | pow (T x, T gamma) |
template<> | |
CK_TILE_HOST float | pow< float > (float x, float gamma) |
template<> | |
CK_TILE_HOST double | pow< double > (double x, double gamma) |
template<typename T > | |
CK_TILE_HOST T | expm1 (T x) |
template<> | |
CK_TILE_HOST float | expm1< float > (float x) |
template<> | |
CK_TILE_HOST double | expm1< double > (double x) |
template<typename T > | |
CK_TILE_DEVICE T | tanh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | acos (T x) |
template<typename T > | |
CK_TILE_DEVICE T | neg (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | neg< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | atan (T x) |
template<typename T > | |
CK_TILE_DEVICE T | sin (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | sin< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | asin (T x) |
template<typename T > | |
CK_TILE_DEVICE T | asinh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | acosh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | tan (T x) |
template<typename T > | |
CK_TILE_DEVICE T | atanh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | sinh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | ceil (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | ceil< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | cosh (T x) |
template<typename T > | |
CK_TILE_DEVICE T | floor (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | floor< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | rcp (T x) |
template<typename T > | |
CK_TILE_DEVICE T | exp (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | exp< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | tanh_fast (T x) |
template<> | |
CK_TILE_DEVICE float | tanh_fast< float > (float x) |
template<typename T > | |
CK_TILE_DEVICE T | log (T x) |
template<> | |
CK_TILE_DEVICE fp16_t | log< fp16_t > (fp16_t x) |
template<typename T > | |
CK_TILE_DEVICE T | pow (T x, T gamma) |
template<typename T > | |
CK_TILE_DEVICE T | expm1 (T x) |
template<typename T > | |
CK_TILE_HOST_DEVICE float | convert_to_float (typename T::raw_type data, float scale=1.f) |
template<typename T > | |
CK_TILE_HOST_DEVICE T::raw_type | convert_to_type (float value, float scale=1.f) |
constexpr CK_TILE_HOST_DEVICE uint8_t | float_to_e2m1 (float x, float scale=1.f) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | float_to_pk_fp4 (const float &x, float scale) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | fp16_to_pk_fp4 (const fp16_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | bf16_to_pk_fp4 (const bf16_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | fp16x2_to_pk_fp4 (const fp16x2_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | bf16x2_to_pk_fp4 (const bf16x2_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE pk_fp4_t | fp32x2_to_pk_fp4 (const fp32x2_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE fp32x2_t | pk_fp4_to_fp32x2 (const pk_fp4_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE fp16x2_t | pk_fp4_to_fp16x2 (const pk_fp4_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE bf16x2_t | pk_fp4_to_bf16x2 (const pk_fp4_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE float | pk_fp4_to_float (const pk_fp4_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE fp16_t | pk_fp4_to_fp16 (const pk_fp4_t &x, float scale) |
constexpr CK_TILE_HOST_DEVICE bf16_t | pk_fp4_to_bf16 (const pk_fp4_t &x, float scale) |
CK_TILE_HOST_DEVICE fp32x2_t | pk_int4_t_to_fp32x2_t (const pk_int4_t &x) |
CK_TILE_HOST_DEVICE fp32x2_t | pk_int4_t_to_fp32x2_t_signed_conversion (const pk_int4_t &x) |
CK_TILE_HOST_DEVICE fp16x2_t | pk_int4_t_to_halfx2_t (const pk_int4_t &x) |
CK_TILE_HOST_DEVICE bf16x2_t | pk_int4_t_to_bfloat16x2_t (const pk_int4_t &x) |
CK_TILE_HOST_DEVICE int8x2_t | pk_int4_t_to_int8x2_t (const pk_int4_t &x) |
template<typename Y , typename X , std::enable_if_t<!(std::is_const_v< Y >||std::is_const_v< X >), bool > = false> | |
constexpr CK_TILE_HOST_DEVICE Y | type_convert (X x) |
template<typename Y , typename X > | |
constexpr CK_TILE_HOST_DEVICE Y | scaled_type_convert (X x, float scale) |
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType > | |
constexpr CK_TILE_HOST_DEVICE auto | make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size) |
template<address_space_enum BufferAddressSpace, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename T , typename BufferSizeType , typename X , typename std::enable_if< std::is_same< remove_cvref_t< T >, remove_cvref_t< X >>::value, bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_buffer_view (T *__restrict__ p, BufferSizeType buffer_size, X invalid_element_value) |
template<address_space_enum BufferAddressSpace, typename T , typename BufferSizeType , bool InvalidElementUseNumericalZeroValue, amd_buffer_coherence_enum Coherence> | |
CK_TILE_HOST_DEVICE void | print (const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > &bv) |
template<typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true> | |
CK_TILE_DEVICE auto | load_tile (const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) |
template<typename DistributedTensor_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true> | |
CK_TILE_DEVICE auto | load_tile (DistributedTensor_ &dst_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) |
template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE auto | load_tile_raw (T &tile, const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) |
Loads a tile of data using inline assembly. More... | |
template<typename T , typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE auto | load_tile_raw (T &tile, const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) |
template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true> | |
CK_TILE_DEVICE auto | async_load_tile (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}) |
template<typename LdsTileWindow_ , typename TileWindow_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE auto | async_load_tile_raw (LdsTileWindow_ &&lds_tile, const TileWindow_ &tile_window, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) |
CK_TILE_DEVICE auto | async_load_fence (index_t cnt=0) |
template<typename WindowLengths > | |
CK_TILE_DEVICE auto | load_tile (const null_tile_window< WindowLengths > &) |
template<typename T , typename WindowLengths > | |
CK_TILE_DEVICE auto | load_tile_raw (T &, const null_tile_window< WindowLengths > &) |
constexpr int | DS_READ_TR_SIZE () |
template<typename InnerEncode , index_t kLeadIterPerWarp, index_t kSecondIterPerWarp, index_t kLeadNumWarps, index_t kSecondNumWarps> | |
constexpr CK_TILE_HOST_DEVICE auto | InputTileDistributionEncoding () |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>, typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_, typename BottomTensorView_::DataType, Policy>::distr_encoding_valid, Policy>> | |
CK_TILE_DEVICE auto | load_tile_transpose (const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window) |
transpose loads tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. use SFINAE to ensure the tile distribution encoding is valid. More... | |
template<typename T > | |
constexpr CK_TILE_DEVICE auto | is_null_tile_window (const T &) |
template<typename WindowLengths > | |
constexpr CK_TILE_DEVICE auto | make_null_tile_window (const WindowLengths &window_lengths) |
template<typename WindowLengths , typename... Ts> | |
constexpr CK_TILE_DEVICE auto | make_tile_window (null_tensor_view, const WindowLengths &window_lengths, const multi_index< WindowLengths::size()> &, Ts &&...) |
template<typename WindowLengths , typename StaticTileDistribution > | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const null_tile_window< WindowLengths > &t, const StaticTileDistribution &) |
template<typename WindowLengths > | |
CK_TILE_DEVICE void | move_tile_window (null_tile_window< WindowLengths > &, const typename null_tile_window< WindowLengths >::BottomTensorIndex &) |
template<typename OutTensor , typename InTensor > | |
CK_TILE_DEVICE void | shuffle_tile (OutTensor &out, const InTensor &in) |
template<typename BottomTensorView_ , typename WindowLengths_ , index_t... SliceBegins, index_t... SliceEnds> | |
constexpr CK_TILE_DEVICE auto | get_slice_tile (const tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) |
template<typename DataType_ , typename StaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds> | |
constexpr CK_TILE_DEVICE auto | get_slice_tile (const static_distributed_tensor< DataType_, StaticTileDistribution_ > &tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) |
template<typename DstDataType_ , typename DstStaticTileDistribution_ , typename SrcDataType_ , typename SrcStaticTileDistribution_ , index_t... SliceBegins, index_t... SliceEnds> | |
constexpr CK_TILE_DEVICE auto | set_slice_tile (static_distributed_tensor< DstDataType_, DstStaticTileDistribution_ > &dst_tile, const static_distributed_tensor< SrcDataType_, SrcStaticTileDistribution_ > &src_tile, sequence< SliceBegins... > slice_begins, sequence< SliceEnds... > slice_ends) |
template<typename DataType , typename StaticTileDistribution > | |
constexpr CK_TILE_HOST_DEVICE auto | make_static_distributed_tensor (const StaticTileDistribution &) |
template<typename DataType , typename StaticTileDistribution , typename ThreadBuffer > | |
constexpr CK_TILE_HOST_DEVICE auto | make_static_distributed_tensor (const StaticTileDistribution &, ThreadBuffer &&thread_buffer_) |
template<typename StaticTileDistribution , typename DistributedIndices > | |
constexpr CK_TILE_HOST_DEVICE auto | get_x_indices_from_distributed_indices (StaticTileDistribution tile_distribution, DistributedIndices distributed_indices) |
template<typename DataType , typename StaticTileDistribution , typename XIndicesPredicate > | |
CK_TILE_HOST_DEVICE void | set_tile_if (static_distributed_tensor< DataType, StaticTileDistribution > &out_tensor, DataType value, XIndicesPredicate predicate) |
template<typename YLengths , index_t XUnpacks> | |
constexpr CK_TILE_HOST_DEVICE auto | get_y_unpacks_from_x_unpacks (YLengths, number< XUnpacks >) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | |
CK_TILE_DEVICE void | store_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | |
CK_TILE_DEVICE void | store_tile_raw (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ > | |
CK_TILE_DEVICE void | store_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ > | |
CK_TILE_DEVICE void | store_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ > | |
CK_TILE_DEVICE void | store_tile (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ > | |
CK_TILE_DEVICE void | store_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename TileDistributedSpan_ , typename F > | |
CK_TILE_DEVICE void | sweep_tile_span (TileDistributedSpan_, const F &f) |
template<typename TileDistributedSpan_ , typename F , typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type> | |
CK_TILE_DEVICE void | sweep_tile_uspan (TileDistributedSpan_, const F &f, Unpacks={}) |
template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type> | |
constexpr CK_TILE_HOST_DEVICE void | sweep_tile (const F &f, UnpacksPerXDim={}) |
template<typename DistributedTensor , typename F , typename UnpacksPerXDim = typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type> | |
constexpr CK_TILE_HOST_DEVICE void | sweep_tile (const DistributedTensor &, const F &f, UnpacksPerXDim={}) |
template<typename T , typename F , typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type> | |
CK_TILE_HOST_DEVICE_EXTERN | tile_sweeper (const T &, const F &, U={}) -> tile_sweeper< T, F, U > |
template<typename Transforms , typename LowerDimensionOldTopIdss , typename UpperDimensionNewTopIdss > | |
constexpr CK_TILE_HOST_DEVICE auto | make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss) |
template<typename OldTensorAdaptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_adaptor (const OldTensorAdaptor &old_tensor_adaptor, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss) |
template<typename TensorAdaptor0 , typename TensorAdaptor1 > | |
constexpr CK_TILE_HOST_DEVICE auto | chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1) |
template<typename Adaptor , typename TopIndex > | |
constexpr CK_TILE_HOST_DEVICE auto | make_tensor_adaptor_coordinate (const Adaptor &adaptor, const TopIndex &idx_top) |
template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex , typename BottomIndex > | |
constexpr CK_TILE_HOST_DEVICE void | move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top, BottomIndex &idx_diff_bottom) |
template<bool JudgeDoTransforms = true, typename Adaptor , typename AdaptorCoord , typename TopIndex > | |
constexpr CK_TILE_HOST_DEVICE void | move_tensor_adaptor_coordinate (const Adaptor &adaptor, AdaptorCoord &coord, const TopIndex &idx_diff_top) |
template<typename Adaptor , typename AdaptorCoord > | |
constexpr CK_TILE_HOST_DEVICE bool | adaptor_coordinate_is_valid_assuming_top_index_is_valid (const Adaptor &adaptor, const AdaptorCoord &coord) |
template<typename Adaptor , typename AdpatorCoord > | |
constexpr CK_TILE_HOST_DEVICE bool | adaptor_coordinate_is_valid (const Adaptor &adaptor, const AdpatorCoord &coord) |
template<typename TensorDesc , typename TopIndex > | |
constexpr CK_TILE_HOST_DEVICE auto | make_tensor_coordinate (const TensorDesc &tensor_desc, const TopIndex &idx_top) |
template<bool JudgeDoTransforms = true, typename TensorDesc , typename TensorCoord , typename Index > | |
constexpr CK_TILE_HOST_DEVICE void | move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const Index &coord_step) |
template<typename TensorDesc , typename TensorCoord > | |
constexpr CK_TILE_HOST_DEVICE bool | coordinate_has_valid_offset_assuming_top_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord) |
template<typename TensorDesc , typename TensorCoord > | |
constexpr CK_TILE_HOST_DEVICE bool | coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord) |
template<typename Adaptor , typename ElementSpaceSize > | |
constexpr CK_TILE_HOST_DEVICE auto | make_tensor_descriptor_from_adaptor (const Adaptor &adaptor, const ElementSpaceSize &element_space_size) |
template<typename OldTensorDescriptor , typename NewTransforms , typename NewLowerDimensionOldTopIdss , typename NewUpperDimensionNewTopIdss > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldTopIdss, NewUpperDimensionNewTopIdss) |
template<typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) |
template<typename... Lengths, typename... Strides, typename offset , index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_with_offset (const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, const offset &os, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) |
template<typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_packed (const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) |
template<typename... Lengths, typename... Strides, typename Offset , index_t GuaranteedLastDimensionVectorLength = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_packed_with_offset (const tuple< Lengths... > &lengths, const Offset &offset, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) |
template<typename... Lengths, typename Align > | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_descriptor_aligned (const tuple< Lengths... > &lengths, Align align) |
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Ts> | |
constexpr CK_TILE_HOST_DEVICE auto | make_tensor_view (DataType *__restrict__ p, const tensor_descriptor< Ts... > &desc) |
template<address_space_enum BufferAddressSpace = address_space_enum::generic, memory_operation_enum DstInMemOp = memory_operation_enum::set, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, typename... Strides, index_t GuaranteedLastDimensionVectorLength = -1, index_t GuaranteedLastDimensionVectorStride = -1, typename std::enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_view (DataType *__restrict__ p, const tuple< Lengths... > &lengths, const tuple< Strides... > &strides, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}, number< GuaranteedLastDimensionVectorStride >=number<-1 >{}) |
template<address_space_enum BufferAddressSpace = address_space_enum::generic, amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default, typename DataType , typename... Lengths, index_t GuaranteedLastDimensionVectorLength = -1> | |
constexpr CK_TILE_HOST_DEVICE auto | make_naive_tensor_view_packed (DataType *__restrict__ p, const tuple< Lengths... > &lengths, number< GuaranteedLastDimensionVectorLength >=number<-1 >{}) |
template<typename OldTensorView , typename NewTransforms , typename NewLowerDimensionOldVisibleIdss , typename NewUpperDimensionNewVisibleIdss > | |
constexpr CK_TILE_HOST_DEVICE auto | transform_tensor_view (const OldTensorView &old_tensor_view, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss) |
template<typename TensorView , typename TileLengths , typename DoPads > | |
constexpr CK_TILE_HOST_DEVICE auto | pad_tensor_view (const TensorView &tensor_view, const TileLengths &tile_lengths, DoPads) |
template<typename StaticTileDistributionEncoding_ > | |
constexpr CK_TILE_HOST_DEVICE auto | make_static_tile_distribution (StaticTileDistributionEncoding_) |
template<typename PsYs2XsAdaptor_ , typename Ys2DDescriptor_ , typename StaticTileDistributionEncoding_ , typename TileDistributionDetail_ > | |
CK_TILE_HOST_DEVICE void | print (const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > &distribution) |
template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ > | |
CK_TILE_HOST_DEVICE void | print (const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail &detail_obj) |
template<typename RsLengths_ , typename HsLengthss_ , typename Ps2RHssMajor_ , typename Ps2RHssMinor_ , typename Ys2RHsMajor_ , typename Ys2RHsMinor_ > | |
CK_TILE_HOST_DEVICE void | print (const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > &encoding) |
template<typename InOutElementFunc , typename... InOutDstrTensors, typename = std::enable_if_t<std::conjunction_v< std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>> | |
CK_TILE_DEVICE void | tile_elementwise_inout (const InOutElementFunc &inout_element_func, InOutDstrTensors &... inout_dstr_tensors) |
template<typename InElementFunc , typename... InTensor, typename = std::enable_if_t< std::conjunction_v<std::negation<std::is_same<InTensor, null_tensor>>...>>> | |
CK_TILE_DEVICE auto | tile_elementwise_in (const InElementFunc &in_element_func, const InTensor &... in_dstr_tensors) |
template<typename InElementFunc , typename Tuple , size_t... I> | |
CK_TILE_DEVICE auto | tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t, std::index_sequence< I... >) |
Template function that "unpacks" a tuple and applies an element-wise operation. More... | |
template<typename InElementFunc , typename Tuple > | |
CK_TILE_DEVICE auto | tile_elementwise_inout_unpack (const InElementFunc &in_element_func, const Tuple &t) |
Template function that "unpacks" a tuple and applies an element-wise operation. More... | |
template<typename DstrTensors , typename T > | |
CK_TILE_DEVICE void | set_tile (DstrTensors &dstr_tensor, const T &value) |
template<typename T > | |
CK_TILE_DEVICE void | set_tile (null_tensor &, const T &) |
template<typename DstrTensors , index_t v, bool skip_subdword_opt = false> | |
CK_TILE_DEVICE void | set_tile (DstrTensors &dstr_tensor, number< v >, bool_constant< skip_subdword_opt >={}) |
template<index_t v> | |
CK_TILE_DEVICE void | set_tile (null_tensor &, number< v >) |
template<typename DstrTensors > | |
CK_TILE_DEVICE void | clear_tile (DstrTensors &dstr_tensor) |
template<typename DstType , typename SrcTensor > | |
CK_TILE_DEVICE auto | cast_tile (const SrcTensor &src_tensor) |
template<typename InOutElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>> | |
CK_TILE_DEVICE void | tile_elementwise_inout (const InOutElementFunc &, MaybeNullTensor &&...) |
template<typename InElementFunc , typename... MaybeNullTensor, typename = std::enable_if_t< std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>> | |
CK_TILE_DEVICE auto | tile_elementwise_in (const InElementFunc &, MaybeNullTensor &&...) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, number< HsGatherDim >={}, number< NumCoord >={}) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={}) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , index_t HsGatherDim> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, number< HsGatherDim >={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename StaticPageIndexArray_ , typename StaticValidArray_ , index_t HsGatherDim = 0, index_t NumCoord = 1> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, const StaticPageIndexArray_ &page_idx, const StaticValidArray_ &valids, number< HsGatherDim >={}, number< NumCoord >={}) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={}) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution , typename StaticPageIndexArray , typename StaticValidArray , index_t HsGatherDim> | |
constexpr CK_TILE_DEVICE auto | make_tile_scatter_gather (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution, const StaticPageIndexArray &page_idx, const StaticValidArray &valids, number< HsGatherDim >={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1> | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord = 1> | |
CK_TILE_DEVICE auto | make_tile_window_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, number< NumCoord >={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , index_t NumCoord> | |
CK_TILE_DEVICE void | move_tile_window (tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > &window, const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex &step) |
template<typename TensorView_ , typename WindowLengths_ > | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin) |
template<typename TensorView , typename WindowLengths > | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const multi_index< TensorView::get_num_of_dimension()> &origin, const StaticTileDistribution &tile_distribution) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | |
constexpr CK_TILE_DEVICE auto | make_tile_window (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution) |
template<typename TensorView , typename WindowLengths , typename StaticTileDistribution > | |
constexpr CK_TILE_DEVICE auto | make_tile_window_raw (const tile_window_with_static_lengths< TensorView, WindowLengths > &tile_window, const StaticTileDistribution &tile_distribution) |
template<typename TensorView_ , typename WindowLengths_ > | |
CK_TILE_DEVICE void | move_tile_window (tile_window_with_static_lengths< TensorView_, WindowLengths_ > &window, const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex &step) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>> | |
constexpr CK_TILE_DEVICE auto | make_tile_window_linear (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) |
template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>> | |
constexpr CK_TILE_DEVICE auto | make_tile_window_linear (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>> | |
CK_TILE_DEVICE auto | make_tile_window_linear_raw (const TensorView_ &tensor_view, const WindowLengths_ &window_lengths, const multi_index< TensorView_::get_num_of_dimension()> &origin, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) |
template<typename TileWindow_ , typename StaticTileDistribution_ , typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>> | |
constexpr CK_TILE_DEVICE auto | make_tile_window_linear_raw (const TileWindow_ &tile_window, const StaticTileDistribution_ &tile_distribution, LinearBottomDims_={}) |
template<typename TensorView_ , typename WindowLengths_ , typename StaticTileDistribution_ , typename LinearBottomDims_ > | |
CK_TILE_DEVICE void | move_tile_window (tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > &window, const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex &step) |
template<typename TileWindow_ > | |
CK_TILE_DEVICE void | move_tile_window (TileWindow_ &window, const typename TileWindow_::BottomTensorIndex &step) |
template<typename LdsTileWindow_ > | |
CK_TILE_DEVICE auto | get_async_store_smem_info (LdsTileWindow_ &&lds_tile) |
template<typename OutTensor , typename InTensor > | |
CK_TILE_DEVICE void | transpose_tile2d (OutTensor &out, const InTensor &in) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename DataType_ > | |
CK_TILE_DEVICE void | update_tile (tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > &tile_window_tmp, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true> | |
CK_TILE_DEVICE void | update_tile (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , index_t NumCoord, typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE void | update_tile_raw (tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) |
template<typename BottomTensorView_ , typename WindowLengths_ , typename TileDistribution_ , typename LinearBottomDims_ , typename DataType_ , index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false> | |
CK_TILE_DEVICE auto | update_tile_raw (tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > &tile_window, const static_distributed_tensor< DataType_, TileDistribution_ > &dstr_tensor, number< i_access >={}, bool_constant< oob_conditional_check >={}, bool_constant< pre_nop >={}) |
template<typename Y , typename X > | |
constexpr CK_TILE_HOST_DEVICE Y | bit_cast (const X &x) |
template<auto... val> | |
constexpr void | CK_PRINT () |
template<size_t... Idx> | |
constexpr std::tuple< std::integral_constant< size_t, Idx >... > | makeTuple (std::index_sequence< Idx... >) noexcept |
constexpr size_t | constexpr_strlen (const char *c) |
template<typename... Args> | |
void | CK_TILE_ERROR (Args &&... args) noexcept |
template<class EnvVar > | |
const std::string & | EnvGetString (EnvVar) |
template<class EnvVar > | |
bool | EnvIsEnabled (EnvVar) |
template<class EnvVar > | |
bool | EnvIsDisabled (EnvVar) |
template<class EnvVar > | |
uint64_t | EnvValue (EnvVar) |
template<class EnvVar > | |
bool | EnvIsUnset (EnvVar) |
template<class EnvVar > | |
void | EnvUnset (EnvVar) |
template<typename EnvVar , typename ValueType > | |
void | UpdateEnvVar (EnvVar, const ValueType &val) |
Updates the cached value of an environment variable. More... | |
template<typename EnvVar > | |
void | UpdateEnvVar (EnvVar, const std::string_view &val) |
template<typename F , typename X > | |
constexpr CK_TILE_HOST_DEVICE auto | unpack (F &&f, X &&x) |
template<typename F , typename X , typename Y > | |
constexpr CK_TILE_HOST_DEVICE auto | unpack2 (F &&f, X &&x, Y &&y) |
template<bool predicate, typename X , typename Y > | |
constexpr auto | conditional_expr (X &&x, Y &&y) |
template<typename T > | |
CK_TILE_HOST_DEVICE void | print (const T &) |
template<> | |
CK_TILE_HOST_DEVICE void | print (const int &value) |
Specialization for int. More... | |
template<> | |
CK_TILE_HOST_DEVICE void | print (const float &value) |
Specialization for float. More... | |
template<> | |
CK_TILE_HOST_DEVICE void | print (const double &value) |
Specialization for double. More... | |
template<> | |
CK_TILE_HOST_DEVICE void | print (const long &value) |
Specialization for long. More... | |
template<> | |
CK_TILE_HOST_DEVICE void | print (const unsigned int &value) |
Specialization for unsigned int. More... | |
template<> | |
CK_TILE_HOST_DEVICE void | print (const char &value) |
Specialization for char. More... | |
template<typename T , size_t N> | |
CK_TILE_HOST_DEVICE void | print (const T(&value)[N]) |
Specialization for array. More... | |
template<typename PY , typename PX , typename std::enable_if< std::is_pointer_v< PY > &&std::is_pointer_v< PX >, bool >::type = false> | |
CK_TILE_HOST_DEVICE PY | c_style_pointer_cast (PX p_x) |
template<typename... Ts> | |
__host__ __device__ | composes (Ts &&...) -> composes< remove_cvref_t< Ts >... > |
FIXME: create macro to replace 'host device' and nothing more. More... | |
template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType> | |
CK_TILE_HOST double | get_relative_threshold (const int number_of_accumulations=1) |
Calculate relative error threshold for numerical comparisons. More... | |
template<typename ComputeDataType , typename OutDataType , typename AccDataType = ComputeDataType> | |
CK_TILE_HOST double | get_absolute_threshold (const double max_possible_num, const int number_of_accumulations=1) |
Calculate absolute error threshold for numerical comparisons. More... | |
template<typename T > | |
std::ostream & | operator<< (std::ostream &os, const std::vector< T > &v) |
Stream operator overload for vector output. More... | |
template<typename Range , typename RefRange > | |
CK_TILE_HOST bool | check_size_mismatch (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!") |
Check for size mismatch between output and reference ranges. More... | |
CK_TILE_HOST void | report_error_stats (int err_count, double max_err, std::size_t total_size) |
Report error statistics for numerical comparisons. More... | |
template<typename Range , typename RefRange > | |
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_floating_point_v< ranges::range_value_t< Range > > &&!std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-5, double atol=3e-6, bool allow_infinity_ref=false) |
Check errors between floating point ranges using the specified tolerances. More... | |
template<typename Range , typename RefRange > | |
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, bf16_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) |
Check errors between floating point ranges using the specified tolerances. More... | |
template<typename Range , typename RefRange > | |
std::enable_if< std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange > > &&std::is_same_v< ranges::range_value_t< Range >, half_t >, bool >::type CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) |
Check errors between half precision floating point ranges. More... | |
template<typename Range , typename RefRange > | |
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_integral_v< ranges::range_value_t< Range >> &&!std::is_same_v< ranges::range_value_t< Range >, bf16_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double=0, double atol=0) |
Check errors between integer ranges. More... | |
template<typename Range , typename RefRange > | |
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, fp8_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", unsigned max_rounding_point_distance=1, double atol=1e-1, bool allow_infinity_ref=false) |
Check errors between FP8 ranges. More... | |
template<typename Range , typename RefRange > | |
std::enable_if_t<(std::is_same_v< ranges::range_value_t< Range >, ranges::range_value_t< RefRange >> &&std::is_same_v< ranges::range_value_t< Range >, bf8_t >), bool > CK_TILE_HOST | check_err (const Range &out, const RefRange &ref, const std::string &msg="Error: Incorrect results!", double rtol=1e-3, double atol=1e-3, bool allow_infinity_ref=false) |
Check errors between BF8 ranges. More... | |
template<typename... Ts> | |
auto | concat (const Ts &... xs) -> std::enable_if_t<!AllConvertibleToStringView< Ts... >, std::string > |
template<std::size_t N> | |
constexpr std::size_t | getSize (char(&)[N]) noexcept |
template<std::size_t N> | |
constexpr std::size_t | getSize (const char(&)[N]) noexcept |
constexpr std::size_t | getSize (const char *s) noexcept |
constexpr std::size_t | getSize (const char &) noexcept |
std::size_t | getSize (const std::string &s) noexcept |
constexpr std::size_t | getSize (const std::string_view &s) noexcept |
template<typename... Ts> | |
auto | concatInto (std::string &result, const Ts &... xs) -> std::enable_if_t< AllConvertibleToStringView< Ts... >, void > |
template<typename Sep , typename First , typename... Rest> | |
auto | concat (Sep sep, const First &first, const Rest &... rest) -> std::enable_if_t< AllConvertibleToStringView< First, Rest... >, std::string > |
template<typename T > | |
__global__ void | set_buffer_value (T *p, T x, uint64_t buffer_element_size) |
constexpr unsigned int | fnv1a_hash (std::string_view str, unsigned int h=2166136261u) |
std::string | get_device_name () |
bool | is_gfx11_supported () |
bool | is_gfx12_supported () |
bool | is_load_tr_supported () |
CK_TILE_HOST void | hip_check_error (hipError_t x) |
template<typename Range > | |
CK_TILE_HOST std::ostream & | LogRange (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0) |
template<typename T , typename Range > | |
CK_TILE_HOST std::ostream & | LogRangeAsType (std::ostream &os, Range &&range, std::string delim, int precision=std::cout.precision(), int width=0) |
template<typename F , typename T , std::size_t... Is> | |
CK_TILE_HOST auto | call_f_unpack_args_impl (F f, T args, std::index_sequence< Is... >) |
template<typename F , typename T > | |
CK_TILE_HOST auto | call_f_unpack_args (F f, T args) |
template<typename F , typename T , std::size_t... Is> | |
CK_TILE_HOST auto | construct_f_unpack_args_impl (T args, std::index_sequence< Is... >) |
template<typename F , typename T > | |
CK_TILE_HOST auto | construct_f_unpack_args (F, T args) |
template<typename New2Old > | |
CK_TILE_HOST HostTensorDescriptor | transpose_host_tensor_descriptor_given_new2old (const HostTensorDescriptor &a, const New2Old &new2old) |
template<typename F , typename... Xs> | |
CK_TILE_HOST auto | make_ParallelTensorFunctor (F f, Xs... xs) |
template<bool is_row_major> | |
auto | host_tensor_descriptor (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >) |
Creates a host tensor descriptor with specified dimensions and layout. More... | |
template<bool is_row_major> | |
auto | get_default_stride (std::size_t row, std::size_t col, std::size_t stride, bool_constant< is_row_major >) |
template<int MinBlockPerCu, typename Kernel , typename... Args> | |
__global__ void | kentry (Args... args) |
template<int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU, typename KernelImpl , typename... Args> | |
CK_TILE_HOST auto | make_kernel (KernelImpl, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) |
template<typename... Callables> | |
CK_TILE_HOST void | launch_and_check (const stream_config &sc, Callables &&... callables) |
template<typename TimerType , typename PreprocessFunc > | |
CK_TILE_HOST double | preprocess_profiling_impl (TimerType timer, const stream_config &s, PreprocessFunc preprocess) |
template<typename TimerType , typename CallablesFunc , typename PreprocessFunc = std::nullptr_t> | |
CK_TILE_HOST double | timing_loop_impl (TimerType timer, const stream_config &s, CallablesFunc &&callables_func, PreprocessFunc preprocess=nullptr) |
template<typename... Callables> | |
CK_TILE_HOST float | launch_kernel (const stream_config &s, Callables &&... callables) |
template<typename PreprocessFunc , typename... Callables> | |
CK_TILE_HOST float | launch_kernel_time_mask (const stream_config &s, PreprocessFunc preprocess, Callables &&... callables) |
template<typename DataType , typename RandValOutputDataType > | |
CK_TILE_HOST void | reference_batched_dropout (HostTensor< DataType > &in_out_b_m_n, const HostTensor< RandValOutputDataType > &randval_b_m_n, const uint8_t &p_undrop_in_uint8_t, const float scale) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename BinaryElementOp = ck_tile::plus<AccDataType>> | |
CK_TILE_HOST void | reference_batched_elementwise (const HostTensor< ADataType > &a_b_m_n, const HostTensor< BDataType > &b_b_m_n, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const BinaryElementOp &binary_element_op={}) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity> | |
CK_TILE_HOST void | reference_batched_gemm (const HostTensor< ADataType > &a_b_m_k, const HostTensor< BDataType > &b_b_n_k, HostTensor< CDataType > &c_b_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={}) |
template<typename CDataType , typename MaskingType > | |
CK_TILE_HOST void | reference_batched_masking (HostTensor< CDataType > &c_b_m_n, const MaskingType &mask) |
template<typename DataType , typename ComputeDataType = float> | |
CK_TILE_HOST void | reference_batched_rotary_position_embedding (const HostTensor< DataType > &input_bsd, const HostTensor< DataType > &cos_sd, const HostTensor< DataType > &sin_sd, bool interleaved, HostTensor< DataType > &output_bsd, bool use_1_row_sin_cos=false) |
template<typename ADataType , typename CompDataType , typename BDataType , typename CompElementOp = ck_tile::identity> | |
CK_TILE_HOST void | reference_batched_softmax (const HostTensor< ADataType > &a_b_m_n, HostTensor< BDataType > &b_b_m_n, const CompElementOp &comp_element_op={}, std::optional< std::reference_wrapper< HostTensor< CompDataType >>> lse_b_m=std::nullopt) |
template<typename Type > | |
CK_TILE_HOST void | reference_batched_transpose (const HostTensor< Type > &x, HostTensor< Type > &y, std::string layout_in="NCHW", std::string layout_out="NHWC") |
template<typename ADataType , typename BDataType , typename ComputeDataType , typename ElementOp > | |
CK_TILE_HOST void | reference_unary_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b, ElementOp element_op) |
template<typename ADataType , typename BDataType , typename CDataType , typename ComputeDataType , typename ElementOp > | |
CK_TILE_HOST void | reference_binary_elementwise (const HostTensor< ADataType > &a, const HostTensor< BDataType > &b, HostTensor< CDataType > &c, ElementOp element_op) |
template<typename AccDataType , typename Activation , typename ADataType , typename GDataType , typename DDataType , typename ODataType , typename AScaleDataType , typename GScaleDataType , typename DScaleDataType , typename YSmoothScaleDataType , typename TopkWeightDataType , typename IndexDataType > | |
void | reference_fused_moe (const ck_tile::HostTensor< ADataType > &a_host, const ck_tile::HostTensor< GDataType > &g_host, const ck_tile::HostTensor< DDataType > &d_host, const ck_tile::HostTensor< AScaleDataType > &sa_host, const ck_tile::HostTensor< GScaleDataType > &sg_host, const ck_tile::HostTensor< DScaleDataType > &sd_host, const ck_tile::HostTensor< YSmoothScaleDataType > &sy_host, ck_tile::HostTensor< ODataType > &o_host, const ck_tile::HostTensor< IndexDataType > &sorted_token_ids_host, const ck_tile::HostTensor< TopkWeightDataType > &sorted_weight_host, const ck_tile::HostTensor< IndexDataType > &sorted_expert_ids_host, const ck_tile::HostTensor< IndexDataType > &num_sorted_tiles_host, const ck_tile::HostTensor< IndexDataType > &token_ids_host, ck_tile::index_t block_m, ck_tile::index_t tokens, ck_tile::index_t experts, ck_tile::index_t hidden_size, ck_tile::index_t intermediate_size, ck_tile::index_t topk, ck_tile::index_t gate_only) |
template<typename ADataType , typename QDataType , typename BDataType , typename AccDataType , typename CDataType , uint32_t QuantGroupSize, bool aquant, typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity> | |
CK_TILE_HOST void | reference_gemm_quant (const HostTensor< ADataType > &a_m_k, const HostTensor< QDataType > &q, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={}) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename AElementOp = ck_tile::identity, typename BElementOp = ck_tile::identity, typename ACCElementOp = ck_tile::identity> | |
CK_TILE_HOST void | reference_gemm (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, HostTensor< CDataType > &c_m_n, const AElementOp &a_element_op={}, const BElementOp &b_element_op={}, const ACCElementOp &acc_element_op={}) |
template<typename ADataType , typename BDataType , typename DsDataType , typename AccDataType , typename CDataType , typename ACCElementOp , typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>> | |
CK_TILE_HOST void | reference_gemm_multiple_d (const HostTensor< ADataType > &a_m_k, const HostTensor< BDataType > &b_k_n, const std::array< HostTensor< DDataType >, DsDataType::size()> &ds_m_n, HostTensor< CDataType > &c_m_n, const ACCElementOp &acc_element_op={}) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | |
__global__ void | naive_gemm_kernel (ADataType *A, BDataType *B, CDataType *C, ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, ck_tile::index_t strideA, ck_tile::index_t strideB, ck_tile::index_t strideC) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | |
void | reference_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c) |
template<typename ADataType , typename BDataType , typename AccDataType , typename CDataType , typename LayoutA , typename LayoutB , typename LayoutC > | |
void | reference_batched_gemm_gpu (ADataType *a_ptr, BDataType *b_ptr, CDataType *c_ptr, index_t M, index_t N, index_t K, index_t stride_a, index_t stride_b, index_t stride_c, index_t batch_stride_A, index_t batch_stride_B, index_t batch_stride_C, index_t batch_count) |
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType > | |
CK_TILE_HOST void | reference_grouped_conv_bwd_data (HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >) |
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType > | |
CK_TILE_HOST void | reference_grouped_conv_bwd_weight (const HostTensor< InDataType > &input, HostTensor< WeiDataType > &weight, const HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >) |
template<ck_tile::index_t NDimSpatial, typename InDataType , typename WeiDataType , typename OutDataType > | |
CK_TILE_HOST void | reference_grouped_conv_fwd (const HostTensor< InDataType > &input, const HostTensor< WeiDataType > &weight, HostTensor< OutDataType > &output, std::vector< ck_tile::long_index_t > conv_strides, std::vector< ck_tile::long_index_t > conv_dilations, std::vector< ck_tile::long_index_t > in_left_pads, std::vector< ck_tile::long_index_t >) |
template<typename InDataType , typename OutDataType , index_t NDimSpatial> | |
CK_TILE_HOST void | reference_im2col (const HostTensor< InDataType > &in_host, HostTensor< OutDataType > &out_host, const ck_tile::conv::ConvParam &conv_params) |
template<typename XDataType , typename GammaDataType , typename BetaDataType , typename ComputeDataType , typename YDataType , typename MeanDataType , typename InvStdDataType , typename Epilogue = reference_layernorm2d_default_epilogue> | |
void | reference_layernorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, const HostTensor< BetaDataType > &beta_n, HostTensor< YDataType > &y_m_n, HostTensor< MeanDataType > &mean_m, HostTensor< InvStdDataType > &invStd_m, ComputeDataType epsilon, Epilogue epilogue_functor={}) |
template<typename WeightType , typename IndexType = index_t> | |
CK_TILE_HOST void | reference_moe_sorting (const HostTensor< IndexType > &topk_ids, const HostTensor< WeightType > &weights, const HostTensor< IndexType > &local_expert_mask, HostTensor< IndexType > &p_sorted_token_ids, HostTensor< WeightType > &sorted_weight, HostTensor< IndexType > &sorted_expert_ids, index_t &unit_cnt, const index_t experts, const index_t unit_size, const index_t tokens, bool local_expert_masking, bool skip_experts_with_zero_token=true) |
template<typename DataType > | |
CK_TILE_HOST void | reference_permute (const HostTensor< DataType > &x, HostTensor< DataType > &y, std::vector< index_t > perm) |
template<typename DataType > | |
CK_TILE_HOST auto | reference_permute (const HostTensor< DataType > &x, std::vector< index_t > perm) |
template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp > | |
CK_TILE_HOST void | reference_reduce (const HostTensor< XDataType > &x_m_n, HostTensor< YDataType > &y_m, ReduceOp reduce_op) |
template<typename XDataType , typename ComputeDataType , typename YDataType , typename ReduceOp , typename KeptDim , typename ReduceDims > | |
CK_TILE_HOST void | reference_reduce (const HostTensor< XDataType > &x_tensor, HostTensor< YDataType > &y_tensor, ReduceOp reduce_op, KeptDim kept_dim, ReduceDims reduce_dims) |
template<typename XDataType , typename GammaDataType , typename ComputeDataType , typename YDataType , typename InvRmsDataType , typename UnquantYDataType , typename Epilogue = reference_rmsnorm2d_default_epilogue> | |
void | reference_rmsnorm2d_fwd (const HostTensor< XDataType > &x_m_n, const HostTensor< GammaDataType > &gamma_n, HostTensor< YDataType > &y_m_n, HostTensor< InvRmsDataType > &invRms_m, HostTensor< UnquantYDataType > &unquant_y_m_n, ComputeDataType epsilon, Epilogue epilogue_functor={}) |
template<typename XDataType , typename ScaleDataType , typename QXDataType > | |
CK_TILE_HOST void | reference_rowwise_quantization2d (const HostTensor< XDataType > &x_m_n, const HostTensor< ScaleDataType > &scale_m, HostTensor< QXDataType > &qx_m_n) |
template<typename InputType , typename ComputeType , typename OutputType = ComputeType> | |
CK_TILE_HOST void | reference_softmax (const HostTensor< InputType > &x, HostTensor< OutputType > &y, index_t dim=-1) |
template<typename InputType , typename ComputeType , typename OutputType = ComputeType> | |
CK_TILE_HOST auto | reference_softmax (const HostTensor< InputType > &x, index_t dim=-1) |
template<typename DataType , typename IndexType = index_t> | |
CK_TILE_HOST void | reference_topk (const HostTensor< DataType > &x, HostTensor< DataType > &y_values, HostTensor< IndexType > &y_indices, index_t k, index_t dim=-1, bool largest=true, bool sorted=true) |
template<typename DataType , typename IndexType = index_t> | |
CK_TILE_HOST auto | reference_topk (const HostTensor< DataType > &x, index_t k, index_t dim=-1, bool largest=true, bool sorted=true) |
template<typename ADataType , typename BDataType > | |
void | reference_transpose_elementwise (const HostTensor< ADataType > &a, HostTensor< BDataType > &b) |
void | flush_icache () |
template<typename ADataType_ , typename BDataType_ > | |
std::string | gemm_prec_str () |
constexpr CK_TILE_HOST_DEVICE auto | make_generic_attention_mask_coordinates_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true) |
template<typename MaskType > | |
constexpr CK_TILE_HOST_DEVICE auto | make_generic_attention_mask_from_lr_window (index_t left_size, index_t right_size, index_t y_total, index_t x_total, bool is_top_left=true) |
template<typename DataType , bool RowMajor = true, unsigned LogMaxSadOprndSize = 16> | |
CK_TILE_HOST_DEVICE auto | make_alibi_from_lr_mask (DataType slope, index_t window_left_size, index_t window_right_size, index_t y_total, index_t x_total, GenericAttentionMaskEnum mask_enum) |
template<typename DataType > | |
CK_TILE_HOST std::vector< DataType > | get_alibi_slopes (ck_tile::index_t nheads) |
template<typename TensorView > | |
CK_TILE_HOST_DEVICE auto | make_page_block_navigator (const TensorView &tensor_view) |
template<typename DataType , index_t VirtualDim, typename TensorView > | |
CK_TILE_HOST_DEVICE auto | make_page_block_navigator (copy_const_t< DataType, void > *physical_blocks, long_index_t block_stride, long_index_t fixed_offset, const int32_t *physical_block_indices, index_t num_blocks, index_t page_block_size, const TensorView &complete_view, const TensorView &last_view) |
constexpr CK_TILE_HOST auto | moe_sorting_get_smem_row_col (int tokens_, int num_experts_) |
CK_TILE_HOST index_t | moe_sorting_get_sub_token (int tokens_, int num_experts_) |
CK_TILE_HOST bool | moe_sorting_is_oneshot (int tokens_, int num_experts_) |
CK_TILE_HOST index_t | moe_sorting_mp_get_workspace_size (int tokens_, int num_experts_, int topk_) |
CK_TILE_HOST index_t | moe_sorting_get_workspace_size (int tokens_, int num_experts_, int topk_, int dispatch_policy_) |
template<typename ADataType , typename BDataType , typename AccDataType , index_t M_Warp_Tile, index_t N_Warp_Tile, index_t K_Warp_Tile> | |
CK_TILE_HOST bool | check_wmma_supported () |
CK_TILE_HOST std::string | getConvSpecializationString (const ConvolutionSpecialization &s) |
template<typename BlockShape > | |
constexpr CK_TILE_DEVICE index_t | block_tile_welford_calculate_max_count (int row_size) |
template<typename VarDistributedTensor_ , bool FastFdiv_ = false> | |
constexpr CK_TILE_DEVICE void | block_tile_welford_post_scale_var (VarDistributedTensor_ &var_tensor, int count, bool_constant< FastFdiv_ >={}) |
template<typename T , bool kFastFDiv = false> | |
CK_TILE_DEVICE void | welford_update (T &mean, T &var, T x, int count, bool_constant< kFastFDiv >={}) |
template<typename AccDistributedTensor_ , typename ReduceFunc , bool WithBroadcast = true, bool CrossWarp = true> | |
CK_TILE_DEVICE void | block_tile_reduce_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func, bool_constant< WithBroadcast >={}, bool_constant< CrossWarp >={}) |
template<typename AccDistributedTensor_ , typename ReduceFunc > | |
CK_TILE_DEVICE void | block_tile_reduce_xor_sync (AccDistributedTensor_ &acc_tensor, const ReduceFunc &reduce_func) |
template<typename AccDistributedTensor_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc > | |
CK_TILE_DEVICE void | block_tile_reduce (AccDistributedTensor_ &acc_tensor, const InDistributedTensor_ &in_tensor, sequence< InReduceDims... >, const ReduceFunc &reduce_func) |
template<typename AccDataType_ , typename InDistributedTensor_ , index_t... InReduceDims, typename ReduceFunc , typename InDataType_ > | |
CK_TILE_DEVICE auto | block_tile_reduce (const InDistributedTensor_ &in_tensor, sequence< InReduceDims... > in_reduce_dims, const ReduceFunc &reduce_func, const InDataType_ &reduce_init) |
template<typename T > | |
CK_TILE_HOST_DEVICE_EXTERN | BlockReduce2D (const T &, const typename T::DataType &) -> BlockReduce2D< T > |
CK_TILE_HOST float | naive_attention_fwd (naive_attention_fwd_traits t, naive_attention_fwd_args a, ck_tile::stream_config s) |
Variables | |
template<typename T > | |
constexpr bool | is_constant_v = is_constant<T>::value |
Right | |
template<typename T = double> | |
constexpr T | log2e_v = log2e<T>::value |
template<typename T = double> | |
constexpr T | log2e_rcp_v = 1. / log2e<T>::value |
template<typename T > | |
constexpr bool | is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value |
template<typename T > | |
constexpr bool | is_tile_window_with_static_distribution_v |
Helper variable template to check if a type is a tile window with static distribution. More... | |
template<typename T > | |
constexpr bool | is_tile_window_with_static_lengths_v |
Helper variable template to check if a type is a tile window with static lengths. More... | |
template<typename T > | |
constexpr bool | is_tile_window_linear_v = is_tile_window_linear<T>::value |
Helper variable template to check if a type is a linear tile window. More... | |
constexpr detail::ignore_t | ignore |
template<typename T > | |
constexpr bool | is_static_v = is_static<T>::value |
constexpr int | ERROR_DETAIL_LIMIT = 5 |
Maximum number of error values to display when checking errors. More... | |
template<typename... Ts> | |
constexpr bool | AllConvertibleToStringView |
constexpr uint32_t | CUSTOM_MASK = 1U |
constexpr uint32_t | SLIDING_WINDOW = 2U |
constexpr uint32_t | LOGITS_SOFT_CAP = 4U |
constexpr uint32_t | ALIBI = 8U |
template<typename Arch , typename AType , typename BType , typename CType , index_t warp_m, index_t warp_n, index_t warp_k> | |
constexpr bool | has_wmma_traits_v |
Typedef Documentation
◆ BF16
using ck_tile::BF16 = typedef ck_tile::bf16_t |
16-bit brain floating point type
◆ bf16_raw_t
using ck_tile::bf16_raw_t = typedef uint16_t |
◆ bf16_t
using ck_tile::bf16_t = typedef bfloat16_t |
◆ bf16x16_t
using ck_tile::bf16x16_t = typedef bfloat16_t |
◆ bf16x2_t
typedef bfloat16_t ck_tile::bf16x2_t |
◆ bf16x32_t
using ck_tile::bf16x32_t = typedef bfloat16_t |
◆ bf16x4_t
using ck_tile::bf16x4_t = typedef bfloat16_t |
◆ bf16x64_t
using ck_tile::bf16x64_t = typedef bfloat16_t |
◆ bf16x8_t
using ck_tile::bf16x8_t = typedef bfloat16_t |
◆ BF8
using ck_tile::BF8 = typedef ck_tile::bf8_t |
8-bit brain floating point type
◆ bf8_raw_t
using ck_tile::bf8_raw_t = typedef uint8_t |
◆ bf8_t
using ck_tile::bf8_t = typedef unsigned _BitInt(8) |
◆ bf8x16_t
using ck_tile::bf8x16_t = typedef bf8_t |
◆ bf8x2_t
using ck_tile::bf8x2_t = typedef bf8_t |
◆ bf8x32_t
using ck_tile::bf8x32_t = typedef bf8_t |
◆ bf8x4_t
using ck_tile::bf8x4_t = typedef bf8_t |
◆ bf8x64_t
using ck_tile::bf8x64_t = typedef bf8_t |
◆ bf8x8_t
using ck_tile::bf8x8_t = typedef bf8_t |
◆ bfloat16_t
using ck_tile::bfloat16_t = typedef ushort |
◆ BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy
using ck_tile::BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy = typedef BlockFmhaPipelineQXKSVSCustomPolicy< true, true, 3, 3> |
◆ BlockFmhaPipelineQRKSVSAsyncDefaultPolicy
using ck_tile::BlockFmhaPipelineQRKSVSAsyncDefaultPolicy = typedef BlockFmhaPipelineQXKSVSCustomPolicy< true, true, 3, 3> |
◆ BlockFmhaPipelineQRKSVSDefaultPolicy
using ck_tile::BlockFmhaPipelineQRKSVSDefaultPolicy = typedef BlockFmhaPipelineQXKSVSCustomPolicy< true, false, 1, 1> |
◆ bool_constant
using ck_tile::bool_constant = typedef constant<b> |
◆ copy_const_t
using ck_tile::copy_const_t = typedef typename copy_const<From, To>::type |
◆ Default2DAndDynamicQuantEpilogueTraits
using ck_tile::Default2DAndDynamicQuantEpilogueTraits = typedef DynamicQuantEpilogueTraits<kPadM_, kPadN_, UseSmoothInputScale_, UseRawStore_, UseMax3_> |
◆ default_linear_bottom_dims
using ck_tile::default_linear_bottom_dims = typedef typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(), TensorView_::get_num_of_dimension()>::type |
◆ DeviceIp
using ck_tile::DeviceIp = typedef remove_cvref_t<decltype(ck_tile::get_device_arch())> |
◆ e8m0_raw_t
using ck_tile::e8m0_raw_t = typedef typename e8m0_t::raw_type |
◆ e8m0_t
using ck_tile::e8m0_t = typedef e8m0_bexp_t |
◆ ext_vector_t
using ck_tile::ext_vector_t = typedef typename impl::ext_vector<T, N>::type |
◆ F16
using ck_tile::F16 = typedef ck_tile::half_t |
16-bit floating point (half precision) type
◆ F32
using ck_tile::F32 = typedef float |
32-bit floating point (single precision) type
◆ F8
using ck_tile::F8 = typedef ck_tile::fp8_t |
8-bit floating point type
◆ fp16_hip_t
using ck_tile::fp16_hip_t = typedef _Float16 |
◆ fp16_raw_t
typedef ushort ck_tile::fp16_raw_t |
◆ fp16_t
using ck_tile::fp16_t = typedef _Float16 |
◆ fp16x16_t
using ck_tile::fp16x16_t = typedef _Float16 |
◆ fp16x2_t
typedef _Float16 ck_tile::fp16x2_t |
◆ fp16x32_t
using ck_tile::fp16x32_t = typedef _Float16 |
◆ fp16x4_t
using ck_tile::fp16x4_t = typedef _Float16 |
◆ fp16x64_t
using ck_tile::fp16x64_t = typedef _Float16 |
◆ fp16x8_t
using ck_tile::fp16x8_t = typedef _Float16 |
◆ fp32_t
typedef float ck_tile::fp32_t |
◆ fp32x16_t
using ck_tile::fp32x16_t = typedef float |
◆ fp32x2_t
typedef float ck_tile::fp32x2_t |
◆ fp32x32_t
using ck_tile::fp32x32_t = typedef float |
◆ fp32x4_t
using ck_tile::fp32x4_t = typedef float |
◆ fp32x64_t
using ck_tile::fp32x64_t = typedef float |
◆ fp32x8_t
using ck_tile::fp32x8_t = typedef float |
◆ fp64_t
using ck_tile::fp64_t = typedef double |
◆ fp64x2_t
using ck_tile::fp64x2_t = typedef double |
◆ fp64x4_t
using ck_tile::fp64x4_t = typedef double |
◆ fp8_raw_t
using ck_tile::fp8_raw_t = typedef uint8_t |
◆ fp8_t
using ck_tile::fp8_t = typedef _BitInt(8) |
◆ fp8x16_t
using ck_tile::fp8x16_t = typedef fp8_t |
◆ fp8x2_t
using ck_tile::fp8x2_t = typedef fp8_t |
◆ fp8x32_t
using ck_tile::fp8x32_t = typedef fp8_t |
◆ fp8x4_t
using ck_tile::fp8x4_t = typedef fp8_t |
◆ fp8x64_t
using ck_tile::fp8x64_t = typedef fp8_t |
◆ fp8x8_t
using ck_tile::fp8x8_t = typedef fp8_t |
◆ GemmAQuantPipelineProblem
using ck_tile::GemmAQuantPipelineProblem = typedef GemmAQuantPipelineProblemBase<ADataType_, AQDataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, TransposeC_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_> |
◆ GemmBQuantPipelineProblem
using ck_tile::GemmBQuantPipelineProblem = typedef GemmBQuantPipelineProblemBase<ADataType_, BDataType_, BQDataType_, CDataType_, BlockGemmShape_, Traits_, QuantGroupSize_, ComputeDataType_, Scheduler_, HasHotLoop_, TailNum_> |
◆ GemmPipelineAGmemBGmemCRegV2DefaultPolicy
using ck_tile::GemmPipelineAGmemBGmemCRegV2DefaultPolicy = typedef GemmPipelineAGmemBGmemCRegV1DefaultPolicy |
◆ GemmPipelineProblem
using ck_tile::GemmPipelineProblem = typedef GemmPipelineProblemBase<ADataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_, ComputeDataType_, FixedVectorSize_, VectorSizeA_, VectorSizeB_> |
◆ GroupedConvBwdDataHostArgs
using ck_tile::GroupedConvBwdDataHostArgs = typedef GroupedConvHostArgs<void*, const void*, const void*> |
◆ GroupedConvBwdWeightHostArgs
using ck_tile::GroupedConvBwdWeightHostArgs = typedef GroupedConvHostArgs<const void*, void*, const void*> |
◆ GroupedConvFwdHostArgs
using ck_tile::GroupedConvFwdHostArgs = typedef GroupedConvHostArgs<const void*, const void*, void*> |
◆ half_t
using ck_tile::half_t = typedef _Float16 |
◆ has_same_scalar_type
using ck_tile::has_same_scalar_type = typedef std::is_same<typename vector_traits<remove_cvref_t<X> >::scalar_type, typename vector_traits<remove_cvref_t<Y> >::scalar_type> |
◆ I32
using ck_tile::I32 = typedef int32_t |
32-bit signed integer type
◆ I8
using ck_tile::I8 = typedef int8_t |
8-bit signed integer type
◆ index_t
using ck_tile::index_t = typedef int32_t |
◆ InputTileDistributionTraits
using ck_tile::InputTileDistributionTraits = typedef TransposeTileDistributionTraits<TileDistributionEncoding_, DataType_, Policy, true> |
◆ int16x16_t
using ck_tile::int16x16_t = typedef int16_t |
◆ int16x2_t
using ck_tile::int16x2_t = typedef int16_t |
◆ int16x32_t
using ck_tile::int16x32_t = typedef int16_t |
◆ int16x4_t
using ck_tile::int16x4_t = typedef int16_t |
◆ int16x64_t
using ck_tile::int16x64_t = typedef int16_t |
◆ int16x8_t
using ck_tile::int16x8_t = typedef int16_t |
◆ int32_t
using ck_tile::int32_t = typedef int32_t |
◆ int32x16_t
using ck_tile::int32x16_t = typedef int32_t |
◆ int32x2_t
using ck_tile::int32x2_t = typedef int32_t |
◆ int32x32_t
using ck_tile::int32x32_t = typedef int32_t |
◆ int32x4_t
using ck_tile::int32x4_t = typedef int32_t |
◆ int32x64_t
using ck_tile::int32x64_t = typedef int32_t |
◆ int32x8_t
using ck_tile::int32x8_t = typedef int32_t |
◆ int8_t
typedef int8_t ck_tile::int8_t |
◆ int8x16_t
using ck_tile::int8x16_t = typedef int8_t |
◆ int8x2_t
typedef int8_t ck_tile::int8x2_t |
◆ int8x32_t
using ck_tile::int8x32_t = typedef int8_t |
◆ int8x4_t
using ck_tile::int8x4_t = typedef int8_t |
◆ int8x64_t
using ck_tile::int8x64_t = typedef int8_t |
◆ int8x8_t
using ck_tile::int8x8_t = typedef int8_t |
◆ is_detected
using ck_tile::is_detected = typedef typename detail::detector<nonesuch, void, Op, Args...>::value_t |
◆ is_known_at_compile_time
using ck_tile::is_known_at_compile_time = typedef is_static<T> |
◆ is_static
using ck_tile::is_static = typedef impl::is_static_impl<remove_cvref_t<T> > |
◆ is_tuple
using ck_tile::is_tuple = typedef decltype(std::declval<T&>().IsTuple()) |
◆ iter_difference_t
using ck_tile::iter_difference_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::difference_type |
◆ iter_reference_t
using ck_tile::iter_reference_t = typedef decltype(*std::declval<T&>()) |
◆ iter_value_t
using ck_tile::iter_value_t = typedef typename std::iterator_traits<remove_cvref_t<T> >::value_type |
◆ long_index_t
using ck_tile::long_index_t = typedef int64_t |
◆ long_number
using ck_tile::long_number = typedef constant<v> |
◆ magic_division
using ck_tile::magic_division = typedef magic_division32_bit_range |
◆ make_index_sequence
using ck_tile::make_index_sequence = typedef typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type |
◆ multi_index
using ck_tile::multi_index = typedef array<index_t, N> |
◆ number
using ck_tile::number = typedef constant<v> |
◆ OutputTileDistributionTraits
using ck_tile::OutputTileDistributionTraits = typedef TransposeTileDistributionTraits<TileDistributionEncoding_, DataType_, Policy, false> |
◆ PersistentTileGemmUniversalTraits
using ck_tile::PersistentTileGemmUniversalTraits = typedef TileGemmUniversalTraits<kPadM_, kPadN_, kPadK_, DoubleSmemBuffer_, ALayout_, BLayout_, CLayout_, TransposeC_, UseStructuredSparsity_, true> |
◆ pk_fp4_raw_t
using ck_tile::pk_fp4_raw_t = typedef typename pk_fp4_t::raw_type |
◆ pk_fp4_t
using ck_tile::pk_fp4_t = typedef pk_float4_e2m1_t |
◆ pk_int4x16_t
using ck_tile::pk_int4x16_t = typedef int8_t |
◆ pk_int4x2_t
using ck_tile::pk_int4x2_t = typedef int8_t |
◆ pk_int4x32_t
using ck_tile::pk_int4x32_t = typedef int8_t |
◆ pk_int4x4_t
using ck_tile::pk_int4x4_t = typedef int8_t |
◆ pk_int4x8_t
using ck_tile::pk_int4x8_t = typedef int8_t |
◆ remove_cv_t
using ck_tile::remove_cv_t = typedef typename std::remove_cv<T>::type |
◆ remove_cvref_t
using ck_tile::remove_cvref_t = typedef remove_cv_t<std::remove_reference_t<T> > |
◆ remove_pointer_t
using ck_tile::remove_pointer_t = typedef typename std::remove_pointer<T>::type |
◆ remove_reference_t
using ck_tile::remove_reference_t = typedef typename std::remove_reference<T>::type |
◆ sequence_merge_t
using ck_tile::sequence_merge_t = typedef typename sequence_merge<Seqs...>::type |
◆ statically_indexed_array
using ck_tile::statically_indexed_array = typedef tuple_array<T, N> |
◆ thread_buffer
using ck_tile::thread_buffer = typedef tuple_array<T, N> |
◆ tile_distribution_encoding_shuffle_t
using ck_tile::tile_distribution_encoding_shuffle_t = typedef typename tile_distribution_encoding_shuffle<encoding, shuffle>::type |
◆ tuple_array
using ck_tile::tuple_array = typedef typename impl::tuple_array_impl<T, N>::type |
◆ tuple_element_or_default_t
using ck_tile::tuple_element_or_default_t = typedef typename tuple_element_or_default<Tuple_, Idx, DefaultType>::type |
◆ uint16x16_t
using ck_tile::uint16x16_t = typedef uint16_t |
◆ uint16x2_t
using ck_tile::uint16x2_t = typedef uint16_t |
◆ uint16x32_t
using ck_tile::uint16x32_t = typedef uint16_t |
◆ uint16x4_t
using ck_tile::uint16x4_t = typedef uint16_t |
◆ uint16x64_t
using ck_tile::uint16x64_t = typedef uint16_t |
◆ uint16x8_t
using ck_tile::uint16x8_t = typedef uint16_t |
◆ uint32x16_t
using ck_tile::uint32x16_t = typedef uint32_t |
◆ uint32x2_t
using ck_tile::uint32x2_t = typedef uint32_t |
◆ uint32x32_t
using ck_tile::uint32x32_t = typedef uint32_t |
◆ uint32x4_t
using ck_tile::uint32x4_t = typedef uint32_t |
◆ uint32x64_t
using ck_tile::uint32x64_t = typedef uint32_t |
◆ uint32x8_t
using ck_tile::uint32x8_t = typedef uint32_t |
◆ uint8x16_t
using ck_tile::uint8x16_t = typedef uint8_t |
◆ uint8x2_t
using ck_tile::uint8x2_t = typedef uint8_t |
◆ uint8x32_t
using ck_tile::uint8x32_t = typedef uint8_t |
◆ uint8x4_t
using ck_tile::uint8x4_t = typedef uint8_t |
◆ uint8x64_t
using ck_tile::uint8x64_t = typedef uint8_t |
◆ uint8x8_t
using ck_tile::uint8x8_t = typedef uint8_t |
◆ uniform_sequence_gen_t
using ck_tile::uniform_sequence_gen_t = typedef typename uniform_sequence_gen<NSize, I>::type |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<bf8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base<fp8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<bf8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<fp8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, bf8_t, Ctrl_> |
◆ WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8
using ck_tile::WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8 = typedef WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, fp8_t, Ctrl_> |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, bf16_t, bf16_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, bf8_t, bf8_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, bf8_t, fp8_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, fp16_t, fp16_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, fp8_t, bf8_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8
using ck_tile::WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<gfx12_t, fp8_t, fp8_t, float, 16, 16, 16> > |
◆ WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8
using ck_tile::WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8 = typedef WarpGemmAttributeWmmaImpl<WmmaTraits<DeviceIp, int8_t, int8_t, int32_t, 16, 16, 16> > |
◆ WarpGemmDispatcher
using ck_tile::WarpGemmDispatcher = typedef typename impl::WarpGemmDispatcher<AType, BType, AccType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity, AttrNumAccess>::Type |
◆ WarpGemmMfma_f32_16x16x128_bf8_bf8
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_16x16x128_bf8_fp8
using ck_tile::WarpGemmMfma_f32_16x16x128_bf8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_16x16x128_fp8_bf8
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_16x16x128_fp8_fp8
using ck_tile::WarpGemmMfma_f32_16x16x128_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_16x16x32_bf8_bf8
using ck_tile::WarpGemmMfma_f32_16x16x32_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_16x16x32_bf8_bf8_CTransposed
◆ WarpGemmMfma_f32_16x16x32_fp8_fp8
using ck_tile::WarpGemmMfma_f32_16x16x32_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed
◆ WarpGemmMfma_f32_16x16x64_bf8_bf8
using ck_tile::WarpGemmMfma_f32_16x16x64_bf8_bf8 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfma_f32_16x16x64_fp8_fp8
using ck_tile::WarpGemmMfma_f32_16x16x64_fp8_fp8 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfma_f32_32x32x16_bf8_bf8
using ck_tile::WarpGemmMfma_f32_32x32x16_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed
◆ WarpGemmMfma_f32_32x32x16_bf8_fp8
using ck_tile::WarpGemmMfma_f32_32x32x16_bf8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed
◆ WarpGemmMfma_f32_32x32x16_fp8_bf8
using ck_tile::WarpGemmMfma_f32_32x32x16_fp8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed
◆ WarpGemmMfma_f32_32x32x16_fp8_fp8
using ck_tile::WarpGemmMfma_f32_32x32x16_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed
◆ WarpGemmMfma_f32_32x32x32_bf8_bf8
using ck_tile::WarpGemmMfma_f32_32x32x32_bf8_bf8 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfma_f32_32x32x32_fp8_fp8
using ck_tile::WarpGemmMfma_f32_32x32x32_fp8_fp8 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfma_f32_32x32x64_bf8_bf8
using ck_tile::WarpGemmMfma_f32_32x32x64_bf8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_32x32x64_bf8_fp8
using ck_tile::WarpGemmMfma_f32_32x32x64_bf8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_32x32x64_fp8_bf8
using ck_tile::WarpGemmMfma_f32_32x32x64_fp8_bf8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_f32_32x32x64_fp8_fp8
using ck_tile::WarpGemmMfma_f32_32x32x64_fp8_fp8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>, AttrNumAccess> > |
◆ WarpGemmMfma_i32_16x16x32_i8_i8
using ck_tile::WarpGemmMfma_i32_16x16x32_i8_i8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed
using ck_tile::WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed = typedef WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_i32_32x32x16_i8_i8
using ck_tile::WarpGemmMfma_i32_32x32x16_i8_i8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed
using ck_tile::WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed = typedef WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution< WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfmaBf16Bf16F32M16N16K16
using ck_tile::WarpGemmMfmaBf16Bf16F32M16N16K16 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M16N16K32
using ck_tile::WarpGemmMfmaBf16Bf16F32M16N16K32 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> > |
◆ WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M32N32K16
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> > |
◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M32N32K8
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA
using ck_tile::WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8<WGAttrCtlEnum::Default_>, 1> > |
◆ WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleBTransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution
◆ WarpGemmMfmaBf16Bf16F32M4N64K16
using ck_tile::WarpGemmMfmaBf16Bf16F32M4N64K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M4N64K4<WGAttrCtlEnum::Default_>, 4> > |
◆ WarpGemmMfmaBf16Bf16F32M64N4K16
using ck_tile::WarpGemmMfmaBf16Bf16F32M64N4K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4<WGAttrCtlEnum::Default_>, 4> > |
◆ WarpGemmMfmaF16F16F32M16N16K16
using ck_tile::WarpGemmMfmaF16F16F32M16N16K16 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution
◆ WarpGemmMfmaF16F16F32M16N16K32
using ck_tile::WarpGemmMfmaF16F16F32M16N16K32 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M16N16K16<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> > |
◆ WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution
◆ WarpGemmMfmaF16F16F32M32N32K16
using ck_tile::WarpGemmMfmaF16F16F32M32N32K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>, 2, AttrNumAccess> > |
◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleA
using ck_tile::WarpGemmMfmaF16F16F32M32N32K16SwizzleA = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>, 2> > |
◆ WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution
◆ WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution
◆ WarpGemmMfmaF16F16F32M32N32K8
using ck_tile::WarpGemmMfmaF16F16F32M32N32K8 = typedef WarpGemmImpl< WarpGemmAttributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleA
using ck_tile::WarpGemmMfmaF16F16F32M32N32K8SwizzleA = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK_SwizzleA< WarpGemmAttributeMfmaImplF16F16F32M32N32K8<WGAttrCtlEnum::Default_>, 1> > |
◆ WarpGemmMfmaF16F16F32M32N32K8SwizzleBTransposedCDistribution
◆ WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution
◆ WarpGemmMfmaF16F16F32M4N64K16
using ck_tile::WarpGemmMfmaF16F16F32M4N64K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M4N64K4<WGAttrCtlEnum::Default_>, 4> > |
◆ WarpGemmMfmaF16F16F32M64N4K16
using ck_tile::WarpGemmMfmaF16F16F32M64N4K16 = typedef WarpGemmImpl<WarpGemmAttributeMfmaIterateK< WarpGemmAttributeMfmaImplF16F16F32M64N4K4<WGAttrCtlEnum::Default_>, 4> > |
◆ WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution
◆ WarpGemmSmfmacF16F16F32M16N16K32
using ck_tile::WarpGemmSmfmacF16F16F32M16N16K32 = typedef WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac< WarpGemmAttributeSmfmacImplF16F16F32M16N16K32<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmSmfmacF16F16F32M32N32K16
using ck_tile::WarpGemmSmfmacF16F16F32M32N32K16 = typedef WarpGemmSmfmacImpl<WarpGemmAttributeSmfmac< WarpGemmAttributeSmfmacImplF16F16F32M32N32K16<WGAttrCtlEnum::Default_> >> |
◆ WarpGemmWmma_f32_16x16x16_bf16_bf16
using ck_tile::WarpGemmWmma_f32_16x16x16_bf16_bf16 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf16_bf16, kTransC> > |
◆ WarpGemmWmma_f32_16x16x16_bf8_bf8
using ck_tile::WarpGemmWmma_f32_16x16x16_bf8_bf8 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_bf8, kTransC> > |
◆ WarpGemmWmma_f32_16x16x16_bf8_f8
using ck_tile::WarpGemmWmma_f32_16x16x16_bf8_f8 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_bf8_f8, kTransC> > |
◆ WarpGemmWmma_f32_16x16x16_f16_f16
using ck_tile::WarpGemmWmma_f32_16x16x16_f16_f16 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f16_f16, kTransC> > |
◆ WarpGemmWmma_f32_16x16x16_f8_bf8
using ck_tile::WarpGemmWmma_f32_16x16x16_f8_bf8 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_bf8, kTransC> > |
◆ WarpGemmWmma_f32_16x16x16_f8_f8
using ck_tile::WarpGemmWmma_f32_16x16x16_f8_f8 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_f32_16x16x16_f8_f8, kTransC> > |
◆ WarpGemmWmma_i32_16x16x16_i8_i8
using ck_tile::WarpGemmWmma_i32_16x16x16_i8_i8 = typedef WarpGemmImpl<WarpGemmAttributeWmma<WarpGemmAttributeWmmaImpl_i32_16x16x16_i8_i8, kTransC> > |
Enumeration Type Documentation
◆ AlibiMode
|
strong |
◆ amd_buffer_coherence_enum
|
strong |
◆ bf16_rounding_mode
|
strong |
◆ BlockAttentionBiasEnum
|
strong |
◆ BlockFmhaPipelineEnum
|
strong |
◆ ConvolutionSpecialization
|
strong |
◆ coord_transform_enum
|
strong |
◆ fp8_interpretation
|
strong |
◆ fp8_rounding_mode
|
strong |
◆ FusedMoeGemmPipelineSequencerEnum
|
strong |
◆ FusedMoeGemmWeightPermuteEnum
|
strong |
◆ GemmLoopOrder
|
strong |
◆ GemmPipelineScheduler
|
strong |
◆ GenericAttentionMaskEnum
|
strong |
◆ Layernorm2dFusedAddEnum
|
strong |
◆ Layernorm2dFusedQuantEnum
|
strong |
◆ Layernorm2dXBiasEnum
|
strong |
◆ naive_attention_layout_enum
|
strong |
◆ naive_attention_quant_algo
|
strong |
◆ naive_attention_variation_enum
|
strong |
◆ PositionEncodingEnum
|
strong |
◆ Rmsnorm2dFusedAddEnum
|
strong |
◆ Rmsnorm2dFusedQuantEnum
|
strong |
◆ Rmsnorm2dSensitiveEnum
|
strong |
◆ RotaryEmbeddingEnum
|
strong |
◆ StreamKReductionStrategy
enum ck_tile::StreamKReductionStrategy : uint32_t |
◆ TailNumber
|
strong |
◆ tile_distribution_pattern
|
strong |
◆ WGAttrCtlEnum
|
strong |
◆ WGAttrNumAccessEnum
|
strong |
Function Documentation
◆ abs() [1/7]
CK_TILE_HOST_DEVICE bfloat16_t ck_tile::abs | ( | const bfloat16_t & | x | ) |
◆ abs() [2/7]
CK_TILE_HOST_DEVICE T ck_tile::abs | ( | const T & | x | ) |
◆ abs() [3/7]
CK_TILE_DEVICE double ck_tile::abs | ( | double | x | ) |
◆ abs() [4/7]
CK_TILE_DEVICE float ck_tile::abs | ( | float | x | ) |
◆ abs() [5/7]
CK_TILE_DEVICE fp16_t ck_tile::abs | ( | fp16_t | x | ) |
◆ abs() [6/7]
CK_TILE_DEVICE int32_t ck_tile::abs | ( | int32_t | x | ) |
◆ abs() [7/7]
CK_TILE_DEVICE int8_t ck_tile::abs | ( | int8_t | x | ) |
◆ acos() [1/2]
CK_TILE_HOST T ck_tile::acos | ( | T | x | ) |
◆ acos() [2/2]
CK_TILE_DEVICE T ck_tile::acos | ( | T | x | ) |
◆ acos< double >()
CK_TILE_DEVICE double ck_tile::acos< double > | ( | double | x | ) |
◆ acos< float >()
CK_TILE_DEVICE float ck_tile::acos< float > | ( | float | x | ) |
◆ acosh() [1/2]
CK_TILE_HOST T ck_tile::acosh | ( | T | x | ) |
◆ acosh() [2/2]
CK_TILE_DEVICE T ck_tile::acosh | ( | T | x | ) |
◆ acosh< double >()
CK_TILE_DEVICE double ck_tile::acosh< double > | ( | double | x | ) |
◆ acosh< float >()
CK_TILE_DEVICE float ck_tile::acosh< float > | ( | float | x | ) |
◆ adaptor_coordinate_is_valid()
|
constexpr |
◆ adaptor_coordinate_is_valid_assuming_top_index_is_valid()
|
constexpr |
◆ add()
CK_TILE_HOST_DEVICE T ck_tile::add | ( | const T & | a, |
const T & | b | ||
) |
◆ add_bf16x2_t()
CK_TILE_HOST_DEVICE bf16x2_t ck_tile::add_bf16x2_t | ( | const bf16x2_t & | a, |
const bf16x2_t & | b | ||
) |
◆ add_bf16x4_t()
CK_TILE_HOST_DEVICE bf16x4_t ck_tile::add_bf16x4_t | ( | const bf16x4_t & | a, |
const bf16x4_t & | b | ||
) |
◆ add_bf8x4_t()
CK_TILE_HOST_DEVICE bf8x4_t ck_tile::add_bf8x4_t | ( | const bf8x4_t & | a, |
const bf8x4_t & | b | ||
) |
◆ add_bf8x8_t()
CK_TILE_HOST_DEVICE bf8x8_t ck_tile::add_bf8x8_t | ( | const bf8x8_t & | a, |
const bf8x8_t & | b | ||
) |
◆ add_f16x2_t()
CK_TILE_HOST_DEVICE fp16x2_t ck_tile::add_f16x2_t | ( | const fp16x2_t & | a, |
const fp16x2_t & | b | ||
) |
◆ add_fp8x4_t()
CK_TILE_HOST_DEVICE fp8x4_t ck_tile::add_fp8x4_t | ( | const fp8x4_t & | a, |
const fp8x4_t & | b | ||
) |
◆ add_fp8x8_t()
CK_TILE_HOST_DEVICE fp8x8_t ck_tile::add_fp8x8_t | ( | const fp8x8_t & | a, |
const fp8x8_t & | b | ||
) |
◆ amd_async_buffer_load()
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load | ( | CK_TILE_LDS_ADDR T * | smem, |
int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_addr_offset, | ||
index_t | src_wave_addr_offset, | ||
index_t | src_immediate_addr_offset = 0 , |
||
index_t | flag = 0 , |
||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ amd_async_buffer_load_impl()
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_impl | ( | CK_TILE_LDS_ADDR T * | smem, |
int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_addr_offset, | ||
index_t | src_wave_addr_offset, | ||
index_t | src_immediate_addr_offset = 0 , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_async_buffer_load_with_oob()
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob | ( | CK_TILE_LDS_ADDR T * | smem, |
const int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_element_offset, | ||
index_t | src_linear_element_offset, | ||
bool | is_valid_element, | ||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ amd_async_buffer_load_with_oob_raw() [1/2]
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw | ( | T * | smem, |
const int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_element_offset, | ||
index_t | src_linear_element_offset, | ||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_async_buffer_load_with_oob_raw() [2/2]
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw | ( | T * | smem, |
const T * | p_src_wave, | ||
index_t | src_thread_element_offset, | ||
index_t | src_linear_element_offset, | ||
index_t | src_element_space_size, | ||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_buffer_atomic_add()
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add | ( | const thread_buffer< T, N > & | src_thread_data, |
T * | p_dst_wave, | ||
const index_t | dst_thread_element_offset, | ||
const bool | dst_thread_element_valid, | ||
const index_t | dst_element_space_size | ||
) |
◆ amd_buffer_atomic_add_impl()
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_impl | ( | const thread_buffer< T, N > & | src_thread_data, |
int32x4_t | dst_wave_buffer_resource, | ||
index_t | dst_thread_addr_offset, | ||
index_t | dst_wave_addr_offset | ||
) |
◆ amd_buffer_atomic_add_raw()
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_raw | ( | const thread_buffer< T, N > & | src_thread_data, |
T * | p_dst_wave, | ||
const index_t | dst_thread_element_offset, | ||
const index_t | dst_linear_element_offset, | ||
const bool | dst_thread_element_valid, | ||
const index_t | dst_element_space_size, | ||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_buffer_atomic_max()
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max | ( | const thread_buffer< T, N > & | src_thread_data, |
T * | p_dst_wave, | ||
const index_t | dst_thread_element_offset, | ||
const bool | dst_thread_element_valid, | ||
const index_t | dst_element_space_size | ||
) |
◆ amd_buffer_atomic_max_impl()
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max_impl | ( | const thread_buffer< T, N > | src_thread_data, |
int32x4_t | dst_wave_buffer_resource, | ||
index_t | dst_thread_addr_offset, | ||
index_t | dst_wave_addr_offset | ||
) |
◆ amd_buffer_load_impl()
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_impl | ( | int32x4_t | src_wave_buffer_resource, |
index_t | src_thread_addr_offset, | ||
index_t | src_wave_addr_offset | ||
) |
◆ amd_buffer_load_impl_with_bytes()
CK_TILE_DEVICE thread_buffer<int8_t, N> ck_tile::amd_buffer_load_impl_with_bytes | ( | int32x4_t | src_wave_buffer_resource, |
index_t | src_thread_addr_offset, | ||
index_t | src_wave_addr_offset | ||
) |
◆ amd_buffer_load_invalid_element_return_customized_value()
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_invalid_element_return_customized_value | ( | const T * | p_src_wave, |
index_t | src_thread_element_offset, | ||
bool | src_thread_element_valid, | ||
index_t | src_element_space_size, | ||
T | customized_value | ||
) |
◆ amd_buffer_load_invalid_element_return_zero()
CK_TILE_DEVICE thread_buffer<T, N> ck_tile::amd_buffer_load_invalid_element_return_zero | ( | const T * | p_src_wave, |
index_t | src_thread_element_offset, | ||
bool | src_thread_element_valid, | ||
index_t | src_element_space_size | ||
) |
◆ amd_buffer_load_raw() [1/2]
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw | ( | thread_buffer< T, N > & | dst, |
const int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_element_offset, | ||
index_t | src_linear_element_offset, | ||
index_t | is_valid_element = 0 , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_buffer_load_raw() [2/2]
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw | ( | thread_buffer< T, N > & | dst, |
const T * | p_src_wave, | ||
index_t | src_thread_element_offset, | ||
index_t | src_linear_element_offset, | ||
index_t | src_element_space_size, | ||
index_t | is_valid_element = 0 , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_buffer_load_raw_impl()
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw_impl | ( | thread_buffer< T, N > & | dst, |
int32x4_t | src_wave_buffer_resource, | ||
index_t | src_thread_addr_offset, | ||
index_t | src_wave_addr_offset, | ||
index_t | src_linear_addr_offset, | ||
index_t | flag = 0 , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ amd_buffer_store()
CK_TILE_DEVICE void ck_tile::amd_buffer_store | ( | const thread_buffer< T, N > & | src_thread_data, |
T * | p_dst_wave, | ||
const index_t | dst_thread_element_offset, | ||
const bool | dst_thread_element_valid, | ||
const index_t | dst_element_space_size | ||
) |
◆ amd_buffer_store_impl()
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl | ( | const thread_buffer< T, N > | src_thread_data, |
int32x4_t | dst_wave_buffer_resource, | ||
index_t | dst_thread_addr_offset, | ||
index_t | dst_wave_addr_offset | ||
) |
◆ amd_buffer_store_impl_with_bytes()
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl_with_bytes | ( | const thread_buffer< int8_t, N > | src_thread_data, |
int32x4_t | dst_wave_buffer_resource, | ||
index_t | dst_thread_addr_offset, | ||
index_t | dst_wave_addr_offset | ||
) |
◆ amd_buffer_store_raw()
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw | ( | const thread_buffer< T, N > & | src_thread_data, |
T * | p_dst_wave, | ||
const index_t | dst_thread_element_offset, | ||
const index_t | dst_linear_element_offset, | ||
const bool | dst_thread_element_valid, | ||
const index_t | dst_element_space_size | ||
) |
◆ amd_buffer_store_raw_impl()
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw_impl | ( | const thread_buffer< T, N > & | dst_thread_data, |
int32x4_t | dst_wave_buffer_resource, | ||
index_t | dst_thread_addr_offset, | ||
index_t | dst_wave_addr_offset, | ||
index_t | dst_linear_addr_offset, | ||
index_t | is_valid_element = 1 |
||
) |
◆ apply()
|
constexpr |
◆ asin() [1/2]
CK_TILE_HOST T ck_tile::asin | ( | T | x | ) |
◆ asin() [2/2]
CK_TILE_DEVICE T ck_tile::asin | ( | T | x | ) |
◆ asin< double >()
CK_TILE_DEVICE double ck_tile::asin< double > | ( | double | x | ) |
◆ asin< float >()
CK_TILE_DEVICE float ck_tile::asin< float > | ( | float | x | ) |
◆ asinh() [1/2]
CK_TILE_HOST T ck_tile::asinh | ( | T | x | ) |
◆ asinh() [2/2]
CK_TILE_DEVICE T ck_tile::asinh | ( | T | x | ) |
◆ asinh< double >()
CK_TILE_DEVICE double ck_tile::asinh< double > | ( | double | x | ) |
◆ asinh< float >()
CK_TILE_DEVICE float ck_tile::asinh< float > | ( | float | x | ) |
◆ async_buffer_load_dwordxn_v()
CK_TILE_DEVICE void ck_tile::async_buffer_load_dwordxn_v | ( | void * | smem, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | , | ||
index_t | ioffset, | ||
index_t | = 0 , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ async_buffer_load_fence()
CK_TILE_DEVICE void ck_tile::async_buffer_load_fence | ( | index_t | cnt = 0 | ) |
◆ async_load_fence()
CK_TILE_DEVICE auto ck_tile::async_load_fence | ( | index_t | cnt = 0 | ) |
◆ async_load_fence_raw()
CK_TILE_DEVICE auto ck_tile::async_load_fence_raw | ( | index_t | cnt = 0 | ) |
◆ async_load_tile()
CK_TILE_DEVICE auto ck_tile::async_load_tile | ( | LdsTileWindow_ && | lds_tile, |
const TileWindow_ & | tile_window, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ async_load_tile_raw()
CK_TILE_DEVICE auto ck_tile::async_load_tile_raw | ( | LdsTileWindow_ && | lds_tile, |
const TileWindow_ & | tile_window, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ atan() [1/2]
CK_TILE_HOST T ck_tile::atan | ( | T | x | ) |
◆ atan() [2/2]
CK_TILE_DEVICE T ck_tile::atan | ( | T | x | ) |
◆ atan< double >()
CK_TILE_DEVICE double ck_tile::atan< double > | ( | double | x | ) |
◆ atan< float >()
CK_TILE_DEVICE float ck_tile::atan< float > | ( | float | x | ) |
◆ atanh() [1/2]
CK_TILE_HOST T ck_tile::atanh | ( | T | x | ) |
◆ atanh() [2/2]
CK_TILE_DEVICE T ck_tile::atanh | ( | T | x | ) |
◆ atanh< double >()
CK_TILE_DEVICE double ck_tile::atanh< double > | ( | double | x | ) |
◆ atanh< float >()
CK_TILE_DEVICE float ck_tile::atanh< float > | ( | float | x | ) |
◆ atomic_add()
CK_TILE_DEVICE void ck_tile::atomic_add | ( | X * | p_dst, |
const X & | x | ||
) |
◆ atomic_add< bf16x2_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x2_t > | ( | bf16x2_t * | p_dst, |
const bf16x2_t & | x | ||
) |
◆ atomic_add< bf16x4_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< bf16x4_t > | ( | bf16x4_t * | p_dst, |
bf16x4_t const & | x | ||
) |
◆ atomic_add< bf8x4_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x4_t > | ( | bf8x4_t * | p_dst, |
const bf8x4_t & | x | ||
) |
◆ atomic_add< bf8x8_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< bf8x8_t > | ( | bf8x8_t * | p_dst, |
bf8x8_t const & | x | ||
) |
◆ atomic_add< fp16x2_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< fp16x2_t > | ( | fp16x2_t * | p_dst, |
fp16x2_t const & | x | ||
) |
◆ atomic_add< fp8x4_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x4_t > | ( | fp8x4_t * | p_dst, |
const fp8x4_t & | x | ||
) |
◆ atomic_add< fp8x8_t >()
CK_TILE_DEVICE void ck_tile::atomic_add< fp8x8_t > | ( | fp8x8_t * | p_dst, |
fp8x8_t const & | x | ||
) |
◆ atomic_add_g()
CK_TILE_DEVICE void ck_tile::atomic_add_g | ( | T * | p_dst, |
const thread_buffer< T, N > & | x | ||
) |
◆ atomic_max_g()
CK_TILE_DEVICE void ck_tile::atomic_max_g | ( | T * | p_dst, |
const thread_buffer< T, N > & | x | ||
) |
◆ bf16_to_double()
|
constexpr |
◆ bf16_to_double_raw()
|
constexpr |
◆ bf16_to_float()
|
constexpr |
◆ bf16_to_float_raw()
|
constexpr |
◆ bf16_to_fp16()
|
constexpr |
◆ bf16_to_pk_fp4()
|
constexpr |
◆ bf16x2_to_pk_fp4()
|
constexpr |
◆ bf8_to_float()
CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float | ( | bf8_t | x | ) |
◆ bf8_to_float_raw()
CK_TILE_HOST_DEVICE float ck_tile::bf8_to_float_raw | ( | uint8_t | x | ) |
◆ bit_cast()
|
constexpr |
◆ block_tile_reduce() [1/2]
CK_TILE_DEVICE void ck_tile::block_tile_reduce | ( | AccDistributedTensor_ & | acc_tensor, |
const InDistributedTensor_ & | in_tensor, | ||
sequence< InReduceDims... > | , | ||
const ReduceFunc & | reduce_func | ||
) |
◆ block_tile_reduce() [2/2]
CK_TILE_DEVICE auto ck_tile::block_tile_reduce | ( | const InDistributedTensor_ & | in_tensor, |
sequence< InReduceDims... > | in_reduce_dims, | ||
const ReduceFunc & | reduce_func, | ||
const InDataType_ & | reduce_init | ||
) |
◆ block_tile_reduce_sync()
CK_TILE_DEVICE void ck_tile::block_tile_reduce_sync | ( | AccDistributedTensor_ & | acc_tensor, |
const ReduceFunc & | reduce_func, | ||
bool_constant< WithBroadcast > | = {} , |
||
bool_constant< CrossWarp > | = {} |
||
) |
◆ block_tile_reduce_xor_sync()
CK_TILE_DEVICE void ck_tile::block_tile_reduce_xor_sync | ( | AccDistributedTensor_ & | acc_tensor, |
const ReduceFunc & | reduce_func | ||
) |
◆ block_tile_welford_calculate_max_count()
|
constexpr |
◆ block_tile_welford_post_scale_var()
|
constexpr |
◆ BlockReduce2D()
CK_TILE_HOST_DEVICE_EXTERN ck_tile::BlockReduce2D | ( | const T & | , |
const typename T::DataType & | |||
) | -> BlockReduce2D< T > |
◆ buffer_load_fence() [1/2]
CK_TILE_DEVICE void ck_tile::buffer_load_fence | ( | index_t | cnt = 0 | ) |
◆ buffer_load_fence() [2/2]
CK_TILE_DEVICE void ck_tile::buffer_load_fence | ( | index_t | cnt = 0 , |
T &... | o | ||
) |
◆ buffer_store_fence()
CK_TILE_DEVICE void ck_tile::buffer_store_fence | ( | index_t | cnt = 0 | ) |
◆ c_style_pointer_cast()
CK_TILE_HOST_DEVICE PY ck_tile::c_style_pointer_cast | ( | PX | p_x | ) |
◆ call_f_unpack_args()
CK_TILE_HOST auto ck_tile::call_f_unpack_args | ( | F | f, |
T | args | ||
) |
◆ call_f_unpack_args_impl()
CK_TILE_HOST auto ck_tile::call_f_unpack_args_impl | ( | F | f, |
T | args, | ||
std::index_sequence< Is... > | |||
) |
◆ cast_tile()
CK_TILE_DEVICE auto ck_tile::cast_tile | ( | const SrcTensor & | src_tensor | ) |
◆ ceil() [1/2]
CK_TILE_HOST T ck_tile::ceil | ( | T | x | ) |
◆ ceil() [2/2]
CK_TILE_DEVICE T ck_tile::ceil | ( | T | x | ) |
◆ ceil< double >()
CK_TILE_DEVICE double ck_tile::ceil< double > | ( | double | x | ) |
◆ ceil< float >()
CK_TILE_DEVICE float ck_tile::ceil< float > | ( | float | x | ) |
◆ ceil< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::ceil< fp16_t > | ( | fp16_t | x | ) |
◆ chain_tensor_adaptors()
|
constexpr |
◆ check_err() [1/6]
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_same_v<ranges::range_value_t<Range>, bf16_t>, bool>::type CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
double | rtol = 1e-3 , |
||
double | atol = 1e-3 , |
||
bool | allow_infinity_ref = false |
||
) |
Check errors between floating point ranges using the specified tolerances.
Compares two ranges of brain floating point values within specified relative and absolute tolerances.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails rtol Relative tolerance atol Absolute tolerance allow_infinity_ref Whether to allow infinity in reference values
- Returns
- True if check passes, false otherwise
◆ check_err() [2/6]
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_same_v<ranges::range_value_t<Range>, half_t>, bool>::type CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
double | rtol = 1e-3 , |
||
double | atol = 1e-3 , |
||
bool | allow_infinity_ref = false |
||
) |
Check errors between half precision floating point ranges.
Compares two ranges of half precision floating point values within specified tolerances. This specialization handles the specific requirements and characteristics of half precision floating point comparisons.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails rtol Relative tolerance atol Absolute tolerance allow_infinity_ref Whether to allow infinity in reference values
- Returns
- True if check passes, false otherwise
◆ check_err() [3/6]
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_same_v<ranges::range_value_t<Range>, bf8_t>), bool> CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
double | rtol = 1e-3 , |
||
double | atol = 1e-3 , |
||
bool | allow_infinity_ref = false |
||
) |
Check errors between BF8 ranges.
Specialized comparison for 8-bit brain floating point values that considers the specific numerical properties and error characteristics of the BF8 format.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails rtol Relative tolerance atol Absolute tolerance allow_infinity_ref Whether to allow infinity in reference values
- Returns
- True if check passes, false otherwise
◆ check_err() [4/6]
std::enable_if< std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange> > && std::is_floating_point_v<ranges::range_value_t<Range> > && !std::is_same_v<ranges::range_value_t<Range>, half_t>, bool>::type CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
double | rtol = 1e-5 , |
||
double | atol = 3e-6 , |
||
bool | allow_infinity_ref = false |
||
) |
Check errors between floating point ranges using the specified tolerances.
Compares two ranges of floating point values within specified relative and absolute tolerances. This overload handles standard floating point types except half precision floating point.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails rtol Relative tolerance atol Absolute tolerance allow_infinity_ref Whether to allow infinity in reference values
- Returns
- True if check passes, false otherwise
◆ check_err() [5/6]
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_integral_v<ranges::range_value_t<Range>> && !std::is_same_v<ranges::range_value_t<Range>, bf16_t>) , bool> CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
double | = 0 , |
||
double | atol = 0 |
||
) |
Check errors between integer ranges.
Compares two ranges of integer values with an absolute tolerance. This specialization handles integer types and optionally int4_t when the experimental bit int extension is enabled.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails atol Absolute tolerance
- Returns
- True if check passes, false otherwise
◆ check_err() [6/6]
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> && std::is_same_v<ranges::range_value_t<Range>, fp8_t>), bool> CK_TILE_HOST ck_tile::check_err | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" , |
||
unsigned | max_rounding_point_distance = 1 , |
||
double | atol = 1e-1 , |
||
bool | allow_infinity_ref = false |
||
) |
Check errors between FP8 ranges.
Specialized comparison for 8-bit floating point values that takes into account the unique characteristics and limitations of FP8 arithmetic, including rounding point distances and special handling of infinity values.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if check fails max_rounding_point_distance Maximum allowed distance between rounding points atol Absolute tolerance allow_infinity_ref Whether to allow infinity in reference values
- Returns
- True if check passes, false otherwise
◆ check_size_mismatch()
CK_TILE_HOST bool ck_tile::check_size_mismatch | ( | const Range & | out, |
const RefRange & | ref, | ||
const std::string & | msg = "Error: Incorrect results!" |
||
) |
Check for size mismatch between output and reference ranges.
Verifies that the output and reference ranges are the same size.
- Template Parameters
-
Range Type of output range RefRange Type of reference range
- Parameters
-
out Output range to check ref Reference range to check against msg Error message to display if sizes mismatch
- Returns
- True if sizes mismatch, false otherwise
◆ check_wmma_supported()
CK_TILE_HOST bool ck_tile::check_wmma_supported | ( | ) |
◆ CK_PRINT()
|
inlineconstexpr |
◆ CK_TILE_ERROR()
|
noexcept |
◆ clamp()
|
constexpr |
◆ clear_tile()
CK_TILE_DEVICE void ck_tile::clear_tile | ( | DstrTensors & | dstr_tensor | ) |
◆ clz()
CK_TILE_DEVICE int ck_tile::clz | ( | uint32_t | x | ) |
◆ cmp_lt_to_exec()
CK_TILE_DEVICE auto ck_tile::cmp_lt_to_exec | ( | const X & | x, |
const Y & | y | ||
) |
◆ composes()
__host__ __device__ ck_tile::composes | ( | Ts && | ... | ) | -> composes< remove_cvref_t< Ts >... > |
FIXME: create macro to replace 'host device' and nothing more.
◆ concat() [1/2]
auto ck_tile::concat | ( | const Ts &... | xs | ) | -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string> |
◆ concat() [2/2]
auto ck_tile::concat | ( | Sep | sep, |
const First & | first, | ||
const Rest &... | rest | ||
) | -> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string> |
◆ concat_tuple() [1/3]
|
constexpr |
◆ concat_tuple() [2/3]
|
constexpr |
◆ concat_tuple() [3/3]
|
constexpr |
◆ concat_tuple_of_reference()
|
constexpr |
◆ concatInto()
auto ck_tile::concatInto | ( | std::string & | result, |
const Ts &... | xs | ||
) | -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void> |
◆ conditional_expr()
|
constexpr |
◆ constexpr_strlen()
|
constexpr |
◆ construct_f_unpack_args()
CK_TILE_HOST auto ck_tile::construct_f_unpack_args | ( | F | , |
T | args | ||
) |
◆ construct_f_unpack_args_impl()
CK_TILE_HOST auto ck_tile::construct_f_unpack_args_impl | ( | T | args, |
std::index_sequence< Is... > | |||
) |
◆ container_concat() [1/4]
|
constexpr |
◆ container_concat() [2/4]
|
constexpr |
◆ container_concat() [3/4]
|
constexpr |
◆ container_concat() [4/4]
|
constexpr |
◆ container_find()
|
constexpr |
◆ container_push_back() [1/2]
|
constexpr |
◆ container_push_back() [2/2]
|
constexpr |
◆ container_push_front()
|
constexpr |
◆ container_reduce()
|
constexpr |
◆ container_reduce_impl()
|
constexpr |
◆ container_reorder_given_new2old() [1/4]
|
constexpr |
◆ container_reorder_given_new2old() [2/4]
|
constexpr |
◆ container_reorder_given_new2old() [3/4]
|
constexpr |
◆ container_reorder_given_new2old() [4/4]
|
constexpr |
◆ container_reorder_given_old2new() [1/4]
|
constexpr |
◆ container_reorder_given_old2new() [2/4]
|
constexpr |
◆ container_reorder_given_old2new() [3/4]
|
constexpr |
◆ container_reorder_given_old2new() [4/4]
|
constexpr |
◆ container_reverse_exclusive_scan() [1/3]
|
constexpr |
◆ container_reverse_exclusive_scan() [2/3]
|
constexpr |
◆ container_reverse_exclusive_scan() [3/3]
|
constexpr |
◆ container_reverse_exclusive_scan_impl()
|
constexpr |
◆ container_reverse_inclusive_scan() [1/2]
|
constexpr |
◆ container_reverse_inclusive_scan() [2/2]
|
constexpr |
◆ convert_to_float()
CK_TILE_HOST_DEVICE float ck_tile::convert_to_float | ( | typename T::raw_type | data, |
float | scale = 1.f |
||
) |
◆ convert_to_type()
CK_TILE_HOST_DEVICE T::raw_type ck_tile::convert_to_type | ( | float | value, |
float | scale = 1.f |
||
) |
◆ coordinate_has_valid_offset()
|
constexpr |
◆ coordinate_has_valid_offset_assuming_top_index_is_valid()
|
constexpr |
◆ cos()
CK_TILE_HOST T ck_tile::cos | ( | T | x | ) |
◆ cos< double >()
CK_TILE_HOST double ck_tile::cos< double > | ( | double | x | ) |
◆ cos< float >()
CK_TILE_HOST float ck_tile::cos< float > | ( | float | x | ) |
◆ cosh() [1/2]
CK_TILE_HOST T ck_tile::cosh | ( | T | x | ) |
◆ cosh() [2/2]
CK_TILE_DEVICE T ck_tile::cosh | ( | T | x | ) |
◆ cosh< double >()
CK_TILE_DEVICE double ck_tile::cosh< double > | ( | double | x | ) |
◆ cosh< float >()
CK_TILE_DEVICE float ck_tile::cosh< float > | ( | float | x | ) |
◆ double_to_bf16()
|
constexpr |
◆ double_to_bf16_raw()
|
constexpr |
◆ double_to_fp16()
|
constexpr |
◆ double_to_fp16_hip()
|
constexpr |
◆ DS_READ_TR_SIZE()
|
constexpr |
◆ embed_tuples()
|
constexpr |
◆ EnvGetString()
|
inline |
◆ EnvIsDisabled()
|
inline |
◆ EnvIsEnabled()
|
inline |
◆ EnvIsUnset()
|
inline |
◆ EnvUnset()
void ck_tile::EnvUnset | ( | EnvVar | ) |
◆ EnvValue()
|
inline |
◆ equal()
__host__ __device__ ck_tile::equal | ( | ) | -> equal< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ exclusive_scan_sequence()
|
constexpr |
◆ exp() [1/3]
CK_TILE_DEVICE bfloat16_t ck_tile::exp | ( | bfloat16_t | x | ) |
◆ exp() [2/3]
CK_TILE_HOST T ck_tile::exp | ( | T | x | ) |
◆ exp() [3/3]
CK_TILE_DEVICE T ck_tile::exp | ( | T | x | ) |
◆ exp2() [1/2]
CK_TILE_DEVICE bfloat16_t ck_tile::exp2 | ( | bfloat16_t | x | ) |
◆ exp2() [2/2]
CK_TILE_HOST float ck_tile::exp2 | ( | float | x | ) |
◆ exp< double >()
CK_TILE_DEVICE double ck_tile::exp< double > | ( | double | x | ) |
◆ exp< float >()
CK_TILE_DEVICE float ck_tile::exp< float > | ( | float | x | ) |
◆ exp< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::exp< fp16_t > | ( | fp16_t | x | ) |
◆ expm1() [1/2]
CK_TILE_HOST T ck_tile::expm1 | ( | T | x | ) |
◆ expm1() [2/2]
CK_TILE_DEVICE T ck_tile::expm1 | ( | T | x | ) |
◆ expm1< double >()
CK_TILE_DEVICE double ck_tile::expm1< double > | ( | double | x | ) |
◆ expm1< float >()
CK_TILE_DEVICE float ck_tile::expm1< float > | ( | float | x | ) |
◆ flag_to_exec()
CK_TILE_DEVICE auto ck_tile::flag_to_exec | ( | const T & | v_flag | ) |
◆ float_to_bf16()
|
constexpr |
◆ float_to_bf16_raw()
|
constexpr |
◆ float_to_bf16_rta_asm()
CK_TILE_DEVICE uint16_t ck_tile::float_to_bf16_rta_asm | ( | float | f | ) |
◆ float_to_bf16_rtn_asm()
|
constexpr |
◆ float_to_bf16_rtn_raw()
|
constexpr |
◆ float_to_bf16_truc_nan_raw()
|
constexpr |
◆ float_to_bf16_truc_raw()
|
constexpr |
◆ float_to_bf8()
CK_TILE_HOST_DEVICE bf8_t ck_tile::float_to_bf8 | ( | float | x, |
constant< rounding > | = {} |
||
) |
◆ float_to_bf8_raw() [1/2]
CK_TILE_HOST_DEVICE bf8_raw_t ck_tile::float_to_bf8_raw | ( | float | x, |
constant< rounding > | |||
) |
◆ float_to_bf8_raw() [2/2]
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_bf8_raw | ( | float | x, |
constant< rounding > | = {} |
||
) |
◆ float_to_e2m1()
|
constexpr |
◆ float_to_fp16()
|
constexpr |
◆ float_to_fp16_hip()
|
constexpr |
◆ float_to_fp8()
CK_TILE_HOST_DEVICE fp8_t ck_tile::float_to_fp8 | ( | float | x, |
constant< rounding > | = {} |
||
) |
◆ float_to_fp8_raw() [1/2]
CK_TILE_HOST_DEVICE fp8_raw_t ck_tile::float_to_fp8_raw | ( | float | x, |
constant< rounding > | |||
) |
◆ float_to_fp8_raw() [2/2]
CK_TILE_HOST_DEVICE uint8_t ck_tile::float_to_fp8_raw | ( | float | x, |
constant< rounding > | = {} |
||
) |
◆ float_to_fp8_rtn_raw()
CK_TILE_HOST_DEVICE numeric_traits<DstT>::bitwise_type ck_tile::float_to_fp8_rtn_raw | ( | SrcT | x | ) |
Converts a floating-point value to an 8-bit floating-point representation with rounding to nearest even.
This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping.
- Template Parameters
-
DstT The destination type (fp8_t or bf8_t). SrcT The source type (float or half_t) to be converted.
- Parameters
-
x The floating-point value to be converted.
- Returns
- The 8-bit floating-point representation of the input value.
◆ float_to_fp8_sr_raw()
CK_TILE_HOST_DEVICE numeric_traits<DstT>::bitwise_type ck_tile::float_to_fp8_sr_raw | ( | SrcT | x | ) |
Converts a floating-point value to an 8-bit floating-point representation with stochastic rounding.
This function converts a floating-point value (float or half_t) to an 8-bit floating-point representation of type fp8_t or bf8_t. The conversion process may involve clipping and uses a pseudo-random number generator for the stochastic rounding.
- Template Parameters
-
DstT The destination type (fp8_t or bf8_t). SrcT The source type (float or half_t) to be converted.
- Parameters
-
x The floating-point value to be converted.
- Returns
- The 8-bit floating-point representation of the input value.
◆ float_to_int8()
|
constexpr |
◆ float_to_pk_fp4()
|
constexpr |
◆ floor() [1/2]
CK_TILE_HOST T ck_tile::floor | ( | T | x | ) |
◆ floor() [2/2]
CK_TILE_DEVICE T ck_tile::floor | ( | T | x | ) |
◆ floor< double >()
CK_TILE_DEVICE double ck_tile::floor< double > | ( | double | x | ) |
◆ floor< float >()
CK_TILE_DEVICE float ck_tile::floor< float > | ( | float | x | ) |
◆ floor< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::floor< fp16_t > | ( | fp16_t | x | ) |
◆ flush_icache()
|
inline |
◆ fnv1a_hash()
|
constexpr |
◆ fp16_to_bf16()
|
constexpr |
◆ fp16_to_double()
|
constexpr |
◆ fp16_to_double_hip()
|
constexpr |
◆ fp16_to_float()
|
constexpr |
◆ fp16_to_float_hip()
|
constexpr |
◆ fp16_to_pk_fp4()
|
constexpr |
◆ fp16x2_to_pk_fp4()
|
constexpr |
◆ fp32x2_to_pk_fp4()
|
constexpr |
◆ fp8_to_float()
CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float | ( | fp8_t | x | ) |
◆ fp8_to_float_raw()
CK_TILE_HOST_DEVICE float ck_tile::fp8_to_float_raw | ( | uint8_t | x | ) |
◆ gcd() [1/2]
|
constexpr |
◆ gcd() [2/2]
|
constexpr |
◆ gemm_prec_str()
std::string ck_tile::gemm_prec_str | ( | ) |
◆ generate_array()
|
constexpr |
◆ generate_sequence()
|
constexpr |
◆ generate_sequence_v2()
|
constexpr |
◆ generate_tie()
|
constexpr |
◆ generate_tuple()
|
constexpr |
◆ generate_tuple_for()
|
constexpr |
◆ get_absolute_threshold()
CK_TILE_HOST double ck_tile::get_absolute_threshold | ( | const double | max_possible_num, |
const int | number_of_accumulations = 1 |
||
) |
Calculate absolute error threshold for numerical comparisons.
Calculates the absolute error threshold based on the maximum possible value and the characteristics of the data types involved in the computation.
- Template Parameters
-
ComputeDataType Type used for computation OutDataType Type used for output AccDataType Type used for accumulation (defaults to ComputeDataType)
- Parameters
-
max_possible_num Maximum possible value in the computation number_of_accumulations Number of accumulation operations performed
- Returns
- Absolute error threshold based on data type characteristics and maximum value
◆ get_alibi_slopes()
CK_TILE_HOST std::vector<DataType> ck_tile::get_alibi_slopes | ( | ck_tile::index_t | nheads | ) |
◆ get_async_store_smem_info()
CK_TILE_DEVICE auto ck_tile::get_async_store_smem_info | ( | LdsTileWindow_ && | lds_tile | ) |
◆ get_container_subset() [1/2]
|
constexpr |
◆ get_container_subset() [2/2]
|
constexpr |
◆ get_default_stride()
auto ck_tile::get_default_stride | ( | std::size_t | row, |
std::size_t | col, | ||
std::size_t | stride, | ||
bool_constant< is_row_major > | |||
) |
◆ get_device_name()
|
inline |
◆ get_relative_threshold()
CK_TILE_HOST double ck_tile::get_relative_threshold | ( | const int | number_of_accumulations = 1 | ) |
Calculate relative error threshold for numerical comparisons.
Calculates the relative error threshold based on the mantissa bits and characteristics of the data types involved in the computation.
- Template Parameters
-
ComputeDataType Type used for computation OutDataType Type used for output AccDataType Type used for accumulation (defaults to ComputeDataType)
- Parameters
-
number_of_accumulations Number of accumulation operations performed
- Returns
- Relative error threshold based on data type characteristics
◆ get_slice_tile() [1/2]
|
constexpr |
◆ get_slice_tile() [2/2]
|
constexpr |
◆ get_x_indices_from_distributed_indices()
|
constexpr |
◆ get_y_unpacks_from_x_unpacks()
|
constexpr |
◆ getConvSpecializationString()
CK_TILE_HOST std::string ck_tile::getConvSpecializationString | ( | const ConvolutionSpecialization & | s | ) |
◆ getSize() [1/6]
|
inlineconstexprnoexcept |
◆ getSize() [2/6]
|
inlineconstexprnoexcept |
◆ getSize() [3/6]
|
inlineconstexprnoexcept |
◆ getSize() [4/6]
|
inlineconstexprnoexcept |
◆ getSize() [5/6]
|
inlinenoexcept |
◆ getSize() [6/6]
|
inlineconstexprnoexcept |
◆ hip_check_error()
CK_TILE_HOST void ck_tile::hip_check_error | ( | hipError_t | x | ) |
◆ histogram_sorted_sequence()
|
constexpr |
◆ host_tensor_descriptor()
auto ck_tile::host_tensor_descriptor | ( | std::size_t | row, |
std::size_t | col, | ||
std::size_t | stride, | ||
bool_constant< is_row_major > | |||
) |
Creates a host tensor descriptor with specified dimensions and layout.
Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor layout is row-major or column-major. This is determined via the compile-time template parameter is_row_major
.
- Template Parameters
-
is_row_major Compile-time flag indicating if the layout is row-major (true) or column-major (false)
- Parameters
-
row Number of rows in the tensor col Number of columns in the tensor stride Stride between adjacent rows (for row-major) or columns (for column-major)
- Returns
- HostTensorDescriptor with shape {row, col} and strides:
- For row-major: {stride, 1}
- For column-major: {1, stride}
◆ inclusive_scan_sequence()
|
constexpr |
◆ InputTileDistributionEncoding()
|
constexpr |
◆ int8_to_float()
|
constexpr |
◆ integer_divide_ceil()
|
constexpr |
◆ integer_divide_floor()
|
constexpr |
◆ integer_least_multiple()
|
constexpr |
◆ integer_log2_floor()
|
constexpr |
◆ is_gfx11_supported()
|
inline |
◆ is_gfx12_supported()
|
inline |
◆ is_load_tr_supported()
|
inline |
◆ is_nested_tuple()
|
constexpr |
◆ is_null_tile_window()
|
constexpr |
◆ is_power_of_two_integer()
|
constexpr |
◆ isnan() [1/8]
CK_TILE_HOST_DEVICE bool ck_tile::isnan | ( | const bf8_t & | x | ) |
◆ isnan() [2/8]
CK_TILE_HOST_DEVICE bool ck_tile::isnan | ( | const bfloat16_t & | x | ) |
◆ isnan() [3/8]
CK_TILE_HOST_DEVICE bool ck_tile::isnan | ( | const fp8_t & | x | ) |
◆ isnan() [4/8]
CK_TILE_DEVICE bool ck_tile::isnan | ( | double | x | ) |
◆ isnan() [5/8]
CK_TILE_DEVICE bool ck_tile::isnan | ( | float | x | ) |
◆ isnan() [6/8]
CK_TILE_DEVICE bool ck_tile::isnan | ( | fp16_t | x | ) |
◆ isnan() [7/8]
CK_TILE_DEVICE bool ck_tile::isnan | ( | int32_t | x | ) |
◆ isnan() [8/8]
CK_TILE_DEVICE bool ck_tile::isnan | ( | int8_t | x | ) |
◆ kentry()
__global__ void ck_tile::kentry | ( | Args... | args | ) |
◆ launch_and_check()
CK_TILE_HOST void ck_tile::launch_and_check | ( | const stream_config & | sc, |
Callables &&... | callables | ||
) |
◆ launch_kernel()
CK_TILE_HOST float ck_tile::launch_kernel | ( | const stream_config & | s, |
Callables &&... | callables | ||
) |
◆ launch_kernel_time_mask()
CK_TILE_HOST float ck_tile::launch_kernel_time_mask | ( | const stream_config & | s, |
PreprocessFunc | preprocess, | ||
Callables &&... | callables | ||
) |
◆ lcm()
|
constexpr |
◆ lds_load_fence()
CK_TILE_DEVICE void ck_tile::lds_load_fence | ( | index_t | cnt = 0 | ) |
◆ less()
__host__ __device__ ck_tile::less | ( | ) | -> less< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ less_equal()
__host__ __device__ ck_tile::less_equal | ( | ) | -> less_equal< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()
CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 | ( | fp16x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()
CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp32 | ( | float | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_atomic_add_i32()
CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_i32 | ( | int32_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()
CK_TILE_DEVICE_EXTERN double ck_tile::llvm_amdgcn_raw_buffer_atomic_max_fp64 | ( | double | vdata, |
int32x4_t | rsrc, | ||
int | voffset, | ||
int | soffset, | ||
int | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp16()
CK_TILE_DEVICE_EXTERN _Float16 ck_tile::llvm_amdgcn_raw_buffer_load_fp16 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp16x2()
CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x2 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp16x4()
CK_TILE_DEVICE_EXTERN fp16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x4 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp32()
CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_load_fp32 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp32x2()
CK_TILE_DEVICE_EXTERN fp32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x2 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_fp32x4()
CK_TILE_DEVICE_EXTERN fp32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x4 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i16()
CK_TILE_DEVICE_EXTERN int16_t ck_tile::llvm_amdgcn_raw_buffer_load_i16 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i16x2()
CK_TILE_DEVICE_EXTERN int16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x2 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i16x4()
CK_TILE_DEVICE_EXTERN int16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x4 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i32()
CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_load_i32 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i32x2()
CK_TILE_DEVICE_EXTERN int32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x2 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i32x4()
CK_TILE_DEVICE_EXTERN int32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x4 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i8()
CK_TILE_DEVICE_EXTERN int8_t ck_tile::llvm_amdgcn_raw_buffer_load_i8 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i8x2()
CK_TILE_DEVICE_EXTERN int8x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x2 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_i8x4()
CK_TILE_DEVICE_EXTERN int8x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x4 | ( | int32x4_t | srsrc, |
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_load_lds()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_load_lds | ( | int32x4_t | rsrc, |
as3_uint32_ptr | lds_ptr, | ||
index_t | size, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | offset, | ||
index_t | aux | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp16()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16 | ( | _Float16 | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp16x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x2 | ( | fp16x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp16x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x4 | ( | fp16x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp32()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32 | ( | float | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp32x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x2 | ( | fp32x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_fp32x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x4 | ( | fp32x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i16()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16 | ( | int16_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i16x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x2 | ( | int16x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i16x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x4 | ( | int16x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i32()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32 | ( | int32_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i32x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x2 | ( | int32x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i32x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x4 | ( | int32x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i8()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8 | ( | int8_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i8x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x2 | ( | int8x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_i8x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x4 | ( | int8x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_ui16()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16 | ( | uint16_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_ui16x2()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x2 | ( | uint16x2_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ llvm_amdgcn_raw_buffer_store_ui16x4()
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x4 | ( | uint16x4_t | vdata, |
int32x4_t | rsrc, | ||
index_t | voffset, | ||
index_t | soffset, | ||
index_t | glc_slc | ||
) |
◆ load_tile() [1/3]
CK_TILE_DEVICE auto ck_tile::load_tile | ( | const null_tile_window< WindowLengths > & | ) |
◆ load_tile() [2/3]
CK_TILE_DEVICE auto ck_tile::load_tile | ( | const TileWindow_ & | tile_window, |
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ load_tile() [3/3]
CK_TILE_DEVICE auto ck_tile::load_tile | ( | DistributedTensor_ & | dst_tile, |
const TileWindow_ & | tile_window, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ load_tile_raw() [1/3]
CK_TILE_DEVICE auto ck_tile::load_tile_raw | ( | T & | , |
const null_tile_window< WindowLengths > & | |||
) |
◆ load_tile_raw() [2/3]
CK_TILE_DEVICE auto ck_tile::load_tile_raw | ( | T & | tile, |
const tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & | tile_window, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ load_tile_raw() [3/3]
CK_TILE_DEVICE auto ck_tile::load_tile_raw | ( | T & | tile, |
const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} , |
||
bool_constant< pre_nop > | = {} |
||
) |
Loads a tile of data using inline assembly.
- Note
- Bare in mind that loading data this way, you have to manually initialize your thread buffer and synchronize load afterwards in order to make sure it's done before using loaded data from registers
◆ load_tile_transpose()
CK_TILE_DEVICE auto ck_tile::load_tile_transpose | ( | const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window | ) |
transpose loads tile from a tensor and returns the resulting tensor with a new (transposed) tile distribution. use SFINAE to ensure the tile distribution encoding is valid.
This function is intended for use with statically distributed tensor tiles, where the input and output tile distributions differ due to the transpose operation. It ensures that the element space size and vector length remain consistent between the input and output distributions.
- Template Parameters
-
BottomTensorView_ The type of the bottom tensor view. WindowLengths_ The type representing the window lengths. TileDistribution_ The type representing the tile distribution. NumCoord The number of coordinates (dimensions). Policy The transpose policy to use (defaults to DefaultTranspose). the last is SFINAE to ensure the tile distribution encoding is valid.
- Parameters
-
tile_window The tile window with static distribution to load and transpose.
- Returns
- A statically distributed tensor containing the transposed tile data.
- Note
- The function uses compile-time checks to ensure the input and output tile distributions are compatible in terms of element space size and vector length.
- The transpose operation is performed according to the specified Policy.
◆ log() [1/3]
CK_TILE_DEVICE bfloat16_t ck_tile::log | ( | bfloat16_t | x | ) |
◆ log() [2/3]
CK_TILE_HOST T ck_tile::log | ( | T | x | ) |
◆ log() [3/3]
CK_TILE_DEVICE T ck_tile::log | ( | T | x | ) |
◆ log< double >()
CK_TILE_DEVICE double ck_tile::log< double > | ( | double | x | ) |
◆ log< float >()
CK_TILE_DEVICE float ck_tile::log< float > | ( | float | x | ) |
◆ log< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::log< fp16_t > | ( | fp16_t | x | ) |
◆ LogRange()
CK_TILE_HOST std::ostream& ck_tile::LogRange | ( | std::ostream & | os, |
Range && | range, | ||
std::string | delim, | ||
int | precision = std::cout.precision() , |
||
int | width = 0 |
||
) |
◆ LogRangeAsType()
CK_TILE_HOST std::ostream& ck_tile::LogRangeAsType | ( | std::ostream & | os, |
Range && | range, | ||
std::string | delim, | ||
int | precision = std::cout.precision() , |
||
int | width = 0 |
||
) |
◆ m0_inc_with_memory()
CK_TILE_DEVICE void ck_tile::m0_inc_with_memory | ( | index_t | v | ) |
◆ m0_set_with_memory()
CK_TILE_DEVICE void ck_tile::m0_set_with_memory | ( | index_t | v | ) |
◆ make_alibi_from_lr_mask()
CK_TILE_HOST_DEVICE auto ck_tile::make_alibi_from_lr_mask | ( | DataType | slope, |
index_t | window_left_size, | ||
index_t | window_right_size, | ||
index_t | y_total, | ||
index_t | x_total, | ||
GenericAttentionMaskEnum | mask_enum | ||
) |
◆ make_array()
|
constexpr |
◆ make_array_with()
|
constexpr |
◆ make_buffer_view() [1/2]
|
constexpr |
◆ make_buffer_view() [2/2]
|
constexpr |
◆ make_cluster_descriptor()
|
constexpr |
◆ make_embed_transform()
|
constexpr |
◆ make_freeze_transform()
|
constexpr |
◆ make_generic_attention_mask_coordinates_from_lr_window()
|
constexpr |
◆ make_generic_attention_mask_from_lr_window()
|
constexpr |
◆ make_indexing_transform()
|
constexpr |
◆ make_indexing_transform_with_adaptor()
|
constexpr |
◆ make_insert_transform()
|
constexpr |
◆ make_kernel()
CK_TILE_HOST auto ck_tile::make_kernel | ( | KernelImpl | , |
dim3 | grid_dim, | ||
dim3 | block_dim, | ||
std::size_t | lds_byte, | ||
Args... | args | ||
) |
◆ make_left_pad_transform()
|
constexpr |
◆ make_merge_transform()
|
constexpr |
◆ make_merge_transform_v2_magic_division()
|
constexpr |
◆ make_merge_transform_v3_division_mod()
|
constexpr |
◆ make_modulo_transform()
|
constexpr |
◆ make_multi_index()
|
constexpr |
◆ make_naive_tensor_descriptor()
|
constexpr |
◆ make_naive_tensor_descriptor_aligned()
|
constexpr |
◆ make_naive_tensor_descriptor_packed()
|
constexpr |
◆ make_naive_tensor_descriptor_packed_with_offset()
|
constexpr |
◆ make_naive_tensor_descriptor_with_offset()
|
constexpr |
◆ make_naive_tensor_view()
|
constexpr |
◆ make_naive_tensor_view_packed()
|
constexpr |
◆ make_null_tile_window()
|
constexpr |
◆ make_offset_transform()
|
constexpr |
◆ make_pad_transform()
|
constexpr |
◆ make_page_block_navigator() [1/2]
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator | ( | const TensorView & | tensor_view | ) |
◆ make_page_block_navigator() [2/2]
CK_TILE_HOST_DEVICE auto ck_tile::make_page_block_navigator | ( | copy_const_t< DataType, void > * | physical_blocks, |
long_index_t | block_stride, | ||
long_index_t | fixed_offset, | ||
const int32_t * | physical_block_indices, | ||
index_t | num_blocks, | ||
index_t | page_block_size, | ||
const TensorView & | complete_view, | ||
const TensorView & | last_view | ||
) |
◆ make_ParallelTensorFunctor()
CK_TILE_HOST auto ck_tile::make_ParallelTensorFunctor | ( | F | f, |
Xs... | xs | ||
) |
◆ make_pass_through_transform()
|
constexpr |
◆ make_replicate_transform()
|
constexpr |
◆ make_right_pad_transform()
|
constexpr |
◆ make_sequence()
|
constexpr |
◆ make_single_stage_tensor_adaptor()
|
constexpr |
◆ make_slice_transform()
|
constexpr |
◆ make_static_distributed_tensor() [1/2]
|
constexpr |
◆ make_static_distributed_tensor() [2/2]
|
constexpr |
◆ make_static_tile_distribution()
|
constexpr |
◆ make_tensor_adaptor_coordinate()
|
constexpr |
◆ make_tensor_coordinate()
|
constexpr |
◆ make_tensor_descriptor_from_adaptor()
|
constexpr |
◆ make_tensor_view()
|
constexpr |
◆ make_thread_buffer()
|
constexpr |
◆ make_tile_scatter_gather() [1/6]
|
constexpr |
◆ make_tile_scatter_gather() [2/6]
|
constexpr |
◆ make_tile_scatter_gather() [3/6]
|
constexpr |
◆ make_tile_scatter_gather() [4/6]
|
constexpr |
◆ make_tile_scatter_gather() [5/6]
|
constexpr |
◆ make_tile_scatter_gather() [6/6]
|
constexpr |
◆ make_tile_window() [1/7]
|
constexpr |
◆ make_tile_window() [2/7]
|
constexpr |
◆ make_tile_window() [3/7]
|
constexpr |
◆ make_tile_window() [4/7]
|
constexpr |
◆ make_tile_window() [5/7]
|
constexpr |
◆ make_tile_window() [6/7]
|
constexpr |
◆ make_tile_window() [7/7]
|
constexpr |
◆ make_tile_window_linear() [1/2]
|
constexpr |
◆ make_tile_window_linear() [2/2]
|
constexpr |
◆ make_tile_window_linear_raw() [1/2]
CK_TILE_DEVICE auto ck_tile::make_tile_window_linear_raw | ( | const TensorView_ & | tensor_view, |
const WindowLengths_ & | window_lengths, | ||
const multi_index< TensorView_::get_num_of_dimension()> & | origin, | ||
const StaticTileDistribution_ & | tile_distribution, | ||
LinearBottomDims_ | = {} |
||
) |
◆ make_tile_window_linear_raw() [2/2]
|
constexpr |
◆ make_tile_window_raw() [1/2]
CK_TILE_DEVICE auto ck_tile::make_tile_window_raw | ( | const TensorView_ & | tensor_view, |
const WindowLengths_ & | window_lengths, | ||
const multi_index< TensorView_::get_num_of_dimension()> & | origin, | ||
const StaticTileDistribution_ & | tile_distribution, | ||
number< NumCoord > | = {} |
||
) |
◆ make_tile_window_raw() [2/2]
|
constexpr |
◆ make_transposed_distr_encode()
|
constexpr |
◆ make_tuple()
|
constexpr |
◆ make_unmerge_transform()
|
constexpr |
◆ make_wave_buffer_resource()
CK_TILE_DEVICE int32x4_t ck_tile::make_wave_buffer_resource | ( | const void * | ptr, |
uint32_t | size = 0xffffffff |
||
) |
◆ make_xor_transform()
|
constexpr |
◆ make_zero_multi_index()
|
constexpr |
◆ makeTuple()
|
constexprnoexcept |
◆ max() [1/8]
|
constexpr |
◆ max() [2/8]
|
constexpr |
◆ max() [3/8]
|
constexpr |
◆ max() [4/8]
|
constexpr |
◆ max() [5/8]
|
constexpr |
◆ max() [6/8]
|
constexpr |
◆ max() [7/8]
|
constexpr |
◆ max() [8/8]
|
constexpr |
◆ merge_sequences()
|
constexpr |
◆ min() [1/8]
|
constexpr |
◆ min() [2/8]
|
constexpr |
◆ min() [3/8]
|
constexpr |
◆ min() [4/8]
|
constexpr |
◆ min() [5/8]
|
constexpr |
◆ min() [6/8]
|
constexpr |
◆ min() [7/8]
|
constexpr |
◆ min() [8/8]
|
constexpr |
◆ minus()
__host__ __device__ ck_tile::minus | ( | ) | -> minus< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ modify_sequence_elements_by_ids()
|
constexpr |
◆ moe_sorting_get_smem_row_col()
|
constexpr |
◆ moe_sorting_get_sub_token()
CK_TILE_HOST index_t ck_tile::moe_sorting_get_sub_token | ( | int | tokens_, |
int | num_experts_ | ||
) |
◆ moe_sorting_get_workspace_size()
CK_TILE_HOST index_t ck_tile::moe_sorting_get_workspace_size | ( | int | tokens_, |
int | num_experts_, | ||
int | topk_, | ||
int | dispatch_policy_ | ||
) |
◆ moe_sorting_is_oneshot()
CK_TILE_HOST bool ck_tile::moe_sorting_is_oneshot | ( | int | tokens_, |
int | num_experts_ | ||
) |
◆ moe_sorting_mp_get_workspace_size()
CK_TILE_HOST index_t ck_tile::moe_sorting_mp_get_workspace_size | ( | int | tokens_, |
int | num_experts_, | ||
int | topk_ | ||
) |
◆ move_tensor_adaptor_coordinate() [1/2]
|
constexpr |
◆ move_tensor_adaptor_coordinate() [2/2]
|
constexpr |
◆ move_tensor_coordinate()
|
constexpr |
◆ move_tile_window() [1/5]
CK_TILE_DEVICE void ck_tile::move_tile_window | ( | null_tile_window< WindowLengths > & | , |
const typename null_tile_window< WindowLengths >::BottomTensorIndex & | |||
) |
◆ move_tile_window() [2/5]
CK_TILE_DEVICE void ck_tile::move_tile_window | ( | tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ > & | window, |
const typename tile_window_linear< TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_ >::BottomTensorIndex & | step | ||
) |
◆ move_tile_window() [3/5]
CK_TILE_DEVICE void ck_tile::move_tile_window | ( | tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord > & | window, |
const typename tile_window_with_static_distribution< TensorView_, WindowLengths_, StaticTileDistribution_, NumCoord >::BottomTensorIndex & | step | ||
) |
◆ move_tile_window() [4/5]
CK_TILE_DEVICE void ck_tile::move_tile_window | ( | tile_window_with_static_lengths< TensorView_, WindowLengths_ > & | window, |
const typename tile_window_with_static_lengths< TensorView_, WindowLengths_ >::BottomTensorIndex & | step | ||
) |
◆ move_tile_window() [5/5]
CK_TILE_DEVICE void ck_tile::move_tile_window | ( | TileWindow_ & | window, |
const typename TileWindow_::BottomTensorIndex & | step | ||
) |
◆ multiplies()
__host__ __device__ ck_tile::multiplies | ( | ) | -> multiplies< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ naive_attention_fwd()
CK_TILE_HOST float ck_tile::naive_attention_fwd | ( | naive_attention_fwd_traits | t, |
naive_attention_fwd_args | a, | ||
ck_tile::stream_config | s | ||
) |
◆ naive_gemm_kernel()
__global__ void ck_tile::naive_gemm_kernel | ( | ADataType * | A, |
BDataType * | B, | ||
CDataType * | C, | ||
ck_tile::index_t | M, | ||
ck_tile::index_t | N, | ||
ck_tile::index_t | K, | ||
ck_tile::index_t | strideA, | ||
ck_tile::index_t | strideB, | ||
ck_tile::index_t | strideC | ||
) |
◆ neg() [1/2]
CK_TILE_HOST T ck_tile::neg | ( | T | x | ) |
◆ neg() [2/2]
CK_TILE_DEVICE T ck_tile::neg | ( | T | x | ) |
◆ neg< double >()
CK_TILE_DEVICE double ck_tile::neg< double > | ( | double | x | ) |
◆ neg< float >()
CK_TILE_DEVICE float ck_tile::neg< float > | ( | float | x | ) |
◆ neg< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::neg< fp16_t > | ( | fp16_t | x | ) |
◆ neg< int32_t >()
CK_TILE_DEVICE int32_t ck_tile::neg< int32_t > | ( | int32_t | x | ) |
◆ neg< int8_t >()
CK_TILE_DEVICE int8_t ck_tile::neg< int8_t > | ( | int8_t | x | ) |
◆ next_power_of_two() [1/3]
|
constexpr |
◆ next_power_of_two() [2/3]
|
constexpr |
◆ next_power_of_two() [3/3]
|
constexpr |
◆ operator!=() [1/3]
|
constexpr |
◆ operator!=() [2/3]
|
constexpr |
◆ operator!=() [3/3]
|
constexpr |
◆ operator%() [1/3]
|
constexpr |
◆ operator%() [2/3]
|
constexpr |
◆ operator%() [3/3]
|
constexpr |
◆ operator*() [1/10]
|
constexpr |
◆ operator*() [2/10]
|
constexpr |
◆ operator*() [3/10]
|
constexpr |
◆ operator*() [4/10]
|
constexpr |
◆ operator*() [5/10]
|
constexpr |
◆ operator*() [6/10]
|
constexpr |
◆ operator*() [7/10]
|
constexpr |
◆ operator*() [8/10]
|
constexpr |
◆ operator*() [9/10]
|
constexpr |
◆ operator*() [10/10]
|
constexpr |
◆ operator+() [1/6]
|
constexpr |
◆ operator+() [2/6]
|
constexpr |
◆ operator+() [3/6]
|
constexpr |
◆ operator+() [4/6]
|
constexpr |
◆ operator+() [5/6]
|
constexpr |
◆ operator+() [6/6]
|
constexpr |
◆ operator+=() [1/2]
|
constexpr |
◆ operator+=() [2/2]
|
constexpr |
◆ operator-() [1/6]
|
constexpr |
◆ operator-() [2/6]
|
constexpr |
◆ operator-() [3/6]
|
constexpr |
◆ operator-() [4/6]
|
constexpr |
◆ operator-() [5/6]
|
constexpr |
◆ operator-() [6/6]
|
constexpr |
◆ operator-=() [1/2]
|
constexpr |
◆ operator-=() [2/2]
|
constexpr |
◆ operator/() [1/4]
|
constexpr |
◆ operator/() [2/4]
|
constexpr |
◆ operator/() [3/4]
|
constexpr |
◆ operator/() [4/4]
|
constexpr |
◆ operator<<()
std::ostream& ck_tile::operator<< | ( | std::ostream & | os, |
const std::vector< T > & | v | ||
) |
Stream operator overload for vector output.
Provides a formatted string representation of a vector, useful for debugging and logging.
- Template Parameters
-
T Type of vector elements
- Parameters
-
os Output stream v Vector to output
- Returns
- Reference to the output stream
◆ operator==() [1/3]
|
constexpr |
◆ operator==() [2/3]
|
constexpr |
◆ operator==() [3/3]
|
constexpr |
◆ pad_tensor_view()
|
constexpr |
◆ pick_sequence_elements_by_ids()
|
constexpr |
◆ pick_sequence_elements_by_mask()
|
constexpr |
◆ pk_add_f16()
CK_TILE_DEVICE fp16x2_t ck_tile::pk_add_f16 | ( | const fp16x2_t & | x, |
const fp16x2_t & | y | ||
) |
◆ pk_fp4_to_bf16()
|
constexpr |
◆ pk_fp4_to_bf16x2()
|
constexpr |
◆ pk_fp4_to_float()
|
constexpr |
◆ pk_fp4_to_fp16()
|
constexpr |
◆ pk_fp4_to_fp16x2()
|
constexpr |
◆ pk_fp4_to_fp32x2()
|
constexpr |
◆ pk_int4_t_to_bfloat16x2_t()
CK_TILE_HOST_DEVICE bf16x2_t ck_tile::pk_int4_t_to_bfloat16x2_t | ( | const pk_int4_t & | x | ) |
◆ pk_int4_t_to_fp32x2_t()
CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t | ( | const pk_int4_t & | x | ) |
◆ pk_int4_t_to_fp32x2_t_signed_conversion()
CK_TILE_HOST_DEVICE fp32x2_t ck_tile::pk_int4_t_to_fp32x2_t_signed_conversion | ( | const pk_int4_t & | x | ) |
◆ pk_int4_t_to_halfx2_t()
CK_TILE_HOST_DEVICE fp16x2_t ck_tile::pk_int4_t_to_halfx2_t | ( | const pk_int4_t & | x | ) |
◆ pk_int4_t_to_int8x2_t()
CK_TILE_HOST_DEVICE int8x2_t ck_tile::pk_int4_t_to_int8x2_t | ( | const pk_int4_t & | x | ) |
◆ plus()
__host__ __device__ ck_tile::plus | ( | ) | -> plus< void, void > |
FIXME: create macro to replace 'host device' and nothing more.
◆ pow() [1/2]
CK_TILE_HOST T ck_tile::pow | ( | T | x, |
T | gamma | ||
) |
◆ pow() [2/2]
CK_TILE_DEVICE T ck_tile::pow | ( | T | x, |
T | gamma | ||
) |
◆ pow< double >()
CK_TILE_DEVICE double ck_tile::pow< double > | ( | double | x, |
double | gamma | ||
) |
◆ pow< float >()
CK_TILE_DEVICE float ck_tile::pow< float > | ( | float | x, |
float | gamma | ||
) |
◆ prefix_sum_sequence()
|
constexpr |
◆ preprocess_profiling_impl()
CK_TILE_HOST double ck_tile::preprocess_profiling_impl | ( | TimerType | timer, |
const stream_config & | s, | ||
PreprocessFunc | preprocess | ||
) |
◆ print() [1/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const buffer_view< BufferAddressSpace, T, BufferSizeType, InvalidElementUseNumericalZeroValue, Coherence > & | bv | ) |
◆ print() [2/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const char & | value | ) |
Specialization for char.
◆ print() [3/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const double & | value | ) |
Specialization for double.
◆ print() [4/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const float & | value | ) |
Specialization for float.
◆ print() [5/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const int & | value | ) |
Specialization for int.
◆ print() [6/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const long & | value | ) |
Specialization for long.
◆ print() [7/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const T & | ) |
Declare a ck_tile::print() interface that gets specialized in each header file for types that can be printed.
◆ print() [8/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const T(&) | value[N] | ) |
Specialization for array.
◆ print() [9/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const tile_distribution< PsYs2XsAdaptor_, Ys2DDescriptor_, StaticTileDistributionEncoding_, TileDistributionDetail_ > & | distribution | ) |
◆ print() [10/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ > & | encoding | ) |
◆ print() [11/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const TileDistributionEncodingPattern2D< BlockSize, YPerTile, XPerTile, VecSize, DistributionPattern, NumWaveGroups > & | ) |
◆ print() [12/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const tuple< T... > & | t | ) |
◆ print() [13/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const typename tile_distribution_encoding< RsLengths_, HsLengthss_, Ps2RHssMajor_, Ps2RHssMinor_, Ys2RHsMajor_, Ys2RHsMinor_ >::detail & | detail_obj | ) |
◆ print() [14/14]
CK_TILE_HOST_DEVICE void ck_tile::print | ( | const unsigned int & | value | ) |
Specialization for unsigned int.
◆ rcp() [1/2]
CK_TILE_HOST T ck_tile::rcp | ( | T | x | ) |
◆ rcp() [2/2]
CK_TILE_DEVICE T ck_tile::rcp | ( | T | x | ) |
◆ reduce_on_sequence()
|
constexpr |
◆ reference_batched_dropout()
CK_TILE_HOST void ck_tile::reference_batched_dropout | ( | HostTensor< DataType > & | in_out_b_m_n, |
const HostTensor< RandValOutputDataType > & | randval_b_m_n, | ||
const uint8_t & | p_undrop_in_uint8_t, | ||
const float | scale | ||
) |
◆ reference_batched_elementwise()
CK_TILE_HOST void ck_tile::reference_batched_elementwise | ( | const HostTensor< ADataType > & | a_b_m_n, |
const HostTensor< BDataType > & | b_b_m_n, | ||
HostTensor< CDataType > & | c_b_m_n, | ||
const AElementOp & | a_element_op = {} , |
||
const BElementOp & | b_element_op = {} , |
||
const BinaryElementOp & | binary_element_op = {} |
||
) |
◆ reference_batched_gemm()
CK_TILE_HOST void ck_tile::reference_batched_gemm | ( | const HostTensor< ADataType > & | a_b_m_k, |
const HostTensor< BDataType > & | b_b_n_k, | ||
HostTensor< CDataType > & | c_b_m_n, | ||
const AElementOp & | a_element_op = {} , |
||
const BElementOp & | b_element_op = {} , |
||
const ACCElementOp & | acc_element_op = {} |
||
) |
◆ reference_batched_gemm_gpu()
void ck_tile::reference_batched_gemm_gpu | ( | ADataType * | a_ptr, |
BDataType * | b_ptr, | ||
CDataType * | c_ptr, | ||
index_t | M, | ||
index_t | N, | ||
index_t | K, | ||
index_t | stride_a, | ||
index_t | stride_b, | ||
index_t | stride_c, | ||
index_t | batch_stride_A, | ||
index_t | batch_stride_B, | ||
index_t | batch_stride_C, | ||
index_t | batch_count | ||
) |
◆ reference_batched_masking()
CK_TILE_HOST void ck_tile::reference_batched_masking | ( | HostTensor< CDataType > & | c_b_m_n, |
const MaskingType & | mask | ||
) |
◆ reference_batched_rotary_position_embedding()
CK_TILE_HOST void ck_tile::reference_batched_rotary_position_embedding | ( | const HostTensor< DataType > & | input_bsd, |
const HostTensor< DataType > & | cos_sd, | ||
const HostTensor< DataType > & | sin_sd, | ||
bool | interleaved, | ||
HostTensor< DataType > & | output_bsd, | ||
bool | use_1_row_sin_cos = false |
||
) |
◆ reference_batched_softmax()
CK_TILE_HOST void ck_tile::reference_batched_softmax | ( | const HostTensor< ADataType > & | a_b_m_n, |
HostTensor< BDataType > & | b_b_m_n, | ||
const CompElementOp & | comp_element_op = {} , |
||
std::optional< std::reference_wrapper< HostTensor< CompDataType >>> | lse_b_m = std::nullopt |
||
) |
◆ reference_batched_transpose()
CK_TILE_HOST void ck_tile::reference_batched_transpose | ( | const HostTensor< Type > & | x, |
HostTensor< Type > & | y, | ||
std::string | layout_in = "NCHW" , |
||
std::string | layout_out = "NHWC" |
||
) |
◆ reference_binary_elementwise()
CK_TILE_HOST void ck_tile::reference_binary_elementwise | ( | const HostTensor< ADataType > & | a, |
const HostTensor< BDataType > & | b, | ||
HostTensor< CDataType > & | c, | ||
ElementOp | element_op | ||
) |
◆ reference_fused_moe()
void ck_tile::reference_fused_moe | ( | const ck_tile::HostTensor< ADataType > & | a_host, |
const ck_tile::HostTensor< GDataType > & | g_host, | ||
const ck_tile::HostTensor< DDataType > & | d_host, | ||
const ck_tile::HostTensor< AScaleDataType > & | sa_host, | ||
const ck_tile::HostTensor< GScaleDataType > & | sg_host, | ||
const ck_tile::HostTensor< DScaleDataType > & | sd_host, | ||
const ck_tile::HostTensor< YSmoothScaleDataType > & | sy_host, | ||
ck_tile::HostTensor< ODataType > & | o_host, | ||
const ck_tile::HostTensor< IndexDataType > & | sorted_token_ids_host, | ||
const ck_tile::HostTensor< TopkWeightDataType > & | sorted_weight_host, | ||
const ck_tile::HostTensor< IndexDataType > & | sorted_expert_ids_host, | ||
const ck_tile::HostTensor< IndexDataType > & | num_sorted_tiles_host, | ||
const ck_tile::HostTensor< IndexDataType > & | token_ids_host, | ||
ck_tile::index_t | block_m, | ||
ck_tile::index_t | tokens, | ||
ck_tile::index_t | experts, | ||
ck_tile::index_t | hidden_size, | ||
ck_tile::index_t | intermediate_size, | ||
ck_tile::index_t | topk, | ||
ck_tile::index_t | gate_only | ||
) |
◆ reference_gemm()
CK_TILE_HOST void ck_tile::reference_gemm | ( | const HostTensor< ADataType > & | a_m_k, |
const HostTensor< BDataType > & | b_k_n, | ||
HostTensor< CDataType > & | c_m_n, | ||
const AElementOp & | a_element_op = {} , |
||
const BElementOp & | b_element_op = {} , |
||
const ACCElementOp & | acc_element_op = {} |
||
) |
◆ reference_gemm_gpu()
void ck_tile::reference_gemm_gpu | ( | ADataType * | a_ptr, |
BDataType * | b_ptr, | ||
CDataType * | c_ptr, | ||
index_t | M, | ||
index_t | N, | ||
index_t | K, | ||
index_t | stride_a, | ||
index_t | stride_b, | ||
index_t | stride_c | ||
) |
◆ reference_gemm_multiple_d()
CK_TILE_HOST void ck_tile::reference_gemm_multiple_d | ( | const HostTensor< ADataType > & | a_m_k, |
const HostTensor< BDataType > & | b_k_n, | ||
const std::array< HostTensor< DDataType >, DsDataType::size()> & | ds_m_n, | ||
HostTensor< CDataType > & | c_m_n, | ||
const ACCElementOp & | acc_element_op = {} |
||
) |
◆ reference_gemm_quant()
CK_TILE_HOST void ck_tile::reference_gemm_quant | ( | const HostTensor< ADataType > & | a_m_k, |
const HostTensor< QDataType > & | q, | ||
const HostTensor< BDataType > & | b_k_n, | ||
HostTensor< CDataType > & | c_m_n, | ||
const AElementOp & | a_element_op = {} , |
||
const BElementOp & | b_element_op = {} , |
||
const ACCElementOp & | acc_element_op = {} |
||
) |
◆ reference_grouped_conv_bwd_data()
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_data | ( | HostTensor< InDataType > & | input, |
const HostTensor< WeiDataType > & | weight, | ||
const HostTensor< OutDataType > & | output, | ||
std::vector< ck_tile::long_index_t > | conv_strides, | ||
std::vector< ck_tile::long_index_t > | conv_dilations, | ||
std::vector< ck_tile::long_index_t > | in_left_pads, | ||
std::vector< ck_tile::long_index_t > | |||
) |
◆ reference_grouped_conv_bwd_weight()
CK_TILE_HOST void ck_tile::reference_grouped_conv_bwd_weight | ( | const HostTensor< InDataType > & | input, |
HostTensor< WeiDataType > & | weight, | ||
const HostTensor< OutDataType > & | output, | ||
std::vector< ck_tile::long_index_t > | conv_strides, | ||
std::vector< ck_tile::long_index_t > | conv_dilations, | ||
std::vector< ck_tile::long_index_t > | in_left_pads, | ||
std::vector< ck_tile::long_index_t > | |||
) |
◆ reference_grouped_conv_fwd()
CK_TILE_HOST void ck_tile::reference_grouped_conv_fwd | ( | const HostTensor< InDataType > & | input, |
const HostTensor< WeiDataType > & | weight, | ||
HostTensor< OutDataType > & | output, | ||
std::vector< ck_tile::long_index_t > | conv_strides, | ||
std::vector< ck_tile::long_index_t > | conv_dilations, | ||
std::vector< ck_tile::long_index_t > | in_left_pads, | ||
std::vector< ck_tile::long_index_t > | |||
) |
◆ reference_im2col()
CK_TILE_HOST void ck_tile::reference_im2col | ( | const HostTensor< InDataType > & | in_host, |
HostTensor< OutDataType > & | out_host, | ||
const ck_tile::conv::ConvParam & | conv_params | ||
) |
◆ reference_layernorm2d_fwd()
void ck_tile::reference_layernorm2d_fwd | ( | const HostTensor< XDataType > & | x_m_n, |
const HostTensor< GammaDataType > & | gamma_n, | ||
const HostTensor< BetaDataType > & | beta_n, | ||
HostTensor< YDataType > & | y_m_n, | ||
HostTensor< MeanDataType > & | mean_m, | ||
HostTensor< InvStdDataType > & | invStd_m, | ||
ComputeDataType | epsilon, | ||
Epilogue | epilogue_functor = {} |
||
) |
◆ reference_moe_sorting()
CK_TILE_HOST void ck_tile::reference_moe_sorting | ( | const HostTensor< IndexType > & | topk_ids, |
const HostTensor< WeightType > & | weights, | ||
const HostTensor< IndexType > & | local_expert_mask, | ||
HostTensor< IndexType > & | p_sorted_token_ids, | ||
HostTensor< WeightType > & | sorted_weight, | ||
HostTensor< IndexType > & | sorted_expert_ids, | ||
index_t & | unit_cnt, | ||
const index_t | experts, | ||
const index_t | unit_size, | ||
const index_t | tokens, | ||
bool | local_expert_masking, | ||
bool | skip_experts_with_zero_token = true |
||
) |
◆ reference_permute() [1/2]
CK_TILE_HOST void ck_tile::reference_permute | ( | const HostTensor< DataType > & | x, |
HostTensor< DataType > & | y, | ||
std::vector< index_t > | perm | ||
) |
◆ reference_permute() [2/2]
CK_TILE_HOST auto ck_tile::reference_permute | ( | const HostTensor< DataType > & | x, |
std::vector< index_t > | perm | ||
) |
◆ reference_reduce() [1/2]
CK_TILE_HOST void ck_tile::reference_reduce | ( | const HostTensor< XDataType > & | x_m_n, |
HostTensor< YDataType > & | y_m, | ||
ReduceOp | reduce_op | ||
) |
◆ reference_reduce() [2/2]
CK_TILE_HOST void ck_tile::reference_reduce | ( | const HostTensor< XDataType > & | x_tensor, |
HostTensor< YDataType > & | y_tensor, | ||
ReduceOp | reduce_op, | ||
KeptDim | kept_dim, | ||
ReduceDims | reduce_dims | ||
) |
◆ reference_rmsnorm2d_fwd()
void ck_tile::reference_rmsnorm2d_fwd | ( | const HostTensor< XDataType > & | x_m_n, |
const HostTensor< GammaDataType > & | gamma_n, | ||
HostTensor< YDataType > & | y_m_n, | ||
HostTensor< InvRmsDataType > & | invRms_m, | ||
HostTensor< UnquantYDataType > & | unquant_y_m_n, | ||
ComputeDataType | epsilon, | ||
Epilogue | epilogue_functor = {} |
||
) |
◆ reference_rowwise_quantization2d()
CK_TILE_HOST void ck_tile::reference_rowwise_quantization2d | ( | const HostTensor< XDataType > & | x_m_n, |
const HostTensor< ScaleDataType > & | scale_m, | ||
HostTensor< QXDataType > & | qx_m_n | ||
) |
◆ reference_softmax() [1/2]
CK_TILE_HOST void ck_tile::reference_softmax | ( | const HostTensor< InputType > & | x, |
HostTensor< OutputType > & | y, | ||
index_t | dim = -1 |
||
) |
◆ reference_softmax() [2/2]
CK_TILE_HOST auto ck_tile::reference_softmax | ( | const HostTensor< InputType > & | x, |
index_t | dim = -1 |
||
) |
◆ reference_topk() [1/2]
CK_TILE_HOST void ck_tile::reference_topk | ( | const HostTensor< DataType > & | x, |
HostTensor< DataType > & | y_values, | ||
HostTensor< IndexType > & | y_indices, | ||
index_t | k, | ||
index_t | dim = -1 , |
||
bool | largest = true , |
||
bool | sorted = true |
||
) |
◆ reference_topk() [2/2]
CK_TILE_HOST auto ck_tile::reference_topk | ( | const HostTensor< DataType > & | x, |
index_t | k, | ||
index_t | dim = -1 , |
||
bool | largest = true , |
||
bool | sorted = true |
||
) |
◆ reference_transpose_elementwise()
void ck_tile::reference_transpose_elementwise | ( | const HostTensor< ADataType > & | a, |
HostTensor< BDataType > & | b | ||
) |
◆ reference_unary_elementwise()
CK_TILE_HOST void ck_tile::reference_unary_elementwise | ( | const HostTensor< ADataType > & | a, |
HostTensor< BDataType > & | b, | ||
ElementOp | element_op | ||
) |
◆ report_error_stats()
CK_TILE_HOST void ck_tile::report_error_stats | ( | int | err_count, |
double | max_err, | ||
std::size_t | total_size | ||
) |
Report error statistics for numerical comparisons.
Outputs statistics about numerical comparison errors including count and maximum error.
- Parameters
-
err_count Number of errors found max_err Maximum error value encountered total_size Total number of elements compared
◆ reverse_exclusive_scan_sequence()
|
constexpr |
◆ reverse_inclusive_scan_sequence()
|
constexpr |
◆ reverse_slice_sequence()
|
constexpr |
◆ sad_u16()
CK_TILE_DEVICE uint16_t ck_tile::sad_u16 | ( | uint16_t | x, |
uint16_t | y, | ||
uint16_t | acc | ||
) |
◆ sad_u32()
CK_TILE_HOST uint32_t ck_tile::sad_u32 | ( | uint32_t | x, |
uint32_t | y, | ||
uint32_t | acc | ||
) |
TODO: replace inline asm when intrinsic is available
◆ scaled_type_convert()
|
constexpr |
◆ scales()
__host__ __device__ ck_tile::scales | ( | Scale | ) | -> scales< Scale > |
FIXME: create macro to replace 'host device' and nothing more.
◆ sequence_all_of()
|
constexpr |
◆ sequence_any_of()
|
constexpr |
◆ sequence_pop_back()
|
constexpr |
◆ sequence_pop_front()
|
constexpr |
◆ sequence_to_tuple_of_number()
|
constexpr |
◆ set_buffer_value()
__global__ void ck_tile::set_buffer_value | ( | T * | p, |
T | x, | ||
uint64_t | buffer_element_size | ||
) |
◆ set_container_subset() [1/2]
|
constexpr |
◆ set_container_subset() [2/2]
|
constexpr |
◆ set_slice_tile()
|
constexpr |
◆ set_tile() [1/4]
CK_TILE_DEVICE void ck_tile::set_tile | ( | DstrTensors & | dstr_tensor, |
const T & | value | ||
) |
◆ set_tile() [2/4]
CK_TILE_DEVICE void ck_tile::set_tile | ( | DstrTensors & | dstr_tensor, |
number< v > | , | ||
bool_constant< skip_subdword_opt > | = {} |
||
) |
◆ set_tile() [3/4]
CK_TILE_DEVICE void ck_tile::set_tile | ( | null_tensor & | , |
const T & | |||
) |
◆ set_tile() [4/4]
CK_TILE_DEVICE void ck_tile::set_tile | ( | null_tensor & | , |
number< v > | |||
) |
◆ set_tile_if()
CK_TILE_HOST_DEVICE void ck_tile::set_tile_if | ( | static_distributed_tensor< DataType, StaticTileDistribution > & | out_tensor, |
DataType | value, | ||
XIndicesPredicate | predicate | ||
) |
◆ shuffle_tile()
CK_TILE_DEVICE void ck_tile::shuffle_tile | ( | OutTensor & | out, |
const InTensor & | in | ||
) |
◆ sin() [1/2]
CK_TILE_HOST T ck_tile::sin | ( | T | x | ) |
◆ sin() [2/2]
CK_TILE_DEVICE T ck_tile::sin | ( | T | x | ) |
◆ sin< double >()
CK_TILE_DEVICE double ck_tile::sin< double > | ( | double | x | ) |
◆ sin< float >()
CK_TILE_DEVICE float ck_tile::sin< float > | ( | float | x | ) |
◆ sin< fp16_t >()
CK_TILE_DEVICE fp16_t ck_tile::sin< fp16_t > | ( | fp16_t | x | ) |
◆ sinh() [1/2]
CK_TILE_HOST T ck_tile::sinh | ( | T | x | ) |
◆ sinh() [2/2]
CK_TILE_DEVICE T ck_tile::sinh | ( | T | x | ) |
◆ sinh< double >()
CK_TILE_DEVICE double ck_tile::sinh< double > | ( | double | x | ) |
◆ sinh< float >()
CK_TILE_DEVICE float ck_tile::sinh< float > | ( | float | x | ) |
◆ slice_sequence()
|
constexpr |
◆ sqrt() [1/4]
CK_TILE_DEVICE bfloat16_t ck_tile::sqrt | ( | bfloat16_t | x | ) |
◆ sqrt() [2/4]
CK_TILE_DEVICE double ck_tile::sqrt | ( | double | x | ) |
◆ sqrt() [3/4]
CK_TILE_DEVICE float ck_tile::sqrt | ( | float | x | ) |
◆ sqrt() [4/4]
CK_TILE_DEVICE fp16_t ck_tile::sqrt | ( | fp16_t | x | ) |
◆ store_tile() [1/3]
CK_TILE_DEVICE void ck_tile::store_tile | ( | tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ store_tile() [2/3]
CK_TILE_DEVICE void ck_tile::store_tile | ( | tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ store_tile() [3/3]
CK_TILE_DEVICE void ck_tile::store_tile | ( | tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & | tile_window_tmp, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ store_tile_raw() [1/3]
CK_TILE_DEVICE void ck_tile::store_tile_raw | ( | tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ store_tile_raw() [2/3]
CK_TILE_DEVICE void ck_tile::store_tile_raw | ( | tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ store_tile_raw() [3/3]
CK_TILE_DEVICE void ck_tile::store_tile_raw | ( | tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & | tile_window_tmp, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ sweep_tile() [1/2]
|
constexpr |
◆ sweep_tile() [2/2]
|
constexpr |
◆ sweep_tile_span()
CK_TILE_DEVICE void ck_tile::sweep_tile_span | ( | TileDistributedSpan_ | , |
const F & | f | ||
) |
◆ sweep_tile_uspan()
CK_TILE_DEVICE void ck_tile::sweep_tile_uspan | ( | TileDistributedSpan_ | , |
const F & | f, | ||
Unpacks | = {} |
||
) |
◆ tan() [1/2]
CK_TILE_HOST T ck_tile::tan | ( | T | x | ) |
◆ tan() [2/2]
CK_TILE_DEVICE T ck_tile::tan | ( | T | x | ) |
◆ tan< double >()
CK_TILE_DEVICE double ck_tile::tan< double > | ( | double | x | ) |
◆ tan< float >()
CK_TILE_DEVICE float ck_tile::tan< float > | ( | float | x | ) |
◆ tanh() [1/2]
CK_TILE_HOST T ck_tile::tanh | ( | T | x | ) |
◆ tanh() [2/2]
CK_TILE_DEVICE T ck_tile::tanh | ( | T | x | ) |
◆ tanh< double >()
CK_TILE_DEVICE double ck_tile::tanh< double > | ( | double | x | ) |
◆ tanh< float >()
CK_TILE_DEVICE float ck_tile::tanh< float > | ( | float | x | ) |
◆ tanh_fast()
CK_TILE_DEVICE T ck_tile::tanh_fast | ( | T | x | ) |
◆ tanh_fast< float >()
CK_TILE_DEVICE float ck_tile::tanh_fast< float > | ( | float | x | ) |
◆ tie()
|
constexprnoexcept |
◆ tile_distribution_pattern_to_string()
|
constexpr |
◆ tile_elementwise_in() [1/2]
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in | ( | const InElementFunc & | , |
MaybeNullTensor && | ... | ||
) |
◆ tile_elementwise_in() [2/2]
CK_TILE_DEVICE auto ck_tile::tile_elementwise_in | ( | const InElementFunc & | in_element_func, |
const InTensor &... | in_dstr_tensors | ||
) |
◆ tile_elementwise_inout() [1/2]
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout | ( | const InOutElementFunc & | , |
MaybeNullTensor && | ... | ||
) |
◆ tile_elementwise_inout() [2/2]
CK_TILE_DEVICE void ck_tile::tile_elementwise_inout | ( | const InOutElementFunc & | inout_element_func, |
InOutDstrTensors &... | inout_dstr_tensors | ||
) |
◆ tile_elementwise_inout_unpack() [1/2]
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack | ( | const InElementFunc & | in_element_func, |
const Tuple & | t | ||
) |
Template function that "unpacks" a tuple and applies an element-wise operation.
- Parameters
-
in_element_func Function to apply element-wise. t Any container containing elements to process, with known size and tuple-like semantic.
- Returns
- Calls the overloaded function, passing an index sequence.
◆ tile_elementwise_inout_unpack() [2/2]
CK_TILE_DEVICE auto ck_tile::tile_elementwise_inout_unpack | ( | const InElementFunc & | in_element_func, |
const Tuple & | t, | ||
std::index_sequence< I... > | |||
) |
Template function that "unpacks" a tuple and applies an element-wise operation.
- Parameters
-
in_element_func Function to apply element-wise. t Any container containing elements to process, with known size and tuple-like semantic.
- Returns
- Calls tile_elementwise_inout with unpacked tuple elements.
◆ tile_sweeper()
CK_TILE_HOST_DEVICE_EXTERN ck_tile::tile_sweeper | ( | const T & | , |
const F & | , | ||
U | = {} |
||
) | -> tile_sweeper< T, F, U > |
◆ timing_loop_impl()
CK_TILE_HOST double ck_tile::timing_loop_impl | ( | TimerType | timer, |
const stream_config & | s, | ||
CallablesFunc && | callables_func, | ||
PreprocessFunc | preprocess = nullptr |
||
) |
◆ to_array() [1/2]
|
constexpr |
◆ to_array() [2/2]
|
constexpr |
◆ to_array_of_array()
|
constexpr |
◆ to_multi_index()
|
constexpr |
◆ to_sequence()
|
constexpr |
◆ transform_sequences() [1/3]
|
constexpr |
◆ transform_sequences() [2/3]
|
constexpr |
◆ transform_sequences() [3/3]
|
constexpr |
◆ transform_tensor_adaptor()
|
constexpr |
◆ transform_tensor_descriptor()
|
constexpr |
◆ transform_tensor_view()
|
constexpr |
◆ transform_tuples() [1/3]
|
constexpr |
◆ transform_tuples() [2/3]
|
constexpr |
◆ transform_tuples() [3/3]
|
constexpr |
◆ transpose_host_tensor_descriptor_given_new2old()
CK_TILE_HOST HostTensorDescriptor ck_tile::transpose_host_tensor_descriptor_given_new2old | ( | const HostTensorDescriptor & | a, |
const New2Old & | new2old | ||
) |
◆ transpose_tile2d()
CK_TILE_DEVICE void ck_tile::transpose_tile2d | ( | OutTensor & | out, |
const InTensor & | in | ||
) |
◆ tuple_depth() [1/2]
|
constexpr |
◆ tuple_depth() [2/2]
|
constexpr |
◆ tuple_reduce()
|
constexpr |
◆ tuple_reverse()
|
constexpr |
◆ type_convert()
|
constexpr |
◆ unpack()
|
constexpr |
◆ unpack2()
|
constexpr |
◆ unroll_nested_tuple() [1/3]
|
constexpr |
◆ unroll_nested_tuple() [2/3]
|
constexpr |
◆ unroll_nested_tuple() [3/3]
|
constexpr |
◆ update_tile() [1/2]
CK_TILE_DEVICE void ck_tile::update_tile | ( | tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} |
||
) |
◆ update_tile() [2/2]
CK_TILE_DEVICE void ck_tile::update_tile | ( | tile_window_with_static_lengths< BottomTensorView_, WindowLengths_ > & | tile_window_tmp, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor | ||
) |
◆ update_tile_raw() [1/2]
CK_TILE_DEVICE auto ck_tile::update_tile_raw | ( | tile_window_linear< BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_ > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ update_tile_raw() [2/2]
CK_TILE_DEVICE void ck_tile::update_tile_raw | ( | tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > & | tile_window, |
const static_distributed_tensor< DataType_, TileDistribution_ > & | dstr_tensor, | ||
number< i_access > | = {} , |
||
bool_constant< oob_conditional_check > | = {} , |
||
bool_constant< pre_nop > | = {} |
||
) |
◆ UpdateEnvVar() [1/2]
void ck_tile::UpdateEnvVar | ( | EnvVar | , |
const std::string_view & | val | ||
) |
◆ UpdateEnvVar() [2/2]
void ck_tile::UpdateEnvVar | ( | EnvVar | , |
const ValueType & | val | ||
) |
Updates the cached value of an environment variable.
◆ warp_shuffle()
CK_TILE_DEVICE T ck_tile::warp_shuffle | ( | const T & | v_local, |
uint32_t | src_lane | ||
) |
◆ warp_shuffle_down()
CK_TILE_DEVICE T ck_tile::warp_shuffle_down | ( | const T & | v_local, |
uint32_t | lane_delta | ||
) |
◆ warp_shuffle_down_pair()
CK_TILE_DEVICE auto ck_tile::warp_shuffle_down_pair | ( | const T & | v_local | ) |
◆ warp_shuffle_up()
CK_TILE_DEVICE T ck_tile::warp_shuffle_up | ( | const T & | v_local, |
uint32_t | lane_delta | ||
) |
◆ welford_update()
CK_TILE_DEVICE void ck_tile::welford_update | ( | T & | mean, |
T & | var, | ||
T | x, | ||
int | count, | ||
bool_constant< kFastFDiv > | = {} |
||
) |
Variable Documentation
◆ ALIBI
|
constexpr |
◆ AllConvertibleToStringView
|
inlineconstexpr |
◆ CUSTOM_MASK
|
constexpr |
◆ ERROR_DETAIL_LIMIT
|
constexpr |
Maximum number of error values to display when checking errors.
◆ has_wmma_traits_v
|
constexpr |
◆ ignore
|
inlineconstexpr |
◆ is_constant_v
|
inlineconstexpr |
◆ is_null_tile_window_v
|
constexpr |
◆ is_static_v
|
inlineconstexpr |
◆ is_tile_window_linear_v
|
inlineconstexpr |
Helper variable template to check if a type is a linear tile window.
Equivalent to is_tile_window_linear<T>::value
.
- Template Parameters
-
T The type to check.
◆ is_tile_window_with_static_distribution_v
|
inlineconstexpr |
Helper variable template to check if a type is a tile window with static distribution.
Equivalent to is_tile_window_with_static_distribution<T>::value
.
- Template Parameters
-
T The type to check.
◆ is_tile_window_with_static_lengths_v
|
inlineconstexpr |
Helper variable template to check if a type is a tile window with static lengths.
Equivalent to is_tile_window_with_static_lengths<T>::value
.
- Template Parameters
-
T The type to check.
◆ log2e_rcp_v
|
constexpr |
◆ log2e_v
|
constexpr |
◆ LOGITS_SOFT_CAP
|
constexpr |
◆ Right
ck_tile::Right |
◆ SLIDING_WINDOW
|
constexpr |