/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing_builtins.hpp File Reference

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing_builtins.hpp File Reference

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing_builtins.hpp File Reference
amd_buffer_addressing_builtins.hpp File Reference
#include "data_type.hpp"

Go to the source code of this file.

Classes

union  ck::BufferResource< T >
 

Namespaces

 ck
 

Enumerations

enum class  ck::AmdBufferCoherenceEnum {
  ck::DefaultCoherence = 0 ,
  ck::GLC = 1 ,
  ck::SLC = 2 ,
  ck::GLC_SLC = 3 ,
  ck::WAVE_NT0 = 0 ,
  ck::WAVE_NT1 = 2 ,
  ck::GROUP_NT0 = 1 ,
  ck::GROUP_NT1 = 3 ,
  ck::DEVICE_NT0 = 8 ,
  ck::DEVICE_NT1 = 10 ,
  ck::SYSTEM_NT0 = 9 ,
  ck::SYSTEM_NT1 = 11
}
 

Functions

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource_with_default_range (T *p_wave)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_new (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_with_default_range_new (T *p_wave)
 
__device__ half2_t ck::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
 
__device__ int32_t ck::llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
 
__device__ float ck::llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
 
__device__ double ck::llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type ck::amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type ck::amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void ck::amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
__device__ void ck::llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
 
template<typename T , index_t NumElemsPerThread>
__device__ void ck::amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)