/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp File Reference

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp File Reference#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck/utility/amd_buffer_addressing.hpp File Reference
amd_buffer_addressing.hpp File Reference
#include "data_type.hpp"

Go to the source code of this file.

Classes

union  ck::BufferResource< T >
 

Namespaces

 ck
 

Enumerations

enum class  ck::AmdBufferCoherenceEnum {
  ck::DefaultCoherence = 0 ,
  ck::GLC = 1 ,
  ck::SLC = 2 ,
  ck::GLC_SLC = 3 ,
  ck::WAVE_NT0 = 0 ,
  ck::WAVE_NT1 = 2 ,
  ck::GROUP_NT0 = 1 ,
  ck::GROUP_NT1 = 3 ,
  ck::DEVICE_NT0 = 8 ,
  ck::DEVICE_NT1 = 10 ,
  ck::SYSTEM_NT0 = 9 ,
  ck::SYSTEM_NT1 = 11 ,
  ck::DefaultCoherence = 0 ,
  ck::GLC = 1 ,
  ck::SLC = 2 ,
  ck::GLC_SLC = 3 ,
  ck::WAVE_NT0 = 0 ,
  ck::WAVE_NT1 = 2 ,
  ck::GROUP_NT0 = 1 ,
  ck::GROUP_NT1 = 3 ,
  ck::DEVICE_NT0 = 8 ,
  ck::DEVICE_NT1 = 10 ,
  ck::SYSTEM_NT0 = 9 ,
  ck::SYSTEM_NT1 = 11
}
 

Functions

template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource (T *p_wave, index_t element_space_size)
 
template<typename T >
__device__ int32x4_t ck::make_wave_buffer_resource_with_default_range (T *p_wave)
 
__device__ int8_t ck::llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
 
__device__ int8x2_t ck::llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
 
__device__ int8x4_t ck::llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
 
__device__ bhalf_t ck::llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
 
__device__ bhalf2_t ck::llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
 
__device__ bhalf4_t ck::llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
 
__device__ int32_t ck::llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
 
__device__ int32x2_t ck::llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
 
__device__ int32x4_t ck::llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
 
__device__ half_t ck::llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
 
__device__ half2_t ck::llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
 
__device__ half4_t ck::llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
 
__device__ float ck::llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
 
__device__ float2_t ck::llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
 
__device__ float4_t ck::llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
 
__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
 
__device__ half2_t ck::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
 
__device__ int32_t ck::llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
 
__device__ float ck::llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
 
__device__ double ck::llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type ck::amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type ck::amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
 
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void ck::amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
 
template<typename T , index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
template<typename T , index_t N>
__device__ void ck::amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
 
__device__ void ck::llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
 
template<typename T , index_t NumElemsPerThread>
__device__ void ck::amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)