11 __device__ 
auto amd_global_load_transpose_to_vgpr(
const T* in_ptr)
 
   13     using vector_t = 
typename vector_type<T, 8>::type;
 
   14     if constexpr(
sizeof(T) == 2)
 
   16         typedef __attribute__((__vector_size__(8 * 
sizeof(__fp16)))) __fp16 llvm_fp16x8_t;
 
   17         __attribute__((address_space(1))) llvm_fp16x8_t* glb_ptr =
 
   18             reinterpret_cast<__attribute__((address_space(1))) llvm_fp16x8_t*>(
 
   20         return 
bit_cast<vector_t>(__builtin_amdgcn_global_load_tr_b128_v8f16(glb_ptr));
 
   22     else if constexpr(sizeof(T) == 1)
 
   24         typedef __attribute__((__vector_size__(2 * 
sizeof(
int)))) 
int llvm_intx2_t;
 
   25         __attribute__((address_space(1))) llvm_intx2_t* glb_ptr =
 
   26             reinterpret_cast<__attribute__((address_space(1))) llvm_intx2_t*>(
 
   28         return 
bit_cast<vector_t>(__builtin_amdgcn_global_load_tr_b64_v2i32(glb_ptr));
 
   32         static_assert(
false, 
"not implemented");
 
__host__ constexpr __device__ Y bit_cast(const X &x)
Definition: type.hpp:306
 
_W64 unsigned int uintptr_t
Definition: stdint.h:164