template <typename ADataType,
          typename QDataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          uint32_t QuantGroupSize,
          bool aquant,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
                                       const HostTensor<QDataType>& q,
                                       const HostTensor<BDataType>& b_k_n,
                                       HostTensor<CDataType>& c_m_n,
                                       const AElementOp& a_element_op     = {},
                                       const BElementOp& b_element_op     = {},
                                       const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0, v_block_acc = 0;
        static_assert(std::is_same_v<ADataType, pk_int4_t> || std::is_same_v<ADataType, fp8_t> ||
                      std::is_same_v<ADataType, bf8_t>);
        static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t> ||
                      std::is_same_v<BDataType, pk_int4_t>);
        static_assert(std::is_same_v<AccDataType, float>);
        static_assert(std::is_same_v<CDataType, float> ||
                      std::is_same_v<CDataType, ck_tile::half_t>);
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;

            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                // unpack the two packed 4-bit values and pick the lane for this k
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                v_a = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }

            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                v_b = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else if constexpr(std::is_same_v<BDataType, fp8_t>)
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }
            else // bf8_t
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }

            v_block_acc += v_a * v_b;
            // at the end of each quantization group, apply that group's scale and
            // fold the partial sum into the final accumulator
            if((k + 1) % QuantGroupSize == 0)
            {
                float scale = 0.f;

                index_t outer_dim = (aquant) ? m : k / QuantGroupSize;
                index_t inner_dim = (aquant) ? k / QuantGroupSize : n;

                if constexpr(std::is_same_v<QDataType, float>)
                {
                    scale = q(outer_dim, inner_dim);
                }
                else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
                {
                    scale = fp8_to_float_raw(bit_cast<uint8_t>(q(outer_dim, inner_dim)));
                }
                else if constexpr(std::is_same_v<QDataType, ck_tile::bf8_t>)
                {
                    scale = bf8_to_float_raw(bit_cast<uint8_t>(q(outer_dim, inner_dim)));
                }
                else
                {
                    static_assert(false, "Unexpected Q datatype.");
                }

                v_block_acc *= scale;
                v_acc += v_block_acc;
                v_block_acc = 0;
            }
        }
        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
    std::cout << std::endl;
}
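// Usage sketch for reference_gemm_quant (illustrative only; the shapes, the
// 128-wide group size, and the HostTensor length-list constructor are
// assumptions, not taken from this header). With aquant == false the scale
// tensor q is indexed as q(k / QuantGroupSize, n), i.e. one float per
// (QuantGroupSize x 1) tile of B:
//
//   std::size_t M = 256, N = 256, K = 512;
//   ck_tile::HostTensor<ck_tile::fp8_t> a({M, K});
//   ck_tile::HostTensor<ck_tile::fp8_t> b({K, N});
//   ck_tile::HostTensor<float> q({K / 128, N});
//   ck_tile::HostTensor<float> c({M, N});
//   // ... fill a, b, q ...
//   ck_tile::reference_gemm_quant<ck_tile::fp8_t, float, ck_tile::fp8_t,
//                                 float, float, 128, /*aquant=*/false>(a, q, b, c);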
template <typename ADataType,
          typename AQDataType,
          typename BDataType,
          typename BQDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_rowcol_quant(const HostTensor<ADataType>& a_m_k,
                                              const HostTensor<AQDataType>& aq_m_1,
                                              const HostTensor<BDataType>& b_k_n,
                                              const HostTensor<BQDataType>& bq_1_n,
                                              HostTensor<CDataType>& c_m_n,
                                              const AElementOp& a_element_op     = {},
                                              const BElementOp& b_element_op     = {},
                                              const ACCElementOp& acc_element_op = {})
{
    static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
    static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
    static_assert(std::is_same_v<AccDataType, float>);
    static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
    static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;

        // one scale per row of A and one per column of B
        float a_scale = aq_m_1(m, 0);
        float b_scale = bq_1_n(0, n);
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;

            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                v_a = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }

            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                v_b = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else if constexpr(std::is_same_v<BDataType, fp8_t>)
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }
            else // bf8_t
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }

            v_acc += v_a * v_b;
        }
        // apply the row/column scales once, after the full K reduction
        v_acc = v_acc * a_scale * b_scale;

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
    std::cout << std::endl;
}
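// Usage sketch for reference_gemm_rowcol_quant (illustrative; the shapes and
// the HostTensor length-list constructor are assumptions). aq_m_1 carries one
// scale per row of A and bq_1_n one scale per column of B, both applied after
// the K reduction:
//
//   ck_tile::HostTensor<ck_tile::fp8_t> a({M, K});
//   ck_tile::HostTensor<float> aq({M, 1});
//   ck_tile::HostTensor<ck_tile::fp8_t> b({K, N});
//   ck_tile::HostTensor<float> bq({1, N});
//   ck_tile::HostTensor<ck_tile::half_t> c({M, N});
//   ck_tile::reference_gemm_rowcol_quant<ck_tile::fp8_t, float, ck_tile::fp8_t,
//                                        float, float, ck_tile::half_t>(a, aq, b, bq, c);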
template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
                                 const HostTensor<BDataType>& b_k_n,
                                 HostTensor<CDataType>& c_m_n,
                                 const AElementOp& a_element_op     = {},
                                 const BElementOp& b_element_op     = {},
                                 const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;

            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                // signed int4 -> float unpack, selecting the lane for this k
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
                v_a = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }

            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
                v_b = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }

            v_acc += v_a * v_b;
        }

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
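// Usage sketch for reference_gemm (illustrative; the shapes and the HostTensor
// length-list constructor are assumptions). The element ops default to
// pass-through, so a plain fp16 GEMM with float accumulation is just:
//
//   ck_tile::HostTensor<ck_tile::half_t> a({M, K});
//   ck_tile::HostTensor<ck_tile::half_t> b({K, N});
//   ck_tile::HostTensor<ck_tile::half_t> c({M, N});
//   ck_tile::reference_gemm<ck_tile::half_t, ck_tile::half_t, float,
//                           ck_tile::half_t>(a, b, c);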
template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename ACCElementOp,
          typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_d(const HostTensor<ADataType>& a_m_k,
                          const HostTensor<BDataType>& b_k_n,
                          const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
                          HostTensor<CDataType>& c_m_n,
                          const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mk_kn_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;

        for(std::size_t k = 0; k < K; ++k)
        {
            ADataType v_a = a_m_k(m, k);
            BDataType v_b = b_k_n(k, n);

            v_acc +=
                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
        }
        float v_c = 0;

        // fuse the extra D tensors through the user-provided epilogue
        if constexpr(DsDataType::size() == 0)
        {
            acc_element_op(v_c, ck_tile::type_convert<float>(v_acc));
        }
        else if constexpr(DsDataType::size() == 1)
        {
            acc_element_op(v_c,
                           ck_tile::type_convert<float>(v_acc),
                           ck_tile::type_convert<float>(ds_m_n[0](m, n)));
        }
        else if constexpr(DsDataType::size() == 2)
        {
            acc_element_op(v_c,
                           ck_tile::type_convert<float>(v_acc),
                           ck_tile::type_convert<float>(ds_m_n[0](m, n)),
                           ck_tile::type_convert<float>(ds_m_n[1](m, n)));
        }

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
    };

    make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
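// Usage sketch for reference_gemm_multiple_d (illustrative). With one D tensor
// the epilogue is called as acc_element_op(v_c, acc, d0); a bias-add could look
// like the sketch below, where passing ck_tile::tuple<float> as DsDataType (a
// type list exposing a static size()) is an assumption:
//
//   auto bias_add = [](float& c, float acc, float d0) { c = acc + d0; };
//   std::array<ck_tile::HostTensor<float>, 1> ds = {ck_tile::HostTensor<float>({M, N})};
//   reference_gemm_multiple_d<ck_tile::half_t, ck_tile::half_t,
//                             ck_tile::tuple<float>, float, ck_tile::half_t,
//                             decltype(bias_add)>(a, b, ds, c, bias_add);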
template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
__global__ void naive_gemm_kernel(ADataType* A,
                                  BDataType* B,
                                  CDataType* C,
                                  ck_tile::index_t M,
                                  ck_tile::index_t N,
                                  ck_tile::index_t K,
                                  ck_tile::index_t strideA,
                                  ck_tile::index_t strideB,
                                  ck_tile::index_t strideC)
{
    // one thread per output element: flatten the (row, col) grid onto threads
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row = idx / N;
    int col = idx % N;

    if(row < M && col < N)
    {
        AccDataType acc = 0.0;

        for(int k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;

            // map the logical (row, k) / (k, col) coordinates to linear offsets
            int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
                              ? row * strideA + k
                              : k * strideA + row;
            int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
                              ? col * strideB + k
                              : k * strideB + col;

            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index]);
                v_a = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
            }

            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index]);
                v_b = (k % 2 == 0) ? fp32_val.x : fp32_val.y;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
            }

            acc += v_a * v_b;
        }
        int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
                          ? row * strideC + col
                          : col * strideC + row;

        C[c_index] = ck_tile::type_convert<CDataType>(acc);
    }
}
template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
void reference_gemm_gpu(ADataType* a_ptr,
                        BDataType* b_ptr,
                        CDataType* c_ptr,
                        index_t M,
                        index_t N,
                        index_t K,
                        index_t stride_a,
                        index_t stride_b,
                        index_t stride_c)
{
    // one thread per C element
    int totalElements      = M * N;
    int numThreadsPerBlock = 256;
    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
        <<<numBlocks, numThreadsPerBlock>>>(
            a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c);
}
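// Usage sketch for reference_gemm_gpu (illustrative; device-buffer management
// via the plain HIP runtime API, which this header itself does not wrap). For
// row-major A, column-major B, and row-major C, the leading strides below are
// K, K, and N respectively:
//
//   ck_tile::half_t *d_a, *d_b, *d_c;
//   hipMalloc(&d_a, M * K * sizeof(ck_tile::half_t));
//   hipMalloc(&d_b, K * N * sizeof(ck_tile::half_t));
//   hipMalloc(&d_c, M * N * sizeof(ck_tile::half_t));
//   // ... copy A and B to the device with hipMemcpy ...
//   reference_gemm_gpu<ck_tile::half_t, ck_tile::half_t, float, ck_tile::half_t,
//                      tensor_layout::gemm::RowMajor, tensor_layout::gemm::ColumnMajor,
//                      tensor_layout::gemm::RowMajor>(d_a, d_b, d_c, M, N, K, K, K, N);
//   hipDeviceSynchronize();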
template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
void reference_batched_gemm_gpu(ADataType* a_ptr,
                                BDataType* b_ptr,
                                CDataType* c_ptr,
                                index_t M,
                                index_t N,
                                index_t K,
                                index_t stride_a,
                                index_t stride_b,
                                index_t stride_c,
                                index_t batch_stride_A,
                                index_t batch_stride_B,
                                index_t batch_stride_C,
                                index_t batch_count)
{
    int totalElements      = M * N;
    int numThreadsPerBlock = 256;
    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    // launch one naive GEMM per batch, offsetting each operand by its batch stride
    for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
    {
        ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A;
        BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B;
        CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C;

        naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
            <<<numBlocks, numThreadsPerBlock>>>(
                d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
    }
}
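// Usage sketch for reference_batched_gemm_gpu (illustrative). For densely
// packed batches, the batch strides are simply the per-matrix element counts:
//
//   reference_batched_gemm_gpu<ADataType, BDataType, float, CDataType,
//                              LayoutA, LayoutB, LayoutC>(
//       d_a, d_b, d_c, M, N, K, stride_a, stride_b, stride_c,
//       /*batch_stride_A=*/M * K, /*batch_stride_B=*/K * N, /*batch_stride_C=*/M * N,
//       batch_count);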