Composable Kernel: include/ck/tensor_operation/gpu/element/element_wise_operation.hpp Source File

element_wise_operation.hpp
1 // SPDX-License-Identifier: MIT
2 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3 
4 #pragma once
5 
7 #include "ck/utility/math_v2.hpp"
11 
12 namespace ck {
13 namespace tensor_operation {
14 namespace element_wise {
15 
16 // Need to ensure the compiler will fail if there is no matching candidate, instead of
17 // silently doing an implicit type conversion
18 //
19 // Example:
20 //
21 // struct ExampleElementwiseOp
22 // {
23 // template<typename Y, typename X>
24 // __host__ __device__ constexpr void
25 // operator()(Y&, const X) const;
26 //
27 // template<>
28 // __host__ __device__ constexpr void
29 // operator()<half_t, half_t>(half_t& y, const half_t& x) const
30 // {
31 // }
32 // };
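// Illustrative note (not part of the original header): with this pattern, a call whose
// type combination has no explicit specialization, e.g.
//
//   ExampleElementwiseOp{}.template operator()<float, half_t>(y_f32, x_f16);
//
// resolves to the declared-but-undefined primary template and the build fails, rather
// than the arguments being silently converted to match some other overload.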
33 
34 struct AddReluAdd
35 {
36  template <typename Y, typename X0, typename X1, typename X2>
37  __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
38 
39  template <>
40  __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
41  half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
42  {
43  half_t a = x0 + x1;
44  half_t b = a > 0 ? a : 0;
45  y = b + x2;
46  }
47 
48  template <>
49  __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
50  const float& x0,
51  const float& x1,
52  const float& x2) const
53  {
54  float a = x0 + x1;
55  float b = a > 0 ? a : 0;
56  float c = b + x2;
57  y = c;
58  }
59 
60  template <>
61  __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
62  half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
63  {
64  float a = x0 + x1;
65  float b = a > 0 ? a : 0;
66  float c = b + x2;
67  y = c;
68  }
69 
70  template <>
71  __host__ __device__ constexpr void operator()<bhalf_t, float, bhalf_t, bhalf_t>(
72  bhalf_t& y, const float& x0, const bhalf_t& x1, const bhalf_t& x2) const
73  {
74  float a = x0 + x1;
75  float b = a > 0 ? a : 0;
76  float c = b + x2;
77  y = c;
78  }
79 
80  template <>
81  __host__ __device__ constexpr void operator()<int8_t, int8_t, int8_t, int8_t>(
82  int8_t& y, const int8_t& x0, const int8_t& x1, const int8_t& x2) const
83  {
84  int32_t a = x0 + x1;
85  int32_t b = a > 0 ? a : 0;
86  int32_t c = b + x2;
87  y = c;
88  }
89 
90 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
91  template <>
92  __host__ __device__ constexpr void operator()<int4_t, int8_t, int4_t, int4_t>(
93  int4_t& y, const int8_t& x0, const int4_t& x1, const int4_t& x2) const
94  {
95  int32_t a = x0 + x1;
96  int32_t b = a > 0 ? a : 0;
97  int32_t c = b + x2;
98  y = c;
99  }
100 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
101 };
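// Illustrative usage sketch (not part of the original header): each specialization of
// AddReluAdd computes y = relu(x0 + x1) + x2, e.g.
//
//   AddReluAdd op;
//   float y = 0.f;
//   op(y, 1.0f, -2.0f, 0.5f); // a = -1, b = relu(a) = 0, y = 0.5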
102 
103 struct AddHardswishAdd
104 {
105  template <typename Y, typename X0, typename X1, typename X2>
106  __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
107 
108  template <>
109  __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
110  const float& x0,
111  const float& x1,
112  const float& x2) const
113  {
114  float a = x0 + x1;
115  float b = a + float{3};
116  float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
117  float d = c + x2;
118  y = d;
119  }
120 
121  template <>
122  __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
123  half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
124  {
125  float a = x0 + x1;
126  float b = a + float{3};
127  float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
128  float d = c + x2;
129  y = d;
130  }
131 };
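// Illustrative note (not part of the original header): with a = x0 + x1 and b = a + 3,
// the expression (b > 0) * min(b, 6) * a * 0.166667 equals hardswish(a) =
// a * clamp(a + 3, 0, 6) / 6, so this functor computes y ≈ hardswish(x0 + x1) + x2.
// For example, x0 + x1 = 1 gives b = 4 and c = 1 * 4 * 1/6 ≈ 0.667.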
132 
133 // C = A * B
134 // E = C + D0 + D1
135 struct AddAdd
136 {
137  template <typename E, typename C, typename D0, typename D1>
138  __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const
139  {
140  // Only support floating so far
141  static_assert(is_same<E, half_t>::value || is_same<E, float>::value ||
142  is_same<E, double>::value || is_same<E, bhalf_t>::value,
143  "Data type is not supported by this operation!");
144 
145  static_assert(is_same<C, half_t>::value || is_same<C, float>::value ||
146  is_same<C, double>::value || is_same<C, bhalf_t>::value,
147  "Data type is not supported by this operation!");
148 
149  static_assert(is_same<D0, half_t>::value || is_same<D0, float>::value ||
150  is_same<D0, double>::value || is_same<D0, bhalf_t>::value,
151  "Data type is not supported by this operation!");
152 
153  static_assert(is_same<D1, half_t>::value || is_same<D1, float>::value ||
154  is_same<D1, double>::value || is_same<D1, bhalf_t>::value,
155  "Data type is not supported by this operation!");
156 
157  const C y = c + type_convert<C>(d0) + type_convert<C>(d1);
158  e = type_convert<E>(y);
159  }
160 };
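// Illustrative note (not part of the original header): AddAdd accumulates in the C type;
// with C = float and D0 = D1 = half_t, d0 and d1 are converted to float via type_convert,
// added to c, and the sum is converted to the output type E.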
161 
162 // C = A * B
163 // E = (C + D0) x D1
164 struct AddMultiply
165 {
166  template <typename E, typename C, typename D0, typename D1>
167  __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
168 
169  template <>
170  __host__ __device__ void operator()<half_t, half_t, half_t, half_t>(half_t& e,
171  const half_t& c,
172  const half_t& d0,
173  const half_t& d1) const
174  {
175  const half_t y = (c + d0) * d1;
176  e = y;
177  }
178  template <>
179  __host__ __device__ void operator()<half_t, float, half_t, half_t>(half_t& e,
180  const float& c,
181  const half_t& d0,
182  const half_t& d1) const
183  {
184  const half_t y = (type_convert<half_t>(c) + d0) * d1;
185  e = y;
186  }
187  template <>
188  __host__ __device__ void operator()<float, float, half_t, half_t>(float& e,
189  const float& c,
190  const half_t& d0,
191  const half_t& d1) const
192  {
193  const float y = (c + d0) * d1;
194  e = y;
195  }
196 };
197 
198 // C = A * B
199 // E = C x D0 + D1
200 struct MultiplyAdd
201 {
202  template <typename E, typename C, typename D0, typename D1>
203  __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
204 
205  template <>
206  __host__ __device__ void operator()<half_t, half_t, half_t, half_t>(half_t& e,
207  const half_t& c,
208  const half_t& d0,
209  const half_t& d1) const
210  {
211  const half_t y = (c * d0) + d1;
212  e = y;
213  }
214  template <>
215  __host__ __device__ void operator()<half_t, float, half_t, half_t>(half_t& e,
216  const float& c,
217  const half_t& d0,
218  const half_t& d1) const
219  {
220  const half_t y = type_convert<half_t>(c) * d0 + d1;
221  e = y;
222  }
223  template <>
224  __host__ __device__ void operator()<bhalf_t, float, bhalf_t, bhalf_t>(bhalf_t& e,
225  const float& c,
226  const bhalf_t& d0,
227  const bhalf_t& d1) const
228  {
229  const bhalf_t y = type_convert<bhalf_t>(c) * d0 + d1;
230  e = y;
231  }
232  template <>
233  __host__ __device__ void operator()<float, float, half_t, half_t>(float& e,
234  const float& c,
235  const half_t& d0,
236  const half_t& d1) const
237  {
238  const float y = c * d0 + d1;
239  e = y;
240  }
241  template <>
242  __host__ __device__ void operator()<half_t, float, float, float>(half_t& e,
243  const float& c,
244  const float& d0,
245  const float& d1) const
246  {
247  const float y = c * d0 + d1;
248  e = y;
249  }
250 };
251 
252 struct MultiplyMultiply
253 {
254  template <typename E, typename C, typename D0, typename D1>
255  __host__ __device__ constexpr void
256  operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
257 
258  template <>
259  __host__ __device__ constexpr void operator()<ck::half_t, float, float, float>(
260  ck::half_t& e, const float& c, const float& d0, const float& d1) const
261  {
262  const float x0_f = c * d0 * d1;
263 
264  e = ck::type_convert<ck::half_t>(x0_f);
265  }
266 
267  template <>
268  __host__ __device__ constexpr void operator()<ck::bhalf_t, float, float, float>(
269  ck::bhalf_t& e, const float& c, const float& d0, const float& d1) const
270  {
271  const float x0_f = c * d0 * d1;
272 
273  e = ck::type_convert<ck::bhalf_t>(x0_f);
274  }
275 
276  template <>
277  __host__ __device__ constexpr void operator()<ck::half_t, int, ck::half_t, ck::half_t>(
278  ck::half_t& e, const int& c, const ck::half_t& d0, const ck::half_t& d1) const
279  {
280  const float x0_f =
281  ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
282 
283  e = ck::type_convert<ck::half_t>(x0_f);
284  }
285 
286  template <>
287  __host__ __device__ constexpr void operator()<ck::half_t, int, float, float>(
288  ck::half_t& e, const int& c, const float& d0, const float& d1) const
289  {
290  const float x0_f =
291  ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
292 
293  e = ck::type_convert<ck::half_t>(x0_f);
294  }
295 
296  template <>
297  __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
298  ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
299  {
300  const float x0_f =
301  ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
302 
303  e = ck::type_convert<ck::bhalf_t>(x0_f);
304  }
305 };
306 
307 struct MultiplyAddFastGelu
308 {
309  template <typename E, typename C, typename D0, typename D1>
310  __host__ __device__ constexpr void
311  operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
312 
313  template <>
314  __host__ __device__ constexpr void operator()<ck::bhalf_t, float, ck::bhalf_t, ck::bhalf_t>(
315  ck::bhalf_t& e, const float& c, const ck::bhalf_t& d0, const ck::bhalf_t& d1) const
316  {
317  const float x0_f = c * ck::type_convert<float>(d0) + ck::type_convert<float>(d1);
318 
319  float x1_f = 0;
320 
321  FastGelu{}.template operator()<float, float>(x1_f, x0_f);
322 
323  e = ck::type_convert<ck::bhalf_t>(x1_f);
324  }
325 };
326 
327 // E = FastGelu(C + D0 + D1)
328 struct AddAddFastGelu
329 {
330  template <typename E, typename C, typename D0, typename D1>
331  __host__ __device__ constexpr void
332  operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
333 
334  template <>
335  __host__ __device__ constexpr void operator()<float, float, float, float>(float& e,
336  const float& c,
337  const float& d0,
338  const float& d1) const
339  {
340  const float x = c + d0 + d1;
341 
342  FastGelu{}.template operator()<float, float>(e, x);
343  }
344 
345  template <>
346  __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
347  half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
348  {
349  const half_t x = c + d0 + d1;
350 
351  ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
352  }
353 
354  template <>
355  __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
356  half_t& e, const float& c, const half_t& d0, const half_t& d1) const
357  {
358  const float x0_f = c + d0 + d1;
359 
360  float x1_f = 0;
361 
362  ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
363  x0_f);
364 
365  e = type_convert<half_t>(x1_f);
366  }
367 
368  template <>
369  __host__ __device__ constexpr void operator()<bhalf_t, float, bhalf_t, bhalf_t>(
370  bhalf_t& e, const float& c, const bhalf_t& d0, const bhalf_t& d1) const
371  {
372  const float x0_f = c + type_convert<float>(d0) + type_convert<float>(d1);
373 
374  float x1_f = 0;
375 
376  ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
377  x0_f);
378 
379  e = type_convert<bhalf_t>(x1_f);
380  }
381 
382  template <>
383  __host__ __device__ constexpr void operator()<int8_t, int32_t, int8_t, int8_t>(
384  int8_t& e, const int32_t& c, const int8_t& d0, const int8_t& d1) const
385  {
386  const float x0_f =
387  type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1);
388 
389  float x1_f = 0;
390 
391  ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
392  x0_f);
393 
394  e = type_convert<int8_t>(x1_f);
395  }
396 };
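// Illustrative usage sketch (not part of the original header): AddAddFastGelu forms
// x = c + d0 + d1 (accumulating in float for the mixed-precision overloads) and then
// applies FastGelu, e.g.
//
//   AddAddFastGelu op;
//   float e = 0.f;
//   op(e, 1.0f, 0.5f, -0.25f); // e = FastGelu(1.25f)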
397 
398 // E = Relu(alpha1 * C + alpha2 * D0 + D1)
399 struct ScaleAddScaleAddRelu
400 {
401 
402  ScaleAddScaleAddRelu(const float alpha1 = 1.f, const float alpha2 = 1.f)
403  : alpha1_(alpha1), alpha2_(alpha2)
404  {
405  }
406 
407  template <typename E, typename C, typename D0, typename D1>
408  __host__ __device__ constexpr void
409  operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
410 
411  template <>
412  __host__ __device__ constexpr void operator()<float, float, float, float>(float& e,
413  const float& c,
414  const float& d0,
415  const float& d1) const
416  {
417  const float x = c * alpha1_ + alpha2_ * d0 + d1;
418  e = x > 0 ? x : 0;
419  }
420 
421  template <>
422  __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
423  half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
424  {
425  const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
426  type_convert<float>(d1);
427 
428  float result = 0;
429  result = x > 0 ? x : 0;
430 
431  e = type_convert<half_t>(result);
432  }
433 
434  template <>
435  __host__ __device__ constexpr void operator()<bhalf_t, bhalf_t, bhalf_t, bhalf_t>(
436  bhalf_t& e, const bhalf_t& c, const bhalf_t& d0, const bhalf_t& d1) const
437  {
438  const float x = type_convert<float>(c) * alpha1_ + alpha2_ * type_convert<float>(d0) +
439  type_convert<float>(d1);
440 
441  float result = 0;
442  result = x > 0 ? x : 0;
443 
444  e = type_convert<bhalf_t>(result);
445  }
446 
447  template <>
448  __host__ __device__ constexpr void operator()<int8_t, int8_t, float, float>(
449  int8_t& e, const int8_t& c, const float& d0, const float& d1) const
450  {
451  const float x = type_convert<float>(c) * alpha1_ + alpha2_ * d0 + d1;
452 
453  float result = 0;
454  result = x > 0 ? x : 0;
455 
456  e = type_convert<int8_t>(result);
457  }
458 
459  const float alpha1_;
460  const float alpha2_;
461 };
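// Illustrative usage sketch (not part of the original header): alpha1_ and alpha2_ are
// fixed at construction, so E = Relu(alpha1 * C + alpha2 * D0 + D1) is evaluated as, e.g.
//
//   ScaleAddScaleAddRelu op{2.f, 0.5f};
//   float e = 0.f;
//   op(e, 1.f, -4.f, 1.f); // x = 2*1 + 0.5*(-4) + 1 = 1, e = 1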
462 
463 struct Normalize
464 {
465  // FIXME: is double absolutely necessary?
466  Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {}
467 
468  template <typename T1, typename T2, typename T3>
469  __host__ __device__ constexpr void operator()(T1& y,
470  const T1& x,
471  const T2& mean,
472  const T2& mean_square,
473  const T3& gamma,
474  const T3& beta) const;
475 
476  template <>
477  __host__ __device__ constexpr void operator()<half_t, float, half_t>(half_t& y,
478  const half_t& x,
479  const float& mean,
480  const float& mean_square,
481  const half_t& gamma,
482  const half_t& beta) const
483  {
484  using ck::math::sqrt;
485 
486  float variance = mean_square - (mean * mean);
487 
488  float tmp_x = type_convert<float>(x);
489  float tmp_gamma = type_convert<float>(gamma);
490  float tmp_beta = type_convert<float>(beta);
491 
492  float tmp_y =
493  ((tmp_x - mean) / sqrt(variance + type_convert<float>(epsilon_))) * tmp_gamma +
494  tmp_beta;
495 
496  y = type_convert<half_t>(tmp_y);
497  };
498 
499  template <>
500  __host__ __device__ constexpr void operator()<float, float, float>(float& y,
501  const float& x,
502  const float& mean,
503  const float& mean_square,
504  const float& gamma,
505  const float& beta) const
506  {
507  using ck::math::sqrt;
508 
509  float variance = mean_square - (mean * mean);
510  y = ((x - mean) / sqrt(variance + type_convert<float>(epsilon_))) * gamma + beta;
511  };
512 
513  template <>
514  __host__ __device__ constexpr void operator()<double, double, double>(double& y,
515  const double& x,
516  const double& mean,
517  const double& mean_square,
518  const double& gamma,
519  const double& beta) const
520  {
521  using ck::math::sqrt;
522 
523  double variance = mean_square - (mean * mean);
524  y = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta;
525  };
526 
527  // FIXME: is double absolutely necessary?
528  double epsilon_;
529 };
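// Illustrative note (not part of the original header): Normalize is given the mean and
// the mean of squares, so the variance is recovered as Var[x] = E[x^2] - (E[x])^2 before
// applying y = (x - mean) / sqrt(variance + epsilon) * gamma + beta.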
530 
531 // used by BatchNorm inference
532 // y = gamma * (x-mean) / sqrt(epsilon+variance) + beta
533 // The data type of mean and variance is used as AccDataType
534 struct NormalizeInInfer
535 {
536  NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
537 
538  template <typename T1, typename T2, typename T3, typename T4>
539  __host__ __device__ constexpr void operator()(T1& y,
540  const T1& x,
541  const T2& mean,
542  const T2& variance,
543  const T3& gamma,
544  const T4& beta) const
545  {
546  static_assert(is_same<T2, float>::value || is_same<T2, double>::value,
547  "Data type is not supported by this operation!");
548 
549  using ck::type_convert;
550  using ck::math::sqrt;
551 
552  T2 tmp_x, tmp_y;
553 
554  tmp_x = type_convert<T2>(x);
555 
556  tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
557  type_convert<T2>(gamma) +
558  type_convert<T2>(beta);
559  y = type_convert<T1>(tmp_y);
560  };
561 
562  double epsilon_;
563 };
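// Illustrative usage sketch (not part of the original header): the accumulation type
// follows T2, the type of mean and variance, e.g.
//
//   NormalizeInInfer op{1e-5};
//   half_t y;
//   half_t x = half_t(2.f);
//   op(y, x, /*mean=*/1.f, /*variance=*/4.f, /*gamma=*/half_t(1.f), /*beta=*/half_t(0.f));
//   // y ≈ (2 - 1) / sqrt(4 + 1e-5) ≈ 0.5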
564 
565 // used by Conv+Bias+BatchNorm+Clamp inference
566 struct BiasNormalizeInInferClamp
567 {
568  BiasNormalizeInInferClamp(float floor = 0.f,
569  float ceil = NumericLimits<float>::Max(),
570  float epsilon = 1e-4)
571  : clamp_(floor, ceil), epsilon_(epsilon)
572  {
573  }
574 
575  template <typename T>
576  __host__ __device__ constexpr void operator()(T& y,
577  const T& x,
578  const T& bias,
579  const T& mean,
580  const T& variance,
581  const T& gamma,
582  const T& beta) const
583  {
584  using ck::type_convert;
585  using ck::math::sqrt;
586 
587  float tmp_x = type_convert<float>(x) + type_convert<float>(bias);
588 
589  float tmp_y =
590  ((tmp_x - type_convert<float>(mean)) / sqrt(type_convert<float>(variance) + epsilon_)) *
591  type_convert<float>(gamma) +
592  type_convert<float>(beta);
593  clamp_(tmp_y, tmp_y);
594  y = type_convert<T>(tmp_y);
595  };
596 
597  template <>
598  __host__ __device__ constexpr void operator()(float& y,
599  const float& x,
600  const float& bias,
601  const float& mean,
602  const float& variance,
603  const float& gamma,
604  const float& beta) const
605  {
606  using ck::type_convert;
607  using ck::math::sqrt;
608 
609  float tmp_y = (((x + bias) - mean) / sqrt(variance + epsilon_)) * gamma + beta;
610  clamp_(y, tmp_y);
611  };
612 
613  Clamp clamp_;
614  float epsilon_;
615 };
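// Illustrative usage sketch (not part of the original header): the clamp bounds and
// epsilon are fixed at construction, e.g.
//
//   BiasNormalizeInInferClamp op{0.f, 6.f, 1e-5f};
//   float y = 0.f;
//   op(y, /*x=*/10.f, /*bias=*/0.f, /*mean=*/0.f, /*variance=*/1.f, /*gamma=*/1.f,
//      /*beta=*/0.f); // the normalized value ~10 is clamped to the ceiling 6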
616 
617 template <typename Y, typename X>
618 struct UnaryTypeConvert;
619 
620 template <>
621 struct UnaryTypeConvert<float, ck::bhalf_t>
622 {
623  __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
624  {
625  y = ck::type_convert<float, ck::bhalf_t>(x);
626  }
627 };
628 
629 template <>
630 struct UnaryTypeConvert<ck::bhalf_t, float>
631 {
632  __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
633  {
634  y = ck::type_convert<ck::bhalf_t, float>(x);
635  }
636 };
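// Illustrative usage sketch (not part of the original header): UnaryTypeConvert wraps
// ck::type_convert for bhalf_t <-> float, e.g.
//
//   ck::bhalf_t x_bf16 = ck::type_convert<ck::bhalf_t>(1.5f);
//   float y_f32 = 0.f;
//   UnaryTypeConvert<float, ck::bhalf_t>{}(y_f32, x_bf16); // y_f32 == 1.5f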
637 
638 } // namespace element_wise
639 } // namespace tensor_operation
640 } // namespace ck