clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
32#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
33#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
34
35#if __cplusplus
36#if !defined(__HIPCC_RTC__)
37#include <hip/amd_detail/hip_cooperative_groups_helper.h>
38#endif
39
41
51 protected:
52 __hip_uint32_t _type;
53 __hip_uint32_t _num_threads;
54 __hip_uint64_t _mask;
56
62 __CG_QUALIFIER__ thread_group(internal::group_type type,
63 __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
64 __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
65 _type = type;
67 _mask = mask;
68 }
69
70 struct _tiled_info {
72 unsigned int num_threads;
73 unsigned int meta_group_rank;
74 unsigned int meta_group_size;
75 };
76
82
83 friend __CG_QUALIFIER__ thread_group this_thread();
84 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
85 unsigned int tile_size);
86 friend class thread_block;
87
88 public:
92 __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
94 __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
96 __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
98 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
100 __CG_QUALIFIER__ __hip_uint32_t block_rank() const;
102 __CG_QUALIFIER__ bool is_valid() const;
103
116 __CG_QUALIFIER__ void sync() const;
117};
143 friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
144
145 protected:
147 explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
148 : thread_group(internal::cg_multi_grid, size) {}
149
150 public:
153 __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
154
157 __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
159 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
160 return internal::multi_grid::thread_rank();
161 }
163 __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
165 __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
166};
167
182 return multi_grid_group(internal::multi_grid::num_threads());
183}
184// Doxygen end group CooperativeGConstruct
195class grid_group : public thread_group {
198 friend __CG_QUALIFIER__ grid_group this_grid();
199
200 protected:
202 explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
203 : thread_group(internal::cg_grid, size) {}
204
205 public:
207 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
209 __CG_QUALIFIER__ __hip_uint32_t block_rank() const { return internal::grid::block_rank(); }
211 __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
213 __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
214 __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
216 unsigned int signal;
217 };
219 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
221 t.signal = internal::grid::barrier_signal();
222 return t;
223 }
225 __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
226 internal::grid::barrier_wait(t.signal);
227 }
228};
229
240__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
241
254 friend __CG_QUALIFIER__ thread_block this_thread_block();
255 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
256 unsigned int tile_size);
257 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
258 unsigned int tile_size);
259
260 protected:
261 // Construct a workgroup thread group (through the API this_thread_block())
262 explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
263 : thread_group(internal::cg_workgroup, size) {}
264
265 __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
266 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
267 // Invalid tile size, assert
268 if (!tile_size || (tile_size > warpSize) || !pow2) {
269 __hip_assert(false && "invalid tile size");
270 }
271
272 auto block_size = num_threads();
273 auto rank = thread_rank();
274 auto partitions = (block_size + tile_size - 1) / tile_size;
275 auto tail = (partitions * tile_size) - block_size;
276 auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
277 thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
278
279 tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
280 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
281 tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
282 tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
283 return tiledGroup;
284 }
285
286 public:
288 __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
290 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
292 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
293 return internal::workgroup::thread_rank();
294 }
296 __CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() {
297 return internal::workgroup::block_rank();
298 }
300 __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
301 return internal::workgroup::num_threads();
302 }
304 __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
306 __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
308 __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
310 __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
311 struct arrival_token {};
313 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
314 internal::workgroup::barrier_arrive();
315 return arrival_token{};
316 }
318 __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
319};
320
331__CG_QUALIFIER__ thread_block this_thread_block() {
332 return thread_block(internal::workgroup::num_threads());
333}
334
342class tiled_group : public thread_group {
343 private:
344 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
345 unsigned int tile_size);
346 friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
347 unsigned int tile_size);
348
349 __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
350 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
351
352 if (!tile_size || (tile_size > warpSize) || !pow2) {
353 __hip_assert(false && "invalid tile size");
354 }
355
356 if (num_threads() <= tile_size) {
357 return *this;
358 }
359
360 tiled_group tiledGroup = tiled_group(tile_size);
361 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
362 return tiledGroup;
363 }
364
365 protected:
366 explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
367 : thread_group(internal::cg_tiled_group, tileSize) {
370 }
371
372 public:
374 __CG_QUALIFIER__ unsigned int num_threads() const {
376 }
377
379 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
380
382 __CG_QUALIFIER__ unsigned int thread_rank() const {
383 return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
384 }
386 __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
387};
388
389template <unsigned int size, class ParentCGTy> class thread_block_tile;
390
399 private:
400 friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
401 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
402 unsigned int tile_size);
403 friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
404 unsigned int tile_size);
405 friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
406 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
408
409 __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
410 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
411
412 if (!tile_size || !pow2) {
413 return coalesced_group(0);
414 }
415
416 // If a tiled group is passed to be partitioned further into a coalesced_group.
417 // prepare a mask for further partitioning it so that it stays coalesced.
419 unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
420 unsigned int masklength =
421 min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
422 lane_mask full_mask = (static_cast<int>(warpSize) == 32)
423 ? static_cast<lane_mask>((1u << 32) - 1)
424 : static_cast<lane_mask>(-1ull);
425 lane_mask member_mask = full_mask >> (warpSize - masklength);
426
427 member_mask <<= (__lane_id() & ~(tile_size - 1));
428 coalesced_group coalesced_tile = coalesced_group(member_mask);
429 coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
430 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
431 coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
432 return coalesced_tile;
433 }
434 // Here the parent coalesced_group is not partitioned.
435 else {
436 lane_mask member_mask = 0;
437 unsigned int tile_rank = 0;
438 int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
439
440 for (unsigned int i = 0; i < warpSize; i++) {
441 lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
442 // Make sure the lane is active
443 if (active) {
444 if (lanes_to_skip <= 0 && tile_rank < tile_size) {
445 // Prepare a member_mask that is appropriate for a tile
446 member_mask |= active;
447 tile_rank++;
448 }
449 lanes_to_skip--;
450 }
451 }
452 coalesced_group coalesced_tile = coalesced_group(member_mask);
453 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
454 coalesced_tile.coalesced_info.tiled_info.meta_group_size =
455 (num_threads() + tile_size - 1) / tile_size;
456 return coalesced_tile;
457 }
458 return coalesced_group(0);
459 }
460
461 protected:
462 // Constructor
463 explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
464 : thread_group(internal::cg_coalesced_group) {
465 coalesced_info.member_mask = member_mask; // Which threads are active
467 __popcll(coalesced_info.member_mask); // How many threads are active
468 coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
471 }
472
473 public:
475 __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }
476
478 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
479
481 __CG_QUALIFIER__ unsigned int thread_rank() const {
482 return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
483 }
484
486 __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }
487
490 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
492 }
493
495 __CG_QUALIFIER__ unsigned int meta_group_size() const {
497 }
498
511 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
512 srcRank = srcRank % static_cast<int>(num_threads());
513
514 int lane = (num_threads() == warpSize) ? srcRank
515 : (static_cast<int>(warpSize) == 64)
516 ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
517 : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
518
519 return __shfl(var, lane, warpSize);
520 }
521
536 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
537 // Note: The cuda implementation appears to use the remainder of lane_delta
538 // and WARP_SIZE as the shift value rather than lane_delta itself.
539 // This is not described in the documentation and is not done here.
540
541 if (num_threads() == warpSize) {
542 return __shfl_down(var, lane_delta, warpSize);
543 }
544
545 int lane;
546 if (static_cast<int>(warpSize) == 64) {
547 lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
548 } else {
549 lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
550 }
551
552 if (lane == -1) {
553 lane = __lane_id();
554 }
555
556 return __shfl(var, lane, warpSize);
557 }
558
573 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
574 // Note: The cuda implementation appears to use the remainder of lane_delta
575 // and WARP_SIZE as the shift value rather than lane_delta itself.
576 // This is not described in the documentation and is not done here.
577
578 if (num_threads() == warpSize) {
579 return __shfl_up(var, lane_delta, warpSize);
580 }
581
582 int lane;
583 if (static_cast<int>(warpSize) == 64) {
584 lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
585 } else if (static_cast<int>(warpSize) == 32) {
586 lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
587 }
588
589 if (lane == -1) {
590 lane = __lane_id();
591 }
592
593 return __shfl(var, lane, warpSize);
594 }
595#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
596
604 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
605 return internal::helper::adjust_mask(
607 __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
608 }
609
616 __CG_QUALIFIER__ int any(int pred) const {
617 return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
618 }
619
626 __CG_QUALIFIER__ int all(int pred) const {
627 return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
628 }
629
638 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
639 return internal::helper::adjust_mask(
641 __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
642 }
643
655 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
656 return internal::helper::adjust_mask(
658 __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
659 &pred));
660 }
661#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
662};
663
673 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
674}
675
676#ifndef DOXYGEN_SHOULD_SKIP_THIS
677
683__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
684 switch (this->_type) {
685 case internal::cg_multi_grid: {
686 return (static_cast<const multi_grid_group*>(this)->thread_rank());
687 }
688 case internal::cg_grid: {
689 return (static_cast<const grid_group*>(this)->thread_rank());
690 }
691 case internal::cg_workgroup: {
692 return (static_cast<const thread_block*>(this)->thread_rank());
693 }
694 case internal::cg_tiled_group: {
695 return (static_cast<const tiled_group*>(this)->thread_rank());
696 }
697 case internal::cg_coalesced_group: {
698 return (static_cast<const coalesced_group*>(this)->thread_rank());
699 }
700 default: {
701 __hip_assert(false && "invalid cooperative group type");
702 return -1;
703 }
704 }
705}
706
712__CG_QUALIFIER__ bool thread_group::is_valid() const {
713 switch (this->_type) {
714 case internal::cg_multi_grid: {
715 return (static_cast<const multi_grid_group*>(this)->is_valid());
716 }
717 case internal::cg_grid: {
718 return (static_cast<const grid_group*>(this)->is_valid());
719 }
720 case internal::cg_workgroup: {
721 return (static_cast<const thread_block*>(this)->is_valid());
722 }
723 case internal::cg_tiled_group: {
724 return (static_cast<const tiled_group*>(this)->is_valid());
725 }
726 case internal::cg_coalesced_group: {
727 return (static_cast<const coalesced_group*>(this)->is_valid());
728 }
729 default: {
730 __hip_assert(false && "invalid cooperative group type");
731 return false;
732 }
733 }
734}
735
741__CG_QUALIFIER__ void thread_group::sync() const {
742 switch (this->_type) {
743 case internal::cg_multi_grid: {
744 static_cast<const multi_grid_group*>(this)->sync();
745 break;
746 }
747 case internal::cg_grid: {
748 static_cast<const grid_group*>(this)->sync();
749 break;
750 }
751 case internal::cg_workgroup: {
752 static_cast<const thread_block*>(this)->sync();
753 break;
754 }
755 case internal::cg_tiled_group: {
756 static_cast<const tiled_group*>(this)->sync();
757 break;
758 }
759 case internal::cg_coalesced_group: {
760 static_cast<const coalesced_group*>(this)->sync();
761 break;
762 }
763 default: {
764 __hip_assert(false && "invalid cooperative group type");
765 }
766 }
767}
768
769#endif
770
788template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
789 return g.num_threads();
790}
791
803template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
804 return g.thread_rank();
805}
806
816template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
817
827template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
828
829// Doxygen end group CooperativeGAPI
837template <unsigned int tileSize> class tile_base {
838 protected:
839 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
840
841 public:
843 _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
844 return (internal::workgroup::thread_rank() & (numThreads - 1));
845 }
846
848 __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
849
852 __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
853};
854
860template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
861 static_assert(is_valid_tile_size<size>::value,
862 "Tile size is either not a power of 2 or greater than the wavefront size");
864
865 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
867
868#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
869 __CG_QUALIFIER__ unsigned long long build_mask() const {
870 unsigned long long mask = ~0ull >> (64 - numThreads);
871 // thread_rank() gives thread id from 0..thread launch size.
872 return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
873 }
874#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
875
876 public:
877 __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
878
879 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
880 return (__shfl(var, srcRank, numThreads));
881 }
882
883 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
884 return (__shfl_down(var, lane_delta, numThreads));
885 }
886
887 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
888 return (__shfl_up(var, lane_delta, numThreads));
889 }
890
891 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
892 return (__shfl_xor(var, laneMask, numThreads));
893 }
894
895#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
896 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
897 const auto mask = build_mask();
898 return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
899 }
900
901 __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }
902
903 __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }
904
905 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
906 const auto mask = build_mask();
907 return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
908 }
909
910 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
911 const auto mask = build_mask();
912 return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
913 }
914#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
915};
916
919template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
920 public:
923 __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
924 return ParentCGTy::thread_rank() / tileSize;
925 }
926
928 __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
929 return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
930 }
931};
932
939template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
940 : public thread_block_tile_base<tileSize>,
941 public tiled_group,
942 public parent_group_info<tileSize, ParentCGTy> {
943 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
945
946 protected:
947 __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
950 }
951
960
961 public:
963 using tbtBase::size;
966};
967
968// Partial template specialization
969template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
970 : public thread_block_tile_base<tileSize>, public tiled_group {
971 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
972
974
975 protected:
984
985 public:
987 using tbtBase::size;
988 using tbtBase::sync;
990
993 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
995 }
996
998 __CG_QUALIFIER__ unsigned int meta_group_size() const {
1000 }
1001 // Doxygen end group CooperativeG
1005};
1006
1007__CG_QUALIFIER__ thread_group this_thread() {
1008 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
1009 return g;
1010}
1011
1019__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
1020 if (parent.cg_type() == internal::cg_tiled_group) {
1021 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
1022 return cg->new_tiled_group(tile_size);
1023 } else if (parent.cg_type() == internal::cg_coalesced_group) {
1024 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
1025 return cg->new_tiled_group(tile_size);
1026 } else {
1027 const thread_block* tb = static_cast<const thread_block*>(&parent);
1028 return tb->new_tiled_group(tile_size);
1029 }
1030}
1031
1032// Thread block type overload
1033__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
1034 return (parent.new_tiled_group(tile_size));
1035}
1036
1037__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
1038 return (parent.new_tiled_group(tile_size));
1039}
1040
1041// If a coalesced group is passed to be partitioned, it should remain coalesced
1042__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
1043 unsigned int tile_size) {
1044 return (parent.new_tiled_group(tile_size));
1045}
1046
1047namespace impl {
1048template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
1049
1050template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
1051 : public thread_block_tile_type<size, ParentCGTy> {
1052 protected:
1053 template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
1056
1057 __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
1058 : thread_block_tile_type<size, ParentCGTy>() {}
1059};
1060} // namespace impl
1061
1070template <unsigned int size, class ParentCGTy> class thread_block_tile
1071 : public impl::thread_block_tile_internal<size, ParentCGTy> {
1072 protected:
1073 __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
1074 : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
1075
1076 public:
1077 __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
1078 return thread_block_tile<size, void>(*this);
1079 }
1080
1081#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
1082
1084 __CG_QUALIFIER__ unsigned int thread_rank() const;
1085
1087 __CG_QUALIFIER__ void sync();
1088
1091 __CG_QUALIFIER__ unsigned int meta_group_rank() const;
1092
1094 __CG_QUALIFIER__ unsigned int meta_group_size() const;
1095
1108 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;
1109
1124 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;
1125
1140 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;
1141
1154 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;
1155
1163 __CG_QUALIFIER__ unsigned long long ballot(int pred) const;
1164
1171 __CG_QUALIFIER__ int any(int pred) const;
1172
1179 __CG_QUALIFIER__ int all(int pred) const;
1180
1189 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;
1190
1202 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;
1203
1204#endif
1205};
1206
1207template <unsigned int size> class thread_block_tile<size, void>
1208 : public impl::thread_block_tile_internal<size, void> {
1209 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1210
1211 protected:
1212 public:
1213 template <class ParentCGTy>
1215 : impl::thread_block_tile_internal<size, void>(g) {}
1216};
1217
1218template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1219
1220namespace impl {
1221template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
1222
1223template <unsigned int size> struct tiled_partition_internal<size, thread_block>
1224 : public thread_block_tile<size, thread_block> {
1227};
1228
1229// ParentCGTy = thread_block_tile<ParentSize, GrandParentCGTy> specialization
1230template <unsigned int size, unsigned int ParentSize, class GrandParentCGTy>
1231struct tiled_partition_internal<size, thread_block_tile<ParentSize, GrandParentCGTy> >
1232 : public thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> > {
1233 static_assert(size <= ParentSize, "Sub tile size must be <= parent tile size in tiled_partition");
1234
1236 : thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> >(g) {}
1237};
1238
1239} // namespace impl
1240
1253template <unsigned int size, class ParentCGTy>
1254__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1255 static_assert(is_valid_tile_size<size>::value,
1256 "Tiled partition with size > wavefront size. Currently not supported ");
1258}
1259
1260#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
1261
1270__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
1271 auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);
1272
1273 if (pred) {
1274 return coalesced_group(mask);
1275 } else {
1276 return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
1277 }
1278}
1279
1291template <unsigned int size, class parent>
1293 bool pred) {
1294 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1295
1296 if (pred) {
1297 return coalesced_group(mask);
1298 } else {
1299 return coalesced_group(tgrp.build_mask() ^ mask);
1300 }
1301}
1302#endif
1303} // namespace cooperative_groups
1304
1305#endif // __cplusplus
1306#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:398
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:195
Definition amd_hip_cooperative_groups.h:1051
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1057
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1053
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:140
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:919
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1214
Definition amd_hip_cooperative_groups.h:860
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:942
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1071
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1073
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:251
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:50
Definition amd_hip_cooperative_groups.h:837
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:342
const struct texture< T, dim, readMode > const void size_t size
Definition hip_runtime_api.h:10067
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:816
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:827
__hip_uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:788
__hip_uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:803
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:331
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1270
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1019
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:181
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:672
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:240
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:386
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:891
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:839
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:181
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:486
static void sync()
Definition amd_hip_cooperative_groups.h:877
void barrier_wait(arrival_token &&t) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:225
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:72
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:207
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:219
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:213
__hip_uint32_t size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:94
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:478
__hip_uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:153
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:910
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:638
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:331
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:928
unsigned int signal
Definition amd_hip_cooperative_groups.h:216
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:490
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:214
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:993
__hip_uint32_t _num_threads
Type of the thread_group.
Definition amd_hip_cooperative_groups.h:53
__hip_uint32_t block_rank() const
Rank of the block in calling thread within [0, num_threads() ).
static unsigned int num_threads()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:848
thread_group(internal::group_type type, __hip_uint32_t num_threads=static_cast< __hip_uint64_t >(0), __hip_uint64_t mask=static_cast< __hip_uint64_t >(0))
Definition amd_hip_cooperative_groups.h:62
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:495
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:382
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:481
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:998
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:626
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:96
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:879
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:265
static __hip_uint32_t size()
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:304
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:374
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:475
void barrier_wait(arrival_token &&) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:318
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:366
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:73
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:655
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:947
__hip_uint32_t block_rank() const
Rank of the block in calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:209
grid_group(__hip_uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:202
__hip_uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:157
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:843
bool is_tiled
Definition amd_hip_cooperative_groups.h:71
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:604
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:165
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:887
static __hip_uint32_t block_rank()
Rank of the block in calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:296
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:288
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:79
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:308
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:159
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Definition amd_hip_cooperative_groups.h:852
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1019
int any(int pred) const
Definition amd_hip_cooperative_groups.h:901
int all(int pred) const
Definition amd_hip_cooperative_groups.h:903
__hip_uint64_t _mask
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:54
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:1007
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:211
__hip_uint32_t _type
Definition amd_hip_cooperative_groups.h:52
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:905
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:379
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:78
thread_block(__hip_uint32_t size)
Definition amd_hip_cooperative_groups.h:262
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:74
multi_grid_group(__hip_uint32_t size)
Construct multi-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:147
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:616
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:1007
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:923
__hip_uint32_t num_threads() const
Definition amd_hip_cooperative_groups.h:92
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:80
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:672
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1270
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:290
friend class thread_block
Definition amd_hip_cooperative_groups.h:86
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:463
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:536
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:306
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:310
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:976
static __hip_uint32_t num_threads()
Definition amd_hip_cooperative_groups.h:300
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:896
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:511
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:313
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:163
static __hip_uint32_t thread_rank()
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:292
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:240
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:883
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:573
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:952
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:40
Definition amd_hip_cooperative_groups.h:215
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1225
tiled_partition_internal(const thread_block_tile< ParentSize, GrandParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1235
Definition amd_hip_cooperative_groups.h:1221
Definition amd_hip_cooperative_groups.h:311
Definition amd_hip_cooperative_groups.h:77
Definition amd_hip_cooperative_groups.h:70
Definition hip_runtime_api.h:1299