clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File

clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File#

HIP Runtime API Reference: clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h Source File
amd_hip_cooperative_groups.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
16#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
17#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
18
19#if __cplusplus
20#if !defined(__HIPCC_RTC__)
21#include <hip/amd_detail/hip_cooperative_groups_helper.h>
22#endif
23
25
35 protected:
36 __hip_uint32_t _type;
37 __hip_uint32_t _num_threads;
38 __hip_uint64_t _mask;
40
46 __CG_QUALIFIER__ thread_group(internal::group_type type,
47 __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
48 __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
49 _type = type;
51 _mask = mask;
52 }
53
54 struct _tiled_info {
56 unsigned int num_threads;
57 unsigned int meta_group_rank;
58 unsigned int meta_group_size;
59 };
60
66
67 friend __CG_QUALIFIER__ thread_group this_thread();
68 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
69 unsigned int tile_size);
70 friend class thread_block;
71
72 public:
76 __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
78 __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
80 __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
82 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
84 __CG_QUALIFIER__ __hip_uint32_t block_rank() const;
86 __CG_QUALIFIER__ bool is_valid() const;
87
100 __CG_QUALIFIER__ void sync() const;
101};
127 friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
128
129 protected:
131 explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
132 : thread_group(internal::cg_multi_grid, size) {}
133
134 public:
137 __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
138
141 __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
143 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
144 return internal::multi_grid::thread_rank();
145 }
147 __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
149 __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
150};
151
166 return multi_grid_group(internal::multi_grid::num_threads());
167}
168// Doxygen end group CooperativeGConstruct
179class grid_group : public thread_group {
182 friend __CG_QUALIFIER__ grid_group this_grid();
183
184 protected:
186 explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
187 : thread_group(internal::cg_grid, size) {}
188
189 public:
191 __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
193 __CG_QUALIFIER__ __hip_uint32_t block_rank() const { return internal::grid::block_rank(); }
195 __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
197 __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
198 __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
200 unsigned int signal;
201 };
203 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
205 t.signal = internal::grid::barrier_signal();
206 return t;
207 }
209 __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
210 internal::grid::barrier_wait(t.signal);
211 }
212};
213
224__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
225
238 friend __CG_QUALIFIER__ thread_block this_thread_block();
239 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
240 unsigned int tile_size);
241 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
242 unsigned int tile_size);
243
244 protected:
245 // Construct a workgroup thread group (through the API this_thread_block())
246 explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
247 : thread_group(internal::cg_workgroup, size) {}
248
249 __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
250 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
251 // Invalid tile size, assert
252 if (!tile_size || (tile_size > warpSize) || !pow2) {
253 __hip_assert(false && "invalid tile size");
254 }
255
256 auto block_size = num_threads();
257 auto rank = thread_rank();
258 auto partitions = (block_size + tile_size - 1) / tile_size;
259 auto tail = (partitions * tile_size) - block_size;
260 auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
261 thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
262
263 tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
264 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
265 tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
266 tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
267 return tiledGroup;
268 }
269
270 public:
272 __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
274 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
276 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
277 return internal::workgroup::thread_rank();
278 }
280 __CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() {
281 return internal::workgroup::block_rank();
282 }
284 __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
285 return internal::workgroup::num_threads();
286 }
288 __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
290 __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
292 __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
294 __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
295 struct arrival_token {};
297 __CG_QUALIFIER__ arrival_token barrier_arrive() const {
298 internal::workgroup::barrier_arrive();
299 return arrival_token{};
300 }
302 __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
303};
304
315__CG_QUALIFIER__ thread_block this_thread_block() {
316 return thread_block(internal::workgroup::num_threads());
317}
318
326class tiled_group : public thread_group {
327 private:
328 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
329 unsigned int tile_size);
330 friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
331 unsigned int tile_size);
332
333 __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
334 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
335
336 if (!tile_size || (tile_size > warpSize) || !pow2) {
337 __hip_assert(false && "invalid tile size");
338 }
339
340 if (num_threads() <= tile_size) {
341 return *this;
342 }
343
344 tiled_group tiledGroup = tiled_group(tile_size);
345 tiledGroup.coalesced_info.tiled_info.is_tiled = true;
346 return tiledGroup;
347 }
348
349 protected:
350 explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
351 : thread_group(internal::cg_tiled_group, tileSize) {
354 }
355
356 public:
358 __CG_QUALIFIER__ unsigned int num_threads() const {
360 }
361
363 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
364
366 __CG_QUALIFIER__ unsigned int thread_rank() const {
367 return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
368 }
370 __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
371};
372
373template <unsigned int size, class ParentCGTy> class thread_block_tile;
374
383 private:
384 friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
385 friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
386 unsigned int tile_size);
387 friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
388 unsigned int tile_size);
389 friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
390 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
392
393 __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
394 const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
395
396 if (!tile_size || !pow2) {
397 return coalesced_group(0);
398 }
399
400 // If a tiled group is passed to be partitioned further into a coalesced_group.
401 // prepare a mask for further partitioning it so that it stays coalesced.
403 unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
404 unsigned int masklength =
405 min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
406 lane_mask full_mask = (static_cast<int>(warpSize) == 32)
407 ? static_cast<lane_mask>((1u << 32) - 1)
408 : static_cast<lane_mask>(-1ull);
409 lane_mask member_mask = full_mask >> (warpSize - masklength);
410
411 member_mask <<= (__lane_id() & ~(tile_size - 1));
412 coalesced_group coalesced_tile = coalesced_group(member_mask);
413 coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
414 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
415 coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
416 return coalesced_tile;
417 }
418 // Here the parent coalesced_group is not partitioned.
419 else {
420 lane_mask member_mask = 0;
421 unsigned int tile_rank = 0;
422 int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
423
424 for (unsigned int i = 0; i < warpSize; i++) {
425 lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
426 // Make sure the lane is active
427 if (active) {
428 if (lanes_to_skip <= 0 && tile_rank < tile_size) {
429 // Prepare a member_mask that is appropriate for a tile
430 member_mask |= active;
431 tile_rank++;
432 }
433 lanes_to_skip--;
434 }
435 }
436 coalesced_group coalesced_tile = coalesced_group(member_mask);
437 coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
438 coalesced_tile.coalesced_info.tiled_info.meta_group_size =
439 (num_threads() + tile_size - 1) / tile_size;
440 return coalesced_tile;
441 }
442 return coalesced_group(0);
443 }
444
445 protected:
446 // Constructor
447 explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
448 : thread_group(internal::cg_coalesced_group) {
449 coalesced_info.member_mask = member_mask; // Which threads are active
451 __popcll(coalesced_info.member_mask); // How many threads are active
452 coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
455 }
456
457 public:
459 __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }
460
462 __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
463
465 __CG_QUALIFIER__ unsigned int thread_rank() const {
466 return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
467 }
468
470 __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }
471
474 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
476 }
477
479 __CG_QUALIFIER__ unsigned int meta_group_size() const {
481 }
482
495 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
496 srcRank = srcRank % static_cast<int>(num_threads());
497
498 int lane = (num_threads() == warpSize) ? srcRank
499 : (static_cast<int>(warpSize) == 64)
500 ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
501 : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
502
503 return __shfl(var, lane, warpSize);
504 }
505
520 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
521 // Note: The cuda implementation appears to use the remainder of lane_delta
522 // and WARP_SIZE as the shift value rather than lane_delta itself.
523 // This is not described in the documentation and is not done here.
524
525 if (num_threads() == warpSize) {
526 return __shfl_down(var, lane_delta, warpSize);
527 }
528
529 int lane;
530 if (static_cast<int>(warpSize) == 64) {
531 lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
532 } else {
533 lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
534 }
535
536 if (lane == -1) {
537 lane = __lane_id();
538 }
539
540 return __shfl(var, lane, warpSize);
541 }
542
557 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
558 // Note: The cuda implementation appears to use the remainder of lane_delta
559 // and WARP_SIZE as the shift value rather than lane_delta itself.
560 // This is not described in the documentation and is not done here.
561
562 if (num_threads() == warpSize) {
563 return __shfl_up(var, lane_delta, warpSize);
564 }
565
566 int lane;
567 if (static_cast<int>(warpSize) == 64) {
568 lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
569 } else if (static_cast<int>(warpSize) == 32) {
570 lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
571 }
572
573 if (lane == -1) {
574 lane = __lane_id();
575 }
576
577 return __shfl(var, lane, warpSize);
578 }
579#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
580
588 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
589 return internal::helper::adjust_mask(
591 __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
592 }
593
600 __CG_QUALIFIER__ int any(int pred) const {
601 return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
602 }
603
610 __CG_QUALIFIER__ int all(int pred) const {
611 return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
612 }
613
622 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
623 return internal::helper::adjust_mask(
625 __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
626 }
627
639 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
640 return internal::helper::adjust_mask(
642 __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
643 &pred));
644 }
645#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
646};
647
657 return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
658}
659
660#ifndef DOXYGEN_SHOULD_SKIP_THIS
661
667__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
668 switch (this->_type) {
669 case internal::cg_multi_grid: {
670 return (static_cast<const multi_grid_group*>(this)->thread_rank());
671 }
672 case internal::cg_grid: {
673 return (static_cast<const grid_group*>(this)->thread_rank());
674 }
675 case internal::cg_workgroup: {
676 return (static_cast<const thread_block*>(this)->thread_rank());
677 }
678 case internal::cg_tiled_group: {
679 return (static_cast<const tiled_group*>(this)->thread_rank());
680 }
681 case internal::cg_coalesced_group: {
682 return (static_cast<const coalesced_group*>(this)->thread_rank());
683 }
684 default: {
685 __hip_assert(false && "invalid cooperative group type");
686 return -1;
687 }
688 }
689}
690
696__CG_QUALIFIER__ bool thread_group::is_valid() const {
697 switch (this->_type) {
698 case internal::cg_multi_grid: {
699 return (static_cast<const multi_grid_group*>(this)->is_valid());
700 }
701 case internal::cg_grid: {
702 return (static_cast<const grid_group*>(this)->is_valid());
703 }
704 case internal::cg_workgroup: {
705 return (static_cast<const thread_block*>(this)->is_valid());
706 }
707 case internal::cg_tiled_group: {
708 return (static_cast<const tiled_group*>(this)->is_valid());
709 }
710 case internal::cg_coalesced_group: {
711 return (static_cast<const coalesced_group*>(this)->is_valid());
712 }
713 default: {
714 __hip_assert(false && "invalid cooperative group type");
715 return false;
716 }
717 }
718}
719
725__CG_QUALIFIER__ void thread_group::sync() const {
726 switch (this->_type) {
727 case internal::cg_multi_grid: {
728 static_cast<const multi_grid_group*>(this)->sync();
729 break;
730 }
731 case internal::cg_grid: {
732 static_cast<const grid_group*>(this)->sync();
733 break;
734 }
735 case internal::cg_workgroup: {
736 static_cast<const thread_block*>(this)->sync();
737 break;
738 }
739 case internal::cg_tiled_group: {
740 static_cast<const tiled_group*>(this)->sync();
741 break;
742 }
743 case internal::cg_coalesced_group: {
744 static_cast<const coalesced_group*>(this)->sync();
745 break;
746 }
747 default: {
748 __hip_assert(false && "invalid cooperative group type");
749 }
750 }
751#if __has_builtin(__builtin_amdgcn_s_wait_asynccnt)
752 __builtin_amdgcn_s_wait_asynccnt(0);
753#endif
754}
755
756#endif
757
775template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
776 return g.num_threads();
777}
778
790template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
791 return g.thread_rank();
792}
793
803template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
804
814template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
815
816// Doxygen end group CooperativeGAPI
824template <unsigned int tileSize> class tile_base {
825 protected:
826 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
827
828 public:
830 _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
831 return (internal::workgroup::thread_rank() & (numThreads - 1));
832 }
833
835 __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
836
839 __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
840};
841
847template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
848 static_assert(is_valid_tile_size<size>::value,
849 "Tile size is either not a power of 2 or greater than the wavefront size");
851
852 template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
854
855#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
856 __CG_QUALIFIER__ unsigned long long build_mask() const {
857 unsigned long long mask = ~0ull >> (64 - numThreads);
858 // thread_rank() gives thread id from 0..thread launch size.
859 return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
860 }
861#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
862
863 public:
864 __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
865
866 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
867 return (__shfl(var, srcRank, numThreads));
868 }
869
870 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
871 return (__shfl_down(var, lane_delta, numThreads));
872 }
873
874 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
875 return (__shfl_up(var, lane_delta, numThreads));
876 }
877
878 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
879 return (__shfl_xor(var, laneMask, numThreads));
880 }
881
882#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
883 __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
884 const auto mask = build_mask();
885 return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
886 }
887
888 __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }
889
890 __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }
891
892 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
893 const auto mask = build_mask();
894 return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
895 }
896
897 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
898 const auto mask = build_mask();
899 return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
900 }
901#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
902};
903
906template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
907 public:
910 __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
911 return ParentCGTy::thread_rank() / tileSize;
912 }
913
915 __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
916 return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
917 }
918};
919
926template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
927 : public thread_block_tile_base<tileSize>,
928 public tiled_group,
929 public parent_group_info<tileSize, ParentCGTy> {
930 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
932
933 protected:
934 __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
937 }
938
947
948 public:
950 using tbtBase::size;
953};
954
955// Partial template specialization
956template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
957 : public thread_block_tile_base<tileSize>, public tiled_group {
958 _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
959
961
962 protected:
971
972 public:
974 using tbtBase::size;
975 using tbtBase::sync;
977
980 __CG_QUALIFIER__ unsigned int meta_group_rank() const {
982 }
983
985 __CG_QUALIFIER__ unsigned int meta_group_size() const {
987 }
988 // Doxygen end group CooperativeG
992};
993
994__CG_QUALIFIER__ thread_group this_thread() {
995 thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
996 return g;
997}
998
1006__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
1007 if (parent.cg_type() == internal::cg_tiled_group) {
1008 const tiled_group* cg = static_cast<const tiled_group*>(&parent);
1009 return cg->new_tiled_group(tile_size);
1010 } else if (parent.cg_type() == internal::cg_coalesced_group) {
1011 const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
1012 return cg->new_tiled_group(tile_size);
1013 } else {
1014 const thread_block* tb = static_cast<const thread_block*>(&parent);
1015 return tb->new_tiled_group(tile_size);
1016 }
1017}
1018
1019// Thread block type overload
1020__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
1021 return (parent.new_tiled_group(tile_size));
1022}
1023
1024__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
1025 return (parent.new_tiled_group(tile_size));
1026}
1027
1028// If a coalesced group is passed to be partitioned, it should remain coalesced
1029__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
1030 unsigned int tile_size) {
1031 return (parent.new_tiled_group(tile_size));
1032}
1033
1034namespace impl {
1035template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
1036
1037template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
1038 : public thread_block_tile_type<size, ParentCGTy> {
1039 protected:
1040 template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
1043
1044 __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
1045 : thread_block_tile_type<size, ParentCGTy>() {}
1046};
1047} // namespace impl
1048
1057template <unsigned int size, class ParentCGTy> class thread_block_tile
1058 : public impl::thread_block_tile_internal<size, ParentCGTy> {
1059 protected:
1060 __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
1061 : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
1062
1063 public:
1064 __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
1065 return thread_block_tile<size, void>(*this);
1066 }
1067
1068#ifdef DOXYGEN_SHOULD_INCLUDE_THIS
1069
1071 __CG_QUALIFIER__ unsigned int thread_rank() const;
1072
1074 __CG_QUALIFIER__ void sync();
1075
1078 __CG_QUALIFIER__ unsigned int meta_group_rank() const;
1079
1081 __CG_QUALIFIER__ unsigned int meta_group_size() const;
1082
1095 template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const;
1096
1111 template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const;
1112
1127 template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const;
1128
1141 template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;
1142
1150 __CG_QUALIFIER__ unsigned long long ballot(int pred) const;
1151
1158 __CG_QUALIFIER__ int any(int pred) const;
1159
1166 __CG_QUALIFIER__ int all(int pred) const;
1167
1176 template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const;
1177
1189 template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const;
1190
1191#endif
1192};
1193
1194template <unsigned int size> class thread_block_tile<size, void>
1195 : public impl::thread_block_tile_internal<size, void> {
1196 template <unsigned int, class ParentCGTy> friend class thread_block_tile;
1197
1198 protected:
1199 public:
1200 template <class ParentCGTy>
1202 : impl::thread_block_tile_internal<size, void>(g) {}
1203};
1204
1205template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
1206
1207namespace impl {
1208template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
1209
1210template <unsigned int size> struct tiled_partition_internal<size, thread_block>
1211 : public thread_block_tile<size, thread_block> {
1214};
1215
1216// ParentCGTy = thread_block_tile<ParentSize, GrandParentCGTy> specialization
1217template <unsigned int size, unsigned int ParentSize, class GrandParentCGTy>
1218struct tiled_partition_internal<size, thread_block_tile<ParentSize, GrandParentCGTy> >
1219 : public thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> > {
1220 static_assert(size < ParentSize, "Sub tile size must be < parent tile size in tiled_partition");
1221
1223 : thread_block_tile<size, thread_block_tile<ParentSize, GrandParentCGTy> >(g) {}
1224};
1225
1226} // namespace impl
1227
1240template <unsigned int size, class ParentCGTy>
1241__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
1242 static_assert(is_valid_tile_size<size>::value,
1243 "Tiled partition with size > wavefront size. Currently not supported ");
1245}
1246
1247#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
1248
1257__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
1258 auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);
1259
1260 if (pred) {
1261 return coalesced_group(mask);
1262 } else {
1263 return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
1264 }
1265}
1266
1278template <unsigned int size, class parent>
1280 bool pred) {
1281 auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);
1282
1283 if (pred) {
1284 return coalesced_group(mask);
1285 } else {
1286 return coalesced_group(tgrp.build_mask() ^ mask);
1287 }
1288}
1289
1290template <class T>
1291struct plus {
1292 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1293 {
1294 return lhs + rhs;
1295 }
1296};
1297
1298template <class T>
1299struct less {
1300 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1301 {
1302 return lhs < rhs? lhs : rhs;
1303 }
1304};
1305
1306template <class T>
1307struct greater {
1308 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1309 {
1310 return lhs < rhs? rhs : lhs;
1311 }
1312};
1313
1314template <class T>
1315struct bit_and {
1316 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1317 {
1318 return lhs & rhs;
1319 }
1320};
1321
1322template <class T>
1323struct bit_xor {
1324 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1325 {
1326 return lhs ^ rhs;
1327 }
1328};
1329
1330template <class T>
1331struct bit_or {
1332 __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
1333 {
1334 return lhs | rhs;
1335 }
1336};
1337#endif
1338
1346 friend __device__ cluster_group this_cluster();
1347
1348 // Default constructor, hidden
1349 __CG_QUALIFIER__ cluster_group() {}
1350
1351 public:
1352 using arrival_token = struct {};
1353
1354 // Sync the cluster, equivalent to c.barrier_wait(c.barrier_arrive());
1355 __CG_STATIC_QUALIFIER__ void sync() { internal::cluster::sync(); }
1356
1357 // Arrive on a cluster barrier, returns token that needs to be passed to barrier_wait
1358 __CG_STATIC_QUALIFIER__ arrival_token barrier_arrive() {
1359 // signal user cluster barrier
1360 internal::cluster::barrier_arrive();
1361 return arrival_token();
1362 }
1363
1364 // Wait on arrival_token
1365 __CG_STATIC_QUALIFIER__ void barrier_wait(arrival_token&&) { internal::cluster::barrier_wait(); }
1366
1367 // TODO: implement this when compiler work is done
1368 // block rank to which shared memory address belongs to
1369 // __CG_STATIC_QUALIFIER__ unsigned int query_shared_rank(const void* addr) {}
1370 // Obtain the address of shared memory variable of another block in the cluster
1371 // template <typename T> __CG_STATIC_QUALIFIER__ T* map_shared_rank(T* addr, int rank) {}
1372
1373 // index of the calling block within cluster
1374 __CG_STATIC_QUALIFIER__ dim3 block_index() { return internal::cluster::block_index(); }
1375
1376 // Rank of calling block within [0, num_blocks)
1377 __CG_STATIC_QUALIFIER__ unsigned int block_rank() { return internal::cluster::block_rank(); }
1378
1379 // index of the calling thread within cluster
1380 __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::cluster::thread_index(); }
1381
1382 // Rank of calling thread within [0, num_threads)
1383 __CG_STATIC_QUALIFIER__ unsigned int thread_rank() { return internal::cluster::thread_rank(); }
1384
1385 // Dimensions of launched cluster in unit of blocks
1386 __CG_STATIC_QUALIFIER__ dim3 dim_blocks() { return internal::cluster::dim_blocks(); }
1387
1388 // total number of blocks in the group
1389 __CG_STATIC_QUALIFIER__ unsigned int num_blocks() { return internal::cluster::num_blocks(); }
1390
1391 // Dimensions of launched cluster in unit of threads
1392 __CG_STATIC_QUALIFIER__ dim3 dim_threads() { return internal::cluster::dim_threads(); }
1393
1394 // Total number of threads in the group
1395 __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return internal::cluster::num_threads(); }
1396
1397 // Get address of shared memory variable in another cluster
1398 template <typename T> __CG_STATIC_QUALIFIER__ T* map_shared_rank(T* in, int rank) {
1399 return internal::cluster::map_shared_rank<T>(in, rank);
1400 }
1401
1402 // Return block rank of shared memory address
1403 __CG_STATIC_QUALIFIER__ unsigned int query_shared_rank(const void* in) {
1404 return internal::cluster::query_shared_rank(in);
1405 }
1406
1407 // Alias of num_threads
1408 __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
1409};
1410
1417 cluster_group cg;
1418 return cg;
1419}
1420} // namespace cooperative_groups
1421
1422#endif // __cplusplus
1423#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
Cluster group.
Definition amd_hip_cooperative_groups.h:1345
static unsigned int query_shared_rank(const void *in)
Definition amd_hip_cooperative_groups.h:1403
static unsigned int thread_rank()
Definition amd_hip_cooperative_groups.h:1383
static dim3 thread_index()
Definition amd_hip_cooperative_groups.h:1380
static T * map_shared_rank(T *in, int rank)
Definition amd_hip_cooperative_groups.h:1398
static void sync()
Definition amd_hip_cooperative_groups.h:1355
{} arrival_token
Definition amd_hip_cooperative_groups.h:1352
static dim3 block_index()
Definition amd_hip_cooperative_groups.h:1374
static void barrier_wait(arrival_token &&)
Definition amd_hip_cooperative_groups.h:1365
static unsigned int block_rank()
Definition amd_hip_cooperative_groups.h:1377
static unsigned int num_threads()
Definition amd_hip_cooperative_groups.h:1395
static unsigned int size()
Definition amd_hip_cooperative_groups.h:1408
static arrival_token barrier_arrive()
Definition amd_hip_cooperative_groups.h:1358
static unsigned int num_blocks()
Definition amd_hip_cooperative_groups.h:1389
friend cluster_group this_cluster()
get cluster group
Definition amd_hip_cooperative_groups.h:1416
static dim3 dim_threads()
Definition amd_hip_cooperative_groups.h:1392
static dim3 dim_blocks()
Definition amd_hip_cooperative_groups.h:1386
The coalesced_group cooperative group type.
Definition amd_hip_cooperative_groups.h:382
The grid cooperative group type.
Definition amd_hip_cooperative_groups.h:179
Definition amd_hip_cooperative_groups.h:1038
thread_block_tile_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1044
thread_block_tile_internal(const thread_block_tile_internal< tbtSize, tbtParentT > &g)
Definition amd_hip_cooperative_groups.h:1040
The multi-grid cooperative group type.
Definition amd_hip_cooperative_groups.h:124
User exposed API that captures the state of the parent group pre-partition.
Definition amd_hip_cooperative_groups.h:906
thread_block_tile(const thread_block_tile< size, ParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1201
Definition amd_hip_cooperative_groups.h:847
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:929
Group type - thread_block_tile.
Definition amd_hip_cooperative_groups.h:1058
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
int any(int pred) const
Any function on group level.
unsigned long long ballot(int pred) const
Ballot function on group level.
thread_block_tile(const ParentCGTy &g)
Definition amd_hip_cooperative_groups.h:1060
unsigned int meta_group_rank() const
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
T shfl_xor(T var, unsigned int laneMask) const
Shuffle xor operation on group level.
int all(int pred) const
All function on group level.
unsigned long long match_any(T value) const
Match any function on group level.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
void sync()
Synchronizes the threads in the group.
T shfl(T var, int srcRank) const
Shuffle operation on group level.
The workgroup (thread-block in CUDA terminology) cooperative group type.
Definition amd_hip_cooperative_groups.h:235
The base type of all cooperative group types.
Definition amd_hip_cooperative_groups.h:34
Definition amd_hip_cooperative_groups.h:824
The tiled_group cooperative group type.
Definition amd_hip_cooperative_groups.h:326
const struct texture< T, dim, readMode > const void size_t size
Definition hip_runtime_api.h:10284
bool is_valid(CGTy const &g)
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:803
void sync(CGTy const &g)
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:814
__hip_uint32_t group_size(CGTy const &g)
Returns the size of the group.
Definition amd_hip_cooperative_groups.h:775
__hip_uint32_t thread_rank(CGTy const &g)
Returns the rank of thread of the group.
Definition amd_hip_cooperative_groups.h:790
thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:315
coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1257
thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1006
multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:165
coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:656
grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:224
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:370
T shfl_xor(T var, unsigned int laneMask) const
Definition amd_hip_cooperative_groups.h:878
static constexpr unsigned int numThreads
Definition amd_hip_cooperative_groups.h:826
friend multi_grid_group this_multi_grid()
User-exposed API interface to construct grid cooperative group type object - multi_grid_group.
Definition amd_hip_cooperative_groups.h:165
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:470
static void sync()
Definition amd_hip_cooperative_groups.h:864
void barrier_wait(arrival_token &&t) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:209
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:56
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:191
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:203
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:197
__hip_uint32_t size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:78
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:462
__hip_uint32_t num_grids()
Definition amd_hip_cooperative_groups.h:137
unsigned long long match_all(T value, int &pred) const
Definition amd_hip_cooperative_groups.h:897
unsigned long long match_any(T value) const
Match any function on group level.
Definition amd_hip_cooperative_groups.h:622
friend thread_block this_thread_block()
User-exposed API interface to construct workgroup cooperative group type object - thread_block.
Definition amd_hip_cooperative_groups.h:315
static unsigned int meta_group_size()
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:915
unsigned int signal
Definition amd_hip_cooperative_groups.h:200
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:474
dim3 group_dim() const
Definition amd_hip_cooperative_groups.h:198
unsigned int meta_group_rank() const
Definition amd_hip_cooperative_groups.h:980
__hip_uint32_t _num_threads
Type of the thread_group.
Definition amd_hip_cooperative_groups.h:37
__hip_uint32_t block_rank() const
Rank of the calling thread's block within [0, num_blocks() ).
static unsigned int num_threads()
Number of threads within this tile.
Definition amd_hip_cooperative_groups.h:835
thread_group(internal::group_type type, __hip_uint32_t num_threads=static_cast< __hip_uint64_t >(0), __hip_uint64_t mask=static_cast< __hip_uint64_t >(0))
Definition amd_hip_cooperative_groups.h:46
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:479
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:366
unsigned int thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:465
unsigned int meta_group_size() const
Returns the number of groups created when the parent group was partitioned.
Definition amd_hip_cooperative_groups.h:985
int all(int pred) const
All function on group level.
Definition amd_hip_cooperative_groups.h:610
unsigned int cg_type() const
Returns the type of the group.
Definition amd_hip_cooperative_groups.h:80
T shfl(T var, int srcRank) const
Definition amd_hip_cooperative_groups.h:866
thread_group new_tiled_group(unsigned int tile_size) const
Definition amd_hip_cooperative_groups.h:249
static __hip_uint32_t size()
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:288
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:358
unsigned int num_threads() const
Definition amd_hip_cooperative_groups.h:459
void barrier_wait(arrival_token &&) const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:302
tiled_group(unsigned int tileSize)
Definition amd_hip_cooperative_groups.h:350
unsigned int meta_group_rank
Definition amd_hip_cooperative_groups.h:57
void sync() const
Synchronizes the threads in the group.
unsigned long long match_all(T value, int &pred) const
Match all function on group level.
Definition amd_hip_cooperative_groups.h:639
thread_block_tile_type()
Definition amd_hip_cooperative_groups.h:934
__hip_uint32_t block_rank() const
Rank of the calling thread's block within [0, num_blocks() ).
Definition amd_hip_cooperative_groups.h:193
grid_group(__hip_uint32_t size)
Construct grid thread group (through the API this_grid())
Definition amd_hip_cooperative_groups.h:186
__hip_uint32_t grid_rank()
Definition amd_hip_cooperative_groups.h:141
static constexpr unsigned int thread_rank()
Rank of the thread within this tile.
Definition amd_hip_cooperative_groups.h:830
bool is_tiled
Definition amd_hip_cooperative_groups.h:55
unsigned long long ballot(int pred) const
Ballot function on group level.
Definition amd_hip_cooperative_groups.h:588
void sync() const
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:149
T shfl_up(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:874
static __hip_uint32_t block_rank()
Rank of the calling thread's block within [0, num_blocks() ).
Definition amd_hip_cooperative_groups.h:280
static dim3 group_index()
Returns 3-dimensional block index within the grid.
Definition amd_hip_cooperative_groups.h:272
unsigned int num_threads
Definition amd_hip_cooperative_groups.h:63
static void sync()
Synchronizes the threads in the group.
Definition amd_hip_cooperative_groups.h:292
friend coalesced_group binary_partition(const thread_block_tile< fsize, fparent > &tgrp, bool pred)
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:143
bool is_valid() const
Returns true if the group has not violated any API constraints.
static unsigned int size()
Definition amd_hip_cooperative_groups.h:839
friend thread_group tiled_partition(const thread_group &parent, unsigned int tile_size)
User-exposed API to partition groups.
Definition amd_hip_cooperative_groups.h:1006
int any(int pred) const
Definition amd_hip_cooperative_groups.h:888
int all(int pred) const
Definition amd_hip_cooperative_groups.h:890
__hip_uint64_t _mask
Total number of threads in the thread_group.
Definition amd_hip_cooperative_groups.h:38
friend thread_group this_thread()
Definition amd_hip_cooperative_groups.h:994
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:195
__hip_uint32_t _type
Definition amd_hip_cooperative_groups.h:36
unsigned long long match_any(T value) const
Definition amd_hip_cooperative_groups.h:892
unsigned int size() const
Total number of threads in the group (alias of num_threads())
Definition amd_hip_cooperative_groups.h:363
struct cooperative_groups::thread_group::_coalesced_info coalesced_info
lane_mask member_mask
Definition amd_hip_cooperative_groups.h:62
thread_block(__hip_uint32_t size)
Definition amd_hip_cooperative_groups.h:246
unsigned int meta_group_size
Definition amd_hip_cooperative_groups.h:58
multi_grid_group(__hip_uint32_t size)
Construct multi-grid thread group (through the API this_multi_grid())
Definition amd_hip_cooperative_groups.h:131
int any(int pred) const
Any function on group level.
Definition amd_hip_cooperative_groups.h:600
thread_group this_thread()
Definition amd_hip_cooperative_groups.h:994
cluster_group this_cluster()
get cluster group
Definition amd_hip_cooperative_groups.h:1416
static unsigned int meta_group_rank()
Definition amd_hip_cooperative_groups.h:910
__hip_uint32_t num_threads() const
Definition amd_hip_cooperative_groups.h:76
struct _tiled_info tiled_info
Definition amd_hip_cooperative_groups.h:64
friend coalesced_group coalesced_threads()
User-exposed API to create coalesced groups.
Definition amd_hip_cooperative_groups.h:656
friend coalesced_group binary_partition(const coalesced_group &cgrp, bool pred)
Binary partition.
Definition amd_hip_cooperative_groups.h:1257
static dim3 thread_index()
Returns 3-dimensional thread index within the block.
Definition amd_hip_cooperative_groups.h:274
friend class thread_block
Definition amd_hip_cooperative_groups.h:70
coalesced_group(lane_mask member_mask)
Definition amd_hip_cooperative_groups.h:447
T shfl_down(T var, unsigned int lane_delta) const
Shuffle down operation on group level.
Definition amd_hip_cooperative_groups.h:520
static bool is_valid()
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:290
dim3 group_dim()
Returns the group dimensions.
Definition amd_hip_cooperative_groups.h:294
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:963
static __hip_uint32_t num_threads()
Definition amd_hip_cooperative_groups.h:284
unsigned long long ballot(int pred) const
Definition amd_hip_cooperative_groups.h:883
T shfl(T var, int srcRank) const
Shuffle operation on group level.
Definition amd_hip_cooperative_groups.h:495
arrival_token barrier_arrive() const
Arrive at a barrier.
Definition amd_hip_cooperative_groups.h:297
bool is_valid() const
Returns true if the group has not violated any API constraints.
Definition amd_hip_cooperative_groups.h:147
static __hip_uint32_t thread_rank()
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:276
friend grid_group this_grid()
User-exposed API interface to construct grid cooperative group type object - grid_group.
Definition amd_hip_cooperative_groups.h:224
T shfl_down(T var, unsigned int lane_delta) const
Definition amd_hip_cooperative_groups.h:870
T shfl_up(T var, unsigned int lane_delta) const
Shuffle up operation on group level.
Definition amd_hip_cooperative_groups.h:557
thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
Definition amd_hip_cooperative_groups.h:939
__hip_uint32_t thread_rank() const
Rank of the calling thread within [0, num_threads() ).
Definition amd_hip_cooperative_groups.h:24
Definition amd_hip_cooperative_groups.h:1315
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1316
Definition amd_hip_cooperative_groups.h:1331
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1332
Definition amd_hip_cooperative_groups.h:1323
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1324
Definition amd_hip_cooperative_groups.h:1307
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1308
Definition amd_hip_cooperative_groups.h:199
tiled_partition_internal(const thread_block &g)
Definition amd_hip_cooperative_groups.h:1212
tiled_partition_internal(const thread_block_tile< ParentSize, GrandParentCGTy > &g)
Definition amd_hip_cooperative_groups.h:1222
Definition amd_hip_cooperative_groups.h:1208
Definition amd_hip_cooperative_groups.h:1299
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1300
Definition amd_hip_cooperative_groups.h:1291
T operator()(T lhs, T rhs) const
Definition amd_hip_cooperative_groups.h:1292
Definition amd_hip_cooperative_groups.h:295
Definition amd_hip_cooperative_groups.h:61
Definition amd_hip_cooperative_groups.h:54
Definition hip_runtime_api.h:1298