HIP: Heterogeneous-computing Interface for Portability
hip_cooperative_groups_helper.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_runtime.h> // threadIdx, blockIdx
#include <hip/amd_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif

#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif

#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif

#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif

namespace cooperative_groups {

/* Global scope */
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;

template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;

template <unsigned int size>
using is_valid_tile_size =
    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

template <typename T>
using is_valid_type =
    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;

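// Illustrative note (not part of the original header): these traits are intended
// for compile-time validation of tile parameters, e.g.
//
//   static_assert(is_valid_tile_size<16>::value,
//                 "tile size must be a power of 2 no larger than the wavefront size");
//   static_assert(is_valid_type<float>::value,
//                 "element type must be integral or floating point");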
namespace internal {

typedef enum {
  cg_invalid,
  cg_multi_grid,
  cg_grid,
  cg_workgroup,
  cg_tiled_group,
  cg_coalesced_group
} group_type;
namespace multi_grid {

__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }

__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }

__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }

} // namespace multi_grid

namespace grid {

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                               (blockDim.x * gridDim.x));
}
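// Worked example (hypothetical launch, for illustration only):
//   gridDim = (4, 2, 1), blockDim = (64, 1, 1)
//   size() = (1 * 1) * (1 * 2) * (64 * 4) = 512 threads in the grid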

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute the global id of the workgroup to which the current thread belongs
  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                          (blockIdx.y * gridDim.x) + (blockIdx.x));

  // Compute the total number of threads in the workgroups that precede the
  // current workgroup within the grid
  uint32_t num_threads_till_current_workgroup =
      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));

  // Compute the thread's local rank within the current workgroup
  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                                     (threadIdx.y * blockDim.x) + (threadIdx.x));

  return (num_threads_till_current_workgroup + local_thread_rank);
}
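// Worked example (same hypothetical launch as above, for illustration only):
//   gridDim = (4, 2, 1), blockDim = (64, 1, 1), blockIdx = (1, 1, 0), threadIdx = (5, 0, 0)
//   blkIdx = 0 * 2 * 4 + 1 * 4 + 1 = 5
//   num_threads_till_current_workgroup = 5 * 64 = 320
//   local_thread_rank = 5, so thread_rank() = 320 + 5 = 325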

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }

} // namespace grid

namespace workgroup {

__CG_STATIC_QUALIFIER__ dim3 group_index() {
  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
               static_cast<uint32_t>(blockIdx.z)));
}

__CG_STATIC_QUALIFIER__ dim3 thread_index() {
  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
               static_cast<uint32_t>(threadIdx.z)));
}

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                (threadIdx.y * blockDim.x) + (threadIdx.x)));
}

__CG_STATIC_QUALIFIER__ bool is_valid() {
  return true;
}

__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }

__CG_STATIC_QUALIFIER__ dim3 block_dim() {
  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
               static_cast<uint32_t>(blockDim.z)));
}

} // namespace workgroup

namespace tiled_group {

// Enforce ordering of memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

} // namespace tiled_group

namespace coalesced_group {

// Enforce ordering of memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

// Masked bit count
//
// For each thread, this function returns the number of active threads that
// precede the current thread and whose corresponding bit in x is set.
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
  unsigned int counter = 0;
  #if __AMDGCN_WAVEFRONT_SIZE == 32
    counter = __builtin_amdgcn_mbcnt_lo(x, add);
  #else
    counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
    counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
  #endif

  return counter;
}
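// Illustrative example (hypothetical values, not part of the original header):
//   for the lane with id 3 and x = 0b1011, only the bits of x belonging to
//   lanes 0..2 are counted (lanes 0 and 1 are set here), so
//   masked_bit_count(x, add) returns 2 + add.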

} // namespace coalesced_group


} // namespace internal

} // namespace cooperative_groups
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H