Point Cloud Library (PCL)  1.9.1
utils.hpp
1 /*
2  * Software License Agreement (BSD License)
3  *
4  * Point Cloud Library (PCL) - www.pointclouds.org
5  * Copyright (c) 2011, Willow Garage, Inc.
6  *
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * * Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * * Redistributions in binary form must reproduce the above
16  * copyright notice, this list of conditions and the following
17  * disclaimer in the documentation and/or other materials provided
18  * with the distribution.
19  * * Neither the name of Willow Garage, Inc. nor the names of its
20  * contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  *
36  */
37 
38 
39 #ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
40 #define PCL_GPU_KINFU_CUDA_UTILS_HPP_
41 
42 #include <cuda.h>
43 
44 namespace pcl
45 {
46  namespace device
47  {
48  namespace kinfuLS
49  {
/** \brief Exchanges the values of \a a and \a b; usable from host and device code. */
template <class T>
__device__ __host__ __forceinline__ void swap ( T& a, T& b )
{
  T tmp(a);
  a = b;
  b = tmp;
}
55 
// Device-side stand-in for std::numeric_limits; only the members actually
// used by this header are provided, via explicit specializations below.
template<typename T> struct numeric_limits;

template<> struct numeric_limits<float>
{
  // Quiet-NaN bit pattern reinterpreted as float (same value as CUDART_NAN_F).
  __device__ __forceinline__ static float
  quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ };
  // Machine epsilon for single precision (FLT_EPSILON).
  __device__ __forceinline__ static float
  epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };

  // Smallest positive normalized float (FLT_MIN).
  __device__ __forceinline__ static float
  min() { return 1.175494351e-38f/*FLT_MIN*/; };
  // Largest finite float (FLT_MAX).
  __device__ __forceinline__ static float
  max() { return 3.402823466e+38f/*FLT_MAX*/; };
};

template<> struct numeric_limits<short>
{
  // Largest representable short (SHRT_MAX).
  __device__ __forceinline__ static short
  max() { return SHRT_MAX; };
};
76 
/** \brief Scalar (dot) product of two 3-vectors. */
__device__ __forceinline__ float
dot(const float3& v1, const float3& v2)
{
  float result = v1.x * v2.x;
  result += v1.y * v2.y;
  result += v1.z * v2.z;
  return result;
}
82 
/** \brief Adds scalar \a v to every component of \a vec in place; returns \a vec. */
__device__ __forceinline__ float3&
operator+=(float3& vec, const float& v)
{
  vec.x += v;
  vec.y += v;
  vec.z += v;
  return vec;
}
88 
/** \brief Component-wise sum of two 3-vectors. */
__device__ __forceinline__ float3
operator+(const float3& v1, const float3& v2)
{
  float3 sum;
  sum.x = v1.x + v2.x;
  sum.y = v1.y + v2.y;
  sum.z = v1.z + v2.z;
  return sum;
}
94 
/** \brief Scales every component of \a vec by \a v in place; returns \a vec. */
__device__ __forceinline__ float3&
operator*=(float3& vec, const float& v)
{
  vec.x *= v;
  vec.y *= v;
  vec.z *= v;
  return vec;
}
100 
/** \brief Component-wise difference v1 - v2. */
__device__ __forceinline__ float3
operator-(const float3& v1, const float3& v2)
{
  float3 diff;
  diff.x = v1.x - v2.x;
  diff.y = v1.y - v2.y;
  diff.z = v1.z - v2.z;
  return diff;
}
106 
/** \brief Returns \a v1 scaled by the scalar \a v. */
__device__ __forceinline__ float3
operator*(const float3& v1, const float& v)
{
  float3 scaled;
  scaled.x = v1.x * v;
  scaled.y = v1.y * v;
  scaled.z = v1.z * v;
  return scaled;
}
112 
/** \brief Euclidean length of \a v.
  * Uses the explicitly single-precision sqrtf so the float argument is never
  * promoted through a double-precision sqrt on the device. */
__device__ __forceinline__ float
norm(const float3& v)
{
  return sqrtf(dot(v, v));
}
118 
/** \brief Returns \a v scaled to unit length.
  * Uses the explicitly single-precision rsqrtf intrinsic (matches the rest of
  * this header, e.g. Eigen33), avoiding any double-precision overload.
  * \note no guard for the zero vector: dot(v,v)==0 yields non-finite output. */
__device__ __forceinline__ float3
normalized(const float3& v)
{
  return v * rsqrtf(dot(v, v));
}
124 
/** \brief Right-handed cross product v1 x v2; usable from host and device code. */
__device__ __host__ __forceinline__ float3
cross(const float3& v1, const float3& v2)
{
  float3 result;
  result.x = v1.y * v2.z - v1.z * v2.y;
  result.y = v1.z * v2.x - v1.x * v2.z;
  result.z = v1.x * v2.y - v1.y * v2.x;
  return result;
}
130 
/** \brief Roots of the quadratic x^2 - b*x + c = 0, written to roots.y (smaller)
  * and roots.z (larger); roots.x is set to 0. A negative discriminant is clamped
  * to zero (should not occur for the symmetric PSD matrices this is used on). */
__device__ __forceinline__ void computeRoots2(const float& b, const float& c, float3& roots)
{
  roots.x = 0.f;

  float discriminant = b * b - 4.f * c;
  if (discriminant < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
    discriminant = 0.f;

  float sqrt_d = sqrtf(discriminant);
  roots.y = 0.5f * (b - sqrt_d);
  roots.z = 0.5f * (b + sqrt_d);
}
143 
/** \brief Computes the three real roots of x^3 - c2*x^2 + c1*x - c0 = 0 and
  * returns them in ascending order in \a roots (trigonometric closed form).
  * Intended for the characteristic polynomial of a symmetric positive
  * semi-definite 3x3 matrix, so the roots are assumed real and non-negative.
  */
__device__ __forceinline__ void
computeRoots3(float c0, float c1, float c2, float3& roots)
{
  if ( fabsf(c0) < numeric_limits<float>::epsilon())// one root is 0 -> quadratic equation
  {
    computeRoots2 (c2, c1, roots);
  }
  else
  {
    const float s_inv3 = 1.f/3.f;
    const float s_sqrt3 = sqrtf(3.f);
    // Construct the parameters used in classifying the roots of the equation
    // and in solving the equation for the roots in closed form.
    float c2_over_3 = c2 * s_inv3;
    float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
    // Clamp: a_over_3 must be <= 0 for real roots (guards against round-off).
    if (a_over_3 > 0.f)
      a_over_3 = 0.f;

    float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

    // q must be <= 0 for three real roots; clamp round-off excursions.
    float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
    if (q > 0.f)
      q = 0.f;

    // Compute the eigenvalues by solving for the roots of the polynomial.
    float rho = sqrtf(-a_over_3);
    float theta = atan2f (sqrtf (-q), half_b)*s_inv3;
    // __cosf/__sinf are fast-math intrinsics: lower precision, adequate here.
    float cos_theta = __cosf (theta);
    float sin_theta = __sinf (theta);
    roots.x = c2_over_3 + 2.f * rho * cos_theta;
    roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
    roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

    // Sort in increasing order.
    if (roots.x >= roots.y)
      swap(roots.x, roots.y);

    if (roots.y >= roots.z)
    {
      swap(roots.y, roots.z);

      if (roots.x >= roots.y)
        swap (roots.x, roots.y);
    }
    if (roots.x <= 0) // eigenval for symmetric positive semi-definite matrix can not be negative! Set it to 0
      computeRoots2 (c2, c1, roots);
  }
}
192 
/** \brief Closed-form eigen decomposition of a symmetric 3x3 matrix (device code).
  *
  * The matrix is supplied packed as 6 floats holding the upper triangle in the
  * order [m00, m01, m02, m11, m12, m22] (see the m??() accessors below).
  * Eigenvalues are returned in ascending order in \a evals with the matching
  * eigenvectors in the rows of \a evecs.
  *
  * \note compute() divides the caller's packed buffer in place by its largest
  *       magnitude entry (eigenvalues are rescaled back before returning).
  */
struct Eigen33
{
  public:
    /** \brief Minimal fixed-size matrix made of float3 rows. */
    template<int Rows>
    struct MiniMat
    {
      float3 data[Rows];
      __device__ __host__ __forceinline__ float3& operator[](int i) { return data[i]; }
      __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
    };
    typedef MiniMat<3> Mat33;
    typedef MiniMat<4> Mat43;

    /** \brief Returns a unit vector orthogonal to \a src.
      * The formula is chosen so the implicit cross product is never taken with
      * a vector nearly colinear to \a src. */
    static __forceinline__ __device__ float3
    unitOrthogonal (const float3& src)
    {
      float3 perp;
      /* unless the x and y coords are both close to zero, we can
       * simply take ( -y, x, 0 ) and normalize it. */
      if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
      {
        float invnm = rsqrtf(src.x*src.x + src.y*src.y);
        perp.x = -src.y * invnm;
        perp.y =  src.x * invnm;
        perp.z = 0.0f;
      }
      /* if both x and y are close to zero, then the vector is close to the
       * z-axis, so it's far from colinear to the x-axis for instance. So we
       * take the crossed product with (1,0,0) and normalize it. */
      else
      {
        float invnm = rsqrtf(src.z * src.z + src.y * src.y);
        perp.x = 0.0f;
        perp.y = -src.z * invnm;
        perp.z =  src.y * invnm;
      }

      return perp;
    }

    /** \param mat_pkg_arg packed upper triangle [m00 m01 m02 m11 m12 m22];
      *        modified in place by compute(). */
    __device__ __forceinline__
    Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}

    /** \brief Computes eigenvalues (\a evals, ascending) and eigenvectors (rows of \a evecs).
      * \param tmp     caller-provided scratch matrix (contents destroyed)
      * \param vec_tmp caller-provided scratch matrix (contents destroyed)
      * \param evecs   output eigenvectors, row i pairs with evals component i
      * \param evals   output eigenvalues in ascending order
      */
    __device__ __forceinline__ void
    compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
    {
      // Scale the matrix so its entries are in [-1,1]. The scaling is applied
      // only when at least one matrix entry has magnitude larger than 1.
      float max01 = fmaxf( fabsf(mat_pkg[0]), fabsf(mat_pkg[1]) );
      float max23 = fmaxf( fabsf(mat_pkg[2]), fabsf(mat_pkg[3]) );
      float max45 = fmaxf( fabsf(mat_pkg[4]), fabsf(mat_pkg[5]) );
      float m0123 = fmaxf( max01, max23);
      float scale = fmaxf( max45, m0123);

      if (scale <= numeric_limits<float>::min())
        scale = 1.f;

      mat_pkg[0] /= scale;
      mat_pkg[1] /= scale;
      mat_pkg[2] /= scale;
      mat_pkg[3] /= scale;
      mat_pkg[4] /= scale;
      mat_pkg[5] /= scale;

      // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
      // eigenvalues are the roots to this equation, all guaranteed to be
      // real-valued, because the matrix is symmetric.
      float c0 = m00() * m11() * m22()
          + 2.f * m01() * m02() * m12()
          - m00() * m12() * m12()
          - m11() * m02() * m02()
          - m22() * m01() * m01();
      float c1 = m00() * m11() -
          m01() * m01() +
          m00() * m22() -
          m02() * m02() +
          m11() * m22() -
          m12() * m12();
      float c2 = m00() + m11() + m22();

      computeRoots3(c0, c1, c2, evals);

      if(evals.z - evals.x <= numeric_limits<float>::epsilon())
      {
        // All three eigenvalues numerically equal: any orthonormal basis works.
        evecs[0] = make_float3(1.f, 0.f, 0.f);
        evecs[1] = make_float3(0.f, 1.f, 0.f);
        evecs[2] = make_float3(0.f, 0.f, 1.f);
      }
      else if (evals.y - evals.x <= numeric_limits<float>::epsilon() )
      {
        // first and second equal: solve only for the eigenvector of the
        // distinct (largest) eigenvalue via cross products of rows of (A - evals.z*I),
        // picking the longest (best conditioned) one.
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot (vec_tmp[0], vec_tmp[0]);
        float len2 = dot (vec_tmp[1], vec_tmp[1]);
        float len3 = dot (vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          evecs[2] = vec_tmp[0] * rsqrtf (len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          evecs[2] = vec_tmp[1] * rsqrtf (len2);
        }
        else
        {
          evecs[2] = vec_tmp[2] * rsqrtf (len3);
        }

        evecs[1] = unitOrthogonal(evecs[2]);
        evecs[0] = cross(evecs[1], evecs[2]);
      }
      else if (evals.z - evals.y <= numeric_limits<float>::epsilon() )
      {
        // second and third equal: solve for the eigenvector of the smallest
        // eigenvalue, then complete the orthonormal basis.
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          evecs[0] = vec_tmp[0] * rsqrtf(len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          evecs[0] = vec_tmp[1] * rsqrtf(len2);
        }
        else
        {
          evecs[0] = vec_tmp[2] * rsqrtf(len3);
        }

        evecs[1] = unitOrthogonal( evecs[0] );
        evecs[2] = cross(evecs[0], evecs[1]);
      }
      else
      {
        // Three distinct eigenvalues: solve each eigenvector from the rows of
        // (A - eval*I), remembering which solve was best/worst conditioned
        // (mmax holds the squared cross-product length used for evecs[i]).
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        float mmax[3];

        unsigned int min_el = 2;
        unsigned int max_el = 2;
        if (len1 >= len2 && len1 >= len3)
        {
          mmax[2] = len1;
          evecs[2] = vec_tmp[0] * rsqrtf (len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[2] = len2;
          evecs[2] = vec_tmp[1] * rsqrtf (len2);
        }
        else
        {
          mmax[2] = len3;
          evecs[2] = vec_tmp[2] * rsqrtf (len3);
        }

        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot(vec_tmp[0], vec_tmp[0]);
        len2 = dot(vec_tmp[1], vec_tmp[1]);
        len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          mmax[1] = len1;
          evecs[1] = vec_tmp[0] * rsqrtf (len1);
          min_el = len1 <= mmax[min_el] ? 1 : min_el;
          max_el = len1 >  mmax[max_el] ? 1 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[1] = len2;
          evecs[1] = vec_tmp[1] * rsqrtf (len2);
          min_el = len2 <= mmax[min_el] ? 1 : min_el;
          max_el = len2 >  mmax[max_el] ? 1 : max_el;
        }
        else
        {
          mmax[1] = len3;
          evecs[1] = vec_tmp[2] * rsqrtf (len3);
          min_el = len3 <= mmax[min_el] ? 1 : min_el;
          max_el = len3 >  mmax[max_el] ? 1 : max_el;
        }

        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot (vec_tmp[0], vec_tmp[0]);
        len2 = dot (vec_tmp[1], vec_tmp[1]);
        len3 = dot (vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          mmax[0] = len1;
          evecs[0] = vec_tmp[0] * rsqrtf (len1);
          // BUGFIX: compare with len1 here (the original compared len3 in all
          // three branches — a copy-paste slip from the final else branch).
          min_el = len1 <= mmax[min_el] ? 0 : min_el;
          max_el = len1 >  mmax[max_el] ? 0 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[0] = len2;
          evecs[0] = vec_tmp[1] * rsqrtf (len2);
          // BUGFIX: compare with len2 (was len3).
          min_el = len2 <= mmax[min_el] ? 0 : min_el;
          max_el = len2 >  mmax[max_el] ? 0 : max_el;
        }
        else
        {
          mmax[0] = len3;
          evecs[0] = vec_tmp[2] * rsqrtf (len3);
          min_el = len3 <= mmax[min_el] ? 0 : min_el;
          max_el = len3 >  mmax[max_el] ? 0 : max_el;
        }

        // Recompute the two least reliable eigenvectors from the most reliable
        // one so the returned basis is orthonormal.
        unsigned mid_el = 3 - min_el - max_el;
        evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
        evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
      }
      // Rescale back to the original size.
      evals *= scale;
    }
  private:
    // Packed upper triangle of the symmetric matrix: [m00 m01 m02 m11 m12 m22].
    volatile float* mat_pkg;

    __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
    __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
    __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
    __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
    __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
    __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
    __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
    __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
    __device__ __forceinline__ float m22() const { return mat_pkg[5]; }

    __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
    __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
    __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }

    /** \brief True when x*x is negligible compared to y*y (Eigen's isMuchSmallerThan). */
    __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
    {
      // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
      // Restored: this declaration was dropped in transcription, leaving
      // prec_sqr undefined below.
      const float prec_sqr = numeric_limits<float>::epsilon() * numeric_limits<float>::epsilon();
      return x * x <= prec_sqr * y * y;
    }
};
480 
/** \brief Block-level helpers: thread enumeration and shared-memory tree reductions.
  * \note Restored the flattenedThreadId() identifier, which was dropped from
  *       this copy of the file (only the return type and body survived). */
struct Block
{
  /** \brief Total number of threads in the block. */
  static __device__ __forceinline__ unsigned int stride()
  {
    return blockDim.x * blockDim.y * blockDim.z;
  }

  /** \brief Linear index of the calling thread within its (up to 3D) block. */
  static __device__ __forceinline__ int
  flattenedThreadId()
  {
    return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
  }

  /** \brief In-place tree reduction of buffer[0..CTA_SIZE) with \a op; the
    * result ends up in buffer[0]. All CTA_SIZE threads must call this.
    * The tail (last 32 elements) relies on implicit warp-synchronous execution
    * over the volatile buffer — NOTE(review): a pre-Volta assumption; Volta+
    * independent thread scheduling would need __syncwarp between steps. */
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid];

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
  }

  /** \brief Same reduction, but each thread first stores \a init into the
    * buffer, and every thread returns the reduced value buffer[0]. */
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid] = init;
    __syncthreads();

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
    __syncthreads();
    return buffer[0];
  }
};
541 
/** \brief Warp-level helpers built on PTX special registers.
  * \note Restored the enum body and the laneId()/laneMaskLt() declaration
  *       lines, which were dropped from this copy of the file (the surviving
  *       uses — `tid >> LOG_WARP_SIZE` and the 32-bit mask math — fix the
  *       values below). */
struct Warp
{
  enum
  {
    LOG_WARP_SIZE = 5,                   // log2 of the warp size
    WARP_SIZE     = 1 << LOG_WARP_SIZE,  // 32 threads per warp
    STRIDE        = WARP_SIZE
  };

  /** \brief Returns the warp lane ID of the calling thread. */
  static __device__ __forceinline__ unsigned int
  laneId()
  {
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret) );
    return ret;
  }

  /** \brief Returns the index of the calling thread's warp within the block. */
  static __device__ __forceinline__ unsigned int id()
  {
    int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    return tid >> LOG_WARP_SIZE;
  }

  /** \brief Bit mask of all lanes with an ID lower than the caller's. */
  static __device__ __forceinline__
  int laneMaskLt()
  {
#if (__CUDA_ARCH__ >= 200)
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
    return ret;
#else
    return 0xFFFFFFFF >> (32 - laneId());
#endif
  }

  /** \brief Exclusive scan over a ballot mask: number of set bits in lower lanes. */
  static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
  {
    return __popc(Warp::laneMaskLt() & ballot_mask);
  }
};
583 
584 
/** \brief Warp vote/ballot operations with a shared-memory software fallback
  * for pre-Fermi devices (no hardware __ballot/__all). */
struct Emulation
{
  /** \brief Sums 32 consecutive ints of \a ptr belonging to the caller's warp;
    * every thread of the warp returns the warp's total. \a ptr must be
    * (volatile) shared memory with one slot per thread.
    * NOTE(review): relies on implicit warp-synchronous execution — a pre-Volta
    * assumption; Volta+ would need __syncwarp between steps. */
  static __device__ __forceinline__ int
  warp_reduce ( volatile int *ptr , const unsigned int tid)
  {
    const unsigned int lane = tid & 31; // index of thread in warp (0..31)

    if (lane < 16)
    {
      int partial = ptr[tid];

      ptr[tid] = partial = partial + ptr[tid + 16];
      ptr[tid] = partial = partial + ptr[tid + 8];
      ptr[tid] = partial = partial + ptr[tid + 4];
      ptr[tid] = partial = partial + ptr[tid + 2];
      ptr[tid] = partial = partial + ptr[tid + 1];
    }
    // ptr[tid - lane] is the warp's slot 0, holding the final sum.
    return ptr[tid - lane];
  }

  /** \brief Warp ballot: bit i of the result is set iff lane i's \a predicate
    * is non-zero. \a cta_buffer (one int per thread) is only used on the
    * software fallback path.
    * NOTE(review): __activemask() yields whichever lanes happen to be
    * converged, which under independent thread scheduling may not be the full
    * intended warp — confirm callers invoke this warp-uniformly. */
  static __forceinline__ __device__ int
  Ballot(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    (void)cta_buffer;
    return __ballot_sync (__activemask (), predicate);
#elif __CUDA_ARCH__ >= 200
    (void)cta_buffer;
    return __ballot (predicate);
#else
    // Software fallback: each lane publishes its own bit, then warp_reduce
    // ORs them together (bits are disjoint, so addition == OR).
    int tid = Block::flattenedThreadId();
    cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
    return warp_reduce(cta_buffer, tid);
#endif
  }

  /** \brief Warp vote: true iff \a predicate is non-zero on every
    * participating lane. Same fallback/mask caveats as Ballot(). */
  static __forceinline__ __device__ bool
  All(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    (void)cta_buffer;
    return __all_sync (__activemask (), predicate);
#elif __CUDA_ARCH__ >= 200
    (void)cta_buffer;
    return __all (predicate);
#else
    // Software fallback: count lanes voting 1; all 32 must agree.
    int tid = Block::flattenedThreadId();
    cta_buffer[tid] = predicate ? 1 : 0;
    return warp_reduce(cta_buffer, tid) == 32;
#endif
  }
};
637  }
638  }
639 }
640 
641 #endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */
pcl
This file defines compatibility wrappers for low level I/O functions.
Definition: convolution.h:45
pcl::device::kinfuLS::numeric_limits< float >::min
__device__ static __forceinline__ float min()
Definition: utils.hpp:66
pcl::device::kinfuLS::operator*
__device__ __forceinline__ float3 operator*(const Mat33 &m, const float3 &vec)
Definition: device.hpp:80
pcl::device::kinfuLS::Block::reduce
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition: utils.hpp:517
pcl::device::kinfuLS::Eigen33::MiniMat
Definition: utils.hpp:197
pcl::device::kinfuLS::Warp::binaryExclScan
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
Definition: utils.hpp:578
pcl::device::kinfuLS::Warp::STRIDE
@ STRIDE
Definition: utils.hpp:548
pcl::device::kinfuLS::numeric_limits< float >::epsilon
__device__ static __forceinline__ float epsilon()
Definition: utils.hpp:63
pcl::device::kinfuLS::Eigen33::MiniMat::operator[]
__device__ __host__ __forceinline__ float3 & operator[](int i)
Definition: utils.hpp:200
pcl::device::kinfuLS::computeRoots2
__device__ __forceinline__ void computeRoots2(const float &b, const float &c, float3 &roots)
Definition: utils.hpp:131
pcl::device::kinfuLS::Warp::WARP_SIZE
@ WARP_SIZE
Definition: utils.hpp:547
pcl::device::kinfuLS::computeRoots3
__device__ __forceinline__ void computeRoots3(float c0, float c1, float c2, float3 &roots)
Definition: utils.hpp:145
pcl::device::kinfuLS::Eigen33::compute
__device__ __forceinline__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals)
Definition: utils.hpp:243
pcl::device::kinfuLS::Block::reduce
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition: utils.hpp:495
pcl::device::kinfuLS::dot
__device__ __forceinline__ float dot(const float3 &v1, const float3 &v2)
Definition: utils.hpp:78
pcl::device::kinfuLS::numeric_limits
Definition: utils.hpp:56
pcl::device::kinfuLS::numeric_limits< float >::quiet_NaN
__device__ static __forceinline__ float quiet_NaN()
Definition: utils.hpp:61
pcl::device::kinfuLS::Emulation
Definition: utils.hpp:585
pcl::device::kinfuLS::Emulation::All
static __forceinline__ __device__ bool All(int predicate, volatile int *cta_buffer)
Definition: utils.hpp:622
pcl::device::kinfuLS::Eigen33::unitOrthogonal
static __forceinline__ __device__ float3 unitOrthogonal(const float3 &src)
Definition: utils.hpp:208
pcl::device::kinfuLS::Warp::laneMaskLt
static __device__ __forceinline__ int laneMaskLt()
Definition: utils.hpp:567
pcl::device::kinfuLS::Eigen33::MiniMat::operator[]
__device__ __host__ const __forceinline__ float3 & operator[](int i) const
Definition: utils.hpp:201
pcl::device::kinfuLS::Warp
Definition: utils.hpp:542
pcl::device::kinfuLS::Block::flattenedThreadId
static __device__ __forceinline__ int flattenedThreadId()
Definition: utils.hpp:489
pcl::device::kinfuLS::Eigen33
Definition: utils.hpp:193
pcl::device::kinfuLS::Eigen33::Mat43
MiniMat< 4 > Mat43
Definition: utils.hpp:204
pcl::device::kinfuLS::Eigen33::Eigen33
__device__ __forceinline__ Eigen33(volatile float *mat_pkg_arg)
Definition: utils.hpp:241
pcl::device::kinfuLS::Emulation::Ballot
static __forceinline__ __device__ int Ballot(int predicate, volatile int *cta_buffer)
Definition: utils.hpp:606
pcl::device::kinfuLS::Warp::id
static __device__ __forceinline__ unsigned int id()
Definition: utils.hpp:560
pcl::device::kinfuLS::Eigen33::Mat33
MiniMat< 3 > Mat33
Definition: utils.hpp:203
pcl::device::kinfuLS::Emulation::warp_reduce
static __device__ __forceinline__ int warp_reduce(volatile int *ptr, const unsigned int tid)
Definition: utils.hpp:588
pcl::device::kinfuLS::operator-
__device__ __forceinline__ float3 operator-(const float3 &v1, const float3 &v2)
Definition: utils.hpp:102
pcl::device::kinfuLS::Warp::LOG_WARP_SIZE
@ LOG_WARP_SIZE
Definition: utils.hpp:546
pcl::device::kinfuLS::swap
__device__ __host__ __forceinline__ void swap(T &a, T &b)
Definition: utils.hpp:51
pcl::device::kinfuLS::cross
__device__ __host__ __forceinline__ float3 cross(const float3 &v1, const float3 &v2)
Definition: utils.hpp:126
pcl::device::kinfuLS::normalized
__device__ __forceinline__ float3 normalized(const float3 &v)
Definition: utils.hpp:120
pcl::device::kinfuLS::Block::stride
static __device__ __forceinline__ unsigned int stride()
Definition: utils.hpp:483
pcl::device::kinfuLS::operator+
__device__ __forceinline__ float3 operator+(const float3 &v1, const float3 &v2)
Definition: utils.hpp:90
pcl::device::kinfuLS::Block
Definition: utils.hpp:481
pcl::device::kinfuLS::norm
__device__ __forceinline__ float norm(const float3 &v)
Definition: utils.hpp:114
pcl::device::kinfuLS::operator+=
__device__ __forceinline__ float3 & operator+=(float3 &vec, const float &v)
Definition: utils.hpp:84
pcl::device::kinfuLS::numeric_limits< float >::max
__device__ static __forceinline__ float max()
Definition: utils.hpp:68
pcl::device::kinfuLS::operator*=
__device__ __forceinline__ float3 & operator*=(float3 &vec, const float &v)
Definition: utils.hpp:96
pcl::device::kinfuLS::Warp::laneId
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: utils.hpp:553
pcl::device::numeric_limits::epsilon
__device__ static __forceinline__ type epsilon()
Definition: limits.hpp:49
pcl::device::kinfuLS::Eigen33::MiniMat::data
float3 data[Rows]
Definition: utils.hpp:199
pcl::device::kinfuLS::numeric_limits< short >::max
__device__ static __forceinline__ short max()
Definition: utils.hpp:74