Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
#define INCLUDED_volk_32fc_x2_divide_32fc_u_H

#include <float.h>
#include <inttypes.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
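
/*
 * All SIMD variants below avoid a per-element complex division by using
 *
 *    a     a * conj(b)
 *   --- = -------------
 *    b        |b|^2
 *
 * which needs only real multiplies, adds, and one real division (or
 * reciprocal) per element. A minimal scalar sketch of that method follows;
 * the helper name is illustrative and not part of the VOLK API:
 */
static inline lv_32fc_t divide_32fc_conj_sketch(lv_32fc_t a, lv_32fc_t b)
{
    const float br = lv_creal(b);
    const float bi = lv_cimag(b);
    const float inv_mag_sq = 1.0f / (br * br + bi * bi); // 1 / |b|^2
    // a * conj(b) = (ar*br + ai*bi) + (ai*br - ar*bi)j
    return lv_cmake((lv_creal(a) * br + lv_cimag(a) * bi) * inv_mag_sq,
                    (lv_cimag(a) * br - lv_creal(a) * bi) * inv_mag_sq);
}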


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denominatorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *   a     a * b*
     *  --- = --------
     *   b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denominatorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_loadu_ps((float*)a);                  // first pair
        den01 = _mm_loadu_ps((float*)b);                  // first pair
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
        a += 2;
        b += 2;

        num23 = _mm_loadu_ps((float*)a);                  // second pair
        den23 = _mm_loadu_ps((float*)b);                  // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
        a += 2;
        b += 2;

        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
        den01 = _mm_unpacklo_ps(norm, norm); // duplicate the lower two |b|^2 values
        den23 = _mm_unpackhi_ps(norm, norm); // duplicate the upper two |b|^2 values

        result = _mm_div_ps(num01, den01);
        _mm_storeu_ps((float*)c, result); // store the results back into the C container
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_storeu_ps((float*)c, result); // store the results back into the C container
        c += 2;
    }

    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
#endif /* LV_HAVE_SSE3 */
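
/*
 * Lane bookkeeping for the SSE3 variants, for four denominators b0..b3
 * (worked out from the intrinsics' documented behavior; low element first):
 *
 *   norm                        = [|b0|^2, |b1|^2, |b2|^2, |b3|^2]
 *   _mm_unpacklo_ps(norm, norm) = [|b0|^2, |b0|^2, |b1|^2, |b1|^2]
 *   _mm_unpackhi_ps(norm, norm) = [|b2|^2, |b2|^2, |b3|^2, |b3|^2]
 *
 * so each |b|^2 is duplicated to cover both the real and the imaginary slot
 * of the corresponding a * conj(b) value before the element-wise division.
 */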


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denominatorVector,
                                                  unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *   a     a * b*
     *  --- = --------
     *   b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denominatorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps((float*)a);   // load ar + ai, br + bi ... as ar,ai,br,bi ...
        denum = _mm256_loadu_ps((float*)b); // load cr + ci, dr + di ... as cr,ci,dr,di ...
        mul_conj = _mm256_complexconjugatemul_ps(num, denum);
        sq = _mm256_mul_ps(denum, denum);   // square the real and imaginary parts
        mag_sq_un = _mm256_hadd_ps(sq, sq); // the squared magnitudes, but out of order
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder to match the interleaved layout
        // best guide I found on using these functions:
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div); // store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX */
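
/*
 * Lane bookkeeping for the unaligned AVX variant above, for four inputs
 * b0..b3 (worked out from the intrinsics' documented behavior; 128-bit
 * lanes separated by '|'):
 *
 *   sq      = [b0r^2, b0i^2, b1r^2, b1i^2 | b2r^2, b2i^2, b3r^2, b3i^2]
 *   hadd    = [|b0|^2, |b1|^2, |b0|^2, |b1|^2 | |b2|^2, |b3|^2, |b2|^2, |b3|^2]
 *
 * permute 0xd8 picks elements 0, 2, 1, 3 of each 128-bit lane:
 *
 *   mag_sq  = [|b0|^2, |b0|^2, |b1|^2, |b1|^2 | |b2|^2, |b2|^2, |b3|^2, |b3|^2]
 *
 * which lines up with the interleaved re/im layout of mul_conj for the
 * element-wise division.
 */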


#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */


#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
#define INCLUDED_volk_32fc_x2_divide_32fc_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denominatorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *   a     a * b*
     *  --- = --------
     *   b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denominatorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_load_ps((float*)a);                   // first pair
        den01 = _mm_load_ps((float*)b);                   // first pair
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
        a += 2;
        b += 2;

        num23 = _mm_load_ps((float*)a);                   // second pair
        den23 = _mm_load_ps((float*)b);                   // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
        a += 2;
        b += 2;

        norm = _mm_magnitudesquared_ps_sse3(den01, den23);

        den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
        den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice

        result = _mm_div_ps(num01, den01);
        _mm_store_ps((float*)c, result); // store the results back into the C container
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_store_ps((float*)c, result); // store the results back into the C container
        c += 2;
    }

    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denominatorVector,
                                                  unsigned int num_points)
{
    /*
     * Guide to AVX intrinsics:
     * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
     *
     * we'll do the "classical"
     *   a     a * b*
     *  --- = --------
     *   b     |b|^2
     */
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denominatorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // load ar + ai, br + bi ... as ar,ai,br,bi ...
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);

        num01 = _mm256_complexconjugatemul_ps(num01, denum01);
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23);
        a += 4;
        b += 4;

        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
}
#endif /* LV_HAVE_AVX */
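
/*
 * Lane bookkeeping for the aligned AVX variant above, for eight inputs
 * b0..b7 (worked out from the intrinsics' documented behavior; 128-bit
 * lanes separated by '|'). denum01 holds b0..b3 and denum23 holds b4..b7:
 *
 *   complex_result = [|b0|^2,|b1|^2,|b4|^2,|b5|^2 | |b2|^2,|b3|^2,|b6|^2,|b7|^2]
 *
 * shuffle 0x50 picks elements 0, 0, 1, 1 of each lane:
 *                  = [|b0|^2,|b0|^2,|b1|^2,|b1|^2 | |b2|^2,|b2|^2,|b3|^2,|b3|^2]
 * shuffle 0xfa picks elements 2, 2, 3, 3 of each lane:
 *                  = [|b4|^2,|b4|^2,|b5|^2,|b5|^2 | |b6|^2,|b6|^2,|b7|^2,|b7|^2]
 *
 * matching num01 (elements 0..3) and num23 (elements 4..7) respectively.
 */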


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal;
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // deinterleaved load: real/imag planes
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // |b|^2 = br*br + bi*bi
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // reciprocal estimate of |b|^2, refined with two Newton-Raphson steps
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // real part of a * conj(b), scaled by 1/|b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // imaginary part of a * conj(b), scaled by 1/|b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        vst2q_f32((float*)(cPtr), cVal); // interleaved store
        cPtr += 4;
    }

    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
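
/*
 * The NEON kernel above never divides: vrecpeq_f32 produces a low-precision
 * reciprocal estimate of |b|^2, and each vrecpsq_f32 + vmulq_f32 pair is one
 * Newton-Raphson step (vrecpsq_f32(x, d) computes 2 - d*x), roughly doubling
 * the number of correct bits. A scalar sketch of that refinement; the
 * function name is illustrative only:
 */
static inline float recip_newton_raphson_sketch(float d, float estimate)
{
    float x = estimate;     // coarse seed, e.g. from vrecpeq_f32
    x = x * (2.0f - d * x); // first refinement step
    x = x * (2.0f - d * x); // second step: close to full single precision
    return x;               // ~= 1.0f / d
}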

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    uint64_t* out = (uint64_t*)cVector;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load each complex value as one 64-bit word
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
        // narrowing shifts split each word into real (low) and imag (high) halves
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // mul = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real and imaginary parts of a * conj(b), scaled by 1/|b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // re-interleave and store: real in the low word, imag in the high word
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64(out, v, vl);
    }
}
#endif /* LV_HAVE_RVV */
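
/*
 * Note on the RVV kernel above: each complex float is handled as one 64-bit
 * word. vnsrl with shift 0 extracts the low 32 bits (the real part) and with
 * shift 32 the high 32 bits (the imaginary part). Re-interleaving uses an
 * integer identity instead of a shuffle:
 *
 *   vwaddu_vv(re, im)            = re + im               (widened to 64 bit)
 *   vwmaccu(acc, 0xFFFFFFFF, im) = acc + (2^32 - 1) * im
 *
 * so the stored value is re + 2^32 * im: the real part lands in the low
 * word and the imaginary part in the high word of each 64-bit element.
 */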

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segmented loads deinterleave directly into real/imag planes
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        // mul = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real and imaginary parts of a * conj(b), scaled by 1/|b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // segmented store re-interleaves the planes back to ar,ai,br,bi ...
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
#endif /* LV_HAVE_RVVSEG */

#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
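
/*
 * A minimal usage sketch (not part of this header): the public dispatcher
 * volk_32fc_x2_divide_32fc selects the best variant for the running machine,
 * and volk_malloc-aligned buffers keep the _a (aligned) kernels eligible.
 * With num[i] = i + i*j and den[i] = 1 + 1*j, the expected result is
 * out[i] = i + 0*j.
 */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 16;
    const size_t alignment = volk_get_alignment();
    lv_32fc_t* num = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* den = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);

    for (unsigned int i = 0; i < N; i++) {
        num[i] = lv_cmake((float)i, (float)i); // i + i*j
        den[i] = lv_cmake(1.0f, 1.0f);         // 1 + 1*j
    }

    volk_32fc_x2_divide_32fc(out, num, den, N);

    for (unsigned int i = 0; i < N; i++) {
        printf("out[%u] = %+.1f %+.1fj\n", i, lv_creal(out[i]), lv_cimag(out[i]));
    }

    volk_free(num);
    volk_free(den);
    volk_free(out);
    return 0;
}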