Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #include <inttypes.h>
58 #include <math.h>
59 #include <stdio.h>
60 
61 /* This is the number of terms of Taylor series to evaluate, increase this for more
62  * accuracy*/
63 #define ACOS_TERMS 2
64 
65 #ifndef INCLUDED_volk_32f_acos_32f_a_H
66 #define INCLUDED_volk_32f_acos_32f_a_H
67 
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70 
71 static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72  const float* aVector,
73  unsigned int num_points)
74 {
75  float* bPtr = bVector;
76  const float* aPtr = aVector;
77 
78  unsigned int number = 0;
79  unsigned int eighthPoints = num_points / 8;
80  int i, j;
81 
82  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83  __m256 fzeroes, fones, ftwos, ffours, condition;
84 
85  pi = _mm256_set1_ps(3.14159265358979323846);
86  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87  fzeroes = _mm256_setzero_ps();
88  fones = _mm256_set1_ps(1.0);
89  ftwos = _mm256_set1_ps(2.0);
90  ffours = _mm256_set1_ps(4.0);
91 
92  for (; number < eighthPoints; number++) {
93  aVal = _mm256_load_ps(aPtr);
94  d = aVal;
95  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96  _mm256_sub_ps(fones, aVal))),
97  aVal);
98  z = aVal;
99  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102  x = _mm256_add_ps(
103  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104 
105  for (i = 0; i < 2; i++) {
106  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107  }
108  x = _mm256_div_ps(fones, x);
109  y = fzeroes;
110  for (j = ACOS_TERMS - 1; j >= 0; j--) {
111  y = _mm256_fmadd_ps(
112  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
113  }
114 
115  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
116  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
117 
118  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
119  arccosine = y;
120  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
121  arccosine = _mm256_sub_ps(
122  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
123  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
124  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
125 
126  _mm256_store_ps(bPtr, arccosine);
127  aPtr += 8;
128  bPtr += 8;
129  }
130 
131  number = eighthPoints * 8;
132  for (; number < num_points; number++) {
133  *bPtr++ = acos(*aPtr++);
134  }
135 }
136 
137 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
138 
139 
140 #ifdef LV_HAVE_AVX
141 #include <immintrin.h>
142 
143 static inline void
144 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
145 {
146  float* bPtr = bVector;
147  const float* aPtr = aVector;
148 
149  unsigned int number = 0;
150  unsigned int eighthPoints = num_points / 8;
151  int i, j;
152 
153  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
154  __m256 fzeroes, fones, ftwos, ffours, condition;
155 
156  pi = _mm256_set1_ps(3.14159265358979323846);
157  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
158  fzeroes = _mm256_setzero_ps();
159  fones = _mm256_set1_ps(1.0);
160  ftwos = _mm256_set1_ps(2.0);
161  ffours = _mm256_set1_ps(4.0);
162 
163  for (; number < eighthPoints; number++) {
164  aVal = _mm256_load_ps(aPtr);
165  d = aVal;
166  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
167  _mm256_sub_ps(fones, aVal))),
168  aVal);
169  z = aVal;
170  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
171  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
172  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
173  x = _mm256_add_ps(
174  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
175 
176  for (i = 0; i < 2; i++) {
177  x = _mm256_add_ps(x,
178  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
179  }
180  x = _mm256_div_ps(fones, x);
181  y = fzeroes;
182  for (j = ACOS_TERMS - 1; j >= 0; j--) {
183  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
184  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
185  }
186 
187  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
188  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
189 
190  y = _mm256_add_ps(
191  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
192  arccosine = y;
193  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
194  arccosine = _mm256_sub_ps(
195  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
196  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
197  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
198 
199  _mm256_store_ps(bPtr, arccosine);
200  aPtr += 8;
201  bPtr += 8;
202  }
203 
204  number = eighthPoints * 8;
205  for (; number < num_points; number++) {
206  *bPtr++ = acos(*aPtr++);
207  }
208 }
209 
#endif /* LV_HAVE_AVX for aligned */
211 
212 #ifdef LV_HAVE_SSE4_1
213 #include <smmintrin.h>
214 
215 static inline void
216 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
217 {
218  float* bPtr = bVector;
219  const float* aPtr = aVector;
220 
221  unsigned int number = 0;
222  unsigned int quarterPoints = num_points / 4;
223  int i, j;
224 
225  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
226  __m128 fzeroes, fones, ftwos, ffours, condition;
227 
228  pi = _mm_set1_ps(3.14159265358979323846);
229  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
230  fzeroes = _mm_setzero_ps();
231  fones = _mm_set1_ps(1.0);
232  ftwos = _mm_set1_ps(2.0);
233  ffours = _mm_set1_ps(4.0);
234 
235  for (; number < quarterPoints; number++) {
236  aVal = _mm_load_ps(aPtr);
237  d = aVal;
238  aVal = _mm_div_ps(
239  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
240  aVal);
241  z = aVal;
242  condition = _mm_cmplt_ps(z, fzeroes);
243  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
244  condition = _mm_cmplt_ps(z, fones);
245  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
246 
247  for (i = 0; i < 2; i++) {
248  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
249  }
250  x = _mm_div_ps(fones, x);
251  y = fzeroes;
252  for (j = ACOS_TERMS - 1; j >= 0; j--) {
253  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
254  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
255  }
256 
257  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
258  condition = _mm_cmpgt_ps(z, fones);
259 
260  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
261  arccosine = y;
262  condition = _mm_cmplt_ps(aVal, fzeroes);
263  arccosine =
264  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
265  condition = _mm_cmplt_ps(d, fzeroes);
266  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
267 
268  _mm_store_ps(bPtr, arccosine);
269  aPtr += 4;
270  bPtr += 4;
271  }
272 
273  number = quarterPoints * 4;
274  for (; number < num_points; number++) {
275  *bPtr++ = acosf(*aPtr++);
276  }
277 }
278 
279 #endif /* LV_HAVE_SSE4_1 for aligned */
280 
281 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
282 
283 
284 #ifndef INCLUDED_volk_32f_acos_32f_u_H
285 #define INCLUDED_volk_32f_acos_32f_u_H
286 
287 #if LV_HAVE_AVX2 && LV_HAVE_FMA
288 #include <immintrin.h>
289 
290 static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
291  const float* aVector,
292  unsigned int num_points)
293 {
294  float* bPtr = bVector;
295  const float* aPtr = aVector;
296 
297  unsigned int number = 0;
298  unsigned int eighthPoints = num_points / 8;
299  int i, j;
300 
301  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
302  __m256 fzeroes, fones, ftwos, ffours, condition;
303 
304  pi = _mm256_set1_ps(3.14159265358979323846);
305  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
306  fzeroes = _mm256_setzero_ps();
307  fones = _mm256_set1_ps(1.0);
308  ftwos = _mm256_set1_ps(2.0);
309  ffours = _mm256_set1_ps(4.0);
310 
311  for (; number < eighthPoints; number++) {
312  aVal = _mm256_loadu_ps(aPtr);
313  d = aVal;
314  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
315  _mm256_sub_ps(fones, aVal))),
316  aVal);
317  z = aVal;
318  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
319  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
320  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
321  x = _mm256_add_ps(
322  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
323 
324  for (i = 0; i < 2; i++) {
325  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
326  }
327  x = _mm256_div_ps(fones, x);
328  y = fzeroes;
329  for (j = ACOS_TERMS - 1; j >= 0; j--) {
330  y = _mm256_fmadd_ps(
331  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
332  }
333 
334  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
335  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
336 
337  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
338  arccosine = y;
339  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
340  arccosine = _mm256_sub_ps(
341  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
342  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
343  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
344 
345  _mm256_storeu_ps(bPtr, arccosine);
346  aPtr += 8;
347  bPtr += 8;
348  }
349 
350  number = eighthPoints * 8;
351  for (; number < num_points; number++) {
352  *bPtr++ = acos(*aPtr++);
353  }
354 }
355 
356 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
357 
358 
359 #ifdef LV_HAVE_AVX
360 #include <immintrin.h>
361 
362 static inline void
363 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
364 {
365  float* bPtr = bVector;
366  const float* aPtr = aVector;
367 
368  unsigned int number = 0;
369  unsigned int eighthPoints = num_points / 8;
370  int i, j;
371 
372  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
373  __m256 fzeroes, fones, ftwos, ffours, condition;
374 
375  pi = _mm256_set1_ps(3.14159265358979323846);
376  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
377  fzeroes = _mm256_setzero_ps();
378  fones = _mm256_set1_ps(1.0);
379  ftwos = _mm256_set1_ps(2.0);
380  ffours = _mm256_set1_ps(4.0);
381 
382  for (; number < eighthPoints; number++) {
383  aVal = _mm256_loadu_ps(aPtr);
384  d = aVal;
385  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
386  _mm256_sub_ps(fones, aVal))),
387  aVal);
388  z = aVal;
389  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
390  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
391  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
392  x = _mm256_add_ps(
393  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
394 
395  for (i = 0; i < 2; i++) {
396  x = _mm256_add_ps(x,
397  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
398  }
399  x = _mm256_div_ps(fones, x);
400  y = fzeroes;
401  for (j = ACOS_TERMS - 1; j >= 0; j--) {
402  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
403  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
404  }
405 
406  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
407  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
408 
409  y = _mm256_add_ps(
410  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
411  arccosine = y;
412  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
413  arccosine = _mm256_sub_ps(
414  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
415  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
416  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
417 
418  _mm256_storeu_ps(bPtr, arccosine);
419  aPtr += 8;
420  bPtr += 8;
421  }
422 
423  number = eighthPoints * 8;
424  for (; number < num_points; number++) {
425  *bPtr++ = acos(*aPtr++);
426  }
427 }
428 
#endif /* LV_HAVE_AVX for unaligned */
430 
431 #ifdef LV_HAVE_SSE4_1
432 #include <smmintrin.h>
433 
434 static inline void
435 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
436 {
437  float* bPtr = bVector;
438  const float* aPtr = aVector;
439 
440  unsigned int number = 0;
441  unsigned int quarterPoints = num_points / 4;
442  int i, j;
443 
444  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
445  __m128 fzeroes, fones, ftwos, ffours, condition;
446 
447  pi = _mm_set1_ps(3.14159265358979323846);
448  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
449  fzeroes = _mm_setzero_ps();
450  fones = _mm_set1_ps(1.0);
451  ftwos = _mm_set1_ps(2.0);
452  ffours = _mm_set1_ps(4.0);
453 
454  for (; number < quarterPoints; number++) {
455  aVal = _mm_loadu_ps(aPtr);
456  d = aVal;
457  aVal = _mm_div_ps(
458  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
459  aVal);
460  z = aVal;
461  condition = _mm_cmplt_ps(z, fzeroes);
462  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
463  condition = _mm_cmplt_ps(z, fones);
464  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
465 
466  for (i = 0; i < 2; i++) {
467  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
468  }
469  x = _mm_div_ps(fones, x);
470  y = fzeroes;
471 
472  for (j = ACOS_TERMS - 1; j >= 0; j--) {
473  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
474  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
475  }
476 
477  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
478  condition = _mm_cmpgt_ps(z, fones);
479 
480  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
481  arccosine = y;
482  condition = _mm_cmplt_ps(aVal, fzeroes);
483  arccosine =
484  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
485  condition = _mm_cmplt_ps(d, fzeroes);
486  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
487 
488  _mm_storeu_ps(bPtr, arccosine);
489  aPtr += 4;
490  bPtr += 4;
491  }
492 
493  number = quarterPoints * 4;
494  for (; number < num_points; number++) {
495  *bPtr++ = acosf(*aPtr++);
496  }
497 }
498 
#endif /* LV_HAVE_SSE4_1 for unaligned */
500 
501 #ifdef LV_HAVE_GENERIC
502 
503 static inline void
504 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
505 {
506  float* bPtr = bVector;
507  const float* aPtr = aVector;
508  unsigned int number = 0;
509 
510  for (number = 0; number < num_points; number++) {
511  *bPtr++ = acosf(*aPtr++);
512  }
513 }
514 #endif /* LV_HAVE_GENERIC */
515 
516 #ifdef LV_HAVE_RVV
517 #include <riscv_vector.h>
519 
/*!
 * \brief Computes the arccosine of each vector element (RISC-V Vector, LMUL=2).
 *
 * Same algorithm as the x86 kernels: acos(a) = atan(sqrt(1 - a^2) / a) with
 * quadrant corrections; the atan core is an unrolled, compile-time-selected
 * ACOS_TERMS-term Taylor series after two half-angle reductions.
 *
 * \param bVector    output buffer of num_points floats
 * \param aVector    input buffer of num_points floats, expected in [-1, 1]
 * \param num_points number of elements to process (stripmined via vsetvl)
 */
static inline void
volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);

    /* Define every series coefficient up to the configured ACOS_TERMS.
     * (The previous #elif chain defined only the highest-order coefficient,
     * so the ACOS_TERMS == 3 and == 4 builds referenced undeclared names.) */
#if ACOS_TERMS >= 2
    const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
#endif
#if ACOS_TERMS >= 3
    const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
#endif
#if ACOS_TERMS >= 4
    const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
#endif

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        /* a = sqrt(1 - v^2) / v; vfnmsac computes cf1 - v*v (vfmsac would
         * yield v*v - 1, whose sqrt is NaN for |v| < 1) */
        vfloat32m2_t a =
            __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfnmsac(cf1, v, v, vl), vl), v, vl);
        vfloat32m2_t z = __riscv_vfabs(a, vl);
        /* where z < 1, evaluate the series on 1/z instead (undone below) */
        vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
        /* two half-angle reductions; compensated by the factor of 4 below */
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfdiv(cf1, x, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);

        /* Horner evaluation of the alternating atan series, unrolled per
         * the compile-time ACOS_TERMS */
#if ACOS_TERMS < 1
        vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
#elif ACOS_TERMS == 1
        vfloat32m2_t y = cf1; /* single-term series is the constant 1 */
#elif ACOS_TERMS == 2
        vfloat32m2_t y = cfm1o3;
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 3
        vfloat32m2_t y = cf1o5;
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 4
        vfloat32m2_t y = cfm1o7;
        y = __riscv_vfmadd(y, xx, cf1o5, vl);
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#else
#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv"
#endif
        y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
        /* undo the reciprocal reduction: atan(z) = pi/2 - atan(1/z) */
        y = __riscv_vfadd_mu(
            __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);

        vfloat32m2_t acosine;
        acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
        /* negative inputs map into [pi/2, pi] */
        acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl);

        __riscv_vse32(bVector, acosine, vl);
    }
}
582 #endif /*LV_HAVE_RVV*/
583 
584 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:504
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:63
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:363
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:144
for i
Definition: volk_config_fixed.tmpl.h:13
#define RISCV_VMFLTZ(T, v, vl)
Definition: volk_rvv_intrinsics.h:75