Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
71 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
72 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <volk/volk_complex.h>
77 
78 #ifndef MAX
79 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
80 #endif
81 
82 #ifdef LV_HAVE_SSE3
83 #include <pmmintrin.h>
84 #include <xmmintrin.h>
85 
86 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
87  float* src0,
88  float* center_point_array,
89  float* cutoff,
90  unsigned int num_points)
91 {
92  float result = 0.0f;
93  float fst = 0.0f;
94  float sq = 0.0f;
95  float thrd = 0.0f;
96  float frth = 0.0f;
97 
98  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
99 
100  xmm9 = _mm_setzero_ps();
101  xmm1 = _mm_setzero_ps();
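    // xmm9 and xmm1 serve as two independent accumulators: the unrolled loop
    // below adds the first group of four samples into xmm9 and the second
    // group into xmm1, shortening the add dependency chain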
102  xmm0 = _mm_load1_ps(&center_point_array[0]);
103  xmm6 = _mm_load1_ps(&center_point_array[1]);
104  xmm7 = _mm_load1_ps(&center_point_array[2]);
105  xmm8 = _mm_load1_ps(&center_point_array[3]);
106  xmm10 = _mm_load1_ps(cutoff);
107 
108  int bound = num_points / 8;
109  int leftovers = num_points - 8 * bound;
110  int i = 0;
111  for (; i < bound; ++i) {
112  // 1st
113  xmm2 = _mm_load_ps(src0);
114  xmm2 = _mm_max_ps(xmm10, xmm2);
115  xmm3 = _mm_mul_ps(xmm2, xmm2);
116  xmm4 = _mm_mul_ps(xmm2, xmm3);
117  xmm5 = _mm_mul_ps(xmm3, xmm3);
118 
119  xmm2 = _mm_mul_ps(xmm2, xmm0);
120  xmm3 = _mm_mul_ps(xmm3, xmm6);
121  xmm4 = _mm_mul_ps(xmm4, xmm7);
122  xmm5 = _mm_mul_ps(xmm5, xmm8);
123 
124  xmm2 = _mm_add_ps(xmm2, xmm3);
125  xmm3 = _mm_add_ps(xmm4, xmm5);
126 
127  src0 += 4;
128 
129  xmm9 = _mm_add_ps(xmm2, xmm9);
130  xmm9 = _mm_add_ps(xmm3, xmm9);
131 
132  // 2nd
133  xmm2 = _mm_load_ps(src0);
134  xmm2 = _mm_max_ps(xmm10, xmm2);
135  xmm3 = _mm_mul_ps(xmm2, xmm2);
136  xmm4 = _mm_mul_ps(xmm2, xmm3);
137  xmm5 = _mm_mul_ps(xmm3, xmm3);
138 
139  xmm2 = _mm_mul_ps(xmm2, xmm0);
140  xmm3 = _mm_mul_ps(xmm3, xmm6);
141  xmm4 = _mm_mul_ps(xmm4, xmm7);
142  xmm5 = _mm_mul_ps(xmm5, xmm8);
143 
144  xmm2 = _mm_add_ps(xmm2, xmm3);
145  xmm3 = _mm_add_ps(xmm4, xmm5);
146 
147  src0 += 4;
148 
149  xmm1 = _mm_add_ps(xmm2, xmm1);
150  xmm1 = _mm_add_ps(xmm3, xmm1);
151  }
152  xmm2 = _mm_hadd_ps(xmm9, xmm1);
153  xmm3 = _mm_hadd_ps(xmm2, xmm2);
154  xmm4 = _mm_hadd_ps(xmm3, xmm3);
155  _mm_store_ss(&result, xmm4);
156 
157  for (i = 0; i < leftovers; ++i) {
158  fst = *src0++;
159  fst = MAX(fst, *cutoff);
160  sq = fst * fst;
161  thrd = fst * sq;
162  frth = sq * sq;
163  result += (center_point_array[0] * fst + center_point_array[1] * sq +
164  center_point_array[2] * thrd + center_point_array[3] * frth);
165  }
166 
167  result += (float)(num_points)*center_point_array[4];
168  *target = result;
169 }
170 
171 
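/*
 * The three _mm_hadd_ps calls above collapse the two partial-sum registers
 * into a single scalar.  As an illustration only (a hypothetical helper, not
 * part of VOLK), the same reduction pattern in isolation could be written as:
 */
#if 0
static inline float horizontal_sum_two_m128(__m128 a, __m128 b)
{
    __m128 t = _mm_hadd_ps(a, b); /* a0+a1 | a2+a3 | b0+b1 | b2+b3 */
    t = _mm_hadd_ps(t, t);        /* sum(a) | sum(b) | sum(a) | sum(b) */
    t = _mm_hadd_ps(t, t);        /* every lane now holds sum(a) + sum(b) */
    float r;
    _mm_store_ss(&r, t);
    return r;
}
#endif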
172 #endif /*LV_HAVE_SSE3*/
173 
174 #if LV_HAVE_AVX && LV_HAVE_FMA
175 #include <immintrin.h>
176 
177 static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
178  float* src0,
179  float* center_point_array,
180  float* cutoff,
181  unsigned int num_points)
182 {
183  const unsigned int eighth_points = num_points / 8;
184  float fst = 0.0;
185  float sq = 0.0;
186  float thrd = 0.0;
187  float frth = 0.0;
188 
189  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
190  __m256 target_vec;
191  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
192 
193  cpa0 = _mm256_set1_ps(center_point_array[0]);
194  cpa1 = _mm256_set1_ps(center_point_array[1]);
195  cpa2 = _mm256_set1_ps(center_point_array[2]);
196  cpa3 = _mm256_set1_ps(center_point_array[3]);
197  cutoff_vec = _mm256_set1_ps(*cutoff);
198  target_vec = _mm256_setzero_ps();
199 
200  unsigned int i;
201 
202  for (i = 0; i < eighth_points; ++i) {
203  x_to_1 = _mm256_load_ps(src0);
204  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
205  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
206  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
207  // x^1 * x^3 is slightly faster than x^2 * x^2
208  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
209 
210  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
211  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
212 
213  x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
214  x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
215  // this is slightly faster than result += (x_to_1 + x_to_3)
216  target_vec = _mm256_add_ps(x_to_1, target_vec);
217  target_vec = _mm256_add_ps(x_to_3, target_vec);
218 
219  src0 += 8;
220  }
221 
222  // the hadd for vector reduction has very very slight impact @ 50k iters
223  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
224  target_vec = _mm256_hadd_ps(
225  target_vec,
226  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
227  _mm256_store_ps(temp_results, target_vec);
228  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
229 
230  for (i = eighth_points * 8; i < num_points; ++i) {
231  fst = *src0++;
232  fst = MAX(fst, *cutoff);
233  sq = fst * fst;
234  thrd = fst * sq;
235  frth = sq * sq;
236  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
237  center_point_array[2] * thrd + center_point_array[3] * frth);
238  }
239  *target += (float)(num_points)*center_point_array[4];
240 }
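/*
 * The FMA variant pairs the terms as (c0*x + c1*x^2) and (c2*x^3 + c3*x^4),
 * so each pair costs one multiply plus one fused multiply-add instead of two
 * multiplies and an add.  A scalar sketch of the same grouping (illustration
 * only; assumes fmaf() from <math.h>, which this header does not include):
 */
#if 0
static inline float poly4_two_fma(const float* c, float x)
{
    float x2 = x * x;
    float x3 = x * x2;
    float x4 = x * x3;
    return fmaf(x, c[0], c[1] * x2) + fmaf(x3, c[2], c[3] * x4);
}
#endif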
241 #endif // LV_HAVE_AVX && LV_HAVE_FMA
242 
243 #ifdef LV_HAVE_AVX
244 #include <immintrin.h>
245 
246 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
247  float* src0,
248  float* center_point_array,
249  float* cutoff,
250  unsigned int num_points)
251 {
252  const unsigned int eighth_points = num_points / 8;
253  float fst = 0.0;
254  float sq = 0.0;
255  float thrd = 0.0;
256  float frth = 0.0;
257 
258  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
259  __m256 target_vec;
260  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
261 
262  cpa0 = _mm256_set1_ps(center_point_array[0]);
263  cpa1 = _mm256_set1_ps(center_point_array[1]);
264  cpa2 = _mm256_set1_ps(center_point_array[2]);
265  cpa3 = _mm256_set1_ps(center_point_array[3]);
266  cutoff_vec = _mm256_set1_ps(*cutoff);
267  target_vec = _mm256_setzero_ps();
268 
269  unsigned int i;
270 
271  for (i = 0; i < eighth_points; ++i) {
272  x_to_1 = _mm256_load_ps(src0);
273  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
274  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
275  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
276  // x^1 * x^3 is slightly faster than x^2 * x^2
277  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
278 
279  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
280  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
281  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
282  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
283 
284  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
285  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
286  // this is slightly faster than result += (x_to_1 + x_to_3)
287  target_vec = _mm256_add_ps(x_to_1, target_vec);
288  target_vec = _mm256_add_ps(x_to_3, target_vec);
289 
290  src0 += 8;
291  }
292 
293  // the hadd for vector reduction has very very slight impact @ 50k iters
294  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
295  target_vec = _mm256_hadd_ps(
296  target_vec,
297  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
298  _mm256_store_ps(temp_results, target_vec);
299  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
300 
301  for (i = eighth_points * 8; i < num_points; ++i) {
302  fst = *src0++;
303  fst = MAX(fst, *cutoff);
304  sq = fst * fst;
305  thrd = fst * sq;
306  frth = sq * sq;
307  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
308  center_point_array[2] * thrd + center_point_array[3] * frth);
309  }
310  *target += (float)(num_points)*center_point_array[4];
311 }
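/*
 * _mm256_hadd_ps adds adjacent pairs within each 128-bit lane, which is why
 * only elements 0, 1, 4 and 5 of temp_results are summed above.  An
 * equivalent in-register reduction (illustration only, not used by this
 * kernel) could split the 256-bit accumulator and finish with SSE hadds:
 */
#if 0
static inline float horizontal_sum_m256(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);   /* elements 0..3 */
    __m128 hi = _mm256_extractf128_ps(v, 1); /* elements 4..7 */
    __m128 s = _mm_add_ps(lo, hi);
    s = _mm_hadd_ps(s, s);
    s = _mm_hadd_ps(s, s);
    float r;
    _mm_store_ss(&r, s);
    return r;
}
#endif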
312 #endif // LV_HAVE_AVX
313 
314 
315 #ifdef LV_HAVE_GENERIC
316 
317 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
318  float* src0,
319  float* center_point_array,
320  float* cutoff,
321  unsigned int num_points)
322 {
323  const unsigned int eighth_points = num_points / 8;
324 
325  float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
326  float fst = 0.0f;
327  float sq = 0.0f;
328  float thrd = 0.0f;
329  float frth = 0.0f;
330 
331  unsigned int i = 0;
332  unsigned int k = 0;
333  for (i = 0; i < eighth_points; ++i) {
334  for (k = 0; k < 8; ++k) {
335  fst = *src0++;
336  fst = MAX(fst, *cutoff);
337  sq = fst * fst;
338  thrd = fst * sq;
339  frth = fst * thrd;
340  result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
341  result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
342  }
343  }
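    // result[] keeps eight independent partial sums so the unrolled loop above
    // has no cross-iteration dependency; fold adjacent pairs first, then sum
    // the four even slots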
344  for (k = 0; k < 8; k += 2) {
345  result[k] = result[k] + result[k + 1];
346  }
347 
348  *target = result[0] + result[2] + result[4] + result[6];
349 
350  for (i = eighth_points * 8; i < num_points; ++i) {
351  fst = *src0++;
352  fst = MAX(fst, *cutoff);
353  sq = fst * fst;
354  thrd = fst * sq;
355  frth = fst * thrd;
356  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
357  center_point_array[2] * thrd + center_point_array[3] * frth);
358  }
359  *target += (float)(num_points)*center_point_array[4];
360 }
361 
362 #endif /*LV_HAVE_GENERIC*/
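/*
 * Usage sketch (not part of this header).  Every variant computes
 *     *target = sum_i p(max(src0[i], *cutoff)) + num_points * c[4]
 * with p(x) = c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4 and
 * c = center_point_array holding five coefficients.  A caller would normally
 * go through the VOLK dispatcher; the buffer size and coefficient values
 * below are illustrative only:
 *
 *     unsigned int N = 1024;
 *     float* in = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 *     float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 0.0f };
 *     float cutoff = 0.0f;
 *     float result = 0.0f;
 *     // ... fill in[] ...
 *     volk_32f_x3_sum_of_poly_32f(&result, in, coeffs, &cutoff, N);
 *     volk_free(in);
 */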
363 
364 #ifdef LV_HAVE_NEON
365 #include <arm_neon.h>
366 
367 static inline void
368 volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
369  float* __restrict src0,
370  float* __restrict center_point_array,
371  float* __restrict cutoff,
372  unsigned int num_points)
373 {
374  unsigned int i;
375  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
376 
377  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
378  float32x2_t cutoff_vector;
379  float32x2x2_t x_low, x_high;
380  float32x4_t x_qvector, c_qvector, cpa_qvector;
381  float accumulator;
382  float res_accumulators[4];
383 
384  c_qvector = vld1q_f32(zero);
385  // load the cutoff into a vector
386  cutoff_vector = vdup_n_f32(*cutoff);
387  // ... center point array
388  cpa_qvector = vld1q_f32(center_point_array);
389 
390  for (i = 0; i < num_points; ++i) {
391  // load x (src0)
392  x_to_1 = vdup_n_f32(*src0++);
393 
394  // Get a vector of max(src0, cutoff)
395  x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
396  x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
397  x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
398  x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
399  // zip up doubles to interleave
400  x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
401  x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
402  // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
403  x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
404  // now we finally have [x^4 | x^3 | x^2 | x] !
405 
406  c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
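    // each lane of c_qvector accumulates one term: c0*x, c1*x^2, c2*x^3 and
    // c3*x^4; the four lanes are summed after the loop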
407  }
408  // there should be better vector reduction techniques
409  vst1q_f32(res_accumulators, c_qvector);
410  accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
411  res_accumulators[3];
412 
413  *target = accumulator + (float)num_points * center_point_array[4];
414 }
415 
416 #endif /* LV_HAVE_NEON */
417 
418 
419 #ifdef LV_HAVE_NEON
420 
421 static inline void
422 volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
423  float* __restrict src0,
424  float* __restrict center_point_array,
425  float* __restrict cutoff,
426  unsigned int num_points)
427 {
428  unsigned int i;
429  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
430 
431  float accumulator;
432 
433  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
434  accumulator1_vec = vld1q_f32(zero);
435  accumulator2_vec = vld1q_f32(zero);
436  accumulator3_vec = vld1q_f32(zero);
437  accumulator4_vec = vld1q_f32(zero);
438  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
439  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
440 
441  // load the cutoff into a vector
442  cutoff_vector = vdupq_n_f32(*cutoff);
443  // ... center point array
444  cpa_0 = vdupq_n_f32(center_point_array[0]);
445  cpa_1 = vdupq_n_f32(center_point_array[1]);
446  cpa_2 = vdupq_n_f32(center_point_array[2]);
447  cpa_3 = vdupq_n_f32(center_point_array[3]);
448 
449  // nathan is not sure why this is slower *and* wrong compared to neonvertfma
450  for (i = 0; i < num_points / 4; ++i) {
451  // load x
452  x_to_1 = vld1q_f32(src0);
453 
454  // Get a vector of max(src0, cutoff)
455  x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
456  x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
457  x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
458  x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
459  x_to_1 = vmulq_f32(x_to_1, cpa_0);
460  x_to_2 = vmulq_f32(x_to_2, cpa_1);
461  x_to_3 = vmulq_f32(x_to_3, cpa_2);
462  x_to_4 = vmulq_f32(x_to_4, cpa_3);
463  accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
464  accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
465  accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
466  accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
467 
468  src0 += 4;
469  }
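    // fold the four per-power accumulators into one vector before the final
    // horizontal reduction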
470  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
471  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
472  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
473 
474  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
475  vst1q_f32(res_accumulators, accumulator1_vec);
476  accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
477  res_accumulators[3];
478 
479  float fst = 0.0;
480  float sq = 0.0;
481  float thrd = 0.0;
482  float frth = 0.0;
483 
484  for (i = 4 * (num_points / 4); i < num_points; ++i) {
485  fst = *src0++;
486  fst = MAX(fst, *cutoff);
487 
488  sq = fst * fst;
489  thrd = fst * sq;
490  frth = sq * sq;
491  // fifth = sq * thrd;
492 
493  accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
494  center_point_array[2] * thrd + center_point_array[3] * frth);
495  }
496 
497  *target = accumulator + (float)num_points * center_point_array[4];
498 }
499 
500 #endif /* LV_HAVE_NEON */
501 
502 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
503 
504 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
505 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
506 
507 #include <inttypes.h>
508 #include <stdio.h>
509 #include <volk/volk_complex.h>
510 
511 #ifndef MAX
512 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
513 #endif
514 
515 #if LV_HAVE_AVX && LV_HAVE_FMA
516 #include <immintrin.h>
517 
518 static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
519  float* src0,
520  float* center_point_array,
521  float* cutoff,
522  unsigned int num_points)
523 {
524  const unsigned int eighth_points = num_points / 8;
525  float fst = 0.0;
526  float sq = 0.0;
527  float thrd = 0.0;
528  float frth = 0.0;
529 
530  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
531  __m256 target_vec;
532  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
533 
534  cpa0 = _mm256_set1_ps(center_point_array[0]);
535  cpa1 = _mm256_set1_ps(center_point_array[1]);
536  cpa2 = _mm256_set1_ps(center_point_array[2]);
537  cpa3 = _mm256_set1_ps(center_point_array[3]);
538  cutoff_vec = _mm256_set1_ps(*cutoff);
539  target_vec = _mm256_setzero_ps();
540 
541  unsigned int i;
542 
543  for (i = 0; i < eighth_points; ++i) {
544  x_to_1 = _mm256_loadu_ps(src0);
545  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
546  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
547  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
548  // x^1 * x^3 is slightly faster than x^2 * x^2
549  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
550 
551  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
552  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
553 
554  x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
555  x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
556  // this is slightly faster than result += (x_to_1 + x_to_3)
557  target_vec = _mm256_add_ps(x_to_1, target_vec);
558  target_vec = _mm256_add_ps(x_to_3, target_vec);
559 
560  src0 += 8;
561  }
562 
563  // the hadd for vector reduction has very very slight impact @ 50k iters
564  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
565  target_vec = _mm256_hadd_ps(
566  target_vec,
567  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
568  _mm256_storeu_ps(temp_results, target_vec);
569  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
570 
571  for (i = eighth_points * 8; i < num_points; ++i) {
572  fst = *src0++;
573  fst = MAX(fst, *cutoff);
574  sq = fst * fst;
575  thrd = fst * sq;
576  frth = sq * sq;
577  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
578  center_point_array[2] * thrd + center_point_array[3] * frth);
579  }
580 
581  *target += (float)(num_points)*center_point_array[4];
582 }
583 #endif // LV_HAVE_AVX && LV_HAVE_FMA
584 
585 #ifdef LV_HAVE_AVX
586 #include <immintrin.h>
587 
588 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
589  float* src0,
590  float* center_point_array,
591  float* cutoff,
592  unsigned int num_points)
593 {
594  const unsigned int eighth_points = num_points / 8;
595  float fst = 0.0;
596  float sq = 0.0;
597  float thrd = 0.0;
598  float frth = 0.0;
599 
600  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
601  __m256 target_vec;
602  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
603 
604  cpa0 = _mm256_set1_ps(center_point_array[0]);
605  cpa1 = _mm256_set1_ps(center_point_array[1]);
606  cpa2 = _mm256_set1_ps(center_point_array[2]);
607  cpa3 = _mm256_set1_ps(center_point_array[3]);
608  cutoff_vec = _mm256_set1_ps(*cutoff);
609  target_vec = _mm256_setzero_ps();
610 
611  unsigned int i;
612 
613  for (i = 0; i < eighth_points; ++i) {
614  x_to_1 = _mm256_loadu_ps(src0);
615  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
616  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
617  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
618  // x^1 * x^3 is slightly faster than x^2 * x^2
619  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
620 
621  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
622  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
623  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
624  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
625 
626  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
627  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
628  // this is slightly faster than result += (x_to_1 + x_to_3)
629  target_vec = _mm256_add_ps(x_to_1, target_vec);
630  target_vec = _mm256_add_ps(x_to_3, target_vec);
631 
632  src0 += 8;
633  }
634 
635  // the hadd for vector reduction has very very slight impact @ 50k iters
636  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
637  target_vec = _mm256_hadd_ps(
638  target_vec,
639  target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
640  _mm256_storeu_ps(temp_results, target_vec);
641  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
642 
643  for (i = eighth_points * 8; i < num_points; ++i) {
644  fst = *src0++;
645  fst = MAX(fst, *cutoff);
646  sq = fst * fst;
647  thrd = fst * sq;
648  frth = sq * sq;
649 
650  *target += (center_point_array[0] * fst + center_point_array[1] * sq +
651  center_point_array[2] * thrd + center_point_array[3] * frth);
652  }
653 
654  *target += (float)(num_points)*center_point_array[4];
655 }
656 #endif // LV_HAVE_AVX
657 
658 #ifdef LV_HAVE_RVV
659 #include <riscv_vector.h>
660 #include <volk/volk_rvv_intrinsics.h>
661 
662 static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target,
663  float* src0,
664  float* center_point_array,
665  float* cutoff,
666  unsigned int num_points)
667 {
668  size_t vlmax = __riscv_vsetvlmax_e32m4();
669  vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax);
670  float mul1 = center_point_array[0]; // scalar to avoid register spills
671  float mul2 = center_point_array[1];
672  vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax);
673  vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax);
674  vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax);
675 
676  size_t n = num_points;
677  for (size_t vl; n > 0; n -= vl, src0 += vl) {
678  vl = __riscv_vsetvl_e32m4(n);
679  vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
680  vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl);
681  vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl);
682  vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl);
683  vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl);
684  v2 = __riscv_vfmul(v2, mul2, vl);
685  v4 = __riscv_vfmul(v4, vmul4, vl);
686  v1 = __riscv_vfmadd(v1, mul1, v2, vl);
687  v3 = __riscv_vfmadd(v3, vmul3, v4, vl);
688  v1 = __riscv_vfadd(v1, v3, vl);
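    // tail-undisturbed add: lanes past vl keep their accumulated values when
    // the last stripe is shorter than vlmax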
689  vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl);
690  }
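    // RISCV_SHRINK4 folds the four m1 parts of the LMUL=4 accumulator into a
    // single vector register, which vfredusum then reduces to a scalar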
691  size_t vl = __riscv_vsetvlmax_e32m1();
692  vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum);
693  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
694  float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
695  *target = sum + num_points * center_point_array[4];
696 }
697 #endif /*LV_HAVE_RVV*/
698 
699 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/