Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_avx2_fma_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * This file is intended to hold helper functions built from AVX2 FMA intrinsics.
12  * They should be used in VOLK kernels to avoid copy-paste.
13  */
14 
15 #ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17 #include <immintrin.h>
18 
/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The approximation is the odd polynomial
 *
 *     x * (c1 + c3*x^2 + c5*x^4 + c7*x^6 + c9*x^8 + c11*x^10 + c13*x^12)
 *
 * evaluated with Horner's method in x^2, using one fused multiply-add
 * per coefficient. All eight float lanes of the input are processed
 * independently.
 *
 * Stated maximum relative error: ~6.5e-7
 *
 * x:       eight packed single-precision inputs
 * returns: eight packed single-precision approximations of arctan(x)
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Only odd powers of x appear, so the Horner recurrence runs in x^2. */
    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Seed with the highest-order coefficient (x^13 term) ... */
    __m256 acc = _mm256_set1_ps(+0x1.01a37cp-7f);

    /* ... then fold in the remaining coefficients, highest order first. */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(-0x1.2f3004p-5f)); /* x^11 */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(+0x1.5785aap-4f)); /* x^9  */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(-0x1.1436ap-3f));  /* x^7  */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(+0x1.972be6p-3f)); /* x^5  */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(-0x1.55437p-2f));  /* x^3  */
    acc = _mm256_fmadd_ps(x2, acc, _mm256_set1_ps(+0x1.ffffeap-1f)); /* x^1  */

    /* The final multiply by x restores the odd symmetry of arctan. */
    return _mm256_mul_ps(x, acc);
}
49 
50 #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
static __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
Definition: volk_avx2_fma_intrinsics.h:26