Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_magnitude_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
43 #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
44 
45 #include <inttypes.h>
46 #include <math.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
54  const lv_16sc_t* complexVector,
55  const float scalar,
56  unsigned int num_points)
57 {
58  unsigned int number = 0;
59  const unsigned int eighthPoints = num_points / 8;
60 
61  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
62  float* magnitudeVectorPtr = magnitudeVector;
63 
64  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
65 
66  __m256 cplxValue1, cplxValue2, result;
67  __m256i int1, int2;
68  __m128i short1, short2;
69  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
70 
71  for (; number < eighthPoints; number++) {
72 
73  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
74  complexVectorPtr += 16;
75  short1 = _mm256_extracti128_si256(int1, 0);
76  short2 = _mm256_extracti128_si256(int1, 1);
77 
78  int1 = _mm256_cvtepi16_epi32(short1);
79  int2 = _mm256_cvtepi16_epi32(short2);
80  cplxValue1 = _mm256_cvtepi32_ps(int1);
81  cplxValue2 = _mm256_cvtepi32_ps(int2);
82 
83  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
84  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
85 
86  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
87  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
88 
89  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
90  result = _mm256_permutevar8x32_ps(result, idx);
91 
92  result = _mm256_sqrt_ps(result); // Square root the values
93 
94  _mm256_store_ps(magnitudeVectorPtr, result);
95 
96  magnitudeVectorPtr += 8;
97  }
98 
99  number = eighthPoints * 8;
100  magnitudeVectorPtr = &magnitudeVector[number];
101  complexVectorPtr = (const int16_t*)&complexVector[number];
102  for (; number < num_points; number++) {
103  float val1Real = (float)(*complexVectorPtr++) / scalar;
104  float val1Imag = (float)(*complexVectorPtr++) / scalar;
105  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
106  }
107 }
108 #endif /* LV_HAVE_AVX2 */
109 
110 
111 #ifdef LV_HAVE_SSE3
112 #include <pmmintrin.h>
113 
114 static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
115  const lv_16sc_t* complexVector,
116  const float scalar,
117  unsigned int num_points)
118 {
119  unsigned int number = 0;
120  const unsigned int quarterPoints = num_points / 4;
121 
122  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
123  float* magnitudeVectorPtr = magnitudeVector;
124 
125  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
126 
127  __m128 cplxValue1, cplxValue2, result;
128 
129  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
130 
131  for (; number < quarterPoints; number++) {
132 
133  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
134  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
135  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
136  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
137 
138  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
139  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
140  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
141  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
142 
143  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
144  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
145 
146  complexVectorPtr += 8;
147 
148  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
149  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
150 
151  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
152  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
153 
154  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
155 
156  result = _mm_sqrt_ps(result); // Square root the values
157 
158  _mm_store_ps(magnitudeVectorPtr, result);
159 
160  magnitudeVectorPtr += 4;
161  }
162 
163  number = quarterPoints * 4;
164  magnitudeVectorPtr = &magnitudeVector[number];
165  complexVectorPtr = (const int16_t*)&complexVector[number];
166  for (; number < num_points; number++) {
167  float val1Real = (float)(*complexVectorPtr++) / scalar;
168  float val1Imag = (float)(*complexVectorPtr++) / scalar;
169  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
170  }
171 }
172 #endif /* LV_HAVE_SSE3 */
173 
174 #ifdef LV_HAVE_SSE
175 #include <xmmintrin.h>
176 
177 static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
178  const lv_16sc_t* complexVector,
179  const float scalar,
180  unsigned int num_points)
181 {
182  unsigned int number = 0;
183  const unsigned int quarterPoints = num_points / 4;
184 
185  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
186  float* magnitudeVectorPtr = magnitudeVector;
187 
188  const float iScalar = 1.0 / scalar;
189  __m128 invScalar = _mm_set_ps1(iScalar);
190 
191  __m128 cplxValue1, cplxValue2, result, re, im;
192 
193  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
194 
195  for (; number < quarterPoints; number++) {
196  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
197  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
198  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
199  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
200 
201  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
202  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
203  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
204  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
205 
206  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
207  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
208 
209  re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
210  im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
211 
212  complexVectorPtr += 8;
213 
214  cplxValue1 = _mm_mul_ps(re, invScalar);
215  cplxValue2 = _mm_mul_ps(im, invScalar);
216 
217  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
218  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
219 
220  result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
221 
222  result = _mm_sqrt_ps(result); // Square root the values
223 
224  _mm_store_ps(magnitudeVectorPtr, result);
225 
226  magnitudeVectorPtr += 4;
227  }
228 
229  number = quarterPoints * 4;
230  magnitudeVectorPtr = &magnitudeVector[number];
231  complexVectorPtr = (const int16_t*)&complexVector[number];
232  for (; number < num_points; number++) {
233  float val1Real = (float)(*complexVectorPtr++) * iScalar;
234  float val1Imag = (float)(*complexVectorPtr++) * iScalar;
235  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
236  }
237 }
238 
239 
240 #endif /* LV_HAVE_SSE */
241 
242 #ifdef LV_HAVE_GENERIC
243 
244 static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
245  const lv_16sc_t* complexVector,
246  const float scalar,
247  unsigned int num_points)
248 {
249  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
250  float* magnitudeVectorPtr = magnitudeVector;
251  unsigned int number = 0;
252  const float invScalar = 1.0 / scalar;
253  for (number = 0; number < num_points; number++) {
254  float real = ((float)(*complexVectorPtr++)) * invScalar;
255  float imag = ((float)(*complexVectorPtr++)) * invScalar;
256  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
257  }
258 }
259 #endif /* LV_HAVE_GENERIC */
260 
261 
262 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
263 
264 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
265 #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
266 
267 #include <inttypes.h>
268 #include <math.h>
269 #include <stdio.h>
270 #include <volk/volk_common.h>
271 
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274 
275 static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
276  const lv_16sc_t* complexVector,
277  const float scalar,
278  unsigned int num_points)
279 {
280  unsigned int number = 0;
281  const unsigned int eighthPoints = num_points / 8;
282 
283  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
284  float* magnitudeVectorPtr = magnitudeVector;
285 
286  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
287 
288  __m256 cplxValue1, cplxValue2, result;
289  __m256i int1, int2;
290  __m128i short1, short2;
291  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
292 
293  for (; number < eighthPoints; number++) {
294 
295  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
296  complexVectorPtr += 16;
297  short1 = _mm256_extracti128_si256(int1, 0);
298  short2 = _mm256_extracti128_si256(int1, 1);
299 
300  int1 = _mm256_cvtepi16_epi32(short1);
301  int2 = _mm256_cvtepi16_epi32(short2);
302  cplxValue1 = _mm256_cvtepi32_ps(int1);
303  cplxValue2 = _mm256_cvtepi32_ps(int2);
304 
305  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
306  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
307 
308  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
309  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
310 
311  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
312  result = _mm256_permutevar8x32_ps(result, idx);
313 
314  result = _mm256_sqrt_ps(result); // Square root the values
315 
316  _mm256_storeu_ps(magnitudeVectorPtr, result);
317 
318  magnitudeVectorPtr += 8;
319  }
320 
321  number = eighthPoints * 8;
322  magnitudeVectorPtr = &magnitudeVector[number];
323  complexVectorPtr = (const int16_t*)&complexVector[number];
324  for (; number < num_points; number++) {
325  float val1Real = (float)(*complexVectorPtr++) / scalar;
326  float val1Imag = (float)(*complexVectorPtr++) / scalar;
327  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
328  }
329 }
330 #endif /* LV_HAVE_AVX2 */
331 
332 #ifdef LV_HAVE_RVV
333 #include <riscv_vector.h>
334 
335 static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector,
336  const lv_16sc_t* complexVector,
337  const float scalar,
338  unsigned int num_points)
339 {
340  size_t n = num_points;
341  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
342  vl = __riscv_vsetvl_e16m4(n);
343  vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
344  vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
345  vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
346  vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
347  vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
348  vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
349  __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
350  }
351 }
352 #endif /*LV_HAVE_RVV*/
353 
354 #ifdef LV_HAVE_RVVSEG
355 #include <riscv_vector.h>
356 
357 static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector,
358  const lv_16sc_t* complexVector,
359  const float scalar,
360  unsigned int num_points)
361 {
362  size_t n = num_points;
363  for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
364  vl = __riscv_vsetvl_e16m4(n);
365  vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
366  vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
367  vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
368  vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
369  vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
370  vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
371  __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
372  }
373 }
374 #endif /*LV_HAVE_RVVSEG*/
375 
376 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
static void volk_16ic_s32f_magnitude_32f_generic(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:244
static void volk_16ic_s32f_magnitude_32f_a_sse(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:177
static void volk_16ic_s32f_magnitude_32f_a_sse3(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:114
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
short complex lv_16sc_t
Definition: volk_complex.h:71