Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
61 #define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
69  float* qBuffer,
70  const lv_32fc_t* complexVector,
71  unsigned int num_points)
72 {
73  const float* complexVectorPtr = (float*)complexVector;
74  float* iBufferPtr = iBuffer;
75  float* qBufferPtr = qBuffer;
76 
77  unsigned int number = 0;
78  // Mask for real and imaginary parts
79  const unsigned int eighthPoints = num_points / 8;
80  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
81  for (; number < eighthPoints; number++) {
82  cplxValue1 = _mm256_load_ps(complexVectorPtr);
83  complexVectorPtr += 8;
84 
85  cplxValue2 = _mm256_load_ps(complexVectorPtr);
86  complexVectorPtr += 8;
87 
88  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
89  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
90 
91  // Arrange in i1i2i3i4 format
92  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
93  // Arrange in q1q2q3q4 format
94  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
95 
96  _mm256_store_ps(iBufferPtr, iValue);
97  _mm256_store_ps(qBufferPtr, qValue);
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  for (; number < num_points; number++) {
105  *iBufferPtr++ = *complexVectorPtr++;
106  *qBufferPtr++ = *complexVectorPtr++;
107  }
108 }
109 #endif /* LV_HAVE_AVX */
110 
111 #ifdef LV_HAVE_SSE
112 #include <xmmintrin.h>
113 
114 static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
115  float* qBuffer,
116  const lv_32fc_t* complexVector,
117  unsigned int num_points)
118 {
119  const float* complexVectorPtr = (float*)complexVector;
120  float* iBufferPtr = iBuffer;
121  float* qBufferPtr = qBuffer;
122 
123  unsigned int number = 0;
124  const unsigned int quarterPoints = num_points / 4;
125  __m128 cplxValue1, cplxValue2, iValue, qValue;
126  for (; number < quarterPoints; number++) {
127  cplxValue1 = _mm_load_ps(complexVectorPtr);
128  complexVectorPtr += 4;
129 
130  cplxValue2 = _mm_load_ps(complexVectorPtr);
131  complexVectorPtr += 4;
132 
133  // Arrange in i1i2i3i4 format
134  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
135  // Arrange in q1q2q3q4 format
136  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
137 
138  _mm_store_ps(iBufferPtr, iValue);
139  _mm_store_ps(qBufferPtr, qValue);
140 
141  iBufferPtr += 4;
142  qBufferPtr += 4;
143  }
144 
145  number = quarterPoints * 4;
146  for (; number < num_points; number++) {
147  *iBufferPtr++ = *complexVectorPtr++;
148  *qBufferPtr++ = *complexVectorPtr++;
149  }
150 }
151 #endif /* LV_HAVE_SSE */
152 
153 
154 #ifdef LV_HAVE_NEON
155 #include <arm_neon.h>
156 
/*
 * Split an interleaved complex float vector into two float buffers using NEON.
 *
 * iBuffer       - output; receives the first float (real part) of each sample
 * qBuffer       - output; receives the second float (imaginary part) of each
 *                 sample
 * complexVector - input; read as num_points consecutive pairs of floats
 * num_points    - number of complex samples to process
 */
static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
                                                      float* qBuffer,
                                                      const lv_32fc_t* complexVector,
                                                      unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    /* View the input as raw floats without discarding its const qualifier. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* iBufferPtr = iBuffer;
    float* qBufferPtr = qBuffer;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        /*
         * vld2q_f32 reads 8 floats and de-interleaves them: even-indexed
         * floats land in val[0], odd-indexed floats in val[1].
         */
        const float32x4x2_t complexInput = vld2q_f32(complexVectorPtr);
        vst1q_f32(iBufferPtr, complexInput.val[0]);
        vst1q_f32(qBufferPtr, complexInput.val[1]);
        complexVectorPtr += 8;
        iBufferPtr += 4;
        qBufferPtr += 4;
    }

    /* Scalar tail for the num_points % 4 samples the vector loop did not cover. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *iBufferPtr++ = *complexVectorPtr++;
        *qBufferPtr++ = *complexVectorPtr++;
    }
}
183 #endif /* LV_HAVE_NEON */
184 
185 
186 #ifdef LV_HAVE_GENERIC
187 
188 static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
189  float* qBuffer,
190  const lv_32fc_t* complexVector,
191  unsigned int num_points)
192 {
193  const float* complexVectorPtr = (float*)complexVector;
194  float* iBufferPtr = iBuffer;
195  float* qBufferPtr = qBuffer;
196  unsigned int number;
197  for (number = 0; number < num_points; number++) {
198  *iBufferPtr++ = *complexVectorPtr++;
199  *qBufferPtr++ = *complexVectorPtr++;
200  }
201 }
202 #endif /* LV_HAVE_GENERIC */
203 
204 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
205 
206 
207 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
208 #define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
209 
210 #include <inttypes.h>
211 #include <stdio.h>
212 
213 #ifdef LV_HAVE_AVX
214 #include <immintrin.h>
215 static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
216  float* qBuffer,
217  const lv_32fc_t* complexVector,
218  unsigned int num_points)
219 {
220  const float* complexVectorPtr = (float*)complexVector;
221  float* iBufferPtr = iBuffer;
222  float* qBufferPtr = qBuffer;
223 
224  unsigned int number = 0;
225  // Mask for real and imaginary parts
226  const unsigned int eighthPoints = num_points / 8;
227  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
228  for (; number < eighthPoints; number++) {
229  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
230  complexVectorPtr += 8;
231 
232  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
233  complexVectorPtr += 8;
234 
235  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
236  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
237 
238  // Arrange in i1i2i3i4 format
239  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
240  // Arrange in q1q2q3q4 format
241  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
242 
243  _mm256_storeu_ps(iBufferPtr, iValue);
244  _mm256_storeu_ps(qBufferPtr, qValue);
245 
246  iBufferPtr += 8;
247  qBufferPtr += 8;
248  }
249 
250  number = eighthPoints * 8;
251  for (; number < num_points; number++) {
252  *iBufferPtr++ = *complexVectorPtr++;
253  *qBufferPtr++ = *complexVectorPtr++;
254  }
255 }
256 #endif /* LV_HAVE_AVX */
257 
258 #ifdef LV_HAVE_RVV
259 #include <riscv_vector.h>
260 
/*
 * Split an interleaved complex float vector into two float buffers using the
 * RISC-V vector extension.
 *
 * Each complex sample is loaded as one 64-bit element; narrowing right shifts
 * by 0 and by 32 then extract its low and high 32-bit halves, which are
 * stored to iBuffer and qBuffer respectively.
 *
 * iBuffer       - output; receives the low 32 bits of each sample
 * qBuffer       - output; receives the high 32 bits of each sample
 * complexVector - input; num_points complex samples
 * num_points    - number of complex samples to process
 */
static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer,
                                                     float* qBuffer,
                                                     const lv_32fc_t* complexVector,
                                                     unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of samples handled in this pass, as granted by vsetvl. */
        const size_t chunk = __riscv_vsetvl_e32m4(remaining);

        const vuint64m8_t packed =
            __riscv_vle64_v_u64m8((const uint64_t*)complexVector, chunk);
        const vuint32m4_t lowHalves = __riscv_vnsrl(packed, 0, chunk);
        const vuint32m4_t highHalves = __riscv_vnsrl(packed, 32, chunk);

        __riscv_vse32((uint32_t*)iBuffer, lowHalves, chunk);
        __riscv_vse32((uint32_t*)qBuffer, highHalves, chunk);

        remaining -= chunk;
        complexVector += chunk;
        iBuffer += chunk;
        qBuffer += chunk;
    }
}
276 #endif /*LV_HAVE_RVV*/
277 
278 #ifdef LV_HAVE_RVVSEG
279 #include <riscv_vector.h>
280 
/*
 * Split an interleaved complex float vector into two float buffers using
 * RISC-V vector segment loads.
 *
 * A two-field segment load reads the input as pairs of 32-bit elements and
 * returns field 0 and field 1 as separate vectors, which are stored to
 * iBuffer and qBuffer respectively.
 *
 * iBuffer       - output; receives field 0 of each pair
 * qBuffer       - output; receives field 1 of each pair
 * complexVector - input; num_points complex samples
 * num_points    - number of complex samples to process
 */
static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer,
                                                        float* qBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of samples handled in this pass, as granted by vsetvl. */
        const size_t chunk = __riscv_vsetvl_e32m4(remaining);

        const vuint32m4x2_t pairs =
            __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, chunk);
        const vuint32m4_t field0 = __riscv_vget_u32m4(pairs, 0);
        const vuint32m4_t field1 = __riscv_vget_u32m4(pairs, 1);

        __riscv_vse32((uint32_t*)iBuffer, field0, chunk);
        __riscv_vse32((uint32_t*)qBuffer, field1, chunk);

        remaining -= chunk;
        complexVector += chunk;
        iBuffer += chunk;
        qBuffer += chunk;
    }
}
297 #endif /*LV_HAVE_RVVSEG*/
298 
299 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
static void volk_32fc_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:188
static void volk_32fc_deinterleave_32f_x2_a_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:68
static void volk_32fc_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:114
static void volk_32fc_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:157
static void volk_32fc_deinterleave_32f_x2_u_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:215
float complex lv_32fc_t
Definition: volk_complex.h:74