#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
#define INCLUDED_volk_16ic_magnitude_16i_a_H

#include <inttypes.h>
#include <limits.h> /* SHRT_MAX */
#include <math.h>   /* sqrtf, rintf */

/*
 * Aligned kernels: magnitude of interleaved 16-bit complex samples.
 *
 * Every kernel below reads num_points (I, Q) pairs of int16_t from
 * complexVector, scales each component by 1/SHRT_MAX, evaluates
 * sqrt(I*I + Q*Q), scales the result back by SHRT_MAX, rounds it and writes
 * one int16_t per input point to magnitudeVector.
 *
 * The magnitude of a 16-bit complex sample can exceed SHRT_MAX (for example
 * (-32768, 0) yields 32768 and (32767, 32767) yields about 46340).  Converting
 * such a float to int16_t is undefined behaviour in C, so every kernel
 * saturates the result at SHRT_MAX, which is what the AVX2 vector path already
 * does through its saturating pack instruction.
 *
 * NOTE(review): lv_16sc_t and __VOLK_ATTR_ALIGNED are project symbols
 * (volk_complex.h / volk_common.h); confirm the including translation unit
 * makes them visible before this header is processed.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 implementation, 8 points per iteration.
 *
 * Uses aligned 256-bit loads and aligned 128-bit stores, so complexVector must
 * be 32-byte aligned and magnitudeVector 16-byte aligned.  The remaining
 * num_points % 8 points are handled by a scalar loop.
 */
static inline void
volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
                               const lv_16sc_t* complexVector,
                               unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
  int16_t* magnitudeVectorPtr = magnitudeVector;

  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
  __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
  __m256i int1, int2;
  __m128i short1, short2;
  __m256 cplxValue1, cplxValue2, result;
  /* Dword order that undoes the per-lane interleaving left by hadd + packs. */
  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);

  for (; number < eighthPoints; number++) {
    /* 16 int16_t = 8 complex points: I0,Q0,...,I7,Q7. */
    int1 = _mm256_load_si256((const __m256i*)complexVectorPtr);
    complexVectorPtr += 16;
    short1 = _mm256_extracti128_si256(int1, 0); /* points 0..3 */
    short2 = _mm256_extracti128_si256(int1, 1); /* points 4..7 */

    /* Sign-extend to 32 bits and convert to float. */
    int1 = _mm256_cvtepi16_epi32(short1);
    int2 = _mm256_cvtepi16_epi32(short2);
    cplxValue1 = _mm256_cvtepi32_ps(int1);
    cplxValue2 = _mm256_cvtepi32_ps(int2);

    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);

    /* Square every component. */
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);

    /* I^2 + Q^2; hadd works per 128-bit lane, giving m0 m1 m4 m5 | m2 m3 m6 m7. */
    result = _mm256_hadd_ps(cplxValue1, cplxValue2);

    result = _mm256_sqrt_ps(result);

    result = _mm256_mul_ps(result, vScalar);

    int1 = _mm256_cvtps_epi32(result);
    /* Signed-saturating narrow to 16 bits; clamps magnitudes above SHRT_MAX. */
    int1 = _mm256_packs_epi32(int1, int1);
    /* Gather the (m0,m1) (m2,m3) (m4,m5) (m6,m7) dwords into the low lane. */
    int1 = _mm256_permutevar8x32_epi32(int1, idx);
    short1 = _mm256_extracti128_si256(int1, 0);
    _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
    magnitudeVectorPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 points. */
  number = eighthPoints * 8;
  magnitudeVectorPtr = &magnitudeVector[number];
  complexVectorPtr = (const int16_t*)&complexVector[number];
  for (; number < num_points; number++) {
    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Result =
        sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
    /* Saturate like the vector path instead of overflowing int16_t. */
    const float rounded = rintf(val1Result);
    *magnitudeVectorPtr++ =
        (rounded > (float)SHRT_MAX) ? (int16_t)SHRT_MAX : (int16_t)rounded;
  }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*
 * SSE3 implementation, 4 points per iteration.
 *
 * Samples are converted to float one by one into an aligned staging buffer,
 * processed with SSE3 horizontal adds, and converted back one by one.  The
 * remaining num_points % 4 points are handled by a scalar loop.
 */
static inline void
volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
                               const lv_16sc_t* complexVector,
                               unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
  int16_t* magnitudeVectorPtr = magnitudeVector;

  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
  __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);

  __m128 cplxValue1, cplxValue2, result;

  /* _mm_load_ps / _mm_store_ps below require 16-byte aligned buffers. */
  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];

  for (; number < quarterPoints; number++) {
    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);

    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);

    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); /* I0 Q0 I1 Q1 */
    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); /* I2 Q2 I3 Q3 */

    complexVectorPtr += 8;

    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);

    /* Square every component. */
    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);

    /* I^2 + Q^2 for points 0..3, already in order. */
    result = _mm_hadd_ps(cplxValue1, cplxValue2);

    result = _mm_sqrt_ps(result);

    result = _mm_mul_ps(result, vScalar);
    /* Saturate at SHRT_MAX so the int16_t conversions below cannot overflow. */
    result = _mm_min_ps(result, vScalar);

    _mm_store_ps(outputFloatBuffer, result);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
  }

  /* Scalar tail for the last num_points % 4 points. */
  number = quarterPoints * 4;
  magnitudeVectorPtr = &magnitudeVector[number];
  complexVectorPtr = (const int16_t*)&complexVector[number];
  for (; number < num_points; number++) {
    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Result =
        sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
    const float rounded = rintf(val1Result);
    *magnitudeVectorPtr++ =
        (rounded > (float)SHRT_MAX) ? (int16_t)SHRT_MAX : (int16_t)rounded;
  }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE implementation, 4 points per iteration.
 *
 * Same staging-buffer approach as the SSE3 kernel, but the I and Q components
 * are separated with shuffles because horizontal add is not available.  The
 * remaining num_points % 4 points are handled by a scalar loop.
 */
static inline void
volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
                              const lv_16sc_t* complexVector,
                              unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
  int16_t* magnitudeVectorPtr = magnitudeVector;

  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
  __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);

  __m128 cplxValue1, cplxValue2, iValue, qValue, result;

  /* _mm_load_ps / _mm_store_ps below require 16-byte aligned buffers. */
  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];

  for (; number < quarterPoints; number++) {
    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);

    cplxValue1 = _mm_load_ps(inputFloatBuffer); /* I0 Q0 I1 Q1 */
    complexVectorPtr += 4;

    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);

    cplxValue2 = _mm_load_ps(inputFloatBuffer); /* I2 Q2 I3 Q3 */
    complexVectorPtr += 4;

    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);

    /* De-interleave: I0 I1 I2 I3 */
    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    /* De-interleave: Q0 Q1 Q2 Q3 */
    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    iValue = _mm_mul_ps(iValue, iValue);
    qValue = _mm_mul_ps(qValue, qValue);

    result = _mm_add_ps(iValue, qValue);

    result = _mm_sqrt_ps(result);

    result = _mm_mul_ps(result, vScalar);
    /* Saturate at SHRT_MAX so the int16_t conversions below cannot overflow. */
    result = _mm_min_ps(result, vScalar);

    _mm_store_ps(outputFloatBuffer, result);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
  }

  /* Scalar tail for the last num_points % 4 points. */
  number = quarterPoints * 4;
  magnitudeVectorPtr = &magnitudeVector[number];
  complexVectorPtr = (const int16_t*)&complexVector[number];
  for (; number < num_points; number++) {
    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Result =
        sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
    const float rounded = rintf(val1Result);
    *magnitudeVectorPtr++ =
        (rounded > (float)SHRT_MAX) ? (int16_t)SHRT_MAX : (int16_t)rounded;
  }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar implementation; processes one point per iteration and has
 * no alignment requirement.
 */
static inline void
volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
                                const lv_16sc_t* complexVector,
                                unsigned int num_points)
{
  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
  int16_t* magnitudeVectorPtr = magnitudeVector;
  unsigned int number = 0;
  const float scalar = SHRT_MAX;
  for (number = 0; number < num_points; number++) {
    float real = ((float)(*complexVectorPtr++)) / scalar;
    float imag = ((float)(*complexVectorPtr++)) / scalar;
    /* Saturate: the float -> int16_t conversion is undefined above SHRT_MAX. */
    const float rounded = rintf(sqrtf((real * real) + (imag * imag)) * scalar);
    *magnitudeVectorPtr++ =
        (rounded > scalar) ? (int16_t)SHRT_MAX : (int16_t)rounded;
  }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_ORC_DISABLED

/* Implemented outside this header; takes the scaling factor explicitly. */
extern void
volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
                                   const lv_16sc_t* complexVector,
                                   float scalar,
                                   unsigned int num_points);

/* Thin wrapper that forwards to the ORC implementation with SHRT_MAX scaling. */
static inline void
volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
                              const lv_16sc_t* complexVector,
                              unsigned int num_points)
{
  volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points);
}
#endif /* LV_HAVE_ORC_DISABLED */

#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
#ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
#define INCLUDED_volk_16ic_magnitude_16i_u_H

#include <inttypes.h>
#include <limits.h> /* SHRT_MAX */
#include <math.h>   /* sqrtf, rintf */

/*
 * Unaligned kernels: magnitude of interleaved 16-bit complex samples.
 *
 * Each kernel reads num_points (I, Q) pairs of int16_t from complexVector,
 * scales each component by 1/SHRT_MAX, evaluates sqrt(I*I + Q*Q), scales the
 * result back by SHRT_MAX, rounds it and writes one int16_t per input point to
 * magnitudeVector.  Results above SHRT_MAX are saturated, because converting
 * an out-of-range float to int16_t is undefined behaviour in C.
 *
 * NOTE(review): lv_16sc_t, lv_creal, lv_cimag, __VOLK_PREFETCH,
 * _vmagnitudesquaredq_f32 and _vinvsqrtq_f32 are project symbols
 * (volk_complex.h / volk_common.h / volk_neon_intrinsics.h); confirm the
 * including translation unit makes them visible before this header is
 * processed.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 implementation, 8 points per iteration, using unaligned loads and
 * stores.  The remaining num_points % 8 points are handled by a scalar loop.
 */
static inline void
volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
                               const lv_16sc_t* complexVector,
                               unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
  int16_t* magnitudeVectorPtr = magnitudeVector;

  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
  __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
  __m256i int1, int2;
  __m128i short1, short2;
  __m256 cplxValue1, cplxValue2, result;
  /* Dword order that undoes the per-lane interleaving left by hadd + packs. */
  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);

  for (; number < eighthPoints; number++) {
    /* 16 int16_t = 8 complex points: I0,Q0,...,I7,Q7. */
    int1 = _mm256_loadu_si256((const __m256i*)complexVectorPtr);
    complexVectorPtr += 16;
    short1 = _mm256_extracti128_si256(int1, 0); /* points 0..3 */
    short2 = _mm256_extracti128_si256(int1, 1); /* points 4..7 */

    /* Sign-extend to 32 bits and convert to float. */
    int1 = _mm256_cvtepi16_epi32(short1);
    int2 = _mm256_cvtepi16_epi32(short2);
    cplxValue1 = _mm256_cvtepi32_ps(int1);
    cplxValue2 = _mm256_cvtepi32_ps(int2);

    cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
    cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);

    /* Square every component. */
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);

    /* I^2 + Q^2; hadd works per 128-bit lane, giving m0 m1 m4 m5 | m2 m3 m6 m7. */
    result = _mm256_hadd_ps(cplxValue1, cplxValue2);

    result = _mm256_sqrt_ps(result);

    result = _mm256_mul_ps(result, vScalar);

    int1 = _mm256_cvtps_epi32(result);
    /* Signed-saturating narrow to 16 bits; clamps magnitudes above SHRT_MAX. */
    int1 = _mm256_packs_epi32(int1, int1);
    /* Gather the (m0,m1) (m2,m3) (m4,m5) (m6,m7) dwords into the low lane. */
    int1 = _mm256_permutevar8x32_epi32(int1, idx);
    short1 = _mm256_extracti128_si256(int1, 0);
    _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
    magnitudeVectorPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 points. */
  number = eighthPoints * 8;
  magnitudeVectorPtr = &magnitudeVector[number];
  complexVectorPtr = (const int16_t*)&complexVector[number];
  for (; number < num_points; number++) {
    const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
    const float val1Result =
        sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
    /* Saturate like the vector path instead of overflowing int16_t. */
    const float rounded = rintf(val1Result);
    *magnitudeVectorPtr++ =
        (rounded > (float)SHRT_MAX) ? (int16_t)SHRT_MAX : (int16_t)rounded;
  }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_NEONV7
#include <arm_neon.h>

/*
 * NEON implementation, 4 points per iteration.
 *
 * vld2_s16 de-interleaves the input into one vector of I values and one of Q
 * values.  The square root is formed as x * invsqrt(x) using the project
 * helper _vinvsqrtq_f32 (presumably an approximate reciprocal square root, so
 * results may differ slightly from the sqrtf-based kernels; verify against
 * volk_neon_intrinsics.h).  The remaining num_points % 4 points are handled
 * by a scalar loop.
 */
static inline void
volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
                               const lv_16sc_t* complexVector,
                               unsigned int num_points)
{
  unsigned int number = 0;
  unsigned int quarter_points = num_points / 4;

  const float scalar = SHRT_MAX;
  const float inv_scalar = 1.0f / scalar;

  int16_t* magnitudeVectorPtr = magnitudeVector;
  const lv_16sc_t* complexVectorPtr = complexVector;

  float32x4_t mag_vec_squared;
  float32x4_t mag_vec;
  float32x4x2_t c_vec;

  for (number = 0; number < quarter_points; number++) {
    /* val[0] = I0..I3, val[1] = Q0..Q3 */
    const int16x4x2_t c16_vec = vld2_s16((const int16_t*)complexVectorPtr);
    __VOLK_PREFETCH(complexVectorPtr + 4);
    /* Widen to 32 bits and convert to float. */
    c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
    c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
    /* Scale to close to 0-1. */
    c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
    c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
    /* |z| = |z|^2 * (1 / sqrt(|z|^2)) */
    mag_vec_squared = _vmagnitudesquaredq_f32(c_vec);
    mag_vec = vmulq_f32(mag_vec_squared,
                        _vinvsqrtq_f32(mag_vec_squared));
    /* Reconstruct the int16 scale. */
    mag_vec = vmulq_n_f32(mag_vec, scalar);
    /* The float -> int conversion truncates, so add 0.5 to round. */
    mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
    /* Saturating narrow: magnitudes above SHRT_MAX clamp instead of wrapping. */
    const int16x4_t mag16_vec = vqmovn_s32(vcvtq_s32_f32(mag_vec));
    vst1_s16(magnitudeVectorPtr, mag16_vec);
    /* Advance both cursors by the four points just processed. */
    magnitudeVectorPtr += 4;
    complexVectorPtr += 4;
  }

  /* Scalar tail for the last num_points % 4 points. */
  for (number = quarter_points * 4; number < num_points; number++) {
    const float real = lv_creal(*complexVectorPtr) * inv_scalar;
    const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
    const float rounded = rintf(sqrtf((real * real) + (imag * imag)) * scalar);
    *magnitudeVectorPtr =
        (rounded > scalar) ? (int16_t)SHRT_MAX : (int16_t)rounded;
    complexVectorPtr++;
    magnitudeVectorPtr++;
  }
}
#endif /* LV_HAVE_NEONV7 */

#endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:74
short complex lv_16sc_t
Definition: volk_complex.h:58
static float rintf(float x)
Definition: config.h:31
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:198
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:130
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:64
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:271
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:52
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:46
#define lv_creal(x)
Definition: volk_complex.h:83
#define lv_cimag(x)
Definition: volk_complex.h:85