#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
#define INCLUDED_volk_32fc_convert_16ic_a_H

#include <limits.h> /* SHRT_MIN, SHRT_MAX */
#include <math.h>   /* rintf */

/*
 * Kernels converting a vector of complex floats (lv_32fc_t) into a vector of
 * complex 16-bit integers (lv_16sc_t).
 *
 * Every kernel views both buffers as flat arrays of scalars (two per complex
 * sample), clips each float to [SHRT_MIN, SHRT_MAX], rounds it to an integer
 * and stores it as int16_t.  The SIMD variants handle as many whole vectors
 * as fit and leave the remaining samples to a scalar tail loop.
 *
 * NOTE(review): only the LV_HAVE_GENERIC guard is visible in the listing this
 * file was recovered from; the other LV_HAVE_* guard names below follow the
 * same pattern and should be confirmed against the build configuration.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel using aligned loads and stores.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 *
 * _mm256_load_ps / _mm256_store_si256 require 32-byte aligned addresses.
 */
static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One iteration consumes 16 floats, i.e. 8 complex samples. */
    const unsigned int avx_iters = num_points / 8;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float aux;

    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;

    __m256 inputVal1, inputVal2;
    __m256i intInputVal1, intInputVal2;
    __m256 ret1, ret2;
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);
    unsigned int i;

    for (i = 0; i < avx_iters; i++) {
        inputVal1 = _mm256_load_ps(inputVectorPtr);
        inputVectorPtr += 8;
        inputVal2 = _mm256_load_ps(inputVectorPtr);
        inputVectorPtr += 8;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 16);

        /* Clip to the int16_t range before converting. */
        ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm256_cvtps_epi32(ret1);
        intInputVal2 = _mm256_cvtps_epi32(ret2);

        /*
         * packs works independently on each 128-bit lane, which interleaves
         * the two inputs; 0xd8 selects quadwords 0,2,1,3 to restore the
         * original sample order.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 16;
    }

    /* Scalar tail: whatever did not fill a complete 16-float iteration. */
    for (i = avx_iters * 16; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val)
            aux = max_val;
        else if (aux < min_val)
            aux = min_val;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * SSE2 kernel using aligned loads and stores.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 *
 * _mm_load_ps / _mm_store_si128 require 16-byte aligned addresses.
 */
static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One iteration consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int sse_iters = num_points / 4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float aux;

    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);
    unsigned int i;

    for (i = 0; i < sse_iters; i++) {
        inputVal1 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to the int16_t range before converting. */
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-float iteration. */
    for (i = sse_iters * 8; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val)
            aux = max_val;
        else if (aux < min_val)
            aux = min_val;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Converts the four float lanes of `val` to int32 lanes of `res`, one
 * VCVTR.S32.F32 instruction per lane.  Wrapped in do/while(0) so that the
 * macro expands to exactly one statement at the call site.
 */
#define VCVTRQ_S32_F32(res,val) \
    do { \
        __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \
        __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \
        __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \
        __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : ); \
    } while (0)

/*
 * NEON kernel; the float-to-int step goes through the VCVTRQ_S32_F32
 * inline-assembly macro.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                               const lv_32fc_t* inputVector,
                                               unsigned int num_points)
{
    /* One iteration consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int neon_iters = num_points / 4;

    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    float aux;
    unsigned int i;

    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);
    float32x4_t ret1, ret2, a, b;

    int32x4_t toint_a={0,0,0,0};
    int32x4_t toint_b={0,0,0,0};
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to the int16_t range before converting. */
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        VCVTRQ_S32_F32(toint_a, ret1);
        VCVTRQ_S32_F32(toint_b, ret2);

        /* Saturating narrow from 32 to 16 bits, then join the two halves. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-float iteration. */
    for (i = neon_iters * 8; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}

#undef VCVTRQ_S32_F32
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * ARMv8 NEON kernel; rounds with vrndiq_f32 and converts with vcvtq_s32_f32
 * instead of inline assembly.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One iteration consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int neon_iters = num_points / 4;

    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    float aux;
    unsigned int i;

    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);
    float32x4_t ret1, ret2, a, b;

    int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0};
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to the int16_t range before converting. */
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        /* Round to an integral float first, then convert to int32. */
        toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
        toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));

        /* Saturating narrow from 32 to 16 bits, then join the two halves. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-float iteration. */
    for (i = neon_iters * 8; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_NEONV8 */

#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar kernel.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
                                                  const lv_32fc_t* inputVector,
                                                  unsigned int num_points)
{
    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;
    float aux;
    unsigned int i;

    for (i = 0; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val)
            aux = max_val;
        else if (aux < min_val)
            aux = min_val;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
#define INCLUDED_volk_32fc_convert_16ic_u_H

#include <limits.h> /* SHRT_MIN, SHRT_MAX */
#include <math.h>   /* rintf */

/*
 * Unaligned-access kernels converting complex floats (lv_32fc_t) into complex
 * 16-bit integers (lv_16sc_t).  Both buffers are viewed as flat arrays of
 * scalars (two per complex sample); every float is clipped to
 * [SHRT_MIN, SHRT_MAX], rounded and stored as int16_t.
 *
 * NOTE(review): the LV_HAVE_* guard names below are inferred from the
 * LV_HAVE_GENERIC guard used elsewhere in this file; confirm against the
 * build configuration.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel using unaligned loads and stores (loadu / storeu), so the
 * buffers need no particular alignment.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One iteration consumes 16 floats, i.e. 8 complex samples. */
    const unsigned int avx_iters = num_points / 8;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float aux;

    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;

    __m256 inputVal1, inputVal2;
    __m256i intInputVal1, intInputVal2;
    __m256 ret1, ret2;
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);
    unsigned int i;

    for (i = 0; i < avx_iters; i++) {
        inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        inputVectorPtr += 8;
        inputVal2 = _mm256_loadu_ps(inputVectorPtr);
        inputVectorPtr += 8;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 16);

        /* Clip to the int16_t range before converting. */
        ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm256_cvtps_epi32(ret1);
        intInputVal2 = _mm256_cvtps_epi32(ret2);

        /*
         * packs works independently on each 128-bit lane, which interleaves
         * the two inputs; 0xd8 selects quadwords 0,2,1,3 to restore the
         * original sample order.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 16;
    }

    /* Scalar tail: whatever did not fill a complete 16-float iteration. */
    for (i = avx_iters * 16; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val)
            aux = max_val;
        else if (aux < min_val)
            aux = min_val;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * SSE2 kernel using unaligned loads and stores (loadu / storeu), so the
 * buffers need no particular alignment.
 *
 * outputVector: destination, 2 * num_points int16_t values are written.
 * inputVector:  source, 2 * num_points floats are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One iteration consumes 8 floats, i.e. 4 complex samples. */
    const unsigned int sse_iters = num_points / 4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float aux;

    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);
    unsigned int i;

    for (i = 0; i < sse_iters; i++) {
        inputVal1 = _mm_loadu_ps(inputVectorPtr);
        inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps(inputVectorPtr);
        inputVectorPtr += 4;
        /* Hint only; the offset has no effect on the result. */
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip to the int16_t range before converting. */
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    /* Scalar tail: whatever did not fill a complete 8-float iteration. */
    for (i = sse_iters * 8; i < num_points * 2; i++) {
        aux = *inputVectorPtr++;
        if (aux > max_val)
            aux = max_val;
        else if (aux < min_val)
            aux = min_val;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
#endif /* LV_HAVE_SSE2 */

#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
/*
 * Cross-references emitted by the documentation generator (not part of the
 * header itself):
 *
 *   short complex lv_16sc_t
 *       Definition: volk_complex.h:58
 *   static float rintf(float x)
 *       Definition: config.h:31
 *   static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
 *       Definition: volk_32fc_convert_16ic.h:291
 *   #define __VOLK_PREFETCH(addr)
 *       Definition: volk_common.h:52
 *   static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
 *       Definition: volk_32fc_convert_16ic.h:111
 *   for i
 *       Definition: volk_config_fixed.tmpl.h:25
 *   static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
 *       Definition: volk_32fc_convert_16ic.h:380
 *   float complex lv_32fc_t
 *       Definition: volk_complex.h:61
 */