Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
46 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
47 #define INCLUDED_volk_32fc_convert_16ic_a_H
48 
49 #include <limits.h>
50 #include <math.h>
51 #include "volk/volk_complex.h"
52 
53 #ifdef LV_HAVE_AVX2
54 #include <immintrin.h>
55 
56 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
57 {
58  const unsigned int avx_iters = num_points / 8;
59 
60  float* inputVectorPtr = (float*)inputVector;
61  int16_t* outputVectorPtr = (int16_t*)outputVector;
62  float aux;
63 
64  const float min_val = (float)SHRT_MIN;
65  const float max_val = (float)SHRT_MAX;
66 
67  __m256 inputVal1, inputVal2;
68  __m256i intInputVal1, intInputVal2;
69  __m256 ret1, ret2;
70  const __m256 vmin_val = _mm256_set1_ps(min_val);
71  const __m256 vmax_val = _mm256_set1_ps(max_val);
72  unsigned int i;
73 
74  for(i = 0; i < avx_iters; i++)
75  {
76  inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
77  inputVectorPtr += 8;
78  inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
79  inputVectorPtr += 8;
80  __VOLK_PREFETCH(inputVectorPtr + 16);
81 
82  // Clip
83  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
84  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
85 
86  intInputVal1 = _mm256_cvtps_epi32(ret1);
87  intInputVal2 = _mm256_cvtps_epi32(ret2);
88 
89  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
90  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
91 
92  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
93  outputVectorPtr += 16;
94  }
95 
96  for(i = avx_iters * 16; i < num_points * 2; i++)
97  {
98  aux = *inputVectorPtr++;
99  if(aux > max_val)
100  aux = max_val;
101  else if(aux < min_val)
102  aux = min_val;
103  *outputVectorPtr++ = (int16_t)rintf(aux);
104  }
105 }
106 #endif /* LV_HAVE_AVX2 */
107 
108 #ifdef LV_HAVE_SSE2
109 #include <emmintrin.h>
110 
111 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
112 {
113  const unsigned int sse_iters = num_points / 4;
114 
115  float* inputVectorPtr = (float*)inputVector;
116  int16_t* outputVectorPtr = (int16_t*)outputVector;
117  float aux;
118 
119  const float min_val = (float)SHRT_MIN;
120  const float max_val = (float)SHRT_MAX;
121 
122  __m128 inputVal1, inputVal2;
123  __m128i intInputVal1, intInputVal2;
124  __m128 ret1, ret2;
125  const __m128 vmin_val = _mm_set_ps1(min_val);
126  const __m128 vmax_val = _mm_set_ps1(max_val);
127  unsigned int i;
128 
129  for(i = 0; i < sse_iters; i++)
130  {
131  inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
132  inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
133  __VOLK_PREFETCH(inputVectorPtr + 8);
134 
135  // Clip
136  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
137  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
138 
139  intInputVal1 = _mm_cvtps_epi32(ret1);
140  intInputVal2 = _mm_cvtps_epi32(ret2);
141 
142  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
143 
144  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
145  outputVectorPtr += 8;
146  }
147 
148  for(i = sse_iters * 8; i < num_points * 2; i++)
149  {
150  aux = *inputVectorPtr++;
151  if(aux > max_val)
152  aux = max_val;
153  else if(aux < min_val)
154  aux = min_val;
155  *outputVectorPtr++ = (int16_t)rintf(aux);
156  }
157 }
158 #endif /* LV_HAVE_SSE2 */
159 
160 
161 #if LV_HAVE_NEONV7
162 #include <arm_neon.h>
163 
164 #define VCVTRQ_S32_F32(res,val) \
165  __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \
166  __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \
167  __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \
168  __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : );
169 
170 static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
171 {
172 
173  const unsigned int neon_iters = num_points / 4;
174 
175  float32_t* inputVectorPtr = (float32_t*)inputVector;
176  int16_t* outputVectorPtr = (int16_t*)outputVector;
177 
178  const float min_val_f = (float)SHRT_MIN;
179  const float max_val_f = (float)SHRT_MAX;
180  float32_t aux;
181  unsigned int i;
182 
183  const float32x4_t min_val = vmovq_n_f32(min_val_f);
184  const float32x4_t max_val = vmovq_n_f32(max_val_f);
185  float32x4_t ret1, ret2, a, b;
186 
187  int32x4_t toint_a={0,0,0,0};
188  int32x4_t toint_b={0,0,0,0};
189  int16x4_t intInputVal1, intInputVal2;
190  int16x8_t res;
191 
192  for(i = 0; i < neon_iters; i++)
193  {
194  a = vld1q_f32((const float32_t*)(inputVectorPtr));
195  inputVectorPtr += 4;
196  b = vld1q_f32((const float32_t*)(inputVectorPtr));
197  inputVectorPtr += 4;
198  __VOLK_PREFETCH(inputVectorPtr + 8);
199 
200  ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
201  ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
202 
203  // vcvtr takes into account the current rounding mode (as does rintf)
204  VCVTRQ_S32_F32(toint_a, ret1);
205  VCVTRQ_S32_F32(toint_b, ret2);
206 
207  intInputVal1 = vqmovn_s32(toint_a);
208  intInputVal2 = vqmovn_s32(toint_b);
209 
210  res = vcombine_s16(intInputVal1, intInputVal2);
211  vst1q_s16((int16_t*)outputVectorPtr, res);
212  outputVectorPtr += 8;
213  }
214 
215  for(i = neon_iters * 8; i < num_points * 2; i++)
216  {
217  aux = *inputVectorPtr++;
218  if(aux > max_val_f)
219  aux = max_val_f;
220  else if(aux < min_val_f)
221  aux = min_val_f;
222  *outputVectorPtr++ = (int16_t)rintf(aux);
223  }
224 }
225 
226 #undef VCVTRQ_S32_F32
227 #endif /* LV_HAVE_NEONV7 */
228 
229 #if LV_HAVE_NEONV8
230 #include <arm_neon.h>
231 
232 static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
233 {
234  const unsigned int neon_iters = num_points / 4;
235 
236  float32_t* inputVectorPtr = (float32_t*)inputVector;
237  int16_t* outputVectorPtr = (int16_t*)outputVector;
238 
239  const float min_val_f = (float)SHRT_MIN;
240  const float max_val_f = (float)SHRT_MAX;
241  float32_t aux;
242  unsigned int i;
243 
244  const float32x4_t min_val = vmovq_n_f32(min_val_f);
245  const float32x4_t max_val = vmovq_n_f32(max_val_f);
246  float32x4_t ret1, ret2, a, b;
247 
248  int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0};
249  int16x4_t intInputVal1, intInputVal2;
250  int16x8_t res;
251 
252  for(i = 0; i < neon_iters; i++)
253  {
254  a = vld1q_f32((const float32_t*)(inputVectorPtr));
255  inputVectorPtr += 4;
256  b = vld1q_f32((const float32_t*)(inputVectorPtr));
257  inputVectorPtr += 4;
258  __VOLK_PREFETCH(inputVectorPtr + 8);
259 
260  ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
261  ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
262 
263  // vrndiq takes into account the current rounding mode (as does rintf)
264  toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
265  toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
266 
267  intInputVal1 = vqmovn_s32(toint_a);
268  intInputVal2 = vqmovn_s32(toint_b);
269 
270  res = vcombine_s16(intInputVal1, intInputVal2);
271  vst1q_s16((int16_t*)outputVectorPtr, res);
272  outputVectorPtr += 8;
273  }
274 
275  for(i = neon_iters * 8; i < num_points * 2; i++)
276  {
277  aux = *inputVectorPtr++;
278  if(aux > max_val_f)
279  aux = max_val_f;
280  else if(aux < min_val_f)
281  aux = min_val_f;
282  *outputVectorPtr++ = (int16_t)rintf(aux);
283  }
284 }
285 #endif /* LV_HAVE_NEONV8 */
286 
287 
288 
289 #ifdef LV_HAVE_GENERIC
290 
291 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
292 {
293  float* inputVectorPtr = (float*)inputVector;
294  int16_t* outputVectorPtr = (int16_t*)outputVector;
295  const float min_val = (float)SHRT_MIN;
296  const float max_val = (float)SHRT_MAX;
297  float aux;
298  unsigned int i;
299  for(i = 0; i < num_points * 2; i++)
300  {
301  aux = *inputVectorPtr++;
302  if(aux > max_val)
303  aux = max_val;
304  else if(aux < min_val)
305  aux = min_val;
306  *outputVectorPtr++ = (int16_t)rintf(aux);
307  }
308 }
309 #endif /* LV_HAVE_GENERIC */
310 
311 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
312 
313 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
314 #define INCLUDED_volk_32fc_convert_16ic_u_H
315 
316 #include <limits.h>
317 #include <math.h>
318 #include "volk/volk_complex.h"
319 
320 
321 #ifdef LV_HAVE_AVX2
322 #include <immintrin.h>
323 
324 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
325 {
326  const unsigned int avx_iters = num_points / 8;
327 
328  float* inputVectorPtr = (float*)inputVector;
329  int16_t* outputVectorPtr = (int16_t*)outputVector;
330  float aux;
331 
332  const float min_val = (float)SHRT_MIN;
333  const float max_val = (float)SHRT_MAX;
334 
335  __m256 inputVal1, inputVal2;
336  __m256i intInputVal1, intInputVal2;
337  __m256 ret1, ret2;
338  const __m256 vmin_val = _mm256_set1_ps(min_val);
339  const __m256 vmax_val = _mm256_set1_ps(max_val);
340  unsigned int i;
341 
342  for(i = 0; i < avx_iters; i++)
343  {
344  inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
345  inputVectorPtr += 8;
346  inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
347  inputVectorPtr += 8;
348  __VOLK_PREFETCH(inputVectorPtr + 16);
349 
350  // Clip
351  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
352  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
353 
354  intInputVal1 = _mm256_cvtps_epi32(ret1);
355  intInputVal2 = _mm256_cvtps_epi32(ret2);
356 
357  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
358  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
359 
360  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
361  outputVectorPtr += 16;
362  }
363 
364  for(i = avx_iters * 16; i < num_points * 2; i++)
365  {
366  aux = *inputVectorPtr++;
367  if(aux > max_val)
368  aux = max_val;
369  else if(aux < min_val)
370  aux = min_val;
371  *outputVectorPtr++ = (int16_t)rintf(aux);
372  }
373 }
374 #endif /* LV_HAVE_AVX2 */
375 
376 
377 #ifdef LV_HAVE_SSE2
378 #include <emmintrin.h>
379 
380 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
381 {
382  const unsigned int sse_iters = num_points / 4;
383 
384  float* inputVectorPtr = (float*)inputVector;
385  int16_t* outputVectorPtr = (int16_t*)outputVector;
386  float aux;
387 
388  const float min_val = (float)SHRT_MIN;
389  const float max_val = (float)SHRT_MAX;
390 
391  __m128 inputVal1, inputVal2;
392  __m128i intInputVal1, intInputVal2;
393  __m128 ret1, ret2;
394  const __m128 vmin_val = _mm_set_ps1(min_val);
395  const __m128 vmax_val = _mm_set_ps1(max_val);
396 
397  unsigned int i;
398  for(i = 0; i < sse_iters; i++)
399  {
400  inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
401  inputVectorPtr += 4;
402  inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
403  inputVectorPtr += 4;
404  __VOLK_PREFETCH(inputVectorPtr + 8);
405 
406  // Clip
407  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
408  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
409 
410  intInputVal1 = _mm_cvtps_epi32(ret1);
411  intInputVal2 = _mm_cvtps_epi32(ret2);
412 
413  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
414 
415  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
416  outputVectorPtr += 8;
417  }
418 
419  for(i = sse_iters * 8; i < num_points * 2; i++)
420  {
421  aux = *inputVectorPtr++;
422  if(aux > max_val)
423  aux = max_val;
424  else if(aux < min_val)
425  aux = min_val;
426  *outputVectorPtr++ = (int16_t)rintf(aux);
427  }
428 }
429 #endif /* LV_HAVE_SSE2 */
430 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:58
static float rintf(float x)
Definition: config.h:31
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:291
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:52
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:111
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:380
float complex lv_32fc_t
Definition: volk_complex.h:61