Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>

#define LOG_POLY_DEGREE 6

// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
static inline float log2f_non_ieee(float f) {
  float const result = log2f(f);
  return isinf(result) ? copysignf(127.0f, result) : result;
}
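
// Note: the SIMD kernels below derive log2 from the raw exponent field minus the
// bias of 127, so e.g. an input of 0.0f (exponent field 0) comes out as -127.0f
// rather than -inf; clamping the scalar result to +-127.0f keeps it comparable.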

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++)
    *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */
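
/*
 * Usage sketch: these kernels are normally reached through the generated
 * volk_32f_log2_32f() dispatcher rather than called directly. Assuming the
 * usual VOLK allocation helpers (volk_malloc, volk_get_alignment, volk_free),
 * a call looks roughly like:
 *
 *   unsigned int N = 1024;
 *   float* in  = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   for (unsigned int i = 0; i < N; ++i)
 *     in[i] = (float)(i + 1);
 *   volk_32f_log2_32f(out, in, N);   // out[i] ~= log2(in[i])
 *   volk_free(in);
 *   volk_free(out);
 */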

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

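// These macros unroll Horner's rule with fused multiply-adds: for example,
// POLY5_FMAAVX2(x, c0, ..., c5) evaluates
//   c0 + x*(c1 + x*(c2 + x*(c3 + x*(c4 + x*c5))))
// with five _mm256_fmadd_ps calls per vector of eight floats (this is the form
// selected by LOG_POLY_DEGREE == 6 below).
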
static inline void
volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
    _mm256_store_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  number = eighthPoints * 8;
  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
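
/*
 * Worked example of the decomposition used by the SSE/AVX kernels in this file
 * (with the default LOG_POLY_DEGREE == 6 coefficients): for x = 12.0f the
 * exponent field is 130, so exp = 130 - 127 = 3 and frac = 1.5. The polynomial
 * gives roughly P(1.5) ~= 1.16993, and the final step computes
 * 3 + (1.5 - 1.0) * 1.16993 ~= 3.58497, against log2(12) = 3.58496...
 */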

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
    _mm256_store_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  number = eighthPoints * 8;
  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, mantissa, frac, leadingOne;
  __m128i bias, exp;

  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    _mm_store_ps(bPtr, bVal);

    aPtr += 4;
    bPtr += 4;
  }

  number = quarterPoints * 4;
  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE() \
  int32x4_t one = vdupq_n_s32(0x00800000); \
  /* minimax polynomial */ \
  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
  int32x4_t exp_bias = vdupq_n_s32(127);


#define VLOG2Q_NEON_F32(log2_approx, aval) \
  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
  exponent_i = vshrq_n_s32(exponent_i, 23); \
  \
  /* extract the exponent and significand \
     we can treat this as fixed point to save ~9% on the \
     conversion + float add */ \
  significand_i = vorrq_s32(one, significand_i); \
  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
  /* debias the exponent and convert to float */ \
  exponent_i = vsubq_s32(exponent_i, exp_bias); \
  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
  \
  /* put the significand through a polynomial fit of log2(x) [1,2] \
     add the result to the exponent */ \
  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  \
  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1);

static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;
  unsigned int number;
  const unsigned int quarterPoints = num_points / 4;

  int32x4_t aval;
  float32x4_t log2_approx;

  VLOG2Q_NEON_PREAMBLE()
  // lms
  //p0 = vdupq_n_f32(-1.649132280361871);
  //p1 = vdupq_n_f32(1.995047138579499);
  //p2 = vdupq_n_f32(-0.336914839219728);

  // keep in mind a single precision float is represented as
  // (-1)^sign * 2^exp * 1.significand, so the log2 is
  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
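  // Concretely, for 8.0f the raw bits are 0x41000000: the exponent field is
  // 0x82 = 130, giving 130 - 127 = 3, and the significand bits are zero, so
  // OR-ing in the implicit-one bit and converting with 23 fractional bits
  // (vcvtq_n_f32_s32(..., 23)) yields 1.0f; the result is 3 plus the tiny
  // polynomial error at 1.0, i.e. very nearly log2(8.0f) = 3.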
  for(number = 0; number < quarterPoints; ++number){
    // load float into an int register without conversion
    aval = vld1q_s32((int*)aPtr);

    VLOG2Q_NEON_F32(log2_approx, aval)

    vst1q_f32(bPtr, log2_approx);

    aPtr += 4;
    bPtr += 4;
  }

  number = quarterPoints * 4;
  volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *bPtr++ = log2f_non_ieee(*aPtr++);
  }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, mantissa, frac, leadingOne;
  __m128i bias, exp;

  for(;number < quarterPoints; number++){

    aVal = _mm_loadu_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    _mm_storeu_ps(bPtr, bVal);

    aPtr += 4;
    bPtr += 4;
  }

  number = quarterPoints * 4;
  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_loadu_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
    _mm256_storeu_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  number = eighthPoints * 8;
  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_loadu_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);

    // Now to extract mantissa
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
    _mm256_storeu_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  number = eighthPoints * 8;
  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
}

#endif /* LV_HAVE_AVX2 for unaligned */


#endif /* INCLUDED_volk_32f_log2_32f_u_H */