Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
64 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
65 #define INCLUDED_volk_32f_index_max_32u_a_H
66 
67 #include <volk/volk_common.h>
68 #include <volk/volk_common.h>
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #ifdef LV_HAVE_SSE4_1
73 #include<smmintrin.h>
74 
75 static inline void
76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
77 {
78  if(num_points > 0){
79  uint32_t number = 0;
80  const uint32_t quarterPoints = num_points / 4;
81 
82  float* inputPtr = (float*)src0;
83 
84  __m128 indexIncrementValues = _mm_set1_ps(4);
85  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
86 
87  float max = src0[0];
88  float index = 0;
89  __m128 maxValues = _mm_set1_ps(max);
90  __m128 maxValuesIndex = _mm_setzero_ps();
91  __m128 compareResults;
92  __m128 currentValues;
93 
94  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
95  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
96 
97  for(;number < quarterPoints; number++){
98 
99  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
100  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
101 
102  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
103 
104  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
105  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
106  }
107 
108  // Calculate the largest value from the remaining 4 points
109  _mm_store_ps(maxValuesBuffer, maxValues);
110  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
111 
112  for(number = 0; number < 4; number++){
113  if(maxValuesBuffer[number] > max){
114  index = maxIndexesBuffer[number];
115  max = maxValuesBuffer[number];
116  } else if(maxValuesBuffer[number] == max){
117  if (index > maxIndexesBuffer[number])
118  index = maxIndexesBuffer[number];
119  }
120  }
121 
122  number = quarterPoints * 4;
123  for(;number < num_points; number++){
124  if(src0[number] > max){
125  index = number;
126  max = src0[number];
127  }
128  }
129  target[0] = (uint32_t)index;
130  }
131 }
132 
133 #endif /*LV_HAVE_SSE4_1*/
134 
135 
136 #ifdef LV_HAVE_SSE
137 
138 #include<xmmintrin.h>
139 
140 static inline void
141 volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
142 {
143  if(num_points > 0){
144  uint32_t number = 0;
145  const uint32_t quarterPoints = num_points / 4;
146 
147  float* inputPtr = (float*)src0;
148 
149  __m128 indexIncrementValues = _mm_set1_ps(4);
150  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
151 
152  float max = src0[0];
153  float index = 0;
154  __m128 maxValues = _mm_set1_ps(max);
155  __m128 maxValuesIndex = _mm_setzero_ps();
156  __m128 compareResults;
157  __m128 currentValues;
158 
159  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
160  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
161 
162  for(;number < quarterPoints; number++){
163 
164  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
165  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
166 
167  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
168 
169  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
170  _mm_andnot_ps(compareResults, maxValuesIndex));
171 
172  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
173  _mm_andnot_ps(compareResults, maxValues));
174  }
175 
176  // Calculate the largest value from the remaining 4 points
177  _mm_store_ps(maxValuesBuffer, maxValues);
178  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
179 
180  for(number = 0; number < 4; number++){
181  if(maxValuesBuffer[number] > max){
182  index = maxIndexesBuffer[number];
183  max = maxValuesBuffer[number];
184  } else if(maxValuesBuffer[number] == max){
185  if (index > maxIndexesBuffer[number])
186  index = maxIndexesBuffer[number];
187  }
188  }
189 
190  number = quarterPoints * 4;
191  for(;number < num_points; number++){
192  if(src0[number] > max){
193  index = number;
194  max = src0[number];
195  }
196  }
197  target[0] = (uint32_t)index;
198  }
199 }
200 
201 #endif /*LV_HAVE_SSE*/
202 
203 
204 #ifdef LV_HAVE_AVX
205 #include <immintrin.h>
206 
207 static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
208 {
209  if(num_points > 0)
210  {
211  uint32_t number = 0;
212  const uint32_t quarterPoints = num_points / 8;
213 
214  float* inputPtr = (float*)src0;
215 
216  __m256 indexIncrementValues = _mm256_set1_ps(8);
217  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
218 
219  float max = src0[0];
220  float index = 0;
221  __m256 maxValues = _mm256_set1_ps(max);
222  __m256 maxValuesIndex = _mm256_setzero_ps();
223  __m256 compareResults;
224  __m256 currentValues;
225 
226  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
227  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
228 
229  for(;number < quarterPoints; number++)
230  {
231  currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
232  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
233  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
234  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
235  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
236  }
237 
238  // Calculate the largest value from the remaining 8 points
239  _mm256_store_ps(maxValuesBuffer, maxValues);
240  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
241 
242  for(number = 0; number < 8; number++)
243  {
244  if(maxValuesBuffer[number] > max)
245  {
246  index = maxIndexesBuffer[number];
247  max = maxValuesBuffer[number];
248  }
249  else if(maxValuesBuffer[number] == max){
250  if (index > maxIndexesBuffer[number])
251  index = maxIndexesBuffer[number];
252  }
253  }
254 
255  number = quarterPoints * 8;
256  for(;number < num_points; number++)
257  {
258  if(src0[number] > max)
259  {
260  index = number;
261  max = src0[number];
262  }
263  }
264  target[0] = (uint32_t)index;
265  }
266 }
267 
268 #endif /*LV_HAVE_AVX*/
269 
270 
271 #ifdef LV_HAVE_NEON
272 #include <arm_neon.h>
273 
/*!
 * \brief Index of the largest value in a float buffer (NEON kernel).
 *
 * Writes to target[0] the zero-based position of the maximum of
 * src0[0 .. num_points-1]. Equal maxima resolve to the lowest index.
 * When num_points is 0 nothing is read or written.
 *
 * \param target     Output: receives the index of the maximum.
 * \param src0       Input samples.
 * \param num_points Number of floats in src0.
 */
static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
  if(num_points > 0)
  {
    uint32_t number = 0;
    const uint32_t quarterPoints = num_points / 4;

    const float* inputPtr = src0;

    // Lane indexes are tracked as 32-bit integers. A float can only count
    // exactly up to 2^24, which is far less than a uint32_t num_points allows.
    const uint32x4_t indexIncrementValues = vdupq_n_u32(4);
    __VOLK_ATTR_ALIGNED(16) uint32_t currentIndexes_init[4] = { 0, 1, 2, 3 };
    uint32x4_t currentIndexes = vld1q_u32(currentIndexes_init);

    float max = src0[0];
    uint32_t index = 0;
    float32x4_t maxValues = vdupq_n_f32(max);
    uint32x4_t maxValuesIndex = vmovq_n_u32(0);
    uint32x4_t compareResults;
    float32x4_t currentValues;

    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

    for(;number < quarterPoints; number++)
    {
      currentValues = vld1q_f32(inputPtr); inputPtr += 4;
      // Mask is set where the stored maximum survives (current <= max),
      // so ties keep the earlier position.
      compareResults = vcleq_f32(currentValues, maxValues);
      // Keep the old index where the mask is set, take the new one elsewhere
      maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes, compareResults) );
      maxValues = vmaxq_f32(currentValues, maxValues);
      currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
    }

    // Reduce the four per-lane maxima to a single (max, index) pair
    vst1q_f32(maxValuesBuffer, maxValues);
    vst1q_u32(maxIndexesBuffer, maxValuesIndex);
    for(number = 0; number < 4; number++)
    {
      if(maxValuesBuffer[number] > max)
      {
        index = maxIndexesBuffer[number];
        max = maxValuesBuffer[number];
      }
      else if(maxValuesBuffer[number] == max){
        // Tie between lanes: prefer the earliest position
        if (index > maxIndexesBuffer[number])
          index = maxIndexesBuffer[number];
      }
    }

    // Scalar pass over the up-to-three points that did not fill a vector
    number = quarterPoints * 4;
    for(;number < num_points; number++)
    {
      if(src0[number] > max)
      {
        index = number;
        max = src0[number];
      }
    }
    target[0] = index;
  }
}
335 
336 #endif /*LV_HAVE_NEON*/
337 
338 
339 #ifdef LV_HAVE_GENERIC
340 
/*!
 * \brief Portable scalar kernel: index of the largest value in a float buffer.
 *
 * Writes to target[0] the zero-based position of the maximum of
 * src0[0 .. num_points-1]. Only a strictly greater sample replaces the
 * running maximum, so equal maxima resolve to the lowest index.
 * When num_points is 0 nothing is read or written.
 *
 * \param target     Output: receives the index of the maximum.
 * \param src0       Input samples.
 * \param num_points Number of floats in src0.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
  if(num_points == 0){
    return;
  }

  uint32_t bestIndex = 0;
  float best = src0[0];

  for(uint32_t pos = 1; pos < num_points; ++pos){
    const float sample = src0[pos];
    if(sample > best){
      best = sample;
      bestIndex = pos;
    }
  }

  *target = bestIndex;
}
359 
360 #endif /*LV_HAVE_GENERIC*/
361 
362 
363 #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
364 
365 
366 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
367 #define INCLUDED_volk_32f_index_max_32u_u_H
368 
369 #include <volk/volk_common.h>
370 #include <volk/volk_common.h>
371 #include <inttypes.h>
372 #include <stdio.h>
373 
374 
375 #ifdef LV_HAVE_AVX
376 #include <immintrin.h>
377 
378 static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
379 {
380  if(num_points > 0)
381  {
382  uint32_t number = 0;
383  const uint32_t quarterPoints = num_points / 8;
384 
385  float* inputPtr = (float*)src0;
386 
387  __m256 indexIncrementValues = _mm256_set1_ps(8);
388  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
389 
390  float max = src0[0];
391  float index = 0;
392  __m256 maxValues = _mm256_set1_ps(max);
393  __m256 maxValuesIndex = _mm256_setzero_ps();
394  __m256 compareResults;
395  __m256 currentValues;
396 
397  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
398  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
399 
400  for(;number < quarterPoints; number++)
401  {
402  currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
403  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
404  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
405  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
406  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
407  }
408 
409  // Calculate the largest value from the remaining 8 points
410  _mm256_store_ps(maxValuesBuffer, maxValues);
411  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
412 
413  for(number = 0; number < 8; number++)
414  {
415  if(maxValuesBuffer[number] > max)
416  {
417  index = maxIndexesBuffer[number];
418  max = maxValuesBuffer[number];
419  }
420  else if(maxValuesBuffer[number] == max){
421  if (index > maxIndexesBuffer[number])
422  index = maxIndexesBuffer[number];
423  }
424  }
425 
426  number = quarterPoints * 8;
427  for(;number < num_points; number++)
428  {
429  if(src0[number] > max)
430  {
431  index = number;
432  max = src0[number];
433  }
434  }
435  target[0] = (uint32_t)index;
436  }
437 }
438 
439 #endif /*LV_HAVE_AVX*/
440 
441 
442 #ifdef LV_HAVE_SSE4_1
443 #include<smmintrin.h>
444 
445 static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
446 {
447  if(num_points > 0)
448  {
449  uint32_t number = 0;
450  const uint32_t quarterPoints = num_points / 4;
451 
452  float* inputPtr = (float*)src0;
453 
454  __m128 indexIncrementValues = _mm_set1_ps(4);
455  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
456 
457  float max = src0[0];
458  float index = 0;
459  __m128 maxValues = _mm_set1_ps(max);
460  __m128 maxValuesIndex = _mm_setzero_ps();
461  __m128 compareResults;
462  __m128 currentValues;
463 
464  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
465  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
466 
467  for(;number < quarterPoints; number++)
468  {
469  currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
470  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
471  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
472  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
473  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
474  }
475 
476  // Calculate the largest value from the remaining 4 points
477  _mm_store_ps(maxValuesBuffer, maxValues);
478  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
479 
480  for(number = 0; number < 4; number++)
481  {
482  if(maxValuesBuffer[number] > max)
483  {
484  index = maxIndexesBuffer[number];
485  max = maxValuesBuffer[number];
486  }
487  else if(maxValuesBuffer[number] == max){
488  if (index > maxIndexesBuffer[number])
489  index = maxIndexesBuffer[number];
490  }
491  }
492 
493  number = quarterPoints * 4;
494  for(;number < num_points; number++)
495  {
496  if(src0[number] > max)
497  {
498  index = number;
499  max = src0[number];
500  }
501  }
502  target[0] = (uint32_t)index;
503  }
504 }
505 
506 #endif /*LV_HAVE_SSE4_1*/
507 
508 #ifdef LV_HAVE_SSE
509 #include<xmmintrin.h>
510 
511 static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
512 {
513  if(num_points > 0)
514  {
515  uint32_t number = 0;
516  const uint32_t quarterPoints = num_points / 4;
517 
518  float* inputPtr = (float*)src0;
519 
520  __m128 indexIncrementValues = _mm_set1_ps(4);
521  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
522 
523  float max = src0[0];
524  float index = 0;
525  __m128 maxValues = _mm_set1_ps(max);
526  __m128 maxValuesIndex = _mm_setzero_ps();
527  __m128 compareResults;
528  __m128 currentValues;
529 
530  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
531  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
532 
533  for(;number < quarterPoints; number++)
534  {
535  currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
536  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
537  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
538  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
539  _mm_andnot_ps(compareResults, maxValuesIndex));
540  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
541  _mm_andnot_ps(compareResults, maxValues));
542  }
543 
544  // Calculate the largest value from the remaining 4 points
545  _mm_store_ps(maxValuesBuffer, maxValues);
546  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
547 
548  for(number = 0; number < 4; number++)
549  {
550  if(maxValuesBuffer[number] > max)
551  {
552  index = maxIndexesBuffer[number];
553  max = maxValuesBuffer[number];
554  }
555  else if(maxValuesBuffer[number] == max){
556  if (index > maxIndexesBuffer[number])
557  index = maxIndexesBuffer[number];
558  }
559  }
560 
561  number = quarterPoints * 4;
562  for(;number < num_points; number++)
563  {
564  if(src0[number] > max)
565  {
566  index = number;
567  max = src0[number];
568  }
569  }
570  target[0] = (uint32_t)index;
571  }
572 }
573 
574 #endif /*LV_HAVE_SSE*/
575 
576 #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:141
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:274
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:207
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:342
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:46
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:511
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:378