Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
72 #define INCLUDED_volk_32f_index_max_16u_a_H
73 
74 #include <volk/volk_common.h>
75 #include <volk/volk_common.h>
76 #include <inttypes.h>
77 #include <limits.h>
78 #include <stdio.h>
79 
80 #ifdef LV_HAVE_AVX
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
85  uint32_t num_points)
86 {
87  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
88 
89  uint32_t number = 0;
90  const uint32_t eighthPoints = num_points / 8;
91 
92  float* inputPtr = (float*)src0;
93 
94  __m256 indexIncrementValues = _mm256_set1_ps(8);
95  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
96 
97  float max = src0[0];
98  float index = 0;
99  __m256 maxValues = _mm256_set1_ps(max);
100  __m256 maxValuesIndex = _mm256_setzero_ps();
101  __m256 compareResults;
102  __m256 currentValues;
103 
104  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
105  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
106 
107  for(;number < eighthPoints; number++){
108 
109  currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
110  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
111 
112  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
113 
114  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
115  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
116  }
117 
118  // Calculate the largest value from the remaining 4 points
119  _mm256_store_ps(maxValuesBuffer, maxValues);
120  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
121 
122  for(number = 0; number < 8; number++){
123  if(maxValuesBuffer[number] > max){
124  index = maxIndexesBuffer[number];
125  max = maxValuesBuffer[number];
126  } else if(maxValuesBuffer[number] == max){
127  if (index > maxIndexesBuffer[number])
128  index = maxIndexesBuffer[number];
129  }
130  }
131 
132  number = eighthPoints * 8;
133  for(;number < num_points; number++){
134  if(src0[number] > max){
135  index = number;
136  max = src0[number];
137  }
138  }
139  target[0] = (uint16_t)index;
140 }
141 
142 #endif /*LV_HAVE_AVX*/
143 
144 #ifdef LV_HAVE_SSE4_1
145 #include <smmintrin.h>
146 
147 static inline void
148 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
149  uint32_t num_points)
150 {
151  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
152 
153  uint32_t number = 0;
154  const uint32_t quarterPoints = num_points / 4;
155 
156  float* inputPtr = (float*)src0;
157 
158  __m128 indexIncrementValues = _mm_set1_ps(4);
159  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
160 
161  float max = src0[0];
162  float index = 0;
163  __m128 maxValues = _mm_set1_ps(max);
164  __m128 maxValuesIndex = _mm_setzero_ps();
165  __m128 compareResults;
166  __m128 currentValues;
167 
168  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
169  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
170 
171  for(;number < quarterPoints; number++){
172 
173  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
174  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
175 
176  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
177 
178  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
179  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
180  }
181 
182  // Calculate the largest value from the remaining 4 points
183  _mm_store_ps(maxValuesBuffer, maxValues);
184  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
185 
186  for(number = 0; number < 4; number++){
187  if(maxValuesBuffer[number] > max){
188  index = maxIndexesBuffer[number];
189  max = maxValuesBuffer[number];
190  } else if(maxValuesBuffer[number] == max){
191  if (index > maxIndexesBuffer[number])
192  index = maxIndexesBuffer[number];
193  }
194  }
195 
196  number = quarterPoints * 4;
197  for(;number < num_points; number++){
198  if(src0[number] > max){
199  index = number;
200  max = src0[number];
201  }
202  }
203  target[0] = (uint16_t)index;
204 }
205 
206 #endif /*LV_HAVE_SSE4_1*/
207 
208 
209 #ifdef LV_HAVE_SSE
210 
211 #include <xmmintrin.h>
212 
213 static inline void
214 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
215  uint32_t num_points)
216 {
217  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
218 
219  uint32_t number = 0;
220  const uint32_t quarterPoints = num_points / 4;
221 
222  float* inputPtr = (float*)src0;
223 
224  __m128 indexIncrementValues = _mm_set1_ps(4);
225  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
226 
227  float max = src0[0];
228  float index = 0;
229  __m128 maxValues = _mm_set1_ps(max);
230  __m128 maxValuesIndex = _mm_setzero_ps();
231  __m128 compareResults;
232  __m128 currentValues;
233 
234  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
235  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
236 
237  for(;number < quarterPoints; number++){
238 
239  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
240  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
241 
242  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
243 
244  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
245  _mm_andnot_ps(compareResults, maxValuesIndex));
246  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
247  _mm_andnot_ps(compareResults, maxValues));
248  }
249 
250  // Calculate the largest value from the remaining 4 points
251  _mm_store_ps(maxValuesBuffer, maxValues);
252  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
253 
254  for(number = 0; number < 4; number++){
255  if(maxValuesBuffer[number] > max){
256  index = maxIndexesBuffer[number];
257  max = maxValuesBuffer[number];
258  } else if(maxValuesBuffer[number] == max){
259  if (index > maxIndexesBuffer[number])
260  index = maxIndexesBuffer[number];
261  }
262  }
263 
264  number = quarterPoints * 4;
265  for(;number < num_points; number++){
266  if(src0[number] > max){
267  index = number;
268  max = src0[number];
269  }
270  }
271  target[0] = (uint16_t)index;
272 }
273 
274 #endif /*LV_HAVE_SSE*/
275 
276 
277 #ifdef LV_HAVE_GENERIC
278 
/*
 * Writes to target[0] the index of the largest value in src0 (portable C).
 *
 * target     - receives the 16-bit index of the maximum.
 * src0       - input samples.
 * num_points - number of samples; only the first USHRT_MAX are examined
 *              because the result has to fit in a uint16_t.
 *
 * When several samples share the maximum value the lowest index is reported.
 * For num_points == 0 the input is not read and index 0 is reported.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
                               uint32_t num_points)
{
  /* Empty input: src0[0] may not exist, so do not touch it. */
  if(num_points == 0){
    target[0] = 0;
    return;
  }

  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  float max = src0[0];
  uint16_t index = 0;

  uint32_t i = 1;

  for(; i < num_points; ++i) {
    if(src0[i] > max) {
      /* i < USHRT_MAX after the clamp above, so the narrowing is lossless. */
      index = (uint16_t)i;
      max = src0[i];
    }
  }
  target[0] = index;
}
298 
299 #endif /*LV_HAVE_GENERIC*/
300 
301 
302 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
303 
304 
305 
306 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
307 #define INCLUDED_volk_32f_index_max_16u_u_H
308 
309 #include <volk/volk_common.h>
310 #include <volk/volk_common.h>
311 #include <inttypes.h>
312 #include <limits.h>
313 #include <stdio.h>
314 
315 #ifdef LV_HAVE_AVX
316 #include <immintrin.h>
317 
318 static inline void
319 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0,
320  uint32_t num_points)
321 {
322  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
323 
324  uint32_t number = 0;
325  const uint32_t eighthPoints = num_points / 8;
326 
327  float* inputPtr = (float*)src0;
328 
329  __m256 indexIncrementValues = _mm256_set1_ps(8);
330  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
331 
332  float max = src0[0];
333  float index = 0;
334  __m256 maxValues = _mm256_set1_ps(max);
335  __m256 maxValuesIndex = _mm256_setzero_ps();
336  __m256 compareResults;
337  __m256 currentValues;
338 
339  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
340  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
341 
342  for(;number < eighthPoints; number++){
343 
344  currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
345  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
346 
347  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
348 
349  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
350  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
351  }
352 
353  // Calculate the largest value from the remaining 4 points
354  _mm256_storeu_ps(maxValuesBuffer, maxValues);
355  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
356 
357  for(number = 0; number < 8; number++){
358  if(maxValuesBuffer[number] > max){
359  index = maxIndexesBuffer[number];
360  max = maxValuesBuffer[number];
361  } else if(maxValuesBuffer[number] == max){
362  if (index > maxIndexesBuffer[number])
363  index = maxIndexesBuffer[number];
364  }
365  }
366 
367  number = eighthPoints * 8;
368  for(;number < num_points; number++){
369  if(src0[number] > max){
370  index = number;
371  max = src0[number];
372  }
373  }
374  target[0] = (uint16_t)index;
375 }
376 
377 #endif /*LV_HAVE_AVX*/
378 
379 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:280
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:319
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:214
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:46
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:84