Vector Optimized Library of Kernels 2.1
Architecture-tuned implementations of math kernels
volk_32f_64f_add_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

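/*!
 * A sketch of the standard VOLK kernel documentation block; the
 * dispatcher prototype and semantics are taken from the
 * implementations below, so treat the details here as illustrative.
 *
 * \page volk_32f_64f_add_64f
 *
 * \b Overview
 *
 * Adds each element of a float vector to the corresponding element of
 * a double vector, storing the result as a double:
 *
 *   cVector[i] = (double)aVector[i] + bVector[i]
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32f_64f_add_64f(double* cVector, const float* aVector,
 *                           const double* bVector, unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li aVector: The input vector of floats.
 * \li bVector: The input vector of doubles.
 * \li num_points: The number of values in each input vector.
 *
 * \b Outputs
 * \li cVector: The output vector of doubles.
 */
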
#ifndef INCLUDED_volk_32f_64f_add_64f_H
#define INCLUDED_volk_32f_64f_add_64f_H

#include <inttypes.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_64f_add_64f_generic(double *cVector,
                                                const float *aVector,
                                                const double *bVector,
                                                unsigned int num_points) {
  double *cPtr = cVector;
  const float *aPtr = aVector;
  const double *bPtr = bVector;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
  }
}

#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32f_64f_add_64f_neon(double *cVector,
                                             const float *aVector,
                                             const double *bVector,
                                             unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int half_points = num_points / 2;

  double *cPtr = cVector;
  const float *aPtr = aVector;
  const double *bPtr = bVector;

  float64x2_t aVal, bVal, cVal;
  float32x2_t aVal1;
  for (number = 0; number < half_points; number++) {
    // Load into NEON registers
    aVal1 = vld1_f32(aPtr);
    bVal = vld1q_f64(bPtr);
    __VOLK_PREFETCH(aPtr + 2);
    __VOLK_PREFETCH(bPtr + 2);
    aPtr += 2; // two floats consumed per iteration
    bPtr += 2;

    // Widen the two floats to two doubles
    aVal = vcvt_f64_f32(aVal1);
    // Vector add
    cVal = vaddq_f64(aVal, bVal);
    // Store the results back into the C container
    vst1q_f64(cPtr, cVal);

    cPtr += 2;
  }

  number = half_points * 2; // equals num_points when num_points is even
  for (; number < num_points; number++) {
    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
  }
}

#endif /* LV_HAVE_NEONV8 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
                                              const float *aVector,
                                              const double *bVector,
                                              unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int eighth_points = num_points / 8;

  double *cPtr = cVector;
  const float *aPtr = aVector;
  const double *bPtr = bVector;

  __m256 aVal;
  __m128 aVal1, aVal2;
  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
  for (; number < eighth_points; number++) {
    // Unaligned loads: 8 floats and 8 doubles per iteration
    aVal = _mm256_loadu_ps(aPtr);
    bVal1 = _mm256_loadu_pd(bPtr);
    bVal2 = _mm256_loadu_pd(bPtr + 4);

    // Split the 8 floats into low and high 128-bit lanes
    aVal1 = _mm256_extractf128_ps(aVal, 0);
    aVal2 = _mm256_extractf128_ps(aVal, 1);

    // Widen each group of 4 floats to 4 doubles
    aDbl1 = _mm256_cvtps_pd(aVal1);
    aDbl2 = _mm256_cvtps_pd(aVal2);

    cVal1 = _mm256_add_pd(aDbl1, bVal1);
    cVal2 = _mm256_add_pd(aDbl2, bVal2);

    // Store the results back into the C container
    _mm256_storeu_pd(cPtr, cVal1);
    _mm256_storeu_pd(cPtr + 4, cVal2);

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  number = eighth_points * 8;
  for (; number < num_points; number++) {
    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
  }
}

#endif /* LV_HAVE_AVX */

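/* The _a_ (aligned) variant below mirrors the unaligned version above
 * but uses aligned loads and stores (_mm256_load_ps, _mm256_load_pd,
 * _mm256_store_pd), which require all three buffers to be 32-byte
 * aligned, e.g. allocated with volk_malloc(). */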
#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_add_64f_a_avx(double *cVector,
                                              const float *aVector,
                                              const double *bVector,
                                              unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int eighth_points = num_points / 8;

  double *cPtr = cVector;
  const float *aPtr = aVector;
  const double *bPtr = bVector;

  __m256 aVal;
  __m128 aVal1, aVal2;
  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
  for (; number < eighth_points; number++) {
    // Aligned loads: 8 floats and 8 doubles per iteration
    aVal = _mm256_load_ps(aPtr);
    bVal1 = _mm256_load_pd(bPtr);
    bVal2 = _mm256_load_pd(bPtr + 4);

    // Split the 8 floats into low and high 128-bit lanes
    aVal1 = _mm256_extractf128_ps(aVal, 0);
    aVal2 = _mm256_extractf128_ps(aVal, 1);

    // Widen each group of 4 floats to 4 doubles
    aDbl1 = _mm256_cvtps_pd(aVal1);
    aDbl2 = _mm256_cvtps_pd(aVal2);

    cVal1 = _mm256_add_pd(aDbl1, bVal1);
    cVal2 = _mm256_add_pd(aDbl2, bVal2);

    // Store the results back into the C container
    _mm256_store_pd(cPtr, cVal1);
    _mm256_store_pd(cPtr + 4, cVal2);

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  number = eighth_points * 8;
  for (; number < num_points; number++) {
    *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
  }
}

#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_64f_add_64f_H */
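
For reference, a minimal sketch of calling this kernel through the VOLK dispatcher. It assumes libvolk's volk.h, volk_malloc(), volk_get_alignment(), and volk_free(); the buffer contents are illustrative.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 10;
    const size_t alignment = volk_get_alignment();
    float *a = (float *)volk_malloc(sizeof(float) * num_points, alignment);
    double *b = (double *)volk_malloc(sizeof(double) * num_points, alignment);
    double *c = (double *)volk_malloc(sizeof(double) * num_points, alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = (float)i; /* ramp: 0, 1, 2, ... */
        b[i] = 0.5;      /* constant offset */
    }

    /* Dispatches to the best available implementation (generic, NEON, AVX). */
    volk_32f_64f_add_64f(c, a, b, num_points);

    for (unsigned int i = 0; i < num_points; i++)
        printf("c[%u] = %f\n", i, c[i]);

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}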