Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
factor.h
1 /*
2  * Copyright 2011-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : common/factor.h
30  */
31 
// Type-building macro.
// Expands to the nested brace initializer of an aggregate that wraps an array
// of three two-element vectors; this file uses it to initialise float32x2x3_t
// temporaries. Pair (xN, yN) becomes element N-1 of the wrapped array.
// It exists because of slight differences between toolchain versions on
// intrinsics.
#define FLOAT32_2x3(x1, y1, x2, y2, x3, y3) \
    { { {x1, y1}, {x2, y2}, {x3, y3} } }
38 
39 // Unit test use this macro to index into their function table
40 // "opc" stands for operation's code (which function),
41 // and "imp" stands for implementation (which implementation of the function)
42 #define FTBL_IDX(opc, imp) ((opc-1)*IMPL_COUNT+(imp-1))
43 
// Measures how long the statements passed as "code" take to run and stores
// the elapsed time, in seconds, in "res". It is used in the unit tests.
// The macro takes two gettimeofday() samples around "code" and subtracts them.
// The expansion site must provide `before`, `after`, `lapsed` and `zone`
// (presumably struct timeval / struct timezone objects - confirm at the caller).
// "code" is pasted verbatim, so it has to carry its own terminating ';'.
#define MEASURE(res, code) \
    { \
        gettimeofday (&before, &zone); \
        code \
        gettimeofday (&after, &zone); \
        /* borrow one second so the microsecond difference cannot go negative */ \
        if (after.tv_usec < before.tv_usec) \
        { \
            after.tv_sec--; \
            after.tv_usec += 1000000; \
        } \
        lapsed.tv_sec = after.tv_sec - before.tv_sec; \
        lapsed.tv_usec = after.tv_usec - before.tv_usec; \
        res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); \
    }
60 
61 // There are several categories of functions that share common code:
62 
63 // Different groups of functions take different number of inputs
64 //
65 // Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
66 // Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
67 // Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
68 //
// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
70 // Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
71 // Group 6 = These take a dst, and a src ("DstSrc")
72 //
73 
74 // The naming convention used in the following macros is as follows:
75 // SNAPP_<A>_OPERATION_<T>_<I>
76 // where
77 // <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
78 // The letter X - if used - means any such operation.
79 // <T> Indicates the type of the operation (float, vec2, etc.)
// The letter X - if used - means any type.
81 // <I> This indicates the implementation (it can be C, ASM, or NEON).
82 
// A few macros to check pointers and their address range to make sure there's
// no unwanted overlap between any two of them.
//
// This one checks the (dst, src) pair found at the expansion site:
//  - dst == src (in-place operation) is accepted;
//  - otherwise the `count` elements starting at the lower pointer must end at
//    or before the higher pointer, or assert() fires.
// The extent is computed with typed pointer arithmetic, i.e. `count` elements
// of each pointer's own element type, so dst and src must point to complete
// object types. The whole check is one braced statement, so it cannot capture
// an `else` that follows the expansion.
#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
    { \
        if ( (const char *)(dst) < (const char *)(src) ) \
        { assert ( (const char *)((dst) + count) <= (const char *)(src) ); } \
        else if ( (const char *)(dst) > (const char *)(src) ) \
        { assert ( (const char *)((src) + count) <= (const char *)(dst) ); } \
    }
90 
// The "DstSrc" group has the same two buffers (dst and src) as "DstSrcCst",
// so it reuses that group's overlap check unchanged.
#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION
92 
// Debug-time overlap check for three buffers.
// For each of the pairs (arg1, arg2), (arg1, arg3) and (arg3, arg2):
//  - identical pointers are accepted;
//  - otherwise the `count` elements starting at the lower pointer must end at
//    or before the higher pointer, or assert() fires.
// `count` is taken from the expansion site. The extent is computed with typed
// pointer arithmetic (count elements of each pointer's own element type), so
// the arguments must be pointers to complete object types. Arguments are
// parenthesised and are evaluated more than once.
#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
    { \
        if ( (const char *)(arg1) < (const char *)(arg2) ) \
        { assert ( (const char *)((arg1) + count) <= (const char *)(arg2) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg2) ) \
        { assert ( (const char *)((arg2) + count) <= (const char *)(arg1) ); } \
        if ( (const char *)(arg1) < (const char *)(arg3) ) \
        { assert ( (const char *)((arg1) + count) <= (const char *)(arg3) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg3) ) \
        { assert ( (const char *)((arg3) + count) <= (const char *)(arg1) ); } \
        if ( (const char *)(arg3) < (const char *)(arg2) ) \
        { assert ( (const char *)((arg3) + count) <= (const char *)(arg2) ); } \
        else if ( (const char *)(arg3) > (const char *)(arg2) ) \
        { assert ( (const char *)((arg2) + count) <= (const char *)(arg3) ); } \
    }
106 
// Debug-time overlap check for four buffers.
// The pairs among arg1, arg2 and arg3 are delegated to
// NE10_CHECKPOINTER_3POINTER_OPERATION; this macro then checks arg4 against
// each of arg1, arg2 and arg3:
//  - identical pointers are accepted;
//  - otherwise the `count` elements starting at the lower pointer must end at
//    or before the higher pointer, or assert() fires.
// `count` is taken from the expansion site. The extent is computed with typed
// pointer arithmetic (count elements of each pointer's own element type), so
// the arguments must be pointers to complete object types.
#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) \
    { \
        NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
        if ( (const char *)(arg1) < (const char *)(arg4) ) \
        { assert ( (const char *)((arg1) + count) <= (const char *)(arg4) ); } \
        else if ( (const char *)(arg1) > (const char *)(arg4) ) \
        { assert ( (const char *)((arg4) + count) <= (const char *)(arg1) ); } \
        if ( (const char *)(arg2) < (const char *)(arg4) ) \
        { assert ( (const char *)((arg2) + count) <= (const char *)(arg4) ); } \
        else if ( (const char *)(arg2) > (const char *)(arg4) ) \
        { assert ( (const char *)((arg4) + count) <= (const char *)(arg2) ); } \
        if ( (const char *)(arg4) < (const char *)(arg3) ) \
        { assert ( (const char *)((arg4) + count) <= (const char *)(arg3) ); } \
        else if ( (const char *)(arg4) > (const char *)(arg3) ) \
        { assert ( (const char *)((arg3) + count) <= (const char *)(arg4) ); } \
    }
121 
122 
123 
// Overlap check for the "DstAccSrcCst" group: runs the three-pointer check on
// the dst, acc and src pointers found at the expansion site.
#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION \
    { NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }
126 
// The "DstCst" group has a single buffer (dst), so there is nothing that could
// overlap: the check expands to an empty block.
#define NE10_CHECKPOINTER_DstCst_OPERATION { }
128 
// Overlap check for the "DstSrc1Src2" group: runs the three-pointer check on
// the dst, src1 and src2 pointers found at the expansion site.
#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION \
    { NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }
131 
// Overlap check for the "DstAccSrc1Src2" group: runs the four-pointer check on
// the dst, acc, src1 and src2 pointers found at the expansion site.
#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION \
    { NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
134 
135 // These macros generalise implementation of the functions.
136 
// Macro used in the C implementations.
// Expands to a complete function body:
//   1. runs checkPointer (one of the NE10_CHECKPOINTER_* macros, presumably);
//   2. executes loopCode once per item, with `itr` running from 0 up to
//      count - 1, so loopCode can index its buffers with `itr`;
//   3. returns `res`, which is NE10_OK unless loopCode assigns to it.
// `count` is taken from the expansion site.
#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    unsigned int itr = 0; \
    checkPointer; \
    for ( itr = 0; itr < count; itr++ ) \
    { \
        loopCode; /* handles exactly one item per pass */ \
    } \
    return res; \
}
147 
148 // macros used in the NEON implementations
149 
150 // Main Loop = The loop where the number of items to be processed is exactly the
151 // number that we can process in a single iteration.
152 //
153 // Secondary Loop = The loop that follows a Main Loop to fill in the entries that
154 // did not fit into the Main Loop. This is needed when the number of
155 // input items is not a multiple of the number of items that we
156 // process in every iteration of the Main Loop.
157 
158 
159 /****************************************************
160  * *
161  * The "DstSrcCst" group of functions *
162  * *
163  ****************************************************/
164 
166 
// Main-loop step of the float "DstSrcCst" kernels.
// Loads four floats from src into n_src, runs loopCode (which is expected to
// leave its result in n_dst), stores n_dst to dst, and advances src and dst
// by 4. n_src and n_dst come from the enclosing OPERATION macro.
#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t *) src); /* fetch four consecutive floats */ \
    src += 4; \
    loopCode; /* caller-supplied operation on the four lanes */ \
    vst1q_f32 ((float32_t *) dst, n_dst); /* write the four results */ \
    dst += 4; \
}
175 
// Second-loop step of the float "DstSrcCst" kernels: handles one leftover
// float when count is not a multiple of 4.
// The value at src is loaded into lane 0 of n_tmp_src, loopCode runs, and
// lane 0 of n_tmp_src is stored to dst - so loopCode has to write its result
// back into n_tmp_src. n_tmp_cst holds `cst` in both lanes for loopCode.
#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_cst = { cst, cst }; \
    n_tmp_src = vld1_lane_f32 ((float32_t *) src, n_tmp_src, 0); /* only lane 0 is loaded */ \
    loopCode; \
    vst1_lane_f32 ((float32_t *) dst, n_tmp_src, 0); /* only lane 0 is stored */ \
    src++; \
    dst++; \
}
186 
// Driver (complete function body) for the float "DstSrcCst" NEON kernels.
// Declares the n_src / n_dst registers used by the main loop, runs
// checkPointer, then executes loopCode1 once per group of four items and
// loopCode2 once per leftover item (count % 4 of them). Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 4; /* 0, 1, 2 or 3 items are left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
205 
207 
// Main-loop step of the 2D-vector "DstSrcCst" kernels.
// Loads four floats (two 2D vectors) from src into n_src, runs loopCode
// (expected to leave its result in n_dst), stores n_dst to dst, and advances
// src and dst by 2.
#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t *) src); /* two vectors at once */ \
    src += 2; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); \
    dst += 2; \
}
215 
// Tail step of the 2D-vector "DstSrcCst" kernels: handles the single vector
// that is left when count is odd.
// Loads two floats from src into n_tmp_src, runs loopCode, and stores
// n_tmp_src to dst - loopCode has to write its result back into n_tmp_src.
// n_tmp_cst holds (cst->x, cst->y). The pointers are not advanced.
#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_src; \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    n_tmp_src = vld1_f32 ((float32_t *) src); \
    loopCode; \
    vst1_f32 ((float32_t *) dst, n_tmp_src); \
}
223 
// Driver (complete function body) for the 2D-vector "DstSrcCst" NEON kernels.
// Builds n_cst = (x, y, x, y) from *cst, declares n_src / n_dst, runs
// checkPointer, executes loopCode1 once per pair of vectors and, when count is
// odd, loopCode2 once for the last vector. Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 2; /* 1 when a single vector is left over */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        loopCode2; \
    } \
    return res; \
}
239 
241 
// Main-loop step of the 3D-vector "DstSrcCst" kernels.
// Loads twelve floats (four 3D vectors) from src into n_src1..n_src3, runs
// loopCode (expected to leave its results in n_dst1..n_dst3), and stores the
// three result registers to dst. src and dst are advanced by four floats
// after each load/store.
// The pointers are stepped through a char* cast: arithmetic on void* is a GNU
// extension and is not valid ISO C.
#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src1 = vld1q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    n_src2 = vld1q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    n_src3 = vld1q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    loopCode; /* The main loop iterates through four 3D vectors each time */ \
    vst1q_f32 ((float32_t *) dst, n_dst1); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_dst2); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_dst3); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
}
257 
// Second-loop step of the 3D-vector "DstSrcCst" kernels: handles one leftover
// vector when count is not a multiple of 4.
// The three floats at src are loaded into lane 0 of the three registers of
// n_tmp_src, loopCode runs, and lane 0 of n_tmp_src is stored to dst - so
// loopCode has to write its result back into n_tmp_src. n_tmp_cst carries
// cst->x, cst->y and cst->z in lane 0 of its three registers.
#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_src = FLOAT32_2x3 ( \
        0.0f, 0.0f, \
        0.0f, 0.0f, \
        0.0f, 0.0f); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
                                (const float32x2_t){cst->y, 0}, \
                                (const float32x2_t){cst->z, 0} }; \
    n_tmp_src = vld3_lane_f32 ((float32_t *) src, n_tmp_src, 0); \
    loopCode; \
    vst3_lane_f32 ((float32_t *) dst, n_tmp_src, 0); \
    src++; \
    dst++; \
}
269 
// Driver (complete function body) for the 3D-vector "DstSrcCst" NEON kernels.
// n_cst1..n_cst3 repeat (x, y, z) of *cst across twelve lanes so they line up
// with four packed 3D vectors. After checkPointer, loopCode1 runs once per
// group of four vectors and loopCode2 once per leftover vector (count % 4 of
// them). Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    float32x4_t n_src1; \
    float32x4_t n_src2; \
    float32x4_t n_src3; \
    float32x4_t n_dst1; \
    float32x4_t n_dst2; \
    float32x4_t n_dst3; \
    checkPointer; \
    int dif = count % 4; /* vectors left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
290 
292 
/* Note that for the VEC4* types, we do not need a second loop: each vector is
   exactly four floats, so the number of input floats is always a multiple of four. */
295 
// Loop step of the 4D-vector "DstSrcCst" kernels: one vector per pass.
// Loads four floats from src into n_src, runs loopCode (expected to leave its
// result in n_dst), stores n_dst to dst, and advances src and dst by 1.
#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t *) src); \
    src++; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); \
    dst++; \
}
303 
// Driver (complete function body) for the 4D-vector "DstSrcCst" NEON kernels.
// Builds n_cst = (x, y, z, w) from *cst, declares n_src / n_dst, runs
// checkPointer and then executes loopCode once per vector. No second loop is
// needed. Returns res (NE10_OK).
// `count` is taken from the expansion site and is counted down to 0.
#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    float32x4_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
315 
316 /****************************************************
317  * *
318  * The "DstAccSrcCst" group of functions *
319  * *
320  ****************************************************/
321 
323 
// Main-loop step of the float "DstAccSrcCst" kernels.
// Loads four floats from acc into n_acc and four from src into n_src, runs
// loopCode (expected to leave its result in n_dst), stores n_dst to dst, and
// advances acc, src and dst by 4.
#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t *) acc); /* four accumulator values */ \
    acc += 4; \
    n_src = vld1q_f32 ((float32_t *) src); /* four source values */ \
    src += 4; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); /* write the four results */ \
    dst += 4; \
}
334 
// Second-loop step of the float "DstAccSrcCst" kernels: handles one leftover
// float when count is not a multiple of 4.
// The values at acc and src are loaded into lane 0 of n_tmp_acc and
// n_tmp_src, loopCode runs, and lane 0 of n_tmp_src is stored to dst - so
// loopCode has to write its result back into n_tmp_src. n_tmp_cst holds `cst`
// in both lanes.
#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_acc = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_cst = { cst, cst }; \
    n_tmp_acc = vld1_lane_f32 ((float32_t *) acc, n_tmp_acc, 0); /* only lane 0 is loaded */ \
    n_tmp_src = vld1_lane_f32 ((float32_t *) src, n_tmp_src, 0); \
    loopCode; \
    vst1_lane_f32 ((float32_t *) dst, n_tmp_src, 0); /* only lane 0 is stored */ \
    acc++; \
    src++; \
    dst++; \
}
348 
// The float "DstAccSrcCst" driver reuses the "DstSrcCst" driver unchanged.
// NOTE(review): that driver declares n_src and n_dst only, whereas
// NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON also assigns n_acc - presumably the
// expansion site declares n_acc; confirm against the callers.
#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
350 
352 
// Main-loop step of the 2D-vector "DstAccSrcCst" kernels.
// Loads four floats (two 2D vectors) from acc into n_acc and from src into
// n_src, runs loopCode (expected to leave its result in n_dst), stores n_dst
// to dst, and advances acc, src and dst by 2.
#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t *) acc); /* two accumulator vectors */ \
    acc += 2; \
    n_src = vld1q_f32 ((float32_t *) src); /* two source vectors */ \
    src += 2; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); \
    dst += 2; \
}
362 
// Tail step of the 2D-vector "DstAccSrcCst" kernels: handles the single
// vector that is left when count is odd.
// Loads two floats from acc into n_tmp_acc and two from src into n_tmp_src,
// runs loopCode, and stores n_tmp_src to dst - loopCode has to write its
// result back into n_tmp_src. n_tmp_cst holds (cst->x, cst->y). The pointers
// are not advanced.
#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_acc; \
    float32x2_t n_tmp_src; \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    n_tmp_acc = vld1_f32 ((float32_t *) acc); \
    n_tmp_src = vld1_f32 ((float32_t *) src); \
    loopCode; \
    vst1_f32 ((float32_t *) dst, n_tmp_src); \
}
372 
// The 2D-vector "DstAccSrcCst" driver reuses the "DstSrcCst" driver unchanged.
// NOTE(review): that driver declares n_cst, n_src and n_dst but not n_acc,
// which NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON assigns - presumably the
// expansion site declares n_acc; confirm against the callers.
#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
374 
376 
// Main-loop step of the 3D-vector "DstAccSrcCst" kernels.
// Loads twelve floats (four 3D vectors) from acc into n_acc1..n_acc3 and from
// src into n_src1..n_src3, runs loopCode (expected to leave its results in
// n_dst1..n_dst3), and stores the three result registers to dst. acc, src and
// dst are advanced by four floats after each load/store.
// The pointers are stepped through a char* cast: arithmetic on void* is a GNU
// extension and is not valid ISO C.
#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_acc1 = vld1q_f32 ((float32_t *) acc); /* Load accumulator values */ \
    acc = (void *) ((char *) acc + 4 * sizeof (ne10_float32_t)); \
    n_acc2 = vld1q_f32 ((float32_t *) acc); \
    acc = (void *) ((char *) acc + 4 * sizeof (ne10_float32_t)); \
    n_acc3 = vld1q_f32 ((float32_t *) acc); \
    acc = (void *) ((char *) acc + 4 * sizeof (ne10_float32_t)); \
    n_src1 = vld1q_f32 ((float32_t *) src); /* Load source values */ \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    n_src2 = vld1q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    n_src3 = vld1q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 4 * sizeof (ne10_float32_t)); \
    loopCode; /* The main loop iterates through four 3D vectors each time */ \
    vst1q_f32 ((float32_t *) dst, n_dst1); /* Store the results back into the memory */ \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_dst2); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_dst3); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
}
398 
// Second-loop step of the 3D-vector "DstAccSrcCst" kernels: handles one
// leftover vector when count is not a multiple of 4.
// The three floats at acc and at src are loaded into lane 0 of the three
// registers of n_tmp_acc and n_tmp_src, loopCode runs, and lane 0 of
// n_tmp_src is stored to dst - so loopCode has to write its result back into
// n_tmp_src. n_tmp_cst carries cst->x, cst->y and cst->z in lane 0 of its
// three registers.
#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_acc = FLOAT32_2x3 ( \
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); \
    float32x2x3_t n_tmp_src = FLOAT32_2x3 ( \
        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
                                (const float32x2_t){cst->y, 0}, \
                                (const float32x2_t){cst->z, 0} }; \
    n_tmp_acc = vld3_lane_f32 ((float32_t *) acc, n_tmp_acc, 0); \
    n_tmp_src = vld3_lane_f32 ((float32_t *) src, n_tmp_src, 0); \
    loopCode; \
    vst3_lane_f32 ((float32_t *) dst, n_tmp_src, 0); \
    acc++; \
    src++; \
    dst++; \
}
421 
// The 3D-vector "DstAccSrcCst" driver reuses the "DstSrcCst" driver unchanged.
// NOTE(review): that driver declares n_cst1..3, n_src1..3 and n_dst1..3 but
// not n_acc1..3, which NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON assigns -
// presumably the expansion site declares them; confirm against the callers.
#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
423 
425 
// Loop step of the 4D-vector "DstAccSrcCst" kernels: one vector per pass.
// Loads four floats from acc into n_acc and four from src into n_src, runs
// loopCode (expected to leave its result in n_dst), stores n_dst to dst, and
// advances acc, src and dst by 1.
#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t *) acc); \
    acc++; \
    n_src = vld1q_f32 ((float32_t *) src); \
    src++; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); \
    dst++; \
}
435 
// The 4D-vector "DstAccSrcCst" driver reuses the "DstSrcCst" driver unchanged.
// NOTE(review): that driver declares n_cst, n_src and n_dst but not n_acc,
// which NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON assigns - presumably the
// expansion site declares n_acc; confirm against the callers.
#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
437 
438 /****************************************************
439  * *
440  * The "DstCst" group of functions *
441  * *
442  ****************************************************/
443 
445 
// Main-loop step of the float "DstCst" kernels.
// Nothing is loaded: loopCode runs, then the four lanes of n_cst are stored
// to dst and dst is advanced by 4. n_cst is not declared by the DstCst float
// driver, so it has to be in scope at the expansion site.
#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_cst); /* write four copies' worth of lanes */ \
    dst += 4; \
}
452 
// Second-loop step of the float "DstCst" kernels: handles one leftover float
// when count is not a multiple of 4.
// n_tmp_cst holds `cst` in both lanes; after loopCode runs, lane 0 of
// n_tmp_cst is stored to dst and dst is advanced by 1.
#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst, cst }; \
    loopCode; \
    vst1_lane_f32 ((float32_t *) dst, n_tmp_cst, 0); /* only lane 0 is stored */ \
    dst++; \
}
460 
// Driver (complete function body) for the float "DstCst" NEON kernels.
// Runs checkPointer, then executes loopCode1 once per group of four items and
// loopCode2 once per leftover item (count % 4 of them). Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    checkPointer; \
    int dif = count % 4; /* 0, 1, 2 or 3 items are left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
477 
479 
480 
// Main-loop step of the 2D-vector "DstCst" kernels.
// Runs loopCode, then stores the four lanes of n_cst (two 2D vectors) to dst
// and advances dst by 2.
#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_cst); /* two vectors at once */ \
    dst += 2; \
}
486 
// Tail step of the 2D-vector "DstCst" kernels: handles the single vector that
// is left when count is odd.
// n_tmp_cst holds (cst->x, cst->y); after loopCode runs it is stored to dst.
// dst is not advanced.
#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
    float32x2_t n_tmp_cst = { cst->x, cst->y }; \
    loopCode; \
    vst1_f32 ((float32_t *) dst, n_tmp_cst); \
}
492 
// Driver (complete function body) for the 2D-vector "DstCst" NEON kernels.
// Builds n_cst = (x, y, x, y) from *cst, runs checkPointer, executes loopCode1
// once per pair of vectors and, when count is odd, loopCode2 once for the
// last vector. Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
    checkPointer; \
    int dif = count % 2; /* 1 when a single vector is left over */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        loopCode2; \
    } \
    return res; \
}
506 
508 
// Main-loop step of the 3D-vector "DstCst" kernels.
// Runs loopCode, then stores n_cst1, n_cst2 and n_cst3 (twelve floats, i.e.
// four 3D vectors) to dst, advancing dst by four floats after each store.
// dst is stepped through a char* cast: arithmetic on void* is a GNU extension
// and is not valid ISO C.
#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
    loopCode; /* The main loop iterates through four 3D vectors each time */ \
    vst1q_f32 ((float32_t *) dst, n_cst1); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_cst2); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
    vst1q_f32 ((float32_t *) dst, n_cst3); \
    dst = (void *) ((char *) dst + 4 * sizeof (ne10_float32_t)); \
}
518 
// Second-loop step of the 3D-vector "DstCst" kernels: handles one leftover
// vector when count is not a multiple of 4.
// n_tmp_cst carries cst->x, cst->y and cst->z in lane 0 of its three
// registers; after loopCode runs, those three lanes are stored to dst and dst
// is advanced by 1.
#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
    float32x2x3_t n_tmp_cst = { (const float32x2_t){cst->x, 0}, \
                                (const float32x2_t){cst->y, 0}, \
                                (const float32x2_t){cst->z, 0} }; \
    loopCode; \
    vst3_lane_f32 ((float32_t *) dst, n_tmp_cst, 0); \
    dst++; \
}
526 
// Driver (complete function body) for the 3D-vector "DstCst" NEON kernels.
// n_cst1..n_cst3 repeat (x, y, z) of *cst across twelve lanes so they line up
// with four packed 3D vectors. After checkPointer, loopCode1 runs once per
// group of four vectors and loopCode2 once per leftover vector (count % 4 of
// them). Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
    float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
    float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
    checkPointer; \
    int dif = count % 4; /* vectors left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
545 
547 
// Loop step of the 4D-vector "DstCst" kernels: one vector per pass.
// Runs loopCode, then stores the four lanes of n_cst to dst and advances dst
// by 1.
#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_cst); \
    dst++; \
}
553 
// Driver (complete function body) for the 4D-vector "DstCst" NEON kernels.
// Builds n_cst = (x, y, z, w) from *cst, runs checkPointer and then executes
// loopCode once per vector. No second loop is needed. Returns res (NE10_OK).
// `count` is taken from the expansion site and is counted down to 0.
#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
563 
564 /****************************************************
565  * *
566  * The "DstSrc1Src2" group of functions *
567  * *
568  ****************************************************/
569 
571 
// Main-loop step of the float "DstSrc1Src2" kernels.
// Loads four floats from src1 into n_src and four from src2 into n_src2, runs
// loopCode (expected to leave its result in n_dst), stores n_dst to dst, and
// advances src1, src2 and dst by 4.
#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t *) src1); /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ((float32_t *) src2); /* four values of the second input */ \
    src2 += 4; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); /* write the four results */ \
    dst += 4; \
}
582 
// Second-loop step of the float "DstSrc1Src2" kernels: handles one leftover
// float when count is not a multiple of 4.
// The values at src1 and src2 are loaded into lane 0 of n_tmp_src and
// n_tmp_src2, loopCode runs, and lane 0 of n_tmp_src is stored to dst - so
// loopCode has to write its result back into n_tmp_src.
// Each lane load is seeded from its own zero-initialised register, matching
// the DstAccSrc1Src2 variant; lane 1 of both temporaries stays 0.0f.
#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
    float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
    n_tmp_src = vld1_lane_f32 ((float32_t *) src1, n_tmp_src, 0); /* only lane 0 is loaded */ \
    n_tmp_src2 = vld1_lane_f32 ((float32_t *) src2, n_tmp_src2, 0); \
    loopCode; /* exceptional cases where the count is not a multiple of 4 */ \
    vst1_lane_f32 ((float32_t *) dst, n_tmp_src, 0); /* only lane 0 is stored */ \
    /* move to the next item in the stream */ \
    src1++; \
    src2++; \
    dst++; \
}
595 
// The float "DstSrc1Src2" driver reuses the "DstSrcCst" driver unchanged.
// NOTE(review): that driver declares n_src and n_dst only, whereas
// NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON also assigns n_src2 - presumably the
// expansion site declares n_src2; confirm against the callers.
#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
597 
598 /****************************************************
599  * *
600  * The "DstAccSrc1Src2" group of functions *
601  * *
602  ****************************************************/
603 
605 
// Main-loop step of the float "DstAccSrc1Src2" kernels.
// Loads four floats each from acc (into n_acc), src1 (into n_src) and src2
// (into n_src2), runs loopCode (expected to leave its result in n_dst),
// stores n_dst to dst, and advances acc, src1, src2 and dst by 4.
#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
    n_acc = vld1q_f32 ((float32_t *) acc); /* four accumulator values */ \
    acc += 4; \
    n_src = vld1q_f32 ((float32_t *) src1); /* four values of the first input */ \
    src1 += 4; \
    n_src2 = vld1q_f32 ((float32_t *) src2); /* four values of the second input */ \
    src2 += 4; \
    loopCode; \
    vst1q_f32 ((float32_t *) dst, n_dst); /* write the four results */ \
    dst += 4; \
}
618 
// Second-loop step of the float "DstAccSrc1Src2" kernels: handles one
// leftover float when count is not a multiple of 4.
// The values at acc, src1 and src2 are loaded into lane 0 of n_tmp_acc,
// n_tmp_src and n_tmp_src2, loopCode runs, and lane 0 of n_tmp_src is stored
// to dst - so loopCode has to write its result back into n_tmp_src.
#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
    float32x2_t n_tmp_acc = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_src = { 0.0f, 0.0f }; \
    float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
    n_tmp_acc = vld1_lane_f32 ((float32_t *) acc, n_tmp_acc, 0); /* only lane 0 is loaded */ \
    n_tmp_src = vld1_lane_f32 ((float32_t *) src1, n_tmp_src, 0); \
    n_tmp_src2 = vld1_lane_f32 ((float32_t *) src2, n_tmp_src2, 0); \
    loopCode; \
    vst1_lane_f32 ((float32_t *) dst, n_tmp_src, 0); /* only lane 0 is stored */ \
    acc++; \
    src1++; \
    src2++; \
    dst++; \
}
634 
// The float "DstAccSrc1Src2" driver reuses the "DstAccSrcCst" float driver,
// which is itself an alias of the "DstSrcCst" one.
// NOTE(review): that driver declares n_src and n_dst only, whereas
// NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON also assigns n_acc and n_src2 -
// presumably the expansion site declares them; confirm against the callers.
#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
636 
637 /****************************************************
638  * *
639  * The "DstSrc" group of functions *
640  * *
641  ****************************************************/
642 
644 
// The float "DstSrc" kernels use the same buffers as "DstSrcCst" (dst and
// src), so all three float macros are plain aliases of that group's macros.
// NOTE(review): NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON reads `cst` to build
// n_tmp_cst, so a DstSrc expansion site still needs a `cst` in scope -
// confirm against the callers.
#define NE10_DstSrc_MAINLOOP_FLOAT_NEON NE10_DstSrcCst_MAINLOOP_FLOAT_NEON

#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON

#define NE10_DstSrc_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
650 
652 
// Main-loop step of the 2D-vector "DstSrc" kernels.
// Loads two 2D vectors from src into n_src with vld2_f32 (a float32x2x2_t in
// the driver), advances src by 2 and runs loopCode. Unlike the other groups,
// nothing is stored here: loopCode must store its results and advance dst.
#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { \
    n_src = vld2_f32 ((float32_t *) src); /* two vectors at once */ \
    src += 2; \
    loopCode; /* also responsible for the store and for moving dst */ \
}
659 
// Tail step of the 2D-vector "DstSrc" kernels, used when count is odd.
// The macro only wraps loopCode in a block: loading, computing and storing
// the last vector are all left to loopCode.
#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { \
    loopCode; /* must store its own result */ \
}
664 
// Driver (complete function body) for the 2D-vector "DstSrc" NEON kernels.
// Declares n_src (float32x2x2_t) and n_dst (float32x2_t), runs checkPointer,
// executes loopCode1 once per pair of vectors and, when count is odd,
// loopCode2 once for the last vector. Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x2x2_t n_src; \
    float32x2_t n_dst; \
    checkPointer; \
    int dif = count % 2; /* 1 when a single vector is left over */ \
    for (; count > dif; count -= 2) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        loopCode2; \
    } \
    return res; \
}
679 
681 
// Main-loop step of the 3D-vector "DstSrc" kernels.
// Loads twelve floats (four 3D vectors) from src into n_src with vld3q_f32
// (a float32x4x3_t in the driver), advances src by twelve floats and runs
// loopCode. Nothing is stored here: loopCode must store its results and
// advance dst.
// src is stepped through a char* cast: arithmetic on void* is a GNU extension
// and is not valid ISO C.
#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { \
    n_src = vld3q_f32 ((float32_t *) src); \
    src = (void *) ((char *) src + 12 * sizeof (ne10_float32_t)); \
    loopCode; /* The main loop iterates through four 3D vectors each time */ \
    /* store the results and increment the destination pointer within the loopCode */ \
}
688 
// Second-loop step of the 3D-vector "DstSrc" kernels, used for the vectors
// left over when count is not a multiple of 4.
// The macro only wraps loopCode in a block: loading, computing and storing
// each leftover vector are all left to loopCode.
#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { \
    loopCode; /* must store its own result */ \
}
693 
// Driver (complete function body) for the 3D-vector "DstSrc" NEON kernels.
// Declares n_src (float32x4x3_t) and n_dst (float32x4_t), runs checkPointer,
// then executes loopCode1 once per group of four vectors and loopCode2 once
// per leftover vector (count % 4 of them). Returns res (NE10_OK).
// `count` is taken from the expansion site and is decremented in place.
#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
    ne10_result_t res = NE10_OK; \
    float32x4x3_t n_src; \
    float32x4_t n_dst; \
    checkPointer; \
    int dif = count % 4; /* vectors left for the second loop */ \
    for (; count > dif; count -= 4) { \
        loopCode1; \
    } \
    if ( dif != 0 ) { \
        unsigned int idx; \
        for ( idx = 0; idx < dif; idx++ ) { \
            loopCode2; \
        } \
    } \
    return res; \
}
711 
713 
/* Note that for the VEC4* types, we do not need a second loop: each vector is
   exactly four floats, so the number of input floats is always a multiple of four. */
716 
// Loop step of the 4D-vector "DstSrc" kernels: one vector per pass.
// Loads four floats from src into n_src, advances src by 1 and runs loopCode.
// Nothing is stored here: loopCode must store its result and advance dst.
#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { \
    n_src = vld1q_f32 ((float32_t *) src); \
    src++; \
    loopCode; /* also responsible for the store and for moving dst */ \
}
723 
// Driver (complete function body) for the 4D-vector "DstSrc" NEON kernels.
// Declares n_src, runs checkPointer and then executes loopCode once per
// vector. No second loop is needed. Returns res (NE10_OK).
// `count` is taken from the expansion site and is counted down to 0.
#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
    ne10_result_t res = NE10_OK; \
    float32x4_t n_src; \
    checkPointer; \
    for (; count != 0; count--) { \
        loopCode; \
    } \
    return res; \
}
733