Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_float32.neonintrinsic.c
1 /*
2  * Copyright 2014-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fft_float32.neonintrinsic.c
30  */
31 
32 #include <arm_neon.h>
33 
34 #include "NE10_types.h"
35 #include "NE10_macros.h"
36 #include "NE10_fft.h"
37 #include "NE10_dsp.h"
38 
39 static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout,
40  ne10_fft_cpx_float32_t * Fin)
41 {
42  ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
43  ne10_float32_t tmp_r, tmp_i;
44 
45  s2_r = Fin[0].r - Fin[2].r;
46  s2_i = Fin[0].i - Fin[2].i;
47 
48  tmp_r = Fin[0].r + Fin[2].r;
49  tmp_i = Fin[0].i + Fin[2].i;
50 
51  s0_r = Fin[1].r + Fin[3].r;
52  s0_i = Fin[1].i + Fin[3].i;
53 
54  s1_r = Fin[1].r - Fin[3].r;
55  s1_i = Fin[1].i - Fin[3].i;
56  Fout[2].r = tmp_r - s0_r;
57  Fout[2].i = tmp_i - s0_i;
58  Fout[0].r = tmp_r + s0_r;
59  Fout[0].i = tmp_i + s0_i;
60 
61  Fout[1].r = s2_r + s1_i;
62  Fout[1].i = s2_i - s1_r;
63  Fout[3].r = s2_r - s1_i;
64  Fout[3].i = s2_i + s1_r;
65 }
66 
67 static inline void ne10_fft4_backward_float32 (ne10_fft_cpx_float32_t * Fout,
68  ne10_fft_cpx_float32_t * Fin)
69 {
70  ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
71  ne10_float32_t tmp_r, tmp_i;
72 
73  s2_r = Fin[0].r - Fin[2].r;
74  s2_i = Fin[0].i - Fin[2].i;
75 
76  tmp_r = Fin[0].r + Fin[2].r;
77  tmp_i = Fin[0].i + Fin[2].i;
78 
79  s0_r = Fin[1].r + Fin[3].r;
80  s0_i = Fin[1].i + Fin[3].i;
81 
82  s1_r = Fin[1].r - Fin[3].r;
83  s1_i = Fin[1].i - Fin[3].i;
84  Fout[2].r = (tmp_r - s0_r) * 0.25f;
85  Fout[2].i = (tmp_i - s0_i) * 0.25f;
86  Fout[0].r = (tmp_r + s0_r) * 0.25f;
87  Fout[0].i = (tmp_i + s0_i) * 0.25f;
88 
89  Fout[1].r = (s2_r - s1_i) * 0.25f;
90  Fout[1].i = (s2_i + s1_r) * 0.25f;
91  Fout[3].r = (s2_r + s1_i) * 0.25f;
92  Fout[3].i = (s2_i - s1_r) * 0.25f;
93 }
94 
95 
96 static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t * Fout,
97  ne10_fft_cpx_float32_t * Fin)
98 {
99  ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
100  ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
101  const ne10_float32_t TW_81 = 0.70710678;
102 
103  s0_r = Fin[0].r + Fin[4].r;
104  s0_i = Fin[0].i + Fin[4].i;
105  s1_r = Fin[0].r - Fin[4].r;
106  s1_i = Fin[0].i - Fin[4].i;
107  s2_r = Fin[1].r + Fin[5].r;
108  s2_i = Fin[1].i + Fin[5].i;
109  s3_r = Fin[1].r - Fin[5].r;
110  s3_i = Fin[1].i - Fin[5].i;
111  s4_r = Fin[2].r + Fin[6].r;
112  s4_i = Fin[2].i + Fin[6].i;
113  s5_r = Fin[2].r - Fin[6].r;
114  s5_i = Fin[2].i - Fin[6].i;
115  s6_r = Fin[3].r + Fin[7].r;
116  s6_i = Fin[3].i + Fin[7].i;
117  s7_r = Fin[3].r - Fin[7].r;
118  s7_i = Fin[3].i - Fin[7].i;
119 
120  t0_r = s0_r - s4_r;
121  t0_i = s0_i - s4_i;
122  t1_r = s0_r + s4_r;
123  t1_i = s0_i + s4_i;
124  t2_r = s2_r + s6_r;
125  t2_i = s2_i + s6_i;
126  t3_r = s2_r - s6_r;
127  t3_i = s2_i - s6_i;
128  Fout[0].r = t1_r + t2_r;
129  Fout[0].i = t1_i + t2_i;
130  Fout[4].r = t1_r - t2_r;
131  Fout[4].i = t1_i - t2_i;
132  Fout[2].r = t0_r + t3_i;
133  Fout[2].i = t0_i - t3_r;
134  Fout[6].r = t0_r - t3_i;
135  Fout[6].i = t0_i + t3_r;
136 
137  t4_r = (s3_r + s3_i) * TW_81;
138  t4_i = - (s3_r - s3_i) * TW_81;
139  t5_r = (s7_r - s7_i) * TW_81;
140  t5_i = (s7_r + s7_i) * TW_81;
141 
142  t0_r = s1_r - s5_i;
143  t0_i = s1_i + s5_r;
144  t1_r = s1_r + s5_i;
145  t1_i = s1_i - s5_r;
146  t2_r = t4_r - t5_r;
147  t2_i = t4_i - t5_i;
148  t3_r = t4_r + t5_r;
149  t3_i = t4_i + t5_i;
150  Fout[1].r = t1_r + t2_r;
151  Fout[1].i = t1_i + t2_i;
152  Fout[5].r = t1_r - t2_r;
153  Fout[5].i = t1_i - t2_i;
154  Fout[3].r = t0_r + t3_i;
155  Fout[3].i = t0_i - t3_r;
156  Fout[7].r = t0_r - t3_i;
157  Fout[7].i = t0_i + t3_r;
158 }
159 
160 static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
161  ne10_fft_cpx_float32_t * Fin)
162 {
163  ne10_float32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
164  ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
165  const ne10_float32_t TW_81 = 0.70710678;
166 
167  s0_r = Fin[0].r + Fin[4].r;
168  s0_i = Fin[0].i + Fin[4].i;
169  s1_r = Fin[0].r - Fin[4].r;
170  s1_i = Fin[0].i - Fin[4].i;
171  s2_r = Fin[1].r + Fin[5].r;
172  s2_i = Fin[1].i + Fin[5].i;
173  s3_r = Fin[1].r - Fin[5].r;
174  s3_i = Fin[1].i - Fin[5].i;
175  s4_r = Fin[2].r + Fin[6].r;
176  s4_i = Fin[2].i + Fin[6].i;
177  s5_r = Fin[2].r - Fin[6].r;
178  s5_i = Fin[2].i - Fin[6].i;
179  s6_r = Fin[3].r + Fin[7].r;
180  s6_i = Fin[3].i + Fin[7].i;
181  s7_r = Fin[3].r - Fin[7].r;
182  s7_i = Fin[3].i - Fin[7].i;
183 
184  t0_r = s0_r - s4_r;
185  t0_i = s0_i - s4_i;
186  t1_r = s0_r + s4_r;
187  t1_i = s0_i + s4_i;
188  t2_r = s2_r + s6_r;
189  t2_i = s2_i + s6_i;
190  t3_r = s2_r - s6_r;
191  t3_i = s2_i - s6_i;
192  Fout[0].r = (t1_r + t2_r) * 0.125f;
193  Fout[0].i = (t1_i + t2_i) * 0.125f;
194  Fout[4].r = (t1_r - t2_r) * 0.125f;
195  Fout[4].i = (t1_i - t2_i) * 0.125f;
196  Fout[2].r = (t0_r - t3_i) * 0.125f;
197  Fout[2].i = (t0_i + t3_r) * 0.125f;
198  Fout[6].r = (t0_r + t3_i) * 0.125f;
199  Fout[6].i = (t0_i - t3_r) * 0.125f;
200 
201  t4_r = (s3_r - s3_i) * TW_81;
202  t4_i = (s3_r + s3_i) * TW_81;
203  t5_r = (s7_r + s7_i) * TW_81;
204  t5_i = - (s7_r - s7_i) * TW_81;
205 
206  t0_r = s1_r + s5_i;
207  t0_i = s1_i - s5_r;
208  t1_r = s1_r - s5_i;
209  t1_i = s1_i + s5_r;
210  t2_r = t4_r - t5_r;
211  t2_i = t4_i - t5_i;
212  t3_r = t4_r + t5_r;
213  t3_i = t4_i + t5_i;
214  Fout[1].r = (t1_r + t2_r) * 0.125f;
215  Fout[1].i = (t1_i + t2_i) * 0.125f;
216  Fout[5].r = (t1_r - t2_r) * 0.125f;
217  Fout[5].i = (t1_i - t2_i) * 0.125f;
218  Fout[3].r = (t0_r - t3_i) * 0.125f;
219  Fout[3].i = (t0_i + t3_r) * 0.125f;
220  Fout[7].r = (t0_r + t3_i) * 0.125f;
221  Fout[7].i = (t0_i - t3_r) * 0.125f;
222 }
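The backward (inverse) kernels above fold the 1/nfft normalisation of the IFFT directly into the final butterfly: 0.25f for the 4-point kernel, 0.125f for the 8-point kernel, and 0.0625f in the 16-point kernel further below. A minimal scalar sketch of where these constants come from (illustrative helper only, not part of the library):

/* Illustration: the fixed scale factors used by the backward kernels are
 * simply 1.0f / nfft, so that a forward transform followed by a backward
 * transform reproduces the original input. */
static inline float ne10_example_ifft_scale (int nfft)
{
    return 1.0f / (float) nfft; /* 4 -> 0.25f, 8 -> 0.125f, 16 -> 0.0625f */
}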
223 
224 static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
225  ne10_fft_cpx_float32_t * Fin,
226  ne10_fft_cpx_float32_t * twiddles)
227 {
228  ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;
229 
230  // the first stage
231  float32_t *p_src0, *p_src4, *p_src8, *p_src12;
232  float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
233  float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
234  float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
235  float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
236  p_src0 = (float32_t*) (& (Fin[0]));
237  p_src4 = (float32_t*) (& (Fin[4]));
238  p_src8 = (float32_t*) (& (Fin[8]));
239  p_src12 = (float32_t*) (& (Fin[12]));
240  q2_in_0123 = vld2q_f32 (p_src0);
241  q2_in_4567 = vld2q_f32 (p_src4);
242  q2_in_89ab = vld2q_f32 (p_src8);
243  q2_in_cdef = vld2q_f32 (p_src12);
244 
245  q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
246  q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
247  q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
248  q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
249 
250  q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
251  q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
252  q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
253  q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
254 
255  q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
256  q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
257  q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
258  q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
259  q_out_r159d = vaddq_f32 (q_t2_r, q_t1_i);
260  q_out_i159d = vsubq_f32 (q_t2_i, q_t1_r);
261  q_out_r37bf = vsubq_f32 (q_t2_r, q_t1_i);
262  q_out_i37bf = vaddq_f32 (q_t2_i, q_t1_r);
263 
264  // the second stage
265  float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
266  float32_t *p_tw1, *p_tw2, *p_tw3;
267  float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
268  float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
269  float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
270  float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
271  float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
272  float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
273  float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
274  tw1 = twiddles;
275  tw2 = twiddles + 4;
276  tw3 = twiddles + 8;
277  p_dst0 = (float32_t*) (&Fout[0]);
278  p_dst1 = (float32_t*) (&Fout[4]);
279  p_dst2 = (float32_t*) (&Fout[8]);
280  p_dst3 = (float32_t*) (&Fout[12]);
281  p_tw1 = (float32_t*) tw1;
282  p_tw2 = (float32_t*) tw2;
283  p_tw3 = (float32_t*) tw3;
284  q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
285  q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
286  q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
287  q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
288  q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
289  q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
290  q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
291  q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
292  q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
293  q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
294  q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
295  q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
296  q2_tw1 = vld2q_f32 (p_tw1);
297  q2_tw2 = vld2q_f32 (p_tw2);
298  q2_tw3 = vld2q_f32 (p_tw3);
299 
300  q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
301  q_s0_i = vmulq_f32 (q_in_r4567, q2_tw1.val[1]);
302  q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
303  q_s1_i = vmulq_f32 (q_in_r89ab, q2_tw2.val[1]);
304  q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
305  q_s2_i = vmulq_f32 (q_in_rcdef, q2_tw3.val[1]);
306  q_s0_r = vmlsq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
307  q_s0_i = vmlaq_f32 (q_s0_i, q_in_i4567, q2_tw1.val[0]);
308  q_s1_r = vmlsq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
309  q_s1_i = vmlaq_f32 (q_s1_i, q_in_i89ab, q2_tw2.val[0]);
310  q_s2_r = vmlsq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
311  q_s2_i = vmlaq_f32 (q_s2_i, q_in_icdef, q2_tw3.val[0]);
312 
313 
314  q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
315  q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
316  q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
317  q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);
318 
319  q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
320  q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
321  q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
322  q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);
323 
324  q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
325  q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
326  q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
327  q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);
328 
329  q2_out_4567.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
330  q2_out_4567.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
331  q2_out_cdef.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
332  q2_out_cdef.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
333 
334  vst2q_f32 (p_dst0, q2_out_0123);
335  vst2q_f32 (p_dst1, q2_out_4567);
336  vst2q_f32 (p_dst2, q2_out_89ab);
337  vst2q_f32 (p_dst3, q2_out_cdef);
338 }
339 
340 static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
341  ne10_fft_cpx_float32_t * Fin,
342  ne10_fft_cpx_float32_t * twiddles)
343 {
344  ne10_fft_cpx_float32_t *tw1, *tw2, *tw3;
345 
346  // the first stage
347  float32_t *p_src0, *p_src4, *p_src8, *p_src12;
348  float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef;
349  float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i;
350  float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d;
351  float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf;
352  p_src0 = (float32_t*) (& (Fin[0]));
353  p_src4 = (float32_t*) (& (Fin[4]));
354  p_src8 = (float32_t*) (& (Fin[8]));
355  p_src12 = (float32_t*) (& (Fin[12]));
356  q2_in_0123 = vld2q_f32 (p_src0);
357  q2_in_4567 = vld2q_f32 (p_src4);
358  q2_in_89ab = vld2q_f32 (p_src8);
359  q2_in_cdef = vld2q_f32 (p_src12);
360 
361  q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
362  q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
363  q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]);
364  q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]);
365 
366  q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
367  q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
368  q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]);
369  q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]);
370 
371  q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r);
372  q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i);
373  q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r);
374  q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i);
375  q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i);
376  q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r);
377  q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i);
378  q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r);
379 
380  // the second stage
381  float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3;
382  float32_t *p_tw1, *p_tw2, *p_tw3;
383  float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i;
384  float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i;
385  float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3;
386  float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef;
387  float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef;
388  float32x4x2_t q2_tw1, q2_tw2, q2_tw3;
389  float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef;
390  float32x4_t q_one_by_nfft;
391  tw1 = twiddles;
392  tw2 = twiddles + 4;
393  tw3 = twiddles + 8;
394  p_dst0 = (float32_t*) (&Fout[0]);
395  p_dst1 = (float32_t*) (&Fout[4]);
396  p_dst2 = (float32_t*) (&Fout[8]);
397  p_dst3 = (float32_t*) (&Fout[12]);
398  p_tw1 = (float32_t*) tw1;
399  p_tw2 = (float32_t*) tw2;
400  p_tw3 = (float32_t*) tw3;
401  q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d);
402  q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d);
403  q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf);
404  q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf);
405  q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0]));
406  q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0]));
407  q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0]));
408  q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0]));
409  q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1]));
410  q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1]));
411  q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1]));
412  q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1]));
413  q2_tw1 = vld2q_f32 (p_tw1);
414  q2_tw2 = vld2q_f32 (p_tw2);
415  q2_tw3 = vld2q_f32 (p_tw3);
416 
417  q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]);
418  q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]);
419  q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]);
420  q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]);
421  q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]);
422  q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]);
423  q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]);
424  q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]);
425  q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]);
426  q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]);
427  q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]);
428  q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]);
429 
430  q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r);
431  q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i);
432  q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r);
433  q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i);
434 
435  q_s3_r = vaddq_f32 (q_s0_r, q_s2_r);
436  q_s3_i = vaddq_f32 (q_s0_i, q_s2_i);
437  q_s4_r = vsubq_f32 (q_s0_r, q_s2_r);
438  q_s4_i = vsubq_f32 (q_s0_i, q_s2_i);
439 
440  q_one_by_nfft = vdupq_n_f32 (0.0625f);
441  q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r);
442  q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i);
443  q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r);
444  q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i);
445 
446  q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i);
447  q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r);
448  q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i);
449  q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r);
450 
451  q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft);
452  q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft);
453  q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft);
454  q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft);
455  q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft);
456  q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft);
457  q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft);
458  q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft);
459 
460  vst2q_f32 (p_dst0, q2_out_0123);
461  vst2q_f32 (p_dst1, q2_out_4567);
462  vst2q_f32 (p_dst2, q2_out_89ab);
463  vst2q_f32 (p_dst3, q2_out_cdef);
464 }
465 
466 static inline void ne10_radix8x4_neon (ne10_fft_cpx_float32_t * Fout,
467  ne10_fft_cpx_float32_t * Fin,
468  ne10_int32_t stride)
469 {
470  ne10_int32_t f_count;
471  ne10_int32_t src_step = stride << 1;
472 
473  const ne10_float32_t TW_81 = 0.70710678;
474  const ne10_float32_t TW_81N = -0.70710678;
475 
476  float32_t *p_src, *p_dst;
477  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
478  float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
479  float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
480  float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
481  float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
482  float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
483  float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
484  float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
485  float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
486  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
487  float32x4_t q_tw_81, q_tw_81n;
488 
489  p_src = (float32_t *) Fin;
490  p_dst = (float32_t *) Fout;
491 
492  for (f_count = 0; f_count < stride; f_count += 4)
493  {
494  q2_in0 = vld2q_f32 (p_src);
495  p_src += src_step;
496  q2_in2 = vld2q_f32 (p_src);
497  p_src += src_step;
498  q2_in4 = vld2q_f32 (p_src);
499  p_src += src_step;
500  q2_in6 = vld2q_f32 (p_src);
501  p_src += src_step;
502  q2_in1 = vld2q_f32 (p_src);
503  p_src += src_step;
504  q2_in3 = vld2q_f32 (p_src);
505  p_src += src_step;
506  q2_in5 = vld2q_f32 (p_src);
507  p_src += src_step;
508  q2_in7 = vld2q_f32 (p_src);
509  p_src += src_step;
510 
511  q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
512  q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
513  q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
514  q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
515  q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
516  q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
517  q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
518  q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
519  q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
520  q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
521  q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
522  q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
523  q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
524  q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
525  q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
526  q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);
527 
528  // radix 4 butterfly without twiddles
529  q_tw_81 = vdupq_n_f32 (TW_81);
530  q_tw_81n = vdupq_n_f32 (TW_81N);
531  q_s5_r = q_sin5_i;
532  q_s5_i = vnegq_f32 (q_sin5_r);
533  q_s3_r = vaddq_f32 (q_sin3_r, q_sin3_i);
534  q_s3_i = vsubq_f32 (q_sin3_i, q_sin3_r);
535  q_s7_r = vsubq_f32 (q_sin7_r, q_sin7_i);
536  q_s7_i = vaddq_f32 (q_sin7_i, q_sin7_r);
537  q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
538  q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
539  q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
540  q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);
541 
542  // radix 2 butterfly
543  q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
544  q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
545  q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
546  q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
547  q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
548  q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
549  q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
550  q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);
551 
552  // radix 2 butterfly
553  q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
554  q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
555  q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
556  q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
557  q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
558  q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
559  q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
560  q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);
561 
562  // third result
563  q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
564  q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
565  q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
566  q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
567 
568  // first result
569  q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
570  q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
571  q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
572  q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
573 
574  // second result
575  q_out2_r = vaddq_f32 (q_s10_r, q_s14_i);
576  q_out2_i = vsubq_f32 (q_s10_i, q_s14_r);
577  q_out3_r = vaddq_f32 (q_s11_r, q_s15_i);
578  q_out3_i = vsubq_f32 (q_s11_i, q_s15_r);
579 
580  // fourth result
581  q_out6_r = vsubq_f32 (q_s10_r, q_s14_i);
582  q_out6_i = vaddq_f32 (q_s10_i, q_s14_r);
583  q_out7_r = vsubq_f32 (q_s11_r, q_s15_i);
584  q_out7_i = vaddq_f32 (q_s11_i, q_s15_r);
585 
586  q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
587  q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
588  q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
589  q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
590  q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
591  q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
592  q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
593  q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
594 
595  q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
596  q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
597  q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
598  q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
599  q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
600  q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
601  q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
602  q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
603 
604  q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
605  q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
606  q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
607  q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
608  q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
609  q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
610  q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
611  q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));
612 
613  // store
614  vst2q_f32 (p_dst, q2_out0);
615  p_dst += 8;
616  vst2q_f32 (p_dst, q2_out1);
617  p_dst += 8;
618  vst2q_f32 (p_dst, q2_out2);
619  p_dst += 8;
620  vst2q_f32 (p_dst, q2_out3);
621  p_dst += 8;
622  vst2q_f32 (p_dst, q2_out4);
623  p_dst += 8;
624  vst2q_f32 (p_dst, q2_out5);
625  p_dst += 8;
626  vst2q_f32 (p_dst, q2_out6);
627  p_dst += 8;
628  vst2q_f32 (p_dst, q2_out7);
629  p_dst += 8;
630 
631  p_src = p_src - src_step * 8 + 8;
632  } // f_count
633 }
634 
635 static inline void ne10_radix4x4_without_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
636  ne10_fft_cpx_float32_t * Fin,
637  ne10_int32_t stride)
638 {
639  ne10_int32_t f_count;
640  ne10_int32_t src_step = stride << 1;
641 
642  float32_t *p_src, *p_dst;
643  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
644  float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
645  float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
646  float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
647  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
648 
649  p_src = (float32_t *) Fin;
650  p_dst = (float32_t *) Fout;
651 
652  for (f_count = 0; f_count < stride; f_count += 4)
653  {
654  // load
655  q2_in0 = vld2q_f32 (p_src);
656  p_src += src_step;
657  q2_in1 = vld2q_f32 (p_src);
658  p_src += src_step;
659  q2_in2 = vld2q_f32 (p_src);
660  p_src += src_step;
661  q2_in3 = vld2q_f32 (p_src);
662  p_src += src_step;
663 
664  // radix 4 butterfly without twiddles
665  q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
666  q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
667  q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
668  q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
669  q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
670  q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
671  q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
672  q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);
673 
674  // third result
675  q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
676  q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
677  q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
678  q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
679 
680  q_out1_r = vaddq_f32 (q_s1_r, q_s3_i);
681  q_out1_i = vsubq_f32 (q_s1_i, q_s3_r);
682  q_out3_r = vsubq_f32 (q_s1_r, q_s3_i);
683  q_out3_i = vaddq_f32 (q_s1_i, q_s3_r);
684 
685  q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
686  q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
687  q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
688  q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
689  q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
690  q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
691  q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
692  q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
693  q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
694  q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
695  q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
696  q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
697 
698  // store
699  vst2q_f32 (p_dst, q2_out0);
700  p_dst += 8;
701  vst2q_f32 (p_dst, q2_out1);
702  p_dst += 8;
703  vst2q_f32 (p_dst, q2_out2);
704  p_dst += 8;
705  vst2q_f32 (p_dst, q2_out3);
706  p_dst += 8;
707 
708  p_src = p_src - src_step * 4 + 8;
709  }
710 }
711 
712 static inline void ne10_radix4x4_with_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
713  ne10_fft_cpx_float32_t * Fin,
714  ne10_fft_cpx_float32_t * tw,
715  ne10_int32_t src_stride,
716  ne10_int32_t dst_stride,
717  ne10_int32_t mstride)
718 {
719  ne10_int32_t m_count;
720  ne10_int32_t src_step = src_stride << 1;
721  ne10_int32_t dst_step = dst_stride << 1;
722  ne10_int32_t tw_step = mstride << 1;
723 
724  float32_t *p_src, *p_dst, *p_tw;
725  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
726  float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
727  float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
728  float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
729  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
730 
731  p_src = (float32_t *) Fin;
732  p_dst = (float32_t *) Fout;
733  p_tw = (float32_t *) tw;
734 
735  for (m_count = 0; m_count < mstride; m_count += 4)
736  {
737  // load
738  q2_in0 = vld2q_f32 (p_src);
739  p_src += src_step;
740  q2_in1 = vld2q_f32 (p_src);
741  p_src += src_step;
742  q2_in2 = vld2q_f32 (p_src);
743  p_src += src_step;
744  q2_in3 = vld2q_f32 (p_src);
745  p_src += src_step;
746 
747  q2_tw0 = vld2q_f32 (p_tw);
748  p_tw += tw_step;
749  q2_tw1 = vld2q_f32 (p_tw);
750  p_tw += tw_step;
751  q2_tw2 = vld2q_f32 (p_tw);
752 
753  q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
754  q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
755  q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
756  q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
757  q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
758  q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
759  q_s1_r = vmlsq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
760  q_s1_i = vmlaq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
761  q_s2_r = vmlsq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
762  q_s2_i = vmlaq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
763  q_s3_r = vmlsq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
764  q_s3_i = vmlaq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
765 
766  q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
767  q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
768  q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
769  q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
770 
771  q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
772  q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
773  q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
774  q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
775 
776  q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
777  q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
778  q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
779  q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
780 
781  q2_out1.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
782  q2_out1.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
783  q2_out3.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
784  q2_out3.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
785 
786  // store
787  vst2q_f32 (p_dst, q2_out0);
788  p_dst += dst_step;
789  vst2q_f32 (p_dst, q2_out1);
790  p_dst += dst_step;
791  vst2q_f32 (p_dst, q2_out2);
792  p_dst += dst_step;
793  vst2q_f32 (p_dst, q2_out3);
794  p_dst += dst_step;
795 
796  p_src = p_src - src_step * 4 + 8;
797  p_dst = p_dst - dst_step * 4 + 8;
798  p_tw = p_tw - tw_step * 2 + 8;
799  }
800 }
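The vmulq_f32/vmlsq_f32/vmlaq_f32 triplets in ne10_radix4x4_with_twiddles_neon above implement a per-lane complex multiplication by the twiddle factor; the inverse variants later in this file use the same pattern with the conjugated twiddle. A scalar sketch of one lane (illustrative helper only, not part of the library):

/* Illustration: scalar equivalent of one NEON lane of the twiddle multiply.
 * Forward:  out = in * tw       -> vmulq_f32 then vmlsq_f32 (real part),
 *                                  vmulq_f32 then vmlaq_f32 (imaginary part).
 * Inverse:  out = in * conj(tw) -> the vmlaq/vmlsq roles are swapped. */
static inline void ne10_example_twiddle_mul (float in_r, float in_i,
                                             float tw_r, float tw_i,
                                             float *out_r, float *out_i)
{
    *out_r = in_r * tw_r - in_i * tw_i;
    *out_i = in_i * tw_r + in_r * tw_i;
}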
801 static inline void ne10_radix8x4_inverse_neon (ne10_fft_cpx_float32_t * Fout,
802  ne10_fft_cpx_float32_t * Fin,
803  ne10_int32_t stride)
804 {
805  ne10_int32_t f_count;
806  ne10_int32_t src_step = stride << 1;
807 
808  const ne10_float32_t TW_81 = 0.70710678;
809  const ne10_float32_t TW_81N = -0.70710678;
810 
811  float32_t *p_src, *p_dst;
812  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
813  float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
814  float32x4_t q_sin4_r, q_sin4_i, q_sin5_r, q_sin5_i, q_sin6_r, q_sin6_i, q_sin7_r, q_sin7_i;
815  float32x4_t q_s3_r, q_s3_i, q_s5_r, q_s5_i, q_s7_r, q_s7_i;
816  float32x4_t q_s8_r, q_s8_i, q_s9_r, q_s9_i, q_s10_r, q_s10_i, q_s11_r, q_s11_i;
817  float32x4_t q_s12_r, q_s12_i, q_s13_r, q_s13_i, q_s14_r, q_s14_i, q_s15_r, q_s15_i;
818  float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
819  float32x4_t q_out4_r, q_out4_i, q_out5_r, q_out5_i, q_out6_r, q_out6_i, q_out7_r, q_out7_i;
820  float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3, q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7;
821  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3, q2_out4, q2_out5, q2_out6, q2_out7;
822  float32x4_t q_tw_81, q_tw_81n;
823 
824  p_src = (float32_t *) Fin;
825  p_dst = (float32_t *) Fout;
826 
827  for (f_count = 0; f_count < stride; f_count += 4)
828  {
829  q2_in0 = vld2q_f32 (p_src);
830  p_src += src_step;
831  q2_in2 = vld2q_f32 (p_src);
832  p_src += src_step;
833  q2_in4 = vld2q_f32 (p_src);
834  p_src += src_step;
835  q2_in6 = vld2q_f32 (p_src);
836  p_src += src_step;
837  q2_in1 = vld2q_f32 (p_src);
838  p_src += src_step;
839  q2_in3 = vld2q_f32 (p_src);
840  p_src += src_step;
841  q2_in5 = vld2q_f32 (p_src);
842  p_src += src_step;
843  q2_in7 = vld2q_f32 (p_src);
844  p_src += src_step;
845 
846  q_sin0_r = vaddq_f32 (q2_in0.val[0], q2_in1.val[0]);
847  q_sin0_i = vaddq_f32 (q2_in0.val[1], q2_in1.val[1]);
848  q_sin1_r = vsubq_f32 (q2_in0.val[0], q2_in1.val[0]);
849  q_sin1_i = vsubq_f32 (q2_in0.val[1], q2_in1.val[1]);
850  q_sin2_r = vaddq_f32 (q2_in2.val[0], q2_in3.val[0]);
851  q_sin2_i = vaddq_f32 (q2_in2.val[1], q2_in3.val[1]);
852  q_sin3_r = vsubq_f32 (q2_in2.val[0], q2_in3.val[0]);
853  q_sin3_i = vsubq_f32 (q2_in2.val[1], q2_in3.val[1]);
854  q_sin4_r = vaddq_f32 (q2_in4.val[0], q2_in5.val[0]);
855  q_sin4_i = vaddq_f32 (q2_in4.val[1], q2_in5.val[1]);
856  q_sin5_r = vsubq_f32 (q2_in4.val[0], q2_in5.val[0]);
857  q_sin5_i = vsubq_f32 (q2_in4.val[1], q2_in5.val[1]);
858  q_sin6_r = vaddq_f32 (q2_in6.val[0], q2_in7.val[0]);
859  q_sin6_i = vaddq_f32 (q2_in6.val[1], q2_in7.val[1]);
860  q_sin7_r = vsubq_f32 (q2_in6.val[0], q2_in7.val[0]);
861  q_sin7_i = vsubq_f32 (q2_in6.val[1], q2_in7.val[1]);
862 
863  // radix 4 butterfly without twiddles
864  q_tw_81 = vdupq_n_f32 (TW_81);
865  q_tw_81n = vdupq_n_f32 (TW_81N);
866  q_s5_r = vnegq_f32 (q_sin5_i);
867  q_s5_i = q_sin5_r;
868  q_s3_r = vsubq_f32 (q_sin3_r, q_sin3_i);
869  q_s3_i = vaddq_f32 (q_sin3_i, q_sin3_r);
870  q_s7_r = vaddq_f32 (q_sin7_r, q_sin7_i);
871  q_s7_i = vsubq_f32 (q_sin7_i, q_sin7_r);
872  q_s3_r = vmulq_f32 (q_s3_r, q_tw_81);
873  q_s3_i = vmulq_f32 (q_s3_i, q_tw_81);
874  q_s7_r = vmulq_f32 (q_s7_r, q_tw_81n);
875  q_s7_i = vmulq_f32 (q_s7_i, q_tw_81n);
876 
877  // radix 2 butterfly
878  q_s8_r = vaddq_f32 (q_sin0_r, q_sin4_r);
879  q_s8_i = vaddq_f32 (q_sin0_i, q_sin4_i);
880  q_s9_r = vaddq_f32 (q_sin1_r, q_s5_r);
881  q_s9_i = vaddq_f32 (q_sin1_i, q_s5_i);
882  q_s10_r = vsubq_f32 (q_sin0_r, q_sin4_r);
883  q_s10_i = vsubq_f32 (q_sin0_i, q_sin4_i);
884  q_s11_r = vsubq_f32 (q_sin1_r, q_s5_r);
885  q_s11_i = vsubq_f32 (q_sin1_i, q_s5_i);
886 
887  // radix 2 butterfly
888  q_s12_r = vaddq_f32 (q_sin2_r, q_sin6_r);
889  q_s12_i = vaddq_f32 (q_sin2_i, q_sin6_i);
890  q_s13_r = vaddq_f32 (q_s3_r, q_s7_r);
891  q_s13_i = vaddq_f32 (q_s3_i, q_s7_i);
892  q_s14_r = vsubq_f32 (q_sin2_r, q_sin6_r);
893  q_s14_i = vsubq_f32 (q_sin2_i, q_sin6_i);
894  q_s15_r = vsubq_f32 (q_s3_r, q_s7_r);
895  q_s15_i = vsubq_f32 (q_s3_i, q_s7_i);
896 
897  // third result
898  q_out4_r = vsubq_f32 (q_s8_r, q_s12_r);
899  q_out4_i = vsubq_f32 (q_s8_i, q_s12_i);
900  q_out5_r = vsubq_f32 (q_s9_r, q_s13_r);
901  q_out5_i = vsubq_f32 (q_s9_i, q_s13_i);
902 
903  // first result
904  q_out0_r = vaddq_f32 (q_s8_r, q_s12_r);
905  q_out0_i = vaddq_f32 (q_s8_i, q_s12_i);
906  q_out1_r = vaddq_f32 (q_s9_r, q_s13_r);
907  q_out1_i = vaddq_f32 (q_s9_i, q_s13_i);
908 
909  // second result
910  q_out2_r = vsubq_f32 (q_s10_r, q_s14_i);
911  q_out2_i = vaddq_f32 (q_s10_i, q_s14_r);
912  q_out3_r = vsubq_f32 (q_s11_r, q_s15_i);
913  q_out3_i = vaddq_f32 (q_s11_i, q_s15_r);
914 
915  // fourth result
916  q_out6_r = vaddq_f32 (q_s10_r, q_s14_i);
917  q_out6_i = vsubq_f32 (q_s10_i, q_s14_r);
918  q_out7_r = vaddq_f32 (q_s11_r, q_s15_i);
919  q_out7_i = vsubq_f32 (q_s11_i, q_s15_r);
920 
921  q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
922  q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
923  q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
924  q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
925  q2_tmp4 = vtrnq_f32 (q_out4_r, q_out5_r);
926  q2_tmp5 = vtrnq_f32 (q_out4_i, q_out5_i);
927  q2_tmp6 = vtrnq_f32 (q_out6_r, q_out7_r);
928  q2_tmp7 = vtrnq_f32 (q_out6_i, q_out7_i);
929 
930  q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
931  q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
932  q2_out2.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
933  q2_out2.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
934  q2_out4.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
935  q2_out4.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
936  q2_out6.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
937  q2_out6.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
938 
939  q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[0]), vget_low_f32 (q2_tmp6.val[0]));
940  q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[0]), vget_low_f32 (q2_tmp7.val[0]));
941  q2_out3.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp4.val[1]), vget_low_f32 (q2_tmp6.val[1]));
942  q2_out3.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp5.val[1]), vget_low_f32 (q2_tmp7.val[1]));
943  q2_out5.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[0]), vget_high_f32 (q2_tmp6.val[0]));
944  q2_out5.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[0]), vget_high_f32 (q2_tmp7.val[0]));
945  q2_out7.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp4.val[1]), vget_high_f32 (q2_tmp6.val[1]));
946  q2_out7.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp5.val[1]), vget_high_f32 (q2_tmp7.val[1]));
947 
948  // store
949  vst2q_f32 (p_dst, q2_out0);
950  p_dst += 8;
951  vst2q_f32 (p_dst, q2_out1);
952  p_dst += 8;
953  vst2q_f32 (p_dst, q2_out2);
954  p_dst += 8;
955  vst2q_f32 (p_dst, q2_out3);
956  p_dst += 8;
957  vst2q_f32 (p_dst, q2_out4);
958  p_dst += 8;
959  vst2q_f32 (p_dst, q2_out5);
960  p_dst += 8;
961  vst2q_f32 (p_dst, q2_out6);
962  p_dst += 8;
963  vst2q_f32 (p_dst, q2_out7);
964  p_dst += 8;
965 
966  p_src = p_src - src_step * 8 + 8;
967  } // f_count
968 }
969 
970 static inline void ne10_radix4x4_inverse_without_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
971  ne10_fft_cpx_float32_t * Fin,
972  ne10_int32_t stride)
973 {
974  ne10_int32_t f_count;
975  ne10_int32_t src_step = stride << 1;
976 
977  float32_t *p_src, *p_dst;
978  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
979  float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
980  float32x4_t q_out0_r, q_out0_i, q_out1_r, q_out1_i, q_out2_r, q_out2_i, q_out3_r, q_out3_i;
981  float32x4x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3;
982  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
983 
984  p_src = (float32_t *) Fin;
985  p_dst = (float32_t *) Fout;
986 
987  for (f_count = 0; f_count < stride; f_count += 4)
988  {
989  // load
990  q2_in0 = vld2q_f32 (p_src);
991  p_src += src_step;
992  q2_in1 = vld2q_f32 (p_src);
993  p_src += src_step;
994  q2_in2 = vld2q_f32 (p_src);
995  p_src += src_step;
996  q2_in3 = vld2q_f32 (p_src);
997  p_src += src_step;
998 
999  // radix 4 butterfly without twiddles
1000  q_s0_r = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
1001  q_s0_i = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
1002  q_s1_r = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
1003  q_s1_i = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
1004  q_s2_r = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
1005  q_s2_i = vaddq_f32 (q2_in1.val[1], q2_in3.val[1]);
1006  q_s3_r = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
1007  q_s3_i = vsubq_f32 (q2_in1.val[1], q2_in3.val[1]);
1008 
1009  q_out2_r = vsubq_f32 (q_s0_r, q_s2_r);
1010  q_out2_i = vsubq_f32 (q_s0_i, q_s2_i);
1011  q_out0_r = vaddq_f32 (q_s0_r, q_s2_r);
1012  q_out0_i = vaddq_f32 (q_s0_i, q_s2_i);
1013 
1014  q_out1_r = vsubq_f32 (q_s1_r, q_s3_i);
1015  q_out1_i = vaddq_f32 (q_s1_i, q_s3_r);
1016  q_out3_r = vaddq_f32 (q_s1_r, q_s3_i);
1017  q_out3_i = vsubq_f32 (q_s1_i, q_s3_r);
1018 
1019  q2_tmp0 = vtrnq_f32 (q_out0_r, q_out1_r);
1020  q2_tmp1 = vtrnq_f32 (q_out0_i, q_out1_i);
1021  q2_tmp2 = vtrnq_f32 (q_out2_r, q_out3_r);
1022  q2_tmp3 = vtrnq_f32 (q_out2_i, q_out3_i);
1023  q2_out0.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0]));
1024  q2_out0.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0]));
1025  q2_out1.val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1]));
1026  q2_out1.val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1]));
1027  q2_out2.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));
1028  q2_out2.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));
1029  q2_out3.val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));
1030  q2_out3.val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));
1031 
1032  // store
1033  vst2q_f32 (p_dst, q2_out0);
1034  p_dst += 8;
1035  vst2q_f32 (p_dst, q2_out1);
1036  p_dst += 8;
1037  vst2q_f32 (p_dst, q2_out2);
1038  p_dst += 8;
1039  vst2q_f32 (p_dst, q2_out3);
1040  p_dst += 8;
1041 
1042  p_src = p_src - src_step * 4 + 8;
1043  }
1044 }
1045 
1046 static inline void ne10_radix4x4_inverse_with_twiddles_neon (ne10_fft_cpx_float32_t * Fout,
1047  ne10_fft_cpx_float32_t * Fin,
1048  ne10_fft_cpx_float32_t * tw,
1049  ne10_int32_t src_stride,
1050  ne10_int32_t dst_stride,
1051  ne10_int32_t mstride)
1052 {
1053  ne10_int32_t m_count;
1054  ne10_int32_t src_step = src_stride << 1;
1055  ne10_int32_t dst_step = dst_stride << 1;
1056  ne10_int32_t tw_step = mstride << 1;
1057 
1058  float32_t *p_src, *p_dst, *p_tw;
1059  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
1060  float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
1061  float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
1062  float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
1063  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
1064 
1065  p_src = (float32_t *) Fin;
1066  p_dst = (float32_t *) Fout;
1067  p_tw = (float32_t *) tw;
1068 
1069  for (m_count = 0; m_count < mstride; m_count += 4)
1070  {
1071  // load
1072  q2_in0 = vld2q_f32 (p_src);
1073  p_src += src_step;
1074  q2_in1 = vld2q_f32 (p_src);
1075  p_src += src_step;
1076  q2_in2 = vld2q_f32 (p_src);
1077  p_src += src_step;
1078  q2_in3 = vld2q_f32 (p_src);
1079  p_src += src_step;
1080 
1081  q2_tw0 = vld2q_f32 (p_tw);
1082  p_tw += tw_step;
1083  q2_tw1 = vld2q_f32 (p_tw);
1084  p_tw += tw_step;
1085  q2_tw2 = vld2q_f32 (p_tw);
1086 
1087  q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
1088  q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
1089  q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
1090  q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
1091  q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
1092  q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
1093  q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
1094  q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
1095  q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
1096  q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
1097  q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
1098  q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
1099 
1100  q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
1101  q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
1102  q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
1103  q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
1104 
1105  q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
1106  q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
1107  q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
1108  q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
1109 
1110  q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
1111  q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
1112  q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
1113  q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
1114 
1115  q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
1116  q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
1117  q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
1118  q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
1119 
1120  // store
1121  vst2q_f32 (p_dst, q2_out0);
1122  p_dst += dst_step;
1123  vst2q_f32 (p_dst, q2_out1);
1124  p_dst += dst_step;
1125  vst2q_f32 (p_dst, q2_out2);
1126  p_dst += dst_step;
1127  vst2q_f32 (p_dst, q2_out3);
1128  p_dst += dst_step;
1129 
1130  p_src = p_src - src_step * 4 + 8;
1131  p_dst = p_dst - dst_step * 4 + 8;
1132  p_tw = p_tw - tw_step * 2 + 8;
1133  }
1134 }
1135 
1136 static inline void ne10_radix4x4_inverse_with_twiddles_last_stage_neon (ne10_fft_cpx_float32_t * Fout,
1137  ne10_fft_cpx_float32_t * Fin,
1138  ne10_fft_cpx_float32_t * tw,
1139  ne10_int32_t src_stride,
1140  ne10_int32_t dst_stride,
1141  ne10_int32_t mstride,
1142  ne10_int32_t nfft)
1143 {
1144  ne10_int32_t m_count;
1145  ne10_int32_t src_step = src_stride << 1;
1146  ne10_int32_t dst_step = dst_stride << 1;
1147  ne10_int32_t tw_step = mstride << 1;
1148  ne10_float32_t one_by_nfft = (1.0f / (ne10_float32_t) nfft);
1149 
1150  float32_t *p_src, *p_dst, *p_tw;
1151  float32x4x2_t q2_in0, q2_in1, q2_in2, q2_in3;
1152  float32x4x2_t q2_tw0, q2_tw1, q2_tw2;
1153  float32x4_t q_s1_r, q_s1_i, q_s2_r, q_s2_i, q_s3_r, q_s3_i;
1154  float32x4_t q_s4_r, q_s4_i, q_s5_r, q_s5_i, q_s6_r, q_s6_i, q_s7_r, q_s7_i;
1155  float32x4x2_t q2_out0, q2_out1, q2_out2, q2_out3;
1156  float32x4_t q_one_by_nfft = vdupq_n_f32 (one_by_nfft);
1157 
1158  p_src = (float32_t *) Fin;
1159  p_dst = (float32_t *) Fout;
1160  p_tw = (float32_t *) tw;
1161 
1162  for (m_count = 0; m_count < mstride; m_count += 4)
1163  {
1164  // load
1165  q2_in0 = vld2q_f32 (p_src);
1166  p_src += src_step;
1167  q2_in1 = vld2q_f32 (p_src);
1168  p_src += src_step;
1169  q2_in2 = vld2q_f32 (p_src);
1170  p_src += src_step;
1171  q2_in3 = vld2q_f32 (p_src);
1172  p_src += src_step;
1173 
1174  q2_tw0 = vld2q_f32 (p_tw);
1175  p_tw += tw_step;
1176  q2_tw1 = vld2q_f32 (p_tw);
1177  p_tw += tw_step;
1178  q2_tw2 = vld2q_f32 (p_tw);
1179 
1180  q_s1_r = vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]);
1181  q_s1_i = vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]);
1182  q_s2_r = vmulq_f32 (q2_in2.val[0], q2_tw1.val[0]);
1183  q_s2_i = vmulq_f32 (q2_in2.val[1], q2_tw1.val[0]);
1184  q_s3_r = vmulq_f32 (q2_in3.val[0], q2_tw2.val[0]);
1185  q_s3_i = vmulq_f32 (q2_in3.val[1], q2_tw2.val[0]);
1186  q_s1_r = vmlaq_f32 (q_s1_r, q2_in1.val[1], q2_tw0.val[1]);
1187  q_s1_i = vmlsq_f32 (q_s1_i, q2_in1.val[0], q2_tw0.val[1]);
1188  q_s2_r = vmlaq_f32 (q_s2_r, q2_in2.val[1], q2_tw1.val[1]);
1189  q_s2_i = vmlsq_f32 (q_s2_i, q2_in2.val[0], q2_tw1.val[1]);
1190  q_s3_r = vmlaq_f32 (q_s3_r, q2_in3.val[1], q2_tw2.val[1]);
1191  q_s3_i = vmlsq_f32 (q_s3_i, q2_in3.val[0], q2_tw2.val[1]);
1192 
1193  q_s4_r = vaddq_f32 (q2_in0.val[0], q_s2_r);
1194  q_s4_i = vaddq_f32 (q2_in0.val[1], q_s2_i);
1195  q_s5_r = vsubq_f32 (q2_in0.val[0], q_s2_r);
1196  q_s5_i = vsubq_f32 (q2_in0.val[1], q_s2_i);
1197 
1198  q_s6_r = vaddq_f32 (q_s1_r, q_s3_r);
1199  q_s6_i = vaddq_f32 (q_s1_i, q_s3_i);
1200  q_s7_r = vsubq_f32 (q_s1_r, q_s3_r);
1201  q_s7_i = vsubq_f32 (q_s1_i, q_s3_i);
1202 
1203  q2_out2.val[0] = vsubq_f32 (q_s4_r, q_s6_r);
1204  q2_out2.val[1] = vsubq_f32 (q_s4_i, q_s6_i);
1205  q2_out0.val[0] = vaddq_f32 (q_s4_r, q_s6_r);
1206  q2_out0.val[1] = vaddq_f32 (q_s4_i, q_s6_i);
1207 
1208  q2_out1.val[0] = vsubq_f32 (q_s5_r, q_s7_i);
1209  q2_out1.val[1] = vaddq_f32 (q_s5_i, q_s7_r);
1210  q2_out3.val[0] = vaddq_f32 (q_s5_r, q_s7_i);
1211  q2_out3.val[1] = vsubq_f32 (q_s5_i, q_s7_r);
1212 
1213  q2_out0.val[0] = vmulq_f32 (q2_out0.val[0], q_one_by_nfft);
1214  q2_out0.val[1] = vmulq_f32 (q2_out0.val[1], q_one_by_nfft);
1215  q2_out1.val[0] = vmulq_f32 (q2_out1.val[0], q_one_by_nfft);
1216  q2_out1.val[1] = vmulq_f32 (q2_out1.val[1], q_one_by_nfft);
1217  q2_out2.val[0] = vmulq_f32 (q2_out2.val[0], q_one_by_nfft);
1218  q2_out2.val[1] = vmulq_f32 (q2_out2.val[1], q_one_by_nfft);
1219  q2_out3.val[0] = vmulq_f32 (q2_out3.val[0], q_one_by_nfft);
1220  q2_out3.val[1] = vmulq_f32 (q2_out3.val[1], q_one_by_nfft);
1221 
1222  // store
1223  vst2q_f32 (p_dst, q2_out0);
1224  p_dst += dst_step;
1225  vst2q_f32 (p_dst, q2_out1);
1226  p_dst += dst_step;
1227  vst2q_f32 (p_dst, q2_out2);
1228  p_dst += dst_step;
1229  vst2q_f32 (p_dst, q2_out3);
1230  p_dst += dst_step;
1231 
1232  p_src = p_src - src_step * 4 + 8;
1233  p_dst = p_dst - dst_step * 4 + 8;
1234  p_tw = p_tw - tw_step * 2 + 8;
1235  }
1236 }
1237 
1238 void ne10_mixed_radix_fft_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
1239  ne10_fft_cpx_float32_t * Fin,
1240  ne10_int32_t * factors,
1241  ne10_fft_cpx_float32_t * twiddles,
1242  ne10_fft_cpx_float32_t * buffer)
1243 {
1244  ne10_int32_t fstride, mstride, N;
1245  ne10_int32_t fstride1;
1246  ne10_int32_t f_count;
1247  ne10_int32_t stage_count;
1248 
1249  ne10_fft_cpx_float32_t *Fin1, *Fout1;
1250  ne10_fft_cpx_float32_t *Fout_ls = Fout;
1251  ne10_fft_cpx_float32_t *Ftmp;
1252  ne10_fft_cpx_float32_t *tw, *tw1;
1253 
1254  // init fstride, mstride, N
1255  stage_count = factors[0];
1256  fstride = factors[1];
1257  mstride = factors[ (stage_count << 1) - 1 ];
1258  N = factors[ stage_count << 1 ]; // radix
1259 
1260  // the first stage
1261  Fin1 = Fin;
1262  Fout1 = Fout;
1263  if (N == 2) // length of FFT is 2^n (n is odd)
1264  {
1265  // radix 8
1266  N = fstride >> 1; // 1/4 of length of FFT
1267  tw = twiddles;
1268  fstride1 = fstride >> 2;
1269 
1270  ne10_radix8x4_neon (Fout, Fin, fstride1);
1271 
1272  tw += 6;
1273  mstride <<= 2;
1274  fstride >>= 4;
1275  stage_count -= 2;
1276 
1277  // swap
1278  Ftmp = buffer;
1279  buffer = Fout;
1280  Fout = Ftmp;
1281  }
1282  else if (N == 4) // length of FFT is 2^n (n is even)
1283  {
1284  //fstride is nfft>>2
1285  ne10_radix4x4_without_twiddles_neon (Fout, Fin, fstride);
1286 
1287  N = fstride; // 1/4 of length of FFT
1288 
1289  // swap
1290  Ftmp = buffer;
1291  buffer = Fout;
1292  Fout = Ftmp;
1293 
1294  // update address for other stages
1295  stage_count--;
1296  tw = twiddles;
1297  fstride >>= 2;
1298  // end of first stage
1299  }
1300 
1301 
1302  // all stages but the last one
1303  for (; stage_count > 1 ; stage_count--)
1304  {
1305  Fin1 = buffer;
1306  for (f_count = 0; f_count < fstride; f_count ++)
1307  {
1308  Fout1 = & Fout[ f_count * mstride << 2 ];
1309  tw1 = tw;
1310  ne10_radix4x4_with_twiddles_neon (Fout1, Fin1, tw1, N, mstride, mstride);
1311  Fin1 += mstride;
1312  } // f_count
1313  tw += mstride * 3;
1314  mstride <<= 2;
1315  fstride >>= 2;
1316 
1317  // swap
1318  Ftmp = buffer;
1319  buffer = Fout;
1320  Fout = Ftmp;
1321  } // stage_count
1322 
1323  // the last one
1324  if (stage_count)
1325  {
1326  Fin1 = buffer;
1327  // if stage count is even, output to the input array
1328  Fout1 = Fout_ls;
1329 
1330  for (f_count = 0; f_count < fstride; f_count ++)
1331  {
1332  tw1 = tw;
1333  ne10_radix4x4_with_twiddles_neon (Fout1, Fin1, tw1, N, N, mstride);
1334  Fin1 += mstride;
1335  Fout1 += mstride;
1336  } // f_count
1337  } // last stage
1338 }
1339 
1340 void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
1341  ne10_fft_cpx_float32_t * Fin,
1342  ne10_int32_t * factors,
1343  ne10_fft_cpx_float32_t * twiddles,
1344  ne10_fft_cpx_float32_t * buffer)
1345 {
1346  ne10_int32_t fstride, mstride, N;
1347  ne10_int32_t fstride1;
1348  ne10_int32_t f_count;
1349  ne10_int32_t stage_count;
1350  ne10_int32_t nfft;
1351 
1352  ne10_fft_cpx_float32_t *Fin1, *Fout1;
1353  ne10_fft_cpx_float32_t *Fout_ls = Fout;
1354  ne10_fft_cpx_float32_t *Ftmp;
1355  ne10_fft_cpx_float32_t *tw, *tw1;
1356 
1357  // init fstride, mstride, N
1358  stage_count = factors[0];
1359  fstride = factors[1];
1360  mstride = factors[ (stage_count << 1) - 1 ];
1361  N = factors[ stage_count << 1 ]; // radix
1362  nfft = fstride * N;
1363 
1364  // the first stage
1365  Fin1 = Fin;
1366  Fout1 = Fout;
1367  if (N == 2) // length of FFT is 2^n (n is odd)
1368  {
1369  // radix 8
1370  N = fstride >> 1; // 1/4 of length of FFT
1371  tw = twiddles;
1372  fstride1 = fstride >> 2;
1373 
1374  ne10_radix8x4_inverse_neon (Fout, Fin, fstride1);
1375 
1376  tw += 6;
1377  mstride <<= 2;
1378  fstride >>= 4;
1379  stage_count -= 2;
1380 
1381  // swap
1382  Ftmp = buffer;
1383  buffer = Fout;
1384  Fout = Ftmp;
1385  }
1386  else if (N == 4) // length of FFT is 2^n (n is even)
1387  {
1388  //fstride is nfft>>2
1389  ne10_radix4x4_inverse_without_twiddles_neon (Fout, Fin, fstride);
1390 
1391  N = fstride; // 1/4 of length of FFT
1392 
1393  // swap
1394  Ftmp = buffer;
1395  buffer = Fout;
1396  Fout = Ftmp;
1397 
1398  // update address for other stages
1399  stage_count--;
1400  tw = twiddles;
1401  fstride >>= 2;
1402  // end of first stage
1403  }
1404 
1405  // all stages but the last one
1406  for (; stage_count > 1 ; stage_count--)
1407  {
1408  Fin1 = buffer;
1409  for (f_count = 0; f_count < fstride; f_count ++)
1410  {
1411  Fout1 = & Fout[ f_count * mstride << 2 ];
1412  tw1 = tw;
1413  ne10_radix4x4_inverse_with_twiddles_neon (Fout1, Fin1, tw1, N, mstride, mstride);
1414  Fin1 += mstride;
1415  } // f_count
1416  tw += mstride * 3;
1417  mstride <<= 2;
1418  fstride >>= 2;
1419 
1420  // swap
1421  Ftmp = buffer;
1422  buffer = Fout;
1423  Fout = Ftmp;
1424  } // stage_count
1425 
1426  // the last one
1427  if (stage_count)
1428  {
1429  Fin1 = buffer;
1430  // if stage count is even, output to the input array
1431  Fout1 = Fout_ls;
1432 
1433  for (f_count = 0; f_count < fstride; f_count ++)
1434  {
1435  tw1 = tw;
1436  ne10_radix4x4_inverse_with_twiddles_last_stage_neon (Fout1, Fin1, tw1, N, N, mstride, nfft);
1437  Fin1 += mstride;
1438  Fout1 += mstride;
1439  } // f_count
1440  } // last stage
1441 }
1442 
1459 void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t * fout,
1460  ne10_fft_cpx_float32_t * fin,
1461  ne10_fft_cfg_float32_t cfg,
1462  ne10_int32_t inverse_fft)
1463 {
1464  // For inputs shorter than 16, fall back to the C version.
1465  // We would not get much improvement from NEON for these cases.
1466  if (cfg->nfft < 16)
1467  {
1468  ne10_fft_c2c_1d_float32_c (fout, fin, cfg, inverse_fft);
1469  return;
1470  }
1471 
1472  ne10_int32_t stage_count = cfg->factors[0];
1473  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1474 
1475  assert ((algorithm_flag == NE10_FFT_ALG_24)
1476  || (algorithm_flag == NE10_FFT_ALG_ANY));
1477 
1478  // For NE10_FFT_ALG_ANY.
1479  // The function will return inside this branch.
1480  if (algorithm_flag == NE10_FFT_ALG_ANY)
1481  {
1482  if (inverse_fft)
1483  {
1484  ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin,
1485  cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_backward_scaled);
1486  }
1487  else
1488  {
1489  ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin,
1490  cfg->factors, cfg->twiddles, cfg->buffer, cfg->is_forward_scaled);
1491  }
1492  return;
1493  }
1494 
1495  // Since the function has passed the assertion and skipped the branch above, algorithm_flag
1496  // must be NE10_FFT_ALG_24.
1497  if (inverse_fft)
1498  {
1499  switch (cfg->nfft)
1500  {
1501  case 4:
1502  ne10_fft4_backward_float32 (fout, fin);
1503  break;
1504  case 8:
1505  ne10_fft8_backward_float32 (fout, fin);
1506  break;
1507  case 16:
1508  ne10_fft16_backward_float32_neon (fout, fin, cfg->twiddles);
1509  break;
1510  default:
1511  ne10_mixed_radix_fft_backward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1512  break;
1513  }
1514  }
1515  else
1516  {
1517  switch (cfg->nfft)
1518  {
1519  case 4:
1520  ne10_fft4_forward_float32 (fout, fin);
1521  break;
1522  case 8:
1523  ne10_fft8_forward_float32 (fout, fin);
1524  break;
1525  case 16:
1526  ne10_fft16_forward_float32_neon (fout, fin, cfg->twiddles);
1527  break;
1528  default:
1529  ne10_mixed_radix_fft_forward_float32_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
1530  break;
1531  }
1532  }
1533 }
1534  //end of C2C_FFT_IFFT group
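For reference, a minimal usage sketch of the public entry point listed above. It assumes the usual Ne10 setup and teardown helpers (ne10_fft_alloc_c2c_float32_neon and ne10_fft_destroy_c2c_float32 from NE10_dsp.h); the wrapper name example_fft_64 is hypothetical:

#include "NE10_dsp.h"

/* Hypothetical helper: 64-point forward transform of src into dst. */
void example_fft_64 (ne10_fft_cpx_float32_t *dst, ne10_fft_cpx_float32_t *src)
{
    /* Allocates the factor table, twiddles and scratch buffer for nfft == 64
     * (ne10_fft_alloc_c2c_float32_neon is assumed to be available here). */
    ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32_neon (64);
    if (cfg == NULL)
        return;

    /* inverse_fft == 0 selects the forward transform; pass 1 for the inverse. */
    ne10_fft_c2c_1d_float32_neon (dst, src, cfg, 0);

    ne10_fft_destroy_c2c_float32 (cfg);
}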