47 #include "NE10_types.h"
48 #include "NE10_macros.h"
52 #if (NE10_UNROLL_LEVEL > 0)
/*
 * Fragment of an 8-input / 8-output real-valued stage. The head of the
 * signature, the braces and some statements are not visible in this excerpt.
 * NOTE(review): presumably this is ne10_radix8_r2c_c, which is called later in
 * this file with (Fout, Fin, fstride, mstride, nfft) -- confirm against the
 * complete file.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - not referenced in the visible lines.
 *   nfft    - only nfft >> 3 is used, as the stride between input samples.
 */
56 const ne10_int32_t fstride,
57 const ne10_int32_t mstride,
58 const ne10_int32_t nfft)
60 const ne10_int32_t in_step = nfft >> 3;
63 ne10_float32_t scratch_in[8];
64 ne10_float32_t scratch [4];
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
67 const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
68 ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
/* NOTE(review): the per-iteration pointer updates are not visible in this excerpt. */
71 for (f_count = fstride; f_count; f_count --)
/* Sum and difference of each pair of inputs that lie 4 * in_step apart. */
73 scratch_in[0] = Fin_r[in_step * 0] + Fin_r[in_step * (0 + 4)];
74 scratch_in[1] = Fin_r[in_step * 0] - Fin_r[in_step * (0 + 4)];
75 scratch_in[2] = Fin_r[in_step * 1] + Fin_r[in_step * (1 + 4)];
76 scratch_in[3] = Fin_r[in_step * 1] - Fin_r[in_step * (1 + 4)];
77 scratch_in[4] = Fin_r[in_step * 2] + Fin_r[in_step * (2 + 4)];
78 scratch_in[5] = Fin_r[in_step * 2] - Fin_r[in_step * (2 + 4)];
79 scratch_in[6] = Fin_r[in_step * 3] + Fin_r[in_step * (3 + 4)];
80 scratch_in[7] = Fin_r[in_step * 3] - Fin_r[in_step * (3 + 4)];
/* Scale two of the differences by constants defined outside this excerpt. */
82 scratch_in[3] *= TW_81_F32;
83 scratch_in[7] *= TW_81N_F32;
/* Second level of sums and differences. */
86 scratch[0] = scratch_in[0] + scratch_in[4];
87 scratch[1] = scratch_in[2] + scratch_in[6];
88 scratch[2] = scratch_in[7] - scratch_in[3];
89 scratch[3] = scratch_in[3] + scratch_in[7];
/* Store the eight results in consecutive output slots Fout_r[0..7]. */
91 Fout_r[0] = scratch [0] + scratch [1];
92 Fout_r[7] = scratch [0] - scratch [1];
94 Fout_r[1] = scratch_in[1] + scratch [3];
95 Fout_r[5] = scratch_in[1] - scratch [3];
97 Fout_r[2] = scratch [2] - scratch_in[5];
98 Fout_r[6] = scratch [2] + scratch_in[5];
100 Fout_r[3] = scratch_in[0] - scratch_in[4];
102 Fout_r[4] = scratch_in[6] - scratch_in[2];
/*
 * Fragment of an 8-input / 8-output real-valued stage that reads consecutive
 * inputs and writes strided outputs. The head of the signature, the braces and
 * some statements are not visible in this excerpt.
 * NOTE(review): presumably this is ne10_radix8_c2r_c, which is called later in
 * this file with (Fout, buffer, fstride, mstride, nfft) -- confirm.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - not referenced in the visible lines.
 *   nfft    - nfft >> 3 is the output stride; 1.0 / nfft is the optional scale.
 */
111 const ne10_int32_t fstride,
112 const ne10_int32_t mstride,
113 const ne10_int32_t nfft)
115 const ne10_int32_t in_step = nfft >> 3;
116 ne10_int32_t f_count;
118 ne10_float32_t scratch_in[8];
/* Only applied to the outputs when NE10_DSP_RFFT_SCALING is defined (see below). */
120 const ne10_float32_t one_by_N = 1.0 / nfft;
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
123 const ne10_float32_t* Fin_r = (ne10_float32_t*) Fin;
124 ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
/* NOTE(review): the per-iteration pointer updates are not visible in this excerpt. */
126 for (f_count = fstride; f_count; f_count --)
/* Combine the eight consecutive inputs Fin_r[0..7]; several terms are added twice. */
128 scratch_in[0] = Fin_r[0] + Fin_r[3] + Fin_r[3] + Fin_r[7];
129 scratch_in[1] = Fin_r[1] + Fin_r[1] + Fin_r[5] + Fin_r[5];
130 scratch_in[2] = Fin_r[0] - Fin_r[4] - Fin_r[4] - Fin_r[7];
131 scratch_in[3] = Fin_r[1] - Fin_r[2] - Fin_r[5] - Fin_r[6];
132 scratch_in[4] = Fin_r[0] - Fin_r[3] - Fin_r[3] + Fin_r[7];
133 scratch_in[5] = - Fin_r[2] - Fin_r[2] + Fin_r[6] + Fin_r[6];
134 scratch_in[6] = Fin_r[0] + Fin_r[4] + Fin_r[4] - Fin_r[7];
135 scratch_in[7] = Fin_r[1] + Fin_r[2] - Fin_r[5] + Fin_r[6];
/* Divide two of the terms by constants defined outside this excerpt. */
137 scratch_in[3] /= TW_81_F32;
138 scratch_in[7] /= TW_81N_F32;
/* Sum/difference of adjacent scratch pairs, written at a stride of in_step. */
140 Fout_r[0 * in_step] = scratch_in[0] + scratch_in[1];
141 Fout_r[4 * in_step] = scratch_in[0] - scratch_in[1];
142 Fout_r[1 * in_step] = scratch_in[2] + scratch_in[3];
143 Fout_r[5 * in_step] = scratch_in[2] - scratch_in[3];
144 Fout_r[2 * in_step] = scratch_in[4] + scratch_in[5];
145 Fout_r[6 * in_step] = scratch_in[4] - scratch_in[5];
146 Fout_r[3 * in_step] = scratch_in[6] + scratch_in[7];
147 Fout_r[7 * in_step] = scratch_in[6] - scratch_in[7];
/* Optional scaling of all eight outputs by 1 / nfft; the matching #endif is not visible here. */
149 #if defined(NE10_DSP_RFFT_SCALING)
150 Fout_r[0 * in_step] *= one_by_N;
151 Fout_r[4 * in_step] *= one_by_N;
152 Fout_r[1 * in_step] *= one_by_N;
153 Fout_r[5 * in_step] *= one_by_N;
154 Fout_r[2 * in_step] *= one_by_N;
155 Fout_r[6 * in_step] *= one_by_N;
156 Fout_r[3 * in_step] *= one_by_N;
157 Fout_r[7 * in_step] *= one_by_N;
/*
 * Fragment of a 4-input / 4-output real-valued stage built on the
 * NE10_FFT_R2C_4R_RCR macro (defined outside this excerpt). The head of the
 * signature, the braces and some statements are not visible here.
 * NOTE(review): presumably this is ne10_radix4_r2c_c, which is called later in
 * this file with (Fout, Fin, fstride, mstride, nfft) -- confirm.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - not referenced in the visible lines.
 *   nfft    - only nfft >> 2 is used, as the stride between input samples.
 */
167 const ne10_int32_t fstride,
168 const ne10_int32_t mstride,
169 const ne10_int32_t nfft)
171 const ne10_int32_t in_step = nfft >> 2;
172 ne10_int32_t f_count;
174 ne10_float32_t scratch_in [4];
175 ne10_float32_t scratch_out[4];
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
178 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
179 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
/* NOTE(review): the per-iteration pointer updates are not visible in this excerpt. */
182 for (f_count = fstride; f_count; f_count --)
/* Gather four inputs that lie in_step apart. */
184 scratch_in[0] = Fin_r[0 * in_step];
185 scratch_in[1] = Fin_r[1 * in_step];
186 scratch_in[2] = Fin_r[2 * in_step];
187 scratch_in[3] = Fin_r[3 * in_step];
/* The macro fills scratch_out from scratch_in. */
191 NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
/* Store the four results in consecutive output slots. */
195 Fout_r[0] = scratch_out[0];
196 Fout_r[1] = scratch_out[1];
197 Fout_r[2] = scratch_out[2];
198 Fout_r[3] = scratch_out[3];
/*
 * Fragment of a 4-input / 4-output real-valued stage built on the
 * NE10_FFT_C2R_RCR_4R macro (defined outside this excerpt); it reads
 * consecutive inputs and writes strided outputs. The head of the signature,
 * the braces and some statements are not visible here.
 * NOTE(review): presumably this is ne10_radix4_c2r_c, which is called later in
 * this file with (Fout, buffer, fstride, mstride, nfft) -- confirm.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - not referenced in the visible lines.
 *   nfft    - nfft >> 2 is the output stride; 1.0 / nfft is the optional scale.
 */
207 const ne10_int32_t fstride,
208 const ne10_int32_t mstride,
209 const ne10_int32_t nfft)
211 ne10_int32_t f_count;
212 const ne10_int32_t in_step = nfft >> 2;
213 ne10_float32_t scratch_in [4];
214 ne10_float32_t scratch_out[4];
/* Only applied when NE10_DSP_RFFT_SCALING is defined (see below). */
216 const ne10_float32_t one_by_N = 1.0 / nfft;
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
219 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
220 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
/* NOTE(review): the per-iteration pointer updates are not visible in this excerpt. */
222 for (f_count = fstride; f_count; f_count --)
/* Load four consecutive inputs. */
224 scratch_in[0] = Fin_r[0];
225 scratch_in[1] = Fin_r[1];
226 scratch_in[2] = Fin_r[2];
227 scratch_in[3] = Fin_r[3];
/* The macro fills scratch_out from scratch_in. */
231 NE10_FFT_C2R_RCR_4R(scratch_out,scratch_in);
/* Optional scaling by 1 / nfft before the store; the matching #endif is not visible here. */
235 #if defined(NE10_DSP_RFFT_SCALING)
236 scratch_out[0] *= one_by_N;
237 scratch_out[1] *= one_by_N;
238 scratch_out[2] *= one_by_N;
239 scratch_out[3] *= one_by_N;
/* Scatter the four results at a stride of in_step. */
243 Fout_r[0 * in_step] = scratch_out[0];
244 Fout_r[1 * in_step] = scratch_out[1];
245 Fout_r[2 * in_step] = scratch_out[2];
246 Fout_r[3 * in_step] = scratch_out[3];
/*
 * First butterfly of the radix-4 real-to-complex twiddle stage.
 *
 * Reads four real inputs spaced in_step apart from Fin_r, passes them through
 * the NE10_FFT_R2C_4R_RCR macro (defined outside this excerpt) and stores the
 * four results at offsets derived from out_step in Fout_r.
 *
 * NOTE(review): the final parameter line, the braces and possibly other lines
 * are not visible in this excerpt; callers in this file pass a fifth argument
 * (a twiddle pointer), which the visible body does not use.
 */
253 NE10_INLINE
void ne10_radix4_r2c_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
254 const ne10_float32_t *Fin_r,
255 const ne10_int32_t out_step,
256 const ne10_int32_t in_step,
259 ne10_float32_t scratch_out[4];
260 ne10_float32_t scratch_in [4];
/* Gather four inputs that lie in_step apart. */
263 scratch_in[0] = Fin_r[0 * in_step];
264 scratch_in[1] = Fin_r[1 * in_step];
265 scratch_in[2] = Fin_r[2 * in_step];
266 scratch_in[3] = Fin_r[3 * in_step];
/* The macro fills scratch_out from scratch_in. */
270 NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
/* Outputs land at 0, 2*out_step - 1, 2*out_step and 4*out_step - 1. */
275 Fout_r[ 0] = scratch_out[0];
276 Fout_r[ (out_step << 1) - 1] = scratch_out[1];
277 Fout_r[ (out_step << 1) ] = scratch_out[2];
278 Fout_r[2 * (out_step << 1) - 1] = scratch_out[3];
/*
 * First butterfly of the radix-4 complex-to-real twiddle stage.
 *
 * Reads four real values from Fin_r at offsets derived from out_step (the same
 * offsets the r2c first butterfly writes to), combines them with additions and
 * subtractions only, and stores the four results in_step apart in Fout_r.
 *
 * NOTE(review): the final parameter line, the braces and possibly other lines
 * are not visible in this excerpt; callers in this file pass a fifth argument
 * (a twiddle pointer), which the visible body does not use.
 */
281 NE10_INLINE
void ne10_radix4_c2r_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
282 const ne10_float32_t *Fin_r,
283 const ne10_int32_t out_step,
284 const ne10_int32_t in_step,
/* Only scratch[0..3] are used in the visible lines. */
287 ne10_float32_t scratch [8];
288 ne10_float32_t scratch_in_r [4];
289 ne10_float32_t scratch_out_r[4];
/* Inputs come from 0, 2*out_step - 1, 2*out_step and 4*out_step - 1. */
292 scratch_in_r[0] = Fin_r[0 ];
293 scratch_in_r[1] = Fin_r[1*(out_step<<1)-1];
294 scratch_in_r[2] = Fin_r[1*(out_step<<1) ];
295 scratch_in_r[3] = Fin_r[2*(out_step<<1)-1];
/* Sum/difference of the outer pair; the two inner inputs are doubled. */
300 scratch[0] = scratch_in_r[0] + scratch_in_r[3];
301 scratch[1] = scratch_in_r[0] - scratch_in_r[3];
302 scratch[2] = scratch_in_r[1] + scratch_in_r[1];
303 scratch[3] = scratch_in_r[2] + scratch_in_r[2];
305 scratch_out_r[0] = scratch[0] + scratch[2];
306 scratch_out_r[1] = scratch[1] - scratch[3];
307 scratch_out_r[2] = scratch[0] - scratch[2];
308 scratch_out_r[3] = scratch[1] + scratch[3];
/* Scatter the four results at a stride of in_step. */
313 Fout_r[0 * in_step] = scratch_out_r[0];
314 Fout_r[1 * in_step] = scratch_out_r[1];
315 Fout_r[2 * in_step] = scratch_out_r[2];
316 Fout_r[3 * in_step] = scratch_out_r[3];
/*
 * Middle ("other") butterflies of the radix-4 real-to-complex twiddle stage.
 *
 * Each iteration loads three twiddles and four (r, i) input pairs, multiplies
 * inputs 1..3 by their twiddle, applies the NE10_FFT_R2C_CC_CC macro (defined
 * outside this excerpt) and stores two results through Fout_r and two through
 * Fout_b, a second pointer placed further into the same output.
 *
 * NOTE(review): the twiddles parameter line, the declarations of scratch_tw,
 * scratch_in, scratch and scratch_out, the braces and the per-iteration
 * pointer updates are not visible in this excerpt.
 */
320 NE10_INLINE
void ne10_radix4_r2c_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
321 const ne10_float32_t *Fin_r,
322 const ne10_int32_t out_step,
323 const ne10_int32_t in_step,
326 ne10_int32_t m_count;
/* Second output cursor, 4*out_step - 4 elements past Fout_r. */
327 ne10_float32_t *Fout_b = Fout_r + (((out_step<<1)-1)<<1) - 2;
/* Runs (out_step >> 1) - 1 times; m_count counts down to zero. */
332 for (m_count = (out_step >> 1) - 1; m_count; m_count --)
/* Three twiddles, read out_step entries apart. */
334 scratch_tw [0] = twiddles[0 * out_step];
335 scratch_tw [1] = twiddles[1 * out_step];
336 scratch_tw [2] = twiddles[2 * out_step];
/* Four inputs in_step apart; each is an adjacent (r, i) pair of floats. */
338 scratch_in[0].r = Fin_r[0 * in_step ];
339 scratch_in[0].i = Fin_r[0 * in_step + 1];
340 scratch_in[1].r = Fin_r[1 * in_step ];
341 scratch_in[1].i = Fin_r[1 * in_step + 1];
342 scratch_in[2].r = Fin_r[2 * in_step ];
343 scratch_in[2].i = Fin_r[2 * in_step + 1];
344 scratch_in[3].r = Fin_r[3 * in_step ];
345 scratch_in[3].i = Fin_r[3 * in_step + 1];
/* Input 0 is copied unchanged; inputs 1..3 are complex-multiplied by twiddles 0..2. */
350 scratch[0].r = scratch_in[0].r;
351 scratch[0].i = scratch_in[0].i;
352 scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
353 scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;
355 scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
356 scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;
358 scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
359 scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;
/* The macro fills scratch_out from scratch. */
361 NE10_FFT_R2C_CC_CC(scratch_out,scratch);
/* Results 0 and 1 go through Fout_r; results 2 and 3 go through Fout_b. */
366 Fout_r[ 0] = scratch_out[0].r;
367 Fout_r[ 1] = scratch_out[0].i;
368 Fout_r[ (out_step << 1) ] = scratch_out[1].r;
369 Fout_r[ (out_step << 1) + 1] = scratch_out[1].i;
370 Fout_b[ 0] = scratch_out[2].r;
371 Fout_b[ 1] = scratch_out[2].i;
372 Fout_b[- (out_step << 1) ] = scratch_out[3].r;
373 Fout_b[- (out_step << 1) + 1] = scratch_out[3].i;
/*
 * Middle ("other") butterflies of the radix-4 complex-to-real twiddle stage.
 *
 * Each iteration loads three twiddles, reads four (r, i) pairs through Fin_r
 * and Fin_b (a second pointer placed further into the same input), applies the
 * NE10_FFT_C2R_CC_CC macro (defined outside this excerpt), multiplies results
 * 1..3 by the complex conjugate of their twiddle and stores the four results
 * in_step apart in Fout_r.
 *
 * NOTE(review): the twiddles parameter line, the declarations of scratch_tw,
 * scratch_in, scratch and scratch_out, the braces and the per-iteration
 * pointer updates are not visible in this excerpt.
 */
383 NE10_INLINE
void ne10_radix4_c2r_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
384 const ne10_float32_t *Fin_r,
385 const ne10_int32_t out_step,
386 const ne10_int32_t in_step,
389 ne10_int32_t m_count;
/* Second input cursor, 4*out_step - 4 elements past Fin_r. */
390 const ne10_float32_t *Fin_b = Fin_r + (((out_step<<1)-1)<<1) - 2;
/* Runs (out_step >> 1) - 1 times; m_count counts down to zero. */
396 for (m_count = (out_step >> 1) - 1; m_count; m_count --)
/* Three twiddles, read out_step entries apart. */
398 scratch_tw[0] = twiddles[0 * out_step];
399 scratch_tw[1] = twiddles[1 * out_step];
400 scratch_tw[2] = twiddles[2 * out_step];
/* Inputs 0 and 2 come through Fin_r; inputs 1 and 3 come through Fin_b. */
402 scratch_in[0].r = Fin_r[0];
403 scratch_in[0].i = Fin_r[1];
405 scratch_in[1].r = Fin_b[0];
406 scratch_in[1].i = Fin_b[1];
408 scratch_in[2].r = Fin_r[(out_step<<1) + 0];
409 scratch_in[2].i = Fin_r[(out_step<<1) + 1];
411 scratch_in[3].r = Fin_b[-(out_step<<1) + 0];
412 scratch_in[3].i = Fin_b[-(out_step<<1) + 1];
/* The macro fills scratch from scratch_in. */
417 NE10_FFT_C2R_CC_CC(scratch,scratch_in);
/* Result 0 is copied unchanged; results 1..3 are multiplied by conj(twiddle 0..2). */
420 scratch_out[0] = scratch[0];
422 scratch_out[1].r = scratch[1].r * scratch_tw[0].r + scratch[1].i * scratch_tw[0].i;
423 scratch_out[1].i = scratch[1].i * scratch_tw[0].r - scratch[1].r * scratch_tw[0].i;
425 scratch_out[2].r = scratch[2].r * scratch_tw[1].r + scratch[2].i * scratch_tw[1].i;
426 scratch_out[2].i = scratch[2].i * scratch_tw[1].r - scratch[2].r * scratch_tw[1].i;
428 scratch_out[3].r = scratch[3].r * scratch_tw[2].r + scratch[3].i * scratch_tw[2].i;
429 scratch_out[3].i = scratch[3].i * scratch_tw[2].r - scratch[3].r * scratch_tw[2].i;
/* Store the four (r, i) pairs at a stride of in_step. */
434 Fout_r[0 * in_step ] = scratch_out[0].r;
435 Fout_r[0 * in_step + 1] = scratch_out[0].i;
436 Fout_r[1 * in_step ] = scratch_out[1].r;
437 Fout_r[1 * in_step + 1] = scratch_out[1].i;
438 Fout_r[2 * in_step ] = scratch_out[2].r;
439 Fout_r[2 * in_step + 1] = scratch_out[2].i;
440 Fout_r[3 * in_step ] = scratch_out[3].r;
441 Fout_r[3 * in_step + 1] = scratch_out[3].i;
/*
 * Last butterfly of the radix-4 real-to-complex twiddle stage.
 *
 * Reads four real inputs spaced in_step apart from Fin_r, passes them through
 * the NE10_FFT_R2C_4R_CC macro (defined outside this excerpt) and stores the
 * results as two adjacent pairs: one at Fout_r[0..1] and one starting at
 * Fout_r[2 * out_step].
 *
 * NOTE(review): the final parameter line, the braces and possibly other lines
 * are not visible in this excerpt; callers in this file pass a fifth argument
 * (a twiddle pointer), which the visible body does not use.
 */
451 NE10_INLINE
void ne10_radix4_r2c_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
452 const ne10_float32_t *Fin_r,
453 const ne10_int32_t out_step,
454 const ne10_int32_t in_step,
457 ne10_float32_t scratch_in [4];
458 ne10_float32_t scratch_out[4];
/* Gather four inputs that lie in_step apart. */
460 scratch_in[0] = Fin_r[0 * in_step];
461 scratch_in[1] = Fin_r[1 * in_step];
462 scratch_in[2] = Fin_r[2 * in_step];
463 scratch_in[3] = Fin_r[3 * in_step];
/* The macro fills scratch_out from scratch_in. */
467 NE10_FFT_R2C_4R_CC(scratch_out,scratch_in);
/* Two adjacent pairs: offsets 0, 1 and 2*out_step, 2*out_step + 1. */
471 Fout_r[ 0] = scratch_out[0];
472 Fout_r[ 1] = scratch_out[1];
473 Fout_r[ (out_step << 1) ] = scratch_out[2];
474 Fout_r[ (out_step << 1) + 1] = scratch_out[3];
/*
 * Last butterfly of the radix-4 complex-to-real twiddle stage.
 *
 * Reads two adjacent pairs from Fin_r (offsets 0, 1 and 2*out_step,
 * 2*out_step + 1), passes them through the NE10_FFT_C2R_CC_4R macro (defined
 * outside this excerpt) and stores the four results in_step apart in Fout_r.
 *
 * NOTE(review): the final parameter line, the braces and possibly other lines
 * are not visible in this excerpt; callers in this file pass a fifth argument
 * (a twiddle pointer), which the visible body does not use.
 */
477 NE10_INLINE
void ne10_radix4_c2r_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
478 const ne10_float32_t *Fin_r,
479 const ne10_int32_t out_step,
480 const ne10_int32_t in_step,
484 ne10_float32_t scratch_in [4];
485 ne10_float32_t scratch_out[4];
/* Load the two adjacent input pairs. */
488 scratch_in[0] = Fin_r[ 0];
489 scratch_in[1] = Fin_r[ 1];
490 scratch_in[2] = Fin_r[ (out_step << 1) ];
491 scratch_in[3] = Fin_r[ (out_step << 1) + 1];
/* The macro fills scratch_out from scratch_in. */
495 NE10_FFT_C2R_CC_4R(scratch_out,scratch_in);
/* Scatter the four results at a stride of in_step. */
500 Fout_r[0 * in_step] = scratch_out[0];
501 Fout_r[1 * in_step] = scratch_out[1];
502 Fout_r[2 * in_step] = scratch_out[2];
503 Fout_r[3 * in_step] = scratch_out[3];
/*
 * Fragment of the driver for one radix-4 real-to-complex twiddle stage: per
 * loop iteration it runs the first, other and last butterfly helpers in turn.
 * The head of the signature, the trailing parameter, the declaration of tw,
 * the braces and some statements are not visible in this excerpt.
 * NOTE(review): presumably this is ne10_radix4_r2c_with_twiddles_c, which is
 * called later in this file with six arguments ending in twiddles -- confirm.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - used as out_step.
 *   nfft    - only nfft >> 2 is used, as in_step.
 */
508 const ne10_int32_t fstride,
509 const ne10_int32_t mstride,
510 const ne10_int32_t nfft,
513 ne10_int32_t f_count;
514 const ne10_int32_t in_step = nfft >> 2;
515 const ne10_int32_t out_step = mstride;
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
517 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
518 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
524 for (f_count = fstride; f_count; f_count --)
/* NOTE(review): any pointer updates between these helper calls beyond those shown are not visible here. */
529 ne10_radix4_r2c_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
536 ne10_radix4_r2c_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
/* Advance by the (out_step >> 1) - 1 iterations the "other" helper loops over: one twiddle and two floats each. */
539 tw += ( (out_step >> 1) - 1);
540 Fin_r += 2 * ( (out_step >> 1) - 1);
541 Fout_r += 2 * ( (out_step >> 1) - 1);
544 ne10_radix4_r2c_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
/* Additional output advance of 3 * out_step at the end of each iteration. */
549 Fout_r += 3 * out_step;
/*
 * Fragment of the driver for one radix-4 complex-to-real twiddle stage: per
 * loop iteration it runs the first, other and last butterfly helpers in turn.
 * The head of the signature, the trailing parameter, the declaration of tw,
 * the braces and some statements are not visible in this excerpt.
 * NOTE(review): presumably this is ne10_radix4_c2r_with_twiddles_c, which is
 * called later in this file with six arguments ending in twiddles -- confirm.
 *
 * Visible parameters:
 *   fstride - trip count of the loop below (f_count counts down from it).
 *   mstride - used as out_step.
 *   nfft    - only nfft >> 2 is used, as in_step.
 */
555 const ne10_int32_t fstride,
556 const ne10_int32_t mstride,
557 const ne10_int32_t nfft,
560 ne10_int32_t f_count;
561 const ne10_int32_t in_step = nfft >> 2;
562 const ne10_int32_t out_step = mstride;
/* Fin and Fout are reinterpreted as arrays of scalar ne10_float32_t values. */
564 const ne10_float32_t *Fin_r = (ne10_float32_t*) Fin;
565 ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
568 for (f_count = fstride; f_count; f_count --)
/* NOTE(review): any pointer updates between these helper calls beyond those shown are not visible here. */
573 ne10_radix4_c2r_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
580 ne10_radix4_c2r_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
/* Advance by the (out_step >> 1) - 1 iterations the "other" helper loops over: one twiddle and two floats each. */
583 tw += ( (out_step >> 1) - 1);
584 Fin_r += 2 * ( (out_step >> 1) - 1);
585 Fout_r += 2 * ( (out_step >> 1) - 1);
588 ne10_radix4_c2r_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
/* Additional input advance of 3 * out_step at the end of each iteration. */
593 Fin_r += 3 * out_step;
/*
 * Mixed-radix real-to-complex driver: reads the stage layout from factors,
 * runs a radix-8 or radix-4 first stage, then radix-4 twiddle stages, moving
 * data between Fout and buffer.
 *
 * NOTE(review): most of the parameter list, the declaration of radix, the
 * braces, the conditions selecting radix-8 versus radix-4 and the loop around
 * the twiddle stages are not visible in this excerpt.
 */
597 NE10_INLINE
void ne10_mixed_radix_r2c_butterfly_float32_c (
600 const ne10_int32_t * factors,
606 ne10_int32_t fstride, mstride, nfft;
608 ne10_int32_t stage_count;
/* factors[0] is the stage count and factors[1] the initial fstride; mstride and radix come from the entries at 2*stage_count - 1 and 2*stage_count. */
611 stage_count = factors[0];
612 fstride = factors[1];
613 mstride = factors[ (stage_count << 1) - 1 ];
614 radix = factors[ stage_count << 1 ];
615 nfft = radix * fstride;
/* With an even stage count the buffer and Fout pointers are exchanged up front (ne10_swap_ptr is defined outside this excerpt). */
628 if (stage_count % 2 == 0)
630 ne10_swap_ptr (buffer, Fout);
/* First stage without twiddles; the selecting condition is not visible here. */
637 ne10_radix8_r2c_c (Fout, Fin, fstride, mstride, nfft);
642 ne10_radix4_r2c_c (Fout, Fin, fstride, mstride, nfft);
/* Twiddle stage: swap the pointers, then transform buffer into Fout. */
650 ne10_swap_ptr (buffer, Fout);
653 ne10_radix4_r2c_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
/* Skip the 3 * mstride twiddles just used. */
654 twiddles += 3 * mstride;
/*
 * Mixed-radix complex-to-real driver: reads the stage layout from factors,
 * runs radix-4 twiddle stages while stepping the twiddle pointer backwards,
 * then finishes with a radix-8 or radix-4 stage without twiddles.
 *
 * NOTE(review): most of the parameter list, the declaration of radix, the
 * braces, the updates to stage_count / fstride / mstride and the conditions
 * selecting radix-8 versus radix-4 are not visible in this excerpt.
 */
660 NE10_INLINE
void ne10_mixed_radix_c2r_butterfly_float32_c (
663 const ne10_int32_t * factors,
669 ne10_int32_t fstride, mstride, nfft;
671 ne10_int32_t stage_count;
/* factors[0] is the stage count and factors[1] the initial fstride; mstride and radix come from the entries at 2*stage_count - 1 and 2*stage_count. */
674 stage_count = factors[0];
675 fstride = factors[1];
676 mstride = factors[ (stage_count << 1) - 1 ];
677 radix = factors[ stage_count << 1 ];
678 nfft = radix * fstride;
/* With an odd stage count the buffer and Fout pointers are exchanged up front (ne10_swap_ptr is defined outside this excerpt). */
691 if (stage_count % 2 == 1)
693 ne10_swap_ptr (buffer, Fout);
/* Step the twiddle pointer back by 3 * mstride, then transform Fin into buffer. */
699 twiddles -= 3 * mstride;
702 ne10_radix4_c2r_with_twiddles_c (buffer, Fin, fstride, mstride, nfft, twiddles);
/* Remaining twiddle stages: transform buffer into Fout, then swap the pointers. The loop header has no increment clause, so stage_count must change inside the body (not visible here). */
709 for (; stage_count > 1;)
711 twiddles -= 3 * mstride;
714 ne10_radix4_c2r_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
718 ne10_swap_ptr (buffer, Fout);
/* Final stage without twiddles; the selecting condition is not visible here. */
728 ne10_radix8_c2r_c (Fout, buffer, fstride, mstride, nfft);
734 ne10_radix4_c2r_c (Fout, buffer, fstride, mstride, nfft);
/*
 * Fragment of a routine that sizes and initialises a configuration object
 * (st): it lays out sub-buffers, factors nfft and nfft/4, generates twiddle
 * tables and fills st->r_super_twiddles_neon.
 * NOTE(review): the function signature, the allocation itself, several
 * declarations (st, address, result, tw, i, j, k, s), the braces and the
 * error-handling body are not visible in this excerpt.
 */
747 ne10_int32_t ncfft = nfft >> 1;
/* Tail of a size expression; its leading terms are not visible here. */
752 +
sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)
753 +
sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)
757 + NE10_FFT_BYTE_ALIGNMENT;
768 const ne10_float32_t pi = NE10_PI;
769 ne10_float32_t phase1;
/* NE10_BYTE_ALIGNMENT is defined outside this excerpt; presumably it aligns address -- TODO confirm. */
774 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
/* Sub-buffers are laid out back to back: r_twiddles starts nfft elements after buffer, and each factor table follows its twiddle table. */
777 st->r_twiddles = st->buffer + nfft;
778 st->r_factors = (ne10_int32_t*) (st->r_twiddles + nfft);
780 st->r_factors_neon = (ne10_int32_t*) (st->r_twiddles_neon + nfft/4);
/* NOTE(review): the return value of this ne10_factor call is not captured in the visible line. */
789 ne10_factor (nfft, st->r_factors, NE10_FACTOR_DEFAULT);
792 st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft);
/* Second factorisation, of nfft/4; its result is checked against NE10_ERR (handler body not visible). */
795 result = ne10_factor (nfft/4, st->r_factors_neon, NE10_FACTOR_DEFAULT);
796 if (result == NE10_ERR)
802 st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
/* First 12 entries: tw[4*(i-1) + j] = (cos, sin) of -2*pi*i*j/nfft for i = 1..3, j = 0..3. */
808 tw = st->r_super_twiddles_neon;
809 for (i = 1; i < 4; i ++)
811 for (j = 0; j < 4; j++)
813 phase1 = - 2 * pi * ( (ne10_float32_t) (i * j) / nfft);
814 tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
815 tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
/* Later groups of 12: for k = 1..nfft/32 - 1, tw[12*k + 4*(s-1) + j] = (cos, sin) of -2*pi*(4*k + j)*s/nfft. */
822 for (k=1; k<nfft/32; k++)
825 for (s = 1; s < 4; s++)
827 for (j = 0; j < 4; j++)
829 phase1 = - 2 * pi * ( (ne10_float32_t) ((k*4+j) * s) / nfft);
830 tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
831 tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
/*
 * Fragment of a real-to-complex entry point: the mixed-radix driver is
 * invoked (its arguments are not visible in this excerpt), then two output
 * bins are patched. NOTE(review): the enclosing signature and the lines
 * between these statements are not visible here.
 */
860 ne10_mixed_radix_r2c_butterfly_float32_c (
/* Move the value stored in the imaginary slot of bin 0 into its real slot. */
869 fout[0].r = fout[0].i;
/* Clear the imaginary part of bin nfft / 2. */
871 fout[(cfg->nfft) >> 1].i = 0.0f;
/*
 * Fragment of a complex-to-real entry point: the mixed-radix c2r driver is
 * invoked with cfg->r_twiddles_backward among its arguments. NOTE(review): the
 * enclosing signature and the remaining arguments are not visible here.
 */
898 ne10_mixed_radix_c2r_butterfly_float32_c (
902 cfg->r_twiddles_backward,
914 #endif // NE10_UNROLL_LEVEL