47#include "NE10_types.h"
48#include "NE10_macros.h"
/*
 * NOTE(review): sampled fragment of the FORWARD mixed-radix (radix-2/4/8)
 * complex FFT butterfly for Q31 fixed-point data.  Many original lines are
 * elided (signature head, loop braces, pointer advances), so the code is
 * annotated in place and left byte-identical.
 * Visible parameters: factors — plan-time factor/stride table;
 * scaled_flag — non-zero requests per-stage down-scaling (NE10_F2I32_FIXDIV)
 * to avoid fixed-point overflow.
 */
54 ne10_int32_t * factors,
57 ne10_int32_t scaled_flag)
59 ne10_int32_t fstride, mstride, N;
60 ne10_int32_t fstride1;
61 ne10_int32_t f_count, m_count;
62 ne10_int32_t stage_count;
/* cos(pi/4) in Q31 format: round(0.70710678 * 2^31), and its negation. */
73 const ne10_int32_t TW_81 = 1518500249;
74 const ne10_int32_t TW_81N = -1518500249;
/* factors[] layout (as used below): factors[0] = stage count; the pair at
 * the end of the table yields the last-stage mstride and N.
 * NOTE(review): full layout is defined by ne10_factor — confirm there. */
79 stage_count = factors[0];
81 mstride = factors[ (stage_count << 1) - 1 ];
82 N = factors[ stage_count << 1 ];
/* ---- first stage, radix-8 path: 8-point butterflies, fstride1 = fstride/4.
 * Inputs are read at strides of fstride1 and fstride; outputs are written
 * contiguously in groups of 8. ---- */
92 fstride1 = fstride >> 2;
95 for (f_count = 0; f_count < fstride1; f_count ++)
97 Fout1 = & Fout[ f_count * 8 ];
/* Scaled mode (guard elided here): pre-divide all eight inputs by 8 (the
 * radix) so the butterfly sums cannot overflow 32 bits. */
101 NE10_F2I32_FIXDIV (Fin1[0], 8);
102 NE10_F2I32_FIXDIV (Fin1[0 + fstride], 8);
103 NE10_F2I32_FIXDIV (Fin1[fstride1], 8);
104 NE10_F2I32_FIXDIV (Fin1[fstride1 + fstride], 8);
105 NE10_F2I32_FIXDIV (Fin1[fstride1 * 2], 8);
106 NE10_F2I32_FIXDIV (Fin1[fstride1 * 2 + fstride], 8);
107 NE10_F2I32_FIXDIV (Fin1[fstride1 * 3], 8);
108 NE10_F2I32_FIXDIV (Fin1[fstride1 * 3 + fstride], 8);
/* Radix-2 layer: sums (even slots) and differences (odd slots) of the
 * four input pairs that sit fstride apart. */
110 scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r;
111 scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
112 scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
113 scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
114 scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
115 scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
116 scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
117 scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
118 scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
119 scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
120 scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
121 scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
122 scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
123 scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
124 scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
125 scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;
/* Apply the fixed radix-8 twiddles (forward FFT): 1, e^-j*pi/4, -j,
 * e^-j*3pi/4 on the odd-slot (difference) terms. */
128 scratch[0] = scratch_in[0];
129 scratch[1] = scratch_in[1];
131 scratch[2] = scratch_in[2];
/* scratch_in[3] * e^{-j*pi/4}: 64-bit Q31 product shifted back by 31. */
132 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r + scratch_in[3].i) * TW_81) >> 31);
133 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i - scratch_in[3].r) * TW_81) >> 31);
135 scratch[4] = scratch_in[4];
/* scratch_in[5] * (-j): (r, i) -> (i, -r). */
136 scratch[5].r = scratch_in[5].i;
137 scratch[5].i = -scratch_in[5].r;
139 scratch[6].r = scratch_in[6].r;
140 scratch[6].i = scratch_in[6].i;
/* scratch_in[7] * e^{-j*3pi/4} via the negated Q31 constant. */
141 scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r - scratch_in[7].i) * TW_81N) >> 31);
142 scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i + scratch_in[7].r) * TW_81N) >> 31);
/* Second radix-2 layer: combine halves 0..3 with 4..7. */
145 scratch[8].r = scratch[0].r + scratch[4].r;
146 scratch[8].i = scratch[0].i + scratch[4].i;
147 scratch[9].r = scratch[1].r + scratch[5].r;
148 scratch[9].i = scratch[1].i + scratch[5].i;
150 scratch[10].r = scratch[0].r - scratch[4].r;
151 scratch[10].i = scratch[0].i - scratch[4].i;
152 scratch[11].r = scratch[1].r - scratch[5].r;
153 scratch[11].i = scratch[1].i - scratch[5].i;
156 scratch[12].r = scratch[2].r + scratch[6].r;
157 scratch[12].i = scratch[2].i + scratch[6].i;
158 scratch[13].r = scratch[3].r + scratch[7].r;
159 scratch[13].i = scratch[3].i + scratch[7].i;
161 scratch[14].r = scratch[2].r - scratch[6].r;
162 scratch[14].i = scratch[2].i - scratch[6].i;
163 scratch[15].r = scratch[3].r - scratch[7].r;
164 scratch[15].i = scratch[3].i - scratch[7].i;
/* Final combine into the 8 outputs. */
167 scratch_out[4].r = scratch[8].r - scratch[12].r;
168 scratch_out[4].i = scratch[8].i - scratch[12].i;
169 scratch_out[5].r = scratch[9].r - scratch[13].r;
170 scratch_out[5].i = scratch[9].i - scratch[13].i;
173 scratch_out[0].r = scratch[8].r + scratch[12].r;
174 scratch_out[0].i = scratch[8].i + scratch[12].i;
175 scratch_out[1].r = scratch[9].r + scratch[13].r;
176 scratch_out[1].i = scratch[9].i + scratch[13].i;
/* Forward-direction cross terms: multiply the difference branch by -j,
 * i.e. (+imag, -real) orientation below. */
179 scratch_out[2].r = scratch[10].r + scratch[14].i;
180 scratch_out[2].i = scratch[10].i - scratch[14].r;
181 scratch_out[3].r = scratch[11].r + scratch[15].i;
182 scratch_out[3].i = scratch[11].i - scratch[15].r;
185 scratch_out[6].r = scratch[10].r - scratch[14].i;
186 scratch_out[6].i = scratch[10].i + scratch[14].r;
187 scratch_out[7].r = scratch[11].r - scratch[15].i;
188 scratch_out[7].i = scratch[11].i + scratch[15].r;
/* Store the 8 results contiguously. */
191 Fout1[0] = scratch_out[0];
192 Fout1[1] = scratch_out[1];
193 Fout1[2] = scratch_out[2];
194 Fout1[3] = scratch_out[3];
195 Fout1[4] = scratch_out[4];
196 Fout1[5] = scratch_out[5];
197 Fout1[6] = scratch_out[6];
198 Fout1[7] = scratch_out[7];
/* ---- first stage, radix-4 path (no twiddle factors needed in stage 1).
 * NOTE(review): selection between the radix-8 and radix-4 first stage is
 * in elided lines. ---- */
215 for (f_count = fstride; f_count ; f_count --)
218 scratch_in[0] = *Fin1;
219 Fin2 = Fin1 + fstride;
220 scratch_in[1] = *Fin2;
221 Fin2 = Fin2 + fstride;
222 scratch_in[2] = *Fin2;
223 Fin2 = Fin2 + fstride;
224 scratch_in[3] = *Fin2;
/* Scaled mode: pre-divide the four inputs by the radix (4). */
227 if (scaled_flag == 1)
229 NE10_F2I32_FIXDIV (scratch_in[0], 4);
230 NE10_F2I32_FIXDIV (scratch_in[1], 4);
231 NE10_F2I32_FIXDIV (scratch_in[2], 4);
232 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Radix-4 butterfly as two radix-2 layers. */
236 scratch[0].r = scratch_in[0].r + scratch_in[2].r;
237 scratch[0].i = scratch_in[0].i + scratch_in[2].i;
239 scratch[1].r = scratch_in[0].r - scratch_in[2].r;
240 scratch[1].i = scratch_in[0].i - scratch_in[2].i;
243 scratch[2].r = scratch_in[1].r + scratch_in[3].r;
244 scratch[2].i = scratch_in[1].i + scratch_in[3].i;
246 scratch[3].r = scratch_in[1].r - scratch_in[3].r;
247 scratch[3].i = scratch_in[1].i - scratch_in[3].i;
250 scratch_out[2].r = scratch[0].r - scratch[2].r;
251 scratch_out[2].i = scratch[0].i - scratch[2].i;
254 scratch_out[0].r = scratch[0].r + scratch[2].r;
255 scratch_out[0].i = scratch[0].i + scratch[2].i;
/* Forward: scratch[3] * (-j) folded into outputs 1 and 3. */
258 scratch_out[1].r = scratch[1].r + scratch[3].i;
259 scratch_out[1].i = scratch[1].i - scratch[3].r;
262 scratch_out[3].r = scratch[1].r - scratch[3].i;
263 scratch_out[3].i = scratch[1].i + scratch[3].r;
266 * Fout1 ++ = scratch_out[0];
267 * Fout1 ++ = scratch_out[1];
268 * Fout1 ++ = scratch_out[2];
269 * Fout1 ++ = scratch_out[3];
/* ---- intermediate stages: radix-4 butterflies with per-column twiddle
 * factors; each stage quadruples mstride and quarters fstride (updates are
 * in elided lines). ---- */
289 for (; stage_count > 1 ; stage_count--)
292 for (f_count = 0; f_count < fstride; f_count ++)
/* "f_count * mstride << 2" parses as "(f_count * mstride) << 2":
 * each f_count block spans 4*mstride outputs. */
294 Fout1 = & Fout[ f_count * mstride << 2 ];
296 for (m_count = mstride; m_count ; m_count --)
/* NOTE(review): tw1/tw2 pointer stepping occurs in elided lines —
 * confirm the twiddle walk against the original file. */
299 scratch_tw[0] = *tw1;
301 scratch_tw[1] = *tw2;
303 scratch_tw[2] = *tw2;
304 scratch_in[0] = * Fin1;
306 scratch_in[1] = * Fin2;
308 scratch_in[2] = * Fin2;
310 scratch_in[3] = * Fin2;
311 if (scaled_flag == 1)
313 NE10_F2I32_FIXDIV (scratch_in[0], 4);
314 NE10_F2I32_FIXDIV (scratch_in[1], 4);
315 NE10_F2I32_FIXDIV (scratch_in[2], 4);
316 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Complex multiply by the twiddles (forward FFT sign convention:
 * (a+jb)(c+jd) = (ac - bd) + j(bc + ad)), Q31 products kept in 64 bits
 * and shifted back by 31. */
321 scratch[0] = scratch_in[0];
322 scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
323 - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
324 scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
325 + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
327 scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
328 - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
329 scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
330 + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
332 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
333 - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
334 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
335 + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
/* Radix-4 combine of the twiddled inputs. */
338 scratch[4].r = scratch[0].r + scratch[2].r;
339 scratch[4].i = scratch[0].i + scratch[2].i;
341 scratch[5].r = scratch[0].r - scratch[2].r;
342 scratch[5].i = scratch[0].i - scratch[2].i;
345 scratch[6].r = scratch[1].r + scratch[3].r;
346 scratch[6].i = scratch[1].i + scratch[3].i;
348 scratch[7].r = scratch[1].r - scratch[3].r;
349 scratch[7].i = scratch[1].i - scratch[3].i;
352 scratch_out[2].r = scratch[4].r - scratch[6].r;
353 scratch_out[2].i = scratch[4].i - scratch[6].i;
356 scratch_out[0].r = scratch[4].r + scratch[6].r;
357 scratch_out[0].i = scratch[4].i + scratch[6].i;
/* Forward: -j on the cross terms. */
360 scratch_out[1].r = scratch[5].r + scratch[7].i;
361 scratch_out[1].i = scratch[5].i - scratch[7].r;
364 scratch_out[3].r = scratch[5].r - scratch[7].i;
365 scratch_out[3].i = scratch[5].i + scratch[7].r;
/* Outputs are strided mstride apart (Fout2 advances elided between
 * stores). */
368 *Fout1 = scratch_out[0];
369 Fout2 = Fout1 + mstride;
370 *Fout2 = scratch_out[1];
372 *Fout2 = scratch_out[2];
374 *Fout2 = scratch_out[3];
/* ---- last stage: identical radix-4 twiddle butterfly; kept as a separate
 * loop in the original (different input/output buffer roles, per the
 * elided setup lines). ---- */
397 for (f_count = 0; f_count < fstride; f_count ++)
400 for (m_count = mstride; m_count ; m_count --)
403 scratch_tw[0] = *tw1;
405 scratch_tw[1] = *tw2;
407 scratch_tw[2] = *tw2;
408 scratch_in[0] = * Fin1;
410 scratch_in[1] = * Fin2;
412 scratch_in[2] = * Fin2;
414 scratch_in[3] = * Fin2;
415 if (scaled_flag == 1)
417 NE10_F2I32_FIXDIV (scratch_in[0], 4);
418 NE10_F2I32_FIXDIV (scratch_in[1], 4);
419 NE10_F2I32_FIXDIV (scratch_in[2], 4);
420 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Same forward-sign complex multiply as the intermediate stages. */
425 scratch[0] = scratch_in[0];
426 scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
427 - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
428 scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
429 + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
431 scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
432 - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
433 scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
434 + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
436 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
437 - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
438 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
439 + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
442 scratch[4].r = scratch[0].r + scratch[2].r;
443 scratch[4].i = scratch[0].i + scratch[2].i;
445 scratch[5].r = scratch[0].r - scratch[2].r;
446 scratch[5].i = scratch[0].i - scratch[2].i;
449 scratch[6].r = scratch[1].r + scratch[3].r;
450 scratch[6].i = scratch[1].i + scratch[3].i;
452 scratch[7].r = scratch[1].r - scratch[3].r;
453 scratch[7].i = scratch[1].i - scratch[3].i;
456 scratch_out[2].r = scratch[4].r - scratch[6].r;
457 scratch_out[2].i = scratch[4].i - scratch[6].i;
460 scratch_out[0].r = scratch[4].r + scratch[6].r;
461 scratch_out[0].i = scratch[4].i + scratch[6].i;
464 scratch_out[1].r = scratch[5].r + scratch[7].i;
465 scratch_out[1].i = scratch[5].i - scratch[7].r;
468 scratch_out[3].r = scratch[5].r - scratch[7].i;
469 scratch_out[3].i = scratch[5].i + scratch[7].r;
472 *Fout1 = scratch_out[0];
474 *Fout2 = scratch_out[1];
476 *Fout2 = scratch_out[2];
478 *Fout2 = scratch_out[3];
/*
 * NOTE(review): sampled fragment of the INVERSE mixed-radix (radix-2/4/8)
 * complex FFT butterfly for Q31 fixed-point data.  It mirrors the forward
 * routine above with conjugated twiddles: every -j becomes +j, the fixed
 * radix-8 rotations flip sign, and the complex multiplies use
 * (ac + bd) + j(bc - ad).  Many original lines are elided; code is
 * annotated in place and left byte-identical.
 */
490 ne10_int32_t * factors,
493 ne10_int32_t scaled_flag)
495 ne10_int32_t fstride, mstride, N;
496 ne10_int32_t fstride1;
497 ne10_int32_t f_count, m_count;
498 ne10_int32_t stage_count;
/* cos(pi/4) in Q31: round(0.70710678 * 2^31), and its negation. */
509 const ne10_int32_t TW_81 = 1518500249;
510 const ne10_int32_t TW_81N = -1518500249;
/* Unpack the plan: stage count, first-stage stride, last-stage mstride, N. */
513 stage_count = factors[0];
514 fstride = factors[1];
515 mstride = factors[ (stage_count << 1) - 1 ];
516 N = factors[ stage_count << 1 ];
/* ---- first stage, radix-8 path ---- */
526 fstride1 = fstride >> 2;
529 for (f_count = 0; f_count < fstride1; f_count ++)
531 Fout1 = & Fout[ f_count * 8 ];
/* Scaled mode: pre-divide all eight inputs by the radix (8). */
533 if (scaled_flag == 1)
535 NE10_F2I32_FIXDIV (Fin1[0], 8);
536 NE10_F2I32_FIXDIV (Fin1[0 + fstride], 8);
537 NE10_F2I32_FIXDIV (Fin1[fstride1], 8);
538 NE10_F2I32_FIXDIV (Fin1[fstride1 + fstride], 8);
539 NE10_F2I32_FIXDIV (Fin1[fstride1 * 2], 8);
540 NE10_F2I32_FIXDIV (Fin1[fstride1 * 2 + fstride], 8);
541 NE10_F2I32_FIXDIV (Fin1[fstride1 * 3], 8);
542 NE10_F2I32_FIXDIV (Fin1[fstride1 * 3 + fstride], 8);
/* Radix-2 layer: sums/differences of pairs fstride apart. */
545 scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r;
546 scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
547 scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
548 scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
549 scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
550 scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
551 scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
552 scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
553 scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
554 scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
555 scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
556 scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
557 scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
558 scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
559 scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
560 scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;
/* Fixed radix-8 twiddles, conjugated for the inverse transform:
 * 1, e^{+j*pi/4}, +j, e^{+j*3pi/4}. */
564 scratch[0] = scratch_in[0];
565 scratch[1] = scratch_in[1];
567 scratch[2] = scratch_in[2];
/* scratch_in[3] * e^{+j*pi/4}, Q31 product >> 31. */
568 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r - scratch_in[3].i) * TW_81) >> 31);
569 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i + scratch_in[3].r) * TW_81) >> 31);
571 scratch[4] = scratch_in[4];
/* scratch_in[5] * (+j): (r, i) -> (-i, r). */
572 scratch[5].r = -scratch_in[5].i;
573 scratch[5].i = scratch_in[5].r;
575 scratch[6].r = scratch_in[6].r;
576 scratch[6].i = scratch_in[6].i;
/* scratch_in[7] * e^{+j*3pi/4}. */
577 scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r + scratch_in[7].i) * TW_81N) >> 31);
578 scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i - scratch_in[7].r) * TW_81N) >> 31);
/* Second radix-2 layer. */
581 scratch[8].r = scratch[0].r + scratch[4].r;
582 scratch[8].i = scratch[0].i + scratch[4].i;
583 scratch[9].r = scratch[1].r + scratch[5].r;
584 scratch[9].i = scratch[1].i + scratch[5].i;
586 scratch[10].r = scratch[0].r - scratch[4].r;
587 scratch[10].i = scratch[0].i - scratch[4].i;
588 scratch[11].r = scratch[1].r - scratch[5].r;
589 scratch[11].i = scratch[1].i - scratch[5].i;
592 scratch[12].r = scratch[2].r + scratch[6].r;
593 scratch[12].i = scratch[2].i + scratch[6].i;
594 scratch[13].r = scratch[3].r + scratch[7].r;
595 scratch[13].i = scratch[3].i + scratch[7].i;
597 scratch[14].r = scratch[2].r - scratch[6].r;
598 scratch[14].i = scratch[2].i - scratch[6].i;
599 scratch[15].r = scratch[3].r - scratch[7].r;
600 scratch[15].i = scratch[3].i - scratch[7].i;
/* Final combine into the 8 outputs. */
603 scratch_out[4].r = scratch[8].r - scratch[12].r;
604 scratch_out[4].i = scratch[8].i - scratch[12].i;
605 scratch_out[5].r = scratch[9].r - scratch[13].r;
606 scratch_out[5].i = scratch[9].i - scratch[13].i;
609 scratch_out[0].r = scratch[8].r + scratch[12].r;
610 scratch_out[0].i = scratch[8].i + scratch[12].i;
611 scratch_out[1].r = scratch[9].r + scratch[13].r;
612 scratch_out[1].i = scratch[9].i + scratch[13].i;
/* Inverse-direction cross terms: +j orientation (-imag, +real) —
 * the mirror image of the forward routine. */
615 scratch_out[2].r = scratch[10].r - scratch[14].i;
616 scratch_out[2].i = scratch[10].i + scratch[14].r;
617 scratch_out[3].r = scratch[11].r - scratch[15].i;
618 scratch_out[3].i = scratch[11].i + scratch[15].r;
621 scratch_out[6].r = scratch[10].r + scratch[14].i;
622 scratch_out[6].i = scratch[10].i - scratch[14].r;
623 scratch_out[7].r = scratch[11].r + scratch[15].i;
624 scratch_out[7].i = scratch[11].i - scratch[15].r;
/* Store the 8 results contiguously. */
627 Fout1[0] = scratch_out[0];
628 Fout1[1] = scratch_out[1];
629 Fout1[2] = scratch_out[2];
630 Fout1[3] = scratch_out[3];
631 Fout1[4] = scratch_out[4];
632 Fout1[5] = scratch_out[5];
633 Fout1[6] = scratch_out[6];
634 Fout1[7] = scratch_out[7];
/* ---- first stage, radix-4 path (no twiddles). ---- */
651 for (f_count = fstride; f_count ; f_count --)
654 scratch_in[0] = *Fin1;
655 Fin2 = Fin1 + fstride;
656 scratch_in[1] = *Fin2;
657 Fin2 = Fin2 + fstride;
658 scratch_in[2] = *Fin2;
659 Fin2 = Fin2 + fstride;
660 scratch_in[3] = *Fin2;
/* Scaled mode: pre-divide by the radix (4). */
663 if (scaled_flag == 1)
665 NE10_F2I32_FIXDIV (scratch_in[0], 4);
666 NE10_F2I32_FIXDIV (scratch_in[1], 4);
667 NE10_F2I32_FIXDIV (scratch_in[2], 4);
668 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Radix-4 butterfly as two radix-2 layers. */
672 scratch[0].r = scratch_in[0].r + scratch_in[2].r;
673 scratch[0].i = scratch_in[0].i + scratch_in[2].i;
675 scratch[1].r = scratch_in[0].r - scratch_in[2].r;
676 scratch[1].i = scratch_in[0].i - scratch_in[2].i;
679 scratch[2].r = scratch_in[1].r + scratch_in[3].r;
680 scratch[2].i = scratch_in[1].i + scratch_in[3].i;
682 scratch[3].r = scratch_in[1].r - scratch_in[3].r;
683 scratch[3].i = scratch_in[1].i - scratch_in[3].i;
686 scratch_out[2].r = scratch[0].r - scratch[2].r;
687 scratch_out[2].i = scratch[0].i - scratch[2].i;
690 scratch_out[0].r = scratch[0].r + scratch[2].r;
691 scratch_out[0].i = scratch[0].i + scratch[2].i;
/* Inverse: scratch[3] * (+j) folded into outputs 1 and 3. */
694 scratch_out[1].r = scratch[1].r - scratch[3].i;
695 scratch_out[1].i = scratch[1].i + scratch[3].r;
698 scratch_out[3].r = scratch[1].r + scratch[3].i;
699 scratch_out[3].i = scratch[1].i - scratch[3].r;
702 * Fout1 ++ = scratch_out[0];
703 * Fout1 ++ = scratch_out[1];
704 * Fout1 ++ = scratch_out[2];
705 * Fout1 ++ = scratch_out[3];
/* ---- intermediate stages: radix-4 with per-column twiddles (conjugated
 * sign); stride updates and tw1/tw2 stepping are in elided lines. ---- */
725 for (; stage_count > 1 ; stage_count--)
728 for (f_count = 0; f_count < fstride; f_count ++)
/* "(f_count * mstride) << 2": each block spans 4*mstride outputs. */
730 Fout1 = & Fout[ f_count * mstride << 2 ];
732 for (m_count = mstride; m_count ; m_count --)
735 scratch_tw[0] = *tw1;
737 scratch_tw[1] = *tw2;
739 scratch_tw[2] = *tw2;
740 scratch_in[0] = * Fin1;
742 scratch_in[1] = * Fin2;
744 scratch_in[2] = * Fin2;
746 scratch_in[3] = * Fin2;
747 if (scaled_flag == 1)
749 NE10_F2I32_FIXDIV (scratch_in[0], 4);
750 NE10_F2I32_FIXDIV (scratch_in[1], 4);
751 NE10_F2I32_FIXDIV (scratch_in[2], 4);
752 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Complex multiply by the CONJUGATED twiddle (inverse FFT):
 * (a+jb)(c-jd) = (ac + bd) + j(bc - ad); Q31 products >> 31. */
757 scratch[0] = scratch_in[0];
758 scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
759 + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
760 scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
761 - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
763 scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
764 + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
765 scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
766 - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
768 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
769 + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
770 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
771 - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
/* Radix-4 combine of the twiddled inputs. */
774 scratch[4].r = scratch[0].r + scratch[2].r;
775 scratch[4].i = scratch[0].i + scratch[2].i;
777 scratch[5].r = scratch[0].r - scratch[2].r;
778 scratch[5].i = scratch[0].i - scratch[2].i;
781 scratch[6].r = scratch[1].r + scratch[3].r;
782 scratch[6].i = scratch[1].i + scratch[3].i;
784 scratch[7].r = scratch[1].r - scratch[3].r;
785 scratch[7].i = scratch[1].i - scratch[3].i;
788 scratch_out[2].r = scratch[4].r - scratch[6].r;
789 scratch_out[2].i = scratch[4].i - scratch[6].i;
792 scratch_out[0].r = scratch[4].r + scratch[6].r;
793 scratch_out[0].i = scratch[4].i + scratch[6].i;
/* Inverse: +j on the cross terms. */
796 scratch_out[1].r = scratch[5].r - scratch[7].i;
797 scratch_out[1].i = scratch[5].i + scratch[7].r;
800 scratch_out[3].r = scratch[5].r + scratch[7].i;
801 scratch_out[3].i = scratch[5].i - scratch[7].r;
/* Outputs strided mstride apart (Fout2 advances elided). */
804 *Fout1 = scratch_out[0];
805 Fout2 = Fout1 + mstride;
806 *Fout2 = scratch_out[1];
808 *Fout2 = scratch_out[2];
810 *Fout2 = scratch_out[3];
/* ---- last stage: same conjugate-twiddle radix-4 butterfly. ---- */
833 for (f_count = 0; f_count < fstride; f_count ++)
836 for (m_count = mstride; m_count ; m_count --)
839 scratch_tw[0] = *tw1;
841 scratch_tw[1] = *tw2;
843 scratch_tw[2] = *tw2;
844 scratch_in[0] = * Fin1;
846 scratch_in[1] = * Fin2;
848 scratch_in[2] = * Fin2;
850 scratch_in[3] = * Fin2;
851 if (scaled_flag == 1)
853 NE10_F2I32_FIXDIV (scratch_in[0], 4);
854 NE10_F2I32_FIXDIV (scratch_in[1], 4);
855 NE10_F2I32_FIXDIV (scratch_in[2], 4);
856 NE10_F2I32_FIXDIV (scratch_in[3], 4);
/* Conjugated complex multiply, as in the intermediate stages. */
861 scratch[0] = scratch_in[0];
862 scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
863 + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
864 scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
865 - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
867 scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
868 + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
869 scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
870 - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
872 scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
873 + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
874 scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
875 - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
878 scratch[4].r = scratch[0].r + scratch[2].r;
879 scratch[4].i = scratch[0].i + scratch[2].i;
881 scratch[5].r = scratch[0].r - scratch[2].r;
882 scratch[5].i = scratch[0].i - scratch[2].i;
885 scratch[6].r = scratch[1].r + scratch[3].r;
886 scratch[6].i = scratch[1].i + scratch[3].i;
888 scratch[7].r = scratch[1].r - scratch[3].r;
889 scratch[7].i = scratch[1].i - scratch[3].i;
892 scratch_out[2].r = scratch[4].r - scratch[6].r;
893 scratch_out[2].i = scratch[4].i - scratch[6].i;
896 scratch_out[0].r = scratch[4].r + scratch[6].r;
897 scratch_out[0].i = scratch[4].i + scratch[6].i;
900 scratch_out[1].r = scratch[5].r - scratch[7].i;
901 scratch_out[1].i = scratch[5].i + scratch[7].r;
904 scratch_out[3].r = scratch[5].r + scratch[7].i;
905 scratch_out[3].i = scratch[5].i - scratch[7].r;
908 *Fout1 = scratch_out[0];
910 *Fout2 = scratch_out[1];
912 *Fout2 = scratch_out[2];
914 *Fout2 = scratch_out[3];
/*
 * NOTE(review): sampled fragment of the r2c split step: untangles the
 * half-length complex FFT result (src) into the spectrum of the real
 * input (dst), using the precomputed "super" twiddles.  Elided lines
 * include the signature head and loads of tdc/fpk.
 */
928 ne10_int32_t scaled_flag)
937 NE10_F2I32_FIXDIV (tdc, 2);
/* DC and Nyquist bins are purely real for a real input. */
939 dst[0].r = tdc.r + tdc.i;
940 dst[ncfft].r = tdc.r - tdc.i;
941 dst[ncfft].i = dst[0].i = 0;
943 for (k = 1; k <= ncfft / 2 ; ++k)
/* fpnk = conj(src[ncfft - k]): the mirrored bin. */
946 fpnk.r = src[ncfft - k].r;
947 fpnk.i = - src[ncfft - k].i;
/* Halve both operands before combining to keep the sums in range. */
950 NE10_F2I32_FIXDIV (fpk, 2);
951 NE10_F2I32_FIXDIV (fpnk, 2);
/* f1k = even part, f2k = odd part of the packed spectrum. */
954 f1k.r = fpk.r + fpnk.r;
955 f1k.i = fpk.i + fpnk.i;
957 f2k.r = fpk.r - fpnk.r;
958 f2k.i = fpk.i - fpnk.i;
/* tw = f2k * super_twiddle[k-1].  The (>>32) then (<<1) sequence is an
 * overflow-safe Q31 multiply: each 64-bit product is scaled down one bit
 * further than usual, summed, then doubled back. */
960 tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
961 tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
/* Write bin k and its conjugate-symmetric partner, halved (>> 1);
 * note dst[ncfft-k].i uses (tw.i - f1k.i), i.e. the conjugate. */
963 dst[k].r = (f1k.r + tw.r) >> 1;
964 dst[k].i = (f1k.i + tw.i) >> 1;
965 dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
966 dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
/*
 * NOTE(review): sampled fragment of the c2r merge step — the inverse of
 * the r2c split: recombines a conjugate-symmetric half spectrum (src)
 * into the packed complex sequence (dst) fed to the half-length inverse
 * FFT.  Elided lines include the signature head and the load of fk.
 */
974 ne10_int32_t scaled_flag)
/* Rebuild the packed DC bin from the real DC and Nyquist values. */
981 dst[0].r = src[0].r + src[ncfft].r;
982 dst[0].i = src[0].r - src[ncfft].r;
985 NE10_F2I32_FIXDIV (dst[0], 2);
987 for (k = 1; k <= ncfft / 2; k++)
/* fnkc = conj(src[ncfft - k]): mirrored bin. */
990 fnkc.r = src[ncfft - k].r;
991 fnkc.i = -src[ncfft - k].i;
/* Halve operands before combining to stay in 32-bit range. */
994 NE10_F2I32_FIXDIV (fk, 2);
995 NE10_F2I32_FIXDIV (fnkc, 2);
/* fek = even part; tmp = odd part (rotated below into fok). */
998 fek.r = fk.r + fnkc.r;
999 fek.i = fk.i + fnkc.i;
1001 tmp.r = fk.r - fnkc.r;
1002 tmp.i = fk.i - fnkc.i;
/* fok = tmp * conj(super_twiddle[k-1]); same overflow-safe
 * (>>32)<<1 Q31 multiply as the r2c split. */
1004 fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1005 fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
/* Bin k and its mirror; the mirror takes the conjugate combination. */
1007 dst[k].r = fek.r + fok.r;
1008 dst[k].i = fek.i + fok.i;
1010 dst[ncfft - k].r = fek.r - fok.r;
1011 dst[ncfft - k].i = fok.i - fek.i;
/*
 * NOTE(review): sampled fragment of the c2c plan allocator: computes one
 * block large enough for the state struct, the factors table, the twiddle
 * table and the work buffer (plus alignment slack), carves it up, then
 * factorizes nfft and generates twiddles.  Allocation head and error
 * cleanup are elided.
 */
1031 +
sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)
1034 + NE10_FFT_BYTE_ALIGNMENT;
/* Align the carve pointer, then lay out sub-arrays back to back. */
1040 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
1041 st->factors = (ne10_int32_t*) address;
1043 st->buffer = st->twiddles + nfft;
/* Factorize nfft into the radix-2/4 stage table; NE10_ERR means nfft is
 * not supported (failure path elided). */
1046 ne10_int32_t result = ne10_factor (nfft, st->factors, NE10_FACTOR_DEFAULT);
1047 if (result == NE10_ERR)
1053 ne10_int32_t *factors = st->factors;
/* Precompute the Q31 twiddle table for this factorization. */
1056 ne10_fft_generate_twiddles_int32 (twiddles, factors, nfft);
/*
 * NOTE(review): sampled fragment of the user-facing c2c entry point:
 * dispatches to the specialized radix-2/4 kernels or the generic
 * mixed-radix kernels depending on the algorithm flag stored in the plan,
 * and to forward/inverse per inverse_fft (the if/else lines are elided).
 */
1075 ne10_int32_t inverse_fft,
1076 ne10_int32_t scaled_flag)
1078 ne10_int32_t stage_count = cfg->factors[0];
/* The algorithm flag is stored just past the stage (mstride, radix)
 * pairs in the factors table. */
1079 ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1081 assert ((algorithm_flag == NE10_FFT_ALG_24)
1082 || (algorithm_flag == NE10_FFT_ALG_ANY));
1084 switch (algorithm_flag)
/* Sizes factorable into 2s and 4s use the specialized kernels. */
1086 case NE10_FFT_ALG_24:
1089 ne10_mixed_radix_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1093 ne10_mixed_radix_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
/* All other supported sizes fall back to the generic kernels. */
1096 case NE10_FFT_ALG_ANY:
1099 ne10_mixed_radix_generic_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1103 ne10_mixed_radix_generic_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
/*
 * NOTE(review): sampled fragment of the r2c/c2r plan allocator.  A real
 * FFT of size nfft is computed via a complex FFT of size ncfft = nfft/2,
 * so the plan carves storage for ncfft twiddles plus ncfft/2 "super"
 * twiddles used by the split/merge steps.  Allocation head, struct
 * field setup and twiddle-pointer stepping are elided.
 */
1128 ne10_int32_t ncfft = nfft >> 1;
1131 +
sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)
1135 + NE10_FFT_BYTE_ALIGNMENT;
1142 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
1143 st->factors = (ne10_int32_t*) address;
1145 st->super_twiddles = st->twiddles + ncfft;
1146 st->buffer = st->super_twiddles + (ncfft / 2);
/* Factorize the half-length complex FFT; NE10_ERR = unsupported size. */
1149 ne10_int32_t result = ne10_factor (ncfft, st->factors, NE10_FACTOR_DEFAULT);
1150 if (result == NE10_ERR)
1157 ne10_int32_t *factors = st->factors;
1160 ne10_int32_t stage_count = factors[0];
1161 ne10_int32_t fstride1 = factors[1];
1162 ne10_int32_t fstride2 = fstride1 * 2;
1163 ne10_int32_t fstride3 = fstride1 * 3;
1166 const ne10_float32_t pi = NE10_PI;
1167 ne10_float32_t phase1;
1168 ne10_float32_t phase2;
1169 ne10_float32_t phase3;
/* Per-stage twiddles: three tables of m entries each (for the three
 * non-trivial radix-4 branches), walking stages from last to first.
 * NOTE(review): tw and fstride updates between iterations are elided. */
1171 for (i = stage_count - 1; i > 0; i--)
1176 m = factors[2 * i + 1];
1178 for (j = 0; j < m; j++)
1180 phase1 = -2 * pi * fstride1 * j / ncfft;
1181 phase2 = -2 * pi * fstride2 * j / ncfft;
1182 phase3 = -2 * pi * fstride3 * j / ncfft;
/* floor(0.5 + x) = round-to-nearest when quantizing to Q31. */
1183 tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase1));
1184 tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase1));
1185 (tw + m)->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase2));
1186 (tw + m)->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase2));
1187 (tw + m * 2)->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase3));
1188 (tw + m * 2)->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase3));
/* Super twiddles for the real/complex split: phase carries the extra
 * half-bin (+0.5) rotation. */
1194 tw = st->super_twiddles;
1195 for (i = 0; i < ncfft / 2; i++)
1197 phase1 = -pi * ( (ne10_float32_t) (i + 1) / ncfft + 0.5f);
1198 tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase1));
1199 tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase1));
/*
 * NOTE(review): sampled fragment of the real-to-complex entry point:
 * treats the nfft real samples as ncfft packed complex samples, runs the
 * half-length forward complex FFT into tmpbuf, then splits the result
 * into the real-input spectrum in fout.
 */
1222 ne10_int32_t scaled_flag)
1226 ne10_mixed_radix_butterfly_int32_c (tmpbuf, (
ne10_fft_cpx_int32_t*) fin, cfg->factors, cfg->twiddles, fout, scaled_flag);
1227 ne10_fft_split_r2c_1d_int32 (fout, tmpbuf, cfg->super_twiddles, cfg->ncfft, scaled_flag);
/*
 * NOTE(review): sampled fragment of the complex-to-real entry point —
 * the inverse of r2c: merges the half spectrum into packed complex form
 * (tmpbuf1), then runs the half-length inverse complex FFT, writing the
 * real output through the cast pointer.
 */
1244 ne10_int32_t scaled_flag)
1249 ne10_fft_split_c2r_1d_int32 (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1250 ne10_mixed_radix_butterfly_inverse_int32_c ( (
ne10_fft_cpx_int32_t*) fout, tmpbuf1, cfg->factors, cfg->twiddles, tmpbuf2, scaled_flag);
void ne10_fft_c2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Mixed radix-2/4 complex FFT/IFFT of 32-bit fixed point data.
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the FFT.
void ne10_fft_r2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 FFT (real to complex) of int32 data.
void ne10_fft_c2r_1d_int32_c(ne10_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 IFFT (complex to real) of int32 data.
ne10_fft_r2c_cfg_int32_t ne10_fft_alloc_r2c_int32(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the FFT (r2c/c2r).
Structure for the 32-bit fixed-point FFT functions.