#ifndef NE10_FFT_NEONINTRINSIC_H
#define NE10_FFT_NEONINTRINSIC_H

#include "NE10_types.h" /* ne10_float32_t */
#include <arm_neon.h>

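/*
 * Complex helpers. A float32x4x2_t holds four complex numbers in planar
 * form: val[0] carries the four real parts, val[1] the four imaginary
 * parts, so one macro invocation processes four values at once.
 */
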
#define NE10_CPX_ADD_NEON_F32(Z,A,B) do { \
    Z.val[0] = A.val[0] + B.val[0]; \
    Z.val[1] = A.val[1] + B.val[1]; \
} while (0)

#define NE10_CPX_SUB_NEON_F32(Z,A,B) do { \
    Z.val[0] = A.val[0] - B.val[0]; \
    Z.val[1] = A.val[1] - B.val[1]; \
} while (0)

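/*
 * Z = A * B, via (ar + ai*i)(br + bi*i) = (ar*br - ai*bi) + (ar*bi + ai*br)*i.
 * The second product of each component is folded into a multiply-subtract
 * (vmlsq_f32) or multiply-accumulate (vmlaq_f32).
 */
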
#define NE10_CPX_MUL_NEON_F32(Z,A,B) do { \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
    Z.val[0] = vmlsq_f32(ARBR, A.val[1], B.val[1]); \
    Z.val[1] = vmlaq_f32(ARBI, A.val[1], B.val[0]); \
} while (0)

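/*
 * Z = A * conj(B): real = ar*br + ai*bi, imag = ai*br - ar*bi. Multiplying
 * by the conjugate twiddle runs the rotation backwards, which is what the
 * inverse (C2R) kernels below need.
 */
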
#define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do { \
    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
    float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \
    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
    float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \
    Z.val[0] = vaddq_f32(ARBR,AIBI); \
    Z.val[1] = vsubq_f32(AIBR,ARBI); \
} while (0)

#define NE10_BUTTERFLY_NEON_F32(O1,O2,I1,I2) do { \
    NE10_CPX_ADD_NEON_F32(O1,I1,I2); \
    NE10_CPX_SUB_NEON_F32(O2,I1,I2); \
} while (0)

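/* NE10_DECLARE_<N> expands to N numbered declarations: NAME0, NAME1, ..., NAME<N-1>. */
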
#define NE10_DECLARE_2(TYPE,NAME)   TYPE NAME ## 0; \
                                    TYPE NAME ## 1;

#define NE10_DECLARE_3(TYPE,NAME)   NE10_DECLARE_2(TYPE,NAME); \
                                    TYPE NAME ## 2;

#define NE10_DECLARE_4(TYPE,NAME)   NE10_DECLARE_3(TYPE,NAME); \
                                    TYPE NAME ## 3;

#define NE10_DECLARE_8(TYPE,NAME)   NE10_DECLARE_4(TYPE,NAME); \
                                    TYPE NAME ## 4; \
                                    TYPE NAME ## 5; \
                                    TYPE NAME ## 6; \
                                    TYPE NAME ## 7;

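/*
 * Reverse the four lanes of a vector, [a,b,c,d] -> [d,c,b,a]:
 * vrev64q_f32 swaps the lanes within each 64-bit half, then the two
 * halves themselves are swapped via vget_high/vget_low + vcombine.
 */
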
#define NE10_REVERSE_FLOAT32X4(VECTOR4F) do { \
    VECTOR4F = vrev64q_f32(VECTOR4F); \
    VECTOR4F = vcombine_f32( vget_high_f32( VECTOR4F ), vget_low_f32( VECTOR4F ) ); \
} while (0)

#define NE10_REVERSE_OUT_FLOAT32X4(VECTOR4F_OUT,VECTOR4F) do { \
    float32x4_t Q_TMP = vrev64q_f32(VECTOR4F); \
    VECTOR4F_OUT = vcombine_f32( vget_high_f32( Q_TMP ), vget_low_f32( Q_TMP ) ); \
} while (0)

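/*
 * In-register transpose of a 4x4 matrix of complex values spread over
 * four float32x4x2_t registers, handling the real and imaginary planes
 * independently: vtrnq_f32 interleaves 2x2 blocks, vcombine_f32 then
 * gathers the low/high halves into the transposed rows.
 */
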
#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT,Q2_IN) do { \
    NE10_DECLARE_4(float32x4x2_t,q2_tmp); \
    q2_tmp0 = vtrnq_f32 (Q2_IN ## 0 .val[0], Q2_IN ## 1 .val[0]); \
    q2_tmp1 = vtrnq_f32 (Q2_IN ## 0 .val[1], Q2_IN ## 1 .val[1]); \
    q2_tmp2 = vtrnq_f32 (Q2_IN ## 2 .val[0], Q2_IN ## 3 .val[0]); \
    q2_tmp3 = vtrnq_f32 (Q2_IN ## 2 .val[1], Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 0 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[0]), vget_low_f32 (q2_tmp2.val[0])); \
    Q2_OUT ## 0 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[0]), vget_low_f32 (q2_tmp3.val[0])); \
    Q2_OUT ## 1 .val[0] = vcombine_f32 (vget_low_f32 (q2_tmp0.val[1]), vget_low_f32 (q2_tmp2.val[1])); \
    Q2_OUT ## 1 .val[1] = vcombine_f32 (vget_low_f32 (q2_tmp1.val[1]), vget_low_f32 (q2_tmp3.val[1])); \
    Q2_OUT ## 2 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0])); \
    Q2_OUT ## 2 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0])); \
    Q2_OUT ## 3 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1])); \
    Q2_OUT ## 3 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1])); \
} while (0)

#define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR } /* brace form: usable in static initialisers, unlike vdupq_n_f32() */

#define CONST_TW_81   0.70710678 /*  cos(pi/4) =  sqrt(2)/2 */
#define CONST_TW_81N -0.70710678 /* -cos(pi/4) */

static const float32x4_t Q_TW_81  = VDUPQ_N_F32(CONST_TW_81);
static const float32x4_t Q_TW_81N = VDUPQ_N_F32(CONST_TW_81N);

#define DIV_TW81   1.4142136f /*  sqrt(2), reverses the sqrt(2)/2 scaling on the C2R path */
#define DIV_TW81N -1.4142136f /* -sqrt(2) */

static const float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32(DIV_TW81);
static const float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);

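/*
 * Radix-8 real-to-complex butterflies, computed for four independent
 * transforms at once (one per vector lane). S1 is the even/odd first
 * stage; the +-sqrt(2)/2 multiplies on outputs 3 and 7 apply this
 * stage's pi/4 twiddle factors for the second stage.
 */
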
#define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
    Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_OUT ## 2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 5); \
    Q_OUT ## 3 = vsubq_f32 (Q_IN ## 1, Q_IN ## 5); \
    Q_OUT ## 4 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_OUT ## 5 = vsubq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_OUT ## 6 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 7 = vsubq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 3 = vmulq_f32 (Q_OUT ## 3, Q_TW_81 ); \
    Q_OUT ## 7 = vmulq_f32 (Q_OUT ## 7, Q_TW_81N); \
} while (0)

#define NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S); \
    Q_S0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4); \
    Q_S1 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6); \
    Q_S2 = vsubq_f32 (Q_IN ## 7, Q_IN ## 3); \
    Q_S3 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7); \
    Q_OUT ## 0 = vaddq_f32 ( Q_S0, Q_S1 ); \
    Q_OUT ## 1 = vaddq_f32 ( Q_IN ## 1, Q_S3 ); \
    Q_OUT ## 2 = vsubq_f32 ( Q_S2, Q_IN ## 5 ); \
    Q_OUT ## 3 = vsubq_f32 ( Q_IN ## 0, Q_IN ## 4 ); \
    Q_OUT ## 4 = vsubq_f32 ( Q_IN ## 6, Q_IN ## 2 ); \
    Q_OUT ## 5 = vsubq_f32 ( Q_IN ## 1, Q_S3 ); \
    Q_OUT ## 6 = vaddq_f32 ( Q_IN ## 5, Q_S2 ); \
    Q_OUT ## 7 = vsubq_f32 ( Q_S0, Q_S1 ); \
} while (0)

#define NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32(Q_IN ## 0, Q_IN ## 7); \
    Q_S_IN1 = vsubq_f32(Q_IN ## 0, Q_IN ## 7); \
    Q_S_IN2 = vaddq_f32(Q_IN ## 1, Q_IN ## 5); \
    Q_S_IN3 = vsubq_f32(Q_IN ## 1, Q_IN ## 5); \
    Q_S_IN4 = vaddq_f32(Q_IN ## 6, Q_IN ## 2); \
    Q_S_IN5 = vsubq_f32(Q_IN ## 6, Q_IN ## 2); \
    Q_S_IN6 = vaddq_f32(Q_IN ## 3, Q_IN ## 3); \
    Q_S_IN7 = vaddq_f32(Q_IN ## 4, Q_IN ## 4); \
    Q_OUT ## 0 = vaddq_f32(Q_S_IN0, Q_S_IN6); \
    Q_OUT ## 1 = vaddq_f32(Q_S_IN2, Q_S_IN2); \
    Q_OUT ## 2 = vsubq_f32(Q_S_IN1, Q_S_IN7); \
    Q_OUT ## 3 = vsubq_f32(Q_S_IN3, Q_S_IN4); \
    Q_OUT ## 4 = vsubq_f32(Q_S_IN0, Q_S_IN6); \
    Q_OUT ## 5 = vaddq_f32(Q_S_IN5, Q_S_IN5); \
    Q_OUT ## 6 = vaddq_f32(Q_S_IN1, Q_S_IN7); \
    Q_OUT ## 7 = vaddq_f32(Q_S_IN4, Q_S_IN3); \
} while (0)

#define NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_IN) do { \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3, DIV_TW81_NEON); \
    Q_IN ## 7 = vmulq_f32(Q_IN ## 7, DIV_TW81N_NEON); \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 4 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 5 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 4, Q_IN ## 5); \
    Q_OUT ## 6 = vsubq_f32(Q_IN ## 4, Q_IN ## 5); \
    Q_OUT ## 3 = vaddq_f32(Q_IN ## 6, Q_IN ## 7); \
    Q_OUT ## 7 = vsubq_f32(Q_IN ## 6, Q_IN ## 7); \
} while (0)

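/*
 * Inverse-kernel scaling. EIGH_NEON and QUAD_NEON are assumed to be
 * float32x4_t constants supplied by the including source file
 * (presumably 1/8 and 1/4 vectors, matching the radix-8 and radix-4
 * normalisation); they are not defined in this header.
 */
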
#define NE10_RADIX8x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, EIGH_NEON); \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, EIGH_NEON); \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, EIGH_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, EIGH_NEON); \
    Q_OUT ## 4 = vmulq_f32( Q_OUT ## 4, EIGH_NEON); \
    Q_OUT ## 5 = vmulq_f32( Q_OUT ## 5, EIGH_NEON); \
    Q_OUT ## 6 = vmulq_f32( Q_OUT ## 6, EIGH_NEON); \
    Q_OUT ## 7 = vmulq_f32( Q_OUT ## 7, EIGH_NEON); \
} while (0)

#define NE10_RADIX4x4_C2R_NEON_KERNEL_SCALE(Q_OUT) do { \
    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, QUAD_NEON); \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, QUAD_NEON); \
    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, QUAD_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, QUAD_NEON); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_SCALE(Q2_OUT) do { \
    Q2_OUT ## 0 .val[0] = vmulq_f32( Q2_OUT ## 0 .val[0], QUAD_NEON); \
    Q2_OUT ## 1 .val[0] = vmulq_f32( Q2_OUT ## 1 .val[0], QUAD_NEON); \
    Q2_OUT ## 2 .val[0] = vmulq_f32( Q2_OUT ## 2 .val[0], QUAD_NEON); \
    Q2_OUT ## 3 .val[0] = vmulq_f32( Q2_OUT ## 3 .val[0], QUAD_NEON); \
    Q2_OUT ## 0 .val[1] = vmulq_f32( Q2_OUT ## 0 .val[1], QUAD_NEON); \
    Q2_OUT ## 1 .val[1] = vmulq_f32( Q2_OUT ## 1 .val[1], QUAD_NEON); \
    Q2_OUT ## 2 .val[1] = vmulq_f32( Q2_OUT ## 2 .val[1], QUAD_NEON); \
    Q2_OUT ## 3 .val[1] = vmulq_f32( Q2_OUT ## 3 .val[1], QUAD_NEON); \
} while (0)

#define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN); \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_S_IN,Q_IN); \
    NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_S_IN); \
} while (0)

#define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 2); \
    Q_S_IN1 = vaddq_f32 (Q_IN ## 1, Q_IN ## 3); \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN1); \
    Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 2); \
    Q_OUT ## 2 = vsubq_f32 (Q_IN ## 3, Q_IN ## 1); \
    Q_OUT ## 3 = vsubq_f32 (Q_S_IN0, Q_S_IN1); \
} while (0)

#define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_4(float32x4_t,Q_S_IN); \
    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 3); \
    Q_S_IN1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 3); \
    Q_S_IN2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 1); \
    Q_S_IN3 = vaddq_f32 (Q_IN ## 2, Q_IN ## 2); \
    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN2); \
    Q_OUT ## 1 = vsubq_f32 (Q_S_IN1, Q_S_IN3); \
    Q_OUT ## 2 = vsubq_f32 (Q_S_IN0, Q_S_IN2); \
    Q_OUT ## 3 = vaddq_f32 (Q_S_IN1, Q_S_IN3); \
} while (0)

#define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do { \
    NE10_DECLARE_8(float32x4_t,Q_S_IN_C2R_KERNEL); \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_S_IN_C2R_KERNEL,Q_IN); \
    NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_S_IN_C2R_KERNEL); \
} while (0)

#define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 4 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 5 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 6 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 7 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
} while (0)

#define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) ); \
    PTR_IN += IN_STEP; \
} while (0)

#define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 4 * OUT_STEP ), Q_OUT ## 4); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 5 * OUT_STEP ), Q_OUT ## 5); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 6 * OUT_STEP ), Q_OUT ## 6); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 7 * OUT_STEP ), Q_OUT ## 7); \
} while (0)

#define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do { \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2); \
    vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3); \
} while (0)

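/*
 * A minimal usage sketch of the load/kernel/store macros (illustrative
 * only; `src`, `dst` and `step` are hypothetical caller-provided values,
 * with the steps counted in units of the pointers' element type; note
 * that the LOAD macro advances `src` as a side effect):
 *
 *     NE10_DECLARE_8 (float32x4_t, q_in);
 *     NE10_DECLARE_8 (float32x4_t, q_out);
 *     NE10_RADIX8x4_R2C_NEON_LOAD (src, q_in, step);
 *     NE10_RADIX8x4_R2C_NEON_KERNEL (q_out, q_in);
 *     NE10_RADIX8x4_R2C_NEON_STORE (dst, q_out, step);
 */
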
#define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
    Q2_OUT ## 0 = Q2_IN ## 0; \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 1, Q2_IN ## 1, Q2_TW ## 0); \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 2, Q2_IN ## 2, Q2_TW ## 1); \
    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 3, Q2_IN ## 3, Q2_TW ## 2); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do { \
    Q2_OUT ## 0 = Q2_IN ## 0; \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 1, Q2_IN ## 1, Q2_TW ## 0); \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 2, Q2_IN ## 2, Q2_TW ## 1); \
    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 3, Q2_IN ## 3, Q2_TW ## 2); \
} while (0)

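/*
 * The TW_MUL helpers above apply a twiddle to inputs 1..3 (input 0 always
 * carries the unit twiddle and passes through); the C2R variant uses the
 * conjugate multiply to reverse the forward rotation. They feed the
 * twiddled radix-4 stages below.
 */
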
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0, Q2_IN ## 0, Q2_IN ## 2); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 1, Q2_IN ## 0, Q2_IN ## 2); \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 2, Q2_IN ## 1, Q2_IN ## 3); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3, Q2_IN ## 1, Q2_IN ## 3); \
} while (0)

#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vaddq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vsubq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 1 .val[1] = vsubq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[1] = vaddq_f32(Q2_IN ## 3 .val[0] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vnegq_f32(Q2_OUT ## 3 .val[1]); \
} while (0)

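/*
 * Last-stage twiddled radix-4 butterflies. The sqrt(2)/2 factor applied
 * to inputs 1 and 3 is this stage's pi/4 twiddle; the C2R counterpart
 * undoes it with the sqrt(2) constants defined above.
 */
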
#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
    float32x4_t Q_TMP; \
    Q_IN ## 1 = vmulq_f32(Q_IN ## 1, Q_TW_81); \
    Q_IN ## 3 = vmulq_f32(Q_IN ## 3, Q_TW_81); \
    Q_TMP = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_IN ## 3 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_IN ## 1 = Q_TMP; \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 2 = vsubq_f32(Q_IN ## 0, Q_IN ## 1); \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 2, Q_IN ## 3); \
    Q_OUT ## 1 = vnegq_f32(Q_OUT ## 1); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do { \
    float32x4_t Q_TMP; \
    Q_IN ## 1 = vnegq_f32(Q_IN ## 1); \
    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 2); \
    Q_OUT ## 1 = vsubq_f32(Q_IN ## 0, Q_IN ## 2); \
    Q_OUT ## 2 = vaddq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_OUT ## 3 = vsubq_f32(Q_IN ## 1, Q_IN ## 3); \
    Q_TMP = vaddq_f32(Q_OUT ## 1, Q_OUT ## 3); \
    Q_OUT ## 3 = vsubq_f32(Q_OUT ## 3, Q_OUT ## 1); \
    Q_OUT ## 1 = Q_TMP; \
    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, DIV_TW81_NEON); \
    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, DIV_TW81_NEON); \
    Q_OUT ## 0 = vaddq_f32( Q_OUT ## 0, Q_OUT ## 0 ); \
    Q_OUT ## 2 = vaddq_f32( Q_OUT ## 2, Q_OUT ## 2 ); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do { \
    Q2_IN ## 3 .val[1] = vnegq_f32(Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 0 .val[1] = vsubq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
    Q2_OUT ## 2 .val[1] = vaddq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
    Q2_OUT ## 1 .val[1] = vaddq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[1]); \
    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 3 .val[1] , Q2_IN ## 1 .val[1]); \
    Q2_OUT ## 3 .val[1] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do { \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0, Q2_IN ## 0, Q2_IN ## 1); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 2, Q2_IN ## 0, Q2_IN ## 1); \
    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 1, Q2_IN ## 2, Q2_IN ## 3); \
    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3, Q2_IN ## 2, Q2_IN ## 3); \
} while (0)

#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
    NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_IN,Q2_OUT); \
    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN); \
} while (0)

#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do { \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN); \
    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_IN,Q2_OUT); \
    NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW); \
} while (0)

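/*
 * Debug helpers: dump the named groups of vectors to stderr. They are
 * compiled only when NE10_VERBOSE is defined and expand to nothing
 * otherwise. Lane access via Q_VECTOR ## n[i] relies on the GCC vector
 * extension.
 */
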
#ifdef NE10_VERBOSE
#include <stdio.h> /* fprintf */

#define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
    fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
    fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
    fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
    fprintf(stderr,"4:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 4[0], Q_VECTOR ## 4[1], Q_VECTOR ## 4[2], Q_VECTOR ## 4[3] ); \
    fprintf(stderr,"5:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 5[0], Q_VECTOR ## 5[1], Q_VECTOR ## 5[2], Q_VECTOR ## 5[3] ); \
    fprintf(stderr,"6:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 6[0], Q_VECTOR ## 6[1], Q_VECTOR ## 6[2], Q_VECTOR ## 6[3] ); \
    fprintf(stderr,"7:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 7[0], Q_VECTOR ## 7[1], Q_VECTOR ## 7[2], Q_VECTOR ## 7[3] ); \
} while (0)

#define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
    fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
    fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
    fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
} while (0)

#define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR) do { \
    fprintf(stderr,"inside %s\n", __FUNCTION__ ); \
    fprintf(stderr, #Q_VECTOR "\n"); \
    fprintf(stderr,"0R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[0][0], Q_VECTOR ## 0 .val[0][1], Q_VECTOR ## 0 .val[0][2], Q_VECTOR ## 0 .val[0][3] ); \
    fprintf(stderr,"1R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[0][0], Q_VECTOR ## 1 .val[0][1], Q_VECTOR ## 1 .val[0][2], Q_VECTOR ## 1 .val[0][3] ); \
    fprintf(stderr,"2R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[0][0], Q_VECTOR ## 2 .val[0][1], Q_VECTOR ## 2 .val[0][2], Q_VECTOR ## 2 .val[0][3] ); \
    fprintf(stderr,"3R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[0][0], Q_VECTOR ## 3 .val[0][1], Q_VECTOR ## 3 .val[0][2], Q_VECTOR ## 3 .val[0][3] ); \
    fprintf(stderr,"0I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[1][0], Q_VECTOR ## 0 .val[1][1], Q_VECTOR ## 0 .val[1][2], Q_VECTOR ## 0 .val[1][3] ); \
    fprintf(stderr,"1I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[1][0], Q_VECTOR ## 1 .val[1][1], Q_VECTOR ## 1 .val[1][2], Q_VECTOR ## 1 .val[1][3] ); \
    fprintf(stderr,"2I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[1][0], Q_VECTOR ## 2 .val[1][1], Q_VECTOR ## 2 .val[1][2], Q_VECTOR ## 2 .val[1][3] ); \
    fprintf(stderr,"3I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[1][0], Q_VECTOR ## 3 .val[1][1], Q_VECTOR ## 3 .val[1][2], Q_VECTOR ## 3 .val[1][3] ); \
} while (0)

#else // NE10_VERBOSE not defined
#define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) ;
#define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) ;
#define NE10_PRINT_Q2x4_VECTOR(Q2_VECTOR) ;
#endif // NE10_VERBOSE

#endif // NE10_FFT_NEONINTRINSIC_H