#include "NE10_types.h"
#include "NE10_macros.h"

#define FFT4_FS_START \
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \
    ne10_int16_t tmp_r, tmp_i;

#define FFT4_FS \
    s2_r = Fin[0].r - Fin[2].r; \
    s2_i = Fin[0].i - Fin[2].i; \
    tmp_r = Fin[0].r + Fin[2].r; \
    tmp_i = Fin[0].i + Fin[2].i; \
    s0_r = Fin[1].r + Fin[3].r; \
    s0_i = Fin[1].i + Fin[3].i; \
    s1_r = Fin[1].r - Fin[3].r; \
    s1_i = Fin[1].i - Fin[3].i;

#define FFT4_FS_SCALED \
    s2_r = (Fin[0].r - Fin[2].r) >> 2; \
    s2_i = (Fin[0].i - Fin[2].i) >> 2; \
    tmp_r = (Fin[0].r + Fin[2].r) >> 2; \
    tmp_i = (Fin[0].i + Fin[2].i) >> 2; \
    s0_r = (Fin[1].r + Fin[3].r) >> 2; \
    s0_i = (Fin[1].i + Fin[3].i) >> 2; \
    s1_r = (Fin[1].r - Fin[3].r) >> 2; \
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

#define FFT4_FWD_LS \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r + s1_i; \
    Fout[1].i = s2_i - s1_r; \
    Fout[3].r = s2_r - s1_i; \
    Fout[3].i = s2_i + s1_r;

#define FFT4_INV_LS \
    Fout[2].r = tmp_r - s0_r; \
    Fout[2].i = tmp_i - s0_i; \
    Fout[0].r = tmp_r + s0_r; \
    Fout[0].i = tmp_i + s0_i; \
    Fout[1].r = s2_r - s1_i; \
    Fout[1].i = s2_i + s1_r; \
    Fout[3].r = s2_r + s1_i; \
    Fout[3].i = s2_i - s1_r;

#define FFT8_FS_START \
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i; \
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
    const ne10_int16_t TW_81 = 23169; /* cos(pi/4) = sqrt(2)/2 in Q15 */

#define FFT8_FS \
    s0_r = Fin[0].r + Fin[4].r; \
    s0_i = Fin[0].i + Fin[4].i; \
    s1_r = Fin[0].r - Fin[4].r; \
    s1_i = Fin[0].i - Fin[4].i; \
    s2_r = Fin[1].r + Fin[5].r; \
    s2_i = Fin[1].i + Fin[5].i; \
    s3_r = Fin[1].r - Fin[5].r; \
    s3_i = Fin[1].i - Fin[5].i; \
    s4_r = Fin[2].r + Fin[6].r; \
    s4_i = Fin[2].i + Fin[6].i; \
    s5_r = Fin[2].r - Fin[6].r; \
    s5_i = Fin[2].i - Fin[6].i; \
    s6_r = Fin[3].r + Fin[7].r; \
    s6_i = Fin[3].i + Fin[7].i; \
    s7_r = Fin[3].r - Fin[7].r; \
    s7_i = Fin[3].i - Fin[7].i;

#define FFT8_FS_SCALED \
    s0_r = (Fin[0].r + Fin[4].r) >> 3; \
    s0_i = (Fin[0].i + Fin[4].i) >> 3; \
    s1_r = (Fin[0].r - Fin[4].r) >> 3; \
    s1_i = (Fin[0].i - Fin[4].i) >> 3; \
    s2_r = (Fin[1].r + Fin[5].r) >> 3; \
    s2_i = (Fin[1].i + Fin[5].i) >> 3; \
    s3_r = (Fin[1].r - Fin[5].r) >> 3; \
    s3_i = (Fin[1].i - Fin[5].i) >> 3; \
    s4_r = (Fin[2].r + Fin[6].r) >> 3; \
    s4_i = (Fin[2].i + Fin[6].i) >> 3; \
    s5_r = (Fin[2].r - Fin[6].r) >> 3; \
    s5_i = (Fin[2].i - Fin[6].i) >> 3; \
    s6_r = (Fin[3].r + Fin[7].r) >> 3; \
    s6_i = (Fin[3].i + Fin[7].i) >> 3; \
    s7_r = (Fin[3].r - Fin[7].r) >> 3; \
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

#define FFT8_FWD_LS \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r + t3_i; \
    Fout[2].i = t0_i - t3_r; \
    Fout[6].r = t0_r - t3_i; \
    Fout[6].i = t0_i + t3_r; \
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t0_r = s1_r - s5_i; \
    t0_i = s1_i + s5_r; \
    t1_r = s1_r + s5_i; \
    t1_i = s1_i - s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r + t3_i; \
    Fout[3].i = t0_i - t3_r; \
    Fout[7].r = t0_r - t3_i; \
    Fout[7].i = t0_i + t3_r;

#define FFT8_INV_LS \
    t0_r = s0_r - s4_r; \
    t0_i = s0_i - s4_i; \
    t1_r = s0_r + s4_r; \
    t1_i = s0_i + s4_i; \
    t2_r = s2_r + s6_r; \
    t2_i = s2_i + s6_i; \
    t3_r = s2_r - s6_r; \
    t3_i = s2_i - s6_i; \
    Fout[0].r = t1_r + t2_r; \
    Fout[0].i = t1_i + t2_i; \
    Fout[4].r = t1_r - t2_r; \
    Fout[4].i = t1_i - t2_i; \
    Fout[2].r = t0_r - t3_i; \
    Fout[2].i = t0_i + t3_r; \
    Fout[6].r = t0_r + t3_i; \
    Fout[6].i = t0_i - t3_r; \
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT); \
    t0_r = s1_r + s5_i; \
    t0_i = s1_i - s5_r; \
    t1_r = s1_r - s5_i; \
    t1_i = s1_i + s5_r; \
    t2_r = t4_r - t5_r; \
    t2_i = t4_i - t5_i; \
    t3_r = t4_r + t5_r; \
    t3_i = t4_i + t5_i; \
    Fout[1].r = t1_r + t2_r; \
    Fout[1].i = t1_i + t2_i; \
    Fout[5].r = t1_r - t2_r; \
    Fout[5].i = t1_i - t2_i; \
    Fout[3].r = t0_r - t3_i; \
    Fout[3].i = t0_i + t3_r; \
    Fout[7].r = t0_r + t3_i; \
    Fout[7].i = t0_i - t3_r;

#define RADIX8x4_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    const ne10_int16_t TW_81 = 23169; \
    const ne10_int16_t TW_81N = -23169; \
    int16_t *p_src, *p_dst; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3, d2_in4, d2_in5, d2_in6, d2_in7; \
    int16x4_t d_sin0_r, d_sin0_i, d_sin1_r, d_sin1_i, d_sin2_r, d_sin2_i, d_sin3_r, d_sin3_i; \
    int16x4_t d_sin4_r, d_sin4_i, d_sin5_r, d_sin5_i, d_sin6_r, d_sin6_i, d_sin7_r, d_sin7_i; \
    int16x4_t d_s3_r, d_s3_i, d_s5_r, d_s5_i, d_s7_r, d_s7_i; \
    int16x4_t d_s8_r, d_s8_i, d_s9_r, d_s9_i, d_s10_r, d_s10_i, d_s11_r, d_s11_i; \
    int16x4_t d_s12_r, d_s12_i, d_s13_r, d_s13_i, d_s14_r, d_s14_i, d_s15_r, d_s15_i; \
    int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
    int16x4_t d_out4_r, d_out4_i, d_out5_r, d_out5_i, d_out6_r, d_out6_i, d_out7_r, d_out7_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3, d2_out4, d2_out5, d2_out6, d2_out7; \
    int16x8x2_t q2_tmp0, q2_tmp1, q2_tmp2, q2_tmp3; \
    int32x4x2_t q2_tmp4, q2_tmp5, q2_tmp6, q2_tmp7; \
    int16x4_t d_tw_81, d_tw_81n; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout;

#define RADIX8x4_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in4 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in6 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in5 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in7 = vld2_s16 (p_src); \
    p_src += src_step;

#define RADIX8x4_STORE \
    q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
    q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
    q2_tmp2 = vtrnq_s16 (vcombine_s16(d_out4_r, d_out4_i), vcombine_s16(d_out5_r, d_out5_i)); \
    q2_tmp3 = vtrnq_s16 (vcombine_s16(d_out6_r, d_out6_i), vcombine_s16(d_out7_r, d_out7_i)); \
    q2_tmp4 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
    q2_tmp5 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
    q2_tmp6 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[0]), vreinterpretq_s32_s16(q2_tmp3.val[0])); \
    q2_tmp7 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp2.val[1]), vreinterpretq_s32_s16(q2_tmp3.val[1])); \
    d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
    d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[0])); \
    d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
    d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[0])); \
    d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
    d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[0])); \
    d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
    d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[0])); \
    d2_out4.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
    d2_out4.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp4.val[1])); \
    d2_out5.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
    d2_out5.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp6.val[1])); \
    d2_out6.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
    d2_out6.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp5.val[1])); \
    d2_out7.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
    d2_out7.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp7.val[1])); \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out4); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out5); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out6); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out7); \
    p_dst += 8; \
    p_src = p_src - src_step * 8 + 8;

#define RADIX8x4_FS_S0 \
    d_sin0_r = vadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin0_i = vadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin1_r = vsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin1_i = vsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin2_r = vadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin2_i = vadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin3_r = vsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin3_i = vsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin4_r = vadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin4_i = vadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin5_r = vsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin5_i = vsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin6_r = vadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin6_i = vadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
    d_sin7_r = vsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin7_i = vsub_s16 (d2_in6.val[1], d2_in7.val[1]);

#define RADIX8x4_FWD_S357 \
    d_tw_81 = vdup_n_s16 (TW_81); \
    d_tw_81n = vdup_n_s16 (TW_81N); \
    d_s5_r = d_sin5_i; \
    d_s5_i = vneg_s16 (d_sin5_r); \
    d_s3_r = vadd_s16 (d_sin3_r, d_sin3_i); \
    d_s3_i = vsub_s16 (d_sin3_i, d_sin3_r); \
    d_s7_r = vsub_s16 (d_sin7_r, d_sin7_i); \
    d_s7_i = vadd_s16 (d_sin7_i, d_sin7_r); \
    d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
    d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
    d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
    d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);

#define RADIX8x4_INV_S357 \
    d_tw_81 = vdup_n_s16 (TW_81); \
    d_tw_81n = vdup_n_s16 (TW_81N); \
    d_s5_r = vneg_s16 (d_sin5_i); \
    d_s5_i = d_sin5_r; \
    d_s3_r = vsub_s16 (d_sin3_r, d_sin3_i); \
    d_s3_i = vadd_s16 (d_sin3_i, d_sin3_r); \
    d_s7_r = vadd_s16 (d_sin7_r, d_sin7_i); \
    d_s7_i = vsub_s16 (d_sin7_i, d_sin7_r); \
    d_s3_r = vqdmulh_s16 (d_s3_r, d_tw_81); \
    d_s3_i = vqdmulh_s16 (d_s3_i, d_tw_81); \
    d_s7_r = vqdmulh_s16 (d_s7_r, d_tw_81n); \
    d_s7_i = vqdmulh_s16 (d_s7_i, d_tw_81n);

#define RADIX8x4_LS_02 \
    d_s8_r = vadd_s16 (d_sin0_r, d_sin4_r); \
    d_s8_i = vadd_s16 (d_sin0_i, d_sin4_i); \
    d_s9_r = vadd_s16 (d_sin1_r, d_s5_r); \
    d_s9_i = vadd_s16 (d_sin1_i, d_s5_i); \
    d_s10_r = vsub_s16 (d_sin0_r, d_sin4_r); \
    d_s10_i = vsub_s16 (d_sin0_i, d_sin4_i); \
    d_s11_r = vsub_s16 (d_sin1_r, d_s5_r); \
    d_s11_i = vsub_s16 (d_sin1_i, d_s5_i); \
    d_s12_r = vadd_s16 (d_sin2_r, d_sin6_r); \
    d_s12_i = vadd_s16 (d_sin2_i, d_sin6_i); \
    d_s13_r = vadd_s16 (d_s3_r, d_s7_r); \
    d_s13_i = vadd_s16 (d_s3_i, d_s7_i); \
    d_s14_r = vsub_s16 (d_sin2_r, d_sin6_r); \
    d_s14_i = vsub_s16 (d_sin2_i, d_sin6_i); \
    d_s15_r = vsub_s16 (d_s3_r, d_s7_r); \
    d_s15_i = vsub_s16 (d_s3_i, d_s7_i); \
    d_out4_r = vsub_s16 (d_s8_r, d_s12_r); \
    d_out4_i = vsub_s16 (d_s8_i, d_s12_i); \
    d_out5_r = vsub_s16 (d_s9_r, d_s13_r); \
    d_out5_i = vsub_s16 (d_s9_i, d_s13_i); \
    d_out0_r = vadd_s16 (d_s8_r, d_s12_r); \
    d_out0_i = vadd_s16 (d_s8_i, d_s12_i); \
    d_out1_r = vadd_s16 (d_s9_r, d_s13_r); \
    d_out1_i = vadd_s16 (d_s9_i, d_s13_i);

#define RADIX8x4_FS_S0_SCALED \
    d_sin0_r = vhadd_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin0_i = vhadd_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin1_r = vhsub_s16 (d2_in0.val[0], d2_in1.val[0]); \
    d_sin1_i = vhsub_s16 (d2_in0.val[1], d2_in1.val[1]); \
    d_sin2_r = vhadd_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin2_i = vhadd_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin3_r = vhsub_s16 (d2_in2.val[0], d2_in3.val[0]); \
    d_sin3_i = vhsub_s16 (d2_in2.val[1], d2_in3.val[1]); \
    d_sin4_r = vhadd_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin4_i = vhadd_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin5_r = vhsub_s16 (d2_in4.val[0], d2_in5.val[0]); \
    d_sin5_i = vhsub_s16 (d2_in4.val[1], d2_in5.val[1]); \
    d_sin6_r = vhadd_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin6_i = vhadd_s16 (d2_in6.val[1], d2_in7.val[1]); \
    d_sin7_r = vhsub_s16 (d2_in6.val[0], d2_in7.val[0]); \
    d_sin7_i = vhsub_s16 (d2_in6.val[1], d2_in7.val[1]);

#define RADIX8x4_LS_02_SCALED \
    d_s8_r = vhadd_s16 (d_sin0_r, d_sin4_r); \
    d_s8_i = vhadd_s16 (d_sin0_i, d_sin4_i); \
    d_s9_r = vhadd_s16 (d_sin1_r, d_s5_r); \
    d_s9_i = vhadd_s16 (d_sin1_i, d_s5_i); \
    d_s10_r = vhsub_s16 (d_sin0_r, d_sin4_r); \
    d_s10_i = vhsub_s16 (d_sin0_i, d_sin4_i); \
    d_s11_r = vhsub_s16 (d_sin1_r, d_s5_r); \
    d_s11_i = vhsub_s16 (d_sin1_i, d_s5_i); \
    d_s12_r = vhadd_s16 (d_sin2_r, d_sin6_r); \
    d_s12_i = vhadd_s16 (d_sin2_i, d_sin6_i); \
    d_s13_r = vhadd_s16 (d_s3_r, d_s7_r); \
    d_s13_i = vhadd_s16 (d_s3_i, d_s7_i); \
    d_s14_r = vhsub_s16 (d_sin2_r, d_sin6_r); \
    d_s14_i = vhsub_s16 (d_sin2_i, d_sin6_i); \
    d_s15_r = vhsub_s16 (d_s3_r, d_s7_r); \
    d_s15_i = vhsub_s16 (d_s3_i, d_s7_i); \
    d_out4_r = vhsub_s16 (d_s8_r, d_s12_r); \
    d_out4_i = vhsub_s16 (d_s8_i, d_s12_i); \
    d_out5_r = vhsub_s16 (d_s9_r, d_s13_r); \
    d_out5_i = vhsub_s16 (d_s9_i, d_s13_i); \
    d_out0_r = vhadd_s16 (d_s8_r, d_s12_r); \
    d_out0_i = vhadd_s16 (d_s8_i, d_s12_i); \
    d_out1_r = vhadd_s16 (d_s9_r, d_s13_r); \
    d_out1_i = vhadd_s16 (d_s9_i, d_s13_i);

static inline void ne10_radix8x4_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD

        RADIX8x4_FS_S0

        RADIX8x4_FWD_S357

        RADIX8x4_LS_02

        d_out2_r = vadd_s16 (d_s10_r, d_s14_i);
        d_out2_i = vsub_s16 (d_s10_i, d_s14_r);
        d_out3_r = vadd_s16 (d_s11_r, d_s15_i);
        d_out3_i = vsub_s16 (d_s11_i, d_s15_r);
        d_out6_r = vsub_s16 (d_s10_r, d_s14_i);
        d_out6_i = vadd_s16 (d_s10_i, d_s14_r);
        d_out7_r = vsub_s16 (d_s11_r, d_s15_i);
        d_out7_i = vadd_s16 (d_s11_i, d_s15_r);

        RADIX8x4_STORE
    }
}

static inline void ne10_radix8x4_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD

        RADIX8x4_FS_S0

        RADIX8x4_INV_S357

        RADIX8x4_LS_02

        d_out2_r = vsub_s16 (d_s10_r, d_s14_i);
        d_out2_i = vadd_s16 (d_s10_i, d_s14_r);
        d_out3_r = vsub_s16 (d_s11_r, d_s15_i);
        d_out3_i = vadd_s16 (d_s11_i, d_s15_r);
        d_out6_r = vadd_s16 (d_s10_r, d_s14_i);
        d_out6_i = vsub_s16 (d_s10_i, d_s14_r);
        d_out7_r = vadd_s16 (d_s11_r, d_s15_i);
        d_out7_i = vsub_s16 (d_s11_i, d_s15_r);

        RADIX8x4_STORE
    }
}

static inline void ne10_radix8x4_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD

        RADIX8x4_FS_S0_SCALED

        RADIX8x4_FWD_S357

        RADIX8x4_LS_02_SCALED

        d_out2_r = vhadd_s16 (d_s10_r, d_s14_i);
        d_out2_i = vhsub_s16 (d_s10_i, d_s14_r);
        d_out3_r = vhadd_s16 (d_s11_r, d_s15_i);
        d_out3_i = vhsub_s16 (d_s11_i, d_s15_r);
        d_out6_r = vhsub_s16 (d_s10_r, d_s14_i);
        d_out6_i = vhadd_s16 (d_s10_i, d_s14_r);
        d_out7_r = vhsub_s16 (d_s11_r, d_s15_i);
        d_out7_i = vhadd_s16 (d_s11_i, d_s15_r);

        RADIX8x4_STORE
    }
}

static inline void ne10_radix8x4_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX8x4_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX8x4_LOAD

        RADIX8x4_FS_S0_SCALED

        RADIX8x4_INV_S357

        RADIX8x4_LS_02_SCALED

        d_out2_r = vhsub_s16 (d_s10_r, d_s14_i);
        d_out2_i = vhadd_s16 (d_s10_i, d_s14_r);
        d_out3_r = vhsub_s16 (d_s11_r, d_s15_i);
        d_out3_i = vhadd_s16 (d_s11_i, d_s15_r);
        d_out6_r = vhadd_s16 (d_s10_r, d_s14_i);
        d_out6_i = vhsub_s16 (d_s10_i, d_s14_r);
        d_out7_r = vhadd_s16 (d_s11_r, d_s15_i);
        d_out7_i = vhsub_s16 (d_s11_i, d_s15_r);

        RADIX8x4_STORE
    }
}
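
/*
 * Descriptive note on the RADIX8x4 kernels above and the RADIX4x4 kernels below:
 * each butterfly is computed for four neighbouring columns at once. vld2_s16
 * de-interleaves the {r, i} pairs, so every lane of an int16x4_t holds the same
 * butterfly term from one of four adjacent columns, and each NEON instruction
 * advances four butterflies. The vtrnq_s16/vtrnq_s32 blocks in the *_STORE macros
 * transpose the results so that each column's outputs become contiguous before
 * vst2_s16 re-interleaves them into ne10_fft_cpx_int16_t order.
 */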

#define RADIX4x4_WITHOUT_TW_START \
    ne10_int32_t f_count; \
    ne10_int32_t src_step = stride << 1; \
    int16_t *p_src, *p_dst; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
    int16x4_t d_s0_r, d_s0_i, d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
    int16x4_t d_out0_r, d_out0_i, d_out1_r, d_out1_i, d_out2_r, d_out2_i, d_out3_r, d_out3_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
    int16x8x2_t q2_tmp0, q2_tmp1; \
    int32x4x2_t q2_tmp2, q2_tmp3; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout;

#define RADIX4x4_WITHOUT_TW_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step;

#define RADIX4x4_WITHOUT_TW_STORE \
    q2_tmp0 = vtrnq_s16 (vcombine_s16(d_out0_r, d_out0_i), vcombine_s16(d_out1_r, d_out1_i)); \
    q2_tmp1 = vtrnq_s16 (vcombine_s16(d_out2_r, d_out2_i), vcombine_s16(d_out3_r, d_out3_i)); \
    q2_tmp2 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[0]), vreinterpretq_s32_s16(q2_tmp1.val[0])); \
    q2_tmp3 = vtrnq_s32 (vreinterpretq_s32_s16(q2_tmp0.val[1]), vreinterpretq_s32_s16(q2_tmp1.val[1])); \
    d2_out0.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
    d2_out0.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[0])); \
    d2_out1.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
    d2_out1.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[0])); \
    d2_out2.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
    d2_out2.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp2.val[1])); \
    d2_out3.val[0] = vget_low_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
    d2_out3.val[1] = vget_high_s16 (vreinterpretq_s16_s32(q2_tmp3.val[1])); \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += 8; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += 8; \
    p_src = p_src - src_step * 4 + 8;

#define RADIX4x4_WITHOUT_TW_S0 \
    d_s0_r = vadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s0_i = vadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s1_r = vsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s1_i = vsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s2_r = vadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s2_i = vadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_s3_r = vsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s3_i = vsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_out2_r = vsub_s16 (d_s0_r, d_s2_r); \
    d_out2_i = vsub_s16 (d_s0_i, d_s2_i); \
    d_out0_r = vadd_s16 (d_s0_r, d_s2_r); \
    d_out0_i = vadd_s16 (d_s0_i, d_s2_i);

#define RADIX4x4_WITHOUT_TW_S0_SCALED \
    d_s0_r = vhadd_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s0_i = vhadd_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s1_r = vhsub_s16 (d2_in0.val[0], d2_in2.val[0]); \
    d_s1_i = vhsub_s16 (d2_in0.val[1], d2_in2.val[1]); \
    d_s2_r = vhadd_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s2_i = vhadd_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_s3_r = vhsub_s16 (d2_in1.val[0], d2_in3.val[0]); \
    d_s3_i = vhsub_s16 (d2_in1.val[1], d2_in3.val[1]); \
    d_out2_r = vhsub_s16 (d_s0_r, d_s2_r); \
    d_out2_i = vhsub_s16 (d_s0_i, d_s2_i); \
    d_out0_r = vhadd_s16 (d_s0_r, d_s2_r); \
    d_out0_i = vhadd_s16 (d_s0_i, d_s2_i);

static inline void ne10_radix4x4_without_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD

        RADIX4x4_WITHOUT_TW_S0

        d_out1_r = vadd_s16 (d_s1_r, d_s3_i);
        d_out1_i = vsub_s16 (d_s1_i, d_s3_r);
        d_out3_r = vsub_s16 (d_s1_r, d_s3_i);
        d_out3_i = vadd_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}

static inline void ne10_radix4x4_without_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD

        RADIX4x4_WITHOUT_TW_S0

        d_out1_r = vsub_s16 (d_s1_r, d_s3_i);
        d_out1_i = vadd_s16 (d_s1_i, d_s3_r);
        d_out3_r = vadd_s16 (d_s1_r, d_s3_i);
        d_out3_i = vsub_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}

static inline void ne10_radix4x4_without_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD

        RADIX4x4_WITHOUT_TW_S0_SCALED

        d_out1_r = vhadd_s16 (d_s1_r, d_s3_i);
        d_out1_i = vhsub_s16 (d_s1_i, d_s3_r);
        d_out3_r = vhsub_s16 (d_s1_r, d_s3_i);
        d_out3_i = vhadd_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}

static inline void ne10_radix4x4_without_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_int32_t stride)
{
    RADIX4x4_WITHOUT_TW_START

    for (f_count = 0; f_count < stride; f_count += 4)
    {
        RADIX4x4_WITHOUT_TW_LOAD

        RADIX4x4_WITHOUT_TW_S0_SCALED

        d_out1_r = vhsub_s16 (d_s1_r, d_s3_i);
        d_out1_i = vhadd_s16 (d_s1_i, d_s3_r);
        d_out3_r = vhadd_s16 (d_s1_r, d_s3_i);
        d_out3_i = vhsub_s16 (d_s1_i, d_s3_r);

        RADIX4x4_WITHOUT_TW_STORE
    }
}

#define RADIX4x4_WITH_TW_START \
    ne10_int32_t m_count; \
    ne10_int32_t src_step = src_stride << 1; \
    ne10_int32_t dst_step = dst_stride << 1; \
    ne10_int32_t tw_step = mstride << 1; \
    int16_t *p_src, *p_dst, *p_tw; \
    int16x4x2_t d2_in0, d2_in1, d2_in2, d2_in3; \
    int16x4x2_t d2_tw0, d2_tw1, d2_tw2; \
    int16x4_t d_s1_r, d_s1_i, d_s2_r, d_s2_i, d_s3_r, d_s3_i; \
    int16x4_t d_tmp0, d_tmp1, d_tmp2, d_tmp3, d_tmp4, d_tmp5; \
    int16x4_t d_s4_r, d_s4_i, d_s5_r, d_s5_i, d_s6_r, d_s6_i, d_s7_r, d_s7_i; \
    int16x4x2_t d2_out0, d2_out1, d2_out2, d2_out3; \
    p_src = (int16_t *) Fin; \
    p_dst = (int16_t *) Fout; \
    p_tw = (int16_t *) tw;

#define RADIX4x4_WITH_TW_LOAD \
    d2_in0 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in1 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in2 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_in3 = vld2_s16 (p_src); \
    p_src += src_step; \
    d2_tw0 = vld2_s16 (p_tw); \
    p_tw += tw_step; \
    d2_tw1 = vld2_s16 (p_tw); \
    p_tw += tw_step; \
    d2_tw2 = vld2_s16 (p_tw); \
    d_s1_r = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[0]); \
    d_s1_i = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[0]); \
    d_s2_r = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[0]); \
    d_s2_i = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[0]); \
    d_s3_r = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[0]); \
    d_s3_i = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[0]); \
    d_tmp0 = vqdmulh_s16 (d2_in1.val[1], d2_tw0.val[1]); \
    d_tmp1 = vqdmulh_s16 (d2_in1.val[0], d2_tw0.val[1]); \
    d_tmp2 = vqdmulh_s16 (d2_in2.val[1], d2_tw1.val[1]); \
    d_tmp3 = vqdmulh_s16 (d2_in2.val[0], d2_tw1.val[1]); \
    d_tmp4 = vqdmulh_s16 (d2_in3.val[1], d2_tw2.val[1]); \
    d_tmp5 = vqdmulh_s16 (d2_in3.val[0], d2_tw2.val[1]);

#define RADIX4x4_WITH_TW_STORE \
    vst2_s16 (p_dst, d2_out0); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out1); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out2); \
    p_dst += dst_step; \
    vst2_s16 (p_dst, d2_out3); \
    p_dst += dst_step; \
    p_src = p_src - src_step * 4 + 8; \
    p_dst = p_dst - dst_step * 4 + 8; \
    p_tw = p_tw - tw_step * 2 + 8;

#define RADIX4x4_WITH_TW_S1_FWD \
    d_s1_r = vsub_s16 (d_s1_r, d_tmp0); \
    d_s1_i = vadd_s16 (d_s1_i, d_tmp1); \
    d_s2_r = vsub_s16 (d_s2_r, d_tmp2); \
    d_s2_i = vadd_s16 (d_s2_i, d_tmp3); \
    d_s3_r = vsub_s16 (d_s3_r, d_tmp4); \
    d_s3_i = vadd_s16 (d_s3_i, d_tmp5);

#define RADIX4x4_WITH_TW_S1_INV \
    d_s1_r = vadd_s16 (d_s1_r, d_tmp0); \
    d_s1_i = vsub_s16 (d_s1_i, d_tmp1); \
    d_s2_r = vadd_s16 (d_s2_r, d_tmp2); \
    d_s2_i = vsub_s16 (d_s2_i, d_tmp3); \
    d_s3_r = vadd_s16 (d_s3_r, d_tmp4); \
    d_s3_i = vsub_s16 (d_s3_i, d_tmp5);

#define RADIX4x4_WITH_TW_LS_02 \
    d_s4_r = vadd_s16 (d2_in0.val[0], d_s2_r); \
    d_s4_i = vadd_s16 (d2_in0.val[1], d_s2_i); \
    d_s5_r = vsub_s16 (d2_in0.val[0], d_s2_r); \
    d_s5_i = vsub_s16 (d2_in0.val[1], d_s2_i); \
    d_s6_r = vadd_s16 (d_s1_r, d_s3_r); \
    d_s6_i = vadd_s16 (d_s1_i, d_s3_i); \
    d_s7_r = vsub_s16 (d_s1_r, d_s3_r); \
    d_s7_i = vsub_s16 (d_s1_i, d_s3_i); \
    d2_out2.val[0] = vsub_s16 (d_s4_r, d_s6_r); \
    d2_out2.val[1] = vsub_s16 (d_s4_i, d_s6_i); \
    d2_out0.val[0] = vadd_s16 (d_s4_r, d_s6_r); \
    d2_out0.val[1] = vadd_s16 (d_s4_i, d_s6_i);

#define RADIX4x4_WITH_TW_LS_02_SCALED \
    d_s4_r = vhadd_s16 (d2_in0.val[0], d_s2_r); \
    d_s4_i = vhadd_s16 (d2_in0.val[1], d_s2_i); \
    d_s5_r = vhsub_s16 (d2_in0.val[0], d_s2_r); \
    d_s5_i = vhsub_s16 (d2_in0.val[1], d_s2_i); \
    d_s6_r = vhadd_s16 (d_s1_r, d_s3_r); \
    d_s6_i = vhadd_s16 (d_s1_i, d_s3_i); \
    d_s7_r = vhsub_s16 (d_s1_r, d_s3_r); \
    d_s7_i = vhsub_s16 (d_s1_i, d_s3_i); \
    d2_out2.val[0] = vhsub_s16 (d_s4_r, d_s6_r); \
    d2_out2.val[1] = vhsub_s16 (d_s4_i, d_s6_i); \
    d2_out0.val[0] = vhadd_s16 (d_s4_r, d_s6_r); \
    d2_out0.val[1] = vhadd_s16 (d_s4_i, d_s6_i);

static inline void ne10_radix4x4_with_twiddles_forward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD

        RADIX4x4_WITH_TW_LS_02

        d2_out1.val[0] = vadd_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vsub_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vsub_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vadd_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}

static inline void ne10_radix4x4_with_twiddles_backward_unscaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV

        RADIX4x4_WITH_TW_LS_02

        d2_out1.val[0] = vsub_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vadd_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vadd_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vsub_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}

static inline void ne10_radix4x4_with_twiddles_forward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_FWD

        RADIX4x4_WITH_TW_LS_02_SCALED

        d2_out1.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vhsub_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vhadd_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}

static inline void ne10_radix4x4_with_twiddles_backward_scaled_neon (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin,
        ne10_fft_cpx_int16_t * tw,
        ne10_int32_t src_stride,
        ne10_int32_t dst_stride,
        ne10_int32_t mstride)
{
    RADIX4x4_WITH_TW_START

    for (m_count = 0; m_count < mstride; m_count += 4)
    {
        RADIX4x4_WITH_TW_LOAD
        RADIX4x4_WITH_TW_S1_INV

        RADIX4x4_WITH_TW_LS_02_SCALED

        d2_out1.val[0] = vhsub_s16 (d_s5_r, d_s7_i);
        d2_out1.val[1] = vhadd_s16 (d_s5_i, d_s7_r);
        d2_out3.val[0] = vhadd_s16 (d_s5_r, d_s7_i);
        d2_out3.val[1] = vhsub_s16 (d_s5_i, d_s7_r);

        RADIX4x4_WITH_TW_STORE
    }
}

#define ne10_mixed_radix_fft_forward_int16_neon(scaled) \
void ne10_mixed_radix_fft_forward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
        ne10_fft_cpx_int16_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int16_t * twiddles, \
        ne10_fft_cpx_int16_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int16_t *Fin1, *Fout1; \
    ne10_fft_cpx_int16_t *Fout_ls = Fout; \
    ne10_fft_cpx_int16_t *Ftmp; \
    ne10_fft_cpx_int16_t *tw, *tw1; \
    /* read the plan from the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    /* first stage */ \
    Fin1 = Fin; \
    Fout1 = Fout; \
    if (N == 8) /* leading radix 8: length is 2 * 4^n */ \
    { \
        N = fstride << 1; \
        tw = twiddles; \
        fstride1 = fstride >> 2; \
        ne10_radix8x4_forward_##scaled##_neon (Fout, Fin, fstride1); \
        tw += 6; \
        mstride <<= 2; \
        fstride >>= 4; \
        stage_count -= 2; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    else if (N == 4) /* leading radix 4: length is 4^n */ \
    { \
        N = fstride; \
        tw = twiddles; \
        ne10_radix4x4_without_twiddles_forward_##scaled##_neon (Fout, Fin, fstride); \
        mstride <<= 2; \
        fstride >>= 2; \
        stage_count--; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    /* intermediate stages: radix-4 with twiddles, ping-ponging between Fout and buffer */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        Fin1 = buffer; \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            tw1 = tw; \
            ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
            Fin1 += mstride; \
        } \
        tw += mstride * 3; \
        mstride <<= 2; \
        fstride >>= 2; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    /* last stage: writes to the caller-visible destination */ \
    Fin1 = buffer; \
    Fout1 = Fout_ls; \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        tw1 = tw; \
        ne10_radix4x4_with_twiddles_forward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
        Fin1 += mstride; \
        Fout1 += mstride; \
    } \
}

#define ne10_mixed_radix_fft_backward_int16_neon(scaled) \
void ne10_mixed_radix_fft_backward_int16_##scaled##_neon (ne10_fft_cpx_int16_t * Fout, \
        ne10_fft_cpx_int16_t * Fin, \
        ne10_int32_t * factors, \
        ne10_fft_cpx_int16_t * twiddles, \
        ne10_fft_cpx_int16_t * buffer) \
{ \
    ne10_int32_t fstride, mstride, N; \
    ne10_int32_t fstride1; \
    ne10_int32_t f_count; \
    ne10_int32_t stage_count; \
    ne10_fft_cpx_int16_t *Fin1, *Fout1; \
    ne10_fft_cpx_int16_t *Fout_ls = Fout; \
    ne10_fft_cpx_int16_t *Ftmp; \
    ne10_fft_cpx_int16_t *tw, *tw1; \
    /* read the plan from the factor table */ \
    stage_count = factors[0]; \
    fstride = factors[1]; \
    mstride = factors[ (stage_count << 1) - 1 ]; \
    N = factors[ stage_count << 1 ]; \
    /* first stage */ \
    Fin1 = Fin; \
    Fout1 = Fout; \
    if (N == 8) /* leading radix 8: length is 2 * 4^n */ \
    { \
        N = fstride << 1; \
        tw = twiddles; \
        fstride1 = fstride >> 2; \
        ne10_radix8x4_backward_##scaled##_neon (Fout, Fin, fstride1); \
        tw += 6; \
        mstride <<= 2; \
        fstride >>= 4; \
        stage_count -= 2; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    else if (N == 4) /* leading radix 4: length is 4^n */ \
    { \
        N = fstride; \
        tw = twiddles; \
        ne10_radix4x4_without_twiddles_backward_##scaled##_neon (Fout, Fin, fstride); \
        mstride <<= 2; \
        fstride >>= 2; \
        stage_count--; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    /* intermediate stages: radix-4 with twiddles, ping-ponging between Fout and buffer */ \
    for (; stage_count > 1 ; stage_count--) \
    { \
        Fin1 = buffer; \
        for (f_count = 0; f_count < fstride; f_count ++) \
        { \
            Fout1 = & Fout[ f_count * mstride << 2 ]; \
            tw1 = tw; \
            ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, mstride, mstride); \
            Fin1 += mstride; \
        } \
        tw += mstride * 3; \
        mstride <<= 2; \
        fstride >>= 2; \
        Ftmp = buffer; \
        buffer = Fout; \
        Fout = Ftmp; \
    } \
    /* last stage: writes to the caller-visible destination */ \
    Fin1 = buffer; \
    Fout1 = Fout_ls; \
    for (f_count = 0; f_count < fstride; f_count ++) \
    { \
        tw1 = tw; \
        ne10_radix4x4_with_twiddles_backward_##scaled##_neon (Fout1, Fin1, tw1, N, N, mstride); \
        Fin1 += mstride; \
        Fout1 += mstride; \
    } \
}

ne10_mixed_radix_fft_forward_int16_neon (unscaled)
ne10_mixed_radix_fft_forward_int16_neon (scaled)
ne10_mixed_radix_fft_backward_int16_neon (unscaled)
ne10_mixed_radix_fft_backward_int16_neon (scaled)
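
/*
 * Descriptive note: the four drivers generated above (forward/backward x
 * unscaled/scaled) read their plan from factors[]: factors[0] is the stage count,
 * factors[1] the stride of the first stage, and factors[2 * stage_count - 1] /
 * factors[2 * stage_count] supply the initial mstride and the leading radix (8 or 4).
 * The authoritative layout of factors[] is produced by Ne10's plan/factor generation
 * code, not in this file.
 */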

static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
    int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fpnk_r, q_fpnk_i;
    int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int16x8_t q_tw_r, q_tw_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (tdc, 2);

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
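
    /*
     * For k = 1 .. ncfft/2 the half-length complex spectrum Z is recombined into the
     * spectrum of the real signal as
     *     F1(k) = Z(k) + conj(Z(ncfft - k))            (even part)
     *     F2(k) = Z(k) - conj(Z(ncfft - k))            (odd part)
     *     dst[k]         = (F1(k) + W(k) * F2(k)) / 2
     *     dst[ncfft - k] = conj(F1(k) - W(k) * F2(k)) / 2
     * with W(k) = twiddles[k - 1]. The NEON loops below and the scalar fallback at the
     * end of this function implement the same arithmetic.
     */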
    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2 ; ++k)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;

            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fpk, 2);
                NE10_F2I16_FIXDIV (fpnk, 2);
            }

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
                                      - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
                                      + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);

            dst[k].r = (f1k.r + tw.r) >> 1;
            dst[k].i = (f1k.i + tw.i) >> 1;
            dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
            dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
        }
    }
}

static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
    int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fnkc_r, q_fnkc_i;
    int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (dst[0], 2);
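
    /*
     * Inverse of the r2c split: for k = 1 .. ncfft/2 the even and odd parts are rebuilt as
     *     Fe(k) = F(k) + conj(F(ncfft - k))
     *     Fo(k) = (F(k) - conj(F(ncfft - k))) * conj(W(k))
     *     dst[k]         = Fe(k) + Fo(k)
     *     dst[ncfft - k] = conj(Fe(k) - Fo(k))
     * with W(k) = twiddles[k - 1]; dst then feeds the half-length complex IFFT that
     * recovers the packed real samples.
     */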
    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw = vld2q_s16 (p_twiddles);
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw = vld2q_s16 (p_twiddles);
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;

            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fk, 2);
                NE10_F2I16_FIXDIV (fnkc, 2);
            }

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
                                       + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
                                       - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);

            dst[k].r = fek.r + fok.r;
            dst[k].i = fek.i + fok.i;

            dst[ncfft - k].r = fek.r - fok.r;
            dst[ncfft - k].i = fok.i - fek.i;
        }
    }
}

void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
        ne10_fft_cpx_int16_t *fin,
        ne10_fft_cfg_int16_t cfg,
        ne10_int32_t inverse_fft,
        ne10_int32_t scaled_flag)
{
    if (scaled_flag)
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int16_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int16_scaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int16_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int16_scaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
    else
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int16_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int16_unscaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int16_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int16_unscaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
}

void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
        ne10_int16_t *fin,
        ne10_fft_r2c_cfg_int16_t cfg,
        ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t *tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t *tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
    ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}

void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
        ne10_fft_cpx_int16_t *fin,
        ne10_fft_r2c_cfg_int16_t cfg,
        ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t *tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t *tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
}
Configuration structure for the 16-bit fixed point FFT functions.
void ne10_fft_c2r_1d_int16_neon(ne10_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 IFFT (complex to real) of int16 data.
void ne10_fft_r2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_int16_t *fin, ne10_fft_r2c_cfg_int16_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 FFT (real to complex) of int16 data.
void ne10_fft_c2c_1d_int16_neon(ne10_fft_cpx_int16_t *fout, ne10_fft_cpx_int16_t *fin, ne10_fft_cfg_int16_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Mixed radix-2/4 complex FFT/IFFT of 16-bit fixed point data.
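
A minimal usage sketch for the complex entry point. It assumes the usual Ne10 plan helpers ne10_fft_alloc_c2c_int16 / ne10_fft_destroy_c2c_int16 and the umbrella header "NE10.h" from a NEON-enabled build; buffer sizes, error handling and sample generation are elided.

#include "NE10.h"

#define EXAMPLE_NFFT 1024

void example_c2c_int16 (void)
{
    /* build factors, twiddles and the internal work buffer for this transform size */
    ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (EXAMPLE_NFFT);
    ne10_fft_cpx_int16_t in[EXAMPLE_NFFT];
    ne10_fft_cpx_int16_t out[EXAMPLE_NFFT];

    /* ... fill in[] with Q15 samples ... */

    /* forward transform, scaled: per-stage halving avoids int16 overflow */
    ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);

    /* inverse transform, scaled */
    ne10_fft_c2c_1d_int16_neon (in, out, cfg, 1, 1);

    ne10_fft_destroy_c2c_int16 (cfg);
}

The scaled_flag trades precision for headroom: the scaled kernels use halving adds (vhadd/vhsub) at every stage, while the unscaled kernels leave the data unscaled and rely on the caller to keep the input small enough not to overflow.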