// from https://gist.github.com/cyanreg/665b9c79cbe51df9296a969257f2a16c static void fft4(FFTComplex *z) { FFTSample r1 = z[0].re - z[4].re; FFTSample r2 = z[0].im - z[4].im; FFTSample r3 = z[1].re - z[5].re; FFTSample r4 = z[1].im - z[5].im; /* r5-r8 second transform */ FFTSample t1 = z[0].re + z[4].re; FFTSample t2 = z[0].im + z[4].im; FFTSample t3 = z[1].re + z[5].re; FFTSample t4 = z[1].im + z[5].im; /* t5-t8 second transform */ /* 1sub + 1add = 2 instructions */ FFTSample a3 = t1 - t3; FFTSample a1 = t1 + t3; FFTSample a4 = t2 - t4; FFTSample a2 = t2 + t4; /* shuf + shuf + addsub = 3 */ FFTSample b3 = r1 - r4; FFTSample b1 = r1 + r4; FFTSample b2 = r2 - r3; FFTSample b4 = r2 + r3; /* shuf + shuf + addsub = 3 */ /* shufps + shufps = 10 */ z[0].re = a1; z[0].im = a2; z[1].re = a3; z[1].im = a4; z[4].re = b1; z[4].im = b2; z[5].re = b3; z[5].im = b4; } // from https://gist.github.com/cyanreg/bbf25c8a8dfb910ed3b9ae7663983ca6 static void fft8(FFTComplex *z) { /* _1_ sub */ FFTSample r1 = z[0].re - z[4].re; FFTSample r2 = z[0].im - z[4].im; // IN LOW m1 FFTSample r3 = z[1].re - z[5].re; FFTSample r4 = z[1].im - z[5].im; FFTSample r5 = z[2].re - z[6].re; FFTSample r6 = z[2].im - z[6].im; // IN HIGH m1 FFTSample r7 = z[3].re - z[7].re; FFTSample r8 = z[3].im - z[7].im; /* _1_ add */ FFTSample q1 = z[0].re + z[4].re; FFTSample q2 = z[0].im + z[4].im; // IN LOW m2 FFTSample q3 = z[1].re + z[5].re; FFTSample q4 = z[1].im + z[5].im; FFTSample q5 = z[2].re + z[6].re; FFTSample q6 = z[2].im + z[6].im; // IN HIGH m2 FFTSample q7 = z[3].re + z[7].re; FFTSample q8 = z[3].im + z[7].im; /* 2 shufps, 1 xor MEM, 1 add = _4_ */ FFTSample s1 = q1 + q3; FFTSample s2 = q2 + q4; FFTSample s3 = q1 - q3; FFTSample s4 = q2 - q4; FFTSample s5 = q5 + q7; FFTSample s6 = q6 + q8; FFTSample s7 = q5 - q7; FFTSample s8 = q6 - q8; /* (1 extract, 1 shuffle = _2_) */ /* additionally 1 insert, 1 xor, 1 add = _3_ to actually perform it */ q1 = s1 + s5; q2 = s2 + s6; q3 = s3 + s8; q4 = s4 - s7; q5 = s1 - s5; q6 = s2 - s6; q7 = s3 - s8; q8 = s4 + s7; z[0].re = q1; z[0].im = q2; z[1].re = q3; z[1].im = q4; z[2].re = q5; z[2].im = q6; z[3].re = q7; z[3].im = q8; /* 11 instructions */ /* 2 shufps + 1 xor + 1 addsub = 4 instr */ FFTSample z1 = r1 + r4; FFTSample z2 = r3 - r2; FFTSample z3 = r3 + r2; FFTSample z4 = r1 - r4; FFTSample z5 = r5 + r6; FFTSample z6 = r7 - r8; FFTSample z7 = r7 + r8; FFTSample z8 = r5 - r6; /* 1 mov, 2 shuffles, 1 addsub = 3 */ FFTSample t5 = z6 + z5; FFTSample t6 = z7 - z8; FFTSample t7 = z7 + z8; FFTSample t8 = z6 - z5; /* 2 extract, 1 shuffle, 1 fma, 1 xors = 5 */ FFTSample u1 = z1 + t5*( M_SQRT1_2); FFTSample u2 = -z2 + t6*( M_SQRT1_2); FFTSample u3 = z4 + t7*(-M_SQRT1_2); FFTSample u4 = z3 + t8*( M_SQRT1_2); FFTSample u5 = z1 + t5*(-M_SQRT1_2); FFTSample u6 = -z2 + t6*(-M_SQRT1_2); FFTSample u7 = z4 + t7*( M_SQRT1_2); FFTSample u8 = z3 + t8*(-M_SQRT1_2); z[4].re = u1; z[4].im = u2; z[5].re = u3; z[5].im = u4; z[6].re = u5; z[6].im = u6; z[7].re = u7; z[7].im = u8; /* 12 instructions = 23 instructions total */ } static void fft16(AVTXContext *ctx, FFTComplex *temp, FFTComplex *z) { FFTSample cos_16_1 = TX_NAME(ff_cos_16)[1]; FFTSample cos_16_3 = TX_NAME(ff_cos_16)[3]; fft8(z); fft4(z+8); fft4(z+10); FFTSample s[32]; /* xorps m1, m1 - free mulps m0 shufps m1, m1, m0 xorps addsub shufps mulps mulps addps or (fma3) shufps shufps mulps mulps fma fma */ s[0] = z[8].re*( 1) - z[8].im*( 0); s[1] = z[8].im*( 1) + z[8].re*( 0); s[2] = z[9].re*( 1) - z[9].im*(-1); s[3] = z[9].im*( 1) + z[9].re*(-1); s[4] = z[10].re*( 1) - z[10].im*( 0); s[5] = z[10].im*( 1) + z[10].re*( 0); s[6] = z[11].re*( 1) - z[11].im*( 1); s[7] = z[11].im*( 1) + z[11].re*( 1); s[8] = z[12].re*( cos_16_1) - z[12].im*( -cos_16_3); s[9] = z[12].im*( cos_16_1) + z[12].re*( -cos_16_3); s[10] = z[13].re*( cos_16_3) - z[13].im*( -cos_16_1); s[11] = z[13].im*( cos_16_3) + z[13].re*( -cos_16_1); s[12] = z[14].re*( cos_16_1) - z[14].im*( cos_16_3); s[13] = z[14].im*( -cos_16_1) + z[14].re*( -cos_16_3); s[14] = z[15].re*( cos_16_3) - z[15].im*( cos_16_1); s[15] = z[15].im*( -cos_16_3) + z[15].re*( -cos_16_1); s[2] *= M_SQRT1_2; s[3] *= M_SQRT1_2; s[5] *= -1; s[6] *= M_SQRT1_2; s[7] *= -M_SQRT1_2; FFTSample w5 = s[0] + s[4]; FFTSample w6 = s[1] - s[5]; FFTSample x5 = s[2] + s[6]; FFTSample x6 = s[3] - s[7]; FFTSample w3 = s[4] - s[0]; FFTSample w4 = s[5] + s[1]; FFTSample x3 = s[6] - s[2]; FFTSample x4 = s[7] + s[3]; FFTSample y5 = s[8] + s[12]; FFTSample y6 = s[9] - s[13]; FFTSample u5 = s[10] + s[14]; FFTSample u6 = s[11] - s[15]; FFTSample y3 = s[12] - s[8]; FFTSample y4 = s[13] + s[9]; FFTSample u3 = s[14] - s[10]; FFTSample u4 = s[15] + s[11]; /* 2xorps, 2vperm2fs, 2 adds, 2 vpermilps = 8 */ FFTSample o1 = z[0].re + w5; FFTSample o2 = z[0].im + w6; FFTSample o5 = z[1].re + x5; FFTSample o6 = z[1].im + x6; FFTSample o9 = z[2].re + w4; //h FFTSample o10 = z[2].im + w3; FFTSample o13 = z[3].re + x4; FFTSample o14 = z[3].im + x3; FFTSample o17 = z[0].re - w5; FFTSample o18 = z[0].im - w6; FFTSample o21 = z[1].re - x5; FFTSample o22 = z[1].im - x6; FFTSample o25 = z[2].re - w4; //h FFTSample o26 = z[2].im - w3; FFTSample o29 = z[3].re - x4; FFTSample o30 = z[3].im - x3; FFTSample o3 = z[4].re + y5; FFTSample o4 = z[4].im + y6; FFTSample o7 = z[5].re + u5; FFTSample o8 = z[5].im + u6; FFTSample o11 = z[6].re + y4; //h FFTSample o12 = z[6].im + y3; FFTSample o15 = z[7].re + u4; FFTSample o16 = z[7].im + u3; FFTSample o19 = z[4].re - y5; FFTSample o20 = z[4].im - y6; FFTSample o23 = z[5].re - u5; FFTSample o24 = z[5].im - u6; FFTSample o27 = z[6].re - y4; //h FFTSample o28 = z[6].im - y3; FFTSample o31 = z[7].re - u4; FFTSample o32 = z[7].im - u3; /* This is just deinterleaving, it's separate */ z[0] = (FFTComplex){ o1, o2 }; z[1] = (FFTComplex){ o3, o4 }; z[2] = (FFTComplex){ o5, o6 }; z[3] = (FFTComplex){ o7, o8 }; z[4] = (FFTComplex){ o9, o10 }; z[5] = (FFTComplex){ o11, o12 }; z[6] = (FFTComplex){ o13, o14 }; z[7] = (FFTComplex){ o15, o16 }; z[8] = (FFTComplex){ o17, o18 }; z[9] = (FFTComplex){ o19, o20 }; z[10] = (FFTComplex){ o21, o22 }; z[11] = (FFTComplex){ o23, o24 }; z[12] = (FFTComplex){ o25, o26 }; z[13] = (FFTComplex){ o27, o28 }; z[14] = (FFTComplex){ o29, o30 }; z[15] = (FFTComplex){ o31, o32 }; } #if 0 %define M_SQRT1_2 0.707106781186547524401 %define COS16_1 0.92387950420379638671875 %define COS16_3 0.3826834261417388916015625 s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2 s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3 s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1 s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2 mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG INIT_YMM avx cglobal fft16, 4, 4, 8, ctx, out, in, stride mov ctxq, [ctxq + AVTXContext.revtab] LOADZ_LUT 0, inq, ctxq + 16*0, strideq, 4 LOADZ_LUT 1, inq, ctxq + 16*1, strideq, 4 LOADZ_LUT 2, inq, ctxq + 16*2, strideq, 4 LOADZ_LUT 3, inq, ctxq + 16*3, strideq, 4 FFT4 m2, m3, m4, m5 FFT8 m0, m1, m6, m7 movaps m6, [mask_mpmppmpm] movaps m7, [s16_perm] xorps m4, m4 ; 0 shufps m4, m4, m2, q2301 ; 0, 0, z[8].im, z[8].re... xorps m4, [mask_mppmmpmp] addps m4, m2, m4 ; s[0..7] mulps m4, [s16_mult_even] ; s[0..7]*costab shufps m5, m3, m3, q2301 ; z[12].im, z[12].re, z[13].im, z[13].re... mulps m3, [s16_mult_odd1] ; z.reim * costab mulps m5, [s16_mult_odd2] ; z.imre * costab addps m5, m3, m5 ; s[8..15] xorps m2, m4, m6 ; s[0..7]*mpmppmpm vperm2f128 m2, m2, q0001 ; s[4..7, 0..3] addps m4, m2 ; w5, w6, x5, x6, w3, w4, x3, x4 vpermilps m4, m7 ; w5, w6, x5, x6, w4, w3, x4, x3 xorps m3, m5, m6 ; s[8..15]*mpmppmpm vperm2f128 m3, m3, q0001 ; s[12..15, 8..11] addps m5, m3 ; y5, y6, u5, u6, y3, y4, u3, u4 vpermilps m5, m7 ; y5, y6, u5, u6, y4, y3, u4, u3 subps m3, m1, m5 ; odd part 2 addps m2, m1, m5 ; odd part 1 subps m1, m0, m4 ; even part 2 addps m0, m0, m4 ; even part 1 ; deinterleaving unpcklpd m5, m1, m3 unpcklpd m4, m0, m2 unpckhpd m1, m3 unpckhpd m0, m2 vextractf128 [outq + 16*0], m4, 0 vextractf128 [outq + 16*1], m0, 0 vextractf128 [outq + 16*2], m4, 1 vextractf128 [outq + 16*3], m0, 1 vextractf128 [outq + 16*4], m5, 0 vextractf128 [outq + 16*5], m1, 0 vextractf128 [outq + 16*6], m5, 1 vextractf128 [outq + 16*7], m1, 1 ret #endif