00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "dsputil.h"
00022
/* 16-byte-aligned sign-bit masks for use with xorps: a 0x80000000 lane
 * flips the sign of the corresponding packed float ("m1"), a zero lane
 * leaves it unchanged ("p1").  Lane order in the name is lane 0..3.
 *
 * Declared unsigned so the sign bit can be expressed as 0x80000000U
 * instead of the signed shift 1 << 31, which is undefined behavior
 * (left-shifting into the sign bit of an int).  The arrays are only
 * ever read as 128-bit memory operands, so the element type change is
 * invisible to the asm that consumes them. */
static const unsigned int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, 0x80000000U };

static const unsigned int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, 0x80000000U, 0 };

static const unsigned int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0x80000000U, 0x80000000U };

static const unsigned int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0x80000000U, 0, 0x80000000U };

static const unsigned int m1m1m1m1[4] __attribute__((aligned(16))) =
    { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };
00037
#if 0
/* Debug helper (disabled): prints the four packed floats of an SSE
 * register.  If ever re-enabled it would additionally need
 * <xmmintrin.h> for __m128 and <stdio.h> for printf; note also that
 * reading the register through a float* cast relies on GCC-specific
 * behavior for __m128 type punning. */
static void print_v4sf(const char *str, __m128 a)
{
    float *p = (float *)&a;
    printf("%s: %f %f %f %f\n",
           str, p[0], p[1], p[2], p[3]);
}
#endif
00046
00047
/**
 * In-place complex FFT of 1 << s->nbits points, implemented with SSE
 * inline assembly.
 *
 * @param s FFT context: supplies nbits, the inverse-transform flag and
 *          the preinterleaved twiddle-factor table s->exptab1.
 * @param z buffer of 1 << s->nbits FFTComplex values, overwritten with
 *          the transform.  Must be 16-byte aligned: every access uses
 *          movaps.
 *
 * NOTE(review): none of the asm statements declare xmm clobbers or a
 * "memory" clobber even though they write through pointer operands;
 * this relies on the surrounding code generation being benign — worth
 * confirming against the asm constraints actually required.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    long i, j;
    long nblocks, nloops;
    FFTComplex *p, *cptr;

    /* Preload constant sign masks:
     *   xmm4 = p1p1m1m1 — xorps with it flips the sign of the two
     *          high packed floats;
     *   xmm5 = direction-dependent mask for the second butterfly step,
     *          chosen by s->inverse. */
    asm volatile(
        "movaps %0, %%xmm4 \n\t"
        "movaps %1, %%xmm5 \n\t"
        ::"m"(*p1p1m1m1),
          "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
    );

    /* Pass 1: twiddle-free butterflies over groups of four complex
     * values (32 bytes per iteration), walking the buffer from the top
     * down.  i is a byte offset; 8 << ln is the total size in bytes
     * (assumes sizeof(FFTComplex) == 8, i.e. two floats — TODO
     * confirm).  The trailing jg consumes the flags set by the leading
     * "sub": movaps/shufps/xorps/addps do not modify EFLAGS. */
    i = 8 << ln;
    asm volatile(
        "1: \n\t"
        "sub $32, %0 \n\t"
        /* butterfly within the first 16-byte vector: shuffle 0x4E
         * swaps the two complex values, xmm4 negates the high pair of
         * the copy before the add */
        "movaps (%0,%1), %%xmm0 \n\t"
        "movaps %%xmm0, %%xmm1 \n\t"
        "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
        "xorps %%xmm4, %%xmm1 \n\t"
        "addps %%xmm1, %%xmm0 \n\t"
        /* same butterfly on the second vector */
        "movaps 16(%0,%1), %%xmm2 \n\t"
        "movaps %%xmm2, %%xmm3 \n\t"
        "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
        "xorps %%xmm4, %%xmm3 \n\t"
        "addps %%xmm3, %%xmm2 \n\t"
        /* second stage: reorder (0xB4 swaps lanes 2 and 3) and apply
         * the direction-dependent sign mask */
        "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
        "xorps %%xmm5, %%xmm2 \n\t"
        /* combine: write sum and difference back in place */
        "movaps %%xmm0, %%xmm1 \n\t"
        "addps %%xmm2, %%xmm0 \n\t"
        "subps %%xmm2, %%xmm1 \n\t"
        "movaps %%xmm0, (%0,%1) \n\t"
        "movaps %%xmm1, 16(%0,%1) \n\t"
        "jg 1b \n\t"
        :"+r"(i)
        :"r"(z)
    );

    /* Remaining passes: iterative FFT schedule.  Each pass halves the
     * number of blocks and doubles the butterfly span (nloops); cptr
     * advances through the interleaved twiddle table s->exptab1. */
    nblocks = 1 << (ln-3);
    nloops = 1 << 2;
    cptr = s->exptab1;
    do {
        p = z;
        j = nblocks;
        do {
            /* i counts down in bytes over one half-block */
            i = nloops*8;
            asm volatile(
                "1: \n\t"
                "sub $32, %0 \n\t"
                /* load the upper (%2) and lower (%1) halves of the
                 * butterfly, two 16-byte vectors each */
                "movaps (%2,%0), %%xmm1 \n\t"
                "movaps (%1,%0), %%xmm0 \n\t"
                "movaps 16(%2,%0), %%xmm5 \n\t"
                "movaps 16(%1,%0), %%xmm4 \n\t"
                /* broadcast even lanes (0xA0) and odd lanes (0xF5) —
                 * i.e. the real and imaginary parts — for the complex
                 * multiply */
                "movaps %%xmm1, %%xmm2 \n\t"
                "movaps %%xmm5, %%xmm6 \n\t"
                "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
                "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
                "shufps $0xA0, %%xmm5, %%xmm5 \n\t"
                "shufps $0xF5, %%xmm6, %%xmm6 \n\t"
                /* multiply by the twiddle table at %3; note the table
                 * is addressed at twice the data stride: (%3,%0,2) */
                "mulps (%3,%0,2), %%xmm1 \n\t"
                "mulps 16(%3,%0,2), %%xmm2 \n\t"
                "mulps 32(%3,%0,2), %%xmm5 \n\t"
                "mulps 48(%3,%0,2), %%xmm6 \n\t"
                "addps %%xmm2, %%xmm1 \n\t"
                "addps %%xmm6, %%xmm5 \n\t"
                /* butterfly: sum written to the lower half (%1),
                 * difference to the upper half (%2) */
                "movaps %%xmm0, %%xmm3 \n\t"
                "movaps %%xmm4, %%xmm7 \n\t"
                "addps %%xmm1, %%xmm0 \n\t"
                "subps %%xmm1, %%xmm3 \n\t"
                "addps %%xmm5, %%xmm4 \n\t"
                "subps %%xmm5, %%xmm7 \n\t"
                "movaps %%xmm0, (%1,%0) \n\t"
                "movaps %%xmm3, (%2,%0) \n\t"
                "movaps %%xmm4, 16(%1,%0) \n\t"
                "movaps %%xmm7, 16(%2,%0) \n\t"
                "jg 1b \n\t"
                :"+r"(i)
                :"r"(p), "r"(p + nloops), "r"(cptr)
            );
            p += nloops*2;
        } while (--j);
        cptr += nloops*2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
}
00141
/**
 * Inverse MDCT implemented with SSE inline assembly.
 *
 * @param s      MDCT context: supplies nbits, the tcos/tsin rotation
 *               tables and the embedded FFT context (incl. revtab).
 * @param output n = 1 << s->nbits floats receiving the result.
 * @param input  the MDCT coefficients (read up to index n/2 - 1).
 * @param tmp    scratch buffer reused as FFTComplex values; must be
 *               16-byte aligned (movaps throughout).
 *
 * Pipeline:
 *   1. pre-rotate the input by tcos/tsin, scattering rotated pairs
 *      into z[] in bit-reversed order via revtab;
 *   2. run the complex FFT on z in place;
 *   3. post-rotate z by tcos/tsin;
 *   4. unscramble z, with sign flips, into the quarters of output[].
 */
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
                       const FFTSample *input, FFTSample *tmp)
{
    long k, n8, n4, n2, n;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)tmp;

    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* The p1m1p1m1 mask (flips the sign of the odd float lanes) is
     * kept in xmm8 for the whole function on x86-64; on i386, where
     * only xmm0-xmm7 exist, it is instead passed as an extra memory
     * operand ("%4", later "%3") to each asm block that needs it. */
#ifdef ARCH_X86_64
    asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
#define P1M1P1M1 "%%xmm8"
#else
#define P1M1P1M1 "%4"
#endif

    /* Pre-rotation: in1 walks forward from the start of the input,
     * in2 backward from its midpoint. */
    in1 = input;
    in2 = input + n2 - 4;

    for (k = 0; k < n4; k += 4) {
        /* NOTE(review): xmm0 and xmm4 must stay live across the
         * boundary between this asm statement and the store statement
         * below; this depends on the compiler emitting no SSE code in
         * between — fragile, worth confirming. */
        asm volatile (
            "movaps %0, %%xmm0 \n\t"            /* in2[-2k .. -2k+3] */
            "movaps %1, %%xmm3 \n\t"            /* in1[2k .. 2k+3]   */
            "movaps -16+1*%0, %%xmm4 \n\t"
            "movaps 16+1*%1, %%xmm7 \n\t"
            "movlps %2, %%xmm1 \n\t"            /* tcos[k..k+1] */
            "movlps %3, %%xmm2 \n\t"            /* tsin[k..k+1] */
            "movlps 8+1*%2, %%xmm5 \n\t"
            "movlps 8+1*%3, %%xmm6 \n\t"
            /* replicate input lanes for the complex multiply */
            "shufps $95, %%xmm0, %%xmm0 \n\t"
            "shufps $160,%%xmm3, %%xmm3 \n\t"
            "shufps $95, %%xmm4, %%xmm4 \n\t"
            "shufps $160,%%xmm7, %%xmm7 \n\t"
            /* interleave cos/sin into (c0,s0,c1,s1) order */
            "unpcklps %%xmm2, %%xmm1 \n\t"
            "unpcklps %%xmm6, %%xmm5 \n\t"
            "movaps %%xmm1, %%xmm2 \n\t"
            "movaps %%xmm5, %%xmm6 \n\t"
            /* sign-flip odd lanes, then swap lane pairs (0xB1 = 177)
             * to build the cross terms of the rotation */
            "xorps "P1M1P1M1", %%xmm2 \n\t"
            "xorps "P1M1P1M1", %%xmm6 \n\t"
            "mulps %%xmm1, %%xmm0 \n\t"
            "mulps %%xmm5, %%xmm4 \n\t"
            "shufps $177,%%xmm2, %%xmm2 \n\t"
            "shufps $177,%%xmm6, %%xmm6 \n\t"
            "mulps %%xmm2, %%xmm3 \n\t"
            "mulps %%xmm6, %%xmm7 \n\t"
            "addps %%xmm3, %%xmm0 \n\t"
            "addps %%xmm7, %%xmm4 \n\t"
            ::"m"(in2[-2*k]), "m"(in1[2*k]),
              "m"(tcos[k]), "m"(tsin[k])
#ifndef ARCH_X86_64
            ,"m"(*p1m1p1m1)
#endif
        );

        /* scatter the four rotated complex results to bit-reversed
         * positions in z[] */
        asm (
            "movlps %%xmm0, %0 \n\t"
            "movhps %%xmm0, %1 \n\t"
            "movlps %%xmm4, %2 \n\t"
            "movhps %%xmm4, %3 \n\t"
            :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]),
             "=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]])
        );
    }

    ff_fft_calc_sse(&s->fft, z);

    /* the post-rotation asm below has one fewer input operand than the
     * pre-rotation one, so on i386 the mask operand shifts from %4 to
     * %3 */
#ifndef ARCH_X86_64
#undef P1M1P1M1
#define P1M1P1M1 "%3"
#endif

    /* Post-rotation: rotate each pair of z[k] values by
     * (tcos[k], tsin[k]) in place, same complex-multiply pattern as
     * the pre-rotation above. */
    for (k = 0; k < n4; k += 4) {
        asm (
            "movaps %0, %%xmm0 \n\t"            /* z[k .. k+1]   */
            "movaps 16+1*%0, %%xmm4 \n\t"       /* z[k+2 .. k+3] */
            "movlps %1, %%xmm1 \n\t"            /* tcos[k..k+1]  */
            "movlps 8+1*%1, %%xmm5 \n\t"
            "movaps %%xmm0, %%xmm3 \n\t"
            "movaps %%xmm4, %%xmm7 \n\t"
            "movlps %2, %%xmm2 \n\t"            /* tsin[k..k+1]  */
            "movlps 8+1*%2, %%xmm6 \n\t"
            /* broadcast even lanes (0xA0 = 160) and odd lanes
             * (0xF5 = 245) of z for the complex multiply */
            "shufps $160,%%xmm0, %%xmm0 \n\t"
            "shufps $245,%%xmm3, %%xmm3 \n\t"
            "shufps $160,%%xmm4, %%xmm4 \n\t"
            "shufps $245,%%xmm7, %%xmm7 \n\t"
            "unpcklps %%xmm2, %%xmm1 \n\t"
            "unpcklps %%xmm6, %%xmm5 \n\t"
            "movaps %%xmm1, %%xmm2 \n\t"
            "movaps %%xmm5, %%xmm6 \n\t"
            "xorps "P1M1P1M1", %%xmm2 \n\t"
            "mulps %%xmm1, %%xmm0 \n\t"
            "xorps "P1M1P1M1", %%xmm6 \n\t"
            "mulps %%xmm5, %%xmm4 \n\t"
            "shufps $177,%%xmm2, %%xmm2 \n\t"
            "shufps $177,%%xmm6, %%xmm6 \n\t"
            "mulps %%xmm2, %%xmm3 \n\t"
            "mulps %%xmm6, %%xmm7 \n\t"
            "addps %%xmm3, %%xmm0 \n\t"
            "addps %%xmm7, %%xmm4 \n\t"
            "movaps %%xmm0, %0 \n\t"
            "movaps %%xmm4, 16+1*%0\n\t"
            :"+m"(z[k])
            :"m"(tcos[k]), "m"(tsin[k])
#ifndef ARCH_X86_64
            ,"m"(*p1m1p1m1)
#endif
        );
    }

    /* Final unscramble: k runs from 16-n up to 0 in 16-byte steps.
     * The two "neg %0" instructions flip the sign of the byte offset
     * so the same counter addresses z+n8 from both sides at once; the
     * xorps against m1m1m1m1 (all sign bits) negates a whole vector,
     * and shuffle 0x1B (= 27) reverses the four floats of a register.
     * Results land in the four quarters of output[] via the base
     * pointers output, output+n2 and output+n. */
    k = 16-n;
    asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
    asm volatile(
        "1: \n\t"
        "movaps -16(%4,%0), %%xmm1 \n\t"
        "neg %0 \n\t"
        "movaps (%4,%0), %%xmm0 \n\t"
        "xorps %%xmm7, %%xmm0 \n\t"
        "movaps %%xmm0, %%xmm2 \n\t"
        "shufps $141,%%xmm1, %%xmm0 \n\t"
        "shufps $216,%%xmm1, %%xmm2 \n\t"
        "shufps $156,%%xmm0, %%xmm0 \n\t"
        "shufps $156,%%xmm2, %%xmm2 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm2, (%2,%0) \n\t"
        "neg %0 \n\t"
        "shufps $27, %%xmm0, %%xmm0 \n\t"
        "xorps %%xmm7, %%xmm0 \n\t"
        "shufps $27, %%xmm2, %%xmm2 \n\t"
        "movaps %%xmm0, -16(%2,%0) \n\t"
        "movaps %%xmm2, -16(%3,%0) \n\t"
        "add $16, %0 \n\t"
        "jle 1b \n\t"
        :"+r"(k)
        :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
        :"memory"
    );
}
00299