00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "dsputil.h"
00023 #include "simple_idct.h"
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
/*
 * Fixed-point cosine constants. In the per-line formulas, i is the digit in
 * the macro name: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded.
 * C4 is the exception: the active branch uses "- 0.5" instead of "+ 0.5",
 * giving 16383 rather than 16384 (the 16384 variant is kept under #if 0).
 */
00035 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00036 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00037 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00038 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00039 #if 0
00040 #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00041 #else
00042 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
00043 #endif
00044 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00045 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00046 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00047
/*
 * Right-shift amounts applied after the row pass and the column pass.
 * ROW_SHIFT also sizes the rounding terms in coeffs[]. The inline-asm macro
 * invocations in idct() pass the literals 11 and 20 rather than these macros,
 * so the two must be kept in sync by hand.
 */
00048 #define ROW_SHIFT 11
00049 #define COL_SHIFT 20 // 6
00050
/*
 * 64-bit constants referenced from the inline asm through MANGLE().
 *
 * wm1010: keeps the upper 16 bits of each 32-bit half. DC_COND_IDCT ANDs it
 *         with the first source quadword before OR-ing in the other three,
 *         so the low word of each dword is left out of its all-zero test.
 * d40000: added (paddd) to mm0 in DC_COND_IDCT's shortcut path, between the
 *         "pslld $16" and the "psrad $13"; only the low dword is non-zero.
 */
00051 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
00052 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
00053
/*
 * Constant table of int16_t values, declared with 8-byte alignment.
 * Each row below is four int16_t = one 8-byte quadword; the trailing comments
 * give the row's byte offset from the start of the table. The asm macros in
 * idct() address quadwords at displacements 0, 8 and 16..104 from operand %2,
 * which matches this layout.
 * NOTE(review): the asm operand list is not visible here -- confirm that %2
 * is bound to this table.
 * The first two rows hold the rounding term 1<<(ROW_SHIFT-1); the second row
 * differs from the first only by the extra 1 in its second element.
 */
00054 DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
00055 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0
00056
00057
00058 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8
00059
00060
00061
00062
00063 C4, C4, C4, C4, // offset 16
00064 C4, -C4, C4, -C4, // offset 24
00065
00066 C2, C6, C2, C6, // offset 32
00067 C6, -C2, C6, -C2, // offset 40
00068
00069 C1, C3, C1, C3, // offset 48
00070 C5, C7, C5, C7, // offset 56
00071
00072 C3, -C7, C3, -C7, // offset 64
00073 -C1, -C5, -C1, -C5, // offset 72
00074
00075 C5, -C1, C5, -C1, // offset 80
00076 C7, C3, C7, C3, // offset 88
00077
00078 C7, -C5, C7, -C5, // offset 96
00079 C3, -C1, C3, -C1 // offset 104
00080 };
00081
00082 #if 0
/*
 * Compiled out by the enclosing #if 0. Its only effect is to read wm1010 and
 * d40000 and store their sum; judging by the name, it presumably exists to
 * silence unused-variable warnings for those two constants.
 * NOTE(review): temp is not declared in this scope, so this function would
 * not compile if the #if 0 were removed.
 */
00083 static void unused_var_killer(){
00084 int a= wm1010 + d40000;
00085 temp[0]=a;
00086 }
00087
/*
 * Plain C version of the column pass (compiled out by the enclosing #if 0).
 *
 * col:   accessed with a stride of 8 (col[8*0] .. col[8*7]); it is first
 *        filled from input and then overwritten with the eight results.
 * input: read at offsets 8*0, 8*2, 8*4 and 8*6, plus the element directly
 *        after each of those.
 *
 * Each result is (a +/- b) >> COL_SHIFT. a0..a3 combine col[8*0], col[8*2],
 * col[8*4], col[8*6] and already contain the rounding term
 * 1<<(COL_SHIFT-1); b0..b3 combine col[8*1], col[8*3], col[8*5], col[8*7].
 */
00088 static void inline idctCol (int16_t * col, int16_t *input)
00089 {
// Drop the file-scope C0..C7 macros so the same names can be declared as
// local constants below. The #undefs stay in effect for everything that
// follows this function in the translation unit.
00090 #undef C0
00091 #undef C1
00092 #undef C2
00093 #undef C3
00094 #undef C4
00095 #undef C5
00096 #undef C6
00097 #undef C7
00098 int a0, a1, a2, a3, b0, b1, b2, b3;
// Same values as the file-scope macros; C0 is declared but not used here.
00099 const int C0 = 23170;
00100 const int C1 = 22725;
00101 const int C2 = 21407;
00102 const int C3 = 19266;
00103 const int C4 = 16383;
00104 const int C5 = 12873;
00105 const int C6 = 8867;
00106 const int C7 = 4520;
00107
00108
00109
00110
00111
00112
00113
// Gather the eight inputs of this column into col[] (stride 8).
00114 col[8*0] = input[8*0 + 0];
00115 col[8*1] = input[8*2 + 0];
00116 col[8*2] = input[8*0 + 1];
00117 col[8*3] = input[8*2 + 1];
00118 col[8*4] = input[8*4 + 0];
00119 col[8*5] = input[8*6 + 0];
00120 col[8*6] = input[8*4 + 1];
00121 col[8*7] = input[8*6 + 1];
00122
// Even-indexed terms, with the rounding constant folded in.
00123 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
00124 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
00125 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
00126 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
00127
// Odd-indexed terms.
00128 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
00129 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
00130 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
00131 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
00132
// Outputs 0..3 use a+b, outputs 7..4 use the matching a-b.
00133 col[8*0] = (a0 + b0) >> COL_SHIFT;
00134 col[8*1] = (a1 + b1) >> COL_SHIFT;
00135 col[8*2] = (a2 + b2) >> COL_SHIFT;
00136 col[8*3] = (a3 + b3) >> COL_SHIFT;
00137 col[8*4] = (a3 - b3) >> COL_SHIFT;
00138 col[8*5] = (a2 - b2) >> COL_SHIFT;
00139 col[8*6] = (a1 - b1) >> COL_SHIFT;
00140 col[8*7] = (a0 - b0) >> COL_SHIFT;
00141 }
00142
/*
 * Plain C version of the row pass (compiled out by the enclosing #if 0).
 *
 * input:  eight values are read, at indices 0, 1, 4, 5, 8, 9, 12 and 13.
 * output: eight results are written, at the even indices 0, 2, ..., 14.
 *
 * If row[1]..row[7] are all zero, every output is set to row[0]<<3.
 * Otherwise each result is (a +/- b) >> ROW_SHIFT, where a0..a3 combine
 * row[0,2,4,6] and include the rounding term 1<<(ROW_SHIFT-1), and b0..b3
 * combine row[1,3,5,7].
 */
00143 static void inline idctRow (int16_t * output, int16_t * input)
00144 {
00145 int16_t row[8];
00146
00147 int a0, a1, a2, a3, b0, b1, b2, b3;
// These declarations rely on the #undef C0..C7 lines inside idctCol above;
// with the macros still defined, the names would expand to numbers.
// C0 is declared but not used here.
00148 const int C0 = 23170;
00149 const int C1 = 22725;
00150 const int C2 = 21407;
00151 const int C3 = 19266;
00152 const int C4 = 16383;
00153 const int C5 = 12873;
00154 const int C6 = 8867;
00155 const int C7 = 4520;
00156
// Gather the inputs into natural order in the local row[] buffer.
00157 row[0] = input[0];
00158 row[2] = input[1];
00159 row[4] = input[4];
00160 row[6] = input[5];
00161 row[1] = input[8];
00162 row[3] = input[9];
00163 row[5] = input[12];
00164 row[7] = input[13];
00165
// Shortcut when only row[0] is non-zero: all eight outputs get row[0]<<3
// (C4 / 2^ROW_SHIFT is just under 8).
00166 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
00167 row[0] = row[1] = row[2] = row[3] = row[4] =
00168 row[5] = row[6] = row[7] = row[0]<<3;
00169 output[0] = row[0];
00170 output[2] = row[1];
00171 output[4] = row[2];
00172 output[6] = row[3];
00173 output[8] = row[4];
00174 output[10] = row[5];
00175 output[12] = row[6];
00176 output[14] = row[7];
00177 return;
00178 }
00179
// Even-indexed terms, with the rounding constant folded in.
00180 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
00181 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
00182 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
00183 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
00184
// Odd-indexed terms.
00185 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00186 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00187 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00188 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00189
// Results 0..3 use a+b, results 7..4 use the matching a-b.
00190 row[0] = (a0 + b0) >> ROW_SHIFT;
00191 row[1] = (a1 + b1) >> ROW_SHIFT;
00192 row[2] = (a2 + b2) >> ROW_SHIFT;
00193 row[3] = (a3 + b3) >> ROW_SHIFT;
00194 row[4] = (a3 - b3) >> ROW_SHIFT;
00195 row[5] = (a2 - b2) >> ROW_SHIFT;
00196 row[6] = (a1 - b1) >> ROW_SHIFT;
00197 row[7] = (a0 - b0) >> ROW_SHIFT;
00198
// Store with a stride of two elements.
00199 output[0] = row[0];
00200 output[2] = row[1];
00201 output[4] = row[2];
00202 output[6] = row[3];
00203 output[8] = row[4];
00204 output[10] = row[5];
00205 output[12] = row[6];
00206 output[14] = row[7];
00207 }
00208 #endif
00209
00210 static inline void idct(int16_t *block)
00211 {
00212 DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
00213 int16_t * const temp= (int16_t*)align_tmp;
00214
00215 asm volatile(
00216 #if 0 //Alternative, simpler variant
00217
00218 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00219 "movq " #src0 ", %%mm0 \n\t" \
00220 "movq " #src4 ", %%mm1 \n\t" \
00221 "movq " #src1 ", %%mm2 \n\t" \
00222 "movq " #src5 ", %%mm3 \n\t" \
00223 "movq 16(%2), %%mm4 \n\t" \
00224 "pmaddwd %%mm0, %%mm4 \n\t" \
00225 "movq 24(%2), %%mm5 \n\t" \
00226 "pmaddwd %%mm5, %%mm0 \n\t" \
00227 "movq 32(%2), %%mm5 \n\t" \
00228 "pmaddwd %%mm1, %%mm5 \n\t" \
00229 "movq 40(%2), %%mm6 \n\t" \
00230 "pmaddwd %%mm6, %%mm1 \n\t" \
00231 "movq 48(%2), %%mm7 \n\t" \
00232 "pmaddwd %%mm2, %%mm7 \n\t" \
00233 #rounder ", %%mm4 \n\t"\
00234 "movq %%mm4, %%mm6 \n\t" \
00235 "paddd %%mm5, %%mm4 \n\t" \
00236 "psubd %%mm5, %%mm6 \n\t" \
00237 "movq 56(%2), %%mm5 \n\t" \
00238 "pmaddwd %%mm3, %%mm5 \n\t" \
00239 #rounder ", %%mm0 \n\t"\
00240 "paddd %%mm0, %%mm1 \n\t" \
00241 "paddd %%mm0, %%mm0 \n\t" \
00242 "psubd %%mm1, %%mm0 \n\t" \
00243 "pmaddwd 64(%2), %%mm2 \n\t" \
00244 "paddd %%mm5, %%mm7 \n\t" \
00245 "movq 72(%2), %%mm5 \n\t" \
00246 "pmaddwd %%mm3, %%mm5 \n\t" \
00247 "paddd %%mm4, %%mm7 \n\t" \
00248 "paddd %%mm4, %%mm4 \n\t" \
00249 "psubd %%mm7, %%mm4 \n\t" \
00250 "paddd %%mm2, %%mm5 \n\t" \
00251 "psrad $" #shift ", %%mm7 \n\t"\
00252 "psrad $" #shift ", %%mm4 \n\t"\
00253 "movq %%mm1, %%mm2 \n\t" \
00254 "paddd %%mm5, %%mm1 \n\t" \
00255 "psubd %%mm5, %%mm2 \n\t" \
00256 "psrad $" #shift ", %%mm1 \n\t"\
00257 "psrad $" #shift ", %%mm2 \n\t"\
00258 "packssdw %%mm1, %%mm7 \n\t" \
00259 "packssdw %%mm4, %%mm2 \n\t" \
00260 "movq %%mm7, " #dst " \n\t"\
00261 "movq " #src1 ", %%mm1 \n\t" \
00262 "movq 80(%2), %%mm4 \n\t" \
00263 "movq %%mm2, 24+" #dst " \n\t"\
00264 "pmaddwd %%mm1, %%mm4 \n\t" \
00265 "movq 88(%2), %%mm7 \n\t" \
00266 "pmaddwd 96(%2), %%mm1 \n\t" \
00267 "pmaddwd %%mm3, %%mm7 \n\t" \
00268 "movq %%mm0, %%mm2 \n\t" \
00269 "pmaddwd 104(%2), %%mm3 \n\t" \
00270 "paddd %%mm7, %%mm4 \n\t" \
00271 "paddd %%mm4, %%mm2 \n\t" \
00272 "psubd %%mm4, %%mm0 \n\t" \
00273 "psrad $" #shift ", %%mm2 \n\t"\
00274 "psrad $" #shift ", %%mm0 \n\t"\
00275 "movq %%mm6, %%mm4 \n\t" \
00276 "paddd %%mm1, %%mm3 \n\t" \
00277 "paddd %%mm3, %%mm6 \n\t" \
00278 "psubd %%mm3, %%mm4 \n\t" \
00279 "psrad $" #shift ", %%mm6 \n\t"\
00280 "packssdw %%mm6, %%mm2 \n\t" \
00281 "movq %%mm2, 8+" #dst " \n\t"\
00282 "psrad $" #shift ", %%mm4 \n\t"\
00283 "packssdw %%mm0, %%mm4 \n\t" \
00284 "movq %%mm4, 16+" #dst " \n\t"\
00285
00286 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
00287 "movq " #src0 ", %%mm0 \n\t" \
00288 "movq " #src4 ", %%mm1 \n\t" \
00289 "movq " #src1 ", %%mm2 \n\t" \
00290 "movq " #src5 ", %%mm3 \n\t" \
00291 "movq 16(%2), %%mm4 \n\t" \
00292 "pmaddwd %%mm0, %%mm4 \n\t" \
00293 "movq 24(%2), %%mm5 \n\t" \
00294 "pmaddwd %%mm5, %%mm0 \n\t" \
00295 "movq 32(%2), %%mm5 \n\t" \
00296 "pmaddwd %%mm1, %%mm5 \n\t" \
00297 "movq 40(%2), %%mm6 \n\t" \
00298 "pmaddwd %%mm6, %%mm1 \n\t" \
00299 "movq %%mm4, %%mm6 \n\t" \
00300 "movq 48(%2), %%mm7 \n\t" \
00301 "pmaddwd %%mm2, %%mm7 \n\t" \
00302 "paddd %%mm5, %%mm4 \n\t" \
00303 "psubd %%mm5, %%mm6 \n\t" \
00304 "movq %%mm0, %%mm5 \n\t" \
00305 "paddd %%mm1, %%mm0 \n\t" \
00306 "psubd %%mm1, %%mm5 \n\t" \
00307 "movq 56(%2), %%mm1 \n\t" \
00308 "pmaddwd %%mm3, %%mm1 \n\t" \
00309 "pmaddwd 64(%2), %%mm2 \n\t" \
00310 "paddd %%mm1, %%mm7 \n\t" \
00311 "movq 72(%2), %%mm1 \n\t" \
00312 "pmaddwd %%mm3, %%mm1 \n\t" \
00313 "paddd %%mm4, %%mm7 \n\t" \
00314 "paddd %%mm4, %%mm4 \n\t" \
00315 "psubd %%mm7, %%mm4 \n\t" \
00316 "paddd %%mm2, %%mm1 \n\t" \
00317 "psrad $" #shift ", %%mm7 \n\t"\
00318 "psrad $" #shift ", %%mm4 \n\t"\
00319 "movq %%mm0, %%mm2 \n\t" \
00320 "paddd %%mm1, %%mm0 \n\t" \
00321 "psubd %%mm1, %%mm2 \n\t" \
00322 "psrad $" #shift ", %%mm0 \n\t"\
00323 "psrad $" #shift ", %%mm2 \n\t"\
00324 "packssdw %%mm7, %%mm7 \n\t" \
00325 "movd %%mm7, " #dst " \n\t"\
00326 "packssdw %%mm0, %%mm0 \n\t" \
00327 "movd %%mm0, 16+" #dst " \n\t"\
00328 "packssdw %%mm2, %%mm2 \n\t" \
00329 "movd %%mm2, 96+" #dst " \n\t"\
00330 "packssdw %%mm4, %%mm4 \n\t" \
00331 "movd %%mm4, 112+" #dst " \n\t"\
00332 "movq " #src1 ", %%mm0 \n\t" \
00333 "movq 80(%2), %%mm4 \n\t" \
00334 "pmaddwd %%mm0, %%mm4 \n\t" \
00335 "movq 88(%2), %%mm7 \n\t" \
00336 "pmaddwd 96(%2), %%mm0 \n\t" \
00337 "pmaddwd %%mm3, %%mm7 \n\t" \
00338 "movq %%mm5, %%mm2 \n\t" \
00339 "pmaddwd 104(%2), %%mm3 \n\t" \
00340 "paddd %%mm7, %%mm4 \n\t" \
00341 "paddd %%mm4, %%mm2 \n\t" \
00342 "psubd %%mm4, %%mm5 \n\t" \
00343 "psrad $" #shift ", %%mm2 \n\t"\
00344 "psrad $" #shift ", %%mm5 \n\t"\
00345 "movq %%mm6, %%mm4 \n\t" \
00346 "paddd %%mm0, %%mm3 \n\t" \
00347 "paddd %%mm3, %%mm6 \n\t" \
00348 "psubd %%mm3, %%mm4 \n\t" \
00349 "psrad $" #shift ", %%mm6 \n\t"\
00350 "psrad $" #shift ", %%mm4 \n\t"\
00351 "packssdw %%mm2, %%mm2 \n\t" \
00352 "packssdw %%mm6, %%mm6 \n\t" \
00353 "movd %%mm2, 32+" #dst " \n\t"\
00354 "packssdw %%mm4, %%mm4 \n\t" \
00355 "packssdw %%mm5, %%mm5 \n\t" \
00356 "movd %%mm6, 48+" #dst " \n\t"\
00357 "movd %%mm4, 64+" #dst " \n\t"\
00358 "movd %%mm5, 80+" #dst " \n\t"\
00359
00360
00361 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00362 "movq " #src0 ", %%mm0 \n\t" \
00363 "movq " #src4 ", %%mm1 \n\t" \
00364 "movq " #src1 ", %%mm2 \n\t" \
00365 "movq " #src5 ", %%mm3 \n\t" \
00366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
00367 "pand %%mm0, %%mm4 \n\t"\
00368 "por %%mm1, %%mm4 \n\t"\
00369 "por %%mm2, %%mm4 \n\t"\
00370 "por %%mm3, %%mm4 \n\t"\
00371 "packssdw %%mm4,%%mm4 \n\t"\
00372 "movd %%mm4, %%eax \n\t"\
00373 "orl %%eax, %%eax \n\t"\
00374 "jz 1f \n\t"\
00375 "movq 16(%2), %%mm4 \n\t" \
00376 "pmaddwd %%mm0, %%mm4 \n\t" \
00377 "movq 24(%2), %%mm5 \n\t" \
00378 "pmaddwd %%mm5, %%mm0 \n\t" \
00379 "movq 32(%2), %%mm5 \n\t" \
00380 "pmaddwd %%mm1, %%mm5 \n\t" \
00381 "movq 40(%2), %%mm6 \n\t" \
00382 "pmaddwd %%mm6, %%mm1 \n\t" \
00383 "movq 48(%2), %%mm7 \n\t" \
00384 "pmaddwd %%mm2, %%mm7 \n\t" \
00385 #rounder ", %%mm4 \n\t"\
00386 "movq %%mm4, %%mm6 \n\t" \
00387 "paddd %%mm5, %%mm4 \n\t" \
00388 "psubd %%mm5, %%mm6 \n\t" \
00389 "movq 56(%2), %%mm5 \n\t" \
00390 "pmaddwd %%mm3, %%mm5 \n\t" \
00391 #rounder ", %%mm0 \n\t"\
00392 "paddd %%mm0, %%mm1 \n\t" \
00393 "paddd %%mm0, %%mm0 \n\t" \
00394 "psubd %%mm1, %%mm0 \n\t" \
00395 "pmaddwd 64(%2), %%mm2 \n\t" \
00396 "paddd %%mm5, %%mm7 \n\t" \
00397 "movq 72(%2), %%mm5 \n\t" \
00398 "pmaddwd %%mm3, %%mm5 \n\t" \
00399 "paddd %%mm4, %%mm7 \n\t" \
00400 "paddd %%mm4, %%mm4 \n\t" \
00401 "psubd %%mm7, %%mm4 \n\t" \
00402 "paddd %%mm2, %%mm5 \n\t" \
00403 "psrad $" #shift ", %%mm7 \n\t"\
00404 "psrad $" #shift ", %%mm4 \n\t"\
00405 "movq %%mm1, %%mm2 \n\t" \
00406 "paddd %%mm5, %%mm1 \n\t" \
00407 "psubd %%mm5, %%mm2 \n\t" \
00408 "psrad $" #shift ", %%mm1 \n\t"\
00409 "psrad $" #shift ", %%mm2 \n\t"\
00410 "packssdw %%mm1, %%mm7 \n\t" \
00411 "packssdw %%mm4, %%mm2 \n\t" \
00412 "movq %%mm7, " #dst " \n\t"\
00413 "movq " #src1 ", %%mm1 \n\t" \
00414 "movq 80(%2), %%mm4 \n\t" \
00415 "movq %%mm2, 24+" #dst " \n\t"\
00416 "pmaddwd %%mm1, %%mm4 \n\t" \
00417 "movq 88(%2), %%mm7 \n\t" \
00418 "pmaddwd 96(%2), %%mm1 \n\t" \
00419 "pmaddwd %%mm3, %%mm7 \n\t" \
00420 "movq %%mm0, %%mm2 \n\t" \
00421 "pmaddwd 104(%2), %%mm3 \n\t" \
00422 "paddd %%mm7, %%mm4 \n\t" \
00423 "paddd %%mm4, %%mm2 \n\t" \
00424 "psubd %%mm4, %%mm0 \n\t" \
00425 "psrad $" #shift ", %%mm2 \n\t"\
00426 "psrad $" #shift ", %%mm0 \n\t"\
00427 "movq %%mm6, %%mm4 \n\t" \
00428 "paddd %%mm1, %%mm3 \n\t" \
00429 "paddd %%mm3, %%mm6 \n\t" \
00430 "psubd %%mm3, %%mm4 \n\t" \
00431 "psrad $" #shift ", %%mm6 \n\t"\
00432 "packssdw %%mm6, %%mm2 \n\t" \
00433 "movq %%mm2, 8+" #dst " \n\t"\
00434 "psrad $" #shift ", %%mm4 \n\t"\
00435 "packssdw %%mm0, %%mm4 \n\t" \
00436 "movq %%mm4, 16+" #dst " \n\t"\
00437 "jmp 2f \n\t"\
00438 "1: \n\t"\
00439 "pslld $16, %%mm0 \n\t"\
00440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
00441 "psrad $13, %%mm0 \n\t"\
00442 "packssdw %%mm0, %%mm0 \n\t"\
00443 "movq %%mm0, " #dst " \n\t"\
00444 "movq %%mm0, 8+" #dst " \n\t"\
00445 "movq %%mm0, 16+" #dst " \n\t"\
00446 "movq %%mm0, 24+" #dst " \n\t"\
00447 "2: \n\t"
00448
00449
00450
00451 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
00452
00453
00454
00455
00456 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
00457 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
00458 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
00459
00460
00461
00462 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00463 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00464 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00465 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00466
00467 #else
00468
00469 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00470 "movq " #src0 ", %%mm0 \n\t" \
00471 "movq " #src4 ", %%mm1 \n\t" \
00472 "movq " #src1 ", %%mm2 \n\t" \
00473 "movq " #src5 ", %%mm3 \n\t" \
00474 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
00475 "pand %%mm0, %%mm4 \n\t"\
00476 "por %%mm1, %%mm4 \n\t"\
00477 "por %%mm2, %%mm4 \n\t"\
00478 "por %%mm3, %%mm4 \n\t"\
00479 "packssdw %%mm4,%%mm4 \n\t"\
00480 "movd %%mm4, %%eax \n\t"\
00481 "orl %%eax, %%eax \n\t"\
00482 "jz 1f \n\t"\
00483 "movq 16(%2), %%mm4 \n\t" \
00484 "pmaddwd %%mm0, %%mm4 \n\t" \
00485 "movq 24(%2), %%mm5 \n\t" \
00486 "pmaddwd %%mm5, %%mm0 \n\t" \
00487 "movq 32(%2), %%mm5 \n\t" \
00488 "pmaddwd %%mm1, %%mm5 \n\t" \
00489 "movq 40(%2), %%mm6 \n\t" \
00490 "pmaddwd %%mm6, %%mm1 \n\t" \
00491 "movq 48(%2), %%mm7 \n\t" \
00492 "pmaddwd %%mm2, %%mm7 \n\t" \
00493 #rounder ", %%mm4 \n\t"\
00494 "movq %%mm4, %%mm6 \n\t" \
00495 "paddd %%mm5, %%mm4 \n\t" \
00496 "psubd %%mm5, %%mm6 \n\t" \
00497 "movq 56(%2), %%mm5 \n\t" \
00498 "pmaddwd %%mm3, %%mm5 \n\t" \
00499 #rounder ", %%mm0 \n\t"\
00500 "paddd %%mm0, %%mm1 \n\t" \
00501 "paddd %%mm0, %%mm0 \n\t" \
00502 "psubd %%mm1, %%mm0 \n\t" \
00503 "pmaddwd 64(%2), %%mm2 \n\t" \
00504 "paddd %%mm5, %%mm7 \n\t" \
00505 "movq 72(%2), %%mm5 \n\t" \
00506 "pmaddwd %%mm3, %%mm5 \n\t" \
00507 "paddd %%mm4, %%mm7 \n\t" \
00508 "paddd %%mm4, %%mm4 \n\t" \
00509 "psubd %%mm7, %%mm4 \n\t" \
00510 "paddd %%mm2, %%mm5 \n\t" \
00511 "psrad $" #shift ", %%mm7 \n\t"\
00512 "psrad $" #shift ", %%mm4 \n\t"\
00513 "movq %%mm1, %%mm2 \n\t" \
00514 "paddd %%mm5, %%mm1 \n\t" \
00515 "psubd %%mm5, %%mm2 \n\t" \
00516 "psrad $" #shift ", %%mm1 \n\t"\
00517 "psrad $" #shift ", %%mm2 \n\t"\
00518 "packssdw %%mm1, %%mm7 \n\t" \
00519 "packssdw %%mm4, %%mm2 \n\t" \
00520 "movq %%mm7, " #dst " \n\t"\
00521 "movq " #src1 ", %%mm1 \n\t" \
00522 "movq 80(%2), %%mm4 \n\t" \
00523 "movq %%mm2, 24+" #dst " \n\t"\
00524 "pmaddwd %%mm1, %%mm4 \n\t" \
00525 "movq 88(%2), %%mm7 \n\t" \
00526 "pmaddwd 96(%2), %%mm1 \n\t" \
00527 "pmaddwd %%mm3, %%mm7 \n\t" \
00528 "movq %%mm0, %%mm2 \n\t" \
00529 "pmaddwd 104(%2), %%mm3 \n\t" \
00530 "paddd %%mm7, %%mm4 \n\t" \
00531 "paddd %%mm4, %%mm2 \n\t" \
00532 "psubd %%mm4, %%mm0 \n\t" \
00533 "psrad $" #shift ", %%mm2 \n\t"\
00534 "psrad $" #shift ", %%mm0 \n\t"\
00535 "movq %%mm6, %%mm4 \n\t" \
00536 "paddd %%mm1, %%mm3 \n\t" \
00537 "paddd %%mm3, %%mm6 \n\t" \
00538 "psubd %%mm3, %%mm4 \n\t" \
00539 "psrad $" #shift ", %%mm6 \n\t"\
00540 "packssdw %%mm6, %%mm2 \n\t" \
00541 "movq %%mm2, 8+" #dst " \n\t"\
00542 "psrad $" #shift ", %%mm4 \n\t"\
00543 "packssdw %%mm0, %%mm4 \n\t" \
00544 "movq %%mm4, 16+" #dst " \n\t"\
00545 "jmp 2f \n\t"\
00546 "1: \n\t"\
00547 "pslld $16, %%mm0 \n\t"\
00548 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
00549 "psrad $13, %%mm0 \n\t"\
00550 "packssdw %%mm0, %%mm0 \n\t"\
00551 "movq %%mm0, " #dst " \n\t"\
00552 "movq %%mm0, 8+" #dst " \n\t"\
00553 "movq %%mm0, 16+" #dst " \n\t"\
00554 "movq %%mm0, 24+" #dst " \n\t"\
00555 "2: \n\t"
00556
00557 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
00558 "movq " #src0 ", %%mm0 \n\t" \
00559 "movq " #src4 ", %%mm1 \n\t" \
00560 "movq " #src1 ", %%mm2 \n\t" \
00561 "movq " #src5 ", %%mm3 \n\t" \
00562 "movq %%mm0, %%mm4 \n\t"\
00563 "por %%mm1, %%mm4 \n\t"\
00564 "por %%mm2, %%mm4 \n\t"\
00565 "por %%mm3, %%mm4 \n\t"\
00566 "packssdw %%mm4,%%mm4 \n\t"\
00567 "movd %%mm4, %%eax \n\t"\
00568 "orl %%eax, %%eax \n\t"\
00569 "jz " #bt " \n\t"\
00570 "movq 16(%2), %%mm4 \n\t" \
00571 "pmaddwd %%mm0, %%mm4 \n\t" \
00572 "movq 24(%2), %%mm5 \n\t" \
00573 "pmaddwd %%mm5, %%mm0 \n\t" \
00574 "movq 32(%2), %%mm5 \n\t" \
00575 "pmaddwd %%mm1, %%mm5 \n\t" \
00576 "movq 40(%2), %%mm6 \n\t" \
00577 "pmaddwd %%mm6, %%mm1 \n\t" \
00578 "movq 48(%2), %%mm7 \n\t" \
00579 "pmaddwd %%mm2, %%mm7 \n\t" \
00580 #rounder ", %%mm4 \n\t"\
00581 "movq %%mm4, %%mm6 \n\t" \
00582 "paddd %%mm5, %%mm4 \n\t" \
00583 "psubd %%mm5, %%mm6 \n\t" \
00584 "movq 56(%2), %%mm5 \n\t" \
00585 "pmaddwd %%mm3, %%mm5 \n\t" \
00586 #rounder ", %%mm0 \n\t"\
00587 "paddd %%mm0, %%mm1 \n\t" \
00588 "paddd %%mm0, %%mm0 \n\t" \
00589 "psubd %%mm1, %%mm0 \n\t" \
00590 "pmaddwd 64(%2), %%mm2 \n\t" \
00591 "paddd %%mm5, %%mm7 \n\t" \
00592 "movq 72(%2), %%mm5 \n\t" \
00593 "pmaddwd %%mm3, %%mm5 \n\t" \
00594 "paddd %%mm4, %%mm7 \n\t" \
00595 "paddd %%mm4, %%mm4 \n\t" \
00596 "psubd %%mm7, %%mm4 \n\t" \
00597 "paddd %%mm2, %%mm5 \n\t" \
00598 "psrad $" #shift ", %%mm7 \n\t"\
00599 "psrad $" #shift ", %%mm4 \n\t"\
00600 "movq %%mm1, %%mm2 \n\t" \
00601 "paddd %%mm5, %%mm1 \n\t" \
00602 "psubd %%mm5, %%mm2 \n\t" \
00603 "psrad $" #shift ", %%mm1 \n\t"\
00604 "psrad $" #shift ", %%mm2 \n\t"\
00605 "packssdw %%mm1, %%mm7 \n\t" \
00606 "packssdw %%mm4, %%mm2 \n\t" \
00607 "movq %%mm7, " #dst " \n\t"\
00608 "movq " #src1 ", %%mm1 \n\t" \
00609 "movq 80(%2), %%mm4 \n\t" \
00610 "movq %%mm2, 24+" #dst " \n\t"\
00611 "pmaddwd %%mm1, %%mm4 \n\t" \
00612 "movq 88(%2), %%mm7 \n\t" \
00613 "pmaddwd 96(%2), %%mm1 \n\t" \
00614 "pmaddwd %%mm3, %%mm7 \n\t" \
00615 "movq %%mm0, %%mm2 \n\t" \
00616 "pmaddwd 104(%2), %%mm3 \n\t" \
00617 "paddd %%mm7, %%mm4 \n\t" \
00618 "paddd %%mm4, %%mm2 \n\t" \
00619 "psubd %%mm4, %%mm0 \n\t" \
00620 "psrad $" #shift ", %%mm2 \n\t"\
00621 "psrad $" #shift ", %%mm0 \n\t"\
00622 "movq %%mm6, %%mm4 \n\t" \
00623 "paddd %%mm1, %%mm3 \n\t" \
00624 "paddd %%mm3, %%mm6 \n\t" \
00625 "psubd %%mm3, %%mm4 \n\t" \
00626 "psrad $" #shift ", %%mm6 \n\t"\
00627 "packssdw %%mm6, %%mm2 \n\t" \
00628 "movq %%mm2, 8+" #dst " \n\t"\
00629 "psrad $" #shift ", %%mm4 \n\t"\
00630 "packssdw %%mm0, %%mm4 \n\t" \
00631 "movq %%mm4, 16+" #dst " \n\t"\
00632
00633 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00634 "movq " #src0 ", %%mm0 \n\t" \
00635 "movq " #src4 ", %%mm1 \n\t" \
00636 "movq " #src1 ", %%mm2 \n\t" \
00637 "movq " #src5 ", %%mm3 \n\t" \
00638 "movq 16(%2), %%mm4 \n\t" \
00639 "pmaddwd %%mm0, %%mm4 \n\t" \
00640 "movq 24(%2), %%mm5 \n\t" \
00641 "pmaddwd %%mm5, %%mm0 \n\t" \
00642 "movq 32(%2), %%mm5 \n\t" \
00643 "pmaddwd %%mm1, %%mm5 \n\t" \
00644 "movq 40(%2), %%mm6 \n\t" \
00645 "pmaddwd %%mm6, %%mm1 \n\t" \
00646 "movq 48(%2), %%mm7 \n\t" \
00647 "pmaddwd %%mm2, %%mm7 \n\t" \
00648 #rounder ", %%mm4 \n\t"\
00649 "movq %%mm4, %%mm6 \n\t" \
00650 "paddd %%mm5, %%mm4 \n\t" \
00651 "psubd %%mm5, %%mm6 \n\t" \
00652 "movq 56(%2), %%mm5 \n\t" \
00653 "pmaddwd %%mm3, %%mm5 \n\t" \
00654 #rounder ", %%mm0 \n\t"\
00655 "paddd %%mm0, %%mm1 \n\t" \
00656 "paddd %%mm0, %%mm0 \n\t" \
00657 "psubd %%mm1, %%mm0 \n\t" \
00658 "pmaddwd 64(%2), %%mm2 \n\t" \
00659 "paddd %%mm5, %%mm7 \n\t" \
00660 "movq 72(%2), %%mm5 \n\t" \
00661 "pmaddwd %%mm3, %%mm5 \n\t" \
00662 "paddd %%mm4, %%mm7 \n\t" \
00663 "paddd %%mm4, %%mm4 \n\t" \
00664 "psubd %%mm7, %%mm4 \n\t" \
00665 "paddd %%mm2, %%mm5 \n\t" \
00666 "psrad $" #shift ", %%mm7 \n\t"\
00667 "psrad $" #shift ", %%mm4 \n\t"\
00668 "movq %%mm1, %%mm2 \n\t" \
00669 "paddd %%mm5, %%mm1 \n\t" \
00670 "psubd %%mm5, %%mm2 \n\t" \
00671 "psrad $" #shift ", %%mm1 \n\t"\
00672 "psrad $" #shift ", %%mm2 \n\t"\
00673 "packssdw %%mm1, %%mm7 \n\t" \
00674 "packssdw %%mm4, %%mm2 \n\t" \
00675 "movq %%mm7, " #dst " \n\t"\
00676 "movq " #src1 ", %%mm1 \n\t" \
00677 "movq 80(%2), %%mm4 \n\t" \
00678 "movq %%mm2, 24+" #dst " \n\t"\
00679 "pmaddwd %%mm1, %%mm4 \n\t" \
00680 "movq 88(%2), %%mm7 \n\t" \
00681 "pmaddwd 96(%2), %%mm1 \n\t" \
00682 "pmaddwd %%mm3, %%mm7 \n\t" \
00683 "movq %%mm0, %%mm2 \n\t" \
00684 "pmaddwd 104(%2), %%mm3 \n\t" \
00685 "paddd %%mm7, %%mm4 \n\t" \
00686 "paddd %%mm4, %%mm2 \n\t" \
00687 "psubd %%mm4, %%mm0 \n\t" \
00688 "psrad $" #shift ", %%mm2 \n\t"\
00689 "psrad $" #shift ", %%mm0 \n\t"\
00690 "movq %%mm6, %%mm4 \n\t" \
00691 "paddd %%mm1, %%mm3 \n\t" \
00692 "paddd %%mm3, %%mm6 \n\t" \
00693 "psubd %%mm3, %%mm4 \n\t" \
00694 "psrad $" #shift ", %%mm6 \n\t"\
00695 "packssdw %%mm6, %%mm2 \n\t" \
00696 "movq %%mm2, 8+" #dst " \n\t"\
00697 "psrad $" #shift ", %%mm4 \n\t"\
00698 "packssdw %%mm0, %%mm4 \n\t" \
00699 "movq %%mm4, 16+" #dst " \n\t"\
00700
00701
00702 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
00703 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
00704 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
00705 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
00706
00707 #undef IDCT
00708 #define IDCT(src0, src4, src1, src5, dst, shift) \
00709 "movq " #src0 ", %%mm0 \n\t" \
00710 "movq " #src4 ", %%mm1 \n\t" \
00711 "movq " #src1 ", %%mm2 \n\t" \
00712 "movq " #src5 ", %%mm3 \n\t" \
00713 "movq 16(%2), %%mm4 \n\t" \
00714 "pmaddwd %%mm0, %%mm4 \n\t" \
00715 "movq 24(%2), %%mm5 \n\t" \
00716 "pmaddwd %%mm5, %%mm0 \n\t" \
00717 "movq 32(%2), %%mm5 \n\t" \
00718 "pmaddwd %%mm1, %%mm5 \n\t" \
00719 "movq 40(%2), %%mm6 \n\t" \
00720 "pmaddwd %%mm6, %%mm1 \n\t" \
00721 "movq %%mm4, %%mm6 \n\t" \
00722 "movq 48(%2), %%mm7 \n\t" \
00723 "pmaddwd %%mm2, %%mm7 \n\t" \
00724 "paddd %%mm5, %%mm4 \n\t" \
00725 "psubd %%mm5, %%mm6 \n\t" \
00726 "movq %%mm0, %%mm5 \n\t" \
00727 "paddd %%mm1, %%mm0 \n\t" \
00728 "psubd %%mm1, %%mm5 \n\t" \
00729 "movq 56(%2), %%mm1 \n\t" \
00730 "pmaddwd %%mm3, %%mm1 \n\t" \
00731 "pmaddwd 64(%2), %%mm2 \n\t" \
00732 "paddd %%mm1, %%mm7 \n\t" \
00733 "movq 72(%2), %%mm1 \n\t" \
00734 "pmaddwd %%mm3, %%mm1 \n\t" \
00735 "paddd %%mm4, %%mm7 \n\t" \
00736 "paddd %%mm4, %%mm4 \n\t" \
00737 "psubd %%mm7, %%mm4 \n\t" \
00738 "paddd %%mm2, %%mm1 \n\t" \
00739 "psrad $" #shift ", %%mm7 \n\t"\
00740 "psrad $" #shift ", %%mm4 \n\t"\
00741 "movq %%mm0, %%mm2 \n\t" \
00742 "paddd %%mm1, %%mm0 \n\t" \
00743 "psubd %%mm1, %%mm2 \n\t" \
00744 "psrad $" #shift ", %%mm0 \n\t"\
00745 "psrad $" #shift ", %%mm2 \n\t"\
00746 "packssdw %%mm7, %%mm7 \n\t" \
00747 "movd %%mm7, " #dst " \n\t"\
00748 "packssdw %%mm0, %%mm0 \n\t" \
00749 "movd %%mm0, 16+" #dst " \n\t"\
00750 "packssdw %%mm2, %%mm2 \n\t" \
00751 "movd %%mm2, 96+" #dst " \n\t"\
00752 "packssdw %%mm4, %%mm4 \n\t" \
00753 "movd %%mm4, 112+" #dst " \n\t"\
00754 "movq " #src1 ", %%mm0 \n\t" \
00755 "movq 80(%2), %%mm4 \n\t" \
00756 "pmaddwd %%mm0, %%mm4 \n\t" \
00757 "movq 88(%2), %%mm7 \n\t" \
00758 "pmaddwd 96(%2), %%mm0 \n\t" \
00759 "pmaddwd %%mm3, %%mm7 \n\t" \
00760 "movq %%mm5, %%mm2 \n\t" \
00761 "pmaddwd 104(%2), %%mm3 \n\t" \
00762 "paddd %%mm7, %%mm4 \n\t" \
00763 "paddd %%mm4, %%mm2 \n\t" \
00764 "psubd %%mm4, %%mm5 \n\t" \
00765 "psrad $" #shift ", %%mm2 \n\t"\
00766 "psrad $" #shift ", %%mm5 \n\t"\
00767 "movq %%mm6, %%mm4 \n\t" \
00768 "paddd %%mm0, %%mm3 \n\t" \
00769 "paddd %%mm3, %%mm6 \n\t" \
00770 "psubd %%mm3, %%mm4 \n\t" \
00771 "psrad $" #shift ", %%mm6 \n\t"\
00772 "psrad $" #shift ", %%mm4 \n\t"\
00773 "packssdw %%mm2, %%mm2 \n\t" \
00774 "packssdw %%mm6, %%mm6 \n\t" \
00775 "movd %%mm2, 32+" #dst " \n\t"\
00776 "packssdw %%mm4, %%mm4 \n\t" \
00777 "packssdw %%mm5, %%mm5 \n\t" \
00778 "movd %%mm6, 48+" #dst " \n\t"\
00779 "movd %%mm4, 64+" #dst " \n\t"\
00780 "movd %%mm5, 80+" #dst " \n\t"
00781
00782
00783
00784 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00785 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00786 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00787 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00788 "jmp 9f \n\t"
00789
00790 "#" ASMALIGN(4) \
00791 "4: \n\t"
00792 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
00793 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
00794
00795 #undef IDCT
00796 #define IDCT(src0, src4, src1, src5, dst, shift) \
00797 "movq " #src0 ", %%mm0 \n\t" \
00798 "movq " #src4 ", %%mm1 \n\t" \
00799 "movq " #src5 ", %%mm3 \n\t" \
00800 "movq 16(%2), %%mm4 \n\t" \
00801 "pmaddwd %%mm0, %%mm4 \n\t" \
00802 "movq 24(%2), %%mm5 \n\t" \
00803 "pmaddwd %%mm5, %%mm0 \n\t" \
00804 "movq 32(%2), %%mm5 \n\t" \
00805 "pmaddwd %%mm1, %%mm5 \n\t" \
00806 "movq 40(%2), %%mm6 \n\t" \
00807 "pmaddwd %%mm6, %%mm1 \n\t" \
00808 "movq %%mm4, %%mm6 \n\t" \
00809 "paddd %%mm5, %%mm4 \n\t" \
00810 "psubd %%mm5, %%mm6 \n\t" \
00811 "movq %%mm0, %%mm5 \n\t" \
00812 "paddd %%mm1, %%mm0 \n\t" \
00813 "psubd %%mm1, %%mm5 \n\t" \
00814 "movq 56(%2), %%mm1 \n\t" \
00815 "pmaddwd %%mm3, %%mm1 \n\t" \
00816 "movq 72(%2), %%mm7 \n\t" \
00817 "pmaddwd %%mm3, %%mm7 \n\t" \
00818 "paddd %%mm4, %%mm1 \n\t" \
00819 "paddd %%mm4, %%mm4 \n\t" \
00820 "psubd %%mm1, %%mm4 \n\t" \
00821 "psrad $" #shift ", %%mm1 \n\t"\
00822 "psrad $" #shift ", %%mm4 \n\t"\
00823 "movq %%mm0, %%mm2 \n\t" \
00824 "paddd %%mm7, %%mm0 \n\t" \
00825 "psubd %%mm7, %%mm2 \n\t" \
00826 "psrad $" #shift ", %%mm0 \n\t"\
00827 "psrad $" #shift ", %%mm2 \n\t"\
00828 "packssdw %%mm1, %%mm1 \n\t" \
00829 "movd %%mm1, " #dst " \n\t"\
00830 "packssdw %%mm0, %%mm0 \n\t" \
00831 "movd %%mm0, 16+" #dst " \n\t"\
00832 "packssdw %%mm2, %%mm2 \n\t" \
00833 "movd %%mm2, 96+" #dst " \n\t"\
00834 "packssdw %%mm4, %%mm4 \n\t" \
00835 "movd %%mm4, 112+" #dst " \n\t"\
00836 "movq 88(%2), %%mm1 \n\t" \
00837 "pmaddwd %%mm3, %%mm1 \n\t" \
00838 "movq %%mm5, %%mm2 \n\t" \
00839 "pmaddwd 104(%2), %%mm3 \n\t" \
00840 "paddd %%mm1, %%mm2 \n\t" \
00841 "psubd %%mm1, %%mm5 \n\t" \
00842 "psrad $" #shift ", %%mm2 \n\t"\
00843 "psrad $" #shift ", %%mm5 \n\t"\
00844 "movq %%mm6, %%mm1 \n\t" \
00845 "paddd %%mm3, %%mm6 \n\t" \
00846 "psubd %%mm3, %%mm1 \n\t" \
00847 "psrad $" #shift ", %%mm6 \n\t"\
00848 "psrad $" #shift ", %%mm1 \n\t"\
00849 "packssdw %%mm2, %%mm2 \n\t" \
00850 "packssdw %%mm6, %%mm6 \n\t" \
00851 "movd %%mm2, 32+" #dst " \n\t"\
00852 "packssdw %%mm1, %%mm1 \n\t" \
00853 "packssdw %%mm5, %%mm5 \n\t" \
00854 "movd %%mm6, 48+" #dst " \n\t"\
00855 "movd %%mm1, 64+" #dst " \n\t"\
00856 "movd %%mm5, 80+" #dst " \n\t"
00857
00858
00859 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00860 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00861 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00862 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00863 "jmp 9f \n\t"
00864
00865 "#" ASMALIGN(4) \
00866 "6: \n\t"
00867 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
00868
00869 #undef IDCT
00870 #define IDCT(src0, src4, src1, src5, dst, shift) \
00871 "movq " #src0 ", %%mm0 \n\t" \
00872 "movq " #src5 ", %%mm3 \n\t" \
00873 "movq 16(%2), %%mm4 \n\t" \
00874 "pmaddwd %%mm0, %%mm4 \n\t" \
00875 "movq 24(%2), %%mm5 \n\t" \
00876 "pmaddwd %%mm5, %%mm0 \n\t" \
00877 "movq %%mm4, %%mm6 \n\t" \
00878 "movq %%mm0, %%mm5 \n\t" \
00879 "movq 56(%2), %%mm1 \n\t" \
00880 "pmaddwd %%mm3, %%mm1 \n\t" \
00881 "movq 72(%2), %%mm7 \n\t" \
00882 "pmaddwd %%mm3, %%mm7 \n\t" \
00883 "paddd %%mm4, %%mm1 \n\t" \
00884 "paddd %%mm4, %%mm4 \n\t" \
00885 "psubd %%mm1, %%mm4 \n\t" \
00886 "psrad $" #shift ", %%mm1 \n\t"\
00887 "psrad $" #shift ", %%mm4 \n\t"\
00888 "movq %%mm0, %%mm2 \n\t" \
00889 "paddd %%mm7, %%mm0 \n\t" \
00890 "psubd %%mm7, %%mm2 \n\t" \
00891 "psrad $" #shift ", %%mm0 \n\t"\
00892 "psrad $" #shift ", %%mm2 \n\t"\
00893 "packssdw %%mm1, %%mm1 \n\t" \
00894 "movd %%mm1, " #dst " \n\t"\
00895 "packssdw %%mm0, %%mm0 \n\t" \
00896 "movd %%mm0, 16+" #dst " \n\t"\
00897 "packssdw %%mm2, %%mm2 \n\t" \
00898 "movd %%mm2, 96+" #dst " \n\t"\
00899 "packssdw %%mm4, %%mm4 \n\t" \
00900 "movd %%mm4, 112+" #dst " \n\t"\
00901 "movq 88(%2), %%mm1 \n\t" \
00902 "pmaddwd %%mm3, %%mm1 \n\t" \
00903 "movq %%mm5, %%mm2 \n\t" \
00904 "pmaddwd 104(%2), %%mm3 \n\t" \
00905 "paddd %%mm1, %%mm2 \n\t" \
00906 "psubd %%mm1, %%mm5 \n\t" \
00907 "psrad $" #shift ", %%mm2 \n\t"\
00908 "psrad $" #shift ", %%mm5 \n\t"\
00909 "movq %%mm6, %%mm1 \n\t" \
00910 "paddd %%mm3, %%mm6 \n\t" \
00911 "psubd %%mm3, %%mm1 \n\t" \
00912 "psrad $" #shift ", %%mm6 \n\t"\
00913 "psrad $" #shift ", %%mm1 \n\t"\
00914 "packssdw %%mm2, %%mm2 \n\t" \
00915 "packssdw %%mm6, %%mm6 \n\t" \
00916 "movd %%mm2, 32+" #dst " \n\t"\
00917 "packssdw %%mm1, %%mm1 \n\t" \
00918 "packssdw %%mm5, %%mm5 \n\t" \
00919 "movd %%mm6, 48+" #dst " \n\t"\
00920 "movd %%mm1, 64+" #dst " \n\t"\
00921 "movd %%mm5, 80+" #dst " \n\t"
00922
00923
00924
00925 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00926 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00927 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00928 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00929 "jmp 9f \n\t"
00930
00931 "#" ASMALIGN(4) \
00932 "2: \n\t"
00933 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
00934
00935 #undef IDCT
00936 #define IDCT(src0, src4, src1, src5, dst, shift) \
00937 "movq " #src0 ", %%mm0 \n\t" \
00938 "movq " #src1 ", %%mm2 \n\t" \
00939 "movq " #src5 ", %%mm3 \n\t" \
00940 "movq 16(%2), %%mm4 \n\t" \
00941 "pmaddwd %%mm0, %%mm4 \n\t" \
00942 "movq 24(%2), %%mm5 \n\t" \
00943 "pmaddwd %%mm5, %%mm0 \n\t" \
00944 "movq %%mm4, %%mm6 \n\t" \
00945 "movq 48(%2), %%mm7 \n\t" \
00946 "pmaddwd %%mm2, %%mm7 \n\t" \
00947 "movq %%mm0, %%mm5 \n\t" \
00948 "movq 56(%2), %%mm1 \n\t" \
00949 "pmaddwd %%mm3, %%mm1 \n\t" \
00950 "pmaddwd 64(%2), %%mm2 \n\t" \
00951 "paddd %%mm1, %%mm7 \n\t" \
00952 "movq 72(%2), %%mm1 \n\t" \
00953 "pmaddwd %%mm3, %%mm1 \n\t" \
00954 "paddd %%mm4, %%mm7 \n\t" \
00955 "paddd %%mm4, %%mm4 \n\t" \
00956 "psubd %%mm7, %%mm4 \n\t" \
00957 "paddd %%mm2, %%mm1 \n\t" \
00958 "psrad $" #shift ", %%mm7 \n\t"\
00959 "psrad $" #shift ", %%mm4 \n\t"\
00960 "movq %%mm0, %%mm2 \n\t" \
00961 "paddd %%mm1, %%mm0 \n\t" \
00962 "psubd %%mm1, %%mm2 \n\t" \
00963 "psrad $" #shift ", %%mm0 \n\t"\
00964 "psrad $" #shift ", %%mm2 \n\t"\
00965 "packssdw %%mm7, %%mm7 \n\t" \
00966 "movd %%mm7, " #dst " \n\t"\
00967 "packssdw %%mm0, %%mm0 \n\t" \
00968 "movd %%mm0, 16+" #dst " \n\t"\
00969 "packssdw %%mm2, %%mm2 \n\t" \
00970 "movd %%mm2, 96+" #dst " \n\t"\
00971 "packssdw %%mm4, %%mm4 \n\t" \
00972 "movd %%mm4, 112+" #dst " \n\t"\
00973 "movq " #src1 ", %%mm0 \n\t" \
00974 "movq 80(%2), %%mm4 \n\t" \
00975 "pmaddwd %%mm0, %%mm4 \n\t" \
00976 "movq 88(%2), %%mm7 \n\t" \
00977 "pmaddwd 96(%2), %%mm0 \n\t" \
00978 "pmaddwd %%mm3, %%mm7 \n\t" \
00979 "movq %%mm5, %%mm2 \n\t" \
00980 "pmaddwd 104(%2), %%mm3 \n\t" \
00981 "paddd %%mm7, %%mm4 \n\t" \
00982 "paddd %%mm4, %%mm2 \n\t" \
00983 "psubd %%mm4, %%mm5 \n\t" \
00984 "psrad $" #shift ", %%mm2 \n\t"\
00985 "psrad $" #shift ", %%mm5 \n\t"\
00986 "movq %%mm6, %%mm4 \n\t" \
00987 "paddd %%mm0, %%mm3 \n\t" \
00988 "paddd %%mm3, %%mm6 \n\t" \
00989 "psubd %%mm3, %%mm4 \n\t" \
00990 "psrad $" #shift ", %%mm6 \n\t"\
00991 "psrad $" #shift ", %%mm4 \n\t"\
00992 "packssdw %%mm2, %%mm2 \n\t" \
00993 "packssdw %%mm6, %%mm6 \n\t" \
00994 "movd %%mm2, 32+" #dst " \n\t"\
00995 "packssdw %%mm4, %%mm4 \n\t" \
00996 "packssdw %%mm5, %%mm5 \n\t" \
00997 "movd %%mm6, 48+" #dst " \n\t"\
00998 "movd %%mm4, 64+" #dst " \n\t"\
00999 "movd %%mm5, 80+" #dst " \n\t"
01000
01001
01002 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01003 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01004 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01005 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01006 "jmp 9f \n\t"
01007
01008 "#" ASMALIGN(4) \
01009 "3: \n\t"
01010 #undef IDCT
01011 #define IDCT(src0, src4, src1, src5, dst, shift) \
01012 "movq " #src0 ", %%mm0 \n\t" \
01013 "movq " #src1 ", %%mm2 \n\t" \
01014 "movq 16(%2), %%mm4 \n\t" \
01015 "pmaddwd %%mm0, %%mm4 \n\t" \
01016 "movq 24(%2), %%mm5 \n\t" \
01017 "pmaddwd %%mm5, %%mm0 \n\t" \
01018 "movq %%mm4, %%mm6 \n\t" \
01019 "movq 48(%2), %%mm7 \n\t" \
01020 "pmaddwd %%mm2, %%mm7 \n\t" \
01021 "movq %%mm0, %%mm5 \n\t" \
01022 "movq 64(%2), %%mm3 \n\t"\
01023 "pmaddwd %%mm2, %%mm3 \n\t" \
01024 "paddd %%mm4, %%mm7 \n\t" \
01025 "paddd %%mm4, %%mm4 \n\t" \
01026 "psubd %%mm7, %%mm4 \n\t" \
01027 "psrad $" #shift ", %%mm7 \n\t"\
01028 "psrad $" #shift ", %%mm4 \n\t"\
01029 "movq %%mm0, %%mm1 \n\t" \
01030 "paddd %%mm3, %%mm0 \n\t" \
01031 "psubd %%mm3, %%mm1 \n\t" \
01032 "psrad $" #shift ", %%mm0 \n\t"\
01033 "psrad $" #shift ", %%mm1 \n\t"\
01034 "packssdw %%mm7, %%mm7 \n\t" \
01035 "movd %%mm7, " #dst " \n\t"\
01036 "packssdw %%mm0, %%mm0 \n\t" \
01037 "movd %%mm0, 16+" #dst " \n\t"\
01038 "packssdw %%mm1, %%mm1 \n\t" \
01039 "movd %%mm1, 96+" #dst " \n\t"\
01040 "packssdw %%mm4, %%mm4 \n\t" \
01041 "movd %%mm4, 112+" #dst " \n\t"\
01042 "movq 80(%2), %%mm4 \n\t" \
01043 "pmaddwd %%mm2, %%mm4 \n\t" \
01044 "pmaddwd 96(%2), %%mm2 \n\t" \
01045 "movq %%mm5, %%mm1 \n\t" \
01046 "paddd %%mm4, %%mm1 \n\t" \
01047 "psubd %%mm4, %%mm5 \n\t" \
01048 "psrad $" #shift ", %%mm1 \n\t"\
01049 "psrad $" #shift ", %%mm5 \n\t"\
01050 "movq %%mm6, %%mm4 \n\t" \
01051 "paddd %%mm2, %%mm6 \n\t" \
01052 "psubd %%mm2, %%mm4 \n\t" \
01053 "psrad $" #shift ", %%mm6 \n\t"\
01054 "psrad $" #shift ", %%mm4 \n\t"\
01055 "packssdw %%mm1, %%mm1 \n\t" \
01056 "packssdw %%mm6, %%mm6 \n\t" \
01057 "movd %%mm1, 32+" #dst " \n\t"\
01058 "packssdw %%mm4, %%mm4 \n\t" \
01059 "packssdw %%mm5, %%mm5 \n\t" \
01060 "movd %%mm6, 48+" #dst " \n\t"\
01061 "movd %%mm4, 64+" #dst " \n\t"\
01062 "movd %%mm5, 80+" #dst " \n\t"
01063
01064
01065
01066 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01067 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01068 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01069 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01070 "jmp 9f \n\t"
01071
01072 "#" ASMALIGN(4) \
01073 "5: \n\t"
01074 #undef IDCT
01075 #define IDCT(src0, src4, src1, src5, dst, shift) \
01076 "movq " #src0 ", %%mm0 \n\t" \
01077 "movq " #src4 ", %%mm1 \n\t" \
01078 "movq 16(%2), %%mm4 \n\t" \
01079 "pmaddwd %%mm0, %%mm4 \n\t" \
01080 "movq 24(%2), %%mm5 \n\t" \
01081 "pmaddwd %%mm5, %%mm0 \n\t" \
01082 "movq 32(%2), %%mm5 \n\t" \
01083 "pmaddwd %%mm1, %%mm5 \n\t" \
01084 "movq 40(%2), %%mm6 \n\t" \
01085 "pmaddwd %%mm6, %%mm1 \n\t" \
01086 "movq %%mm4, %%mm6 \n\t" \
01087 "paddd %%mm5, %%mm4 \n\t" \
01088 "psubd %%mm5, %%mm6 \n\t" \
01089 "movq %%mm0, %%mm5 \n\t" \
01090 "paddd %%mm1, %%mm0 \n\t" \
01091 "psubd %%mm1, %%mm5 \n\t" \
01092 "movq 8+" #src0 ", %%mm2 \n\t" \
01093 "movq 8+" #src4 ", %%mm3 \n\t" \
01094 "movq 16(%2), %%mm1 \n\t" \
01095 "pmaddwd %%mm2, %%mm1 \n\t" \
01096 "movq 24(%2), %%mm7 \n\t" \
01097 "pmaddwd %%mm7, %%mm2 \n\t" \
01098 "movq 32(%2), %%mm7 \n\t" \
01099 "pmaddwd %%mm3, %%mm7 \n\t" \
01100 "pmaddwd 40(%2), %%mm3 \n\t" \
01101 "paddd %%mm1, %%mm7 \n\t" \
01102 "paddd %%mm1, %%mm1 \n\t" \
01103 "psubd %%mm7, %%mm1 \n\t" \
01104 "paddd %%mm2, %%mm3 \n\t" \
01105 "paddd %%mm2, %%mm2 \n\t" \
01106 "psubd %%mm3, %%mm2 \n\t" \
01107 "psrad $" #shift ", %%mm4 \n\t"\
01108 "psrad $" #shift ", %%mm7 \n\t"\
01109 "psrad $" #shift ", %%mm3 \n\t"\
01110 "packssdw %%mm7, %%mm4 \n\t" \
01111 "movq %%mm4, " #dst " \n\t"\
01112 "psrad $" #shift ", %%mm0 \n\t"\
01113 "packssdw %%mm3, %%mm0 \n\t" \
01114 "movq %%mm0, 16+" #dst " \n\t"\
01115 "movq %%mm0, 96+" #dst " \n\t"\
01116 "movq %%mm4, 112+" #dst " \n\t"\
01117 "psrad $" #shift ", %%mm5 \n\t"\
01118 "psrad $" #shift ", %%mm6 \n\t"\
01119 "psrad $" #shift ", %%mm2 \n\t"\
01120 "packssdw %%mm2, %%mm5 \n\t" \
01121 "movq %%mm5, 32+" #dst " \n\t"\
01122 "psrad $" #shift ", %%mm1 \n\t"\
01123 "packssdw %%mm1, %%mm6 \n\t" \
01124 "movq %%mm6, 48+" #dst " \n\t"\
01125 "movq %%mm6, 64+" #dst " \n\t"\
01126 "movq %%mm5, 80+" #dst " \n\t"
01127
01128
01129
01130 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01131
01132 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01133
01134 "jmp 9f \n\t"
01135
01136
01137 "#" ASMALIGN(4) \
01138 "1: \n\t"
01139 #undef IDCT
01140 #define IDCT(src0, src4, src1, src5, dst, shift) \
01141 "movq " #src0 ", %%mm0 \n\t" \
01142 "movq " #src4 ", %%mm1 \n\t" \
01143 "movq " #src1 ", %%mm2 \n\t" \
01144 "movq 16(%2), %%mm4 \n\t" \
01145 "pmaddwd %%mm0, %%mm4 \n\t" \
01146 "movq 24(%2), %%mm5 \n\t" \
01147 "pmaddwd %%mm5, %%mm0 \n\t" \
01148 "movq 32(%2), %%mm5 \n\t" \
01149 "pmaddwd %%mm1, %%mm5 \n\t" \
01150 "movq 40(%2), %%mm6 \n\t" \
01151 "pmaddwd %%mm6, %%mm1 \n\t" \
01152 "movq %%mm4, %%mm6 \n\t" \
01153 "movq 48(%2), %%mm7 \n\t" \
01154 "pmaddwd %%mm2, %%mm7 \n\t" \
01155 "paddd %%mm5, %%mm4 \n\t" \
01156 "psubd %%mm5, %%mm6 \n\t" \
01157 "movq %%mm0, %%mm5 \n\t" \
01158 "paddd %%mm1, %%mm0 \n\t" \
01159 "psubd %%mm1, %%mm5 \n\t" \
01160 "movq 64(%2), %%mm1 \n\t"\
01161 "pmaddwd %%mm2, %%mm1 \n\t" \
01162 "paddd %%mm4, %%mm7 \n\t" \
01163 "paddd %%mm4, %%mm4 \n\t" \
01164 "psubd %%mm7, %%mm4 \n\t" \
01165 "psrad $" #shift ", %%mm7 \n\t"\
01166 "psrad $" #shift ", %%mm4 \n\t"\
01167 "movq %%mm0, %%mm3 \n\t" \
01168 "paddd %%mm1, %%mm0 \n\t" \
01169 "psubd %%mm1, %%mm3 \n\t" \
01170 "psrad $" #shift ", %%mm0 \n\t"\
01171 "psrad $" #shift ", %%mm3 \n\t"\
01172 "packssdw %%mm7, %%mm7 \n\t" \
01173 "movd %%mm7, " #dst " \n\t"\
01174 "packssdw %%mm0, %%mm0 \n\t" \
01175 "movd %%mm0, 16+" #dst " \n\t"\
01176 "packssdw %%mm3, %%mm3 \n\t" \
01177 "movd %%mm3, 96+" #dst " \n\t"\
01178 "packssdw %%mm4, %%mm4 \n\t" \
01179 "movd %%mm4, 112+" #dst " \n\t"\
01180 "movq 80(%2), %%mm4 \n\t" \
01181 "pmaddwd %%mm2, %%mm4 \n\t" \
01182 "pmaddwd 96(%2), %%mm2 \n\t" \
01183 "movq %%mm5, %%mm3 \n\t" \
01184 "paddd %%mm4, %%mm3 \n\t" \
01185 "psubd %%mm4, %%mm5 \n\t" \
01186 "psrad $" #shift ", %%mm3 \n\t"\
01187 "psrad $" #shift ", %%mm5 \n\t"\
01188 "movq %%mm6, %%mm4 \n\t" \
01189 "paddd %%mm2, %%mm6 \n\t" \
01190 "psubd %%mm2, %%mm4 \n\t" \
01191 "psrad $" #shift ", %%mm6 \n\t"\
01192 "packssdw %%mm3, %%mm3 \n\t" \
01193 "movd %%mm3, 32+" #dst " \n\t"\
01194 "psrad $" #shift ", %%mm4 \n\t"\
01195 "packssdw %%mm6, %%mm6 \n\t" \
01196 "movd %%mm6, 48+" #dst " \n\t"\
01197 "packssdw %%mm4, %%mm4 \n\t" \
01198 "packssdw %%mm5, %%mm5 \n\t" \
01199 "movd %%mm4, 64+" #dst " \n\t"\
01200 "movd %%mm5, 80+" #dst " \n\t"
01201
01202
01203
01204 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01205 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01206 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01207 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01208 "jmp 9f \n\t"
01209
01210
01211 "#" ASMALIGN(4)
01212 "7: \n\t"
01213 #undef IDCT
01214 #define IDCT(src0, src4, src1, src5, dst, shift) \
01215 "movq " #src0 ", %%mm0 \n\t" \
01216 "movq 16(%2), %%mm4 \n\t" \
01217 "pmaddwd %%mm0, %%mm4 \n\t" \
01218 "movq 24(%2), %%mm5 \n\t" \
01219 "pmaddwd %%mm5, %%mm0 \n\t" \
01220 "psrad $" #shift ", %%mm4 \n\t"\
01221 "psrad $" #shift ", %%mm0 \n\t"\
01222 "movq 8+" #src0 ", %%mm2 \n\t" \
01223 "movq 16(%2), %%mm1 \n\t" \
01224 "pmaddwd %%mm2, %%mm1 \n\t" \
01225 "movq 24(%2), %%mm7 \n\t" \
01226 "pmaddwd %%mm7, %%mm2 \n\t" \
01227 "movq 32(%2), %%mm7 \n\t" \
01228 "psrad $" #shift ", %%mm1 \n\t"\
01229 "packssdw %%mm1, %%mm4 \n\t" \
01230 "movq %%mm4, " #dst " \n\t"\
01231 "psrad $" #shift ", %%mm2 \n\t"\
01232 "packssdw %%mm2, %%mm0 \n\t" \
01233 "movq %%mm0, 16+" #dst " \n\t"\
01234 "movq %%mm0, 96+" #dst " \n\t"\
01235 "movq %%mm4, 112+" #dst " \n\t"\
01236 "movq %%mm0, 32+" #dst " \n\t"\
01237 "movq %%mm4, 48+" #dst " \n\t"\
01238 "movq %%mm4, 64+" #dst " \n\t"\
01239 "movq %%mm0, 80+" #dst " \n\t"
01240
01241
01242 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01243
01244 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01245
01246
01247
01248 #endif
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272 "9: \n\t"
01273 :: "r" (block), "r" (temp), "r" (coeffs)
01274 : "%eax"
01275 );
01276 }
01277
/**
 * Run the MMX inverse DCT on a coefficient block, in place.
 *
 * This is a plain forwarding wrapper around the file-local idct(); the
 * transform result is written back into the same buffer.
 *
 * @param block coefficient buffer, passed unchanged to idct()
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
01282
01283
01284
01285 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01286 {
01287 idct(block);
01288 put_pixels_clamped_mmx(block, dest, line_size);
01289 }
01290 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01291 {
01292 idct(block);
01293 add_pixels_clamped_mmx(block, dest, line_size);
01294 }