00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include "dsputil.h"
00027 #include "mmx.h"
00028
/*
 * Seven rows of eight 16-bit lane masks: 0xFFFF selects a lane, 0 clears it.
 * SSE2_Dequantize() fetches 16-byte rows at byte offsets 0, 16, 32, 48, 64,
 * 80 and 96 from `ecx` and ANDs them with data registers, so neither the row
 * order nor the eight-lane row width may change.
 * NOTE(review): `ecx` is presumably meant to point at this table; that
 * binding is not visible in this file - confirm.
 */
00029 static DECLARE_ALIGNED_16(const unsigned short, SSE2_dequant_const[]) =
00030 {
00031 0x0000,0xFFFF,0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,
00032 0x0000,0x0000,0x0000,0x0000,0xFFFF,0xFFFF,0x0000,0x0000,
00033 0xFFFF,0xFFFF,0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,
00034 0x0000,0x0000,0x0000,0xFFFF,0x0000,0x0000,0x0000,0x0000,
00035 0x0000,0x0000,0x0000,0xFFFF,0xFFFF,0x0000,0x0000,0x0000,
00036 0xFFFF,0x0000,0x0000,0x0000,0x0000,0xFFFF,0x0000,0x0000,
00037 0x0000,0x0000,0xFFFF,0xFFFF,0x0000,0x0000,0x0000,0x0000
00038 };
00039
/*
 * Sixteen bytes holding the value 8 in every 16-bit lane (two lanes packed
 * per 32-bit word).  ff_vp3_idct_sse2() exposes it as `Eight`, and
 * SSE2_Column_IDCT() adds it to its results ahead of the shift right by 4.
 */
00040 static DECLARE_ALIGNED_16(const unsigned int, eight_data[]) =
00041 {
00042 (8u << 16) | 8u,
00043 (8u << 16) | 8u,
00044 (8u << 16) | 8u,
00045 (8u << 16) | 8u
00046 };
00047
/*
 * IDCT multiplier table: row k-1 holds round(65536 * cos(k * pi / 16))
 * replicated across all eight 16-bit lanes, for k = 1..7.  The C(k) helper
 * defined in ff_vp3_idct_sse2() addresses row k-1 as a 16-byte vector.
 */
00048 static DECLARE_ALIGNED_16(const unsigned short, SSE2_idct_data[7 * 8]) =
00049 {
00050 0xFB15,0xFB15,0xFB15,0xFB15,0xFB15,0xFB15,0xFB15,0xFB15, /* k=1: 64277 */
00051 0xEC83,0xEC83,0xEC83,0xEC83,0xEC83,0xEC83,0xEC83,0xEC83, /* k=2: 60547 */
00052 0xD4DB,0xD4DB,0xD4DB,0xD4DB,0xD4DB,0xD4DB,0xD4DB,0xD4DB, /* k=3: 54491 */
00053 0xB505,0xB505,0xB505,0xB505,0xB505,0xB505,0xB505,0xB505, /* k=4: 46341 */
00054 0x8E3A,0x8E3A,0x8E3A,0x8E3A,0x8E3A,0x8E3A,0x8E3A,0x8E3A, /* k=5: 36410 */
00055 0x61F8,0x61F8,0x61F8,0x61F8,0x61F8,0x61F8,0x61F8,0x61F8, /* k=6: 25080 */
00056 0x31F1,0x31F1,0x31F1,0x31F1,0x31F1,0x31F1,0x31F1,0x31F1  /* k=7: 12785 */
00057 };
00058
00059
/*
 * SSE2_Column_IDCT(): final 1-D pass of ff_vp3_idct_sse2(), expanded after
 * SSE2_Row_IDCT() and SSE2_Transpose().
 *
 * Reads the eight 16-byte rows I(0)..I(7) and the constant rows C(1)..C(7),
 * and stores eight result rows to O(0)..O(7).  Every result is biased by
 * *Eight (directly, or through the partner value it is combined with) and
 * shifted right by 4 before being stored.  I(1) and I(2) are overwritten as
 * scratch storage on the way.
 *
 * The expanding scope must provide I(), O(), C() and Eight.  All of
 * xmm0..xmm7 are overwritten.  Loads, multiplies and adds of independent
 * values are interleaved, so statements must not be reordered casually.
 *
 * NOTE(review): the *_m2r / *_r2m / *_r2r / *_i2r helpers come from mmx.h and
 * presumably emit the like-named SSE2 instructions one-to-one - confirm.
 * NOTE(review): products with C(1)..C(5) are followed by a paddw of the
 * original operand while products with C(6)/C(7) are not; presumably this
 * compensates for pmulhw treating constants >= 0x8000 as negative - confirm.
 */
00060 #define SSE2_Column_IDCT() { \
00061 \
00062 movdqu_m2r(*I(3), xmm2); \
00063 movdqu_m2r(*C(3), xmm6); \
00064 \
00065 movdqu_r2r(xmm2, xmm4); \
00066 movdqu_m2r(*I(5), xmm7); \
00067 \
00068 pmulhw_r2r(xmm6, xmm4); \
00069 movdqu_m2r(*C(5), xmm1); \
00070 \
00071 pmulhw_r2r(xmm7, xmm6); \
00072 movdqu_r2r(xmm1, xmm5); \
00073 \
00074 pmulhw_r2r(xmm2, xmm1); \
00075 movdqu_m2r(*I(1), xmm3); \
00076 \
00077 pmulhw_r2r(xmm7, xmm5); \
00078 movdqu_m2r(*C(1), xmm0); \
00079 \
00080 \
00081 \
00082 paddw_r2r(xmm2, xmm4); \
00083 paddw_r2r(xmm7, xmm6); \
00084 \
00085 paddw_r2r(xmm1, xmm2); \
00086 movdqu_m2r(*I(7), xmm1); \
00087 \
00088 paddw_r2r(xmm5, xmm7); \
00089 movdqu_r2r(xmm0, xmm5); \
00090 \
00091 pmulhw_r2r(xmm3, xmm0); \
00092 paddsw_r2r(xmm7, xmm4); \
00093 \
00094 pmulhw_r2r(xmm1, xmm5); \
00095 movdqu_m2r(*C(7), xmm7); \
00096 \
00097 psubsw_r2r(xmm2, xmm6); \
00098 paddw_r2r(xmm3, xmm0); \
00099 \
00100 pmulhw_r2r(xmm7, xmm3); \
00101 movdqu_m2r(*I(2), xmm2); \
00102 \
00103 pmulhw_r2r(xmm1, xmm7); \
00104 paddw_r2r(xmm1, xmm5); \
00105 \
00106 movdqu_r2r(xmm2, xmm1); \
00107 pmulhw_m2r(*C(2), xmm2); \
00108 \
00109 psubsw_r2r(xmm5, xmm3); \
00110 movdqu_m2r(*I(6), xmm5); \
00111 \
00112 paddsw_r2r(xmm7, xmm0); \
00113 movdqu_r2r(xmm5, xmm7); \
00114 \
00115 psubsw_r2r(xmm4, xmm0); \
00116 pmulhw_m2r(*C(2), xmm5); \
00117 \
00118 paddw_r2r(xmm1, xmm2); \
00119 pmulhw_m2r(*C(6), xmm1); \
00120 \
00121 paddsw_r2r(xmm4, xmm4); \
00122 paddsw_r2r(xmm0, xmm4); \
00123 \
00124 psubsw_r2r(xmm6, xmm3); \
00125 paddw_r2r(xmm7, xmm5); \
00126 \
00127 paddsw_r2r(xmm6, xmm6); \
00128 pmulhw_m2r(*C(6), xmm7); \
00129 \
00130 paddsw_r2r(xmm3, xmm6); \
/* Park xmm4 in I(1) (already consumed); it is reloaded into xmm0 below. */ \
00131 movdqu_r2m(xmm4, *I(1)); \
00132 \
00133 psubsw_r2r(xmm5, xmm1); \
00134 movdqu_m2r(*C(4), xmm4); \
00135 \
00136 movdqu_r2r(xmm3, xmm5); \
00137 pmulhw_r2r(xmm4, xmm3); \
00138 \
00139 paddsw_r2r(xmm2, xmm7); \
/* Park xmm6 in I(2) (already consumed); it is reloaded into xmm3 below. */ \
00140 movdqu_r2m(xmm6, *I(2)); \
00141 \
00142 movdqu_r2r(xmm0, xmm2); \
00143 movdqu_m2r(*I(0), xmm6); \
00144 \
00145 pmulhw_r2r(xmm4, xmm0); \
00146 paddw_r2r(xmm3, xmm5); \
00147 \
00148 movdqu_m2r(*I(4), xmm3); \
00149 psubsw_r2r(xmm1, xmm5); \
00150 \
00151 paddw_r2r(xmm0, xmm2); \
00152 psubsw_r2r(xmm3, xmm6); \
00153 \
00154 movdqu_r2r(xmm6, xmm0); \
00155 pmulhw_r2r(xmm4, xmm6); \
00156 \
00157 paddsw_r2r(xmm3, xmm3); \
00158 paddsw_r2r(xmm1, xmm1); \
00159 \
00160 paddsw_r2r(xmm0, xmm3); \
00161 paddsw_r2r(xmm5, xmm1); \
00162 \
00163 pmulhw_r2r(xmm3, xmm4); \
00164 paddw_r2r(xmm0, xmm6); \
00165 \
00166 psubsw_r2r(xmm2, xmm6); \
00167 paddsw_r2r(xmm2, xmm2); \
00168 \
00169 movdqu_m2r(*I(1), xmm0); \
00170 paddsw_r2r(xmm6, xmm2); \
00171 \
00172 paddw_r2r(xmm3, xmm4); \
00173 psubsw_r2r(xmm1, xmm2); \
00174 \
/* Output stage: bias by *Eight, then shift right by 4; xmm1 receives the */ \
/* bias through xmm2.  The same pattern repeats for the remaining pairs.   */ \
00175 paddsw_m2r(*Eight, xmm2); \
00176 paddsw_r2r(xmm1, xmm1); \
00177 \
00178 paddsw_r2r(xmm2, xmm1); \
00179 psraw_i2r(4, xmm2); \
00180 \
00181 psubsw_r2r(xmm7, xmm4); \
00182 psraw_i2r(4, xmm1); \
00183 \
00184 movdqu_m2r(*I(2), xmm3); \
00185 paddsw_r2r(xmm7, xmm7); \
00186 \
00187 movdqu_r2m(xmm2, *O(2)); \
00188 paddsw_r2r(xmm4, xmm7); \
00189 \
00190 movdqu_r2m(xmm1, *O(1)); \
00191 psubsw_r2r(xmm3, xmm4); \
00192 \
00193 paddsw_m2r(*Eight, xmm4); \
00194 paddsw_r2r(xmm3, xmm3); \
00195 \
00196 paddsw_r2r(xmm4, xmm3); \
00197 psraw_i2r(4, xmm4); \
00198 \
00199 psubsw_r2r(xmm5, xmm6); \
00200 psraw_i2r(4, xmm3); \
00201 \
00202 paddsw_m2r(*Eight, xmm6); \
00203 paddsw_r2r(xmm5, xmm5); \
00204 \
00205 paddsw_r2r(xmm6, xmm5); \
00206 psraw_i2r(4, xmm6); \
00207 \
00208 movdqu_r2m(xmm4, *O(4)); \
00209 psraw_i2r(4, xmm5); \
00210 \
00211 movdqu_r2m(xmm3, *O(3)); \
00212 psubsw_r2r(xmm0, xmm7); \
00213 \
00214 paddsw_m2r(*Eight, xmm7); \
00215 paddsw_r2r(xmm0, xmm0); \
00216 \
00217 paddsw_r2r(xmm7, xmm0); \
00218 psraw_i2r(4, xmm7); \
00219 \
00220 movdqu_r2m(xmm6, *O(6)); \
00221 psraw_i2r(4, xmm0); \
00222 \
00223 movdqu_r2m(xmm5, *O(5)); \
00224 movdqu_r2m(xmm7, *O(7)); \
00225 \
00226 movdqu_r2m(xmm0, *O(0)); \
00227 \
00228 }
00229
00230
/*
 * SSE2_Row_IDCT(): first 1-D pass of ff_vp3_idct_sse2(), expanded before
 * SSE2_Transpose() and SSE2_Column_IDCT().
 *
 * Opens with the same instruction sequence as SSE2_Column_IDCT(), but the
 * closing stage has no *Eight bias and no shift, and the eight result rows
 * are written back in place to I(0)..I(7) instead of O().  I(1) and I(2) are
 * also used as scratch storage before receiving their final values.
 *
 * The expanding scope must provide I() and C().  All of xmm0..xmm7 are
 * overwritten; statement order is significant.
 *
 * NOTE(review): the *_m2r / *_r2m / *_r2r helpers come from mmx.h and
 * presumably emit the like-named SSE2 instructions one-to-one - confirm.
 */
00231 #define SSE2_Row_IDCT() { \
00232 \
00233 movdqu_m2r(*I(3), xmm2); \
00234 movdqu_m2r(*C(3), xmm6); \
00235 \
00236 movdqu_r2r(xmm2, xmm4); \
00237 movdqu_m2r(*I(5), xmm7); \
00238 \
00239 pmulhw_r2r(xmm6, xmm4); \
00240 movdqu_m2r(*C(5), xmm1); \
00241 \
00242 pmulhw_r2r(xmm7, xmm6); \
00243 movdqu_r2r(xmm1, xmm5); \
00244 \
00245 pmulhw_r2r(xmm2, xmm1); \
00246 movdqu_m2r(*I(1), xmm3); \
00247 \
00248 pmulhw_r2r(xmm7, xmm5); \
00249 movdqu_m2r(*C(1), xmm0); \
00250 \
00251 \
00252 \
00253 paddw_r2r(xmm2, xmm4); \
00254 paddw_r2r(xmm7, xmm6); \
00255 \
00256 paddw_r2r(xmm1, xmm2); \
00257 movdqu_m2r(*I(7), xmm1); \
00258 \
00259 paddw_r2r(xmm5, xmm7); \
00260 movdqu_r2r(xmm0, xmm5); \
00261 \
00262 pmulhw_r2r(xmm3, xmm0); \
00263 paddsw_r2r(xmm7, xmm4); \
00264 \
00265 pmulhw_r2r(xmm1, xmm5); \
00266 movdqu_m2r(*C(7), xmm7); \
00267 \
00268 psubsw_r2r(xmm2, xmm6); \
00269 paddw_r2r(xmm3, xmm0); \
00270 \
00271 pmulhw_r2r(xmm7, xmm3); \
00272 movdqu_m2r(*I(2), xmm2); \
00273 \
00274 pmulhw_r2r(xmm1, xmm7); \
00275 paddw_r2r(xmm1, xmm5); \
00276 \
00277 movdqu_r2r(xmm2, xmm1); \
00278 pmulhw_m2r(*C(2), xmm2); \
00279 \
00280 psubsw_r2r(xmm5, xmm3); \
00281 movdqu_m2r(*I(6), xmm5); \
00282 \
00283 paddsw_r2r(xmm7, xmm0); \
00284 movdqu_r2r(xmm5, xmm7); \
00285 \
00286 psubsw_r2r(xmm4, xmm0); \
00287 pmulhw_m2r(*C(2), xmm5); \
00288 \
00289 paddw_r2r(xmm1, xmm2); \
00290 pmulhw_m2r(*C(6), xmm1); \
00291 \
00292 paddsw_r2r(xmm4, xmm4); \
00293 paddsw_r2r(xmm0, xmm4); \
00294 \
00295 psubsw_r2r(xmm6, xmm3); \
00296 paddw_r2r(xmm7, xmm5); \
00297 \
00298 paddsw_r2r(xmm6, xmm6); \
00299 pmulhw_m2r(*C(6), xmm7); \
00300 \
00301 paddsw_r2r(xmm3, xmm6); \
/* Park xmm4 in I(1) (already consumed); it is reloaded into xmm0 below. */ \
00302 movdqu_r2m(xmm4, *I(1)); \
00303 \
00304 psubsw_r2r(xmm5, xmm1); \
00305 movdqu_m2r(*C(4), xmm4); \
00306 \
00307 movdqu_r2r(xmm3, xmm5); \
00308 pmulhw_r2r(xmm4, xmm3); \
00309 \
00310 paddsw_r2r(xmm2, xmm7); \
/* Park xmm6 in I(2) (already consumed); it is reloaded into xmm3 below. */ \
00311 movdqu_r2m(xmm6, *I(2)); \
00312 \
00313 movdqu_r2r(xmm0, xmm2); \
00314 movdqu_m2r(*I(0), xmm6); \
00315 \
00316 pmulhw_r2r(xmm4, xmm0); \
00317 paddw_r2r(xmm3, xmm5); \
00318 \
00319 movdqu_m2r(*I(4), xmm3); \
00320 psubsw_r2r(xmm1, xmm5); \
00321 \
00322 paddw_r2r(xmm0, xmm2); \
00323 psubsw_r2r(xmm3, xmm6); \
00324 \
00325 movdqu_r2r(xmm6, xmm0); \
00326 pmulhw_r2r(xmm4, xmm6); \
00327 \
00328 paddsw_r2r(xmm3, xmm3); \
00329 paddsw_r2r(xmm1, xmm1); \
00330 \
00331 paddsw_r2r(xmm0, xmm3); \
00332 paddsw_r2r(xmm5, xmm1); \
00333 \
00334 pmulhw_r2r(xmm3, xmm4); \
00335 paddw_r2r(xmm0, xmm6); \
00336 \
00337 psubsw_r2r(xmm2, xmm6); \
00338 paddsw_r2r(xmm2, xmm2); \
00339 \
00340 movdqu_m2r(*I(1), xmm0); \
00341 paddsw_r2r(xmm6, xmm2); \
00342 \
00343 paddw_r2r(xmm3, xmm4); \
00344 psubsw_r2r(xmm1, xmm2); \
00345 \
/* Output stage: unlike the column pass there is no bias and no shift, */ \
/* and the results overwrite the input rows I(0)..I(7).                 */ \
00346 paddsw_r2r(xmm1, xmm1); \
00347 paddsw_r2r(xmm2, xmm1); \
00348 \
00349 psubsw_r2r(xmm7, xmm4); \
00350 \
00351 movdqu_m2r(*I(2), xmm3); \
00352 paddsw_r2r(xmm7, xmm7); \
00353 \
00354 movdqu_r2m(xmm2, *I(2)); \
00355 paddsw_r2r(xmm4, xmm7); \
00356 \
00357 movdqu_r2m(xmm1, *I(1)); \
00358 psubsw_r2r(xmm3, xmm4); \
00359 \
00360 paddsw_r2r(xmm3, xmm3); \
00361 \
00362 paddsw_r2r(xmm4, xmm3); \
00363 \
00364 psubsw_r2r(xmm5, xmm6); \
00365 \
00366 paddsw_r2r(xmm5, xmm5); \
00367 \
00368 paddsw_r2r(xmm6, xmm5); \
00369 \
00370 movdqu_r2m(xmm4, *I(4)); \
00371 \
00372 movdqu_r2m(xmm3, *I(3)); \
00373 psubsw_r2r(xmm0, xmm7); \
00374 \
00375 paddsw_r2r(xmm0, xmm0); \
00376 \
00377 paddsw_r2r(xmm7, xmm0); \
00378 \
00379 movdqu_r2m(xmm6, *I(6)); \
00380 \
00381 movdqu_r2m(xmm5, *I(5)); \
00382 movdqu_r2m(xmm7, *I(7)); \
00383 \
00384 movdqu_r2m(xmm0, *I(0)); \
00385 \
00386 }
00387
00388
/*
 * SSE2_Transpose(): rearranges the eight 16-byte rows I(0)..I(7) in place.
 *
 * Rows are interleaved pairwise at word granularity (punpck[lh]wd), the
 * results at dword granularity (punpck[lh]dq) and finally at qword
 * granularity (punpck[lh]qdq) before being stored back to I(0)..I(7).
 * Presumably this amounts to an 8x8 transpose of 16-bit elements, as the
 * name says - confirm against the unpack semantics.
 *
 * ff_vp3_idct_sse2() expands it between the row and the column pass.  The
 * expanding scope must provide I().  All of xmm0..xmm7 are overwritten, and
 * I(6) doubles as a one-register spill slot part-way through.
 */
00389 #define SSE2_Transpose() { \
00390 \
00391 movdqu_m2r(*I(4), xmm4); \
00392 movdqu_m2r(*I(5), xmm0); \
00393 \
00394 movdqu_r2r(xmm4, xmm5); \
00395 punpcklwd_r2r(xmm0, xmm4); \
00396 \
00397 punpckhwd_r2r(xmm0, xmm5); \
00398 movdqu_m2r(*I(6), xmm6); \
00399 \
00400 movdqu_m2r(*I(7), xmm0); \
00401 movdqu_r2r(xmm6, xmm7); \
00402 \
00403 punpcklwd_r2r(xmm0, xmm6); \
00404 punpckhwd_r2r(xmm0, xmm7); \
00405 \
00406 movdqu_r2r(xmm4, xmm3); \
00407 punpckldq_r2r(xmm6, xmm4); \
00408 \
00409 punpckhdq_r2r(xmm6, xmm3); \
/* Spill xmm3 into I(6), whose original row was loaded above; it is */ \
/* reloaded into xmm0 once rows 0 and 1 have been stored.           */ \
00410 movdqu_r2m(xmm3, *I(6)); \
00411 \
00412 movdqu_r2r(xmm5, xmm6); \
00413 punpckldq_r2r(xmm7, xmm5); \
00414 \
00415 punpckhdq_r2r(xmm7, xmm6); \
00416 movdqu_m2r(*I(0), xmm0); \
00417 \
00418 movdqu_m2r(*I(1), xmm1); \
00419 movdqu_r2r(xmm0, xmm7); \
00420 \
00421 punpcklwd_r2r(xmm1, xmm0); \
00422 punpckhwd_r2r(xmm1, xmm7); \
00423 \
00424 movdqu_m2r(*I(2), xmm2); \
00425 movdqu_m2r(*I(3), xmm3); \
00426 \
00427 movdqu_r2r(xmm2, xmm1); \
00428 punpcklwd_r2r(xmm3, xmm2); \
00429 \
00430 punpckhwd_r2r(xmm3, xmm1); \
00431 movdqu_r2r(xmm0, xmm3); \
00432 \
00433 punpckldq_r2r(xmm2, xmm0); \
00434 punpckhdq_r2r(xmm2, xmm3); \
00435 \
00436 movdqu_r2r(xmm7, xmm2); \
00437 punpckldq_r2r(xmm1, xmm2); \
00438 \
00439 punpckhdq_r2r(xmm1, xmm7); \
00440 movdqu_r2r(xmm0, xmm1); \
00441 \
00442 punpcklqdq_r2r(xmm4, xmm0); \
00443 punpckhqdq_r2r(xmm4, xmm1); \
00444 \
00445 movdqu_r2m(xmm0, *I(0)); \
00446 movdqu_r2m(xmm1, *I(1)); \
00447 \
00448 movdqu_m2r(*I(6), xmm0); \
00449 movdqu_r2r(xmm3, xmm1); \
00450 \
00451 punpcklqdq_r2r(xmm0, xmm1); \
00452 punpckhqdq_r2r(xmm0, xmm3); \
00453 \
00454 movdqu_r2r(xmm2, xmm4); \
00455 punpcklqdq_r2r(xmm5, xmm4); \
00456 \
00457 punpckhqdq_r2r(xmm5, xmm2); \
00458 movdqu_r2m(xmm1, *I(2)); \
00459 \
00460 movdqu_r2m(xmm3, *I(3)); \
00461 movdqu_r2m(xmm4, *I(4)); \
00462 \
00463 movdqu_r2m(xmm2, *I(5)); \
00464 movdqu_r2r(xmm7, xmm5); \
00465 \
00466 punpcklqdq_r2r(xmm6, xmm5); \
00467 punpckhqdq_r2r(xmm6, xmm7); \
00468 \
00469 movdqu_r2m(xmm5, *I(6)); \
00470 movdqu_r2m(xmm7, *I(7)); \
00471 \
00472 }
00473
00474
/*
 * SSE2_Dequantize(): multiplies the eight 16-byte rows at `eax` lane-wise
 * (pmullw) by the matching rows at `ebx`, then redistributes the products
 * across rows and writes eight rows back over eax + 0 .. eax + 112.
 *
 * The redistribution isolates lanes with the mask rows read from `ecx`
 * (pand to extract, pxor to remove from the source), repositions them with
 * pshuflw/pshufhw/pshufd and whole-register byte shifts, and ORs them into
 * the destination row.  Each input row is loaded before the output row at
 * the same offset is stored, so the operation works in place.
 *
 * The expanding scope must define eax, ebx and ecx as byte pointers.  All
 * of xmm0..xmm7 are overwritten; statement order is significant.
 *
 * Not expanded anywhere in the visible part of this file.
 * NOTE(review): the reordering is presumably the inverse zig-zag scan of
 * the coefficients - confirm before relying on it.
 * NOTE(review): ff_vp3_idct_sse2() maps `ecx` to dequant_const_bytes, which
 * is not declared in the visible source; presumably it should point at
 * SSE2_dequant_const - confirm.
 */
00475 #define SSE2_Dequantize() { \
00476 movdqu_m2r(*(eax), xmm0); \
00477 \
00478 pmullw_m2r(*(ebx), xmm0); \
00479 movdqu_m2r(*(eax + 16), xmm1); \
00480 \
00481 pmullw_m2r(*(ebx + 16), xmm1); \
00482 pshuflw_r2r(xmm0, xmm3, 0x078); \
00483 \
00484 movdqu_r2r(xmm1, xmm2); \
00485 movdqu_m2r(*(ecx), xmm7); \
00486 \
00487 movdqu_m2r(*(eax + 32), xmm4); \
00488 movdqu_m2r(*(eax + 64), xmm5); \
00489 \
00490 pmullw_m2r(*(ebx + 32), xmm4); \
00491 pmullw_m2r(*(ebx + 64), xmm5); \
00492 \
00493 movdqu_m2r(*(ecx + 16), xmm6); \
00494 pand_r2r(xmm2, xmm7); \
00495 \
00496 pand_r2r(xmm4, xmm6); \
00497 pxor_r2r(xmm7, xmm2); \
00498 \
00499 pxor_r2r(xmm6, xmm4); \
00500 pslldq_i2r(4, xmm7); \
00501 \
00502 pslldq_i2r(2, xmm6); \
00503 por_r2r(xmm6, xmm7); \
00504 \
00505 movdqu_m2r(*(ecx + 32), xmm0); \
00506 movdqu_m2r(*(ecx + 48), xmm6); \
00507 \
00508 pand_r2r(xmm3, xmm0); \
00509 pand_r2r(xmm5, xmm6); \
00510 \
00511 pxor_r2r(xmm0, xmm3); \
00512 pxor_r2r(xmm6, xmm5); \
00513 \
00514 por_r2r(xmm7, xmm0); \
00515 pslldq_i2r(8, xmm6); \
00516 \
00517 por_r2r(xmm6, xmm0); \
00518 \
00519 \
00520 movdqu_m2r(*(ecx + 64 ), xmm1); \
00521 pshuflw_r2r(xmm5, xmm5, 0x0B4); \
00522 \
00523 movdqu_r2r(xmm1, xmm7); \
00524 movdqu_r2r(xmm1, xmm6); \
00525 \
/* Output row at eax + 0 is complete. */ \
00526 movdqu_r2m(xmm0, *(eax)); \
00527 pshufhw_r2r(xmm4, xmm4, 0x0C2); \
00528 \
00529 pand_r2r(xmm4, xmm7); \
00530 pand_r2r(xmm5, xmm1); \
00531 \
00532 pxor_r2r(xmm7, xmm4); \
00533 pxor_r2r(xmm1, xmm5); \
00534 \
00535 pshuflw_r2r(xmm2, xmm2, 0x0C6); \
00536 movdqu_r2r(xmm6, xmm0); \
00537 \
00538 pslldq_i2r(2, xmm7); \
00539 pslldq_i2r(6, xmm1); \
00540 \
00541 psrldq_i2r(2, xmm0); \
00542 pand_r2r(xmm3, xmm6); \
00543 \
00544 pand_r2r(xmm2, xmm0); \
00545 pxor_r2r(xmm6, xmm3); \
00546 \
00547 pxor_r2r(xmm0, xmm2); \
00548 psrldq_i2r(6, xmm6); \
00549 \
00550 por_r2r(xmm7, xmm1); \
00551 por_r2r(xmm6, xmm0); \
00552 \
00553 por_r2r(xmm0, xmm1); \
00554 pshuflw_r2r(xmm4, xmm4, 0x093); \
00555 \
00556 pshufhw_r2r(xmm4, xmm4, 0x093); \
/* Output row at eax + 16 is complete. */ \
00557 movdqu_r2m(xmm1, *(eax + 16)); \
00558 \
00559 pshufhw_r2r(xmm3, xmm3, 0x0D2); \
00560 movdqu_m2r(*(ecx + 64), xmm0); \
00561 \
00562 pand_r2r(xmm3, xmm0); \
00563 psrldq_i2r(12, xmm3); \
00564 \
00565 psrldq_i2r(8, xmm0); \
00566 \
00567 movdqu_m2r(*(ecx + 64), xmm6); \
00568 movdqu_m2r(*(ecx + 96), xmm7); \
00569 \
00570 pand_r2r(xmm4, xmm6); \
00571 pxor_r2r(xmm6, xmm4); \
00572 \
00573 por_r2r(xmm6, xmm3); \
00574 pand_r2r(xmm4, xmm7); \
00575 \
00576 por_r2r(xmm7, xmm0); \
00577 pxor_r2r(xmm7, xmm4); \
00578 \
00579 movdqu_m2r(*(ecx + 16 ), xmm6); \
00580 movdqu_m2r(*(ecx + 64 ), xmm1); \
00581 \
00582 pand_r2r(xmm2, xmm6); \
00583 pand_r2r(xmm6, xmm1); \
00584 \
00585 pxor_r2r(xmm6, xmm2); \
00586 pxor_r2r(xmm1, xmm6); \
00587 \
00588 psrldq_i2r(4, xmm1); \
00589 \
00590 psrldq_i2r(8, xmm6); \
00591 por_r2r(xmm1, xmm3); \
00592 \
00593 por_r2r(xmm6, xmm0); \
00594 pshufhw_r2r(xmm5, xmm5, 0x0E1); \
00595 \
00596 movdqu_m2r(*(ecx + 64), xmm1); \
00597 pshuflw_r2r(xmm5, xmm5, 0x072); \
00598 \
00599 movdqu_r2r(xmm1, xmm6); \
00600 pand_r2r(xmm5, xmm1); \
00601 \
00602 pxor_r2r(xmm1, xmm5); \
00603 pslldq_i2r(4, xmm1); \
00604 \
00605 pshufd_r2r(xmm5, xmm5, 0x09C); \
00606 por_r2r(xmm1, xmm3); \
00607 \
00608 movdqu_m2r(*(eax + 96), xmm1); \
00609 pmullw_m2r(*(ebx + 96), xmm1); \
00610 \
00611 movdqu_m2r(*(ecx), xmm7); \
00612 \
00613 psrldq_i2r(8, xmm6); \
00614 pand_r2r(xmm5, xmm7); \
00615 \
00616 pand_r2r(xmm1, xmm6); \
00617 pxor_r2r(xmm7, xmm5); \
00618 \
00619 pxor_r2r(xmm6, xmm1); \
00620 pslldq_i2r(2, xmm5); \
00621 \
00622 pslldq_i2r(14, xmm6); \
00623 por_r2r(xmm5, xmm4); \
00624 \
00625 por_r2r(xmm6, xmm3); \
00626 pslldq_i2r(6, xmm7); \
00627 \
/* Output row at eax + 32 is complete. */ \
00628 movdqu_r2m(xmm3, *(eax+32)); \
00629 por_r2r(xmm7, xmm0); \
00630 \
00631 movdqu_m2r(*(eax + 48), xmm3); \
00632 movdqu_m2r(*(eax + 80), xmm5); \
00633 \
00634 pmullw_m2r(*(ebx + 48), xmm3); \
00635 pmullw_m2r(*(ebx + 80), xmm5); \
00636 \
00637 movdqu_m2r(*(ecx + 64), xmm6); \
00638 movdqu_m2r(*(ecx + 64), xmm7); \
00639 \
00640 psrldq_i2r(8, xmm6); \
00641 pslldq_i2r(8, xmm7); \
00642 \
00643 pand_r2r(xmm3, xmm6); \
00644 pand_r2r(xmm5, xmm7); \
00645 \
00646 pxor_r2r(xmm6, xmm3); \
00647 pxor_r2r(xmm7, xmm5); \
00648 \
00649 pslldq_i2r(6, xmm6); \
00650 psrldq_i2r(2, xmm7); \
00651 \
00652 por_r2r(xmm7, xmm6); \
00653 movdqu_m2r(*(ecx), xmm7); \
00654 \
00655 por_r2r(xmm6, xmm0); \
00656 psrldq_i2r(2, xmm7); \
00657 \
00658 movdqu_r2r(xmm2, xmm6); \
00659 pand_r2r(xmm1, xmm7); \
00660 \
00661 pslldq_i2r(2, xmm6); \
00662 psrldq_i2r(14, xmm2); \
00663 \
00664 pxor_r2r(xmm7, xmm1); \
00665 pslldq_i2r(12, xmm7); \
00666 \
00667 psrldq_i2r(14, xmm6); \
00668 por_r2r(xmm6, xmm4); \
00669 \
00670 por_r2r(xmm7, xmm0); \
00671 movdqu_m2r(*(ecx), xmm6); \
00672 \
00673 psrldq_i2r(2, xmm6); \
/* Output row at eax + 48 is complete. */ \
00674 movdqu_r2m(xmm0, *(eax+48)); \
00675 \
00676 movdqu_m2r(*(ecx), xmm0); \
00677 pand_r2r(xmm3, xmm6); \
00678 \
00679 movdqu_r2r(xmm3, xmm7); \
00680 pxor_r2r(xmm6, xmm3); \
00681 \
00682 pslldq_i2r(2, xmm3); \
00683 pand_r2r(xmm1, xmm0); \
00684 \
00685 psrldq_i2r(14, xmm7); \
00686 pxor_r2r(xmm0, xmm1); \
00687 \
00688 por_r2r(xmm7, xmm6); \
00689 movdqu_m2r(*(ecx + 64), xmm7); \
00690 \
00691 pshuflw_r2r(xmm6, xmm6, 0x01E); \
00692 pslldq_i2r(6, xmm7); \
00693 \
00694 por_r2r(xmm6, xmm4); \
00695 pand_r2r(xmm5, xmm7); \
00696 \
00697 pslldq_i2r(8, xmm0); \
00698 pxor_r2r(xmm7, xmm5); \
00699 \
00700 psrldq_i2r(2, xmm7); \
00701 \
00702 pshufhw_r2r(xmm3, xmm3, 0x087); \
00703 por_r2r(xmm7, xmm0); \
00704 \
00705 movdqu_m2r(*(eax + 112), xmm7); \
00706 pmullw_m2r(*(ebx + 112), xmm7); \
00707 \
00708 movdqu_m2r(*(ecx + 64), xmm6); \
00709 por_r2r(xmm0, xmm4); \
00710 \
00711 pshuflw_r2r(xmm7, xmm7, 0x0E1); \
00712 psrldq_i2r(8, xmm6); \
00713 \
00714 movdqu_m2r(*(ecx + 64), xmm0); \
00715 pand_r2r(xmm7, xmm6); \
00716 \
00717 pand_r2r(xmm3, xmm0); \
00718 pxor_r2r(xmm6, xmm7); \
00719 \
00720 pxor_r2r(xmm0, xmm3); \
00721 pslldq_i2r(14, xmm6); \
00722 \
00723 psrldq_i2r(4, xmm0); \
00724 por_r2r(xmm6, xmm4); \
00725 \
00726 por_r2r(xmm0, xmm2); \
/* Output row at eax + 64 is complete. */ \
00727 movdqu_r2m(xmm4, *(eax + 64)); \
00728 \
00729 movdqu_m2r(*(ecx + 80), xmm6); \
00730 pshufhw_r2r(xmm7, xmm7, 0x0D2); \
00731 \
00732 movdqu_m2r(*(ecx), xmm4); \
00733 movdqu_m2r(*(ecx+48), xmm0); \
00734 \
00735 pand_r2r(xmm5, xmm6); \
00736 pand_r2r(xmm7, xmm4); \
00737 \
00738 pand_r2r(xmm1, xmm0); \
00739 pxor_r2r(xmm6, xmm5); \
00740 \
00741 pxor_r2r(xmm4, xmm7); \
00742 pxor_r2r(xmm0, xmm1); \
00743 \
00744 pshuflw_r2r(xmm6, xmm6, 0x02B); \
00745 pslldq_i2r(10, xmm4); \
00746 \
00747 pshufhw_r2r(xmm6, xmm6, 0x0B1); \
00748 pslldq_i2r(4, xmm0); \
00749 \
00750 por_r2r(xmm4, xmm6); \
00751 por_r2r(xmm0, xmm2); \
00752 \
00753 por_r2r(xmm6, xmm2); \
00754 pshufhw_r2r(xmm1, xmm1, 0x0C9); \
00755 \
00756 movdqu_r2r(xmm3, xmm6); \
/* Output row at eax + 80 is complete. */ \
00757 movdqu_r2m(xmm2, *(eax+80)); \
00758 \
00759 psrldq_i2r(12, xmm6); \
00760 pslldq_i2r(4, xmm3); \
00761 \
00762 pshuflw_r2r(xmm5, xmm5, 0x04E); \
00763 movdqu_r2r(xmm7, xmm4); \
00764 \
00765 movdqu_r2r(xmm5, xmm2); \
00766 psrldq_i2r(10, xmm7); \
00767 \
00768 pslldq_i2r(6, xmm4); \
00769 pslldq_i2r(12, xmm2); \
00770 \
00771 movdqu_r2r(xmm1, xmm0); \
00772 psrldq_i2r(12, xmm1); \
00773 \
00774 psrldq_i2r(6, xmm5); \
00775 psrldq_i2r(14, xmm3); \
00776 \
00777 pslldq_i2r(10, xmm7); \
00778 por_r2r(xmm6, xmm4); \
00779 \
00780 psrldq_i2r(10, xmm2); \
00781 pslldq_i2r(4, xmm0); \
00782 \
00783 pslldq_i2r(8, xmm1); \
00784 por_r2r(xmm7, xmm3); \
00785 \
00786 psrldq_i2r(6, xmm0); \
00787 pslldq_i2r(4, xmm5); \
00788 \
00789 por_r2r(xmm1, xmm4); \
00790 por_r2r(xmm2, xmm3); \
00791 \
00792 por_r2r(xmm5, xmm4); \
00793 por_r2r(xmm0, xmm3); \
00794 \
/* Output rows at eax + 96 and eax + 112 are complete. */ \
00795 movdqu_r2m(xmm4, *(eax+96)); \
00796 movdqu_r2m(xmm3, *(eax+112)); \
00797 \
00798 }
00799
00800
/**
 * In-place inverse DCT of one coefficient block.
 *
 * Expands SSE2_Row_IDCT(), SSE2_Transpose() and SSE2_Column_IDCT() in that
 * order.  Source and destination are the same buffer: the macros address it
 * as eight consecutive 16-byte rows, I(0)..I(7) for reads and O(0)..O(7)
 * for writes.
 *
 * @param input_data  block to transform; overwritten with the result.  The
 *                    macros touch bytes 0..127 of it.
 *
 * NOTE(review): the helper macros defined below are never #undef'd, so they
 * remain visible to everything that follows in the translation unit.
 * NOTE(review): I(), O() and C() do not parenthesise their argument; pass
 * only simple expressions.
 */
00801 void ff_vp3_idct_sse2(int16_t *input_data)
00802 {
/* Byte-pointer views; the macros do their address arithmetic in bytes. */
00803 unsigned char *input_bytes = (unsigned char *)input_data;
00804 unsigned char *output_data_bytes = (unsigned char *)input_data;
00805 unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
/* Referenced by name from SSE2_Column_IDCT(). */
00806 unsigned char *Eight = (unsigned char *)eight_data;
00807
/* Pointer aliases named after x86 registers, as used by the macros above. */
00808 #define eax input_bytes
00809
/* NOTE(review): dequant_const_bytes is not declared in this function; this */
/* is harmless only while no macro that uses ecx (SSE2_Dequantize) is       */
/* expanded here.                                                           */
00810 #define ecx dequant_const_bytes
00811 #define edx idct_data_bytes
00812
/* Row i of the input, row i of the output, and constant row i (1-based). */
00813 #define I(i) (eax + 16 * i)
00814 #define O(i) (ebx + 16 * i)
00815 #define C(i) (edx + 16 * (i-1))
00816
00817
00818
00819 #undef ebx
00820 #define ebx output_data_bytes
00821
00822 SSE2_Row_IDCT();
00823
00824 SSE2_Transpose();
00825
00826 SSE2_Column_IDCT();
00827 }
00828
/**
 * Inverse-transform a block in place, then hand it to
 * put_signed_pixels_clamped_mmx() together with the destination.
 *
 * @param dest       destination pointer, forwarded unchanged
 * @param line_size  forwarded unchanged; presumably the byte stride between
 *                   destination rows - confirm against the callee
 * @param block      coefficient block; overwritten by ff_vp3_idct_sse2()
 *
 * NOTE(review): put_signed_pixels_clamped_mmx() is defined elsewhere; going
 * by its name it stores (rather than adds) clamped pixel values - confirm.
 */
00829 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
00830 {
00831 ff_vp3_idct_sse2(block);
00832 put_signed_pixels_clamped_mmx(block, dest, line_size);
00833 }
00834
/**
 * Inverse-transform a block in place, then hand it to
 * add_pixels_clamped_mmx() together with the destination.
 *
 * @param dest       destination pointer, forwarded unchanged
 * @param line_size  forwarded unchanged; presumably the byte stride between
 *                   destination rows - confirm against the callee
 * @param block      coefficient block; overwritten by ff_vp3_idct_sse2()
 *
 * NOTE(review): add_pixels_clamped_mmx() is defined elsewhere; going by its
 * name it adds the block to the existing destination pixels with clamping -
 * confirm.
 */
00835 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
00836 {
00837 ff_vp3_idct_sse2(block);
00838 add_pixels_clamped_mmx(block, dest, line_size);
00839 }