00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "dsputil.h"
00026 #include "dsputil_mmx.h"
00027 #include "mpegvideo.h"
00028 #include "avcodec.h"
00029 #include "x86_cpu.h"
00030
/* 64-entry table defined in another translation unit. Nothing in the code
 * visible in this file references it directly; presumably it is consumed by
 * the included mpegvideo_mmx_template.c -- TODO confirm. */
00031 extern uint16_t inv_zigzag_direct16[64];
00032
00033
00034 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00035 DCTELEM *block, int n, int qscale)
00036 {
00037 long level, qmul, qadd, nCoeffs;
00038
00039 qmul = qscale << 1;
00040
00041 assert(s->block_last_index[n]>=0 || s->h263_aic);
00042
00043 if (!s->h263_aic) {
00044 if (n < 4)
00045 level = block[0] * s->y_dc_scale;
00046 else
00047 level = block[0] * s->c_dc_scale;
00048 qadd = (qscale - 1) | 1;
00049 }else{
00050 qadd = 0;
00051 level= block[0];
00052 }
00053 if(s->ac_pred)
00054 nCoeffs=63;
00055 else
00056 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00057
00058 asm volatile(
00059 "movd %1, %%mm6 \n\t"
00060 "packssdw %%mm6, %%mm6 \n\t"
00061 "packssdw %%mm6, %%mm6 \n\t"
00062 "movd %2, %%mm5 \n\t"
00063 "pxor %%mm7, %%mm7 \n\t"
00064 "packssdw %%mm5, %%mm5 \n\t"
00065 "packssdw %%mm5, %%mm5 \n\t"
00066 "psubw %%mm5, %%mm7 \n\t"
00067 "pxor %%mm4, %%mm4 \n\t"
00068 ASMALIGN(4)
00069 "1: \n\t"
00070 "movq (%0, %3), %%mm0 \n\t"
00071 "movq 8(%0, %3), %%mm1 \n\t"
00072
00073 "pmullw %%mm6, %%mm0 \n\t"
00074 "pmullw %%mm6, %%mm1 \n\t"
00075
00076 "movq (%0, %3), %%mm2 \n\t"
00077 "movq 8(%0, %3), %%mm3 \n\t"
00078
00079 "pcmpgtw %%mm4, %%mm2 \n\t"
00080 "pcmpgtw %%mm4, %%mm3 \n\t"
00081
00082 "pxor %%mm2, %%mm0 \n\t"
00083 "pxor %%mm3, %%mm1 \n\t"
00084
00085 "paddw %%mm7, %%mm0 \n\t"
00086 "paddw %%mm7, %%mm1 \n\t"
00087
00088 "pxor %%mm0, %%mm2 \n\t"
00089 "pxor %%mm1, %%mm3 \n\t"
00090
00091 "pcmpeqw %%mm7, %%mm0 \n\t"
00092 "pcmpeqw %%mm7, %%mm1 \n\t"
00093
00094 "pandn %%mm2, %%mm0 \n\t"
00095 "pandn %%mm3, %%mm1 \n\t"
00096
00097 "movq %%mm0, (%0, %3) \n\t"
00098 "movq %%mm1, 8(%0, %3) \n\t"
00099
00100 "add $16, %3 \n\t"
00101 "jng 1b \n\t"
00102 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00103 : "memory"
00104 );
00105 block[0]= level;
00106 }
00107
00108
00109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00110 DCTELEM *block, int n, int qscale)
00111 {
00112 long qmul, qadd, nCoeffs;
00113
00114 qmul = qscale << 1;
00115 qadd = (qscale - 1) | 1;
00116
00117 assert(s->block_last_index[n]>=0 || s->h263_aic);
00118
00119 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00120
00121 asm volatile(
00122 "movd %1, %%mm6 \n\t"
00123 "packssdw %%mm6, %%mm6 \n\t"
00124 "packssdw %%mm6, %%mm6 \n\t"
00125 "movd %2, %%mm5 \n\t"
00126 "pxor %%mm7, %%mm7 \n\t"
00127 "packssdw %%mm5, %%mm5 \n\t"
00128 "packssdw %%mm5, %%mm5 \n\t"
00129 "psubw %%mm5, %%mm7 \n\t"
00130 "pxor %%mm4, %%mm4 \n\t"
00131 ASMALIGN(4)
00132 "1: \n\t"
00133 "movq (%0, %3), %%mm0 \n\t"
00134 "movq 8(%0, %3), %%mm1 \n\t"
00135
00136 "pmullw %%mm6, %%mm0 \n\t"
00137 "pmullw %%mm6, %%mm1 \n\t"
00138
00139 "movq (%0, %3), %%mm2 \n\t"
00140 "movq 8(%0, %3), %%mm3 \n\t"
00141
00142 "pcmpgtw %%mm4, %%mm2 \n\t"
00143 "pcmpgtw %%mm4, %%mm3 \n\t"
00144
00145 "pxor %%mm2, %%mm0 \n\t"
00146 "pxor %%mm3, %%mm1 \n\t"
00147
00148 "paddw %%mm7, %%mm0 \n\t"
00149 "paddw %%mm7, %%mm1 \n\t"
00150
00151 "pxor %%mm0, %%mm2 \n\t"
00152 "pxor %%mm1, %%mm3 \n\t"
00153
00154 "pcmpeqw %%mm7, %%mm0 \n\t"
00155 "pcmpeqw %%mm7, %%mm1 \n\t"
00156
00157 "pandn %%mm2, %%mm0 \n\t"
00158 "pandn %%mm3, %%mm1 \n\t"
00159
00160 "movq %%mm0, (%0, %3) \n\t"
00161 "movq %%mm1, 8(%0, %3) \n\t"
00162
00163 "add $16, %3 \n\t"
00164 "jng 1b \n\t"
00165 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00166 : "memory"
00167 );
00168 }
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
/*
 * Dequantize an MPEG-1 style intra block in place using MMX.
 *
 * The DC value is computed in C: block[0] * s->y_dc_scale when n < 4,
 * block[0] * s->c_dc_scale otherwise, and is written back after the loop.
 * Every other processed coefficient c at raster index i becomes
 *     sign(c) * ((((|c| * (qscale * intra_matrix[i])) >> 3) - 1) | 1)
 * and a zero coefficient stays zero. All arithmetic is on 16-bit lanes
 * (pmullw keeps only the low 16 bits of each product).
 *
 * The loop handles 8 coefficients per iteration starting at block[0] and
 * repeats while the byte offset is still negative after the increment, so it
 * covers nCoeffs rounded up to a multiple of 8.
 */
00200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00201 DCTELEM *block, int n, int qscale)
00202 {
00203 long nCoeffs;
00204 const uint16_t *quant_matrix;
00205 int block0;
00206
00207 assert(s->block_last_index[n]>=0);
00208
/* Number of coefficients to process: raster position of the last one, plus 1. */
00209 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00210
00211 if (n < 4)
00212 block0 = block[0] * s->y_dc_scale;
00213 else
00214 block0 = block[0] * s->c_dc_scale;
00215
00216 quant_matrix = s->intra_matrix;
/* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
 * %3 = -2*nCoeffs (start byte offset, copied into REG_a which is clobbered). */
00217 asm volatile(
00218 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all ones
00219 "psrlw $15, %%mm7 \n\t" // mm7 = 1 in every word
00220 "movd %2, %%mm6 \n\t" // qscale
00221 "packssdw %%mm6, %%mm6 \n\t"
00222 "packssdw %%mm6, %%mm6 \n\t" // qscale in all four words
00223 "mov %3, %%"REG_a" \n\t" // running byte offset
00224 ASMALIGN(4)
00225 "1: \n\t"
00226 "movq (%0, %%"REG_a"), %%mm0 \n\t" // block[i .. i+3]
00227 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // block[i+4 .. i+7]
00228 "movq (%1, %%"REG_a"), %%mm4 \n\t" // quant_matrix[i .. i+3]
00229 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // quant_matrix[i+4 .. i+7]
00230 "pmullw %%mm6, %%mm4 \n\t" // q = qscale * quant_matrix[i]
00231 "pmullw %%mm6, %%mm5 \n\t"
00232 "pxor %%mm2, %%mm2 \n\t"
00233 "pxor %%mm3, %%mm3 \n\t"
00234 "pcmpgtw %%mm0, %%mm2 \n\t" // sign mask: block[i] < 0 ? -1 : 0
00235 "pcmpgtw %%mm1, %%mm3 \n\t"
00236 "pxor %%mm2, %%mm0 \n\t"
00237 "pxor %%mm3, %%mm1 \n\t"
00238 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00239 "psubw %%mm3, %%mm1 \n\t"
00240 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i]) * q
00241 "pmullw %%mm5, %%mm1 \n\t"
00242 "pxor %%mm4, %%mm4 \n\t"
00243 "pxor %%mm5, %%mm5 \n\t"
00244 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // zero mask: block[i] == 0 ? -1 : 0
00245 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00246 "psraw $3, %%mm0 \n\t" // >> 3
00247 "psraw $3, %%mm1 \n\t"
00248 "psubw %%mm7, %%mm0 \n\t" // - 1
00249 "psubw %%mm7, %%mm1 \n\t"
00250 "por %%mm7, %%mm0 \n\t" // | 1 (force the magnitude odd)
00251 "por %%mm7, %%mm1 \n\t"
00252 "pxor %%mm2, %%mm0 \n\t"
00253 "pxor %%mm3, %%mm1 \n\t"
00254 "psubw %%mm2, %%mm0 \n\t" // restore the sign
00255 "psubw %%mm3, %%mm1 \n\t"
00256 "pandn %%mm0, %%mm4 \n\t" // zero input -> zero output
00257 "pandn %%mm1, %%mm5 \n\t"
00258 "movq %%mm4, (%0, %%"REG_a") \n\t"
00259 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00260
00261 "add $16, %%"REG_a" \n\t" // next 8 coefficients
00262 "js 1b \n\t" // loop while the offset is negative
00263 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00264 : "%"REG_a, "memory"
00265 );
/* The loop also rewrote block[0]; replace it with the DC value. */
00266 block[0]= block0;
00267 }
00268
/*
 * Dequantize an MPEG-1 style inter block in place using MMX.
 *
 * Every processed coefficient c at raster index i becomes
 *     sign(c) * (((((2*|c| + 1) * (qscale * inter_matrix[i])) >> 4) - 1) | 1)
 * and a zero coefficient stays zero. All arithmetic is on 16-bit lanes
 * (pmullw keeps only the low 16 bits of each product).
 *
 * The loop handles 8 coefficients per iteration starting at block[0] and
 * repeats while the byte offset is still negative after the increment, so it
 * covers nCoeffs rounded up to a multiple of 8.
 */
00269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00270 DCTELEM *block, int n, int qscale)
00271 {
00272 long nCoeffs;
00273 const uint16_t *quant_matrix;
00274
00275 assert(s->block_last_index[n]>=0);
00276
/* NOTE(review): the coefficient count is taken from intra_scantable although
 * this is the inter path -- confirm this is intended. */
00277 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00278
00279 quant_matrix = s->inter_matrix;
/* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
 * %3 = -2*nCoeffs (start byte offset, copied into REG_a which is clobbered). */
00280 asm volatile(
00281 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all ones
00282 "psrlw $15, %%mm7 \n\t" // mm7 = 1 in every word
00283 "movd %2, %%mm6 \n\t" // qscale
00284 "packssdw %%mm6, %%mm6 \n\t"
00285 "packssdw %%mm6, %%mm6 \n\t" // qscale in all four words
00286 "mov %3, %%"REG_a" \n\t" // running byte offset
00287 ASMALIGN(4)
00288 "1: \n\t"
00289 "movq (%0, %%"REG_a"), %%mm0 \n\t" // block[i .. i+3]
00290 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // block[i+4 .. i+7]
00291 "movq (%1, %%"REG_a"), %%mm4 \n\t" // quant_matrix[i .. i+3]
00292 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // quant_matrix[i+4 .. i+7]
00293 "pmullw %%mm6, %%mm4 \n\t" // q = qscale * quant_matrix[i]
00294 "pmullw %%mm6, %%mm5 \n\t"
00295 "pxor %%mm2, %%mm2 \n\t"
00296 "pxor %%mm3, %%mm3 \n\t"
00297 "pcmpgtw %%mm0, %%mm2 \n\t" // sign mask: block[i] < 0 ? -1 : 0
00298 "pcmpgtw %%mm1, %%mm3 \n\t"
00299 "pxor %%mm2, %%mm0 \n\t"
00300 "pxor %%mm3, %%mm1 \n\t"
00301 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00302 "psubw %%mm3, %%mm1 \n\t"
00303 "paddw %%mm0, %%mm0 \n\t" // abs(block[i]) * 2
00304 "paddw %%mm1, %%mm1 \n\t"
00305 "paddw %%mm7, %%mm0 \n\t" // abs(block[i]) * 2 + 1
00306 "paddw %%mm7, %%mm1 \n\t"
00307 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i]) * 2 + 1) * q
00308 "pmullw %%mm5, %%mm1 \n\t"
00309 "pxor %%mm4, %%mm4 \n\t"
00310 "pxor %%mm5, %%mm5 \n\t"
00311 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // zero mask: block[i] == 0 ? -1 : 0
00312 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00313 "psraw $4, %%mm0 \n\t" // >> 4
00314 "psraw $4, %%mm1 \n\t"
00315 "psubw %%mm7, %%mm0 \n\t" // - 1
00316 "psubw %%mm7, %%mm1 \n\t"
00317 "por %%mm7, %%mm0 \n\t" // | 1 (force the magnitude odd)
00318 "por %%mm7, %%mm1 \n\t"
00319 "pxor %%mm2, %%mm0 \n\t"
00320 "pxor %%mm3, %%mm1 \n\t"
00321 "psubw %%mm2, %%mm0 \n\t" // restore the sign
00322 "psubw %%mm3, %%mm1 \n\t"
00323 "pandn %%mm0, %%mm4 \n\t" // zero input -> zero output
00324 "pandn %%mm1, %%mm5 \n\t"
00325 "movq %%mm4, (%0, %%"REG_a") \n\t"
00326 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00327
00328 "add $16, %%"REG_a" \n\t" // next 8 coefficients
00329 "js 1b \n\t" // loop while the offset is negative
00330 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00331 : "%"REG_a, "memory"
00332 );
00333 }
00334
/*
 * Dequantize an MPEG-2 style intra block in place using MMX.
 *
 * The DC value is computed in C: block[0] * s->y_dc_scale when n < 4,
 * block[0] * s->c_dc_scale otherwise, and is written back after the loop.
 * Every other processed coefficient c at raster index i becomes
 *     sign(c) * ((|c| * (qscale * intra_matrix[i])) >> 3)
 * and a zero coefficient stays zero. All arithmetic is on 16-bit lanes
 * (pmullw keeps only the low 16 bits of each product).
 *
 * nCoeffs is the index of the last coefficient (63 with alternate_scan). The
 * loop handles 8 coefficients per iteration starting at block[0] and repeats
 * while the byte offset is <= 0 after the increment.
 */
00335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00336 DCTELEM *block, int n, int qscale)
00337 {
00338 long nCoeffs;
00339 const uint16_t *quant_matrix;
00340 int block0;
00341
00342 assert(s->block_last_index[n]>=0);
00343
/* With alternate_scan the whole block is processed regardless of block_last_index. */
00344 if(s->alternate_scan) nCoeffs= 63;
00345 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00346
00347 if (n < 4)
00348 block0 = block[0] * s->y_dc_scale;
00349 else
00350 block0 = block[0] * s->c_dc_scale;
00351 quant_matrix = s->intra_matrix;
/* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
 * %3 = -2*nCoeffs (start byte offset, copied into REG_a which is clobbered). */
00352 asm volatile(
00353 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all ones
00354 "psrlw $15, %%mm7 \n\t" // mm7 = 1 in every word (not used by the loop below)
00355 "movd %2, %%mm6 \n\t" // qscale
00356 "packssdw %%mm6, %%mm6 \n\t"
00357 "packssdw %%mm6, %%mm6 \n\t" // qscale in all four words
00358 "mov %3, %%"REG_a" \n\t" // running byte offset
00359 ASMALIGN(4)
00360 "1: \n\t"
00361 "movq (%0, %%"REG_a"), %%mm0 \n\t" // block[i .. i+3]
00362 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // block[i+4 .. i+7]
00363 "movq (%1, %%"REG_a"), %%mm4 \n\t" // quant_matrix[i .. i+3]
00364 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // quant_matrix[i+4 .. i+7]
00365 "pmullw %%mm6, %%mm4 \n\t" // q = qscale * quant_matrix[i]
00366 "pmullw %%mm6, %%mm5 \n\t"
00367 "pxor %%mm2, %%mm2 \n\t"
00368 "pxor %%mm3, %%mm3 \n\t"
00369 "pcmpgtw %%mm0, %%mm2 \n\t" // sign mask: block[i] < 0 ? -1 : 0
00370 "pcmpgtw %%mm1, %%mm3 \n\t"
00371 "pxor %%mm2, %%mm0 \n\t"
00372 "pxor %%mm3, %%mm1 \n\t"
00373 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00374 "psubw %%mm3, %%mm1 \n\t"
00375 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i]) * q
00376 "pmullw %%mm5, %%mm1 \n\t"
00377 "pxor %%mm4, %%mm4 \n\t"
00378 "pxor %%mm5, %%mm5 \n\t"
00379 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // zero mask: block[i] == 0 ? -1 : 0
00380 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00381 "psraw $3, %%mm0 \n\t" // >> 3
00382 "psraw $3, %%mm1 \n\t"
00383 "pxor %%mm2, %%mm0 \n\t"
00384 "pxor %%mm3, %%mm1 \n\t"
00385 "psubw %%mm2, %%mm0 \n\t" // restore the sign
00386 "psubw %%mm3, %%mm1 \n\t"
00387 "pandn %%mm0, %%mm4 \n\t" // zero input -> zero output
00388 "pandn %%mm1, %%mm5 \n\t"
00389 "movq %%mm4, (%0, %%"REG_a") \n\t"
00390 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00391
00392 "add $16, %%"REG_a" \n\t" // next 8 coefficients
00393 "jng 1b \n\t" // loop while the offset is <= 0
00394 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00395 : "%"REG_a, "memory"
00396 );
/* The loop also rewrote block[0]; replace it with the DC value. */
00397 block[0]= block0;
00398
00399 }
00400
/*
 * Dequantize an MPEG-2 style inter block in place using MMX.
 *
 * Every processed coefficient c at raster index i becomes
 *     sign(c) * (((2*|c| + 1) * (qscale * inter_matrix[i])) >> 4)
 * (logical shift on the unsigned 16-bit product) and a zero coefficient stays
 * zero. While storing, all output words are XORed together in mm7. After the
 * loop the least significant bit of block[63] is flipped when bit 0 of that
 * XOR is 0, i.e. when an even number of the stored outputs are odd.
 *
 * nCoeffs is the index of the last coefficient (63 with alternate_scan). The
 * loop handles 8 coefficients per iteration starting at block[0] and repeats
 * while the byte offset is <= 0 after the increment.
 */
00401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00402 DCTELEM *block, int n, int qscale)
00403 {
00404 long nCoeffs;
00405 const uint16_t *quant_matrix;
00406
00407 assert(s->block_last_index[n]>=0);
00408
/* NOTE(review): the coefficient count is taken from intra_scantable although
 * this is the inter path -- confirm this is intended. */
00409 if(s->alternate_scan) nCoeffs= 63;
00410 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00411
00412 quant_matrix = s->inter_matrix;
/* Operands: %0 = block+nCoeffs, %1 = quant_matrix+nCoeffs, %2 = qscale,
 * %3 = -2*nCoeffs. %3 is copied into REG_a for the loop and reused unchanged
 * afterwards, so %0 + %3 addresses the start of the block. */
00413 asm volatile(
00414 "pcmpeqw %%mm7, %%mm7 \n\t" // mm7 = all ones
00415 "psrlq $48, %%mm7 \n\t" // mm7 = 0xFFFF in the low word only (XOR accumulator seed)
00416 "movd %2, %%mm6 \n\t" // qscale
00417 "packssdw %%mm6, %%mm6 \n\t"
00418 "packssdw %%mm6, %%mm6 \n\t" // qscale in all four words
00419 "mov %3, %%"REG_a" \n\t" // running byte offset
00420 ASMALIGN(4)
00421 "1: \n\t"
00422 "movq (%0, %%"REG_a"), %%mm0 \n\t" // block[i .. i+3]
00423 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" // block[i+4 .. i+7]
00424 "movq (%1, %%"REG_a"), %%mm4 \n\t" // quant_matrix[i .. i+3]
00425 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" // quant_matrix[i+4 .. i+7]
00426 "pmullw %%mm6, %%mm4 \n\t" // q = qscale * quant_matrix[i]
00427 "pmullw %%mm6, %%mm5 \n\t"
00428 "pxor %%mm2, %%mm2 \n\t"
00429 "pxor %%mm3, %%mm3 \n\t"
00430 "pcmpgtw %%mm0, %%mm2 \n\t" // sign mask: block[i] < 0 ? -1 : 0
00431 "pcmpgtw %%mm1, %%mm3 \n\t"
00432 "pxor %%mm2, %%mm0 \n\t"
00433 "pxor %%mm3, %%mm1 \n\t"
00434 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00435 "psubw %%mm3, %%mm1 \n\t"
00436 "paddw %%mm0, %%mm0 \n\t" // abs(block[i]) * 2
00437 "paddw %%mm1, %%mm1 \n\t"
00438 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i]) * 2 * q
00439 "pmullw %%mm5, %%mm1 \n\t"
00440 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i]) * 2 + 1) * q
00441 "paddw %%mm5, %%mm1 \n\t"
00442 "pxor %%mm4, %%mm4 \n\t"
00443 "pxor %%mm5, %%mm5 \n\t"
00444 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // zero mask: block[i] == 0 ? -1 : 0
00445 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
00446 "psrlw $4, %%mm0 \n\t" // >> 4 (logical)
00447 "psrlw $4, %%mm1 \n\t"
00448 "pxor %%mm2, %%mm0 \n\t"
00449 "pxor %%mm3, %%mm1 \n\t"
00450 "psubw %%mm2, %%mm0 \n\t" // restore the sign
00451 "psubw %%mm3, %%mm1 \n\t"
00452 "pandn %%mm0, %%mm4 \n\t" // zero input -> zero output
00453 "pandn %%mm1, %%mm5 \n\t"
00454 "pxor %%mm4, %%mm7 \n\t" // accumulate XOR of all output words
00455 "pxor %%mm5, %%mm7 \n\t"
00456 "movq %%mm4, (%0, %%"REG_a") \n\t"
00457 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00458
00459 "add $16, %%"REG_a" \n\t" // next 8 coefficients
00460 "jng 1b \n\t" // loop while the offset is <= 0
00461 "movd 124(%0, %3), %%mm0 \n\t" // load block[62] and block[63]
00462 "movq %%mm7, %%mm6 \n\t"
00463 "psrlq $32, %%mm7 \n\t"
00464 "pxor %%mm6, %%mm7 \n\t" // fold the high dword onto the low dword
00465 "movq %%mm7, %%mm6 \n\t"
00466 "psrlq $16, %%mm7 \n\t"
00467 "pxor %%mm6, %%mm7 \n\t" // low word = XOR of all four accumulator words
00468 "pslld $31, %%mm7 \n\t" // keep only bit 0 of the low dword (now bit 31)
00469 "psrlq $15, %%mm7 \n\t" // move it to bit 16 = LSB of block[63]
00470 "pxor %%mm7, %%mm0 \n\t" // conditionally flip the LSB of block[63]
00471 "movd %%mm0, 124(%0, %3) \n\t"
00472
00473 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
00474 : "%"REG_a, "memory"
00475 );
00476 }
00477
00478
00479
/**
 * Replicate the border pixels of an image into the padding around it (MMX).
 *
 * First every row gets its first byte copied to the left of the row and its
 * last byte copied to the right of the row (8 bytes per side when w == 8,
 * 16 bytes per side otherwise). Then the extended first row is copied to the
 * rows above the image and the extended last row to the rows below it, four
 * rows per pass for i = 0, 4, ... < w.
 *
 * @param buf    top-left pixel of the image; the padding around the image
 *               must be writable memory
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  image width in bytes; the last 8 bytes of each row are read
 * @param height image height in rows
 * @param w      edge size in bytes/rows
 *
 * The top/bottom copy moves 8 bytes at a time, so it covers width + 2*w
 * rounded up to a multiple of 8. MMX state is left as is (no emms here).
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if(w==8)
    {
        __asm__ volatile(
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t" // first byte of the row in all 8 bytes
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t" // last byte of the row in all 8 bytes
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory" /* the asm stores through ptr */
        );
    }
    else
    {
        __asm__ volatile(
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t" // first byte of the row in all 8 bytes
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t" // last byte of the row in all 8 bytes
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory" /* the asm stores through ptr */
        );
    }

    for(i=0;i<w;i+=4) {
        /* top: copy the extended first row into rows -(i+1) .. -(i+4) */
        ptr= buf - (i + 1) * wrap - w;
        __asm__ volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t"
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            "add $8, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
            : "memory" /* the asm stores through ptr */
        );
        /* bottom: copy the extended last row into rows +(i+1) .. +(i+4) */
        ptr= last_line + (i + 1) * wrap - w;
        __asm__ volatile(
            "1: \n\t"
            "movq (%1, %0), %%mm0 \n\t"
            "movq %%mm0, (%0) \n\t"
            "movq %%mm0, (%0, %2) \n\t"
            "movq %%mm0, (%0, %2, 2) \n\t"
            "movq %%mm0, (%0, %3) \n\t"
            "add $8, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
            : "memory" /* the asm stores through ptr */
        );
    }
}
00565
00566 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00567 const int intra= s->mb_intra;
00568 int *sum= s->dct_error_sum[intra];
00569 uint16_t *offset= s->dct_offset[intra];
00570
00571 s->dct_count[intra]++;
00572
00573 asm volatile(
00574 "pxor %%mm7, %%mm7 \n\t"
00575 "1: \n\t"
00576 "pxor %%mm0, %%mm0 \n\t"
00577 "pxor %%mm1, %%mm1 \n\t"
00578 "movq (%0), %%mm2 \n\t"
00579 "movq 8(%0), %%mm3 \n\t"
00580 "pcmpgtw %%mm2, %%mm0 \n\t"
00581 "pcmpgtw %%mm3, %%mm1 \n\t"
00582 "pxor %%mm0, %%mm2 \n\t"
00583 "pxor %%mm1, %%mm3 \n\t"
00584 "psubw %%mm0, %%mm2 \n\t"
00585 "psubw %%mm1, %%mm3 \n\t"
00586 "movq %%mm2, %%mm4 \n\t"
00587 "movq %%mm3, %%mm5 \n\t"
00588 "psubusw (%2), %%mm2 \n\t"
00589 "psubusw 8(%2), %%mm3 \n\t"
00590 "pxor %%mm0, %%mm2 \n\t"
00591 "pxor %%mm1, %%mm3 \n\t"
00592 "psubw %%mm0, %%mm2 \n\t"
00593 "psubw %%mm1, %%mm3 \n\t"
00594 "movq %%mm2, (%0) \n\t"
00595 "movq %%mm3, 8(%0) \n\t"
00596 "movq %%mm4, %%mm2 \n\t"
00597 "movq %%mm5, %%mm3 \n\t"
00598 "punpcklwd %%mm7, %%mm4 \n\t"
00599 "punpckhwd %%mm7, %%mm2 \n\t"
00600 "punpcklwd %%mm7, %%mm5 \n\t"
00601 "punpckhwd %%mm7, %%mm3 \n\t"
00602 "paddd (%1), %%mm4 \n\t"
00603 "paddd 8(%1), %%mm2 \n\t"
00604 "paddd 16(%1), %%mm5 \n\t"
00605 "paddd 24(%1), %%mm3 \n\t"
00606 "movq %%mm4, (%1) \n\t"
00607 "movq %%mm2, 8(%1) \n\t"
00608 "movq %%mm5, 16(%1) \n\t"
00609 "movq %%mm3, 24(%1) \n\t"
00610 "add $16, %0 \n\t"
00611 "add $32, %1 \n\t"
00612 "add $16, %2 \n\t"
00613 "cmp %3, %0 \n\t"
00614 " jb 1b \n\t"
00615 : "+r" (block), "+r" (sum), "+r" (offset)
00616 : "r"(block+64)
00617 );
00618 }
00619
00620 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00621 const int intra= s->mb_intra;
00622 int *sum= s->dct_error_sum[intra];
00623 uint16_t *offset= s->dct_offset[intra];
00624
00625 s->dct_count[intra]++;
00626
00627 asm volatile(
00628 "pxor %%xmm7, %%xmm7 \n\t"
00629 "1: \n\t"
00630 "pxor %%xmm0, %%xmm0 \n\t"
00631 "pxor %%xmm1, %%xmm1 \n\t"
00632 "movdqa (%0), %%xmm2 \n\t"
00633 "movdqa 16(%0), %%xmm3 \n\t"
00634 "pcmpgtw %%xmm2, %%xmm0 \n\t"
00635 "pcmpgtw %%xmm3, %%xmm1 \n\t"
00636 "pxor %%xmm0, %%xmm2 \n\t"
00637 "pxor %%xmm1, %%xmm3 \n\t"
00638 "psubw %%xmm0, %%xmm2 \n\t"
00639 "psubw %%xmm1, %%xmm3 \n\t"
00640 "movdqa %%xmm2, %%xmm4 \n\t"
00641 "movdqa %%xmm3, %%xmm5 \n\t"
00642 "psubusw (%2), %%xmm2 \n\t"
00643 "psubusw 16(%2), %%xmm3 \n\t"
00644 "pxor %%xmm0, %%xmm2 \n\t"
00645 "pxor %%xmm1, %%xmm3 \n\t"
00646 "psubw %%xmm0, %%xmm2 \n\t"
00647 "psubw %%xmm1, %%xmm3 \n\t"
00648 "movdqa %%xmm2, (%0) \n\t"
00649 "movdqa %%xmm3, 16(%0) \n\t"
00650 "movdqa %%xmm4, %%xmm6 \n\t"
00651 "movdqa %%xmm5, %%xmm0 \n\t"
00652 "punpcklwd %%xmm7, %%xmm4 \n\t"
00653 "punpckhwd %%xmm7, %%xmm6 \n\t"
00654 "punpcklwd %%xmm7, %%xmm5 \n\t"
00655 "punpckhwd %%xmm7, %%xmm0 \n\t"
00656 "paddd (%1), %%xmm4 \n\t"
00657 "paddd 16(%1), %%xmm6 \n\t"
00658 "paddd 32(%1), %%xmm5 \n\t"
00659 "paddd 48(%1), %%xmm0 \n\t"
00660 "movdqa %%xmm4, (%1) \n\t"
00661 "movdqa %%xmm6, 16(%1) \n\t"
00662 "movdqa %%xmm5, 32(%1) \n\t"
00663 "movdqa %%xmm0, 48(%1) \n\t"
00664 "add $32, %0 \n\t"
00665 "add $64, %1 \n\t"
00666 "add $32, %2 \n\t"
00667 "cmp %3, %0 \n\t"
00668 " jb 1b \n\t"
00669 : "+r" (block), "+r" (sum), "+r" (offset)
00670 : "r"(block+64)
00671 );
00672 }
00673
/*
 * Instantiate mpegvideo_mmx_template.c once per instruction-set level.
 * RENAME()/RENAMEl() append a per-variant suffix to the names the template
 * defines; the HAVE_* macros defined at each inclusion select which code paths
 * the template compiles (presumably via #ifdef inside the template -- the
 * template itself is not visible here).
 */

/* Remember whether the build enabled SSSE3, then start from a plain-MMX state. */
00674 #ifdef HAVE_SSSE3
00675 #define HAVE_SSSE3_BAK
00676 #endif
00677 #undef HAVE_SSSE3
00678
/* Variant 1: MMX only (suffixes _MMX / _mmx). */
00679 #undef HAVE_SSE2
00680 #undef HAVE_MMX2
00681 #define RENAME(a) a ## _MMX
00682 #define RENAMEl(a) a ## _mmx
00683 #include "mpegvideo_mmx_template.c"
00684
/* Variant 2: MMX2 enabled (suffixes _MMX2 / _mmx2). */
00685 #define HAVE_MMX2
00686 #undef RENAME
00687 #undef RENAMEl
00688 #define RENAME(a) a ## _MMX2
00689 #define RENAMEl(a) a ## _mmx2
00690 #include "mpegvideo_mmx_template.c"
00691
/* Variant 3: SSE2 enabled in addition to MMX2 (suffixes _SSE2 / _sse2). */
00692 #define HAVE_SSE2
00693 #undef RENAME
00694 #undef RENAMEl
00695 #define RENAME(a) a ## _SSE2
00696 #define RENAMEl(a) a ## _sse2
00697 #include "mpegvideo_mmx_template.c"
00698
/* Variant 4: SSSE3, only when the build originally had HAVE_SSSE3. HAVE_SSSE3
 * is restored here. Note that RENAMEl keeps the _sse2 suffix for this variant. */
00699 #ifdef HAVE_SSSE3_BAK
00700 #define HAVE_SSSE3
00701 #undef RENAME
00702 #undef RENAMEl
00703 #define RENAME(a) a ## _SSSE3
00704 #define RENAMEl(a) a ## _sse2
00705 #include "mpegvideo_mmx_template.c"
00706 #endif
00707
00708 void MPV_common_init_mmx(MpegEncContext *s)
00709 {
00710 if (mm_flags & MM_MMX) {
00711 const int dct_algo = s->avctx->dct_algo;
00712
00713 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00714 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00715 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00716 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00717 if(!(s->flags & CODEC_FLAG_BITEXACT))
00718 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00719 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00720
00721 draw_edges = draw_edges_mmx;
00722
00723 if (mm_flags & MM_SSE2) {
00724 s->denoise_dct= denoise_dct_sse2;
00725 } else {
00726 s->denoise_dct= denoise_dct_mmx;
00727 }
00728
00729 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00730 #ifdef HAVE_SSSE3
00731 if(mm_flags & MM_SSSE3){
00732 s->dct_quantize= dct_quantize_SSSE3;
00733 } else
00734 #endif
00735 if(mm_flags & MM_SSE2){
00736 s->dct_quantize= dct_quantize_SSE2;
00737 } else if(mm_flags & MM_MMXEXT){
00738 s->dct_quantize= dct_quantize_MMX2;
00739 } else {
00740 s->dct_quantize= dct_quantize_MMX;
00741 }
00742 }
00743 }
00744 }