00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include "common.h"
00033 #include "dsputil.h"
00034 #include "mmx.h"
00035
/*
 * Request at least `align`-byte alignment for the declared object, using the
 * GCC/Clang attribute syntax.  It is placed after the declarator, e.g.
 *     static const int16_t tbl[8] ATTR_ALIGN(16) = { ... };
 */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
00037
00039
00040
00041
00042
00043
00044
00045
00046
00048
/*
 * Fixed-point scaling used by the two passes of the transform.
 *
 *  - The column pass left-shifts its inputs by SHIFT_FRW_COL bits.
 *  - The row pass adds RND_FRW_ROW to each 32-bit sum and then shifts it
 *    right arithmetically by SHIFT_FRW_ROW bits, so RND_FRW_ROW is half of
 *    one output unit (round to nearest).
 */
#define BITS_FRW_ACC   3 /* 2 or 3 for accuracy */
#define SHIFT_FRW_COL  BITS_FRW_ACC
#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW - 1))
00053
00054
/*
 * Expand to eight comma-separated copies of `x`.  Intended for initializer
 * lists that broadcast one constant across eight 16-bit lanes; the expansion
 * is deliberately not parenthesised as a whole because it must remain a list.
 */
#define X8(x) x,x,x,x,x,x,x,x
00056
00057
00058 static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = {
00059 X8(13036),
00060 X8(27146),
00061 X8(-21746)
00062 };
00063
00064 static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = {
00065 X8(23170)
00066 };
00067
00068 static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) };
00069
00070 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
00071
00072 static struct
00073 {
00074 const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
00075 } fdct_r_row_sse2 ATTR_ALIGN(16)=
00076 {{
00077 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
00078 }};
00079
00080
00081 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {
00082 16384, 16384, 22725, 19266,
00083 16384, 16384, 12873, 4520,
00084 21407, 8867, 19266, -4520,
00085 -8867, -21407, -22725, -12873,
00086 16384, -16384, 12873, -22725,
00087 -16384, 16384, 4520, 19266,
00088 8867, -21407, 4520, -12873,
00089 21407, -8867, 19266, -22725,
00090
00091 22725, 22725, 31521, 26722,
00092 22725, 22725, 17855, 6270,
00093 29692, 12299, 26722, -6270,
00094 -12299, -29692, -31521, -17855,
00095 22725, -22725, 17855, -31521,
00096 -22725, 22725, 6270, 26722,
00097 12299, -29692, 6270, -17855,
00098 29692, -12299, 26722, -31521,
00099
00100 21407, 21407, 29692, 25172,
00101 21407, 21407, 16819, 5906,
00102 27969, 11585, 25172, -5906,
00103 -11585, -27969, -29692, -16819,
00104 21407, -21407, 16819, -29692,
00105 -21407, 21407, 5906, 25172,
00106 11585, -27969, 5906, -16819,
00107 27969, -11585, 25172, -29692,
00108
00109 19266, 19266, 26722, 22654,
00110 19266, 19266, 15137, 5315,
00111 25172, 10426, 22654, -5315,
00112 -10426, -25172, -26722, -15137,
00113 19266, -19266, 15137, -26722,
00114 -19266, 19266, 5315, 22654,
00115 10426, -25172, 5315, -15137,
00116 25172, -10426, 22654, -26722,
00117
00118 16384, 16384, 22725, 19266,
00119 16384, 16384, 12873, 4520,
00120 21407, 8867, 19266, -4520,
00121 -8867, -21407, -22725, -12873,
00122 16384, -16384, 12873, -22725,
00123 -16384, 16384, 4520, 19266,
00124 8867, -21407, 4520, -12873,
00125 21407, -8867, 19266, -22725,
00126
00127 19266, 19266, 26722, 22654,
00128 19266, 19266, 15137, 5315,
00129 25172, 10426, 22654, -5315,
00130 -10426, -25172, -26722, -15137,
00131 19266, -19266, 15137, -26722,
00132 -19266, 19266, 5315, 22654,
00133 10426, -25172, 5315, -15137,
00134 25172, -10426, 22654, -26722,
00135
00136 21407, 21407, 29692, 25172,
00137 21407, 21407, 16819, 5906,
00138 27969, 11585, 25172, -5906,
00139 -11585, -27969, -29692, -16819,
00140 21407, -21407, 16819, -29692,
00141 -21407, 21407, 5906, 25172,
00142 11585, -27969, 5906, -16819,
00143 27969, -11585, 25172, -29692,
00144
00145 22725, 22725, 31521, 26722,
00146 22725, 22725, 17855, 6270,
00147 29692, 12299, 26722, -6270,
00148 -12299, -29692, -31521, -17855,
00149 22725, -22725, 17855, -31521,
00150 -22725, 22725, 6270, 26722,
00151 12299, -29692, 6270, -17855,
00152 29692, -12299, 26722, -31521,
00153 };
00154
00155 static struct
00156 {
00157 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
00158 } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
00159 {{
00160
00161 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
00162 C4, C4, C5, C7, C2, C6, C3, -C7, \
00163 -C4, C4, C7, C3, C6, -C2, C7, -C5, \
00164 C4, -C4, C5, -C1, C2, -C6, C3, -C1,
00165
00166 #define C1 22725
00167 #define C2 21407
00168 #define C3 19266
00169 #define C4 16384
00170 #define C5 12873
00171 #define C6 8867
00172 #define C7 4520
00173 TABLE_SSE2
00174
00175 #undef C1
00176 #undef C2
00177 #undef C3
00178 #undef C4
00179 #undef C5
00180 #undef C6
00181 #undef C7
00182 #define C1 31521
00183 #define C2 29692
00184 #define C3 26722
00185 #define C4 22725
00186 #define C5 17855
00187 #define C6 12299
00188 #define C7 6270
00189 TABLE_SSE2
00190
00191 #undef C1
00192 #undef C2
00193 #undef C3
00194 #undef C4
00195 #undef C5
00196 #undef C6
00197 #undef C7
00198 #define C1 29692
00199 #define C2 27969
00200 #define C3 25172
00201 #define C4 21407
00202 #define C5 16819
00203 #define C6 11585
00204 #define C7 5906
00205 TABLE_SSE2
00206
00207 #undef C1
00208 #undef C2
00209 #undef C3
00210 #undef C4
00211 #undef C5
00212 #undef C6
00213 #undef C7
00214 #define C1 26722
00215 #define C2 25172
00216 #define C3 22654
00217 #define C4 19266
00218 #define C5 15137
00219 #define C6 10426
00220 #define C7 5315
00221 TABLE_SSE2
00222
00223 #undef C1
00224 #undef C2
00225 #undef C3
00226 #undef C4
00227 #undef C5
00228 #undef C6
00229 #undef C7
00230 #define C1 22725
00231 #define C2 21407
00232 #define C3 19266
00233 #define C4 16384
00234 #define C5 12873
00235 #define C6 8867
00236 #define C7 4520
00237 TABLE_SSE2
00238
00239 #undef C1
00240 #undef C2
00241 #undef C3
00242 #undef C4
00243 #undef C5
00244 #undef C6
00245 #undef C7
00246 #define C1 26722
00247 #define C2 25172
00248 #define C3 22654
00249 #define C4 19266
00250 #define C5 15137
00251 #define C6 10426
00252 #define C7 5315
00253 TABLE_SSE2
00254
00255 #undef C1
00256 #undef C2
00257 #undef C3
00258 #undef C4
00259 #undef C5
00260 #undef C6
00261 #undef C7
00262 #define C1 29692
00263 #define C2 27969
00264 #define C3 25172
00265 #define C4 21407
00266 #define C5 16819
00267 #define C6 11585
00268 #define C7 5906
00269 TABLE_SSE2
00270
00271 #undef C1
00272 #undef C2
00273 #undef C3
00274 #undef C4
00275 #undef C5
00276 #undef C6
00277 #undef C7
00278 #define C1 31521
00279 #define C2 29692
00280 #define C3 26722
00281 #define C4 22725
00282 #define C5 17855
00283 #define C6 12299
00284 #define C7 6270
00285 TABLE_SSE2
00286 }};
00287
/*
 * FDCT_COL(cpu, mm, mov) defines
 *
 *     static av_always_inline void
 *     fdct_col_<cpu>(const int16_t *in, int16_t *out, int offset)
 *
 * which performs the column pass of the forward DCT on one vector-wide strip
 * of columns.
 *
 *   cpu  suffix of the generated function name
 *   mm   register name prefix; registers <mm>0 .. <mm>7 are all used
 *   mov  full-register move mnemonic, used through the mov##_m2r / _r2r /
 *        _r2m wrappers (presumably memory->register, register->register and
 *        register->memory, as provided by mmx.h)
 *
 * The generated function reads the eight rows in + offset + k * 8 and writes
 * the eight rows out + offset + k * 8 (k = 0..7); `offset` selects the first
 * column of the strip.  Inputs are scaled up by SHIFT_FRW_COL bits (rows 1/6
 * and 2/5 differences by SHIFT_FRW_COL + 1) before being multiplied with the
 * constants in fdct_tg_all_16 and ocos_4_16 via pmulhw, and fdct_one_corr is
 * ORed into the results stored to rows 1, 2 and 6.
 *
 * The instruction order is hand-scheduled and registers are reused
 * aggressively; do not reorder statements.
 */
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
    mov##_m2r(*(in + offset + 1 * 8), mm##0);\
    mov##_m2r(*(in + offset + 6 * 8), mm##1);\
    mov##_r2r(mm##0, mm##2);\
    mov##_m2r(*(in + offset + 2 * 8), mm##3);\
    paddsw_r2r(mm##1, mm##0);\
    mov##_m2r(*(in + offset + 5 * 8), mm##4);\
    psllw_i2r(SHIFT_FRW_COL, mm##0);\
    mov##_m2r(*(in + offset + 0 * 8), mm##5);\
    paddsw_r2r(mm##3, mm##4);\
    paddsw_m2r(*(in + offset + 7 * 8), mm##5);\
    psllw_i2r(SHIFT_FRW_COL, mm##4);\
    mov##_r2r(mm##0, mm##6);\
    psubsw_r2r(mm##1, mm##2);\
    mov##_m2r(*(fdct_tg_all_16 + 8), mm##1);\
    psubsw_r2r(mm##4, mm##0);\
    mov##_m2r(*(in + offset + 3 * 8), mm##7);\
    pmulhw_r2r(mm##0, mm##1);\
    paddsw_m2r(*(in + offset + 4 * 8), mm##7);\
    psllw_i2r(SHIFT_FRW_COL, mm##5);\
    paddsw_r2r(mm##4, mm##6);\
    psllw_i2r(SHIFT_FRW_COL, mm##7);\
    mov##_r2r(mm##5, mm##4);\
    psubsw_r2r(mm##7, mm##5);\
    paddsw_r2r(mm##5, mm##1);\
    paddsw_r2r(mm##7, mm##4);\
    por_m2r(*fdct_one_corr, mm##1);\
    psllw_i2r(SHIFT_FRW_COL + 1, mm##2);\
    pmulhw_m2r(*(fdct_tg_all_16 + 8), mm##5);\
    mov##_r2r(mm##4, mm##7);\
    psubsw_m2r(*(in + offset + 5 * 8), mm##3);\
    psubsw_r2r(mm##6, mm##4);\
    mov##_r2m(mm##1, *(out + offset + 2 * 8));\
    paddsw_r2r(mm##6, mm##7);\
    mov##_m2r(*(in + offset + 3 * 8), mm##1);\
    psllw_i2r(SHIFT_FRW_COL + 1, mm##3);\
    psubsw_m2r(*(in + offset + 4 * 8), mm##1);\
    mov##_r2r(mm##2, mm##6);\
    mov##_r2m(mm##4, *(out + offset + 4 * 8));\
    paddsw_r2r(mm##3, mm##2);\
    pmulhw_m2r(*ocos_4_16, mm##2);\
    psubsw_r2r(mm##3, mm##6);\
    pmulhw_m2r(*ocos_4_16, mm##6);\
    psubsw_r2r(mm##0, mm##5);\
    por_m2r(*fdct_one_corr, mm##5);\
    psllw_i2r(SHIFT_FRW_COL, mm##1);\
    por_m2r(*fdct_one_corr, mm##2);\
    mov##_r2r(mm##1, mm##4);\
    mov##_m2r(*(in + offset + 0 * 8), mm##3);\
    paddsw_r2r(mm##6, mm##1);\
    psubsw_m2r(*(in + offset + 7 * 8), mm##3);\
    psubsw_r2r(mm##6, mm##4);\
    mov##_m2r(*(fdct_tg_all_16 + 0), mm##0);\
    psllw_i2r(SHIFT_FRW_COL, mm##3);\
    mov##_m2r(*(fdct_tg_all_16 + 16), mm##6);\
    pmulhw_r2r(mm##1, mm##0);\
    mov##_r2m(mm##7, *(out + offset + 0 * 8));\
    pmulhw_r2r(mm##4, mm##6);\
    mov##_r2m(mm##5, *(out + offset + 6 * 8));\
    mov##_r2r(mm##3, mm##7);\
    mov##_m2r(*(fdct_tg_all_16 + 16), mm##5);\
    psubsw_r2r(mm##2, mm##7);\
    paddsw_r2r(mm##2, mm##3);\
    pmulhw_r2r(mm##7, mm##5);\
    paddsw_r2r(mm##3, mm##0);\
    paddsw_r2r(mm##4, mm##6);\
    pmulhw_m2r(*(fdct_tg_all_16 + 0), mm##3);\
    por_m2r(*fdct_one_corr, mm##0);\
    paddsw_r2r(mm##7, mm##5);\
    psubsw_r2r(mm##6, mm##7);\
    mov##_r2m(mm##0, *(out + offset + 1 * 8));\
    paddsw_r2r(mm##4, mm##5);\
    mov##_r2m(mm##7, *(out + offset + 3 * 8));\
    psubsw_r2r(mm##1, mm##3);\
    mov##_r2m(mm##5, *(out + offset + 5 * 8));\
    mov##_r2m(mm##3, *(out + offset + 7 * 8));\
}
00367
00368 FDCT_COL(mmx, mm, movq)
00369 FDCT_COL(sse2, xmm, movdqa)
00370
00371 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
00372 {
00373 asm volatile(
00374 #define FDCT_ROW_SSE2_H1(i,t) \
00375 "movq " #i "(%0), %%xmm2 \n\t" \
00376 "movq " #i "+8(%0), %%xmm0 \n\t" \
00377 "movdqa " #t "+32(%1), %%xmm3 \n\t" \
00378 "movdqa " #t "+48(%1), %%xmm7 \n\t" \
00379 "movdqa " #t "(%1), %%xmm4 \n\t" \
00380 "movdqa " #t "+16(%1), %%xmm5 \n\t"
00381
00382 #define FDCT_ROW_SSE2_H2(i,t) \
00383 "movq " #i "(%0), %%xmm2 \n\t" \
00384 "movq " #i "+8(%0), %%xmm0 \n\t" \
00385 "movdqa " #t "+32(%1), %%xmm3 \n\t" \
00386 "movdqa " #t "+48(%1), %%xmm7 \n\t"
00387
00388 #define FDCT_ROW_SSE2(i) \
00389 "movq %%xmm2, %%xmm1 \n\t" \
00390 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
00391 "paddsw %%xmm0, %%xmm1 \n\t" \
00392 "psubsw %%xmm0, %%xmm2 \n\t" \
00393 "punpckldq %%xmm2, %%xmm1 \n\t" \
00394 "pshufd $78, %%xmm1, %%xmm2 \n\t" \
00395 "pmaddwd %%xmm2, %%xmm3 \n\t" \
00396 "pmaddwd %%xmm1, %%xmm7 \n\t" \
00397 "pmaddwd %%xmm5, %%xmm2 \n\t" \
00398 "pmaddwd %%xmm4, %%xmm1 \n\t" \
00399 "paddd %%xmm7, %%xmm3 \n\t" \
00400 "paddd %%xmm2, %%xmm1 \n\t" \
00401 "paddd %%xmm6, %%xmm3 \n\t" \
00402 "paddd %%xmm6, %%xmm1 \n\t" \
00403 "psrad %3, %%xmm3 \n\t" \
00404 "psrad %3, %%xmm1 \n\t" \
00405 "packssdw %%xmm3, %%xmm1 \n\t" \
00406 "movdqa %%xmm1, " #i "(%4) \n\t"
00407
00408 "movdqa (%2), %%xmm6 \n\t"
00409 FDCT_ROW_SSE2_H1(0,0)
00410 FDCT_ROW_SSE2(0)
00411 FDCT_ROW_SSE2_H2(64,0)
00412 FDCT_ROW_SSE2(64)
00413
00414 FDCT_ROW_SSE2_H1(16,64)
00415 FDCT_ROW_SSE2(16)
00416 FDCT_ROW_SSE2_H2(112,64)
00417 FDCT_ROW_SSE2(112)
00418
00419 FDCT_ROW_SSE2_H1(32,128)
00420 FDCT_ROW_SSE2(32)
00421 FDCT_ROW_SSE2_H2(96,128)
00422 FDCT_ROW_SSE2(96)
00423
00424 FDCT_ROW_SSE2_H1(48,192)
00425 FDCT_ROW_SSE2(48)
00426 FDCT_ROW_SSE2_H2(80,192)
00427 FDCT_ROW_SSE2(80)
00428 :
00429 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
00430 );
00431 }
00432
00433 static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
00434 {
00435 pshufw_m2r(*(in + 4), mm5, 0x1B);
00436 movq_m2r(*(in + 0), mm0);
00437 movq_r2r(mm0, mm1);
00438 paddsw_r2r(mm5, mm0);
00439 psubsw_r2r(mm5, mm1);
00440 movq_r2r(mm0, mm2);
00441 punpckldq_r2r(mm1, mm0);
00442 punpckhdq_r2r(mm1, mm2);
00443 movq_m2r(*(table + 0), mm1);
00444 movq_m2r(*(table + 4), mm3);
00445 movq_m2r(*(table + 8), mm4);
00446 movq_m2r(*(table + 12), mm5);
00447 movq_m2r(*(table + 16), mm6);
00448 movq_m2r(*(table + 20), mm7);
00449 pmaddwd_r2r(mm0, mm1);
00450 pmaddwd_r2r(mm2, mm3);
00451 pmaddwd_r2r(mm0, mm4);
00452 pmaddwd_r2r(mm2, mm5);
00453 pmaddwd_r2r(mm0, mm6);
00454 pmaddwd_r2r(mm2, mm7);
00455 pmaddwd_m2r(*(table + 24), mm0);
00456 pmaddwd_m2r(*(table + 28), mm2);
00457 paddd_r2r(mm1, mm3);
00458 paddd_r2r(mm4, mm5);
00459 paddd_r2r(mm6, mm7);
00460 paddd_r2r(mm0, mm2);
00461 movq_m2r(*fdct_r_row, mm0);
00462 paddd_r2r(mm0, mm3);
00463 paddd_r2r(mm0, mm5);
00464 paddd_r2r(mm0, mm7);
00465 paddd_r2r(mm0, mm2);
00466 psrad_i2r(SHIFT_FRW_ROW, mm3);
00467 psrad_i2r(SHIFT_FRW_ROW, mm5);
00468 psrad_i2r(SHIFT_FRW_ROW, mm7);
00469 psrad_i2r(SHIFT_FRW_ROW, mm2);
00470 packssdw_r2r(mm5, mm3);
00471 packssdw_r2r(mm2, mm7);
00472 movq_r2m(mm3, *(out + 0));
00473 movq_r2m(mm7, *(out + 4));
00474 }
00475
00476 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
00477 {
00478
00479 movd_m2r(*(in + 6), mm1);
00480 punpcklwd_m2r(*(in + 4), mm1);
00481 movq_r2r(mm1, mm2);
00482 psrlq_i2r(0x20, mm1);
00483 movq_m2r(*(in + 0), mm0);
00484 punpcklwd_r2r(mm2, mm1);
00485 movq_r2r(mm0, mm5);
00486 paddsw_r2r(mm1, mm0);
00487 psubsw_r2r(mm1, mm5);
00488 movq_r2r(mm0, mm2);
00489 punpckldq_r2r(mm5, mm0);
00490 punpckhdq_r2r(mm5, mm2);
00491 movq_m2r(*(table + 0), mm1);
00492 movq_m2r(*(table + 4), mm3);
00493 movq_m2r(*(table + 8), mm4);
00494 movq_m2r(*(table + 12), mm5);
00495 movq_m2r(*(table + 16), mm6);
00496 movq_m2r(*(table + 20), mm7);
00497 pmaddwd_r2r(mm0, mm1);
00498 pmaddwd_r2r(mm2, mm3);
00499 pmaddwd_r2r(mm0, mm4);
00500 pmaddwd_r2r(mm2, mm5);
00501 pmaddwd_r2r(mm0, mm6);
00502 pmaddwd_r2r(mm2, mm7);
00503 pmaddwd_m2r(*(table + 24), mm0);
00504 pmaddwd_m2r(*(table + 28), mm2);
00505 paddd_r2r(mm1, mm3);
00506 paddd_r2r(mm4, mm5);
00507 paddd_r2r(mm6, mm7);
00508 paddd_r2r(mm0, mm2);
00509 movq_m2r(*fdct_r_row, mm0);
00510 paddd_r2r(mm0, mm3);
00511 paddd_r2r(mm0, mm5);
00512 paddd_r2r(mm0, mm7);
00513 paddd_r2r(mm0, mm2);
00514 psrad_i2r(SHIFT_FRW_ROW, mm3);
00515 psrad_i2r(SHIFT_FRW_ROW, mm5);
00516 psrad_i2r(SHIFT_FRW_ROW, mm7);
00517 psrad_i2r(SHIFT_FRW_ROW, mm2);
00518 packssdw_r2r(mm5, mm3);
00519 packssdw_r2r(mm2, mm7);
00520 movq_r2m(mm3, *(out + 0));
00521 movq_r2m(mm7, *(out + 4));
00522 }
00523
00524 void ff_fdct_mmx(int16_t *block)
00525 {
00526 int64_t align_tmp[16] ATTR_ALIGN(8);
00527 int16_t * block1= (int16_t*)align_tmp;
00528 const int16_t *table= tab_frw_01234567;
00529 int i;
00530
00531 fdct_col_mmx(block, block1, 0);
00532 fdct_col_mmx(block, block1, 4);
00533
00534 for(i=8;i>0;i--) {
00535 fdct_row_mmx(block1, block, table);
00536 block1 += 8;
00537 table += 32;
00538 block += 8;
00539 }
00540 }
00541
00542 void ff_fdct_mmx2(int16_t *block)
00543 {
00544 int64_t align_tmp[16] ATTR_ALIGN(8);
00545 int16_t *block1= (int16_t*)align_tmp;
00546 const int16_t *table= tab_frw_01234567;
00547 int i;
00548
00549 fdct_col_mmx(block, block1, 0);
00550 fdct_col_mmx(block, block1, 4);
00551
00552 for(i=8;i>0;i--) {
00553 fdct_row_mmx2(block1, block, table);
00554 block1 += 8;
00555 table += 32;
00556 block += 8;
00557 }
00558 }
00559
00560 void ff_fdct_sse2(int16_t *block)
00561 {
00562 int64_t align_tmp[16] ATTR_ALIGN(16);
00563 int16_t * const block1= (int16_t*)align_tmp;
00564
00565 fdct_col_sse2(block, block1, 0);
00566 fdct_row_sse2(block1, block);
00567 }
00568