00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00027 {
00028 int stride = line_size;
00029 __asm__ __volatile__ (
00030 "and r12, %[pixels], #7 \n\t"
00031 "bic %[pixels], %[pixels], #7 \n\t"
00032 "tmcr wcgr1, r12 \n\t"
00033 "add r4, %[pixels], %[line_size] \n\t"
00034 "add r5, %[block], %[line_size] \n\t"
00035 "mov %[line_size], %[line_size], lsl #1 \n\t"
00036 "1: \n\t"
00037 "wldrd wr0, [%[pixels]] \n\t"
00038 "subs %[h], %[h], #2 \n\t"
00039 "wldrd wr1, [%[pixels], #8] \n\t"
00040 "add %[pixels], %[pixels], %[line_size] \n\t"
00041 "wldrd wr3, [r4] \n\t"
00042 "pld [%[pixels]] \n\t"
00043 "pld [%[pixels], #32] \n\t"
00044 "wldrd wr4, [r4, #8] \n\t"
00045 "add r4, r4, %[line_size] \n\t"
00046 "walignr1 wr8, wr0, wr1 \n\t"
00047 "pld [r4] \n\t"
00048 "pld [r4, #32] \n\t"
00049 "walignr1 wr10, wr3, wr4 \n\t"
00050 "wstrd wr8, [%[block]] \n\t"
00051 "add %[block], %[block], %[line_size] \n\t"
00052 "wstrd wr10, [r5] \n\t"
00053 "add r5, r5, %[line_size] \n\t"
00054 "bne 1b \n\t"
00055 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00056 :
00057 : "memory", "r4", "r5", "r12");
00058 }
00059
00060 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00061 {
00062 int stride = line_size;
00063 __asm__ __volatile__ (
00064 "and r12, %[pixels], #7 \n\t"
00065 "bic %[pixels], %[pixels], #7 \n\t"
00066 "tmcr wcgr1, r12 \n\t"
00067 "add r4, %[pixels], %[line_size] \n\t"
00068 "add r5, %[block], %[line_size] \n\t"
00069 "mov %[line_size], %[line_size], lsl #1 \n\t"
00070 "1: \n\t"
00071 "wldrd wr0, [%[pixels]] \n\t"
00072 "subs %[h], %[h], #2 \n\t"
00073 "wldrd wr1, [%[pixels], #8] \n\t"
00074 "add %[pixels], %[pixels], %[line_size] \n\t"
00075 "wldrd wr3, [r4] \n\t"
00076 "pld [%[pixels]] \n\t"
00077 "pld [%[pixels], #32] \n\t"
00078 "wldrd wr4, [r4, #8] \n\t"
00079 "add r4, r4, %[line_size] \n\t"
00080 "walignr1 wr8, wr0, wr1 \n\t"
00081 "wldrd wr0, [%[block]] \n\t"
00082 "wldrd wr2, [r5] \n\t"
00083 "pld [r4] \n\t"
00084 "pld [r4, #32] \n\t"
00085 "walignr1 wr10, wr3, wr4 \n\t"
00086 WAVG2B" wr8, wr8, wr0 \n\t"
00087 WAVG2B" wr10, wr10, wr2 \n\t"
00088 "wstrd wr8, [%[block]] \n\t"
00089 "add %[block], %[block], %[line_size] \n\t"
00090 "wstrd wr10, [r5] \n\t"
00091 "pld [%[block]] \n\t"
00092 "pld [%[block], #32] \n\t"
00093 "add r5, r5, %[line_size] \n\t"
00094 "pld [r5] \n\t"
00095 "pld [r5, #32] \n\t"
00096 "bne 1b \n\t"
00097 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00098 :
00099 : "memory", "r4", "r5", "r12");
00100 }
00101
00102 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00103 {
00104 int stride = line_size;
00105 __asm__ __volatile__ (
00106 "and r12, %[pixels], #7 \n\t"
00107 "bic %[pixels], %[pixels], #7 \n\t"
00108 "tmcr wcgr1, r12 \n\t"
00109 "add r4, %[pixels], %[line_size] \n\t"
00110 "add r5, %[block], %[line_size] \n\t"
00111 "mov %[line_size], %[line_size], lsl #1 \n\t"
00112 "1: \n\t"
00113 "wldrd wr0, [%[pixels]] \n\t"
00114 "wldrd wr1, [%[pixels], #8] \n\t"
00115 "subs %[h], %[h], #2 \n\t"
00116 "wldrd wr2, [%[pixels], #16] \n\t"
00117 "add %[pixels], %[pixels], %[line_size] \n\t"
00118 "wldrd wr3, [r4] \n\t"
00119 "pld [%[pixels]] \n\t"
00120 "pld [%[pixels], #32] \n\t"
00121 "walignr1 wr8, wr0, wr1 \n\t"
00122 "wldrd wr4, [r4, #8] \n\t"
00123 "walignr1 wr9, wr1, wr2 \n\t"
00124 "wldrd wr5, [r4, #16] \n\t"
00125 "add r4, r4, %[line_size] \n\t"
00126 "pld [r4] \n\t"
00127 "pld [r4, #32] \n\t"
00128 "walignr1 wr10, wr3, wr4 \n\t"
00129 "wstrd wr8, [%[block]] \n\t"
00130 "walignr1 wr11, wr4, wr5 \n\t"
00131 "wstrd wr9, [%[block], #8] \n\t"
00132 "add %[block], %[block], %[line_size] \n\t"
00133 "wstrd wr10, [r5] \n\t"
00134 "wstrd wr11, [r5, #8] \n\t"
00135 "add r5, r5, %[line_size] \n\t"
00136 "bne 1b \n\t"
00137 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00138 :
00139 : "memory", "r4", "r5", "r12");
00140 }
00141
00142 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00143 {
00144 int stride = line_size;
00145 __asm__ __volatile__ (
00146 "pld [%[pixels]] \n\t"
00147 "pld [%[pixels], #32] \n\t"
00148 "pld [%[block]] \n\t"
00149 "pld [%[block], #32] \n\t"
00150 "and r12, %[pixels], #7 \n\t"
00151 "bic %[pixels], %[pixels], #7 \n\t"
00152 "tmcr wcgr1, r12 \n\t"
00153 "add r4, %[pixels], %[line_size]\n\t"
00154 "add r5, %[block], %[line_size] \n\t"
00155 "mov %[line_size], %[line_size], lsl #1 \n\t"
00156 "1: \n\t"
00157 "wldrd wr0, [%[pixels]] \n\t"
00158 "wldrd wr1, [%[pixels], #8] \n\t"
00159 "subs %[h], %[h], #2 \n\t"
00160 "wldrd wr2, [%[pixels], #16] \n\t"
00161 "add %[pixels], %[pixels], %[line_size] \n\t"
00162 "wldrd wr3, [r4] \n\t"
00163 "pld [%[pixels]] \n\t"
00164 "pld [%[pixels], #32] \n\t"
00165 "walignr1 wr8, wr0, wr1 \n\t"
00166 "wldrd wr4, [r4, #8] \n\t"
00167 "walignr1 wr9, wr1, wr2 \n\t"
00168 "wldrd wr5, [r4, #16] \n\t"
00169 "add r4, r4, %[line_size] \n\t"
00170 "wldrd wr0, [%[block]] \n\t"
00171 "pld [r4] \n\t"
00172 "wldrd wr1, [%[block], #8] \n\t"
00173 "pld [r4, #32] \n\t"
00174 "wldrd wr2, [r5] \n\t"
00175 "walignr1 wr10, wr3, wr4 \n\t"
00176 "wldrd wr3, [r5, #8] \n\t"
00177 WAVG2B" wr8, wr8, wr0 \n\t"
00178 WAVG2B" wr9, wr9, wr1 \n\t"
00179 WAVG2B" wr10, wr10, wr2 \n\t"
00180 "wstrd wr8, [%[block]] \n\t"
00181 "walignr1 wr11, wr4, wr5 \n\t"
00182 WAVG2B" wr11, wr11, wr3 \n\t"
00183 "wstrd wr9, [%[block], #8] \n\t"
00184 "add %[block], %[block], %[line_size] \n\t"
00185 "wstrd wr10, [r5] \n\t"
00186 "pld [%[block]] \n\t"
00187 "pld [%[block], #32] \n\t"
00188 "wstrd wr11, [r5, #8] \n\t"
00189 "add r5, r5, %[line_size] \n\t"
00190 "pld [r5] \n\t"
00191 "pld [r5, #32] \n\t"
00192 "bne 1b \n\t"
00193 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00194 :
00195 : "memory", "r4", "r5", "r12");
00196 }
00197
00198 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00199 {
00200 int stride = line_size;
00201
00202
00203 SET_RND(wr15);
00204 __asm__ __volatile__(
00205 "pld [%[pixels]] \n\t"
00206 "pld [%[pixels], #32] \n\t"
00207 "and r12, %[pixels], #7 \n\t"
00208 "bic %[pixels], %[pixels], #7 \n\t"
00209 "tmcr wcgr1, r12 \n\t"
00210 "add r12, r12, #1 \n\t"
00211 "add r4, %[pixels], %[line_size]\n\t"
00212 "tmcr wcgr2, r12 \n\t"
00213 "add r5, %[block], %[line_size] \n\t"
00214 "mov %[line_size], %[line_size], lsl #1 \n\t"
00215
00216 "1: \n\t"
00217 "wldrd wr10, [%[pixels]] \n\t"
00218 "cmp r12, #8 \n\t"
00219 "wldrd wr11, [%[pixels], #8] \n\t"
00220 "add %[pixels], %[pixels], %[line_size] \n\t"
00221 "wldrd wr13, [r4] \n\t"
00222 "pld [%[pixels]] \n\t"
00223 "wldrd wr14, [r4, #8] \n\t"
00224 "pld [%[pixels], #32] \n\t"
00225 "add r4, r4, %[line_size] \n\t"
00226 "walignr1 wr0, wr10, wr11 \n\t"
00227 "pld [r4] \n\t"
00228 "pld [r4, #32] \n\t"
00229 "walignr1 wr2, wr13, wr14 \n\t"
00230 "wmoveq wr4, wr11 \n\t"
00231 "wmoveq wr6, wr14 \n\t"
00232 "walignr2ne wr4, wr10, wr11 \n\t"
00233 "walignr2ne wr6, wr13, wr14 \n\t"
00234 WAVG2B" wr0, wr0, wr4 \n\t"
00235 WAVG2B" wr2, wr2, wr6 \n\t"
00236 "wstrd wr0, [%[block]] \n\t"
00237 "subs %[h], %[h], #2 \n\t"
00238 "wstrd wr2, [r5] \n\t"
00239 "add %[block], %[block], %[line_size] \n\t"
00240 "add r5, r5, %[line_size] \n\t"
00241 "bne 1b \n\t"
00242 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00243 :
00244 : "r4", "r5", "r12", "memory");
00245 }
00246
00247 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00248 {
00249 int stride = line_size;
00250
00251
00252 SET_RND(wr15);
00253 __asm__ __volatile__(
00254 "pld [%[pixels]] \n\t"
00255 "pld [%[pixels], #32] \n\t"
00256 "and r12, %[pixels], #7 \n\t"
00257 "bic %[pixels], %[pixels], #7 \n\t"
00258 "tmcr wcgr1, r12 \n\t"
00259 "add r12, r12, #1 \n\t"
00260 "add r4, %[pixels], %[line_size]\n\t"
00261 "tmcr wcgr2, r12 \n\t"
00262 "add r5, %[block], %[line_size] \n\t"
00263 "mov %[line_size], %[line_size], lsl #1 \n\t"
00264
00265 "1: \n\t"
00266 "wldrd wr10, [%[pixels]] \n\t"
00267 "cmp r12, #8 \n\t"
00268 "wldrd wr11, [%[pixels], #8] \n\t"
00269 "wldrd wr12, [%[pixels], #16] \n\t"
00270 "add %[pixels], %[pixels], %[line_size] \n\t"
00271 "wldrd wr13, [r4] \n\t"
00272 "pld [%[pixels]] \n\t"
00273 "wldrd wr14, [r4, #8] \n\t"
00274 "pld [%[pixels], #32] \n\t"
00275 "wldrd wr15, [r4, #16] \n\t"
00276 "add r4, r4, %[line_size] \n\t"
00277 "walignr1 wr0, wr10, wr11 \n\t"
00278 "pld [r4] \n\t"
00279 "pld [r4, #32] \n\t"
00280 "walignr1 wr1, wr11, wr12 \n\t"
00281 "walignr1 wr2, wr13, wr14 \n\t"
00282 "walignr1 wr3, wr14, wr15 \n\t"
00283 "wmoveq wr4, wr11 \n\t"
00284 "wmoveq wr5, wr12 \n\t"
00285 "wmoveq wr6, wr14 \n\t"
00286 "wmoveq wr7, wr15 \n\t"
00287 "walignr2ne wr4, wr10, wr11 \n\t"
00288 "walignr2ne wr5, wr11, wr12 \n\t"
00289 "walignr2ne wr6, wr13, wr14 \n\t"
00290 "walignr2ne wr7, wr14, wr15 \n\t"
00291 WAVG2B" wr0, wr0, wr4 \n\t"
00292 WAVG2B" wr1, wr1, wr5 \n\t"
00293 "wstrd wr0, [%[block]] \n\t"
00294 WAVG2B" wr2, wr2, wr6 \n\t"
00295 "wstrd wr1, [%[block], #8] \n\t"
00296 WAVG2B" wr3, wr3, wr7 \n\t"
00297 "add %[block], %[block], %[line_size] \n\t"
00298 "wstrd wr2, [r5] \n\t"
00299 "subs %[h], %[h], #2 \n\t"
00300 "wstrd wr3, [r5, #8] \n\t"
00301 "add r5, r5, %[line_size] \n\t"
00302 "bne 1b \n\t"
00303 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00304 :
00305 : "r4", "r5", "r12", "memory");
00306 }
00307
00308 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00309 {
00310 int stride = line_size;
00311
00312
00313 SET_RND(wr15);
00314 __asm__ __volatile__(
00315 "pld [%[pixels]] \n\t"
00316 "pld [%[pixels], #32] \n\t"
00317 "pld [%[block]] \n\t"
00318 "pld [%[block], #32] \n\t"
00319 "and r12, %[pixels], #7 \n\t"
00320 "bic %[pixels], %[pixels], #7 \n\t"
00321 "tmcr wcgr1, r12 \n\t"
00322 "add r12, r12, #1 \n\t"
00323 "add r4, %[pixels], %[line_size]\n\t"
00324 "tmcr wcgr2, r12 \n\t"
00325 "add r5, %[block], %[line_size] \n\t"
00326 "mov %[line_size], %[line_size], lsl #1 \n\t"
00327 "pld [r5] \n\t"
00328 "pld [r5, #32] \n\t"
00329
00330 "1: \n\t"
00331 "wldrd wr10, [%[pixels]] \n\t"
00332 "cmp r12, #8 \n\t"
00333 "wldrd wr11, [%[pixels], #8] \n\t"
00334 "add %[pixels], %[pixels], %[line_size] \n\t"
00335 "wldrd wr13, [r4] \n\t"
00336 "pld [%[pixels]] \n\t"
00337 "wldrd wr14, [r4, #8] \n\t"
00338 "pld [%[pixels], #32] \n\t"
00339 "add r4, r4, %[line_size] \n\t"
00340 "walignr1 wr0, wr10, wr11 \n\t"
00341 "pld [r4] \n\t"
00342 "pld [r4, #32] \n\t"
00343 "walignr1 wr2, wr13, wr14 \n\t"
00344 "wmoveq wr4, wr11 \n\t"
00345 "wmoveq wr6, wr14 \n\t"
00346 "walignr2ne wr4, wr10, wr11 \n\t"
00347 "wldrd wr10, [%[block]] \n\t"
00348 "walignr2ne wr6, wr13, wr14 \n\t"
00349 "wldrd wr12, [r5] \n\t"
00350 WAVG2B" wr0, wr0, wr4 \n\t"
00351 WAVG2B" wr2, wr2, wr6 \n\t"
00352 WAVG2B" wr0, wr0, wr10 \n\t"
00353 WAVG2B" wr2, wr2, wr12 \n\t"
00354 "wstrd wr0, [%[block]] \n\t"
00355 "subs %[h], %[h], #2 \n\t"
00356 "wstrd wr2, [r5] \n\t"
00357 "add %[block], %[block], %[line_size] \n\t"
00358 "add r5, r5, %[line_size] \n\t"
00359 "pld [%[block]] \n\t"
00360 "pld [%[block], #32] \n\t"
00361 "pld [r5] \n\t"
00362 "pld [r5, #32] \n\t"
00363 "bne 1b \n\t"
00364 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00365 :
00366 : "r4", "r5", "r12", "memory");
00367 }
00368
00369 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00370 {
00371 int stride = line_size;
00372
00373
00374 SET_RND(wr15);
00375 __asm__ __volatile__(
00376 "pld [%[pixels]] \n\t"
00377 "pld [%[pixels], #32] \n\t"
00378 "pld [%[block]] \n\t"
00379 "pld [%[block], #32] \n\t"
00380 "and r12, %[pixels], #7 \n\t"
00381 "bic %[pixels], %[pixels], #7 \n\t"
00382 "tmcr wcgr1, r12 \n\t"
00383 "add r12, r12, #1 \n\t"
00384 "add r4, %[pixels], %[line_size]\n\t"
00385 "tmcr wcgr2, r12 \n\t"
00386 "add r5, %[block], %[line_size] \n\t"
00387 "mov %[line_size], %[line_size], lsl #1 \n\t"
00388 "pld [r5] \n\t"
00389 "pld [r5, #32] \n\t"
00390
00391 "1: \n\t"
00392 "wldrd wr10, [%[pixels]] \n\t"
00393 "cmp r12, #8 \n\t"
00394 "wldrd wr11, [%[pixels], #8] \n\t"
00395 "wldrd wr12, [%[pixels], #16] \n\t"
00396 "add %[pixels], %[pixels], %[line_size] \n\t"
00397 "wldrd wr13, [r4] \n\t"
00398 "pld [%[pixels]] \n\t"
00399 "wldrd wr14, [r4, #8] \n\t"
00400 "pld [%[pixels], #32] \n\t"
00401 "wldrd wr15, [r4, #16] \n\t"
00402 "add r4, r4, %[line_size] \n\t"
00403 "walignr1 wr0, wr10, wr11 \n\t"
00404 "pld [r4] \n\t"
00405 "pld [r4, #32] \n\t"
00406 "walignr1 wr1, wr11, wr12 \n\t"
00407 "walignr1 wr2, wr13, wr14 \n\t"
00408 "walignr1 wr3, wr14, wr15 \n\t"
00409 "wmoveq wr4, wr11 \n\t"
00410 "wmoveq wr5, wr12 \n\t"
00411 "wmoveq wr6, wr14 \n\t"
00412 "wmoveq wr7, wr15 \n\t"
00413 "walignr2ne wr4, wr10, wr11 \n\t"
00414 "walignr2ne wr5, wr11, wr12 \n\t"
00415 "walignr2ne wr6, wr13, wr14 \n\t"
00416 "walignr2ne wr7, wr14, wr15 \n\t"
00417 "wldrd wr10, [%[block]] \n\t"
00418 WAVG2B" wr0, wr0, wr4 \n\t"
00419 "wldrd wr11, [%[block], #8] \n\t"
00420 WAVG2B" wr1, wr1, wr5 \n\t"
00421 "wldrd wr12, [r5] \n\t"
00422 WAVG2B" wr2, wr2, wr6 \n\t"
00423 "wldrd wr13, [r5, #8] \n\t"
00424 WAVG2B" wr3, wr3, wr7 \n\t"
00425 WAVG2B" wr0, wr0, wr10 \n\t"
00426 WAVG2B" wr1, wr1, wr11 \n\t"
00427 WAVG2B" wr2, wr2, wr12 \n\t"
00428 WAVG2B" wr3, wr3, wr13 \n\t"
00429 "wstrd wr0, [%[block]] \n\t"
00430 "subs %[h], %[h], #2 \n\t"
00431 "wstrd wr1, [%[block], #8] \n\t"
00432 "add %[block], %[block], %[line_size] \n\t"
00433 "wstrd wr2, [r5] \n\t"
00434 "pld [%[block]] \n\t"
00435 "wstrd wr3, [r5, #8] \n\t"
00436 "add r5, r5, %[line_size] \n\t"
00437 "pld [%[block], #32] \n\t"
00438 "pld [r5] \n\t"
00439 "pld [r5, #32] \n\t"
00440 "bne 1b \n\t"
00441 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00442 :
00443 :"r4", "r5", "r12", "memory");
00444 }
00445
00446 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00447 {
00448 int stride = line_size;
00449
00450
00451 __asm__ __volatile__(
00452 "pld [%[pixels]] \n\t"
00453 "pld [%[pixels], #32] \n\t"
00454 "and r12, %[pixels], #7 \n\t"
00455 "tmcr wcgr1, r12 \n\t"
00456 "bic %[pixels], %[pixels], #7 \n\t"
00457
00458 "wldrd wr10, [%[pixels]] \n\t"
00459 "wldrd wr11, [%[pixels], #8] \n\t"
00460 "pld [%[block]] \n\t"
00461 "add %[pixels], %[pixels], %[line_size] \n\t"
00462 "walignr1 wr0, wr10, wr11 \n\t"
00463 "pld [%[pixels]] \n\t"
00464 "pld [%[pixels], #32] \n\t"
00465
00466 "1: \n\t"
00467 "wldrd wr10, [%[pixels]] \n\t"
00468 "wldrd wr11, [%[pixels], #8] \n\t"
00469 "add %[pixels], %[pixels], %[line_size] \n\t"
00470 "pld [%[pixels]] \n\t"
00471 "pld [%[pixels], #32] \n\t"
00472 "walignr1 wr4, wr10, wr11 \n\t"
00473 "wldrd wr10, [%[block]] \n\t"
00474 WAVG2B" wr8, wr0, wr4 \n\t"
00475 WAVG2B" wr8, wr8, wr10 \n\t"
00476 "wstrd wr8, [%[block]] \n\t"
00477 "add %[block], %[block], %[line_size] \n\t"
00478
00479 "wldrd wr10, [%[pixels]] \n\t"
00480 "wldrd wr11, [%[pixels], #8] \n\t"
00481 "pld [%[block]] \n\t"
00482 "add %[pixels], %[pixels], %[line_size] \n\t"
00483 "pld [%[pixels]] \n\t"
00484 "pld [%[pixels], #32] \n\t"
00485 "walignr1 wr0, wr10, wr11 \n\t"
00486 "wldrd wr10, [%[block]] \n\t"
00487 WAVG2B" wr8, wr0, wr4 \n\t"
00488 WAVG2B" wr8, wr8, wr10 \n\t"
00489 "wstrd wr8, [%[block]] \n\t"
00490 "add %[block], %[block], %[line_size] \n\t"
00491
00492 "subs %[h], %[h], #2 \n\t"
00493 "pld [%[block]] \n\t"
00494 "bne 1b \n\t"
00495 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00496 :
00497 : "cc", "memory", "r12");
00498 }
00499
00500 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00501 {
00502 int stride = line_size;
00503
00504
00505 __asm__ __volatile__(
00506 "pld [%[pixels]] \n\t"
00507 "pld [%[pixels], #32] \n\t"
00508 "and r12, %[pixels], #7 \n\t"
00509 "tmcr wcgr1, r12 \n\t"
00510 "bic %[pixels], %[pixels], #7 \n\t"
00511
00512 "wldrd wr10, [%[pixels]] \n\t"
00513 "wldrd wr11, [%[pixels], #8] \n\t"
00514 "wldrd wr12, [%[pixels], #16] \n\t"
00515 "add %[pixels], %[pixels], %[line_size] \n\t"
00516 "pld [%[pixels]] \n\t"
00517 "pld [%[pixels], #32] \n\t"
00518 "walignr1 wr0, wr10, wr11 \n\t"
00519 "walignr1 wr1, wr11, wr12 \n\t"
00520
00521 "1: \n\t"
00522 "wldrd wr10, [%[pixels]] \n\t"
00523 "wldrd wr11, [%[pixels], #8] \n\t"
00524 "wldrd wr12, [%[pixels], #16] \n\t"
00525 "add %[pixels], %[pixels], %[line_size] \n\t"
00526 "pld [%[pixels]] \n\t"
00527 "pld [%[pixels], #32] \n\t"
00528 "walignr1 wr4, wr10, wr11 \n\t"
00529 "walignr1 wr5, wr11, wr12 \n\t"
00530 WAVG2B" wr8, wr0, wr4 \n\t"
00531 WAVG2B" wr9, wr1, wr5 \n\t"
00532 "wstrd wr8, [%[block]] \n\t"
00533 "wstrd wr9, [%[block], #8] \n\t"
00534 "add %[block], %[block], %[line_size] \n\t"
00535
00536 "wldrd wr10, [%[pixels]] \n\t"
00537 "wldrd wr11, [%[pixels], #8] \n\t"
00538 "wldrd wr12, [%[pixels], #16] \n\t"
00539 "add %[pixels], %[pixels], %[line_size] \n\t"
00540 "pld [%[pixels]] \n\t"
00541 "pld [%[pixels], #32] \n\t"
00542 "walignr1 wr0, wr10, wr11 \n\t"
00543 "walignr1 wr1, wr11, wr12 \n\t"
00544 WAVG2B" wr8, wr0, wr4 \n\t"
00545 WAVG2B" wr9, wr1, wr5 \n\t"
00546 "wstrd wr8, [%[block]] \n\t"
00547 "wstrd wr9, [%[block], #8] \n\t"
00548 "add %[block], %[block], %[line_size] \n\t"
00549
00550 "subs %[h], %[h], #2 \n\t"
00551 "bne 1b \n\t"
00552 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00553 :
00554 : "r4", "r5", "r12", "memory");
00555 }
00556
00557 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00558 {
00559 int stride = line_size;
00560
00561
00562 __asm__ __volatile__(
00563 "pld [%[pixels]] \n\t"
00564 "pld [%[pixels], #32] \n\t"
00565 "and r12, %[pixels], #7 \n\t"
00566 "tmcr wcgr1, r12 \n\t"
00567 "bic %[pixels], %[pixels], #7 \n\t"
00568
00569 "wldrd wr10, [%[pixels]] \n\t"
00570 "wldrd wr11, [%[pixels], #8] \n\t"
00571 "pld [%[block]] \n\t"
00572 "wldrd wr12, [%[pixels], #16] \n\t"
00573 "add %[pixels], %[pixels], %[line_size] \n\t"
00574 "pld [%[pixels]] \n\t"
00575 "pld [%[pixels], #32] \n\t"
00576 "walignr1 wr0, wr10, wr11 \n\t"
00577 "walignr1 wr1, wr11, wr12 \n\t"
00578
00579 "1: \n\t"
00580 "wldrd wr10, [%[pixels]] \n\t"
00581 "wldrd wr11, [%[pixels], #8] \n\t"
00582 "wldrd wr12, [%[pixels], #16] \n\t"
00583 "add %[pixels], %[pixels], %[line_size] \n\t"
00584 "pld [%[pixels]] \n\t"
00585 "pld [%[pixels], #32] \n\t"
00586 "walignr1 wr4, wr10, wr11 \n\t"
00587 "walignr1 wr5, wr11, wr12 \n\t"
00588 "wldrd wr10, [%[block]] \n\t"
00589 "wldrd wr11, [%[block], #8] \n\t"
00590 WAVG2B" wr8, wr0, wr4 \n\t"
00591 WAVG2B" wr9, wr1, wr5 \n\t"
00592 WAVG2B" wr8, wr8, wr10 \n\t"
00593 WAVG2B" wr9, wr9, wr11 \n\t"
00594 "wstrd wr8, [%[block]] \n\t"
00595 "wstrd wr9, [%[block], #8] \n\t"
00596 "add %[block], %[block], %[line_size] \n\t"
00597
00598 "wldrd wr10, [%[pixels]] \n\t"
00599 "wldrd wr11, [%[pixels], #8] \n\t"
00600 "pld [%[block]] \n\t"
00601 "wldrd wr12, [%[pixels], #16] \n\t"
00602 "add %[pixels], %[pixels], %[line_size] \n\t"
00603 "pld [%[pixels]] \n\t"
00604 "pld [%[pixels], #32] \n\t"
00605 "walignr1 wr0, wr10, wr11 \n\t"
00606 "walignr1 wr1, wr11, wr12 \n\t"
00607 "wldrd wr10, [%[block]] \n\t"
00608 "wldrd wr11, [%[block], #8] \n\t"
00609 WAVG2B" wr8, wr0, wr4 \n\t"
00610 WAVG2B" wr9, wr1, wr5 \n\t"
00611 WAVG2B" wr8, wr8, wr10 \n\t"
00612 WAVG2B" wr9, wr9, wr11 \n\t"
00613 "wstrd wr8, [%[block]] \n\t"
00614 "wstrd wr9, [%[block], #8] \n\t"
00615 "add %[block], %[block], %[line_size] \n\t"
00616
00617 "subs %[h], %[h], #2 \n\t"
00618 "pld [%[block]] \n\t"
00619 "bne 1b \n\t"
00620 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00621 :
00622 : "r4", "r5", "r12", "memory");
00623 }
00624
00625 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00626 {
00627
00628
00629 SET_RND(wr15);
00630 __asm__ __volatile__(
00631 "pld [%[pixels]] \n\t"
00632 "mov r12, #2 \n\t"
00633 "pld [%[pixels], #32] \n\t"
00634 "tmcr wcgr0, r12 \n\t"
00635 "and r12, %[pixels], #7 \n\t"
00636 "bic %[pixels], %[pixels], #7 \n\t"
00637 "tmcr wcgr1, r12 \n\t"
00638
00639
00640
00641 "wldrd wr12, [%[pixels]] \n\t"
00642 "add r12, r12, #1 \n\t"
00643 "wldrd wr13, [%[pixels], #8] \n\t"
00644 "tmcr wcgr2, r12 \n\t"
00645 "add %[pixels], %[pixels], %[line_size] \n\t"
00646 "cmp r12, #8 \n\t"
00647 "pld [%[pixels]] \n\t"
00648 "pld [%[pixels], #32] \n\t"
00649 "walignr1 wr2, wr12, wr13 \n\t"
00650 "wmoveq wr10, wr13 \n\t"
00651 "walignr2ne wr10, wr12, wr13 \n\t"
00652 "wunpckelub wr0, wr2 \n\t"
00653 "wunpckehub wr1, wr2 \n\t"
00654 "wunpckelub wr8, wr10 \n\t"
00655 "wunpckehub wr9, wr10 \n\t"
00656 "waddhus wr0, wr0, wr8 \n\t"
00657 "waddhus wr1, wr1, wr9 \n\t"
00658
00659 "1: \n\t"
00660
00661
00662 "wldrd wr12, [%[pixels]] \n\t"
00663 "cmp r12, #8 \n\t"
00664 "wldrd wr13, [%[pixels], #8] \n\t"
00665 "add %[pixels], %[pixels], %[line_size] \n\t"
00666 "walignr1 wr6, wr12, wr13 \n\t"
00667 "pld [%[pixels]] \n\t"
00668 "pld [%[pixels], #32] \n\t"
00669 "wmoveq wr10, wr13 \n\t"
00670 "walignr2ne wr10, wr12, wr13 \n\t"
00671 "wunpckelub wr4, wr6 \n\t"
00672 "wunpckehub wr5, wr6 \n\t"
00673 "wunpckelub wr8, wr10 \n\t"
00674 "wunpckehub wr9, wr10 \n\t"
00675 "waddhus wr4, wr4, wr8 \n\t"
00676 "waddhus wr5, wr5, wr9 \n\t"
00677 "waddhus wr8, wr0, wr4 \n\t"
00678 "waddhus wr9, wr1, wr5 \n\t"
00679 "waddhus wr8, wr8, wr15 \n\t"
00680 "waddhus wr9, wr9, wr15 \n\t"
00681 "wsrlhg wr8, wr8, wcgr0 \n\t"
00682 "wsrlhg wr9, wr9, wcgr0 \n\t"
00683 "wpackhus wr8, wr8, wr9 \n\t"
00684 "wstrd wr8, [%[block]] \n\t"
00685 "add %[block], %[block], %[line_size] \n\t"
00686
00687
00688
00689 "wldrd wr12, [%[pixels]] \n\t"
00690 "wldrd wr13, [%[pixels], #8] \n\t"
00691 "add %[pixels], %[pixels], %[line_size] \n\t"
00692 "walignr1 wr2, wr12, wr13 \n\t"
00693 "pld [%[pixels]] \n\t"
00694 "pld [%[pixels], #32] \n\t"
00695 "wmoveq wr10, wr13 \n\t"
00696 "walignr2ne wr10, wr12, wr13 \n\t"
00697 "wunpckelub wr0, wr2 \n\t"
00698 "wunpckehub wr1, wr2 \n\t"
00699 "wunpckelub wr8, wr10 \n\t"
00700 "wunpckehub wr9, wr10 \n\t"
00701 "waddhus wr0, wr0, wr8 \n\t"
00702 "waddhus wr1, wr1, wr9 \n\t"
00703 "waddhus wr8, wr0, wr4 \n\t"
00704 "waddhus wr9, wr1, wr5 \n\t"
00705 "waddhus wr8, wr8, wr15 \n\t"
00706 "waddhus wr9, wr9, wr15 \n\t"
00707 "wsrlhg wr8, wr8, wcgr0 \n\t"
00708 "wsrlhg wr9, wr9, wcgr0 \n\t"
00709 "wpackhus wr8, wr8, wr9 \n\t"
00710 "subs %[h], %[h], #2 \n\t"
00711 "wstrd wr8, [%[block]] \n\t"
00712 "add %[block], %[block], %[line_size] \n\t"
00713 "bne 1b \n\t"
00714 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00715 : [line_size]"r"(line_size)
00716 : "r12", "memory");
00717 }
00718
00719 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00720 {
00721
00722
00723 SET_RND(wr15);
00724 __asm__ __volatile__(
00725 "pld [%[pixels]] \n\t"
00726 "mov r12, #2 \n\t"
00727 "pld [%[pixels], #32] \n\t"
00728 "tmcr wcgr0, r12 \n\t"
00729
00730 "and r12, %[pixels], #7 \n\t"
00731 "bic %[pixels], %[pixels], #7 \n\t"
00732 "tmcr wcgr1, r12 \n\t"
00733 "add r12, r12, #1 \n\t"
00734 "tmcr wcgr2, r12 \n\t"
00735
00736
00737
00738 "wldrd wr12, [%[pixels]] \n\t"
00739 "cmp r12, #8 \n\t"
00740 "wldrd wr13, [%[pixels], #8] \n\t"
00741 "wldrd wr14, [%[pixels], #16] \n\t"
00742 "add %[pixels], %[pixels], %[line_size] \n\t"
00743 "pld [%[pixels]] \n\t"
00744 "walignr1 wr2, wr12, wr13 \n\t"
00745 "pld [%[pixels], #32] \n\t"
00746 "walignr1 wr3, wr13, wr14 \n\t"
00747 "wmoveq wr10, wr13 \n\t"
00748 "wmoveq wr11, wr14 \n\t"
00749 "walignr2ne wr10, wr12, wr13 \n\t"
00750 "walignr2ne wr11, wr13, wr14 \n\t"
00751 "wunpckelub wr0, wr2 \n\t"
00752 "wunpckehub wr1, wr2 \n\t"
00753 "wunpckelub wr2, wr3 \n\t"
00754 "wunpckehub wr3, wr3 \n\t"
00755 "wunpckelub wr8, wr10 \n\t"
00756 "wunpckehub wr9, wr10 \n\t"
00757 "wunpckelub wr10, wr11 \n\t"
00758 "wunpckehub wr11, wr11 \n\t"
00759 "waddhus wr0, wr0, wr8 \n\t"
00760 "waddhus wr1, wr1, wr9 \n\t"
00761 "waddhus wr2, wr2, wr10 \n\t"
00762 "waddhus wr3, wr3, wr11 \n\t"
00763
00764 "1: \n\t"
00765
00766
00767 "wldrd wr12, [%[pixels]] \n\t"
00768 "cmp r12, #8 \n\t"
00769 "wldrd wr13, [%[pixels], #8] \n\t"
00770 "wldrd wr14, [%[pixels], #16] \n\t"
00771 "add %[pixels], %[pixels], %[line_size] \n\t"
00772 "walignr1 wr6, wr12, wr13 \n\t"
00773 "pld [%[pixels]] \n\t"
00774 "pld [%[pixels], #32] \n\t"
00775 "walignr1 wr7, wr13, wr14 \n\t"
00776 "wmoveq wr10, wr13 \n\t"
00777 "wmoveq wr11, wr14 \n\t"
00778 "walignr2ne wr10, wr12, wr13 \n\t"
00779 "walignr2ne wr11, wr13, wr14 \n\t"
00780 "wunpckelub wr4, wr6 \n\t"
00781 "wunpckehub wr5, wr6 \n\t"
00782 "wunpckelub wr6, wr7 \n\t"
00783 "wunpckehub wr7, wr7 \n\t"
00784 "wunpckelub wr8, wr10 \n\t"
00785 "wunpckehub wr9, wr10 \n\t"
00786 "wunpckelub wr10, wr11 \n\t"
00787 "wunpckehub wr11, wr11 \n\t"
00788 "waddhus wr4, wr4, wr8 \n\t"
00789 "waddhus wr5, wr5, wr9 \n\t"
00790 "waddhus wr6, wr6, wr10 \n\t"
00791 "waddhus wr7, wr7, wr11 \n\t"
00792 "waddhus wr8, wr0, wr4 \n\t"
00793 "waddhus wr9, wr1, wr5 \n\t"
00794 "waddhus wr10, wr2, wr6 \n\t"
00795 "waddhus wr11, wr3, wr7 \n\t"
00796 "waddhus wr8, wr8, wr15 \n\t"
00797 "waddhus wr9, wr9, wr15 \n\t"
00798 "waddhus wr10, wr10, wr15 \n\t"
00799 "waddhus wr11, wr11, wr15 \n\t"
00800 "wsrlhg wr8, wr8, wcgr0 \n\t"
00801 "wsrlhg wr9, wr9, wcgr0 \n\t"
00802 "wsrlhg wr10, wr10, wcgr0 \n\t"
00803 "wsrlhg wr11, wr11, wcgr0 \n\t"
00804 "wpackhus wr8, wr8, wr9 \n\t"
00805 "wpackhus wr9, wr10, wr11 \n\t"
00806 "wstrd wr8, [%[block]] \n\t"
00807 "wstrd wr9, [%[block], #8] \n\t"
00808 "add %[block], %[block], %[line_size] \n\t"
00809
00810
00811
00812 "wldrd wr12, [%[pixels]] \n\t"
00813 "wldrd wr13, [%[pixels], #8] \n\t"
00814 "wldrd wr14, [%[pixels], #16] \n\t"
00815 "add %[pixels], %[pixels], %[line_size] \n\t"
00816 "walignr1 wr2, wr12, wr13 \n\t"
00817 "pld [%[pixels]] \n\t"
00818 "pld [%[pixels], #32] \n\t"
00819 "walignr1 wr3, wr13, wr14 \n\t"
00820 "wmoveq wr10, wr13 \n\t"
00821 "wmoveq wr11, wr14 \n\t"
00822 "walignr2ne wr10, wr12, wr13 \n\t"
00823 "walignr2ne wr11, wr13, wr14 \n\t"
00824 "wunpckelub wr0, wr2 \n\t"
00825 "wunpckehub wr1, wr2 \n\t"
00826 "wunpckelub wr2, wr3 \n\t"
00827 "wunpckehub wr3, wr3 \n\t"
00828 "wunpckelub wr8, wr10 \n\t"
00829 "wunpckehub wr9, wr10 \n\t"
00830 "wunpckelub wr10, wr11 \n\t"
00831 "wunpckehub wr11, wr11 \n\t"
00832 "waddhus wr0, wr0, wr8 \n\t"
00833 "waddhus wr1, wr1, wr9 \n\t"
00834 "waddhus wr2, wr2, wr10 \n\t"
00835 "waddhus wr3, wr3, wr11 \n\t"
00836 "waddhus wr8, wr0, wr4 \n\t"
00837 "waddhus wr9, wr1, wr5 \n\t"
00838 "waddhus wr10, wr2, wr6 \n\t"
00839 "waddhus wr11, wr3, wr7 \n\t"
00840 "waddhus wr8, wr8, wr15 \n\t"
00841 "waddhus wr9, wr9, wr15 \n\t"
00842 "waddhus wr10, wr10, wr15 \n\t"
00843 "waddhus wr11, wr11, wr15 \n\t"
00844 "wsrlhg wr8, wr8, wcgr0 \n\t"
00845 "wsrlhg wr9, wr9, wcgr0 \n\t"
00846 "wsrlhg wr10, wr10, wcgr0 \n\t"
00847 "wsrlhg wr11, wr11, wcgr0 \n\t"
00848 "wpackhus wr8, wr8, wr9 \n\t"
00849 "wpackhus wr9, wr10, wr11 \n\t"
00850 "wstrd wr8, [%[block]] \n\t"
00851 "wstrd wr9, [%[block], #8] \n\t"
00852 "add %[block], %[block], %[line_size] \n\t"
00853
00854 "subs %[h], %[h], #2 \n\t"
00855 "bne 1b \n\t"
00856 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00857 : [line_size]"r"(line_size)
00858 : "r12", "memory");
00859 }
00860
00861 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00862 {
00863
00864
00865 SET_RND(wr15);
00866 __asm__ __volatile__(
00867 "pld [%[block]] \n\t"
00868 "pld [%[block], #32] \n\t"
00869 "pld [%[pixels]] \n\t"
00870 "mov r12, #2 \n\t"
00871 "pld [%[pixels], #32] \n\t"
00872 "tmcr wcgr0, r12 \n\t"
00873 "and r12, %[pixels], #7 \n\t"
00874 "bic %[pixels], %[pixels], #7 \n\t"
00875 "tmcr wcgr1, r12 \n\t"
00876
00877
00878
00879 "wldrd wr12, [%[pixels]] \n\t"
00880 "add r12, r12, #1 \n\t"
00881 "wldrd wr13, [%[pixels], #8] \n\t"
00882 "tmcr wcgr2, r12 \n\t"
00883 "add %[pixels], %[pixels], %[line_size] \n\t"
00884 "cmp r12, #8 \n\t"
00885 "pld [%[pixels]] \n\t"
00886 "pld [%[pixels], #32] \n\t"
00887 "walignr1 wr2, wr12, wr13 \n\t"
00888 "wmoveq wr10, wr13 \n\t"
00889 "walignr2ne wr10, wr12, wr13 \n\t"
00890 "wunpckelub wr0, wr2 \n\t"
00891 "wunpckehub wr1, wr2 \n\t"
00892 "wunpckelub wr8, wr10 \n\t"
00893 "wunpckehub wr9, wr10 \n\t"
00894 "waddhus wr0, wr0, wr8 \n\t"
00895 "waddhus wr1, wr1, wr9 \n\t"
00896
00897 "1: \n\t"
00898
00899
00900 "wldrd wr12, [%[pixels]] \n\t"
00901 "cmp r12, #8 \n\t"
00902 "wldrd wr13, [%[pixels], #8] \n\t"
00903 "add %[pixels], %[pixels], %[line_size] \n\t"
00904 "walignr1 wr6, wr12, wr13 \n\t"
00905 "pld [%[pixels]] \n\t"
00906 "pld [%[pixels], #32] \n\t"
00907 "wmoveq wr10, wr13 \n\t"
00908 "walignr2ne wr10, wr12, wr13 \n\t"
00909 "wunpckelub wr4, wr6 \n\t"
00910 "wunpckehub wr5, wr6 \n\t"
00911 "wunpckelub wr8, wr10 \n\t"
00912 "wunpckehub wr9, wr10 \n\t"
00913 "waddhus wr4, wr4, wr8 \n\t"
00914 "waddhus wr5, wr5, wr9 \n\t"
00915 "waddhus wr8, wr0, wr4 \n\t"
00916 "waddhus wr9, wr1, wr5 \n\t"
00917 "waddhus wr8, wr8, wr15 \n\t"
00918 "waddhus wr9, wr9, wr15 \n\t"
00919 "wldrd wr12, [%[block]] \n\t"
00920 "wsrlhg wr8, wr8, wcgr0 \n\t"
00921 "wsrlhg wr9, wr9, wcgr0 \n\t"
00922 "wpackhus wr8, wr8, wr9 \n\t"
00923 WAVG2B" wr8, wr8, wr12 \n\t"
00924 "wstrd wr8, [%[block]] \n\t"
00925 "add %[block], %[block], %[line_size] \n\t"
00926 "wldrd wr12, [%[pixels]] \n\t"
00927 "pld [%[block]] \n\t"
00928 "pld [%[block], #32] \n\t"
00929
00930
00931
00932 "wldrd wr13, [%[pixels], #8] \n\t"
00933 "add %[pixels], %[pixels], %[line_size] \n\t"
00934 "walignr1 wr2, wr12, wr13 \n\t"
00935 "pld [%[pixels]] \n\t"
00936 "pld [%[pixels], #32] \n\t"
00937 "wmoveq wr10, wr13 \n\t"
00938 "walignr2ne wr10, wr12, wr13 \n\t"
00939 "wunpckelub wr0, wr2 \n\t"
00940 "wunpckehub wr1, wr2 \n\t"
00941 "wunpckelub wr8, wr10 \n\t"
00942 "wunpckehub wr9, wr10 \n\t"
00943 "waddhus wr0, wr0, wr8 \n\t"
00944 "waddhus wr1, wr1, wr9 \n\t"
00945 "waddhus wr8, wr0, wr4 \n\t"
00946 "waddhus wr9, wr1, wr5 \n\t"
00947 "waddhus wr8, wr8, wr15 \n\t"
00948 "waddhus wr9, wr9, wr15 \n\t"
00949 "wldrd wr12, [%[block]] \n\t"
00950 "wsrlhg wr8, wr8, wcgr0 \n\t"
00951 "wsrlhg wr9, wr9, wcgr0 \n\t"
00952 "wpackhus wr8, wr8, wr9 \n\t"
00953 "subs %[h], %[h], #2 \n\t"
00954 WAVG2B" wr8, wr8, wr12 \n\t"
00955 "wstrd wr8, [%[block]] \n\t"
00956 "add %[block], %[block], %[line_size] \n\t"
00957 "pld [%[block]] \n\t"
00958 "pld [%[block], #32] \n\t"
00959 "bne 1b \n\t"
00960 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00961 : [line_size]"r"(line_size)
00962 : "r12", "memory");
00963 }
00964
00965 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00966 {
00967
00968
00969 SET_RND(wr15);
00970 __asm__ __volatile__(
00971 "pld [%[block]] \n\t"
00972 "pld [%[block], #32] \n\t"
00973 "pld [%[pixels]] \n\t"
00974 "mov r12, #2 \n\t"
00975 "pld [%[pixels], #32] \n\t"
00976 "tmcr wcgr0, r12 \n\t"
00977
00978 "and r12, %[pixels], #7 \n\t"
00979 "bic %[pixels], %[pixels], #7 \n\t"
00980 "tmcr wcgr1, r12 \n\t"
00981 "add r12, r12, #1 \n\t"
00982 "tmcr wcgr2, r12 \n\t"
00983
00984
00985
00986 "wldrd wr12, [%[pixels]] \n\t"
00987 "cmp r12, #8 \n\t"
00988 "wldrd wr13, [%[pixels], #8] \n\t"
00989 "wldrd wr14, [%[pixels], #16] \n\t"
00990 "add %[pixels], %[pixels], %[line_size] \n\t"
00991 "pld [%[pixels]] \n\t"
00992 "walignr1 wr2, wr12, wr13 \n\t"
00993 "pld [%[pixels], #32] \n\t"
00994 "walignr1 wr3, wr13, wr14 \n\t"
00995 "wmoveq wr10, wr13 \n\t"
00996 "wmoveq wr11, wr14 \n\t"
00997 "walignr2ne wr10, wr12, wr13 \n\t"
00998 "walignr2ne wr11, wr13, wr14 \n\t"
00999 "wunpckelub wr0, wr2 \n\t"
01000 "wunpckehub wr1, wr2 \n\t"
01001 "wunpckelub wr2, wr3 \n\t"
01002 "wunpckehub wr3, wr3 \n\t"
01003 "wunpckelub wr8, wr10 \n\t"
01004 "wunpckehub wr9, wr10 \n\t"
01005 "wunpckelub wr10, wr11 \n\t"
01006 "wunpckehub wr11, wr11 \n\t"
01007 "waddhus wr0, wr0, wr8 \n\t"
01008 "waddhus wr1, wr1, wr9 \n\t"
01009 "waddhus wr2, wr2, wr10 \n\t"
01010 "waddhus wr3, wr3, wr11 \n\t"
01011
01012 "1: \n\t"
01013
01014
01015 "wldrd wr12, [%[pixels]] \n\t"
01016 "cmp r12, #8 \n\t"
01017 "wldrd wr13, [%[pixels], #8] \n\t"
01018 "wldrd wr14, [%[pixels], #16] \n\t"
01019 "add %[pixels], %[pixels], %[line_size] \n\t"
01020 "walignr1 wr6, wr12, wr13 \n\t"
01021 "pld [%[pixels]] \n\t"
01022 "pld [%[pixels], #32] \n\t"
01023 "walignr1 wr7, wr13, wr14 \n\t"
01024 "wmoveq wr10, wr13 \n\t"
01025 "wmoveq wr11, wr14 \n\t"
01026 "walignr2ne wr10, wr12, wr13 \n\t"
01027 "walignr2ne wr11, wr13, wr14 \n\t"
01028 "wunpckelub wr4, wr6 \n\t"
01029 "wunpckehub wr5, wr6 \n\t"
01030 "wunpckelub wr6, wr7 \n\t"
01031 "wunpckehub wr7, wr7 \n\t"
01032 "wunpckelub wr8, wr10 \n\t"
01033 "wunpckehub wr9, wr10 \n\t"
01034 "wunpckelub wr10, wr11 \n\t"
01035 "wunpckehub wr11, wr11 \n\t"
01036 "waddhus wr4, wr4, wr8 \n\t"
01037 "waddhus wr5, wr5, wr9 \n\t"
01038 "waddhus wr6, wr6, wr10 \n\t"
01039 "waddhus wr7, wr7, wr11 \n\t"
01040 "waddhus wr8, wr0, wr4 \n\t"
01041 "waddhus wr9, wr1, wr5 \n\t"
01042 "waddhus wr10, wr2, wr6 \n\t"
01043 "waddhus wr11, wr3, wr7 \n\t"
01044 "waddhus wr8, wr8, wr15 \n\t"
01045 "waddhus wr9, wr9, wr15 \n\t"
01046 "waddhus wr10, wr10, wr15 \n\t"
01047 "waddhus wr11, wr11, wr15 \n\t"
01048 "wsrlhg wr8, wr8, wcgr0 \n\t"
01049 "wsrlhg wr9, wr9, wcgr0 \n\t"
01050 "wldrd wr12, [%[block]] \n\t"
01051 "wldrd wr13, [%[block], #8] \n\t"
01052 "wsrlhg wr10, wr10, wcgr0 \n\t"
01053 "wsrlhg wr11, wr11, wcgr0 \n\t"
01054 "wpackhus wr8, wr8, wr9 \n\t"
01055 "wpackhus wr9, wr10, wr11 \n\t"
01056 WAVG2B" wr8, wr8, wr12 \n\t"
01057 WAVG2B" wr9, wr9, wr13 \n\t"
01058 "wstrd wr8, [%[block]] \n\t"
01059 "wstrd wr9, [%[block], #8] \n\t"
01060 "add %[block], %[block], %[line_size] \n\t"
01061
01062
01063
01064 "wldrd wr12, [%[pixels]] \n\t"
01065 "pld [%[block]] \n\t"
01066 "wldrd wr13, [%[pixels], #8] \n\t"
01067 "pld [%[block], #32] \n\t"
01068 "wldrd wr14, [%[pixels], #16] \n\t"
01069 "add %[pixels], %[pixels], %[line_size] \n\t"
01070 "walignr1 wr2, wr12, wr13 \n\t"
01071 "pld [%[pixels]] \n\t"
01072 "pld [%[pixels], #32] \n\t"
01073 "walignr1 wr3, wr13, wr14 \n\t"
01074 "wmoveq wr10, wr13 \n\t"
01075 "wmoveq wr11, wr14 \n\t"
01076 "walignr2ne wr10, wr12, wr13 \n\t"
01077 "walignr2ne wr11, wr13, wr14 \n\t"
01078 "wunpckelub wr0, wr2 \n\t"
01079 "wunpckehub wr1, wr2 \n\t"
01080 "wunpckelub wr2, wr3 \n\t"
01081 "wunpckehub wr3, wr3 \n\t"
01082 "wunpckelub wr8, wr10 \n\t"
01083 "wunpckehub wr9, wr10 \n\t"
01084 "wunpckelub wr10, wr11 \n\t"
01085 "wunpckehub wr11, wr11 \n\t"
01086 "waddhus wr0, wr0, wr8 \n\t"
01087 "waddhus wr1, wr1, wr9 \n\t"
01088 "waddhus wr2, wr2, wr10 \n\t"
01089 "waddhus wr3, wr3, wr11 \n\t"
01090 "waddhus wr8, wr0, wr4 \n\t"
01091 "waddhus wr9, wr1, wr5 \n\t"
01092 "waddhus wr10, wr2, wr6 \n\t"
01093 "waddhus wr11, wr3, wr7 \n\t"
01094 "waddhus wr8, wr8, wr15 \n\t"
01095 "waddhus wr9, wr9, wr15 \n\t"
01096 "waddhus wr10, wr10, wr15 \n\t"
01097 "waddhus wr11, wr11, wr15 \n\t"
01098 "wsrlhg wr8, wr8, wcgr0 \n\t"
01099 "wsrlhg wr9, wr9, wcgr0 \n\t"
01100 "wldrd wr12, [%[block]] \n\t"
01101 "wldrd wr13, [%[block], #8] \n\t"
01102 "wsrlhg wr10, wr10, wcgr0 \n\t"
01103 "wsrlhg wr11, wr11, wcgr0 \n\t"
01104 "wpackhus wr8, wr8, wr9 \n\t"
01105 "wpackhus wr9, wr10, wr11 \n\t"
01106 WAVG2B" wr8, wr8, wr12 \n\t"
01107 WAVG2B" wr9, wr9, wr13 \n\t"
01108 "wstrd wr8, [%[block]] \n\t"
01109 "wstrd wr9, [%[block], #8] \n\t"
01110 "add %[block], %[block], %[line_size] \n\t"
01111 "subs %[h], %[h], #2 \n\t"
01112 "pld [%[block]] \n\t"
01113 "pld [%[block], #32] \n\t"
01114 "bne 1b \n\t"
01115 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
01116 : [line_size]"r"(line_size)
01117 : "r12", "memory");
01118 }