/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

/*
 * SAD (sum of absolute differences) kernels for 16x16 macroblock motion
 * estimation, implemented SWAR-style: four 8-bit pixels are processed in
 * parallel inside one 32-bit register.  Three implementations are selected
 * by the preprocessor below: a portable C version, an ARM armcc (__CC_ARM)
 * assembly version, and a GCC-on-ARM inline-assembly version.
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* Intentionally not using the gcc asm version, since it is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

/* Accumulate |tmp - tmp2| into the running SAD and return the new total. */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    tmp = tmp - tmp2;
    if (tmp > 0) sad += tmp;
    else sad -= tmp;

    return sad;
}

/*
 * Byte-wise absolute difference of two words holding 4 packed pixels each.
 * Returns a word whose 4 bytes are the per-byte |src1 - src2|, summed
 * later via the split low/high-byte accumulators in simd_sad_mb.
 * 'mask' must be the constant 0x80808080 (per-byte sign-bit mask).
 */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    x7 = src2 ^ src1;       /* check odd/even combination */
    if ((uint32)src2 >= (uint32)src1)
    {
        src1 = src2 - src1;     /* subs */
    }
    else
    {
        src1 = src1 - src2;
    }
    x7 = x7 ^ src1;     /* only odd bytes need to add carry */
    x7 = mask & ((uint32)x7 >> 1);  /* isolate per-byte borrow indicators */
    x7 = (x7 << 8) - x7;            /* per-byte 0xFF where the difference went negative */
    src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
    src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */

    return src1;
}

/*
 * sad_mb_offset.h is a template body included three times; NUMBER/SHIFT
 * parameterize it to generate sad_mb_offset3/2/1 for reference pointers
 * misaligned by 3, 2 and 1 bytes respectively.
 */
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


/*
 * SAD of a 16x16 macroblock.
 *   ref  - candidate reference area, row stride lx (need not be aligned;
 *          misaligned cases dispatch to sad_mb_offset1/2/3)
 *   blk  - current block, rows packed at stride 16
 *   dmin - best SAD so far; the row loop exits early once the partial SAD
 *          exceeds it, returning the partial value
 *   lx   - reference frame line pitch in bytes
 * Returns the (possibly partial, if early-exited) SAD.
 */
__inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */

    /* Dispatch on the low 2 bits of ref: the word-load loop below needs
     * 4-byte alignment; sad_mb_offsetN handles N-byte misalignment. */
    x8 = (intptr_t)ref & 0x3;
    if (x8 == 3)
        goto SadMBOffset3;
    if (x8 == 2)
        goto SadMBOffset2;
    if (x8 == 1)
        goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
    x4 = x5 = 0;    /* x5: sum of all bytes; x4: sum of high (odd) bytes */

    x6 = 0xFFFF00FF;

    /* Bias pointers so the loop's pre-increment loads start at row 0. */
    ref -= lx;
    blk -= 16;

    x8 = 16;    /* row counter: 16 rows of 16 pixels */

LOOP_SAD0:
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));
    x11 = *((uint32*)(ref + 4));
    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));
    x11 = *((uint32*)(ref + 12));
    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    /* Fold the split accumulators into a single SAD in the top halfword. */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
            goto LOOP_SAD0;
        }

    }
    /* Falls through with a partial SAD if dmin was exceeded (early exit). */

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin);

}

#elif defined(__CC_ARM)  /* only work with arm v5 */

/* armcc __asm version: |tmp - tmp2| added to sad via conditional RSB. */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    __asm
    {
        rsbs    tmp, tmp, tmp2 ;
        rsbmi   tmp, tmp, #0 ;
        add     sad, sad, tmp ;
    }

    return sad;
}

/*
 * armcc version of the byte-wise absolute difference of 4 packed pixels;
 * same contract as the C sad_4pixel above.  mask must be 0x80808080.
 * NOTE(review): also leaves the flags from SUBS set — confirm whether
 * callers in sad_mb_offset.h depend on that (see sum_accumulate's SBC).
 */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm
    {
        EOR     x7, src2, src1;     /* check odd/even combination */
        SUBS    src1, src2, src1;
        EOR     x7, x7, src1;
        AND     x7, mask, x7, lsr #1;
        ORRCC   x7, x7, #0x80000000;
        RSB     x7, x7, x7, lsl #8;
        ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
        EOR     src1, src1, x7, asr #7;    /* take absolute value of negative byte */
    }

    return src1;
}

/*
 * Variant of sad_4pixel built around ADDS/rrx instead of SUBS; presumably
 * used by the sad_mb_offset.h expansions for the misaligned paths —
 * verify against that header.
 */
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm
    {
        EOR      x7, src2, src1;        /* check odd/even combination */
        ADDS     src1, src2, src1;
        EOR      x7, x7, src1;      /* only odd bytes need to add carry */
        ANDS     x7, mask, x7, rrx;
        RSB      x7, x7, x7, lsl #8;
        SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
        EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
    }

    return src1;
}

/*
 * Assembly equivalent of the six C accumulation statements in the main
 * loop.  NOTE(review): uses SBC, i.e. it consumes the carry flag left by
 * the preceding instruction sequence — the expansion order inside
 * sad_mb_offset.h is load-bearing; do not reorder.
 */
#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */


/* Template includes: NUMBER/SHIFT/INC_X8 generate sad_mb_offset3/2/1. */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


/*
 * armcc version of the 16x16 macroblock SAD; same contract as the generic
 * simd_sad_mb (ref with stride lx, blk with stride 16, early exit against
 * dmin).  Mixes C statements with __asm blocks; the flag state carried
 * between them is significant — treat the statement order as fixed.
 */
__inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;    /* x5: sum of all bytes; x4: sum of high (odd) bytes */

    __asm
    {
        /* Shift the low 2 bits of ref into N/C flags to test alignment:
         * 11 -> HI, 10 -> CS, 01 -> MI; fall through when word-aligned. */
        MOVS    x8, ref, lsl #31 ;
        BHI     SadMBOffset3;
        BCS     SadMBOffset2;
        BMI     SadMBOffset1;

        MVN     x6, #0xFF00;    /* x6 = 0xFFFF00FF */
    }
LOOP_SAD0:
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

    __asm
    {
        /****** process 8 pixels ******/
        /* Post-indexed loads also advance ref by lx and blk by 16. */
        LDR     x11, [ref, #4];
        LDR     x10, [ref], lx ;
        LDR     x14, [blk, #4];
        LDR     x12, [blk], #16 ;
    }

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

    /****************/
    /* Fold the split accumulators into a single SAD in the top halfword. */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    __asm
    {
        /****************/
        /* Compare SAD (x10 >> 16) against dmin; while not exceeded,
         * bump the packed row counter in x8 and iterate. */
        RSBS    x11, dmin, x10, lsr #16;
        ADDLSS  x8, x8, #0x10000001;
        BLS     LOOP_SAD0;
    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin, x8);
}


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

/* GCC inline-asm version: |tmp - tmp2| added to sad via conditional RSB. */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    __asm__ volatile(
        "rsbs   %1, %1, %2\n\t"
        "rsbmi  %1, %1, #0\n\t"
        "add    %0, %0, %1"
        : "+r"(sad), "+r"(tmp)
        : "r"(tmp2)
    );
    return sad;
}

/*
 * GCC inline-asm byte-wise absolute difference of 4 packed pixels; same
 * contract as the C sad_4pixel in the generic branch.  mask must be
 * 0x80808080.
 */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm__ volatile(
        "EOR    %1, %2, %0\n\t"
        "SUBS   %0, %2, %0\n\t"
        "EOR    %1, %1, %0\n\t"
        "AND    %1, %3, %1, lsr #1\n\t"
        "ORRCC  %1, %1, #0x80000000\n\t"
        "RSB    %1, %1, %1, lsl #8\n\t"
        "ADD    %0, %0, %1, asr #7\n\t"
        "EOR    %0, %0, %1, asr #7"
        : "+r"(src1), "=&r"(x7)
        : "r"(src2), "r"(mask)
    );

    return src1;
}

/*
 * ADDS/rrx variant of sad_4pixel; presumably consumed by the
 * sad_mb_offset.h expansions below — verify against that header.
 */
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm__ volatile(
        "EOR     %1, %2, %0\n\t"
        "ADDS    %0, %2, %0\n\t"
        "EOR     %1, %1, %0\n\t"
        "ANDS    %1, %3, %1, rrx\n\t"
        "RSB     %1, %1, %1, lsl #8\n\t"
        "SUB     %0, %0, %1, asr #7\n\t"
        "EOR     %0, %0, %1, asr #7"
        : "+r"(src1), "=&r"(x7)
        : "r"(src2), "r"(mask)
    );

    return src1;
}

/*
 * Assembly accumulation step for the offset paths.  NOTE(review): SBC
 * consumes the carry flag produced by earlier instructions in the
 * sad_mb_offset.h expansion — the asm statements are order-sensitive.
 */
#define sum_accumulate  __asm__ volatile( \
        "SBC  %0, %0, %1\n\t" \
        "BIC  %1, %4, %1\n\t" \
        "ADD  %2, %2, %1, lsr #8\n\t" \
        "SBC  %0, %0, %3\n\t" \
        "BIC  %3, %4, %3\n\t" \
        "ADD  %2, %2, %3, lsr #8" \
        : "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11) \
        : "r" (x6) \
    );

/* Template includes: NUMBER/SHIFT/INC_X8 generate sad_mb_offset3/2/1. */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2 413 #undef SHIFT 414 #define SHIFT 16 415 #undef INC_X8 416 #define INC_X8 0x10000001 417 #include "sad_mb_offset.h" 418 419 #undef NUMBER 420 #define NUMBER 1 421 #undef SHIFT 422 #define SHIFT 8 423 #undef INC_X8 424 #define INC_X8 0x08000001 425 #include "sad_mb_offset.h" 426 427 428 __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx) 429 { 430 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14; 431 432 x9 = 0x80808080; /* const. */ 433 x4 = x5 = 0; 434 435 x8 = (uint32)ref & 0x3; 436 if (x8 == 3) 437 goto SadMBOffset3; 438 if (x8 == 2) 439 goto SadMBOffset2; 440 if (x8 == 1) 441 goto SadMBOffset1; 442 443 x8 = 16; 444 /// 445 __asm__ volatile("MVN %0, #0xFF00": "=r"(x6)); 446 447 LOOP_SAD0: 448 /****** process 8 pixels ******/ 449 x11 = *((int32*)(ref + 12)); 450 x10 = *((int32*)(ref + 8)); 451 x14 = *((int32*)(blk + 12)); 452 x12 = *((int32*)(blk + 8)); 453 454 /* process x11 & x14 */ 455 x11 = sad_4pixel(x11, x14, x9); 456 457 /* process x12 & x10 */ 458 x10 = sad_4pixel(x10, x12, x9); 459 460 x5 = x5 + x10; /* accumulate low bytes */ 461 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 462 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 463 x5 = x5 + x11; /* accumulate low bytes */ 464 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */ 465 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 466 467 /****** process 8 pixels ******/ 468 x11 = *((int32*)(ref + 4)); 469 __asm__ volatile("LDR %0, [%1], %2": "=&r"(x10), "+r"(ref): "r"(lx)); 470 //x10 = *((int32*)ref); ref+=lx; 471 x14 = *((int32*)(blk + 4)); 472 __asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "+r"(blk)); 473 474 /* process x11 & x14 */ 475 x11 = sad_4pixel(x11, x14, x9); 476 477 /* process x12 & x10 */ 478 x10 = sad_4pixel(x10, x12, x9); 479 480 x5 = x5 + x10; /* accumulate low bytes */ 481 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 482 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 483 x5 = x5 + x11; /* accumulate low bytes */ 484 x11 = x11 & 
(x6 << 8); /* x11 & 0xFF00FF00 */ 485 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 486 487 /****************/ 488 x10 = x5 - (x4 << 8); /* extract low bytes */ 489 x10 = x10 + x4; /* add with high bytes */ 490 x10 = x10 + (x10 << 16); /* add with lower half word */ 491 492 /****************/ 493 494 if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */ 495 { 496 if (--x8) 497 { 498 goto LOOP_SAD0; 499 } 500 501 } 502 503 return ((uint32)x10 >> 16); 504 505 SadMBOffset3: 506 507 return sad_mb_offset3(ref, blk, lx, dmin); 508 509 SadMBOffset2: 510 511 return sad_mb_offset2(ref, blk, lx, dmin); 512 513 SadMBOffset1: 514 515 return sad_mb_offset1(ref, blk, lx, dmin); 516 } 517 518 519 #endif 520 521 #ifdef __cplusplus 522 } 523 #endif 524 525 #endif // _SAD_INLINE_H_ 526 527