/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*
 * SIMD-in-a-register SAD (sum of absolute differences) kernels for 16x16
 * macroblock motion estimation.  Each 32-bit load covers four 8-bit pixels,
 * and the byte-wise absolute differences are computed in parallel inside
 * ordinary registers using carry/borrow bit tricks.
 *
 * Three mutually exclusive implementations are selected by compiler:
 *   1. plain C fallback            (#if   __GNUC__ && __arm__)
 *   2. ARM ADS/RealView __asm      (#elif __CC_ARM, ARMv5)
 *   3. GCC extended inline asm     (#elif __GNUC__ && __arm__)
 *
 * sad_mb_offset.h is a textual template: it is #included three times per
 * branch with different NUMBER/SHIFT (and INC_X8 for the asm branches) to
 * generate sad_mb_offset1/2/3, the handlers for reference pointers that are
 * 1, 2 or 3 bytes off 32-bit alignment.
 *
 * NOTE(review): int32/uint32/uint8 are assumed to be typedef'd by the
 * including translation unit (presumably mp4def.h) — confirm at the caller.
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

/* Accumulate |tmp - tmp2| into the running SAD total (scalar fallback). */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    tmp = tmp - tmp2;
    if (tmp > 0) sad += tmp;
    else sad -= tmp;

    return sad;
}

/*
 * Byte-parallel absolute difference of two words holding 4 packed pixels.
 * Returns a word whose four bytes are |src1[i] - src2[i]|.
 * mask must be 0x80808080 (per-byte sign-bit mask).
 *
 * Method: subtract the whole words (choosing the direction that avoids a
 * word-level borrow), then use src1^src2^diff to recover the per-byte
 * borrows and patch up the bytes that went negative.
 */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    x7 = src2 ^ src1;       /* check odd/even combination */
    if ((uint32)src2 >= (uint32)src1)
    {
        src1 = src2 - src1;     /* subs */
    }
    else
    {
        src1 = src1 - src2;
    }
    x7 = x7 ^ src1;     /* only odd bytes need to add carry */
    x7 = mask & ((uint32)x7 >> 1);  /* per-byte borrow flags in bit 7 */
    x7 = (x7 << 8) - x7;            /* 0xFF (or 0x7F80) correction per negative byte */
    src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
    src1 = src1 ^(x7 >> 7);  /* take absolute value of negative byte */

    return src1;
}

/* Generate sad_mb_offset3/2/1 for 3-, 2- and 1-byte misaligned ref pointers. */
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


/*
 * SAD of one 16x16 macroblock: ref (stride lx, arbitrary alignment) vs.
 * blk (16-byte packed, assumed 4-byte aligned).  Early-exits a row loop
 * iteration check once the partial SAD exceeds dmin (current best match),
 * and dispatches to sad_mb_offsetN when ref is not 4-byte aligned.
 *
 * Accumulator layout: x5 sums all bytes (low half), x4 separately sums the
 * high bytes so the per-byte partial sums can be recombined without
 * overflowing across byte lanes; the final SAD ends up in x10 >> 16.
 *
 * NOTE(review): the uint32* loads from a uint8* violate strict aliasing;
 * alignment is guaranteed by the x8 dispatch above, but aliasing is
 * technically UB — presumably relied on as a compiler-specific extension.
 */
__inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */

    x8 = (uint32)ref & 0x3;  /* dispatch on ref alignment (0..3) */
    if (x8 == 3)
        goto SadMBOffset3;
    if (x8 == 2)
        goto SadMBOffset2;
    if (x8 == 1)
        goto SadMBOffset1;

//  x5 = (x4<<8)-x4;        /* x5 = x4*255; */
    x4 = x5 = 0;

    x6 = 0xFFFF00FF;

    ref -= lx;   /* pre-decrement; loop below uses (ref += lx) / (blk += 16) */
    blk -= 16;

    x8 = 16;     /* 16 rows */

LOOP_SAD0:
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));
    x11 = *((uint32*)(ref + 4));
    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));
    x11 = *((uint32*)(ref + 12));
    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    /* fold the per-lane partial sums into a single 16-bit SAD in x10>>16 */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
            goto LOOP_SAD0;  /* still under the best SAD: keep going */
        }

    }
    /* either all 16 rows done, or partial SAD already exceeds dmin */

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin);

}

#elif defined(__CC_ARM)  /* only work with arm v5 */

/* Accumulate |tmp - tmp2| into sad using conditional execution (no branch). */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    __asm
    {
        rsbs    tmp, tmp, tmp2 ;
        rsbmi   tmp, tmp, #0 ;
        add     sad, sad, tmp ;
    }

    return sad;
}

/*
 * Byte-parallel |src1 - src2| for 4 packed pixels (ADS/RealView asm).
 * mask must be 0x80808080.  The borrow out of SUBS (carry clear) is folded
 * into the top correction byte via ORRCC.
 */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    __asm
    {
        EOR     x7, src2, src1;     /* check odd/even combination */
        SUBS    src1, src2, src1;
        EOR     x7, x7, src1;
        AND     x7, mask, x7, lsr #1;
        ORRCC   x7, x7, #0x80000000;  /* word-level borrow -> correct top byte */
        RSB     x7, x7, x7, lsl #8;
        ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
        EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
    }

    return src1;
}

/*
 * Variant of sad_4pixel for the misaligned (offset) path: the second operand
 * arrives pre-negated, so ADDS is used and the carry is folded in via RRX.
 * Used by the sad_mb_offset.h instantiations below.
 */
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    __asm
    {
        EOR      x7, src2, src1;        /* check odd/even combination */
        ADDS     src1, src2, src1;
        EOR      x7, x7, src1;      /* only odd bytes need to add carry */
        ANDS     x7, mask, x7, rrx;
        RSB      x7, x7, x7, lsl #8;
        SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
        EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
    }

    return src1;
}

/* Shared accumulation step for the offset template: fold one 8-pixel pair of
 * byte-wise differences (x10, x11) into the low-byte (x5) / high-byte (x4)
 * running sums.  x6 holds ~0xFF00 (= 0xFFFF00FF). */
#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;  /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */


/* INC_X8 packs the loop-advance constant used by the offset template's
 * combined counter/flags trick. */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


/*
 * 16x16 macroblock SAD, ADS/RealView asm flavor.  Same contract and
 * accumulator layout as the C version above; alignment dispatch is done by
 * shifting the low ref bits into the flags (MOVS ... lsl #31) and testing
 * HI/CS/MI for offsets 3/2/1.
 *
 * NOTE(review): the loop count here is carried in x8 via the overflow of
 * repeated ADDLSS #0x10000001 rather than an explicit counter — confirm
 * against sad_mb_offset.h before touching.  The offset handlers in this
 * branch take x8 as an extra argument, unlike the GCC branches.
 */
__inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm
    {
        MOVS    x8, ref, lsl #31 ;  /* bits 0-1 of ref -> flags */
        BHI     SadMBOffset3;
        BCS     SadMBOffset2;
        BMI     SadMBOffset1;

        MVN     x6, #0xFF00;        /* x6 = 0xFFFF00FF */
    }
LOOP_SAD0:
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    __asm
    {
        /****** process 8 pixels ******/
        /* post-indexed loads also advance ref by lx and blk by 16 */
        LDR     x11, [ref, #4];
        LDR     x10, [ref], lx ;
        LDR     x14, [blk, #4];
        LDR     x12, [blk], #16 ;
    }

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    __asm
    {
        /****************/
        /* loop while partial SAD <= dmin and rows remain (flags from ADDLSS) */
        RSBS    x11, dmin, x10, lsr #16;
        ADDLSS  x8, x8, #0x10000001;
        BLS     LOOP_SAD0;
    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin, x8);
}


/* NOTE(review): this condition is identical to the first #if above, so this
 * entire GCC-inline-asm branch is unreachable dead code as written — GCC/ARM
 * builds always take the plain-C branch.  Presumably the branch order was
 * changed at some point to prefer the C version; confirm before removing. */
#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

/* Accumulate |tmp - tmp2| into sad (GCC extended asm, conditional RSB). */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    __asm__ volatile("rsbs	%1, %1, %2\n\trsbmi %1, %1, #0\n\tadd  %0, %0, %1": "=r"(sad): "r"(tmp), "r"(tmp2));
    return sad;
}

/* Byte-parallel |src1 - src2| for 4 packed pixels; mask = 0x80808080.
 * Same algorithm as the __CC_ARM version, expressed as one asm statement. */
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm__ volatile("EOR	%1, %2, %0\n\tSUBS  %0, %2, %0\n\tEOR	%1, %1, %0\n\tAND  %1, %3, %1, lsr #1\n\tORRCC	%1, %1, #0x80000000\n\tRSB  %1, %1, %1, lsl #8\n\tADD  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

    return src1;
}

/* Offset-path variant (second operand pre-negated): ADDS + RRX carry fold.
 * Used by the sad_mb_offset.h instantiations below. */
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm__ volatile("EOR	%1, %2, %0\n\tADDS  %0, %2, %0\n\tEOR  %1, %1, %0\n\tANDS  %1, %3, %1, rrx\n\tRSB  %1, %1, %1, lsl #8\n\tSUB  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

    return src1;
}

/* Fold one 8-pixel pair of byte differences into the x5/x4 running sums;
 * x6 holds 0xFFFF00FF.  Mirrors the __CC_ARM sum_accumulate. */
#define sum_accumulate  __asm__ volatile("SBC  %0, %0, %1\n\tBIC   %1, %4, %1\n\tADD   %2, %2, %1, lsr #8\n\tSBC   %0, %0, %3\n\tBIC   %3, %4, %3\n\tADD   %2, %2, %3, lsr #8": "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11): "r" (x6));

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


/*
 * 16x16 macroblock SAD, GCC-inline-asm flavor.  Same contract and
 * accumulator layout as the plain-C version; alignment dispatch and the
 * row loop are in C, with asm only for the post-indexed loads that also
 * advance ref/blk.
 *
 * NOTE(review): unlike the plain-C branch, the dmin comparison here is
 * unsigned ((uint32) <= int promotes dmin to unsigned) — harmless while
 * dmin >= 0, but inconsistent with the (int) cast used above.
 */
__inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    x8 = (uint32)ref & 0x3;  /* dispatch on ref alignment (0..3) */
    if (x8 == 3)
        goto SadMBOffset3;
    if (x8 == 2)
        goto SadMBOffset2;
    if (x8 == 1)
        goto SadMBOffset1;

    x8 = 16;  /* 16 rows */

    __asm__ volatile("MVN	%0, #0xFF00": "=r"(x6));  /* x6 = 0xFFFF00FF */

LOOP_SAD0:
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    /* post-indexed loads advance ref by lx and blk by 16 as a side effect */
    __asm__ volatile("LDR	%0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx));
    //x10 = *((int32*)ref); ref+=lx;
    x14 = *((int32*)(blk + 4));
    __asm__ volatile("LDR	%0, [%1], #16": "=&r"(x12), "=r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /****************/

    if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
            goto LOOP_SAD0;  /* still under the best SAD: keep going */
        }

    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin);
}


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_