/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* Intentionally not using the gcc asm version, since it is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

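    /* Accumulate the absolute difference |tmp - tmp2| into the running SAD. */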
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }

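    /* Sum-of-absolute-differences helper for four pixels packed one per byte
     * in src1 and src2 (SIMD-within-a-register).  mask is 0x80808080; x7
     * recovers the per-byte borrow of the packed subtraction so that negative
     * byte differences can be corrected to absolute values.  The result is
     * the four absolute differences packed one per byte, which the caller
     * then accumulates. */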
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^(x7 >> 7);   /* take absolute value of negative byte */

        return src1;
    }

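/* sad_mb_offset.h is included three times with different NUMBER/SHIFT values
 * to generate the sad_mb_offset3, sad_mb_offset2 and sad_mb_offset1 routines
 * used below when ref is misaligned by 3, 2 or 1 bytes. */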
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


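    /* SAD of a 16x16 macroblock.  The low two bits of ref select one of the
     * offset routines for unaligned addresses; otherwise each row is read as
     * four aligned 32-bit words.  x5 accumulates the packed byte differences
     * directly while x4 accumulates only the high bytes, and the two sums are
     * recombined into the total SAD once per row so the loop can exit early
     * as soon as the partial SAD exceeds dmin. */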
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }

#elif defined(__CC_ARM)  /* only work with arm v5 */

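    /* ARM RVCT (__CC_ARM) versions of the same primitives, using compiler-
     * specific __asm blocks; they implement the same SAD computation as the
     * generic C path above. */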
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

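    /* ADDS/rrx variant of sad_4pixel.  It is not called in this file and is
     * presumably used by the unaligned-offset code included from
     * sad_mb_offset.h. */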
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

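/* sum_accumulate: the accumulate-low-bytes / accumulate-high-bytes step of
 * the SAD loop as a single asm block, defined ahead of the sad_mb_offset.h
 * includes below. */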
#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


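    /* RVCT version of simd_sad_mb.  MOVS x8, ref, lsl #31 puts bit 1 of ref
     * into the carry flag and bit 0 into the sign flag, so BHI/BCS/BMI branch
     * to the offset-3/2/1 routines when ref is unaligned. */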
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

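    /* GCC inline-asm equivalents of the RVCT primitives above.  Because the
     * !defined(__CC_ARM) test at the top also covers GCC, this branch is
     * currently compiled out in favour of the generic C version (see the
     * comment near the top of the file); it is kept for reference. */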
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm__ volatile(
            "rsbs       %1, %1, %2\n\t"
            "rsbmi      %1, %1, #0\n\t"
            "add        %0, %0, %1"
            : "+r"(sad), "+r"(tmp)
            : "r"(tmp2)
        );
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "SUBS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "AND        %1, %3, %1, lsr #1\n\t"
            "ORRCC      %1, %1, #0x80000000\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "ADD        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "ADDS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "ANDS       %1, %3, %1, rrx\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "SUB        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

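/* GCC inline-asm form of the accumulate-low-bytes / accumulate-high-bytes
 * step, defined ahead of the sad_mb_offset.h includes below. */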
#define sum_accumulate  __asm__ volatile(              \
    "SBC   %0, %0, %1\n\t"                             \
    "BIC   %1, %4, %1\n\t"                             \
    "ADD   %2, %2, %1, lsr #8\n\t"                     \
    "SBC   %0, %0, %3\n\t"                             \
    "BIC   %3, %4, %3\n\t"                             \
    "ADD   %2, %2, %3, lsr #8"                         \
    : "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11)     \
    : "r" (x6)                                         \
    );

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


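    /* GCC version of simd_sad_mb: same structure as the generic C version,
     * with inline asm used for the post-indexed loads that advance ref and
     * blk. */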
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;
        __asm__ volatile("MVN   %0, #0xFF00": "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        __asm__ volatile("LDR   %0, [%1], %2": "=&r"(x10), "+r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
        __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_
