/* (code-viewer navigation residue, kept as a comment so the header compiles:
   Home | History | Annotate | Download | only in src) */
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 /*********************************************************************************/
     19 /*  Filename: sad_inline.h                                                      */
     20 /*  Description: Implementation for in-line functions used in dct.cpp           */
     21 /*  Modified:                                                                   */
     22 /*********************************************************************************/
     23 #ifndef _SAD_INLINE_H_
     24 #define _SAD_INLINE_H_
     25 
     26 #ifdef __cplusplus
     27 extern "C"
     28 {
     29 #endif
     30 
     31 #if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* ARM GNU COMPILER  */
     32 
     33     __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
     34     {
     35         tmp = tmp - tmp2;
     36         if (tmp > 0) sad += tmp;
     37         else sad -= tmp;
     38 
     39         return sad;
     40     }
     41 
     42     __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
     43     {
     44         int32 x7;
     45 
     46         x7 = src2 ^ src1;       /* check odd/even combination */
     47         if ((uint32)src2 >= (uint32)src1)
     48         {
     49             src1 = src2 - src1;     /* subs */
     50         }
     51         else
     52         {
     53             src1 = src1 - src2;
     54         }
     55         x7 = x7 ^ src1;     /* only odd bytes need to add carry */
     56         x7 = mask & ((uint32)x7 >> 1);
     57         x7 = (x7 << 8) - x7;
     58         src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
     59         src1 = src1 ^(x7 >> 7);   /* take absolute value of negative byte */
     60 
     61         return src1;
     62     }
     63 
     64 #define NUMBER 3
     65 #define SHIFT 24
     66 
     67 #include "sad_mb_offset.h"
     68 
     69 #undef NUMBER
     70 #define NUMBER 2
     71 #undef SHIFT
     72 #define SHIFT 16
     73 #include "sad_mb_offset.h"
     74 
     75 #undef NUMBER
     76 #define NUMBER 1
     77 #undef SHIFT
     78 #define SHIFT 8
     79 #include "sad_mb_offset.h"
     80 
     81 
    /*
     * Compute the SAD (sum of absolute differences) between a 16x16
     * reference block and the current 16x16 macroblock, four pixels per
     * 32-bit word, with early termination against dmin.
     *
     *   ref  - reference pixels, row stride lx; if not 4-byte aligned,
     *          control jumps to the sad_mb_offset1/2/3 variants
     *          generated by the sad_mb_offset.h includes above
     *   blk  - current macroblock, 16 contiguous bytes per row (assumed
     *          4-byte aligned -- TODO confirm against callers)
     *   dmin - best SAD so far; once the running SAD exceeds it the
     *          loop stops and returns a partial sum >= dmin, which the
     *          caller will reject anyway
     *   lx   - reference frame line stride in bytes
     *
     * Bookkeeping: the low bytes of each packed 4-pixel SAD accumulate
     * in x5 and the high bytes in x4; both are folded together into the
     * upper halfword of x10 after every row.
     *
     * NOTE(review): pixel loads cast UChar* to uint32*, relying on the
     * toolchain tolerating that aliasing -- verify on new compilers.
     */
    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        /* dispatch on the reference pointer's byte alignment */
        x8 = (uintptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        /* pre-decrement so the loop body can use pre-increment loads */
        ref -= lx;
        blk -= 16;

        x8 = 16; /* row counter: 16 rows per macroblock */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        /* fold the separate low-byte/high-byte sums into one total held
           in the upper halfword of x10 */
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
    174 
    175 #elif defined(__CC_ARM)  /* only work with arm v5 */
    176 
    /*
     * sad += |tmp - tmp2|  (ARM RVCT inline assembly).
     * RSBS computes tmp2 - tmp and sets the flags; RSBMI negates a
     * negative result, leaving the absolute difference in tmp.
     */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }
    188 
    /*
     * Per-byte absolute difference of two 4-pixel words (ARM RVCT
     * inline assembly); asm mirror of the portable sad_4pixel branch.
     * mask must be 0x80808080.
     *
     * NOTE(review): ORRCC injects the borrow from the unsigned SUBS
     * into the correction mask (the portable branch handles the same
     * case with an explicit operand swap) -- verify on new assemblers.
     */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }
    207 
    /*
     * "N" variant of sad_4pixel (ARM RVCT inline assembly): uses ADDS
     * instead of SUBS and folds the carry back in with RRX.
     * NOTE(review): presumably src2 arrives pre-complemented from the
     * sad_mb_offset.h code paths that call this -- confirm there
     * before modifying.
     */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }
    225 
/* Fold one pair of packed 4-pixel SAD words (x10, x11) into the running
 * sums: low bytes into x5, high bytes into x4; x6 = 0xFFFF00FF.
 * NOTE(review): BIC computes x6 & ~x (not the plain AND the inherited
 * comments suggest) and SBC subtracts with borrow -- the sad_mb_offset.h
 * callers keep partial sums in complemented form; verify before editing. */
#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
    232 
    233 
    234 #define NUMBER 3
    235 #define SHIFT 24
    236 #define INC_X8 0x08000001
    237 
    238 #include "sad_mb_offset.h"
    239 
    240 #undef NUMBER
    241 #define NUMBER 2
    242 #undef SHIFT
    243 #define SHIFT 16
    244 #undef INC_X8
    245 #define INC_X8 0x10000001
    246 #include "sad_mb_offset.h"
    247 
    248 #undef NUMBER
    249 #define NUMBER 1
    250 #undef SHIFT
    251 #define SHIFT 8
    252 #undef INC_X8
    253 #define INC_X8 0x08000001
    254 #include "sad_mb_offset.h"
    255 
    256 
    /*
     * 16x16 SAD with early exit (ARM RVCT inline-assembly build).
     * Same algorithm as the portable branch: misaligned ref pointers
     * branch to sad_mb_offset1/2/3; the aligned path handles one
     * 16-pixel row per LOOP_SAD0 iteration, accumulating low bytes in
     * x5 and high bytes in x4.
     *
     * NOTE(review): the loop counter lives in the high bits of x8 and
     * is advanced by ADDLSS #0x10000001 until the add overflows the
     * flags -- presumably 16 iterations; confirm against the INC_X8
     * constants above before touching.
     */
    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            /* bottom two bits of ref select the offset variant */
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;    /* x6 = 0xFFFF00FF */
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            /* first half of the row; post-advance ref by lx, blk by 16 */
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            /* early exit when the running SAD exceeds dmin */
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }
    342 
    343 
    344 #elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
    345 
    /*
     * sad += |tmp - tmp2|  (ARM GCC inline assembly).
     * rsbs gives temp1 = tmp - tmp2 and sets the flags; rsbmi negates
     * a negative temp1; add returns sad + |tmp - tmp2| in out.
     */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        register int32 out;
        register int32 temp1;
        register int32 ss = sad;
        register int32 tt = tmp;
        register int32 uu = tmp2;

        asm volatile("rsbs  %1, %4, %3\n\t"
                     "rsbmi %1, %1, #0\n\t"
                     "add   %0, %2, %1"
             : "=&r"(out),
                     "=&r"(temp1)
                             : "r"(ss),
                             "r"(tt),
                             "r"(uu));
        return out;
    }
    364 
    /*
     * Per-byte absolute difference of two 4-pixel words (ARM GCC
     * inline assembly); mirrors the RVCT sad_4pixel above.
     * mask must be 0x80808080.  The result is built in temp1 (%1);
     * out (%0) is the correction-mask scratch register.
     */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor   %0, %3, %2\n\t"
                     "subs  %1, %3, %2\n\t"
                     "eor   %0, %0, %1\n\t"
                     "and   %0, %4, %0, lsr #1\n\t"
                     "orrcc %0, %0, #0x80000000\n\t"
                     "rsb   %0, %0, %0, lsl #8\n\t"
                     "add   %1, %1, %0, asr #7\n\t"
                     "eor   %1, %1, %0, asr #7"
             : "=&r"(out),
                     "=&r"(temp1)
                             : "r"(s1),
                             "r"(s2),
                             "r"(mm));

        return temp1;
    }
    389 
    /*
     * "N" variant of sad_4pixel (ARM GCC inline assembly): uses adds
     * instead of subs and recovers the carry with rrx.
     * NOTE(review): presumably src2 arrives pre-complemented from the
     * sad_mb_offset.h callers -- confirm there before modifying.
     */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor    %1, %3, %2\n\t"
                     "adds   %0, %3, %2\n\t"
                     "eor    %1, %1, %0\n\t"
                     "ands   %1, %4, %1,rrx\n\t"
                     "rsb    %1, %1, %1, lsl #8\n\t"
                     "sub    %0, %0, %1, asr #7\n\t"
                     "eor    %0, %0, %1, asr #7"
             : "=&r"(out),
                     "=&r"(temp1)
                             : "r"(s1),
                             "r"(s2),
                             "r"(mm));

        return (out);
    }
    413 
/* Fold one pair of packed 4-pixel SAD words (x10, x11) into the running
 * sums: low bytes into x5, high bytes into x4; x6 = 0xFFFF00FF.
 * NOTE(review): bic computes x6 & ~x (not a plain AND) and sbc subtracts
 * with borrow -- the sad_mb_offset.h callers keep partial sums in
 * complemented form; verify before editing. */
#define sum_accumulate asm volatile("sbc  %0, %0, %1\n\t" \
                                "bic  %1, %4, %1\n\t" \
                                "add  %2, %2, %1, lsr #8\n\t" \
                                "sbc  %0, %0, %3\n\t" \
                                "bic  %3, %4, %3\n\t" \
                                "add  %2, %2, %3, lsr #8" \
                                :"+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
                                :"r"(x6));
    422 
    423 #define NUMBER 3
    424 #define SHIFT 24
    425 #define INC_X8 0x08000001
    426 
    427 #include "sad_mb_offset.h"
    428 
    429 #undef NUMBER
    430 #define NUMBER 2
    431 #undef SHIFT
    432 #define SHIFT 16
    433 #undef INC_X8
    434 #define INC_X8 0x10000001
    435 #include "sad_mb_offset.h"
    436 
    437 #undef NUMBER
    438 #define NUMBER 1
    439 #undef SHIFT
    440 #define SHIFT 8
    441 #undef INC_X8
    442 #define INC_X8 0x08000001
    443 #include "sad_mb_offset.h"
    444 
    445 
    446     __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    447 {
    448         int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
    449 
    450         x9 = 0x80808080; /* const. */
    451         x4 = x5 = 0;
    452 
    453         x8 = (uint32)ref & 0x3;
    454         if (x8 == 3)
    455             goto SadMBOffset3;
    456         if (x8 == 2)
    457             goto SadMBOffset2;
    458         if (x8 == 1)
    459             goto SadMBOffset1;
    460 
    461 asm volatile("mvn %0, #0xFF00": "=r"(x6));
    462 
    463 LOOP_SAD0:
    464         /****** process 8 pixels ******/
    465         x11 = *((int32*)(ref + 12));
    466         x10 = *((int32*)(ref + 8));
    467         x14 = *((int32*)(blk + 12));
    468         x12 = *((int32*)(blk + 8));
    469 
    470         /* process x11 & x14 */
    471         x11 = sad_4pixel(x11, x14, x9);
    472 
    473         /* process x12 & x10 */
    474         x10 = sad_4pixel(x10, x12, x9);
    475 
    476         x5 = x5 + x10;  /* accumulate low bytes */
    477         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    478         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
    479         x5 = x5 + x11;  /* accumulate low bytes */
    480         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    481         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
    482 
    483         asm volatile("ldr  %0, [%4, #4]\n\t"
    484                      "ldr  %1, [%4], %6\n\t"
    485                      "ldr  %2, [%5, #4]\n\t"
    486                      "ldr  %3, [%5], #16"
    487              : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
    488                              : "r"(lx));
    489 
    490         /* process x11 & x14 */
    491         x11 = sad_4pixel(x11, x14, x9);
    492 
    493         /* process x12 & x10 */
    494         x10 = sad_4pixel(x10, x12, x9);
    495 
    496         x5 = x5 + x10;  /* accumulate low bytes */
    497         x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    498         x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
    499         x5 = x5 + x11;  /* accumulate low bytes */
    500         x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    501         x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
    502 
    503         /****************/
    504         x10 = x5 - (x4 << 8); /* extract low bytes */
    505         x10 = x10 + x4;     /* add with high bytes */
    506         x10 = x10 + (x10 << 16); /* add with lower half word */
    507 
    508         if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    509         {
    510             if (--x8)
    511             {
    512                 goto LOOP_SAD0;
    513             }
    514 
    515         }
    516 
    517         return ((uint32)x10 >> 16);
    518 
    519 SadMBOffset3:
    520 
    521         return sad_mb_offset3(ref, blk, lx, dmin);
    522 
    523 SadMBOffset2:
    524 
    525         return sad_mb_offset2(ref, blk, lx, dmin);
    526 
    527 SadMBOffset1:
    528 
    529         return sad_mb_offset1(ref, blk, lx, dmin);
    530     }
    531 
    532 #endif // OS
    533 
    534 #ifdef __cplusplus
    535 }
    536 #endif
    537 
    538 #endif // _SAD_INLINE_H_
    539 
    540