/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }
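
    /* Note: this is the portable C counterpart of the RSBS/RSBMI/ADD
       assembly sequences further below; it returns sad + |tmp - tmp2|. */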

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^(x7 >> 7);   /* take absolute value of negative byte */

        return src1;
    }
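
    /* The function above is a SWAR (SIMD-within-a-register) trick: it
       computes the four per-byte absolute differences |src1[i] - src2[i]|
       with one 32-bit subtraction, then repairs the inter-byte borrows the
       word-wide subtract introduced.  The reference model below is a
       minimal sketch added for documentation only; the guard macro
       SAD_4PIXEL_SELFTEST is hypothetical (defined nowhere in this
       codebase), so the sketch normally compiles away. */
#if defined(SAD_4PIXEL_SELFTEST)
    __inline int32 sad_4pixel_ref(int32 src1, int32 src2)
    {
        int32 i, d, out = 0;

        for (i = 0; i < 4; i++)
        {
            d = ((src1 >> (i << 3)) & 0xFF) - ((src2 >> (i << 3)) & 0xFF);
            if (d < 0) d = -d;      /* per-byte absolute difference */
            out |= d << (i << 3);   /* pack the byte lane back */
        }

        return out; /* should match sad_4pixel(src1, src2, 0x80808080) */
    }
#endif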

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
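
    /* sad_mb_offset.h is a textual template: each #include above expands
       one sad_mb_offsetN() routine specialized for a reference pointer
       that is NUMBER bytes past a word boundary, with SHIFT = 8*NUMBER
       presumably selecting how the two misaligned words are spliced.
       simd_sad_mb() below dispatches to these variants based on the low
       two bits of the ref pointer. */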


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

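        /* x8 counts the 16 rows of the macroblock; each pass through
           LOOP_SAD0 consumes one 16-pixel row as four packed words. */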
LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
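        /* x5 holds the sums of all byte lanes (low lanes exact, high lanes
           polluted by carries out of the low lanes), while x4 holds the
           clean high-lane sums.  Subtracting (x4 << 8) cancels the high
           lanes together with their carry pollution, leaving the low-lane
           sums; adding x4 then gives the low+high sums as two 16-bit
           halves, and the final self-add merges the halves so the total
           SAD lands in the top halfword. */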
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

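        /* Early termination: the partial SAD never decreases, so once it
           exceeds the best distance found so far (dmin) this candidate
           cannot win and the remaining rows are skipped. */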
        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
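
    /* A minimal usage sketch (added for illustration; the surrounding
       search loop, variable names and bounds are assumptions, not part of
       this header).  simd_sad_mb() takes a candidate position `ref` in the
       reference frame, the current 16x16 macroblock `blk` (rows stored
       contiguously, 16 bytes apart), the best SAD found so far `dmin`,
       and the reference-frame stride `lx`. */
#if 0   /* illustration only; assumes ref, blk, lx declared in scope */
    {
        int32 best = 65535;             /* above the worst possible SAD */
        int mx, my, best_mx = 0, best_my = 0;

        for (my = -8; my < 8; my++)
        {
            for (mx = -8; mx < 8; mx++)
            {
                int32 sad = simd_sad_mb(ref + my * lx + mx, blk, best, lx);
                if (sad < best) /* early-terminated calls return a value > best */
                {
                    best = sad;
                    best_mx = mx;
                    best_my = my;
                }
            }
        }
    }
#endif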

#elif defined(__CC_ARM)  /* only works with ARMv5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
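
    /* sum_accumulate is consumed by the sad_mb_offset.h templates included
       below; it folds one pair of sad_4pixelN() results into the packed
       low-byte (x5) and high-byte (x4) accumulators.  The leading SBC also
       absorbs the carry flag left by the preceding sad_4pixelN() call, so
       the two pieces are meant to run back-to-back with no flag-clobbering
       code scheduled in between. */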


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

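        /* MOVS shifts the two low address bits of ref into the flags:
           N = bit 0, C = bit 1.  Both set -> offset 3 (BHI), only bit 1
           -> offset 2 (BCS), only bit 0 -> offset 1 (BMI); otherwise ref
           is word-aligned and falls through to the fast path. */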
        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */
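
    /* Caution: this condition is identical to the first #if branch above,
       so as written this inline-assembly path is never compiled; the
       plain-C version earlier in the file shadows it.  The asm statements
       below nevertheless use read-write ("+r") and early-clobber ("=&r")
       constraints plus "cc"/"memory" clobbers, so the code is well-defined
       should this branch ever be enabled. */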

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        /* sad += |tmp - tmp2|; sad and tmp are read and written ("+r"),
           and RSBS sets the flags, hence the "cc" clobber */
__asm__ volatile("rsbs	%1, %1, %2\n\trsbmi %1, %1, #0\n\tadd	%0, %0, %1": "+r"(sad), "+r"(tmp): "r"(tmp2): "cc");
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        /* src1 is read before it is written, so it must be "+r"; SUBS and
           ORRCC touch the flags, hence the "cc" clobber */
__asm__ volatile("EOR	%1, %2, %0\n\tSUBS  %0, %2, %0\n\tEOR	%1, %1, %0\n\tAND  %1, %3, %1, lsr #1\n\tORRCC	%1, %1, #0x80000000\n\tRSB  %1, %1, %1, lsl #8\n\tADD  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "+r"(src1), "=&r"(x7): "r"(src2), "r"(mask): "cc");

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        /* as above: src1 is read-modify-write, and ADDS/ANDS set the flags
           while "rrx" consumes the carry, hence "+r" and the "cc" clobber */
__asm__ volatile("EOR	%1, %2, %0\n\tADDS  %0, %2, %0\n\tEOR  %1, %1, %0\n\tANDS  %1, %3, %1, rrx\n\tRSB  %1, %1, %1, lsl #8\n\tSUB	%0, %0, %1, asr #7\n\tEOR   %0, %0, %1, asr #7": "+r"(src1), "=&r"(x7): "r"(src2), "r"(mask): "cc");

        return src1;
    }

/* all four accumulators are read and written, so they use "+r"; note that
   the leading SBC consumes the carry flag left by the preceding
   sad_4pixelN() call, an implicit cross-statement dependency */
#define sum_accumulate  __asm__ volatile("SBC  %0, %0, %1\n\tBIC   %1, %4, %1\n\tADD   %2, %2, %1, lsr #8\n\tSBC   %0, %0, %3\n\tBIC   %3, %4, %3\n\tADD   %2, %2, %3, lsr #8": "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11): "r" (x6));

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;
__asm__ volatile("MVN	%0, #0xFF00": "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        /* post-indexed load: x10 = *ref, then ref += lx; ref is updated
           in place ("+r"), and the load reads memory */
__asm__ volatile("LDR	%0, [%1], %2": "=&r"(x10), "+r"(ref): "r"(lx): "memory");
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
        /* post-indexed load: x12 = *blk, then blk += 16 */
__asm__ volatile("LDR	%0, [%1], #16": "=&r"(x12), "+r"(blk): : "memory");

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_
