Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 /*********************************************************************************/
     19 /*  Filename: sad_mb_offset.h                                                       */
     20 /*  Description: Implementation for in-line functions used in dct.cpp           */
     21 /*  Modified:                                                                   */
     22 /*********************************************************************************/
     23 
     24 #if !defined(PV_ARM_GCC_V4) && !defined(PV_ARM_GCC_V5) /* ARM GNU COMPILER  */
     25 
     26 #if (NUMBER==3)
     27 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
     28 #elif (NUMBER==2)
     29 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
     30 #elif (NUMBER==1)
     31 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
     32 #endif
     33 {
     34     int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
     35 
     36     //  x5 = (x4<<8) - x4;
     37     x4 = x5 = 0;
     38     x6 = 0xFFFF00FF;
     39     x9 = 0x80808080; /* const. */
     40     ref -= NUMBER; /* bic ref, ref, #3 */
     41     ref -= lx;
     42     blk -= 16;
     43     x8 = 16;
     44 
     45 #if (NUMBER==3)
     46 LOOP_SAD3:
     47 #elif (NUMBER==2)
     48 LOOP_SAD2:
     49 #elif (NUMBER==1)
     50 LOOP_SAD1:
     51 #endif
     52     /****** process 8 pixels ******/
     53     x10 = *((uint32*)(ref += lx)); /* D C B A */
     54     x11 = *((uint32*)(ref + 4));    /* H G F E */
     55     x12 = *((uint32*)(ref + 8));    /* L K J I */
     56 
     57     x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
     58     x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
     59     x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
     60     x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
     61 
     62     x12 = *((uint32*)(blk += 16));
     63     x14 = *((uint32*)(blk + 4));
     64 
     65     /* process x11 & x14 */
     66     x11 = sad_4pixel(x11, x14, x9);
     67 
     68     /* process x12 & x10 */
     69     x10 = sad_4pixel(x10, x12, x9);
     70 
     71     x5 = x5 + x10; /* accumulate low bytes */
     72     x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
     73     x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
     74     x5 = x5 + x11;  /* accumulate low bytes */
     75     x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
     76     x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
     77 
     78     /****** process 8 pixels ******/
     79     x10 = *((uint32*)(ref + 8)); /* D C B A */
     80     x11 = *((uint32*)(ref + 12));   /* H G F E */
     81     x12 = *((uint32*)(ref + 16));   /* L K J I */
     82 
     83     x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
     84     x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
     85     x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
     86     x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
     87 
     88     x12 = *((uint32*)(blk + 8));
     89     x14 = *((uint32*)(blk + 12));
     90 
     91     /* process x11 & x14 */
     92     x11 = sad_4pixel(x11, x14, x9);
     93 
     94     /* process x12 & x10 */
     95     x10 = sad_4pixel(x10, x12, x9);
     96 
     97     x5 = x5 + x10; /* accumulate low bytes */
     98     x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
     99     x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    100     x5 = x5 + x11;  /* accumulate low bytes */
    101     x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    102     x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
    103 
    104     /****************/
    105     x10 = x5 - (x4 << 8); /* extract low bytes */
    106     x10 = x10 + x4;     /* add with high bytes */
    107     x10 = x10 + (x10 << 16); /* add with lower half word */
    108 
    109     if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    110     {
    111         if (--x8)
    112         {
    113 #if (NUMBER==3)
    114             goto         LOOP_SAD3;
    115 #elif (NUMBER==2)
    116             goto         LOOP_SAD2;
    117 #elif (NUMBER==1)
    118             goto         LOOP_SAD1;
    119 #endif
    120         }
    121 
    122     }
    123 
    124     return ((uint32)x10 >> 16);
    125 }
    126 
    127 #elif defined(__CC_ARM)  /* only work with arm v5 */
    128 
/*
 * sad_mb_offset1 / sad_mb_offset2 / sad_mb_offset3 -- ARM __CC_ARM
 * inline-assembler variant (the function name is selected by NUMBER).
 *
 * Each pass through the loop reads five 32-bit words of one reference row
 * (byte offsets 0..19 from the word-aligned ref) and 16 bytes of blk,
 * feeds bit-complemented, re-aligned reference words to sad_4pixelN() and
 * accumulates through the sum_accumulate macro.  The loop repeats while the
 * folded total (x10 >> 16) is not above dmin and the x8 counter update
 * still yields the LS condition.
 *
 * ref   reference pixels; its two low address bits are cleared on entry and
 *        it is advanced by lx once per pass.
 * blk   current block; advanced by 16 bytes once per pass.
 * lx    post-index increment applied to ref (row pitch in bytes).
 * dmin  early-exit threshold, compared unsigned against the running total.
 * x8    loop counter supplied by the caller; INC_X8 is added once per pass.
 *
 * Returns x10 >> 16, the last value that was compared against dmin.
 *
 * NOTE(review): sad_4pixelN, sum_accumulate, SHIFT and INC_X8 are defined
 * outside this file; comments about them below are assumptions to confirm.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* constant third argument of sad_4pixelN */
    x4 = x5 = 0;     /* accumulators; not written again in the visible code, so
                        presumably updated by sum_accumulate -- TODO confirm */

    /* x6 = ~0x00FF0000 (0xFF00FFFF); ref is rounded down to a 4-byte boundary.
       x6 is not referenced again in the visible code, so it is presumably an
       operand of sum_accumulate.
       NOTE(review): the other two variants in this file set x6 to 0xFFFF00FF --
       confirm which constant sum_accumulate expects. */
    __asm{
        MVN      x6, #0xff0000;
        BIC      ref, ref, #3;

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    }
    /****** process 8 pixels ******/
    /* second half of the row is handled first: words at ref+8, ref+12, ref+16
       and the block word at blk+12 */
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* x10 = ~((x10 >> SHIFT) | (x11 << (32-SHIFT)))
       x11 = ~((x11 >> SHIFT) | (x12 << (32-SHIFT)))
       i.e. the re-aligned reference words, bit-complemented; x12 is then
       reloaded with the block word at blk+8 */
    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    /* presumably folds x10 and x11 into x5/x4 -- TODO confirm against the macro */
    sum_accumulate;

    /* first half of the row: words at ref+4, ref+8 and ref; the post-indexed
       load of x10 also advances ref by lx.  Block words come from blk+4 and
       blk, and the post-indexed load of x12 advances blk by 16. */
    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR  x10, [ref], lx ;
        LDR  x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    /* presumably folds x10 and x11 into x5/x4 -- TODO confirm against the macro */
    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /* RSBS sets the flags from (x10 >> 16) - dmin; LS then means the total is
       not above dmin (unsigned).  Only in that case INC_X8 is added to x8 with
       the flags updated again, and the branch is taken while that addition
       still leaves LS.  The row count is presumably encoded in the caller's
       initial x8 together with INC_X8 -- TODO confirm. */
    __asm{
        RSBS     x11, dmin, x10, lsr #16
        ADDLSS   x8, x8, #INC_X8
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
BLS      LOOP_SAD2;
#elif (NUMBER==1)
BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}
    219 
    220 #elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
    221 
    222 #if (NUMBER==3)
    223 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
    224 #elif (NUMBER==2)
    225 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
    226 #elif (NUMBER==1)
    227 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
    228 #endif
    229 {
    230     int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
    231 
    232     //  x5 = (x4<<8) - x4;
    233     x4 = x5 = 0;
    234     x6 = 0xFFFF00FF;
    235     x9 = 0x80808080; /* const. */
    236     ref -= NUMBER; /* bic ref, ref, #3 */
    237     ref -= lx;
    238     x8 = 16;
    239 
    240 #if (NUMBER==3)
    241 LOOP_SAD3:
    242 #elif (NUMBER==2)
    243 LOOP_SAD2:
    244 #elif (NUMBER==1)
    245 LOOP_SAD1:
    246 #endif
    247     /****** process 8 pixels ******/
    248     x10 = *((uint32*)(ref += lx)); /* D C B A */
    249     x11 = *((uint32*)(ref + 4));    /* H G F E */
    250     x12 = *((uint32*)(ref + 8));    /* L K J I */
    251 
    252     int32 shift = SHIFT;
    253     int32 shift2 = 32 - SHIFT;
    254     asm volatile("ldr  %3, [%4, #4]\n\t"
    255                  "mvn  %0, %0, lsr %5\n\t"
    256                  "bic  %0, %0, %1, lsl %6\n\t"
    257                  "mvn  %1, %1, lsr %5\n\t"
    258                  "bic  %1, %1, %2, lsl %6\n\t"
    259                  "ldr  %2, [%4, #8]"
    260              : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
    261                          : "r"(blk), "r"(shift), "r"(shift2));
    262 
    263     /* process x11 & x14 */
    264     x11 = sad_4pixel(x11, x14, x9);
    265 
    266     /* process x12 & x10 */
    267     x10 = sad_4pixel(x10, x12, x9);
    268 
    269     sum_accumulate;
    270 
    271     /****** process 8 pixels ******/
    272     x10 = *((uint32*)(ref + 8)); /* D C B A */
    273     x11 = *((uint32*)(ref + 12));   /* H G F E */
    274     x12 = *((uint32*)(ref + 16));   /* L K J I */
    275 
    276     asm volatile("ldr  %3, [%4, #4]\n\t"
    277                  "mvn  %0, %0, lsr %5\n\t"
    278                  "bic  %0, %0, %1, lsl %6\n\t"
    279                  "mvn  %1, %1, lsr %5\n\t"
    280                  "bic  %1, %1, %2, lsl %6\n\t"
    281                  "ldr  %2, [%4, #8]"
    282              : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
    283                          : "r"(blk), "r"(shift), "r"(shift2));
    284 
    285     /* process x11 & x14 */
    286     x11 = sad_4pixel(x11, x14, x9);
    287 
    288     /* process x12 & x10 */
    289     x10 = sad_4pixel(x10, x12, x9);
    290 
    291     sum_accumulate;
    292 
    293     /****************/
    294     x10 = x5 - (x4 << 8); /* extract low bytes */
    295     x10 = x10 + x4;     /* add with high bytes */
    296     x10 = x10 + (x10 << 16); /* add with lower half word */
    297 
    298     if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    299     {
    300         if (--x8)
    301         {
    302 #if (NUMBER==3)
    303             goto         LOOP_SAD3;
    304 #elif (NUMBER==2)
    305 goto         LOOP_SAD2;
    306 #elif (NUMBER==1)
    307 goto         LOOP_SAD1;
    308 #endif
    309         }
    310 
    311     }
    312 
    313     return ((uint32)x10 >> 16);
    314 }
    315 
    316 #endif
    317 
    318