/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

/* Intentionally not using the gcc asm version, since it is
 * slightly slower than the plain C version on modern GCC versions. */
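
/* This header is included once per misalignment case, with NUMBER (1, 2
 * or 3) and SHIFT (presumably NUMBER * 8) defined at the include site,
 * generating sad_mb_offset1/2/3.  Each function computes the 16x16 SAD
 * between blk and a reference block whose address is NUMBER bytes off
 * word alignment: only aligned word loads are issued, and the wanted
 * bytes are spliced back together with shifts.  A sketch of a typical
 * include site (the exact site in this tree may differ):
 *
 *     #define NUMBER 3
 *     #define SHIFT  24
 *     #include "sad_mb_offset.h"
 *     #undef NUMBER
 *     #undef SHIFT
 */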
#if !defined(__CC_ARM) /* Generic C version */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4<<8) - x4;
    x4 = x5 = 0;
    x6 = 0xFFFF00FF;
    x9 = 0x80808080; /* const. */
    ref -= NUMBER; /* bic ref, ref, #3 */
    ref -= lx;
    blk -= 16;
    x8 = 16;
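
    /* Setup: x9 is the per-byte MSB mask used by the SWAR absolute
     * difference in sad_4pixel(); (x6 << 8) masks the odd byte lanes
     * below.  "ref -= NUMBER" word-aligns the pointer (the C equivalent
     * of the BIC noted above, assuming ref is misaligned by exactly
     * NUMBER), and ref/blk are pre-decremented because the loop
     * pre-increments them.  x8 counts the 16 macroblock rows; x4/x5
     * accumulate packed per-byte SADs that are folded after the loop. */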

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx)); /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
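
    /* The three aligned loads cover bytes A..L; each shift/OR pair
     * splices out four consecutive bytes starting NUMBER into the word
     * (for NUMBER==3, SHIFT==24: "G F E D" and "K J I H"), i.e. the
     * misaligned reference row rebuilt without unaligned accesses. */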

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
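
    /* sad_4pixel() (see the companion sad_inline.h) returns the four
     * per-byte absolute differences packed into one word.  x5 sums the
     * whole packed words while x4 separately sums the odd byte lanes
     * shifted down, so the per-lane totals can be disentangled exactly
     * at the end even though lanes in x5 overflow into their
     * neighbours. */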

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8)); /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
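    /* Fold the packed accumulators into one 16-bit SAD: x4 holds the
     * lane-1 sum in bits [15:0] and the lane-3 sum in bits [31:16], so
     * x5 - (x4 << 8) leaves exactly the lane-0 and lane-2 sums; adding
     * x4 back pairs up the lanes, and the final halfword fold puts the
     * grand total in the top half of x10. */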
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

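    /* Early exit: the SAD can only grow, so once the running total
     * (upper halfword of x10) exceeds the best candidate so far (dmin),
     * the remaining rows cannot rescue this position. */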
    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__CC_ARM)  /* ARM (RVCT) compiler; only works with ARMv5 */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm{
        MVN      x6, #0xff0000;
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC      ref, ref, #3;
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

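    /* RVCT embedded assembler lets C variables be named directly inside
     * __asm blocks, with the compiler doing register allocation.  The
     * MVN/BIC pairs below splice the misaligned word and leave it
     * bitwise-complemented in one go; sad_4pixelN() is presumably the
     * variant of the SAD primitive that expects its first operand
     * complemented (turning the subtraction into an add-with-not). */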
    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR      x10, [ref], lx;
        LDR      x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

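    /* RSBS sets flags from (x10 >> 16) - dmin, so the candidate stays
     * viable while the running SAD is lower-or-same (LS) than dmin; the
     * conditional ADDLSS then advances the x8 row counter (x8 and
     * INC_X8 come from the include site, presumably arranged so the add
     * drops out of the LS condition after the last row) and BLS loops
     * while both tests pass. */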
    __asm{
        RSBS     x11, dmin, x10, lsr #16;
        ADDLSS   x8, x8, #INC_X8;
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
        BLS      LOOP_SAD2;
#elif (NUMBER==1)
        BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER; currently unreachable, see the note at the top of the file */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;
    x8 = 16; /* row counter; a local here, unlike the __CC_ARM variant where it is a parameter */

    __asm__ volatile("MVN       %0, #0xFF0000": "=r"(x6));
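
    /* The MVN above builds x6 = 0xFF00FFFF, which is not encodable as a
     * single ARM data-processing immediate (0x00FF0000 is); x6 is
     * presumably consumed by the sum_accumulate macro invoked after
     * each pair of sad_4pixelN() calls. */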

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    __asm__ volatile("BIC  %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

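    /* The shift amount has to be spelled as a literal immediate inside
     * the asm string (the preprocessor does not expand SHIFT within a
     * string literal), so each supported SHIFT value gets its own
     * otherwise identical MVN/BIC block. */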
#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
    __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));
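    /* "=&r" marks x12 as early-clobber so GCC cannot assign it the same
     * register as blk, which the post-indexed LDR both reads and
     * writes. */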

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}

#endif
