/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

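/*
 * Note on structure: this header appears designed for repeated textual
 * inclusion, with the macros NUMBER (the byte offset of the reference block
 * from a word boundary, 1..3) and SHIFT (presumably 8*NUMBER) defined by the
 * including file before each inclusion, so that the one body below expands
 * to sad_mb_offset1, sad_mb_offset2 and sad_mb_offset3. Each routine
 * computes the sum of absolute differences (SAD) between a 16x16 block 'blk'
 * and an unaligned reference area 'ref' with row stride 'lx', terminating
 * early once the running SAD exceeds 'dmin'.
 */
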
#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x4 = x5 = 0;
    x6 = 0xFFFF00FF; /* mask; (x6 << 8) == 0xFF00FF00 */
    x9 = 0x80808080; /* const. */
    ref -= NUMBER; /* word-align ref; equivalent to "bic ref, ref, #3" when ref is NUMBER bytes past a word boundary */
    ref -= lx;
    blk -= 16;
    x8 = 16; /* row counter: 16 rows per macroblock */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
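    /*
     * Unaligned-load technique: ref was word-aligned downward above, so each
     * half-row is fetched as three aligned words and the NUMBER misaligned
     * bytes are reassembled with two shifts per word. A sketch of the idea
     * for NUMBER==1 (SHIFT==8), assuming little-endian byte order:
     *
     *     word0 = D C B A,  word1 = H G F E
     *     (word0 >> 8) | (word1 << 24)  ->  E D C B
     *
     * i.e. the four reference bytes starting one byte past the aligned
     * address.
     */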
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx)); /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

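    /*
     * Split accumulation: x5 adds the packed per-byte differences directly,
     * so each byte lane collects its own sum plus carries from the lane
     * below; x4 separately collects the two high bytes of each word (masked
     * with 0xFF00FF00, then shifted down), which stay carry-free. Worked
     * example for one packed word with bytes (b3,b2,b1,b0) = (200,10,20,30)
     * = 0xC80A141E: x5 gains 0xC80A141E, and x4 gains
     * (0xC80A141E & 0xFF00FF00) >> 8 = 0x00C80014.
     */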
    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8)); /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

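    /*
     * Recombination (a reading of the arithmetic below): x5 holds the
     * low-byte lane sums plus the high-byte lane sums shifted up by 8, while
     * x4 holds the high-byte lane sums alone, so x5 - (x4 << 8) isolates the
     * low-byte sums; adding x4 merges the high-byte sums in, and adding
     * (x10 << 16) folds the lower halfword into the upper one. The total SAD
     * for the rows processed so far thus lands in the top 16 bits, ready for
     * the early-exit comparison against dmin.
     */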
    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

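/*
 * sad_4pixel() is not defined in this file; it is assumed to be supplied by
 * the including file. A portable sketch of what it is presumed to compute
 * (per-byte absolute differences of two packed words, repacked one result
 * per byte; 0x80808080 is the usual sign/borrow mask for such SWAR code).
 * sad_4pixel_ref is a hypothetical name, for illustration only:
 *
 *     static uint32 sad_4pixel_ref(uint32 a, uint32 b)
 *     {
 *         uint32 r = 0;
 *         int i;
 *         for (i = 0; i < 4; i++)
 *         {
 *             int d = (int)((a >> (8 * i)) & 0xFF) - (int)((b >> (8 * i)) & 0xFF);
 *             r |= (uint32)(d < 0 ? -d : d) << (8 * i);
 *         }
 *         return r;
 *     }
 */
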
#elif defined(__CC_ARM)  /* only works with ARM v5 */

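/*
 * RVCT (armcc) variant: embedded assembler interleaved with C. Two interface
 * differences from the path above: the row counter x8 arrives as a parameter
 * rather than being initialised locally, and the loop steps it with
 * "ADDLSS x8, x8, #INC_X8" followed by BLS, with INC_X8 and the caller's
 * initial x8 presumably chosen so that the LS condition holds until all 16
 * rows are done.
 */
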
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm{
        MVN      x6, #0xff0000;
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC      ref, ref, #3;
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

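    /*
     * sum_accumulate is presumably a macro supplied by the including file
     * (also used by the GNU inline-asm path below) performing the same split
     * accumulation written out explicitly in the C path above: add the
     * packed bytes into x5 and the masked, shifted-down high bytes into x4,
     * adapted here to the complemented values that the MVN/BIC realignment
     * produces.
     */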
    sum_accumulate;

    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR      x10, [ref], lx;
        LDR      x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    __asm{
        RSBS     x11, dmin, x10, lsr #16;
        ADDLSS   x8, x8, #INC_X8;
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
        BLS      LOOP_SAD2;
#elif (NUMBER==1)
        BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

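/*
 * Note: the condition below duplicates the one that opens this file, so the
 * preprocessor never selects this branch; the pure-C version above takes
 * precedence.
 */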
#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;
    x8 = 16; /* row counter: 16 rows per macroblock */

__asm__ volatile("MVN   %0, #0xFF0000": "=r"(x6));

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
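/* Word-align ref downward; the misaligned low bits are re-introduced by the
   shifted reads below. ref is both read and written by the BIC, hence the
   read-write "+r" constraint. */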
__asm__ volatile("BIC  %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

#if (SHIFT==8)
__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif
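/*
 * The three SHIFT cases (8, 16, 24) specialise the same MVN/BIC realignment
 * for byte offsets 1, 2 and 3: each yields the complemented, realigned
 * reference bytes, which sad_4pixelN (presumably the variant of sad_4pixel
 * taking a complemented first argument, defined by the including file)
 * consumes directly. The "+&r" constraints mark x10 and x11 as read-write
 * and early-clobber, since both are written before x12 is last read.
 */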

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif
__asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#endif
