/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp8_rtcd.h"

#if HAVE_DSPR2
#define CROP_WIDTH 256

/******************************************************************************
 * Notes:
 *
 * This implementation uses 16-bit fixed-point versions of two multiplicative
 * constants:
 *         1.   sqrt(2) * cos(pi/8)
 *         2.   sqrt(2) * sin(pi/8)
 * Since the first constant is greater than 1, to keep the same 16-bit
 * fixed-point precision as the second one, we use the identity
 *         x * a = x + x * (a - 1)
 * so
 *         x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
 ****************************************************************************/
/* clipping table: with cm = ff_cropTbl + CROP_WIDTH, cm[x] clamps x to [0, 255] */
extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2      = 35468;

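/* Worked example of the Q16 constants above:
 *   sinpi8sqrt2       = round(sqrt(2) * sin(pi/8) * 65536)       = 35468
 *   cospi8sqrt2minus1 = round((sqrt(2) * cos(pi/8) - 1) * 65536) = 20091
 * so x * sqrt(2) * cos(pi/8) is evaluated as x + ((x * 20091) >> 16), and
 * x * sqrt(2) * sin(pi/8) as (x * 35468) >> 16.
 */
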
/* issue a load prefetch hint for the cache line containing *src */
inline void prefetch_load_short(short *src)
{
    __asm__ __volatile__ (
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}

void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride)
{
    int r, c;
    int a1, b1, c1, d1;
    short output[16];
    short *ip = input;
    short *op = output;
    int temp1, temp2;
    int shortpitch = 4;

    int c2, d2;
    int temp3, temp4;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* prefetch the second half of the coefficient block */
    prefetch_load_short(ip + 8);

    /* first pass (columns) is unrolled */
    a1 = ip[0] + ip[8];
    b1 = ip[0] - ip[8];

    temp1 = (ip[4] * sinpi8sqrt2) >> 16;
    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[12] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
    temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[13] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[0] = a1 + d1;
    op[12] = a1 - d1;
    op[4] = b1 + c1;
    op[8] = b1 - c1;

    a1 = ip[1] + ip[9];
    b1 = ip[1] - ip[9];

    op[1] = a1 + d2;
    op[13] = a1 - d2;
    op[5] = b1 + c2;
    op[9] = b1 - c2;

    a1 = ip[2] + ip[10];
    b1 = ip[2] - ip[10];

    temp1 = (ip[6] * sinpi8sqrt2) >> 16;
    temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[14] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[7] * sinpi8sqrt2) >> 16;
    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[2] = a1 + d1;
    op[14] = a1 - d1;
    op[6] = b1 + c1;
    op[10] = b1 - c1;

    a1 = ip[3] + ip[11];
    b1 = ip[3] - ip[11];

    op[3] = a1 + d2;
    op[15] = a1 - d2;
    op[7] = b1 + c2;
    op[11] = b1 - c2;

    ip = output;

    /* prefetch the next row of intermediate results */
    prefetch_load_short(ip + shortpitch);

    /* second pass (rows) is unrolled; rounding (+4 >> 3) is applied here */
    a1 = ip[0] + ip[2];
    b1 = ip[0] - ip[2];

    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
    temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[7] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[0] = (a1 + d1 + 4) >> 3;
    op[3] = (a1 - d1 + 4) >> 3;
    op[1] = (b1 + c1 + 4) >> 3;
    op[2] = (b1 - c1 + 4) >> 3;

    a1 = ip[4] + ip[6];
    b1 = ip[4] - ip[6];

    op[4] = (a1 + d2 + 4) >> 3;
    op[7] = (a1 - d2 + 4) >> 3;
    op[5] = (b1 + c2 + 4) >> 3;
    op[6] = (b1 - c2 + 4) >> 3;

    a1 = ip[8] + ip[10];
    b1 = ip[8] - ip[10];

    temp1 = (ip[9] * sinpi8sqrt2) >> 16;
    temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[11] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[13] * sinpi8sqrt2) >> 16;
    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[8] = (a1 + d1 + 4) >> 3;
    op[11] = (a1 - d1 + 4) >> 3;
    op[9] = (b1 + c1 + 4) >> 3;
    op[10] = (b1 - c1 + 4) >> 3;

    a1 = ip[12] + ip[14];
    b1 = ip[12] - ip[14];

    op[12] = (a1 + d2 + 4) >> 3;
    op[15] = (a1 - d2 + 4) >> 3;
    op[13] = (b1 + c2 + 4) >> 3;
    op[14] = (b1 - c2 + 4) >> 3;

    ip = output;

    /* add the reconstructed residual to the prediction and clamp to [0, 255] */
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            short a = ip[c] + pred_ptr[c];
            dst_ptr[c] = cm[a];
        }

        ip += 4;
        dst_ptr += dst_stride;
        pred_ptr += pred_stride;
    }
}

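/* Scalar sketch of what vp8_dc_only_idct_add below computes (clip255() is
 * shorthand for clamping to [0, 255], not a function defined here):
 *
 *     int a1 = (input_dc + 4) >> 3;
 *     for (r = 0; r < 4; r++)
 *         for (c = 0; c < 4; c++)
 *             dst_ptr[r * dst_stride + c] =
 *                 clip255(pred_ptr[r * pred_stride + c] + a1);
 *
 * The DSPR2 code replicates a1 (|a1| when negative) into all four bytes of
 * a register and uses saturating quad-byte add/subtract, one row per store.
 */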
void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride)
{
    int a1;
    int i, absa1;
    int t2, vector_a1, vector_a;

    /* a1 = ((input_dc + 4) >> 3); */
    __asm__ __volatile__ (
        "addi  %[a1], %[input_dc], 4   \n\t"
        "sra   %[a1], %[a1],       3   \n\t"
        : [a1] "=r" (a1)
        : [input_dc] "r" (input_dc)
    );

    if (a1 < 0)
    {
        /* use quad-byte operations;
         * input and output memory are four-byte aligned
         */
        __asm__ __volatile__ (
            "abs        %[absa1],     %[a1]         \n\t"
            "replv.qb   %[vector_a1], %[absa1]      \n\t"
            : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
            : [a1] "r" (a1)
        );

        /* a1 is negative: use (pred_ptr[c] - |a1|) instead of (pred_ptr[c] + a1) */
        for (i = 4; i--;)
        {
            __asm__ __volatile__ (
                "lw             %[t2],       0(%[pred_ptr])                     \n\t"
                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride]     \n\t"
                "subu_s.qb      %[vector_a], %[t2],          %[vector_a1]       \n\t"
                "sw             %[vector_a], 0(%[dst_ptr])                      \n\t"
                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]      \n\t"
                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
                : "memory"
            );
        }
    }
    else
    {
        /* use quad-byte operations;
         * input and output memory are four-byte aligned
         */
        __asm__ __volatile__ (
            "replv.qb       %[vector_a1], %[a1]     \n\t"
            : [vector_a1] "=r" (vector_a1)
            : [a1] "r" (a1)
        );

        for (i = 4; i--;)
        {
            __asm__ __volatile__ (
                "lw             %[t2],       0(%[pred_ptr])                 \n\t"
                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride] \n\t"
                "addu_s.qb      %[vector_a], %[vector_a1],   %[t2]          \n\t"
                "sw             %[vector_a], 0(%[dst_ptr])                  \n\t"
                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
                : "memory"
            );
        }
    }
}

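/* 4x4 inverse Walsh-Hadamard transform of the second-order (DC) block.
 * Each of the 16 results is written to the DC position of one dequantized
 * 4x4 block, i.e. at a stride of 16 shorts within mb_dqcoeff.
 */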
void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
{
    short output[16];
    int i;
    int a1, b1, c1, d1;
    int a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    prefetch_load_short(ip);

    /* column pass */
    for (i = 4; i--;)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0] = a1 + b1;
        op[4] = c1 + d1;
        op[8] = a1 - b1;
        op[12] = d1 - c1;

        ip++;
        op++;
    }

    ip = output;
    op = output;

    prefetch_load_short(ip);

    /* row pass, with the rounding term (+3, before >> 3) folded into a1 and d1 */
    for (i = 4; i--;)
    {
        a1 = ip[0] + ip[3] + 3;
        b1 = ip[1] + ip[2];
        c1 = ip[1] - ip[2];
        d1 = ip[0] - ip[3] + 3;

        a2 = a1 + b1;
        b2 = d1 + c1;
        c2 = a1 - b1;
        d2 = d1 - c1;

        op[0] = a2 >> 3;
        op[1] = b2 >> 3;
        op[2] = c2 >> 3;
        op[3] = d2 >> 3;

        ip += 4;
        op += 4;
    }

    /* scatter: one result per 4x4 block, at a stride of 16 shorts */
    for (i = 0; i < 16; i++)
    {
        mb_dqcoeff[i * 16] = output[i];
    }
}

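/* DC-only case of the inverse Walsh-Hadamard transform: a1 is stored to the
 * DC position of all 16 blocks.  The byte offsets 0, 32, ..., 480 in the
 * stores below are 16 shorts (one 4x4 block) apart.
 */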
void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
{
    int a1;

    a1 = ((input[0] + 3) >> 3);

    __asm__ __volatile__ (
        "sh             %[a1], 0(%[mb_dqcoeff])                    \n\t"
        "sh             %[a1], 32(%[mb_dqcoeff])                   \n\t"
        "sh             %[a1], 64(%[mb_dqcoeff])                   \n\t"
        "sh             %[a1], 96(%[mb_dqcoeff])                   \n\t"
        "sh             %[a1], 128(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 160(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 192(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 224(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 256(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 288(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 320(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 352(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 384(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 416(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 448(%[mb_dqcoeff])                  \n\t"
        "sh             %[a1], 480(%[mb_dqcoeff])                  \n\t"

        :
        : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
        : "memory"
    );
}

#endif