      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_convolve.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
/* Vertical 8-tap convolution for widths that are a multiple of 4.
 *
 * Four output pixels are produced per inner-loop iteration, one per DSPr2
 * accumulator ($ac0..$ac3).  Each accumulator is seeded with the rounding
 * constant 64, eight source rows are fetched with unaligned 32-bit loads
 * (ulw), widened to 16 bits (preceu.ph.qb*), paired across adjacent rows
 * (precrq.ph.w / append) and multiply-accumulated against packed filter
 * taps with dpa.w.ph.  extp then extracts the result; it relies on the
 * DSPControl extract position (38) written by the caller
 * vp9_convolve8_vert_dspr2 before this function runs.  Final values are
 * clamped to [0, 255] via the vp9_ff_cropTbl lookup (lbux).
 *
 * src        - source, pre-advanced by the caller's convolve framing
 * src_stride - source row stride in bytes
 * dst        - destination
 * dst_stride - destination row stride in bytes
 * filter_y   - 8 int16 taps; read here as four packed int32 words
 * w          - output width (assumed multiple of 4; see x += 4 loop)
 * h          - output height
 */
static void convolve_vert_4_dspr2(const uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int16_t *filter_y,
                                  int32_t w,
                                  int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;        /* rounding term added before the >> 7 */
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* Read the eight 16-bit taps as four packed 32-bit words (two taps per
     word) so each dpa.w.ph applies two taps at once.
     NOTE(review): the int16* -> int32* cast assumes 4-byte alignment of
     the filter array and bypasses strict aliasing — pre-existing pattern
     throughout this file. */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  /* Center the 8-tap window: output row r reads source rows r-3 .. r+4. */
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          /* Rows 0..3 of the tap window (unaligned 32-bit loads). */
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          /* Seed all four accumulators with the rounding constant. */
          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          /* Low byte pairs -> pixels 0 and 1 (ac0/ac1), taps 0..3. */
          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          /* High byte pairs -> pixels 2 and 3 (ac2/ac3), taps 0..3. */
          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          /* Rows 4..7 of the tap window. */
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          /* Taps 4..7, then extract; extp uses DSPControl pos set by the
             caller. */
          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          /* Clamp via crop table interleaved with the remaining MACs. */
          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
    177 
/* Vertical 8-tap convolution specialized for width 64.
 *
 * Identical algorithm to convolve_vert_4_dspr2 (four output pixels per
 * iteration across the four DSPr2 accumulators) but with the column loop
 * bound fixed at 64 and an extra prefetch for the second 32-byte half of
 * the destination row.  Like the width-generic version, the extp
 * instructions depend on the DSPControl extract position (38) written by
 * vp9_convolve8_vert_dspr2 before dispatching here.
 *
 * src        - source, pre-advanced by the caller's convolve framing
 * src_stride - source row stride in bytes
 * dst        - destination (64 pixels wide)
 * dst_stride - destination row stride in bytes
 * filter_y   - 8 int16 taps; read here as four packed int32 words
 * h          - output height
 */
static void convolve_vert_64_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_y,
                                   int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;        /* rounding term added before the >> 7 */
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* Taps read pairwise as packed int32 words (see width-4 variant for the
     aliasing/alignment caveat on this cast). */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  /* Center the 8-tap window: output row r reads source rows r-3 .. r+4. */
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          /* Rows 0..3 of the tap window (unaligned 32-bit loads). */
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          /* Seed all four accumulators with the rounding constant. */
          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          /* Low byte pairs -> pixels 0 and 1 (ac0/ac1), taps 0..3. */
          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          /* High byte pairs -> pixels 2 and 3 (ac2/ac3), taps 0..3. */
          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          /* Rows 4..7 of the tap window. */
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          /* Taps 4..7, then extract; extp uses DSPControl pos set by the
             caller. */
          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          /* Clamp via crop table interleaved with the remaining MACs. */
          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
    332 
    333 void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    334                               uint8_t *dst, ptrdiff_t dst_stride,
    335                               const int16_t *filter_x, int x_step_q4,
    336                               const int16_t *filter_y, int y_step_q4,
    337                               int w, int h) {
    338   if (((const int32_t *)filter_y)[1] == 0x800000) {
    339     vp9_convolve_copy(src, src_stride,
    340                       dst, dst_stride,
    341                       filter_x, x_step_q4,
    342                       filter_y, y_step_q4,
    343                       w, h);
    344   } else if (((const int32_t *)filter_y)[0] == 0) {
    345     vp9_convolve2_vert_dspr2(src, src_stride,
    346                              dst, dst_stride,
    347                              filter_x, x_step_q4,
    348                              filter_y, y_step_q4,
    349                              w, h);
    350   } else {
    351     if (16 == y_step_q4) {
    352       uint32_t pos = 38;
    353 
    354       /* bit positon for extract from acc */
    355       __asm__ __volatile__ (
    356         "wrdsp      %[pos],     1           \n\t"
    357         :
    358         : [pos] "r" (pos)
    359       );
    360 
    361       vp9_prefetch_store(dst);
    362 
    363       switch (w) {
    364         case 4 :
    365         case 8 :
    366         case 16 :
    367         case 32 :
    368           convolve_vert_4_dspr2(src, src_stride,
    369                                 dst, dst_stride,
    370                                 filter_y, w, h);
    371           break;
    372         case 64 :
    373           vp9_prefetch_store(dst + 32);
    374           convolve_vert_64_dspr2(src, src_stride,
    375                                  dst, dst_stride,
    376                                  filter_y, h);
    377           break;
    378         default:
    379           vp9_convolve8_vert_c(src, src_stride,
    380                                dst, dst_stride,
    381                                filter_x, x_step_q4,
    382                                filter_y, y_step_q4,
    383                                w, h);
    384           break;
    385       }
    386     } else {
    387       vp9_convolve8_vert_c(src, src_stride,
    388                            dst, dst_stride,
    389                            filter_x, x_step_q4,
    390                            filter_y, y_step_q4,
    391                            w, h);
    392     }
    393   }
    394 }
    395 
    396 #endif
    397