/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
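  /* replv.qb copies the least-significant byte of each scalar into all four
   * byte lanes of a 32-bit register, so one word carries the same threshold
   * for the four pixels that are filtered in parallel below. */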

  /* prefetch data for store */
  prefetch_store(s);
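  /* The prefetch hints that this cache line is about to be written, so the
   * read-modify-write of the filtered rows is less likely to stall on a miss
   * (prefetch_store() is the helper from common_dspr2.h; it is assumed here
   * to issue a "pref" with a store hint). */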

  /* The loop filter is designed to work on chars so that we can make maximum
     use of the 8-bit SIMD instructions. */
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;
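    /* In the usual loop-filter naming, sm1..s2 are the four rows above the
     * horizontal edge (p3, p2, p1, p0) and s3..s6 the four rows below it
     * (q0, q1, q2, q3); each 32-bit load below covers four adjacent columns. */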

    __asm__ __volatile__(
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
       filtering is not needed */
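    /* (the 4-tap filter delta is built from clamp(p1 - q1) and 3 * (q0 - p0),
       so when both row pairs straddling the edge are identical in all four
       columns the filter would leave the pixels unchanged anyway) */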
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__(
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__(
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
      }
    }

    s = s + 4;
  }
}

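/* The vertical filter below works on a vertical edge: it loads 4x4 pixel
 * neighborhoods with aligned word loads, transposes them with DSPr2 pack
 * instructions so the same SIMD filter kernel can be reused, and writes the
 * result back with byte stores because the output columns are not aligned. */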
void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4-byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));
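    /* Each aligned word load picks up four horizontally adjacent pixels of
     * one row: (sN - 4) holds the four pixels to the left of the vertical
     * edge and sN the four to its right.  The transposes below turn these
     * row words into column words so the same filter kernel can be applied. */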

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);
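    /* precrq.qb.ph / precr.qb.ph pick the high and low bytes of each halfword
     * pair, and precrq.ph.w / append then recombine halfwords; the net effect
     * of the four stages is a 4x4 byte transpose of the loaded block. */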

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
     * filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* write the processed 4x4 neighborhood back
         * don't transpose the output data because the destination memory
         * isn't aligned
         */
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r"(s4));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);
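        /* each sb above stored the low byte of the column words into row s4;
         * shifting right by 8 exposes the byte belonging to the next row up
         * (s3), and the same pattern repeats for s2 and s1 below */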

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r"(p1)
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r"(s2));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r"(s1));
      }
    }
  }
}

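/* Each _dual variant filters two adjacent 8-pixel edges by applying the
 * single-edge filter twice: offset by 8 pixels for the horizontal edges and
 * by 8 rows (8 * p) for the vertical edges. */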
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2