/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

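/* 4x4 residual: each row fits in one 32-bit word, so all four src and pred
 * rows are packed into a single MSA vector apiece before subtracting. */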
static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t pred0, pred1, pred2, pred3;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Load four 32-bit rows of src and pred, then pack them into vectors. */
  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
  /* Interleave src/pred bytes and horizontally subtract each adjacent pair,
   * widening every src - pred difference to a signed 16-bit value. */
  ILVRL_B2_UB(src, pred, src_l0, src_l1);
  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  /* Store four 8-byte rows; the stride is doubled because the store macro
   * takes a byte stride while diff_stride counts int16_t elements. */
  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
}

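/* 8x8 residual: two 64-bit rows are packed into each vector per iteration,
 * over four iterations. */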
static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t loop_cnt;
  uint64_t src0, src1, pred0, pred1;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 4; loop_cnt--;) {
    LD2(src_ptr, src_stride, src0, src1);
    src_ptr += (2 * src_stride);
    LD2(pred_ptr, pred_stride, pred0, pred1);
    pred_ptr += (2 * pred_stride);

    INSERT_D2_SB(src0, src1, src);
    INSERT_D2_SB(pred0, pred1, pred);
    ILVRL_B2_UB(src, pred, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
    diff_ptr += (2 * diff_stride);
  }
}

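/* 16x16 residual: each row is one full 16-byte vector; eight rows are
 * loaded per iteration and the loop body runs twice. */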
static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  int8_t count;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (count = 2; count--;) {
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;
  }
}

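/* 32x32 residual: each row spans two 16-byte vectors; four rows are
 * processed per iteration over eight iterations. */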
static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 8; loop_cnt--;) {
    LD_SB2(src, 16, src0, src1);
    src += src_stride;
    LD_SB2(src, 16, src2, src3);
    src += src_stride;
    LD_SB2(src, 16, src4, src5);
    src += src_stride;
    LD_SB2(src, 16, src6, src7);
    src += src_stride;

    LD_SB2(pred, 16, pred0, pred1);
    pred += pred_stride;
    LD_SB2(pred, 16, pred2, pred3);
    pred += pred_stride;
    LD_SB2(pred, 16, pred4, pred5);
    pred += pred_stride;
    LD_SB2(pred, 16, pred6, pred7);
    pred += pred_stride;

    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;
  }
}

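/* 64x64 residual: each row spans four 16-byte vectors; two rows are
 * processed per iteration over 32 iterations. */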
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}

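/* Dispatches square blocks of supported sizes to the MSA kernels above and
 * falls back to the C implementation for everything else. */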
void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    switch (rows) {
      case 4:
        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 8:
        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 16:
        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 32:
        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 64:
        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      default:
        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                             src_stride, pred_ptr, pred_stride);
        break;
    }
  } else {
    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                         pred_ptr, pred_stride);
  }
}