Home | History | Annotate | Download | only in msa
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp8_rtcd.h"
     12 #include "vp8/common/mips/msa/vp8_macros_msa.h"
     13 
     14 #define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
     15 {                                                                   \
     16     v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                   \
     17                                                                     \
     18     ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
     19     ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                          \
     20     ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
     21     ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                          \
     22     PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);            \
     23     PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);            \
     24 }
     25 
     26 #define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)    \
     27 {                                                                   \
     28     v8i16 tmp0_m;                                                   \
     29                                                                     \
     30     SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2);  \
     31     ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);    \
     32 }
     33 
     34 #define RET_1_IF_NZERO_H(in0)       \
     35 ({                                  \
     36     v8i16 tmp0_m;                   \
     37     v8i16 one_m = __msa_ldi_h(1);   \
     38                                     \
     39     tmp0_m = __msa_ceqi_h(in0, 0);  \
     40     tmp0_m = tmp0_m ^ 255;          \
     41     tmp0_m = one_m & tmp0_m;        \
     42                                     \
     43     tmp0_m;                         \
     44 })
     45 
     46 #define RET_1_IF_NZERO_W(in0)       \
     47 ({                                  \
     48     v4i32 tmp0_m;                   \
     49     v4i32 one_m = __msa_ldi_w(1);   \
     50                                     \
     51     tmp0_m = __msa_ceqi_w(in0, 0);  \
     52     tmp0_m = tmp0_m ^ 255;          \
     53     tmp0_m = one_m & tmp0_m;        \
     54                                     \
     55     tmp0_m;                         \
     56 })
     57 
     58 #define RET_1_IF_NEG_W(in0)           \
     59 ({                                    \
     60     v4i32 tmp0_m;                     \
     61                                       \
     62     v4i32 one_m = __msa_ldi_w(1);     \
     63     tmp0_m = __msa_clti_s_w(in0, 0);  \
     64     tmp0_m = one_m & tmp0_m;          \
     65                                       \
     66     tmp0_m;                           \
     67 })
     68 
     69 void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
     70 {
     71     v8i16 in0, in1, in2, in3;
     72     v8i16 temp0, temp1;
     73     v8i16 const0, const1;
     74     v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
     75     v4i32 out0, out1, out2, out3;
     76     v8i16 zero = { 0 };
     77 
     78     LD_SH4(input, pitch / 2, in0, in1, in2, in3);
     79     TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
     80 
     81     BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
     82     SLLI_4V(temp0, temp1, in1, in3, 3);
     83     in0 = temp0 + temp1;
     84     in2 = temp0 - temp1;
     85     SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
     86     temp0 = __msa_ilvr_h(in3, in1);
     87     in1 = __msa_splati_h(coeff, 3);
     88     out0 = (v4i32)__msa_ilvev_h(zero, in1);
     89     coeff = __msa_ilvl_h(zero, coeff);
     90     out1 = __msa_splati_w((v4i32)coeff, 0);
     91     DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
     92     out0 >>= 12;
     93     out1 >>= 12;
     94     PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
     95     TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
     96 
     97     BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
     98     in0 = temp0 + temp1 + 7;
     99     in2 = temp0 - temp1 + 7;
    100     in0 >>= 4;
    101     in2 >>= 4;
    102     ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
    103     temp1 = RET_1_IF_NZERO_H(in3);
    104     ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
    105     SPLATI_W2_SW(coeff, 2, out3, out1);
    106     out3 += out1;
    107     out1 = __msa_splati_w((v4i32)coeff, 1);
    108     DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
    109     out1 >>= 16;
    110     out3 >>= 16;
    111     out1 += (v4i32)temp1;
    112     PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
    113     ST_SH2(in0, in2, output, 8);
    114 }
    115 
    116 void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
    117 {
    118     v8i16 in0, in1, in2, in3;
    119     v8i16 temp0, temp1, tmp0, tmp1;
    120     v8i16 const0, const1, const2;
    121     v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
    122     v8i16 zero = { 0 };
    123     v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
    124 
    125     LD_SH4(input, pitch / 2, in0, in1, in2, in3);
    126     TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
    127 
    128     BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    129     SLLI_4V(temp0, temp1, in1, in3, 3);
    130     in0 = temp0 + temp1;
    131     in2 = temp0 - temp1;
    132     SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
    133     temp0 = __msa_splati_h(coeff, 3);
    134     vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
    135     coeff = __msa_ilvl_h(zero, coeff);
    136     vec3_w = __msa_splati_w((v4i32)coeff, 0);
    137     ILVRL_H2_SH(in3, in1, tmp1, tmp0);
    138     vec0_w = vec1_w;
    139     vec2_w = vec3_w;
    140     DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
    141                  vec0_w, vec1_w, vec2_w, vec3_w);
    142     SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
    143     PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
    144     TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
    145 
    146     BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
    147     in0 = temp0 + temp1 + 7;
    148     in2 = temp0 - temp1 + 7;
    149     in0 >>= 4;
    150     in2 >>= 4;
    151     SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
    152     vec3_w += vec1_w;
    153     vec1_w = __msa_splati_w((v4i32)coeff, 1);
    154     const0 = RET_1_IF_NZERO_H(in3);
    155     ILVRL_H2_SH(in3, in1, tmp1, tmp0);
    156     vec0_w = vec1_w;
    157     vec2_w = vec3_w;
    158     DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
    159                  vec0_w, vec1_w, vec2_w, vec3_w);
    160     SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
    161     PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
    162     in1 += const0;
    163     PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
    164     ST_SH2(temp0, temp1, output, 8);
    165 
    166     PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
    167     ST_SH2(in0, in2, output + 16, 8);
    168 }
    169 
    170 void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
    171 {
    172     v8i16 in0_h, in1_h, in2_h, in3_h;
    173     v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
    174 
    175     LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
    176     TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
    177 
    178     UNPCK_R_SH_SW(in0_h, in0_w);
    179     UNPCK_R_SH_SW(in1_h, in1_w);
    180     UNPCK_R_SH_SW(in2_h, in2_w);
    181     UNPCK_R_SH_SW(in3_h, in3_w);
    182     BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
    183     SLLI_4V(temp0, temp1, temp2, temp3, 2);
    184     BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
    185     temp0 = RET_1_IF_NZERO_W(temp0);
    186     in0_w += temp0;
    187     TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
    188 
    189     BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
    190     BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
    191     in0_w += RET_1_IF_NEG_W(in0_w);
    192     in1_w += RET_1_IF_NEG_W(in1_w);
    193     in2_w += RET_1_IF_NEG_W(in2_w);
    194     in3_w += RET_1_IF_NEG_W(in3_w);
    195     ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
    196     SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
    197     PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
    198     ST_SH2(in0_h, in1_h, output, 8);
    199 }
    200