1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vp8_rtcd.h" 12 #include "vp8/common/mips/msa/vp8_macros_msa.h" 13 14 #define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ 15 { \ 16 v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \ 17 \ 18 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 19 ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \ 20 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 21 ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \ 22 PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \ 23 PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \ 24 } 25 26 #define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ 27 { \ 28 v8i16 tmp0_m; \ 29 \ 30 SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \ 31 ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \ 32 } 33 34 #define RET_1_IF_NZERO_H(in0) \ 35 ({ \ 36 v8i16 tmp0_m; \ 37 v8i16 one_m = __msa_ldi_h(1); \ 38 \ 39 tmp0_m = __msa_ceqi_h(in0, 0); \ 40 tmp0_m = tmp0_m ^ 255; \ 41 tmp0_m = one_m & tmp0_m; \ 42 \ 43 tmp0_m; \ 44 }) 45 46 #define RET_1_IF_NZERO_W(in0) \ 47 ({ \ 48 v4i32 tmp0_m; \ 49 v4i32 one_m = __msa_ldi_w(1); \ 50 \ 51 tmp0_m = __msa_ceqi_w(in0, 0); \ 52 tmp0_m = tmp0_m ^ 255; \ 53 tmp0_m = one_m & tmp0_m; \ 54 \ 55 tmp0_m; \ 56 }) 57 58 #define RET_1_IF_NEG_W(in0) \ 59 ({ \ 60 v4i32 tmp0_m; \ 61 \ 62 v4i32 one_m = __msa_ldi_w(1); \ 63 tmp0_m = __msa_clti_s_w(in0, 0); \ 64 tmp0_m = one_m & tmp0_m; \ 65 \ 66 tmp0_m; \ 67 }) 68 69 void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { 70 v8i16 in0, in1, in2, in3; 71 v8i16 temp0, temp1; 72 v8i16 const0, const1; 73 v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; 74 v4i32 out0, out1, out2, out3; 75 v8i16 zero = { 0 }; 76 77 LD_SH4(input, pitch / 2, in0, in1, in2, in3); 78 TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 79 80 BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); 81 SLLI_4V(temp0, temp1, in1, in3, 3); 82 in0 = temp0 + temp1; 83 in2 = temp0 - temp1; 84 SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); 85 temp0 = __msa_ilvr_h(in3, in1); 86 in1 = __msa_splati_h(coeff, 3); 87 out0 = (v4i32)__msa_ilvev_h(zero, in1); 88 coeff = __msa_ilvl_h(zero, coeff); 89 out1 = __msa_splati_w((v4i32)coeff, 0); 90 DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); 91 out0 >>= 12; 92 out1 >>= 12; 93 PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); 94 TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); 95 96 BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); 97 in0 = temp0 + temp1 + 7; 98 in2 = temp0 - temp1 + 7; 99 in0 >>= 4; 100 in2 >>= 4; 101 ILVR_H2_SW(zero, in0, zero, in2, out0, out2); 102 temp1 = RET_1_IF_NZERO_H(in3); 103 ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); 104 SPLATI_W2_SW(coeff, 2, out3, out1); 105 out3 += out1; 106 out1 = __msa_splati_w((v4i32)coeff, 1); 107 DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); 108 out1 >>= 16; 109 out3 >>= 16; 110 out1 += (v4i32)temp1; 111 PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); 112 ST_SH2(in0, in2, output, 8); 113 } 114 115 void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) { 116 v8i16 in0, in1, in2, in3; 117 v8i16 temp0, temp1, tmp0, tmp1; 118 v8i16 const0, const1, const2; 119 v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; 120 v8i16 zero = { 0 }; 121 v4i32 vec0_w, vec1_w, vec2_w, vec3_w; 122 123 LD_SH4(input, pitch / 2, in0, in1, in2, in3); 124 TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); 125 126 BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); 127 SLLI_4V(temp0, temp1, in1, in3, 3); 128 in0 = temp0 + temp1; 129 in2 = temp0 - temp1; 130 SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); 131 temp0 = __msa_splati_h(coeff, 3); 132 vec1_w = (v4i32)__msa_ilvev_h(zero, temp0); 133 coeff = __msa_ilvl_h(zero, coeff); 134 vec3_w = __msa_splati_w((v4i32)coeff, 0); 135 ILVRL_H2_SH(in3, in1, tmp1, tmp0); 136 vec0_w = vec1_w; 137 vec2_w = vec3_w; 138 DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, 139 vec1_w, vec2_w, vec3_w); 140 SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); 141 PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); 142 TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); 143 144 BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); 145 in0 = temp0 + temp1 + 7; 146 in2 = temp0 - temp1 + 7; 147 in0 >>= 4; 148 in2 >>= 4; 149 SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); 150 vec3_w += vec1_w; 151 vec1_w = __msa_splati_w((v4i32)coeff, 1); 152 const0 = RET_1_IF_NZERO_H(in3); 153 ILVRL_H2_SH(in3, in1, tmp1, tmp0); 154 vec0_w = vec1_w; 155 vec2_w = vec3_w; 156 DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, 157 vec1_w, vec2_w, vec3_w); 158 SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); 159 PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); 160 in1 += const0; 161 PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1); 162 ST_SH2(temp0, temp1, output, 8); 163 164 PCKOD_D2_SH(in1, in0, in3, in2, in0, in2); 165 ST_SH2(in0, in2, output + 16, 8); 166 } 167 168 void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { 169 v8i16 in0_h, in1_h, in2_h, in3_h; 170 v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3; 171 172 LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h); 173 TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h); 174 175 UNPCK_R_SH_SW(in0_h, in0_w); 176 UNPCK_R_SH_SW(in1_h, in1_w); 177 UNPCK_R_SH_SW(in2_h, in2_w); 178 UNPCK_R_SH_SW(in3_h, in3_w); 179 BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); 180 SLLI_4V(temp0, temp1, temp2, temp3, 2); 181 BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); 182 temp0 = RET_1_IF_NZERO_W(temp0); 183 in0_w += temp0; 184 TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w); 185 186 BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); 187 BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); 188 in0_w += RET_1_IF_NEG_W(in0_w); 189 in1_w += RET_1_IF_NEG_W(in1_w); 190 in2_w += RET_1_IF_NEG_W(in2_w); 191 in3_w += RET_1_IF_NEG_W(in3_w); 192 ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w); 193 SRA_4V(in0_w, in1_w, in2_w, in3_w, 3); 194 PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h); 195 ST_SH2(in0_h, in1_h, output, 8); 196 } 197