1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_dsp_rtcd.h" 12 #include "vpx_dsp/mips/macros_msa.h" 13 14 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, 15 const uint8_t *pred_ptr, int32_t pred_stride, 16 int16_t *diff_ptr, int32_t diff_stride) { 17 uint32_t src0, src1, src2, src3; 18 uint32_t pred0, pred1, pred2, pred3; 19 v16i8 src = { 0 }; 20 v16i8 pred = { 0 }; 21 v16u8 src_l0, src_l1; 22 v8i16 diff0, diff1; 23 24 LW4(src_ptr, src_stride, src0, src1, src2, src3); 25 LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); 26 INSERT_W4_SB(src0, src1, src2, src3, src); 27 INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); 28 ILVRL_B2_UB(src, pred, src_l0, src_l1); 29 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 30 ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); 31 } 32 33 static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, 34 const uint8_t *pred_ptr, int32_t pred_stride, 35 int16_t *diff_ptr, int32_t diff_stride) { 36 uint32_t loop_cnt; 37 uint64_t src0, src1, pred0, pred1; 38 v16i8 src = { 0 }; 39 v16i8 pred = { 0 }; 40 v16u8 src_l0, src_l1; 41 v8i16 diff0, diff1; 42 43 for (loop_cnt = 4; loop_cnt--;) { 44 LD2(src_ptr, src_stride, src0, src1); 45 src_ptr += (2 * src_stride); 46 LD2(pred_ptr, pred_stride, pred0, pred1); 47 pred_ptr += (2 * pred_stride); 48 49 INSERT_D2_SB(src0, src1, src); 50 INSERT_D2_SB(pred0, pred1, pred); 51 ILVRL_B2_UB(src, pred, src_l0, src_l1); 52 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 53 ST_SH2(diff0, diff1, diff_ptr, diff_stride); 54 diff_ptr += (2 * diff_stride); 55 } 56 } 57 58 static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, 59 const uint8_t *pred, int32_t pred_stride, 60 int16_t *diff, int32_t diff_stride) { 61 int8_t count; 62 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 63 v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; 64 v16u8 src_l0, src_l1; 65 v8i16 diff0, diff1; 66 67 for (count = 2; count--;) { 68 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 69 src += (8 * src_stride); 70 71 LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, 72 pred7); 73 pred += (8 * pred_stride); 74 75 ILVRL_B2_UB(src0, pred0, src_l0, src_l1); 76 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 77 ST_SH2(diff0, diff1, diff, 8); 78 diff += diff_stride; 79 80 ILVRL_B2_UB(src1, pred1, src_l0, src_l1); 81 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 82 ST_SH2(diff0, diff1, diff, 8); 83 diff += diff_stride; 84 85 ILVRL_B2_UB(src2, pred2, src_l0, src_l1); 86 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 87 ST_SH2(diff0, diff1, diff, 8); 88 diff += diff_stride; 89 90 ILVRL_B2_UB(src3, pred3, src_l0, src_l1); 91 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 92 ST_SH2(diff0, diff1, diff, 8); 93 diff += diff_stride; 94 95 ILVRL_B2_UB(src4, pred4, src_l0, src_l1); 96 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 97 ST_SH2(diff0, diff1, diff, 8); 98 diff += diff_stride; 99 100 ILVRL_B2_UB(src5, pred5, src_l0, src_l1); 101 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 102 ST_SH2(diff0, diff1, diff, 8); 103 diff += diff_stride; 104 105 ILVRL_B2_UB(src6, pred6, src_l0, src_l1); 106 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 107 ST_SH2(diff0, diff1, diff, 8); 108 diff += diff_stride; 109 110 ILVRL_B2_UB(src7, pred7, src_l0, src_l1); 111 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 112 ST_SH2(diff0, diff1, diff, 8); 113 diff += diff_stride; 114 } 115 } 116 117 static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, 118 const uint8_t *pred, int32_t pred_stride, 119 int16_t *diff, int32_t diff_stride) { 120 uint32_t loop_cnt; 121 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 122 v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; 123 v16u8 src_l0, src_l1; 124 v8i16 diff0, diff1; 125 126 for (loop_cnt = 8; loop_cnt--;) { 127 LD_SB2(src, 16, src0, src1); 128 src += src_stride; 129 LD_SB2(src, 16, src2, src3); 130 src += src_stride; 131 LD_SB2(src, 16, src4, src5); 132 src += src_stride; 133 LD_SB2(src, 16, src6, src7); 134 src += src_stride; 135 136 LD_SB2(pred, 16, pred0, pred1); 137 pred += pred_stride; 138 LD_SB2(pred, 16, pred2, pred3); 139 pred += pred_stride; 140 LD_SB2(pred, 16, pred4, pred5); 141 pred += pred_stride; 142 LD_SB2(pred, 16, pred6, pred7); 143 pred += pred_stride; 144 145 ILVRL_B2_UB(src0, pred0, src_l0, src_l1); 146 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 147 ST_SH2(diff0, diff1, diff, 8); 148 ILVRL_B2_UB(src1, pred1, src_l0, src_l1); 149 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 150 ST_SH2(diff0, diff1, diff + 16, 8); 151 diff += diff_stride; 152 153 ILVRL_B2_UB(src2, pred2, src_l0, src_l1); 154 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 155 ST_SH2(diff0, diff1, diff, 8); 156 ILVRL_B2_UB(src3, pred3, src_l0, src_l1); 157 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 158 ST_SH2(diff0, diff1, diff + 16, 8); 159 diff += diff_stride; 160 161 ILVRL_B2_UB(src4, pred4, src_l0, src_l1); 162 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 163 ST_SH2(diff0, diff1, diff, 8); 164 ILVRL_B2_UB(src5, pred5, src_l0, src_l1); 165 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 166 ST_SH2(diff0, diff1, diff + 16, 8); 167 diff += diff_stride; 168 169 ILVRL_B2_UB(src6, pred6, src_l0, src_l1); 170 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 171 ST_SH2(diff0, diff1, diff, 8); 172 ILVRL_B2_UB(src7, pred7, src_l0, src_l1); 173 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 174 ST_SH2(diff0, diff1, diff + 16, 8); 175 diff += diff_stride; 176 } 177 } 178 179 static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, 180 const uint8_t *pred, int32_t pred_stride, 181 int16_t *diff, int32_t diff_stride) { 182 uint32_t loop_cnt; 183 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; 184 v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; 185 v16u8 src_l0, src_l1; 186 v8i16 diff0, diff1; 187 188 for (loop_cnt = 32; loop_cnt--;) { 189 LD_SB4(src, 16, src0, src1, src2, src3); 190 src += src_stride; 191 LD_SB4(src, 16, src4, src5, src6, src7); 192 src += src_stride; 193 194 LD_SB4(pred, 16, pred0, pred1, pred2, pred3); 195 pred += pred_stride; 196 LD_SB4(pred, 16, pred4, pred5, pred6, pred7); 197 pred += pred_stride; 198 199 ILVRL_B2_UB(src0, pred0, src_l0, src_l1); 200 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 201 ST_SH2(diff0, diff1, diff, 8); 202 ILVRL_B2_UB(src1, pred1, src_l0, src_l1); 203 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 204 ST_SH2(diff0, diff1, diff + 16, 8); 205 ILVRL_B2_UB(src2, pred2, src_l0, src_l1); 206 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 207 ST_SH2(diff0, diff1, diff + 32, 8); 208 ILVRL_B2_UB(src3, pred3, src_l0, src_l1); 209 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 210 ST_SH2(diff0, diff1, diff + 48, 8); 211 diff += diff_stride; 212 213 ILVRL_B2_UB(src4, pred4, src_l0, src_l1); 214 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 215 ST_SH2(diff0, diff1, diff, 8); 216 ILVRL_B2_UB(src5, pred5, src_l0, src_l1); 217 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 218 ST_SH2(diff0, diff1, diff + 16, 8); 219 ILVRL_B2_UB(src6, pred6, src_l0, src_l1); 220 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 221 ST_SH2(diff0, diff1, diff + 32, 8); 222 ILVRL_B2_UB(src7, pred7, src_l0, src_l1); 223 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); 224 ST_SH2(diff0, diff1, diff + 48, 8); 225 diff += diff_stride; 226 } 227 } 228 229 void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, 230 ptrdiff_t diff_stride, const uint8_t *src_ptr, 231 ptrdiff_t src_stride, const uint8_t *pred_ptr, 232 ptrdiff_t pred_stride) { 233 if (rows == cols) { 234 switch (rows) { 235 case 4: 236 sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, 237 diff_stride); 238 break; 239 case 8: 240 sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, 241 diff_stride); 242 break; 243 case 16: 244 sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, 245 diff_stride); 246 break; 247 case 32: 248 sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, 249 diff_stride); 250 break; 251 case 64: 252 sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, 253 diff_stride); 254 break; 255 default: 256 vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, 257 src_stride, pred_ptr, pred_stride); 258 break; 259 } 260 } else { 261 vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, 262 pred_ptr, pred_stride); 263 } 264 } 265