/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

/* Copy an 8-pixel-wide block. The height is handled in unrolled groups of
 * 12, 8, 4 or 2 rows; each row moves 8 bytes extracted from the low 64 bits
 * of the vector load. */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    for (cnt = height >> 3; cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 4) {
    for (cnt = (height / 4); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 2) {
    for (cnt = (height / 2); cnt--;) {
      LD_UB2(src, src_stride, src0, src1);
      src += (2 * src_stride);
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);

      SD(out0, dst);
      dst += dst_stride;
      SD(out1, dst);
      dst += dst_stride;
    }
  }
}

/* Copy a block whose width is a multiple of 16 and whose height is a
 * multiple of 8, processing one 16-byte-wide column strip at a time,
 * 8 rows per iteration. */
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width) {
  int32_t cnt, loop_cnt;
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  for (cnt = (width >> 4); cnt--;) {
    src_tmp = src;
    dst_tmp = dst;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
      LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
             src7);
      src_tmp += (8 * src_stride);

      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
             dst_stride);
      dst_tmp += (8 * dst_stride);
    }

    src += 16;
    dst += 16;
  }
}

/* Copy a 16-pixel-wide block using full-vector loads and stores. */
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);
      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
      dst += (8 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
  } else if (0 == height % 4) {
    for (cnt = (height >> 2); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  }
}

/* Copy a 32-pixel-wide block as two adjacent 16-byte strips per row. */
static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    for (cnt = (height / 12); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
  } else if (0 == height % 4) {
    for (cnt = (height >> 2); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);
    }
  }
}

static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}

/* Straight block copy: the interpolation-filter arguments are unused, so a
 * w x h block is simply copied from src to dst, dispatching on width. */
void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4,
                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                           int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 4: {
      uint32_t cnt, tmp;
      /* width 4: copy one 32-bit word per row */
      for (cnt = h; cnt--;) {
        tmp = LW(src);
        SW(tmp, dst);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      copy_width8_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 16: {
      copy_width16_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_width32_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_width64_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      uint32_t cnt;
      /* other widths fall back to a per-row memcpy */
      for (cnt = h; cnt--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}
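
/* Usage sketch (illustrative only, not part of the library): because this
 * MSA implementation explicitly discards the filter-related arguments via
 * the (void) casts above, a caller that only needs a block copy can pass
 * placeholder values for them. The buffer names and the 16x16 geometry
 * below are hypothetical.
 *
 *   uint8_t src_buf[16 * 16];
 *   uint8_t dst_buf[16 * 16];
 *   // ... fill src_buf with pixel data ...
 *   vpx_convolve_copy_msa(src_buf, 16, dst_buf, 16, NULL, 0, 0, 0, 0, 16, 16);
 */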