1 /* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <stdlib.h> 12 13 #include "./vpx_dsp_rtcd.h" 14 #include "vpx/vpx_integer.h" 15 #include "vpx_dsp/mips/common_dspr2.h" 16 #include "vpx_dsp/mips/loopfilter_filters_dspr2.h" 17 #include "vpx_dsp/mips/loopfilter_macros_dspr2.h" 18 #include "vpx_dsp/mips/loopfilter_masks_dspr2.h" 19 #include "vpx_mem/vpx_mem.h" 20 21 #if HAVE_DSPR2 22 void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, 23 const uint8_t *blimit, const uint8_t *limit, 24 const uint8_t *thresh) { 25 uint8_t i; 26 uint32_t mask; 27 uint32_t hev; 28 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 29 uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 30 uint32_t thresh_vec, flimit_vec, limit_vec; 31 uint32_t uflimit, ulimit, uthresh; 32 33 uflimit = *blimit; 34 ulimit = *limit; 35 uthresh = *thresh; 36 37 /* create quad-byte */ 38 __asm__ __volatile__( 39 "replv.qb %[thresh_vec], %[uthresh] \n\t" 40 "replv.qb %[flimit_vec], %[uflimit] \n\t" 41 "replv.qb %[limit_vec], %[ulimit] \n\t" 42 43 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), 44 [limit_vec] "=r"(limit_vec) 45 : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); 46 47 /* prefetch data for store */ 48 prefetch_store(s); 49 50 /* loop filter designed to work using chars so that we can make maximum use 51 of 8 bit simd instructions. */ 52 for (i = 0; i < 2; i++) { 53 sm1 = s - (pitch << 2); 54 s0 = sm1 + pitch; 55 s1 = s0 + pitch; 56 s2 = s - pitch; 57 s3 = s; 58 s4 = s + pitch; 59 s5 = s4 + pitch; 60 s6 = s5 + pitch; 61 62 __asm__ __volatile__( 63 "lw %[p1], (%[s1]) \n\t" 64 "lw %[p2], (%[s2]) \n\t" 65 "lw %[p3], (%[s3]) \n\t" 66 "lw %[p4], (%[s4]) \n\t" 67 68 : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) 69 : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); 70 71 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 72 mask will be zero and filtering is not needed */ 73 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 74 __asm__ __volatile__( 75 "lw %[pm1], (%[sm1]) \n\t" 76 "lw %[p0], (%[s0]) \n\t" 77 "lw %[p5], (%[s5]) \n\t" 78 "lw %[p6], (%[s6]) \n\t" 79 80 : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) 81 : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); 82 83 filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, 84 p6, thresh_vec, &hev, &mask); 85 86 /* if mask == 0 do filtering is not needed */ 87 if (mask) { 88 /* filtering */ 89 filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); 90 91 __asm__ __volatile__( 92 "sw %[p1], (%[s1]) \n\t" 93 "sw %[p2], (%[s2]) \n\t" 94 "sw %[p3], (%[s3]) \n\t" 95 "sw %[p4], (%[s4]) \n\t" 96 97 : 98 : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), 99 [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); 100 } 101 } 102 103 s = s + 4; 104 } 105 } 106 107 void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, 108 const uint8_t *blimit, const uint8_t *limit, 109 const uint8_t *thresh) { 110 uint8_t i; 111 uint32_t mask, hev; 112 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 113 uint8_t *s1, *s2, *s3, *s4; 114 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 115 uint32_t thresh_vec, flimit_vec, limit_vec; 116 uint32_t uflimit, ulimit, uthresh; 117 118 uflimit = *blimit; 119 ulimit = *limit; 120 uthresh = *thresh; 121 122 /* create quad-byte */ 123 __asm__ __volatile__( 124 "replv.qb %[thresh_vec], %[uthresh] \n\t" 125 "replv.qb %[flimit_vec], %[uflimit] \n\t" 126 "replv.qb %[limit_vec], %[ulimit] \n\t" 127 128 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), 129 [limit_vec] "=r"(limit_vec) 130 : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); 131 132 /* prefetch data for store */ 133 prefetch_store(s + pitch); 134 135 for (i = 0; i < 2; i++) { 136 s1 = s; 137 s2 = s + pitch; 138 s3 = s2 + pitch; 139 s4 = s3 + pitch; 140 s = s4 + pitch; 141 142 /* load quad-byte vectors 143 * memory is 4 byte aligned 144 */ 145 p2 = *((uint32_t *)(s1 - 4)); 146 p6 = *((uint32_t *)(s1)); 147 p1 = *((uint32_t *)(s2 - 4)); 148 p5 = *((uint32_t *)(s2)); 149 p0 = *((uint32_t *)(s3 - 4)); 150 p4 = *((uint32_t *)(s3)); 151 pm1 = *((uint32_t *)(s4 - 4)); 152 p3 = *((uint32_t *)(s4)); 153 154 /* transpose pm1, p0, p1, p2 */ 155 __asm__ __volatile__( 156 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 157 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 158 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 159 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 160 161 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 162 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 163 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 164 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 165 166 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 167 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 168 "append %[p1], %[sec3], 16 \n\t" 169 "append %[pm1], %[sec4], 16 \n\t" 170 171 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 172 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 173 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 174 :); 175 176 /* transpose p3, p4, p5, p6 */ 177 __asm__ __volatile__( 178 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 179 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 180 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 181 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 182 183 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 184 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 185 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 186 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 187 188 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 189 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 190 "append %[p5], %[sec3], 16 \n\t" 191 "append %[p3], %[sec4], 16 \n\t" 192 193 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 194 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 195 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 196 :); 197 198 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 199 * mask will be zero and filtering is not needed 200 */ 201 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 202 filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, 203 p6, thresh_vec, &hev, &mask); 204 205 /* if mask == 0 do filtering is not needed */ 206 if (mask) { 207 /* filtering */ 208 filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); 209 210 /* unpack processed 4x4 neighborhood 211 * don't use transpose on output data 212 * because memory isn't aligned 213 */ 214 __asm__ __volatile__( 215 "sb %[p4], 1(%[s4]) \n\t" 216 "sb %[p3], 0(%[s4]) \n\t" 217 "sb %[p2], -1(%[s4]) \n\t" 218 "sb %[p1], -2(%[s4]) \n\t" 219 220 : 221 : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), 222 [s4] "r"(s4)); 223 224 __asm__ __volatile__( 225 "srl %[p4], %[p4], 8 \n\t" 226 "srl %[p3], %[p3], 8 \n\t" 227 "srl %[p2], %[p2], 8 \n\t" 228 "srl %[p1], %[p1], 8 \n\t" 229 230 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 231 :); 232 233 __asm__ __volatile__( 234 "sb %[p4], 1(%[s3]) \n\t" 235 "sb %[p3], 0(%[s3]) \n\t" 236 "sb %[p2], -1(%[s3]) \n\t" 237 "sb %[p1], -2(%[s3]) \n\t" 238 239 : [p1] "+r"(p1) 240 : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); 241 242 __asm__ __volatile__( 243 "srl %[p4], %[p4], 8 \n\t" 244 "srl %[p3], %[p3], 8 \n\t" 245 "srl %[p2], %[p2], 8 \n\t" 246 "srl %[p1], %[p1], 8 \n\t" 247 248 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 249 :); 250 251 __asm__ __volatile__( 252 "sb %[p4], 1(%[s2]) \n\t" 253 "sb %[p3], 0(%[s2]) \n\t" 254 "sb %[p2], -1(%[s2]) \n\t" 255 "sb %[p1], -2(%[s2]) \n\t" 256 257 : 258 : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), 259 [s2] "r"(s2)); 260 261 __asm__ __volatile__( 262 "srl %[p4], %[p4], 8 \n\t" 263 "srl %[p3], %[p3], 8 \n\t" 264 "srl %[p2], %[p2], 8 \n\t" 265 "srl %[p1], %[p1], 8 \n\t" 266 267 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 268 :); 269 270 __asm__ __volatile__( 271 "sb %[p4], 1(%[s1]) \n\t" 272 "sb %[p3], 0(%[s1]) \n\t" 273 "sb %[p2], -1(%[s1]) \n\t" 274 "sb %[p1], -2(%[s1]) \n\t" 275 276 : 277 : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), 278 [s1] "r"(s1)); 279 } 280 } 281 } 282 } 283 284 void vpx_lpf_horizontal_4_dual_dspr2( 285 uint8_t *s, int p /* pitch */, const uint8_t *blimit0, 286 const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, 287 const uint8_t *limit1, const uint8_t *thresh1) { 288 vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); 289 vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); 290 } 291 292 void vpx_lpf_horizontal_8_dual_dspr2( 293 uint8_t *s, int p /* pitch */, const uint8_t *blimit0, 294 const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, 295 const uint8_t *limit1, const uint8_t *thresh1) { 296 vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); 297 vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); 298 } 299 300 void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, 301 const uint8_t *limit0, 302 const uint8_t *thresh0, 303 const uint8_t *blimit1, 304 const uint8_t *limit1, 305 const uint8_t *thresh1) { 306 vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); 307 vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); 308 } 309 310 void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, 311 const uint8_t *limit0, 312 const uint8_t *thresh0, 313 const uint8_t *blimit1, 314 const uint8_t *limit1, 315 const uint8_t *thresh1) { 316 vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); 317 vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); 318 } 319 320 void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, 321 const uint8_t *limit, 322 const uint8_t *thresh) { 323 vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); 324 vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); 325 } 326 #endif // #if HAVE_DSPR2 327