1 /* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_dsp_rtcd.h" 12 #include "vpx_dsp/variance.h" 13 #include "vpx_ports/mem.h" 14 #include "vpx/vpx_integer.h" 15 #include "vpx_ports/asmdefs_mmi.h" 16 17 static const uint8_t bilinear_filters[8][2] = { 18 { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, 19 { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, 20 }; 21 22 /* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, 23 vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. */ 24 #define VARIANCE_SSE_SUM_8_FOR_W64 \ 25 /* sse */ \ 26 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ 27 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ 28 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ 29 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ 30 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ 31 "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ 32 "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ 33 \ 34 /* sum */ \ 35 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 36 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ 37 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ 38 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ 39 "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ 40 "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ 41 "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ 42 "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 43 "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ 44 "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ 45 "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ 46 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ 47 "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ 48 "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ 49 "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ 50 "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ 51 "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ 52 "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ 53 "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ 54 "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" 55 56 #define VARIANCE_SSE_SUM_4 \ 57 /* sse */ \ 58 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ 59 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ 60 "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ 61 "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ 62 \ 63 /* sum */ \ 64 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 65 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ 66 "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ 67 "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" 68 69 #define VARIANCE_SSE_SUM_8 \ 70 /* sse */ \ 71 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ 72 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ 73 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ 74 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ 75 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ 76 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ 77 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ 78 \ 79 /* sum */ \ 80 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 81 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ 82 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ 83 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ 84 "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ 85 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ 86 "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ 87 "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" 88 89 #define VARIANCE_SSE_8 \ 90 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ 91 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ 92 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ 93 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ 94 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ 95 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ 96 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ 97 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ 98 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ 99 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ 100 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" 101 102 #define VARIANCE_SSE_16 \ 103 VARIANCE_SSE_8 \ 104 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ 105 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ 106 "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ 107 "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ 108 "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ 109 "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ 110 "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ 111 "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ 112 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ 113 "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ 114 "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" 115 116 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ 117 /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ 118 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ 119 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ 120 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 121 "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ 122 "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ 123 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 124 "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ 125 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ 126 "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ 127 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 128 "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 129 130 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ 131 /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ 132 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ 133 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ 134 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ 135 "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ 136 "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ 137 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 138 "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ 139 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ 140 "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ 141 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 142 "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" 143 144 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ 145 /* calculate: temp2[0] ~ temp2[3] */ \ 146 "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ 147 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ 148 "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ 149 "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ 150 "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ 151 \ 152 /* store: temp2[0] ~ temp2[3] */ \ 153 "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ 154 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 155 "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" 156 157 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ 158 /* calculate: temp2[0] ~ temp2[3] */ \ 159 "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ 160 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ 161 "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ 162 "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ 163 "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ 164 \ 165 /* store: temp2[0] ~ temp2[3] */ \ 166 "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ 167 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 168 "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" 169 170 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ 171 /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ 172 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ 173 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ 174 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ 175 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ 176 "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ 177 "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ 178 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ 179 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 180 "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ 181 "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ 182 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ 183 "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ 184 "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ 185 "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ 186 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ 187 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ 188 "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ 189 "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" 190 191 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ 192 /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ 193 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ 194 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ 195 "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ 196 "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ 197 "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ 198 "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ 199 "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ 200 "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ 201 "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ 202 "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ 203 "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ 204 "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ 205 "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ 206 "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ 207 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ 208 "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ 209 "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ 210 "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" 211 212 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ 213 /* calculate: temp2[0] ~ temp2[3] */ \ 214 "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ 215 "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ 216 "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ 217 "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ 218 "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ 219 \ 220 /* calculate: temp2[4] ~ temp2[7] */ \ 221 "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ 222 "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ 223 "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ 224 "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ 225 "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ 226 \ 227 /* store: temp2[0] ~ temp2[7] */ \ 228 "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ 229 "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ 230 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 231 "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ 232 "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" 233 234 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ 235 /* calculate: temp2[0] ~ temp2[3] */ \ 236 "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ 237 "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ 238 "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ 239 "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ 240 "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ 241 \ 242 /* calculate: temp2[4] ~ temp2[7] */ \ 243 "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ 244 "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ 245 "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ 246 "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ 247 "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ 248 \ 249 /* store: temp2[0] ~ temp2[7] */ \ 250 "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ 251 "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ 252 "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ 253 "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ 254 "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" 255 256 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ 257 /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ 258 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ 259 \ 260 /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ 261 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ 262 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ 263 "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ 264 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 265 "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ 266 "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ 267 "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ 268 "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ 269 "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ 270 "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ 271 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ 272 "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ 273 "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ 274 "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ 275 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ 276 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 277 "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ 278 "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" 279 280 #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ 281 /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ 282 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ 283 \ 284 /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ 285 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ 286 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ 287 "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ 288 "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ 289 "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ 290 "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ 291 "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ 292 "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ 293 "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ 294 "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ 295 "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ 296 "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ 297 "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ 298 "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ 299 "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ 300 "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ 301 "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ 302 "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" 303 304 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ 305 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ 306 \ 307 /* calculate: temp2[8] ~ temp2[11] */ \ 308 "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ 309 "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ 310 "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ 311 "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ 312 "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ 313 \ 314 /* calculate: temp2[12] ~ temp2[15] */ \ 315 "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ 316 "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ 317 "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ 318 "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ 319 "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ 320 \ 321 /* store: temp2[8] ~ temp2[15] */ \ 322 "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ 323 "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ 324 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 325 "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ 326 "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" 327 328 #define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ 329 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ 330 \ 331 /* calculate: temp2[8] ~ temp2[11] */ \ 332 "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ 333 "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ 334 "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ 335 "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ 336 "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ 337 \ 338 /* calculate: temp2[12] ~ temp2[15] */ \ 339 "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ 340 "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ 341 "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ 342 "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ 343 "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ 344 \ 345 /* store: temp2[8] ~ temp2[15] */ \ 346 "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ 347 "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ 348 "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ 349 "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ 350 "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" 351 352 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal 353 // or vertical direction to produce the filtered output block. Used to implement 354 // the first-pass of 2-D separable filter. 355 // 356 // Produces int16_t output to retain precision for the next pass. Two filter 357 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is 358 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). 359 // It defines the offset required to move from one input to the next. 360 static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, 361 unsigned int src_pixels_per_line, 362 int pixel_step, 363 unsigned int output_height, 364 unsigned int output_width, 365 const uint8_t *filter) { 366 unsigned int i, j; 367 368 for (i = 0; i < output_height; ++i) { 369 for (j = 0; j < output_width; ++j) { 370 b[j] = ROUND_POWER_OF_TWO( 371 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); 372 373 ++a; 374 } 375 376 a += src_pixels_per_line - output_width; 377 b += output_width; 378 } 379 } 380 381 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal 382 // or vertical direction to produce the filtered output block. Used to implement 383 // the second-pass of 2-D separable filter. 384 // 385 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two 386 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the 387 // filter is applied horizontally (pixel_step = 1) or vertically 388 // (pixel_step = stride). It defines the offset required to move from one input 389 // to the next. Output is 8-bit. 390 static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, 391 unsigned int src_pixels_per_line, 392 unsigned int pixel_step, 393 unsigned int output_height, 394 unsigned int output_width, 395 const uint8_t *filter) { 396 unsigned int i, j; 397 398 for (i = 0; i < output_height; ++i) { 399 for (j = 0; j < output_width; ++j) { 400 b[j] = ROUND_POWER_OF_TWO( 401 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); 402 ++a; 403 } 404 405 a += src_pixels_per_line - output_width; 406 b += output_width; 407 } 408 } 409 410 static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, 411 const uint8_t *b, int b_stride, 412 uint32_t *sse, int high) { 413 int sum; 414 double ftmp[12]; 415 uint32_t tmp[3]; 416 417 *sse = 0; 418 419 __asm__ volatile ( 420 "li %[tmp0], 0x20 \n\t" 421 "mtc1 %[tmp0], %[ftmp11] \n\t" 422 MMI_L(%[tmp0], %[high], 0x00) 423 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 424 "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" 425 "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" 426 "1: \n\t" 427 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 428 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 429 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 430 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 431 VARIANCE_SSE_SUM_8_FOR_W64 432 433 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" 434 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" 435 "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" 436 "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" 437 VARIANCE_SSE_SUM_8_FOR_W64 438 439 "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" 440 "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" 441 "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" 442 "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" 443 VARIANCE_SSE_SUM_8_FOR_W64 444 445 "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" 446 "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" 447 "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" 448 "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" 449 VARIANCE_SSE_SUM_8_FOR_W64 450 451 "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" 452 "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" 453 "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" 454 "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" 455 VARIANCE_SSE_SUM_8_FOR_W64 456 457 "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" 458 "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" 459 "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" 460 "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" 461 VARIANCE_SSE_SUM_8_FOR_W64 462 463 "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" 464 "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" 465 "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" 466 "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" 467 VARIANCE_SSE_SUM_8_FOR_W64 468 469 "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" 470 "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" 471 "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" 472 "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" 473 VARIANCE_SSE_SUM_8_FOR_W64 474 475 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 476 MMI_ADDU(%[a], %[a], %[a_stride]) 477 MMI_ADDU(%[b], %[b], %[b_stride]) 478 "bnez %[tmp0], 1b \n\t" 479 480 "mfc1 %[tmp1], %[ftmp9] \n\t" 481 "mfhc1 %[tmp2], %[ftmp9] \n\t" 482 "addu %[sum], %[tmp1], %[tmp2] \n\t" 483 "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" 484 "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" 485 "swc1 %[ftmp1], 0x00(%[sse]) \n\t" 486 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 487 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 488 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 489 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 490 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 491 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 492 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 493 [tmp2]"=&r"(tmp[2]), 494 [a]"+&r"(a), [b]"+&r"(b), 495 [sum]"=&r"(sum) 496 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 497 [high]"r"(&high), [sse]"r"(sse) 498 : "memory" 499 ); 500 501 return *sse - (((int64_t)sum * sum) / (64 * high)); 502 } 503 504 #define VPX_VARIANCE64XN(n) \ 505 uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ 506 const uint8_t *b, int b_stride, \ 507 uint32_t *sse) { \ 508 return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ 509 } 510 511 VPX_VARIANCE64XN(64) 512 VPX_VARIANCE64XN(32) 513 514 uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, 515 int b_stride, uint32_t *sse) { 516 int sum; 517 double ftmp[12]; 518 uint32_t tmp[3]; 519 520 *sse = 0; 521 522 __asm__ volatile ( 523 "li %[tmp0], 0x20 \n\t" 524 "mtc1 %[tmp0], %[ftmp11] \n\t" 525 "li %[tmp0], 0x40 \n\t" 526 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 527 "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" 528 "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" 529 "1: \n\t" 530 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 531 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 532 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 533 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 534 VARIANCE_SSE_SUM_8_FOR_W64 535 536 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" 537 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" 538 "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" 539 "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" 540 VARIANCE_SSE_SUM_8_FOR_W64 541 542 "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" 543 "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" 544 "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" 545 "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" 546 VARIANCE_SSE_SUM_8_FOR_W64 547 548 "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" 549 "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" 550 "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" 551 "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" 552 VARIANCE_SSE_SUM_8_FOR_W64 553 554 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 555 MMI_ADDU(%[a], %[a], %[a_stride]) 556 MMI_ADDU(%[b], %[b], %[b_stride]) 557 "bnez %[tmp0], 1b \n\t" 558 559 "mfc1 %[tmp1], %[ftmp9] \n\t" 560 "mfhc1 %[tmp2], %[ftmp9] \n\t" 561 "addu %[sum], %[tmp1], %[tmp2] \n\t" 562 "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" 563 "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" 564 "swc1 %[ftmp1], 0x00(%[sse]) \n\t" 565 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 566 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 567 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 568 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 569 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 570 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 571 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), 572 [tmp2]"=&r"(tmp[2]), 573 [a]"+&r"(a), [b]"+&r"(b), 574 [sum]"=&r"(sum) 575 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 576 [sse]"r"(sse) 577 : "memory" 578 ); 579 580 return *sse - (((int64_t)sum * sum) / 2048); 581 } 582 583 static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, 584 const uint8_t *b, int b_stride, 585 uint32_t *sse, int high) { 586 int sum; 587 double ftmp[13]; 588 uint32_t tmp[3]; 589 590 *sse = 0; 591 592 __asm__ volatile ( 593 "li %[tmp0], 0x20 \n\t" 594 "mtc1 %[tmp0], %[ftmp11] \n\t" 595 MMI_L(%[tmp0], %[high], 0x00) 596 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 597 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 598 "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" 599 "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" 600 "1: \n\t" 601 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 602 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 603 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 604 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 605 VARIANCE_SSE_SUM_8 606 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" 607 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" 608 "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" 609 "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" 610 VARIANCE_SSE_SUM_8 611 "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" 612 "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" 613 "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" 614 "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" 615 VARIANCE_SSE_SUM_8 616 "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" 617 "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" 618 "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" 619 "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" 620 VARIANCE_SSE_SUM_8 621 622 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 623 MMI_ADDU(%[a], %[a], %[a_stride]) 624 MMI_ADDU(%[b], %[b], %[b_stride]) 625 "bnez %[tmp0], 1b \n\t" 626 627 "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" 628 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" 629 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 630 631 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" 632 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" 633 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" 634 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" 635 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 636 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" 637 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" 638 "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" 639 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" 640 "swc1 %[ftmp0], 0x00(%[sum]) \n\t" 641 642 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 643 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 644 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 645 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 646 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 647 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 648 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), 649 [a]"+&r"(a), [b]"+&r"(b) 650 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 651 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) 652 : "memory" 653 ); 654 655 return *sse - (((int64_t)sum * sum) / (32 * high)); 656 } 657 658 #define VPX_VARIANCE32XN(n) \ 659 uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ 660 const uint8_t *b, int b_stride, \ 661 uint32_t *sse) { \ 662 return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ 663 } 664 665 VPX_VARIANCE32XN(32) 666 VPX_VARIANCE32XN(16) 667 668 static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, 669 const uint8_t *b, int b_stride, 670 uint32_t *sse, int high) { 671 int sum; 672 double ftmp[13]; 673 uint32_t tmp[3]; 674 675 *sse = 0; 676 677 __asm__ volatile ( 678 "li %[tmp0], 0x20 \n\t" 679 "mtc1 %[tmp0], %[ftmp11] \n\t" 680 MMI_L(%[tmp0], %[high], 0x00) 681 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 682 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 683 "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" 684 "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" 685 "1: \n\t" 686 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 687 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 688 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 689 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 690 VARIANCE_SSE_SUM_8 691 "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" 692 "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" 693 "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" 694 "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" 695 VARIANCE_SSE_SUM_8 696 697 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 698 MMI_ADDU(%[a], %[a], %[a_stride]) 699 MMI_ADDU(%[b], %[b], %[b_stride]) 700 "bnez %[tmp0], 1b \n\t" 701 702 "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" 703 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" 704 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 705 706 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" 707 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" 708 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" 709 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" 710 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 711 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" 712 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" 713 "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" 714 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" 715 "swc1 %[ftmp0], 0x00(%[sum]) \n\t" 716 717 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 718 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 719 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 720 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 721 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 722 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 723 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), 724 [a]"+&r"(a), [b]"+&r"(b) 725 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 726 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) 727 : "memory" 728 ); 729 730 return *sse - (((int64_t)sum * sum) / (16 * high)); 731 } 732 733 #define VPX_VARIANCE16XN(n) \ 734 uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ 735 const uint8_t *b, int b_stride, \ 736 uint32_t *sse) { \ 737 return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ 738 } 739 740 VPX_VARIANCE16XN(32) 741 VPX_VARIANCE16XN(16) 742 VPX_VARIANCE16XN(8) 743 744 static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, 745 const uint8_t *b, int b_stride, 746 uint32_t *sse, int high) { 747 int sum; 748 double ftmp[13]; 749 uint32_t tmp[3]; 750 751 *sse = 0; 752 753 __asm__ volatile ( 754 "li %[tmp0], 0x20 \n\t" 755 "mtc1 %[tmp0], %[ftmp11] \n\t" 756 MMI_L(%[tmp0], %[high], 0x00) 757 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 758 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 759 "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" 760 "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" 761 "1: \n\t" 762 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 763 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 764 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 765 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 766 VARIANCE_SSE_SUM_8 767 768 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 769 MMI_ADDU(%[a], %[a], %[a_stride]) 770 MMI_ADDU(%[b], %[b], %[b_stride]) 771 "bnez %[tmp0], 1b \n\t" 772 773 "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" 774 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" 775 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 776 777 "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" 778 "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" 779 "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" 780 "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" 781 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 782 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" 783 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" 784 "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" 785 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" 786 "swc1 %[ftmp0], 0x00(%[sum]) \n\t" 787 788 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 789 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 790 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 791 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 792 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 793 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 794 [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), 795 [a]"+&r"(a), [b]"+&r"(b) 796 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 797 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) 798 : "memory" 799 ); 800 801 return *sse - (((int64_t)sum * sum) / (8 * high)); 802 } 803 804 #define VPX_VARIANCE8XN(n) \ 805 uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ 806 const uint8_t *b, int b_stride, \ 807 uint32_t *sse) { \ 808 return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ 809 } 810 811 VPX_VARIANCE8XN(16) 812 VPX_VARIANCE8XN(8) 813 VPX_VARIANCE8XN(4) 814 815 static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, 816 const uint8_t *b, int b_stride, 817 uint32_t *sse, int high) { 818 int sum; 819 double ftmp[12]; 820 uint32_t tmp[3]; 821 822 *sse = 0; 823 824 __asm__ volatile ( 825 "li %[tmp0], 0x20 \n\t" 826 "mtc1 %[tmp0], %[ftmp10] \n\t" 827 MMI_L(%[tmp0], %[high], 0x00) 828 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 829 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" 830 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" 831 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 832 "1: \n\t" 833 "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" 834 "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" 835 "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" 836 "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" 837 VARIANCE_SSE_SUM_4 838 839 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 840 MMI_ADDU(%[a], %[a], %[a_stride]) 841 MMI_ADDU(%[b], %[b], %[b_stride]) 842 "bnez %[tmp0], 1b \n\t" 843 844 "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" 845 "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" 846 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 847 848 "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" 849 "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" 850 "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" 851 "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" 852 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" 853 "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" 854 "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" 855 "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" 856 "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" 857 "swc1 %[ftmp0], 0x00(%[sum]) \n\t" 858 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 859 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 860 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 861 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 862 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 863 [ftmp10]"=&f"(ftmp[10]), 864 [tmp0]"=&r"(tmp[0]), 865 [a]"+&r"(a), [b]"+&r"(b) 866 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 867 [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) 868 : "memory" 869 ); 870 871 return *sse - (((int64_t)sum * sum) / (4 * high)); 872 } 873 874 #define VPX_VARIANCE4XN(n) \ 875 uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ 876 const uint8_t *b, int b_stride, \ 877 uint32_t *sse) { \ 878 return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ 879 } 880 881 VPX_VARIANCE4XN(8) 882 VPX_VARIANCE4XN(4) 883 884 static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, 885 const uint8_t *b, int b_stride, uint32_t *sse, 886 uint64_t high) { 887 double ftmp[12]; 888 uint32_t tmp[1]; 889 890 *sse = 0; 891 892 __asm__ volatile ( 893 "li %[tmp0], 0x20 \n\t" 894 "mtc1 %[tmp0], %[ftmp11] \n\t" 895 MMI_L(%[tmp0], %[high], 0x00) 896 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 897 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 898 899 "1: \n\t" 900 VARIANCE_SSE_16 901 902 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 903 MMI_ADDU(%[a], %[a], %[a_stride]) 904 MMI_ADDU(%[b], %[b], %[b_stride]) 905 "bnez %[tmp0], 1b \n\t" 906 907 "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" 908 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" 909 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 910 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 911 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 912 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 913 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 914 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 915 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 916 [tmp0]"=&r"(tmp[0]), 917 [a]"+&r"(a), [b]"+&r"(b) 918 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 919 [high]"r"(&high), [sse]"r"(sse) 920 : "memory" 921 ); 922 923 return *sse; 924 } 925 926 #define vpx_mse16xN(n) \ 927 uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \ 928 const uint8_t *b, int b_stride, \ 929 uint32_t *sse) { \ 930 return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ 931 } 932 933 vpx_mse16xN(16); 934 vpx_mse16xN(8); 935 936 static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, 937 const uint8_t *b, int b_stride, uint32_t *sse, 938 uint64_t high) { 939 double ftmp[12]; 940 uint32_t tmp[1]; 941 942 *sse = 0; 943 944 __asm__ volatile ( 945 "li %[tmp0], 0x20 \n\t" 946 "mtc1 %[tmp0], %[ftmp11] \n\t" 947 MMI_L(%[tmp0], %[high], 0x00) 948 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 949 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" 950 951 "1: \n\t" 952 VARIANCE_SSE_8 953 954 "addiu %[tmp0], %[tmp0], -0x01 \n\t" 955 MMI_ADDU(%[a], %[a], %[a_stride]) 956 MMI_ADDU(%[b], %[b], %[b_stride]) 957 "bnez %[tmp0], 1b \n\t" 958 959 "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" 960 "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" 961 "swc1 %[ftmp9], 0x00(%[sse]) \n\t" 962 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 963 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 964 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 965 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 966 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 967 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 968 [tmp0]"=&r"(tmp[0]), 969 [a]"+&r"(a), [b]"+&r"(b) 970 : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), 971 [high]"r"(&high), [sse]"r"(sse) 972 : "memory" 973 ); 974 975 return *sse; 976 } 977 978 #define vpx_mse8xN(n) \ 979 uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ 980 const uint8_t *b, int b_stride, uint32_t *sse) { \ 981 return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ 982 } 983 984 vpx_mse8xN(16); 985 vpx_mse8xN(8); 986 987 #define SUBPIX_VAR(W, H) \ 988 uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ 989 const uint8_t *a, int a_stride, int xoffset, int yoffset, \ 990 const uint8_t *b, int b_stride, uint32_t *sse) { \ 991 uint16_t fdata3[(H + 1) * W]; \ 992 uint8_t temp2[H * W]; \ 993 \ 994 var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ 995 bilinear_filters[xoffset]); \ 996 var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ 997 bilinear_filters[yoffset]); \ 998 \ 999 return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ 1000 } 1001 1002 SUBPIX_VAR(64, 64) 1003 SUBPIX_VAR(64, 32) 1004 SUBPIX_VAR(32, 64) 1005 SUBPIX_VAR(32, 32) 1006 SUBPIX_VAR(32, 16) 1007 SUBPIX_VAR(16, 32) 1008 1009 static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, 1010 int xoffset, int yoffset, 1011 uint8_t *temp2, int counter) { 1012 uint8_t *temp2_ptr = temp2; 1013 mips_reg l_counter = counter; 1014 double ftmp[15]; 1015 mips_reg tmp[2]; 1016 DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; 1017 DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; 1018 1019 const uint8_t *filter_x = bilinear_filters[xoffset]; 1020 const uint8_t *filter_y = bilinear_filters[yoffset]; 1021 1022 __asm__ volatile ( 1023 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1024 MMI_LI(%[tmp0], 0x07) 1025 MMI_MTC1(%[tmp0], %[ftmp14]) 1026 "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" 1027 "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" 1028 "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" 1029 "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" 1030 1031 // fdata3: fdata3[0] ~ fdata3[15] 1032 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A 1033 1034 // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] 1035 MMI_ADDU(%[a], %[a], %[a_stride]) 1036 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B 1037 // temp2: temp2[0] ~ temp2[15] 1038 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A 1039 1040 // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] 1041 MMI_ADDU(%[a], %[a], %[a_stride]) 1042 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A 1043 // temp2+16*1: temp2[0] ~ temp2[15] 1044 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) 1045 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B 1046 1047 "1: \n\t" 1048 MMI_ADDU(%[a], %[a], %[a_stride]) 1049 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B 1050 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) 1051 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A 1052 1053 MMI_ADDU(%[a], %[a], %[a_stride]) 1054 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A 1055 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) 1056 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B 1057 "addiu %[counter], %[counter], -0x01 \n\t" 1058 "bnez %[counter], 1b \n\t" 1059 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), 1060 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), 1061 [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), 1062 [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), 1063 [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), 1064 [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), 1065 [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), 1066 [counter]"+&r"(l_counter) 1067 : [filter_x0] "f"((uint64_t)filter_x[0]), 1068 [filter_x1] "f"((uint64_t)filter_x[1]), 1069 [filter_y0] "f"((uint64_t)filter_y[0]), 1070 [filter_y1] "f"((uint64_t)filter_y[1]), 1071 [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), 1072 [mask] "f"(mask) 1073 : "memory" 1074 ); 1075 } 1076 1077 #define SUBPIX_VAR16XN(H) \ 1078 uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ 1079 const uint8_t *a, int a_stride, int xoffset, int yoffset, \ 1080 const uint8_t *b, int b_stride, uint32_t *sse) { \ 1081 uint8_t temp2[16 * H]; \ 1082 var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ 1083 (H - 2) / 2); \ 1084 \ 1085 return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ 1086 } 1087 1088 SUBPIX_VAR16XN(16) 1089 SUBPIX_VAR16XN(8) 1090 1091 static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, 1092 int xoffset, int yoffset, 1093 uint8_t *temp2, int counter) { 1094 uint8_t *temp2_ptr = temp2; 1095 mips_reg l_counter = counter; 1096 double ftmp[15]; 1097 mips_reg tmp[2]; 1098 DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; 1099 DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; 1100 const uint8_t *filter_x = bilinear_filters[xoffset]; 1101 const uint8_t *filter_y = bilinear_filters[yoffset]; 1102 1103 __asm__ volatile ( 1104 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1105 MMI_LI(%[tmp0], 0x07) 1106 MMI_MTC1(%[tmp0], %[ftmp14]) 1107 "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" 1108 "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" 1109 "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" 1110 "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" 1111 1112 // fdata3: fdata3[0] ~ fdata3[7] 1113 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A 1114 1115 // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] 1116 MMI_ADDU(%[a], %[a], %[a_stride]) 1117 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B 1118 // temp2: temp2[0] ~ temp2[7] 1119 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A 1120 1121 // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] 1122 MMI_ADDU(%[a], %[a], %[a_stride]) 1123 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A 1124 // temp2+8*1: temp2[0] ~ temp2[7] 1125 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) 1126 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B 1127 1128 "1: \n\t" 1129 MMI_ADDU(%[a], %[a], %[a_stride]) 1130 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B 1131 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) 1132 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A 1133 1134 MMI_ADDU(%[a], %[a], %[a_stride]) 1135 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A 1136 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) 1137 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B 1138 "addiu %[counter], %[counter], -0x01 \n\t" 1139 "bnez %[counter], 1b \n\t" 1140 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), 1141 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), 1142 [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), 1143 [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), 1144 [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), 1145 [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), 1146 [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), 1147 [counter]"+&r"(l_counter) 1148 : [filter_x0] "f"((uint64_t)filter_x[0]), 1149 [filter_x1] "f"((uint64_t)filter_x[1]), 1150 [filter_y0] "f"((uint64_t)filter_y[0]), 1151 [filter_y1] "f"((uint64_t)filter_y[1]), 1152 [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), 1153 [mask] "f"(mask) 1154 : "memory" 1155 ); 1156 } 1157 1158 #define SUBPIX_VAR8XN(H) \ 1159 uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ 1160 const uint8_t *a, int a_stride, int xoffset, int yoffset, \ 1161 const uint8_t *b, int b_stride, uint32_t *sse) { \ 1162 uint8_t temp2[8 * H]; \ 1163 var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ 1164 (H - 2) / 2); \ 1165 \ 1166 return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ 1167 } 1168 1169 SUBPIX_VAR8XN(16) 1170 SUBPIX_VAR8XN(8) 1171 SUBPIX_VAR8XN(4) 1172 1173 static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, 1174 int xoffset, int yoffset, 1175 uint8_t *temp2, int counter) { 1176 uint8_t *temp2_ptr = temp2; 1177 mips_reg l_counter = counter; 1178 double ftmp[7]; 1179 mips_reg tmp[2]; 1180 DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; 1181 DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; 1182 const uint8_t *filter_x = bilinear_filters[xoffset]; 1183 const uint8_t *filter_y = bilinear_filters[yoffset]; 1184 1185 __asm__ volatile ( 1186 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1187 MMI_LI(%[tmp0], 0x07) 1188 MMI_MTC1(%[tmp0], %[ftmp6]) 1189 "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" 1190 "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" 1191 "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" 1192 "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" 1193 // fdata3: fdata3[0] ~ fdata3[3] 1194 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A 1195 1196 // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] 1197 MMI_ADDU(%[a], %[a], %[a_stride]) 1198 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B 1199 // temp2: temp2[0] ~ temp2[7] 1200 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A 1201 1202 // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] 1203 MMI_ADDU(%[a], %[a], %[a_stride]) 1204 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A 1205 // temp2+4*1: temp2[0] ~ temp2[7] 1206 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) 1207 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B 1208 1209 "1: \n\t" 1210 MMI_ADDU(%[a], %[a], %[a_stride]) 1211 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B 1212 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) 1213 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A 1214 1215 MMI_ADDU(%[a], %[a], %[a_stride]) 1216 VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A 1217 MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) 1218 VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B 1219 "addiu %[counter], %[counter], -0x01 \n\t" 1220 "bnez %[counter], 1b \n\t" 1221 : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), 1222 [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), 1223 [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), 1224 [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) 1225 : [filter_x0] "f"((uint64_t)filter_x[0]), 1226 [filter_x1] "f"((uint64_t)filter_x[1]), 1227 [filter_y0] "f"((uint64_t)filter_y[0]), 1228 [filter_y1] "f"((uint64_t)filter_y[1]), 1229 [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), 1230 [mask] "f"(mask) 1231 : "memory" 1232 ); 1233 } 1234 1235 #define SUBPIX_VAR4XN(H) \ 1236 uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ 1237 const uint8_t *a, int a_stride, int xoffset, int yoffset, \ 1238 const uint8_t *b, int b_stride, uint32_t *sse) { \ 1239 uint8_t temp2[4 * H]; \ 1240 var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ 1241 (H - 2) / 2); \ 1242 \ 1243 return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ 1244 } 1245 1246 SUBPIX_VAR4XN(8) 1247 SUBPIX_VAR4XN(4) 1248 1249 #define SUBPIX_AVG_VAR(W, H) \ 1250 uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ 1251 const uint8_t *a, int a_stride, int xoffset, int yoffset, \ 1252 const uint8_t *b, int b_stride, uint32_t *sse, \ 1253 const uint8_t *second_pred) { \ 1254 uint16_t fdata3[(H + 1) * W]; \ 1255 uint8_t temp2[H * W]; \ 1256 DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ 1257 \ 1258 var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ 1259 bilinear_filters[xoffset]); \ 1260 var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ 1261 bilinear_filters[yoffset]); \ 1262 \ 1263 vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ 1264 \ 1265 return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ 1266 } 1267 1268 SUBPIX_AVG_VAR(64, 64) 1269 SUBPIX_AVG_VAR(64, 32) 1270 SUBPIX_AVG_VAR(32, 64) 1271 SUBPIX_AVG_VAR(32, 32) 1272 SUBPIX_AVG_VAR(32, 16) 1273 SUBPIX_AVG_VAR(16, 32) 1274 SUBPIX_AVG_VAR(16, 16) 1275 SUBPIX_AVG_VAR(16, 8) 1276 SUBPIX_AVG_VAR(8, 16) 1277 SUBPIX_AVG_VAR(8, 8) 1278 SUBPIX_AVG_VAR(8, 4) 1279 SUBPIX_AVG_VAR(4, 8) 1280 SUBPIX_AVG_VAR(4, 4) 1281