/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *SSE, int *Sum);

void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0;
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // processing 16 rows horizontally each call
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
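
// Note on the full-pel wrappers below: variance_avx2() accumulates the block
// SSE and the block sum, and each wrapper then forms
//   variance = SSE - sum^2 / N,
// with N the number of pixels in the block. The division is a right shift by
// log2(N), e.g. >> 8 for 16x16 (256 pixels) and >> 12 for 64x64 (4096 pixels).
// For 32x16 and larger blocks sum^2 can exceed 32 bits (|sum| may reach
// 255 * 512 = 130560), so the product is widened to int64_t; for 16x16 the
// worst case (255 * 256)^2 still fits in an unsigned 32-bit value, hence the
// (unsigned int) cast there.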

unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;
  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sse0, &sum0);
  *sse = sse0;
  return sse0;
}

unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 9));
}

unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);

  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}
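
// Note on the sub-pixel wrappers below: the 32xh kernels handle one
// 32-pixel-wide column of the block, returning its sum of differences (se)
// and sum of squared differences (sse). 64-wide blocks are therefore covered
// with two column passes (src and src + 32) whose se/sse are added before the
// variance is formed, again as sse - se^2 / N with N = 4096 (>> 12) for 64x64
// and N = 1024 (>> 10) for 32x32. The avg variants additionally take a
// second predictor (sec) that the kernel averages with the filtered source.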

unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // processing 32 elements in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // processing the next 32 elements in parallel
  unsigned int sse2;
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // processing 32 elements in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}

unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // processing 32 elements in parallel
  unsigned int sse;

  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  unsigned int sse2;
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;

  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // processing 32 elements in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}