/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"

typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);

// Accumulates the sum of squared differences (*sse) and the sum of
// differences (*sum) over a w x h block by tiling it with var_fn, which
// covers block_size columns by 16 rows per call.
static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride,
             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vpx_get16x16var_avx2, 16);
  // variance = sse - sum^2 / (w * h); here w * h = 256 = 2^8.
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  // MSE is the raw sum of squared differences; the mean is not removed.
  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get32x32var_avx2, 32);
  // The int64_t cast keeps sum * sum from overflowing for the larger blocks.
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

// The 64-wide sub-pixel blocks are handled as two side-by-side 32-wide
// halves whose sums and SSEs are combined.
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse1;
  const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                                  y_offset, dst, dst_stride,
                                                  64, &sse1);
  unsigned int sse2;
  const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                                  x_offset, y_offset,
                                                  dst + 32, dst_stride,
                                                  64, &sse2);
  const int se = se1 + se2;
  *sse = sse1 + sse2;
  return *sse - (((int64_t)se * se) >> 12);
}

unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                                 y_offset, dst, dst_stride,
                                                 32, sse);
  return *sse - (((int64_t)se * se) >> 10);
}

unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  unsigned int sse1;
  const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                      x_offset, y_offset,
                                                      dst, dst_stride,
                                                      sec, 64, 64, &sse1);
  unsigned int sse2;
  const int se2 =
      vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                          y_offset, dst + 32, dst_stride,
                                          sec + 32, 64, 64, &sse2);
  const int se = se1 + se2;

  *sse = sse1 + sse2;

  return *sse - (((int64_t)se * se) >> 12);
}

unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  // Process 32 elements in parallel.
  const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                                     y_offset, dst, dst_stride,
                                                     sec, 32, 32, sse);
  return *sse - (((int64_t)se * se) >> 10);
}
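
/* Usage sketch (illustrative only, not part of the library build). In libvpx
 * these kernels are normally reached through the dispatch set up by
 * vpx_dsp_rtcd.h (e.g. vpx_variance32x32) rather than called by name;
 * calling the _avx2 symbol directly, as below, assumes the host CPU
 * supports AVX2. A constant per-pixel offset gives a nonzero SSE but a
 * variance of zero, since the squared mean difference is subtracted out.
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *   #include "./vpx_dsp_rtcd.h"
 *
 *   int main(void) {
 *     uint8_t src[32 * 32], ref[32 * 32];
 *     unsigned int sse, var;
 *     memset(src, 128, sizeof(src));
 *     memset(ref, 130, sizeof(ref));  // every pixel differs by 2
 *     var = vpx_variance32x32_avx2(src, 32, ref, 32, &sse);
 *     printf("sse=%u var=%u\n", sse, var);  // expect sse=4096, var=0
 *     return 0;
 *   }
 */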