1 /* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 #include "./vpx_dsp_rtcd.h" 11 12 typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, 13 const uint8_t *ref, int ref_stride, 14 unsigned int *sse, int *sum); 15 16 void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, 17 const uint8_t *ref, int ref_stride, unsigned int *sse, 18 int *sum); 19 20 static void variance_avx2(const uint8_t *src, int src_stride, 21 const uint8_t *ref, int ref_stride, int w, int h, 22 unsigned int *sse, int *sum, get_var_avx2 var_fn, 23 int block_size) { 24 int i, j; 25 26 *sse = 0; 27 *sum = 0; 28 29 for (i = 0; i < h; i += 16) { 30 for (j = 0; j < w; j += block_size) { 31 unsigned int sse0; 32 int sum0; 33 var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], 34 ref_stride, &sse0, &sum0); 35 *sse += sse0; 36 *sum += sum0; 37 } 38 } 39 } 40 41 unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, 42 const uint8_t *ref, int ref_stride, 43 unsigned int *sse) { 44 int sum; 45 variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 46 vpx_get16x16var_avx2, 16); 47 return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); 48 } 49 50 unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, 51 const uint8_t *ref, int ref_stride, 52 unsigned int *sse) { 53 int sum; 54 vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); 55 return *sse; 56 } 57 58 unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, 59 const uint8_t *ref, int ref_stride, 60 unsigned int *sse) { 61 int sum; 62 variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, 63 vpx_get32x32var_avx2, 32); 64 return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); 65 } 66 67 unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, 68 const uint8_t *ref, int ref_stride, 69 unsigned int *sse) { 70 int sum; 71 variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, 72 vpx_get32x32var_avx2, 32); 73 return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); 74 } 75 76 unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, 77 const uint8_t *ref, int ref_stride, 78 unsigned int *sse) { 79 int sum; 80 variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, 81 vpx_get32x32var_avx2, 32); 82 return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); 83 } 84 85 unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, 86 const uint8_t *ref, int ref_stride, 87 unsigned int *sse) { 88 int sum; 89 variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, 90 vpx_get32x32var_avx2, 32); 91 return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); 92 } 93 94 unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, 95 int x_offset, int y_offset, 96 const uint8_t *dst, int dst_stride, 97 int height, unsigned int *sse); 98 99 unsigned int vpx_sub_pixel_avg_variance32xh_avx2( 100 const uint8_t *src, int src_stride, int x_offset, int y_offset, 101 const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, 102 int height, unsigned int *sseptr); 103 104 unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, 105 int src_stride, int x_offset, 106 int y_offset, const uint8_t *dst, 107 int dst_stride, 108 unsigned int *sse) { 109 unsigned int sse1; 110 const int se1 = vpx_sub_pixel_variance32xh_avx2( 111 src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); 112 unsigned int sse2; 113 const int se2 = 114 vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, 115 dst + 32, dst_stride, 64, &sse2); 116 const int se = se1 + se2; 117 *sse = sse1 + sse2; 118 return *sse - (uint32_t)(((int64_t)se * se) >> 12); 119 } 120 121 unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, 122 int src_stride, int x_offset, 123 int y_offset, const uint8_t *dst, 124 int dst_stride, 125 unsigned int *sse) { 126 const int se = vpx_sub_pixel_variance32xh_avx2( 127 src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); 128 return *sse - (uint32_t)(((int64_t)se * se) >> 10); 129 } 130 131 unsigned int vpx_sub_pixel_avg_variance64x64_avx2( 132 const uint8_t *src, int src_stride, int x_offset, int y_offset, 133 const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { 134 unsigned int sse1; 135 const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( 136 src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); 137 unsigned int sse2; 138 const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( 139 src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, 140 64, 64, &sse2); 141 const int se = se1 + se2; 142 143 *sse = sse1 + sse2; 144 145 return *sse - (uint32_t)(((int64_t)se * se) >> 12); 146 } 147 148 unsigned int vpx_sub_pixel_avg_variance32x32_avx2( 149 const uint8_t *src, int src_stride, int x_offset, int y_offset, 150 const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { 151 // Process 32 elements in parallel. 152 const int se = vpx_sub_pixel_avg_variance32xh_avx2( 153 src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); 154 return *sse - (uint32_t)(((int64_t)se * se) >> 10); 155 } 156