/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"

#include "vp9/encoder/vp9_variance.h"

enum { kWidth8 = 8 };
enum { kHeight8 = 8 };
enum { kHeight8PlusOne = 9 };
enum { kWidth16 = 16 };
enum { kHeight16 = 16 };
enum { kHeight16PlusOne = 17 };
enum { kWidth32 = 32 };
enum { kHeight32 = 32 };
enum { kHeight32PlusOne = 33 };
enum { kPixelStepOne = 1 };
enum { kAlign16 = 16 };

static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  const int32x4_t a = vpaddlq_s16(v_16x8);
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  const int64x2_t b = vpaddlq_s32(v_32x4);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,
                   kHeight8, sse, sum);
}

unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
}

void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
                   kHeight16, sse, sum);
}

unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
  return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
}
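
// Two-tap bilinear filter helpers used by the sub-pixel variance functions
// below. pixel_step selects the second tap: a step of 1 filters between
// horizontally adjacent pixels, while a step equal to the row stride filters
// between vertically adjacent rows. The filtered result is rounded back to
// 8 bits with vrshrn_n_u16(..., FILTER_BITS) and written to an intermediate
// buffer of width output_width.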
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const int16_t *vp9_filter) {
  const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
  const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(&output_ptr[0], out);
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const int16_t *vp9_filter) {
  const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
  const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
    }
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
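
// Sub-pixel variance: the first pass filters horizontally by xoffset into
// fdata3 (with one extra row so the vertical pass has a neighbor below),
// the second pass filters that intermediate result vertically by yoffset
// into temp2, and the full-pel variance of temp2 against the reference
// block is returned.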
unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
                                            int src_stride,
                                            int xoffset,
                                            int yoffset,
                                            const uint8_t *dst,
                                            int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
                            kHeight8PlusOne, kWidth8,
                            BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
                            kWidth8, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
}

unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
                             kHeight16PlusOne, kWidth16,
                             BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
                             kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
}

void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,
                   kHeight32, sse, sum);
}

unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
  return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
}

unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);
  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
                             kHeight32PlusOne, kWidth32,
                             BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
                             kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
  return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
}