/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Load 4 bytes from each of two rows (buf and buf + stride) into one 8-byte
// vector: row 0 in u32 lane 0, row 1 in u32 lane 1. memcpy is used instead of
// a (const uint32_t *) pointer cast because pixel rows are only byte-aligned;
// the cast form is undefined behavior (misaligned access and a strict-aliasing
// violation).
static inline uint8x8_t load_u8_4x2(const uint8_t *buf, ptrdiff_t stride) {
  uint32_t a;
  uint32x2_t a_u32;
  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf + stride, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Store the two 4-byte halves of 'a' to two rows (buf and buf + stride).
// See load_u8_4x2 for why memcpy is required here.
static inline void store_u8_4x2(uint8_t *buf, ptrdiff_t stride, uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  uint32_t t;
  t = vget_lane_u32(a_u32, 0);
  memcpy(buf, &t, 4);
  t = vget_lane_u32(a_u32, 1);
  memcpy(buf + stride, &t, 4);
}

// Average the w x h source block into the destination block:
//   dst[i] = (dst[i] + src[i] + 1) >> 1  (rounding halving add, vrhadd).
// The filter arguments are part of the common convolve prototype and are
// unused by this "avg" variant. Assumes w is a power of two in [4, 64]
// (per the convolve function family) and h is even for w < 64, since those
// loop bodies process two rows per iteration.
void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int filter_x_stride,
                           const int16_t *filter_y, int filter_y_stride, int w,
                           int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w < 8) {  // avg4
    // Two 4-pixel rows per iteration. The memcpy-based helpers read and
    // write exactly 4 bytes per row, so no bytes outside the block are
    // touched (the previous vld1_u8-based code over-read 4 bytes past the
    // end of every source row, and used UB uint32_t-pointer casts on dst).
    do {
      const uint8x8_t s = load_u8_4x2(src, src_stride);
      const uint8x8_t d = load_u8_4x2(dst, dst_stride);
      const uint8x8_t avg = vrhadd_u8(s, d);
      store_u8_4x2(dst, dst_stride, avg);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    // Two 8-pixel rows per iteration, combined into one 128-bit average.
    uint8x8_t s0, s1, d0, d1;
    uint8x16_t s01, d01;
    do {
      s0 = vld1_u8(src);
      src += src_stride;
      s1 = vld1_u8(src);
      src += src_stride;
      d0 = vld1_u8(dst);
      d1 = vld1_u8(dst + dst_stride);

      s01 = vcombine_u8(s0, s1);
      d01 = vcombine_u8(d0, d1);
      d01 = vrhaddq_u8(s01, d01);

      vst1_u8(dst, vget_low_u8(d01));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    // Two 16-pixel rows per iteration.
    uint8x16_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u8(src);
      src += src_stride;
      s1 = vld1q_u8(src);
      src += src_stride;
      d0 = vld1q_u8(dst);
      d1 = vld1q_u8(dst + dst_stride);

      d0 = vrhaddq_u8(s0, d0);
      d1 = vrhaddq_u8(s1, d1);

      vst1q_u8(dst, d0);
      dst += dst_stride;
      vst1q_u8(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    // Two 32-pixel rows per iteration (two 16-byte vectors per row).
    uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u8(src);
      s1 = vld1q_u8(src + 16);
      src += src_stride;
      s2 = vld1q_u8(src);
      s3 = vld1q_u8(src + 16);
      src += src_stride;
      d0 = vld1q_u8(dst);
      d1 = vld1q_u8(dst + 16);
      d2 = vld1q_u8(dst + dst_stride);
      d3 = vld1q_u8(dst + dst_stride + 16);

      d0 = vrhaddq_u8(s0, d0);
      d1 = vrhaddq_u8(s1, d1);
      d2 = vrhaddq_u8(s2, d2);
      d3 = vrhaddq_u8(s3, d3);

      vst1q_u8(dst, d0);
      vst1q_u8(dst + 16, d1);
      dst += dst_stride;
      vst1q_u8(dst, d2);
      vst1q_u8(dst + 16, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    // One 64-pixel row per iteration (four 16-byte vectors).
    uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u8(src);
      s1 = vld1q_u8(src + 16);
      s2 = vld1q_u8(src + 32);
      s3 = vld1q_u8(src + 48);
      src += src_stride;
      d0 = vld1q_u8(dst);
      d1 = vld1q_u8(dst + 16);
      d2 = vld1q_u8(dst + 32);
      d3 = vld1q_u8(dst + 48);

      d0 = vrhaddq_u8(s0, d0);
      d1 = vrhaddq_u8(s1, d1);
      d2 = vrhaddq_u8(s2, d2);
      d3 = vrhaddq_u8(s3, d3);

      vst1q_u8(dst, d0);
      vst1q_u8(dst + 16, d1);
      vst1q_u8(dst + 32, d2);
      vst1q_u8(dst + 48, d3);
      dst += dst_stride;
    } while (--h);
  }
}