1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 13 #include "./vpx_dsp_rtcd.h" 14 #include "vpx/vpx_integer.h" 15 16 void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, 17 uint8_t *dst, ptrdiff_t dst_stride, 18 const int16_t *filter_x, int filter_x_stride, 19 const int16_t *filter_y, int filter_y_stride, int w, 20 int h) { 21 (void)filter_x; 22 (void)filter_x_stride; 23 (void)filter_y; 24 (void)filter_y_stride; 25 26 if (w < 8) { // copy4 27 do { 28 *(uint32_t *)dst = *(const uint32_t *)src; 29 src += src_stride; 30 dst += dst_stride; 31 *(uint32_t *)dst = *(const uint32_t *)src; 32 src += src_stride; 33 dst += dst_stride; 34 h -= 2; 35 } while (h > 0); 36 } else if (w == 8) { // copy8 37 uint8x8_t s0, s1; 38 do { 39 s0 = vld1_u8(src); 40 src += src_stride; 41 s1 = vld1_u8(src); 42 src += src_stride; 43 44 vst1_u8(dst, s0); 45 dst += dst_stride; 46 vst1_u8(dst, s1); 47 dst += dst_stride; 48 h -= 2; 49 } while (h > 0); 50 } else if (w < 32) { // copy16 51 uint8x16_t s0, s1; 52 do { 53 s0 = vld1q_u8(src); 54 src += src_stride; 55 s1 = vld1q_u8(src); 56 src += src_stride; 57 58 vst1q_u8(dst, s0); 59 dst += dst_stride; 60 vst1q_u8(dst, s1); 61 dst += dst_stride; 62 h -= 2; 63 } while (h > 0); 64 } else if (w == 32) { // copy32 65 uint8x16_t s0, s1, s2, s3; 66 do { 67 s0 = vld1q_u8(src); 68 s1 = vld1q_u8(src + 16); 69 src += src_stride; 70 s2 = vld1q_u8(src); 71 s3 = vld1q_u8(src + 16); 72 src += src_stride; 73 74 vst1q_u8(dst, s0); 75 vst1q_u8(dst + 16, s1); 76 dst += dst_stride; 77 vst1q_u8(dst, s2); 78 vst1q_u8(dst + 16, s3); 79 dst += dst_stride; 80 h -= 2; 81 } while (h > 0); 82 } else { // copy64 83 uint8x16_t s0, s1, s2, s3; 84 do { 85 s0 = vld1q_u8(src); 86 s1 = vld1q_u8(src + 16); 87 s2 = vld1q_u8(src + 32); 88 s3 = vld1q_u8(src + 48); 89 src += src_stride; 90 91 vst1q_u8(dst, s0); 92 vst1q_u8(dst + 16, s1); 93 vst1q_u8(dst + 32, s2); 94 vst1q_u8(dst + 48, s3); 95 dst += dst_stride; 96 } while (--h); 97 } 98 } 99