1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // NEON common code. 11 12 #ifndef WEBP_DSP_NEON_H_ 13 #define WEBP_DSP_NEON_H_ 14 15 #include <arm_neon.h> 16 17 #include "./dsp.h" 18 19 // Right now, some intrinsics functions seem slower, so we disable them 20 // everywhere except aarch64 where the inline assembly is incompatible. 21 #if defined(__aarch64__) 22 #define USE_INTRINSICS // use intrinsics when possible 23 #endif 24 25 #define INIT_VECTOR2(v, a, b) do { \ 26 v.val[0] = a; \ 27 v.val[1] = b; \ 28 } while (0) 29 30 #define INIT_VECTOR3(v, a, b, c) do { \ 31 v.val[0] = a; \ 32 v.val[1] = b; \ 33 v.val[2] = c; \ 34 } while (0) 35 36 #define INIT_VECTOR4(v, a, b, c, d) do { \ 37 v.val[0] = a; \ 38 v.val[1] = b; \ 39 v.val[2] = c; \ 40 v.val[3] = d; \ 41 } while (0) 42 43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 44 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). 45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) 46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) 47 #define WORK_AROUND_GCC 48 #endif 49 50 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { 51 uint64x2x2_t row01, row23; 52 53 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); 54 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); 55 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); 56 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); 57 // Transpose 64-bit values (there's no vswp equivalent) 58 { 59 const uint64x1_t row0h = vget_high_u64(row01.val[0]); 60 const uint64x1_t row2l = vget_low_u64(row23.val[0]); 61 const uint64x1_t row1h = vget_high_u64(row01.val[1]); 62 const uint64x1_t row3l = vget_low_u64(row23.val[1]); 63 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); 64 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); 65 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); 66 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); 67 } 68 { 69 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), 70 vreinterpretq_s32_u64(row01.val[1])); 71 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), 72 vreinterpretq_s32_u64(row23.val[1])); 73 int32x4x4_t out; 74 out.val[0] = out01.val[0]; 75 out.val[1] = out01.val[1]; 76 out.val[2] = out23.val[0]; 77 out.val[3] = out23.val[1]; 78 return out; 79 } 80 } 81 82 #endif // WEBP_DSP_NEON_H_ 83