1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // NEON common code. 11 12 #ifndef WEBP_DSP_NEON_H_ 13 #define WEBP_DSP_NEON_H_ 14 15 #include <arm_neon.h> 16 17 #include "src/dsp/dsp.h" 18 19 // Right now, some intrinsics functions seem slower, so we disable them 20 // everywhere except newer clang/gcc or aarch64 where the inline assembly is 21 // incompatible. 22 #if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) 23 #define WEBP_USE_INTRINSICS // use intrinsics when possible 24 #endif 25 26 #define INIT_VECTOR2(v, a, b) do { \ 27 v.val[0] = a; \ 28 v.val[1] = b; \ 29 } while (0) 30 31 #define INIT_VECTOR3(v, a, b, c) do { \ 32 v.val[0] = a; \ 33 v.val[1] = b; \ 34 v.val[2] = c; \ 35 } while (0) 36 37 #define INIT_VECTOR4(v, a, b, c, d) do { \ 38 v.val[0] = a; \ 39 v.val[1] = b; \ 40 v.val[2] = c; \ 41 v.val[3] = d; \ 42 } while (0) 43 44 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 45 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). 46 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) 47 #if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) 48 #define WORK_AROUND_GCC 49 #endif 50 51 static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) { 52 uint64x2x2_t row01, row23; 53 54 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); 55 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); 56 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); 57 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); 58 // Transpose 64-bit values (there's no vswp equivalent) 59 { 60 const uint64x1_t row0h = vget_high_u64(row01.val[0]); 61 const uint64x1_t row2l = vget_low_u64(row23.val[0]); 62 const uint64x1_t row1h = vget_high_u64(row01.val[1]); 63 const uint64x1_t row3l = vget_low_u64(row23.val[1]); 64 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); 65 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); 66 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); 67 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); 68 } 69 { 70 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), 71 vreinterpretq_s32_u64(row01.val[1])); 72 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), 73 vreinterpretq_s32_u64(row23.val[1])); 74 int32x4x4_t out; 75 out.val[0] = out01.val[0]; 76 out.val[1] = out01.val[1]; 77 out.val[2] = out23.val[0]; 78 out.val[3] = out23.val[1]; 79 return out; 80 } 81 } 82 83 #if 0 // Useful debug macro. 84 #include <stdio.h> 85 #define PRINT_REG(REG, SIZE) do { \ 86 int i; \ 87 printf("%s \t[%d]: 0x", #REG, SIZE); \ 88 if (SIZE == 8) { \ 89 uint8_t _tmp[8]; \ 90 vst1_u8(_tmp, (REG)); \ 91 for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]); \ 92 } else if (SIZE == 16) { \ 93 uint16_t _tmp[4]; \ 94 vst1_u16(_tmp, (REG)); \ 95 for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]); \ 96 } \ 97 printf("\n"); \ 98 } while (0) 99 #endif 100 101 #endif // WEBP_DSP_NEON_H_ 102