Home | History | Annotate | Download | only in dsp
      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //  NEON common code.
     11 
     12 #ifndef WEBP_DSP_NEON_H_
     13 #define WEBP_DSP_NEON_H_
     14 
     15 #include <arm_neon.h>
     16 
     17 #include "./dsp.h"
     18 
     19 // Right now, some intrinsics functions seem slower, so we disable them
     20 // everywhere except aarch64 where the inline assembly is incompatible.
     21 #if defined(__aarch64__)
     22 #define USE_INTRINSICS   // use intrinsics when possible
     23 #endif
     24 
     25 #define INIT_VECTOR2(v, a, b) do {  \
     26   v.val[0] = a;                     \
     27   v.val[1] = b;                     \
     28 } while (0)
     29 
     30 #define INIT_VECTOR3(v, a, b, c) do {  \
     31   v.val[0] = a;                        \
     32   v.val[1] = b;                        \
     33   v.val[2] = c;                        \
     34 } while (0)
     35 
     36 #define INIT_VECTOR4(v, a, b, c, d) do {  \
     37   v.val[0] = a;                           \
     38   v.val[1] = b;                           \
     39   v.val[2] = c;                           \
     40   v.val[3] = d;                           \
     41 } while (0)
     42 
     43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
     44 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
     45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
     46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
     47 #define WORK_AROUND_GCC
     48 #endif
     49 
     50 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
     51   uint64x2x2_t row01, row23;
     52 
     53   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
     54   row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
     55   row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
     56   row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
     57   // Transpose 64-bit values (there's no vswp equivalent)
     58   {
     59     const uint64x1_t row0h = vget_high_u64(row01.val[0]);
     60     const uint64x1_t row2l = vget_low_u64(row23.val[0]);
     61     const uint64x1_t row1h = vget_high_u64(row01.val[1]);
     62     const uint64x1_t row3l = vget_low_u64(row23.val[1]);
     63     row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
     64     row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
     65     row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
     66     row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
     67   }
     68   {
     69     const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
     70                                         vreinterpretq_s32_u64(row01.val[1]));
     71     const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
     72                                         vreinterpretq_s32_u64(row23.val[1]));
     73     int32x4x4_t out;
     74     out.val[0] = out01.val[0];
     75     out.val[1] = out01.val[1];
     76     out.val[2] = out23.val[0];
     77     out.val[3] = out23.val[1];
     78     return out;
     79   }
     80 }
     81 
     82 #endif  // WEBP_DSP_NEON_H_
     83