Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
     14   int16x8_t q0s16, q1s16, q2s16, q3s16;
     15   int16x4_t d4s16, d5s16, d6s16, d7s16;
     16   int16x4x2_t v2tmp0, v2tmp1;
     17   int32x2x2_t v2tmp2, v2tmp3;
     18   int16x8_t qAdd3;
     19 
     20   q0s16 = vld1q_s16(input);
     21   q1s16 = vld1q_s16(input + 8);
     22 
     23   // 1st for loop
     24   d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     25   d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     26   d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     27   d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     28 
     29   q2s16 = vcombine_s16(d4s16, d5s16);
     30   q3s16 = vcombine_s16(d6s16, d7s16);
     31 
     32   q0s16 = vaddq_s16(q2s16, q3s16);
     33   q1s16 = vsubq_s16(q2s16, q3s16);
     34 
     35   v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
     36                     vreinterpret_s32_s16(vget_low_s16(q1s16)));
     37   v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
     38                     vreinterpret_s32_s16(vget_high_s16(q1s16)));
     39   v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
     40                     vreinterpret_s16_s32(v2tmp3.val[0]));
     41   v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
     42                     vreinterpret_s16_s32(v2tmp3.val[1]));
     43 
     44   // 2nd for loop
     45   d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
     46   d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
     47   d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
     48   d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
     49   q2s16 = vcombine_s16(d4s16, d5s16);
     50   q3s16 = vcombine_s16(d6s16, d7s16);
     51 
     52   qAdd3 = vdupq_n_s16(3);
     53 
     54   q0s16 = vaddq_s16(q2s16, q3s16);
     55   q1s16 = vsubq_s16(q2s16, q3s16);
     56 
     57   q0s16 = vaddq_s16(q0s16, qAdd3);
     58   q1s16 = vaddq_s16(q1s16, qAdd3);
     59 
     60   q0s16 = vshrq_n_s16(q0s16, 3);
     61   q1s16 = vshrq_n_s16(q1s16, 3);
     62 
     63   // store
     64   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
     65   mb_dqcoeff += 16;
     66   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
     67   mb_dqcoeff += 16;
     68   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
     69   mb_dqcoeff += 16;
     70   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
     71   mb_dqcoeff += 16;
     72 
     73   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
     74   mb_dqcoeff += 16;
     75   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
     76   mb_dqcoeff += 16;
     77   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
     78   mb_dqcoeff += 16;
     79   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
     80   mb_dqcoeff += 16;
     81 
     82   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
     83   mb_dqcoeff += 16;
     84   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
     85   mb_dqcoeff += 16;
     86   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
     87   mb_dqcoeff += 16;
     88   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
     89   mb_dqcoeff += 16;
     90 
     91   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
     92   mb_dqcoeff += 16;
     93   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
     94   mb_dqcoeff += 16;
     95   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
     96   mb_dqcoeff += 16;
     97   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
     98   mb_dqcoeff += 16;
     99   return;
    100 }
    101