Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vp8_rtcd.h"
     14 
     15 void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
     16   int16x8_t q0s16, q1s16, q2s16, q3s16;
     17   int16x4_t d4s16, d5s16, d6s16, d7s16;
     18   int16x4x2_t v2tmp0, v2tmp1;
     19   int32x2x2_t v2tmp2, v2tmp3;
     20   int16x8_t qAdd3;
     21 
     22   q0s16 = vld1q_s16(input);
     23   q1s16 = vld1q_s16(input + 8);
     24 
     25   // 1st for loop
     26   d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     27   d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     28   d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     29   d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     30 
     31   q2s16 = vcombine_s16(d4s16, d5s16);
     32   q3s16 = vcombine_s16(d6s16, d7s16);
     33 
     34   q0s16 = vaddq_s16(q2s16, q3s16);
     35   q1s16 = vsubq_s16(q2s16, q3s16);
     36 
     37   v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
     38                     vreinterpret_s32_s16(vget_low_s16(q1s16)));
     39   v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
     40                     vreinterpret_s32_s16(vget_high_s16(q1s16)));
     41   v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
     42                     vreinterpret_s16_s32(v2tmp3.val[0]));
     43   v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
     44                     vreinterpret_s16_s32(v2tmp3.val[1]));
     45 
     46   // 2nd for loop
     47   d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
     48   d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
     49   d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
     50   d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
     51   q2s16 = vcombine_s16(d4s16, d5s16);
     52   q3s16 = vcombine_s16(d6s16, d7s16);
     53 
     54   qAdd3 = vdupq_n_s16(3);
     55 
     56   q0s16 = vaddq_s16(q2s16, q3s16);
     57   q1s16 = vsubq_s16(q2s16, q3s16);
     58 
     59   q0s16 = vaddq_s16(q0s16, qAdd3);
     60   q1s16 = vaddq_s16(q1s16, qAdd3);
     61 
     62   q0s16 = vshrq_n_s16(q0s16, 3);
     63   q1s16 = vshrq_n_s16(q1s16, 3);
     64 
     65   // store
     66   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
     67   mb_dqcoeff += 16;
     68   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
     69   mb_dqcoeff += 16;
     70   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
     71   mb_dqcoeff += 16;
     72   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
     73   mb_dqcoeff += 16;
     74 
     75   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
     76   mb_dqcoeff += 16;
     77   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
     78   mb_dqcoeff += 16;
     79   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
     80   mb_dqcoeff += 16;
     81   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
     82   mb_dqcoeff += 16;
     83 
     84   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
     85   mb_dqcoeff += 16;
     86   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
     87   mb_dqcoeff += 16;
     88   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
     89   mb_dqcoeff += 16;
     90   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
     91   mb_dqcoeff += 16;
     92 
     93   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
     94   mb_dqcoeff += 16;
     95   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
     96   mb_dqcoeff += 16;
     97   vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
     98   mb_dqcoeff += 16;
     99   vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
    100   mb_dqcoeff += 16;
    101   return;
    102 }
    103