Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 void vp8_short_inv_walsh4x4_neon(
     14         int16_t *input,
     15         int16_t *mb_dqcoeff) {
     16     int16x8_t q0s16, q1s16, q2s16, q3s16;
     17     int16x4_t d4s16, d5s16, d6s16, d7s16;
     18     int16x4x2_t v2tmp0, v2tmp1;
     19     int32x2x2_t v2tmp2, v2tmp3;
     20     int16x8_t qAdd3;
     21 
     22     q0s16 = vld1q_s16(input);
     23     q1s16 = vld1q_s16(input + 8);
     24 
     25     // 1st for loop
     26     d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     27     d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     28     d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
     29     d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
     30 
     31     q2s16 = vcombine_s16(d4s16, d5s16);
     32     q3s16 = vcombine_s16(d6s16, d7s16);
     33 
     34     q0s16 = vaddq_s16(q2s16, q3s16);
     35     q1s16 = vsubq_s16(q2s16, q3s16);
     36 
     37     v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
     38                       vreinterpret_s32_s16(vget_low_s16(q1s16)));
     39     v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
     40                       vreinterpret_s32_s16(vget_high_s16(q1s16)));
     41     v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
     42                       vreinterpret_s16_s32(v2tmp3.val[0]));
     43     v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
     44                       vreinterpret_s16_s32(v2tmp3.val[1]));
     45 
     46     // 2nd for loop
     47     d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
     48     d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
     49     d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
     50     d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
     51     q2s16 = vcombine_s16(d4s16, d5s16);
     52     q3s16 = vcombine_s16(d6s16, d7s16);
     53 
     54     qAdd3 = vdupq_n_s16(3);
     55 
     56     q0s16 = vaddq_s16(q2s16, q3s16);
     57     q1s16 = vsubq_s16(q2s16, q3s16);
     58 
     59     q0s16 = vaddq_s16(q0s16, qAdd3);
     60     q1s16 = vaddq_s16(q1s16, qAdd3);
     61 
     62     q0s16 = vshrq_n_s16(q0s16, 3);
     63     q1s16 = vshrq_n_s16(q1s16, 3);
     64 
     65     // store
     66     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  0);
     67     mb_dqcoeff += 16;
     68     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
     69     mb_dqcoeff += 16;
     70     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  0);
     71     mb_dqcoeff += 16;
     72     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
     73     mb_dqcoeff += 16;
     74 
     75     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  1);
     76     mb_dqcoeff += 16;
     77     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
     78     mb_dqcoeff += 16;
     79     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  1);
     80     mb_dqcoeff += 16;
     81     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
     82     mb_dqcoeff += 16;
     83 
     84     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  2);
     85     mb_dqcoeff += 16;
     86     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
     87     mb_dqcoeff += 16;
     88     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  2);
     89     mb_dqcoeff += 16;
     90     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
     91     mb_dqcoeff += 16;
     92 
     93     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  3);
     94     mb_dqcoeff += 16;
     95     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
     96     mb_dqcoeff += 16;
     97     vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  3);
     98     mb_dqcoeff += 16;
     99     vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
    100     mb_dqcoeff += 16;
    101     return;
    102 }
    103