Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
     14   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
     15   int16x4_t d16s16, d17s16, d26s16, dEmptys16;
     16   uint16x4_t d4u16;
     17   int16x8_t q0s16, q1s16;
     18   int32x4_t q9s32, q10s32, q11s32, q12s32;
     19   int16x4x2_t v2tmp0, v2tmp1;
     20   int32x2x2_t v2tmp2, v2tmp3;
     21 
     22   d16s16 = vdup_n_s16(5352);
     23   d17s16 = vdup_n_s16(2217);
     24   q9s32 = vdupq_n_s32(14500);
     25   q10s32 = vdupq_n_s32(7500);
     26   q11s32 = vdupq_n_s32(12000);
     27   q12s32 = vdupq_n_s32(51000);
     28 
     29   // Part one
     30   pitch >>= 1;
     31   d0s16 = vld1_s16(input);
     32   input += pitch;
     33   d1s16 = vld1_s16(input);
     34   input += pitch;
     35   d2s16 = vld1_s16(input);
     36   input += pitch;
     37   d3s16 = vld1_s16(input);
     38 
     39   v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16));
     40   v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16));
     41   v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
     42                     vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
     43   v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
     44                     vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
     45 
     46   d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
     47   d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
     48   d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
     49   d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
     50 
     51   d4s16 = vshl_n_s16(d4s16, 3);
     52   d5s16 = vshl_n_s16(d5s16, 3);
     53   d6s16 = vshl_n_s16(d6s16, 3);
     54   d7s16 = vshl_n_s16(d7s16, 3);
     55 
     56   d0s16 = vadd_s16(d4s16, d5s16);
     57   d2s16 = vsub_s16(d4s16, d5s16);
     58 
     59   q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
     60   q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
     61   q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
     62   q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
     63 
     64   d1s16 = vshrn_n_s32(q9s32, 12);
     65   d3s16 = vshrn_n_s32(q10s32, 12);
     66 
     67   // Part two
     68   v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16), vreinterpret_s32_s16(d2s16));
     69   v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16), vreinterpret_s32_s16(d3s16));
     70   v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
     71                     vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
     72   v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
     73                     vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
     74 
     75   d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
     76   d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
     77   d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
     78   d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
     79 
     80   d26s16 = vdup_n_s16(7);
     81   d4s16 = vadd_s16(d4s16, d26s16);
     82 
     83   d0s16 = vadd_s16(d4s16, d5s16);
     84   d2s16 = vsub_s16(d4s16, d5s16);
     85 
     86   q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
     87   q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
     88 
     89   dEmptys16 = vdup_n_s16(0);
     90   d4u16 = vceq_s16(d7s16, dEmptys16);
     91 
     92   d0s16 = vshr_n_s16(d0s16, 4);
     93   d2s16 = vshr_n_s16(d2s16, 4);
     94 
     95   q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
     96   q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
     97 
     98   d4u16 = vmvn_u16(d4u16);
     99   d1s16 = vshrn_n_s32(q11s32, 16);
    100   d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
    101   d3s16 = vshrn_n_s32(q12s32, 16);
    102 
    103   q0s16 = vcombine_s16(d0s16, d1s16);
    104   q1s16 = vcombine_s16(d2s16, d3s16);
    105 
    106   vst1q_s16(output, q0s16);
    107   vst1q_s16(output + 8, q1s16);
    108   return;
    109 }
    110 
    111 void vp8_short_fdct8x4_neon(int16_t *input, int16_t *output, int pitch) {
    112   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    113   int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
    114   uint16x4_t d28u16, d29u16;
    115   uint16x8_t q14u16;
    116   int16x8_t q0s16, q1s16, q2s16, q3s16;
    117   int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
    118   int32x4_t q9s32, q10s32, q11s32, q12s32;
    119   int16x8x2_t v2tmp0, v2tmp1;
    120   int32x4x2_t v2tmp2, v2tmp3;
    121 
    122   d16s16 = vdup_n_s16(5352);
    123   d17s16 = vdup_n_s16(2217);
    124   q9s32 = vdupq_n_s32(14500);
    125   q10s32 = vdupq_n_s32(7500);
    126 
    127   // Part one
    128   pitch >>= 1;
    129   q0s16 = vld1q_s16(input);
    130   input += pitch;
    131   q1s16 = vld1q_s16(input);
    132   input += pitch;
    133   q2s16 = vld1q_s16(input);
    134   input += pitch;
    135   q3s16 = vld1q_s16(input);
    136 
    137   v2tmp2 =
    138       vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16));
    139   v2tmp3 =
    140       vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16));
    141   v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
    142                      vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
    143   v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
    144                      vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
    145 
    146   q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
    147   q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
    148   q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
    149   q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
    150 
    151   q11s16 = vshlq_n_s16(q11s16, 3);
    152   q12s16 = vshlq_n_s16(q12s16, 3);
    153   q13s16 = vshlq_n_s16(q13s16, 3);
    154   q14s16 = vshlq_n_s16(q14s16, 3);
    155 
    156   q0s16 = vaddq_s16(q11s16, q12s16);
    157   q2s16 = vsubq_s16(q11s16, q12s16);
    158 
    159   q11s32 = q9s32;
    160   q12s32 = q10s32;
    161 
    162   d26s16 = vget_low_s16(q13s16);
    163   d27s16 = vget_high_s16(q13s16);
    164   d28s16 = vget_low_s16(q14s16);
    165   d29s16 = vget_high_s16(q14s16);
    166 
    167   q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
    168   q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
    169   q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
    170   q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
    171 
    172   q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
    173   q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
    174   q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
    175   q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
    176 
    177   d2s16 = vshrn_n_s32(q9s32, 12);
    178   d6s16 = vshrn_n_s32(q10s32, 12);
    179   d3s16 = vshrn_n_s32(q11s32, 12);
    180   d7s16 = vshrn_n_s32(q12s32, 12);
    181   q1s16 = vcombine_s16(d2s16, d3s16);
    182   q3s16 = vcombine_s16(d6s16, d7s16);
    183 
    184   // Part two
    185   q9s32 = vdupq_n_s32(12000);
    186   q10s32 = vdupq_n_s32(51000);
    187 
    188   v2tmp2 =
    189       vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16));
    190   v2tmp3 =
    191       vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16));
    192   v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
    193                      vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
    194   v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
    195                      vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
    196 
    197   q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
    198   q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
    199   q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
    200   q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
    201 
    202   q15s16 = vdupq_n_s16(7);
    203   q11s16 = vaddq_s16(q11s16, q15s16);
    204   q0s16 = vaddq_s16(q11s16, q12s16);
    205   q1s16 = vsubq_s16(q11s16, q12s16);
    206 
    207   q11s32 = q9s32;
    208   q12s32 = q10s32;
    209 
    210   d0s16 = vget_low_s16(q0s16);
    211   d1s16 = vget_high_s16(q0s16);
    212   d2s16 = vget_low_s16(q1s16);
    213   d3s16 = vget_high_s16(q1s16);
    214 
    215   d0s16 = vshr_n_s16(d0s16, 4);
    216   d4s16 = vshr_n_s16(d1s16, 4);
    217   d2s16 = vshr_n_s16(d2s16, 4);
    218   d6s16 = vshr_n_s16(d3s16, 4);
    219 
    220   d26s16 = vget_low_s16(q13s16);
    221   d27s16 = vget_high_s16(q13s16);
    222   d28s16 = vget_low_s16(q14s16);
    223   d29s16 = vget_high_s16(q14s16);
    224 
    225   q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
    226   q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
    227   q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
    228   q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
    229 
    230   q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
    231   q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
    232   q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
    233   q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
    234 
    235   d1s16 = vshrn_n_s32(q9s32, 16);
    236   d3s16 = vshrn_n_s32(q10s32, 16);
    237   d5s16 = vshrn_n_s32(q11s32, 16);
    238   d7s16 = vshrn_n_s32(q12s32, 16);
    239 
    240   qEmptys16 = vdupq_n_s16(0);
    241   q14u16 = vceqq_s16(q14s16, qEmptys16);
    242   q14u16 = vmvnq_u16(q14u16);
    243 
    244   d28u16 = vget_low_u16(q14u16);
    245   d29u16 = vget_high_u16(q14u16);
    246   d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
    247   d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
    248 
    249   q0s16 = vcombine_s16(d0s16, d1s16);
    250   q1s16 = vcombine_s16(d2s16, d3s16);
    251   q2s16 = vcombine_s16(d4s16, d5s16);
    252   q3s16 = vcombine_s16(d6s16, d7s16);
    253 
    254   vst1q_s16(output, q0s16);
    255   vst1q_s16(output + 8, q1s16);
    256   vst1q_s16(output + 16, q2s16);
    257   vst1q_s16(output + 24, q3s16);
    258   return;
    259 }
    260