Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 #include <assert.h>
     13 
     14 #include "./vp9_rtcd.h"
     15 #include "./vpx_config.h"
     16 #include "vp9/common/vp9_common.h"
     17 
     18 static int16_t sinpi_1_9 = 0x14a3;
     19 static int16_t sinpi_2_9 = 0x26c9;
     20 static int16_t sinpi_3_9 = 0x3441;
     21 static int16_t sinpi_4_9 = 0x3b6c;
     22 static int16_t cospi_8_64 = 0x3b21;
     23 static int16_t cospi_16_64 = 0x2d41;
     24 static int16_t cospi_24_64 = 0x187e;
     25 
     26 static INLINE void TRANSPOSE4X4(
     27         int16x8_t *q8s16,
     28         int16x8_t *q9s16) {
     29     int32x4_t q8s32, q9s32;
     30     int16x4x2_t d0x2s16, d1x2s16;
     31     int32x4x2_t q0x2s32;
     32 
     33     d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
     34     d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
     35 
     36     q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
     37     q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
     38     q0x2s32 = vtrnq_s32(q8s32, q9s32);
     39 
     40     *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
     41     *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
     42     return;
     43 }
     44 
     45 static INLINE void GENERATE_COSINE_CONSTANTS(
     46         int16x4_t *d0s16,
     47         int16x4_t *d1s16,
     48         int16x4_t *d2s16) {
     49     *d0s16 = vdup_n_s16(cospi_8_64);
     50     *d1s16 = vdup_n_s16(cospi_16_64);
     51     *d2s16 = vdup_n_s16(cospi_24_64);
     52     return;
     53 }
     54 
     55 static INLINE void GENERATE_SINE_CONSTANTS(
     56         int16x4_t *d3s16,
     57         int16x4_t *d4s16,
     58         int16x4_t *d5s16,
     59         int16x8_t *q3s16) {
     60     *d3s16 = vdup_n_s16(sinpi_1_9);
     61     *d4s16 = vdup_n_s16(sinpi_2_9);
     62     *q3s16 = vdupq_n_s16(sinpi_3_9);
     63     *d5s16 = vdup_n_s16(sinpi_4_9);
     64     return;
     65 }
     66 
     67 static INLINE void IDCT4x4_1D(
     68         int16x4_t *d0s16,
     69         int16x4_t *d1s16,
     70         int16x4_t *d2s16,
     71         int16x8_t *q8s16,
     72         int16x8_t *q9s16) {
     73     int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
     74     int16x4_t d26s16, d27s16, d28s16, d29s16;
     75     int32x4_t q10s32, q13s32, q14s32, q15s32;
     76     int16x8_t q13s16, q14s16;
     77 
     78     d16s16 = vget_low_s16(*q8s16);
     79     d17s16 = vget_high_s16(*q8s16);
     80     d18s16 = vget_low_s16(*q9s16);
     81     d19s16 = vget_high_s16(*q9s16);
     82 
     83     d23s16 = vadd_s16(d16s16, d18s16);
     84     d24s16 = vsub_s16(d16s16, d18s16);
     85 
     86     q15s32 = vmull_s16(d17s16, *d2s16);
     87     q10s32 = vmull_s16(d17s16, *d0s16);
     88     q13s32 = vmull_s16(d23s16, *d1s16);
     89     q14s32 = vmull_s16(d24s16, *d1s16);
     90     q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
     91     q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
     92 
     93     d26s16 = vqrshrn_n_s32(q13s32, 14);
     94     d27s16 = vqrshrn_n_s32(q14s32, 14);
     95     d29s16 = vqrshrn_n_s32(q15s32, 14);
     96     d28s16 = vqrshrn_n_s32(q10s32, 14);
     97 
     98     q13s16 = vcombine_s16(d26s16, d27s16);
     99     q14s16 = vcombine_s16(d28s16, d29s16);
    100     *q8s16 = vaddq_s16(q13s16, q14s16);
    101     *q9s16 = vsubq_s16(q13s16, q14s16);
    102     *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
    103                           vget_low_s16(*q9s16));  // vswp
    104     return;
    105 }
    106 
    107 static INLINE void IADST4x4_1D(
    108         int16x4_t *d3s16,
    109         int16x4_t *d4s16,
    110         int16x4_t *d5s16,
    111         int16x8_t *q3s16,
    112         int16x8_t *q8s16,
    113         int16x8_t *q9s16) {
    114     int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
    115     int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
    116 
    117     d6s16 = vget_low_s16(*q3s16);
    118 
    119     d16s16 = vget_low_s16(*q8s16);
    120     d17s16 = vget_high_s16(*q8s16);
    121     d18s16 = vget_low_s16(*q9s16);
    122     d19s16 = vget_high_s16(*q9s16);
    123 
    124     q10s32 = vmull_s16(*d3s16, d16s16);
    125     q11s32 = vmull_s16(*d4s16, d16s16);
    126     q12s32 = vmull_s16(d6s16, d17s16);
    127     q13s32 = vmull_s16(*d5s16, d18s16);
    128     q14s32 = vmull_s16(*d3s16, d18s16);
    129     q15s32 = vmovl_s16(d16s16);
    130     q15s32 = vaddw_s16(q15s32, d19s16);
    131     q8s32  = vmull_s16(*d4s16, d19s16);
    132     q15s32 = vsubw_s16(q15s32, d18s16);
    133     q9s32  = vmull_s16(*d5s16, d19s16);
    134 
    135     q10s32 = vaddq_s32(q10s32, q13s32);
    136     q10s32 = vaddq_s32(q10s32, q8s32);
    137     q11s32 = vsubq_s32(q11s32, q14s32);
    138     q8s32  = vdupq_n_s32(sinpi_3_9);
    139     q11s32 = vsubq_s32(q11s32, q9s32);
    140     q15s32 = vmulq_s32(q15s32, q8s32);
    141 
    142     q13s32 = vaddq_s32(q10s32, q12s32);
    143     q10s32 = vaddq_s32(q10s32, q11s32);
    144     q14s32 = vaddq_s32(q11s32, q12s32);
    145     q10s32 = vsubq_s32(q10s32, q12s32);
    146 
    147     d16s16 = vqrshrn_n_s32(q13s32, 14);
    148     d17s16 = vqrshrn_n_s32(q14s32, 14);
    149     d18s16 = vqrshrn_n_s32(q15s32, 14);
    150     d19s16 = vqrshrn_n_s32(q10s32, 14);
    151 
    152     *q8s16 = vcombine_s16(d16s16, d17s16);
    153     *q9s16 = vcombine_s16(d18s16, d19s16);
    154     return;
    155 }
    156 
    157 void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
    158                             int dest_stride, int tx_type) {
    159     uint8x8_t d26u8, d27u8;
    160     int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
    161     uint32x2_t d26u32, d27u32;
    162     int16x8_t q3s16, q8s16, q9s16;
    163     uint16x8_t q8u16, q9u16;
    164 
    165     d26u32 = d27u32 = vdup_n_u32(0);
    166 
    167     q8s16 = vld1q_s16(input);
    168     q9s16 = vld1q_s16(input + 8);
    169 
    170     TRANSPOSE4X4(&q8s16, &q9s16);
    171 
    172     switch (tx_type) {
    173       case 0:  // idct_idct is not supported. Fall back to C
    174         vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
    175         return;
    176         break;
    177       case 1:  // iadst_idct
    178         // generate constants
    179         GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
    180         GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
    181 
    182         // first transform rows
    183         IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
    184 
    185         // transpose the matrix
    186         TRANSPOSE4X4(&q8s16, &q9s16);
    187 
    188         // then transform columns
    189         IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
    190         break;
    191       case 2:  // idct_iadst
    192         // generate constantsyy
    193         GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
    194         GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
    195 
    196         // first transform rows
    197         IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
    198 
    199         // transpose the matrix
    200         TRANSPOSE4X4(&q8s16, &q9s16);
    201 
    202         // then transform columns
    203         IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
    204         break;
    205       case 3:  // iadst_iadst
    206         // generate constants
    207         GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
    208 
    209         // first transform rows
    210         IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
    211 
    212         // transpose the matrix
    213         TRANSPOSE4X4(&q8s16, &q9s16);
    214 
    215         // then transform columns
    216         IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
    217         break;
    218       default:  // iadst_idct
    219         assert(0);
    220         break;
    221     }
    222 
    223     q8s16 = vrshrq_n_s16(q8s16, 4);
    224     q9s16 = vrshrq_n_s16(q9s16, 4);
    225 
    226     d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
    227     dest += dest_stride;
    228     d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
    229     dest += dest_stride;
    230     d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
    231     dest += dest_stride;
    232     d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
    233 
    234     q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
    235     q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
    236 
    237     d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    238     d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    239 
    240     vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
    241     dest -= dest_stride;
    242     vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
    243     dest -= dest_stride;
    244     vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
    245     dest -= dest_stride;
    246     vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
    247     return;
    248 }
    249