Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "vp8/common/blockd.h"
     14 
     15 void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
     16                                            unsigned char * yabove_row,
     17                                            unsigned char * yleft,
     18                                            int left_stride,
     19                                            unsigned char * ypred_ptr,
     20                                            int y_stride) {
     21   const int mode = x->mode_info_context->mbmi.mode;
     22   int i;
     23 
     24   switch (mode) {
     25     case DC_PRED:
     26     {
     27       int shift = x->up_available + x->left_available;
     28       uint8x16_t v_expected_dc = vdupq_n_u8(128);
     29 
     30       if (shift) {
     31         unsigned int average = 0;
     32         int expected_dc;
     33         if (x->up_available) {
     34           const uint8x16_t v_above = vld1q_u8(yabove_row);
     35           const uint16x8_t a = vpaddlq_u8(v_above);
     36           const uint32x4_t b = vpaddlq_u16(a);
     37           const uint64x2_t c = vpaddlq_u32(b);
     38           const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
     39                                         vreinterpret_u32_u64(vget_high_u64(c)));
     40           average = vget_lane_u32(d, 0);
     41         }
     42         if (x->left_available) {
     43           for (i = 0; i < 16; ++i) {
     44               average += yleft[0];
     45               yleft += left_stride;
     46           }
     47         }
     48         shift += 3;
     49         expected_dc = (average + (1 << (shift - 1))) >> shift;
     50         v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
     51       }
     52       for (i = 0; i < 16; ++i) {
     53         vst1q_u8(ypred_ptr, v_expected_dc);
     54         ypred_ptr += y_stride;
     55       }
     56     }
     57     break;
     58     case V_PRED:
     59     {
     60       const uint8x16_t v_above = vld1q_u8(yabove_row);
     61       for (i = 0; i < 16; ++i) {
     62         vst1q_u8(ypred_ptr, v_above);
     63         ypred_ptr += y_stride;
     64       }
     65     }
     66     break;
     67     case H_PRED:
     68     {
     69       for (i = 0; i < 16; ++i) {
     70         const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
     71         yleft += left_stride;
     72         vst1q_u8(ypred_ptr, v_yleft);
     73         ypred_ptr += y_stride;
     74       }
     75     }
     76     break;
     77     case TM_PRED:
     78     {
     79       const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
     80       const uint8x16_t v_above = vld1q_u8(yabove_row);
     81       for (i = 0; i < 16; ++i) {
     82         const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
     83         const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
     84         const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
     85         const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
     86                                          vreinterpretq_s16_u16(v_ytop_left));
     87         const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
     88                                          vreinterpretq_s16_u16(v_ytop_left));
     89         const uint8x8_t pred_lo = vqmovun_s16(b_lo);
     90         const uint8x8_t pred_hi = vqmovun_s16(b_hi);
     91 
     92         vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
     93         ypred_ptr += y_stride;
     94         yleft += left_stride;
     95       }
     96     }
     97     break;
     98   }
     99 }
    100 
    101 void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
    102                                             unsigned char * uabove_row,
    103                                             unsigned char * vabove_row,
    104                                             unsigned char * uleft,
    105                                             unsigned char * vleft,
    106                                             int left_stride,
    107                                             unsigned char * upred_ptr,
    108                                             unsigned char * vpred_ptr,
    109                                             int pred_stride) {
    110   const int mode = x->mode_info_context->mbmi.uv_mode;
    111   int i;
    112 
    113   switch (mode) {
    114     case DC_PRED:
    115     {
    116       int shift = x->up_available + x->left_available;
    117       uint8x8_t v_expected_udc = vdup_n_u8(128);
    118       uint8x8_t v_expected_vdc = vdup_n_u8(128);
    119 
    120       if (shift) {
    121         unsigned int average_u = 0;
    122         unsigned int average_v = 0;
    123         int expected_udc;
    124         int expected_vdc;
    125         if (x->up_available) {
    126           const uint8x8_t v_uabove = vld1_u8(uabove_row);
    127           const uint8x8_t v_vabove = vld1_u8(vabove_row);
    128           const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
    129           const uint32x4_t b = vpaddlq_u16(a);
    130           const uint64x2_t c = vpaddlq_u32(b);
    131           average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
    132           average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
    133         }
    134         if (x->left_available) {
    135           for (i = 0; i < 8; ++i) {
    136               average_u += uleft[0];
    137               uleft += left_stride;
    138               average_v += vleft[0];
    139               vleft += left_stride;
    140           }
    141         }
    142         shift += 2;
    143         expected_udc = (average_u + (1 << (shift - 1))) >> shift;
    144         expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
    145         v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
    146         v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
    147       }
    148       for (i = 0; i < 8; ++i) {
    149         vst1_u8(upred_ptr, v_expected_udc);
    150         upred_ptr += pred_stride;
    151         vst1_u8(vpred_ptr, v_expected_vdc);
    152         vpred_ptr += pred_stride;
    153       }
    154     }
    155     break;
    156     case V_PRED:
    157     {
    158       const uint8x8_t v_uabove = vld1_u8(uabove_row);
    159       const uint8x8_t v_vabove = vld1_u8(vabove_row);
    160       for (i = 0; i < 8; ++i) {
    161         vst1_u8(upred_ptr, v_uabove);
    162         upred_ptr += pred_stride;
    163         vst1_u8(vpred_ptr, v_vabove);
    164         vpred_ptr += pred_stride;
    165       }
    166     }
    167     break;
    168     case H_PRED:
    169     {
    170       for (i = 0; i < 8; ++i) {
    171         const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
    172         const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
    173         uleft += left_stride;
    174         vleft += left_stride;
    175         vst1_u8(upred_ptr, v_uleft);
    176         upred_ptr += pred_stride;
    177         vst1_u8(vpred_ptr, v_vleft);
    178         vpred_ptr += pred_stride;
    179       }
    180     }
    181     break;
    182     case TM_PRED:
    183     {
    184       const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
    185       const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
    186       const uint8x8_t v_uabove = vld1_u8(uabove_row);
    187       const uint8x8_t v_vabove = vld1_u8(vabove_row);
    188       for (i = 0; i < 8; ++i) {
    189         const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
    190         const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
    191         const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
    192         const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
    193         const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
    194                                         vreinterpretq_s16_u16(v_utop_left));
    195         const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
    196                                         vreinterpretq_s16_u16(v_vtop_left));
    197         const uint8x8_t pred_u = vqmovun_s16(b_u);
    198         const uint8x8_t pred_v = vqmovun_s16(b_v);
    199 
    200         vst1_u8(upred_ptr, pred_u);
    201         vst1_u8(vpred_ptr, pred_v);
    202         upred_ptr += pred_stride;
    203         vpred_ptr += pred_stride;
    204         uleft += left_stride;
    205         vleft += left_stride;
    206       }
    207     }
    208     break;
    209   }
    210 }
    211