Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp9_rtcd.h"
     12 #include "vp9/common/vp9_common.h"
     13 
     14 void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
     15                                       int16_t *output,
     16                                       int output_stride);
     17 void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
     18                                       int16_t *output,
     19                                       int16_t *pass1Output,
     20                                       int16_t skip_adding,
     21                                       uint8_t *dest,
     22                                       int dest_stride);
     23 void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
     24                                      int16_t *output,
     25                                      int output_stride);
     26 void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
     27                                      int16_t *output,
     28                                      int16_t *pass1Output,
     29                                      int16_t skip_adding,
     30                                      uint8_t *dest,
     31                                      int dest_stride);
     32 
     33 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
     34 extern void vp9_push_neon(int64_t *store);
     35 extern void vp9_pop_neon(int64_t *store);
     36 
     37 void vp9_idct16x16_256_add_neon(const int16_t *input,
     38                                 uint8_t *dest, int dest_stride) {
     39   int64_t store_reg[8];
     40   int16_t pass1_output[16*16] = {0};
     41   int16_t row_idct_output[16*16] = {0};
     42 
     43   // save d8-d15 register values.
     44   vp9_push_neon(store_reg);
     45 
     46   /* Parallel idct on the upper 8 rows */
     47   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
     48   // stage 6 result in pass1_output.
     49   vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
     50 
     51   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
     52   // with result in pass1(pass1_output) to calculate final result in stage 7
     53   // which will be saved into row_idct_output.
     54   vp9_idct16x16_256_add_neon_pass2(input+1,
     55                                      row_idct_output,
     56                                      pass1_output,
     57                                      0,
     58                                      dest,
     59                                      dest_stride);
     60 
     61   /* Parallel idct on the lower 8 rows */
     62   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
     63   // stage 6 result in pass1_output.
     64   vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
     65 
     66   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
     67   // with result in pass1(pass1_output) to calculate final result in stage 7
     68   // which will be saved into row_idct_output.
     69   vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
     70                                      row_idct_output+8,
     71                                      pass1_output,
     72                                      0,
     73                                      dest,
     74                                      dest_stride);
     75 
     76   /* Parallel idct on the left 8 columns */
     77   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
     78   // stage 6 result in pass1_output.
     79   vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
     80 
     81   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
     82   // with result in pass1(pass1_output) to calculate final result in stage 7.
     83   // Then add the result to the destination data.
     84   vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
     85                                      row_idct_output,
     86                                      pass1_output,
     87                                      1,
     88                                      dest,
     89                                      dest_stride);
     90 
     91   /* Parallel idct on the right 8 columns */
     92   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
     93   // stage 6 result in pass1_output.
     94   vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
     95 
     96   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
     97   // with result in pass1(pass1_output) to calculate final result in stage 7.
     98   // Then add the result to the destination data.
     99   vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
    100                                      row_idct_output+8,
    101                                      pass1_output,
    102                                      1,
    103                                      dest+8,
    104                                      dest_stride);
    105 
    106   // restore d8-d15 register values.
    107   vp9_pop_neon(store_reg);
    108 
    109   return;
    110 }
    111 
    112 void vp9_idct16x16_10_add_neon(const int16_t *input,
    113                                uint8_t *dest, int dest_stride) {
    114   int64_t store_reg[8];
    115   int16_t pass1_output[16*16] = {0};
    116   int16_t row_idct_output[16*16] = {0};
    117 
    118   // save d8-d15 register values.
    119   vp9_push_neon(store_reg);
    120 
    121   /* Parallel idct on the upper 8 rows */
    122   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
    123   // stage 6 result in pass1_output.
    124   vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
    125 
    126   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
    127   // with result in pass1(pass1_output) to calculate final result in stage 7
    128   // which will be saved into row_idct_output.
    129   vp9_idct16x16_10_add_neon_pass2(input+1,
    130                                         row_idct_output,
    131                                         pass1_output,
    132                                         0,
    133                                         dest,
    134                                         dest_stride);
    135 
    136   /* Skip Parallel idct on the lower 8 rows as they are all 0s */
    137 
    138   /* Parallel idct on the left 8 columns */
    139   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
    140   // stage 6 result in pass1_output.
    141   vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
    142 
    143   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
    144   // with result in pass1(pass1_output) to calculate final result in stage 7.
    145   // Then add the result to the destination data.
    146   vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
    147                                      row_idct_output,
    148                                      pass1_output,
    149                                      1,
    150                                      dest,
    151                                      dest_stride);
    152 
    153   /* Parallel idct on the right 8 columns */
    154   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
    155   // stage 6 result in pass1_output.
    156   vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
    157 
    158   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
    159   // with result in pass1(pass1_output) to calculate final result in stage 7.
    160   // Then add the result to the destination data.
    161   vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
    162                                      row_idct_output+8,
    163                                      pass1_output,
    164                                      1,
    165                                      dest+8,
    166                                      dest_stride);
    167 
    168   // restore d8-d15 register values.
    169   vp9_pop_neon(store_reg);
    170 
    171   return;
    172 }
    173