;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
;          dct_const_round_shift(a * b) within butterfly calculations.
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
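
; For reference, cospi_k_64 is cos(k * pi / 64) scaled by 2^14 and rounded,
; and dct_const_round_shift() is a round-to-nearest shift by 14 (implemented
; below with vqrshrn.s32 #14). A minimal C sketch, assuming the usual VP9
; definitions (cospi_64 is an illustrative name only):
;
;   #include <math.h>
;   #include <stdint.h>
;
;   static int16_t cospi_64(int k) {
;     // e.g. cospi_64(16) == 11585, matching cospi_16_64 above
;     return (int16_t)lround(16384.0 * cos(k * M_PI / 64.0));
;   }
;
;   static int16_t dct_const_round_shift(int32_t input) {
;     // ROUND_POWER_OF_TWO(input, 14)
;     return (int16_t)((input + (1 << 13)) >> 14);
;   }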


    EXPORT  |vp9_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA     Block, CODE, READONLY

    ; --------------------------------------------------------------------------
    ; Load from transposed_buffer
    ;   q14 = transposed_buffer[first_offset]
    ;   q13 = transposed_buffer[second_offset]
    ;   For proper address calculation, the last offset used when manipulating
    ;   transposed_buffer must be passed in. Use 0 for the first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset  - $prev_offset )*8*2
    vld1.s16        {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16        {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
    ; --------------------------------------------------------------------------
    ; Load from output (used as temporary storage)
    ;   reg1 = output[first_offset]
    ;   reg2 = output[second_offset]
    ;   For proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. Use 0 for the
    ;   first use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset  - $prev_offset )*32*2
    vld1.s16        {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16        {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
    ; --------------------------------------------------------------------------
    ; Store into output (sometimes used as temporary storage)
    ;   output[first_offset] = reg1
    ;   output[second_offset] = reg2
    ;   For proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. Use 0 for the
    ;   first use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset  - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
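    ; --------------------------------------------------------------------------
    ; The three macros above use relative addressing: r0/r1 always point at
    ; the row touched most recently, so only the delta from $prev_offset is
    ; added (rows are 8*2 bytes in transposed_buffer, 32*2 bytes in output).
    ; A hedged C model of LOAD_FROM_OUTPUT (load_from_output is an
    ; illustrative name only, not a real helper of this file):
    ;
    ;   #include <stdint.h>
    ;   #include <string.h>
    ;
    ;   static void load_from_output(int16_t **r1, int prev, int first,
    ;                                int second, int16_t reg1[8],
    ;                                int16_t reg2[8]) {
    ;     *r1 += (first - prev) * 32;               // rows of 32 int16_t
    ;     memcpy(reg1, *r1, 8 * sizeof(int16_t));   // vld1.s16 {reg1}
    ;     *r1 += (second - first) * 32;
    ;     memcpy(reg2, *r1, 8 * sizeof(int16_t));   // vld1.s16 {reg2}
    ;   }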
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d8}, [r10], r2
    vld1.s16        {d11}, [r9], r11
    vld1.s16        {d9}, [r10]
    vld1.s16        {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q7, q7, #6
    vrshr.s16       q8, q8, #6
    vrshr.s16       q9, q9, #6
    vrshr.s16       q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q7, q7, d9
    vaddw.u8        q8, q8, d10
    vaddw.u8        q9, q9, d11
    vaddw.u8        q6, q6, d8
    ; clip pixel
    vqmovun.s16     d9,  q7
    vqmovun.s16     d10, q8
    vqmovun.s16     d11, q9
    vqmovun.s16     d8,  q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d9}, [r10], r11
    vst1.16         {d10}, [r9], r2
    vst1.16         {d8}, [r10]
    vst1.16         {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9,  r9,  r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
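    ; --------------------------------------------------------------------------
    ; In C terms, each STORE_COMBINE_* macro performs the usual reconstruction
    ; step on four rows of eight pixels at a time: round the residual, add
    ; the predictor from dest, clip to [0, 255] (vqmovun.s16), and store.
    ; A minimal sketch (combine_add_row is an illustrative name only):
    ;
    ;   #include <stdint.h>
    ;
    ;   static uint8_t clip_pixel(int v) {
    ;     return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    ;   }
    ;
    ;   static void combine_add_row(const int16_t *out, uint8_t *dest, int n) {
    ;     for (int i = 0; i < n; i++) {
    ;       // vrshr #6 == ROUND_POWER_OF_TWO(out[i], 6), then vaddw.u8
    ;       dest[i] = clip_pixel(((out[i] + 32) >> 6) + dest[i]);
    ;     }
    ;   }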
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d8}, [r10], r2
    vld1.s16        {d11}, [r9], r11
    vld1.s16        {d9}, [r10]
    vld1.s16        {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q7, q7, #6
    vrshr.s16       q8, q8, #6
    vrshr.s16       q9, q9, #6
    vrshr.s16       q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q7, q7, d9
    vaddw.u8        q8, q8, d10
    vaddw.u8        q9, q9, d11
    vaddw.u8        q6, q6, d8
    ; clip pixel
    vqmovun.s16     d9,  q7
    vqmovun.s16     d10, q8
    vqmovun.s16     d11, q9
    vqmovun.s16     d8,  q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d9}, [r10], r11
    vst1.16         {d10}, [r9], r2
    vst1.16         {d8}, [r10]!
    vst1.16         {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9,  r9,  r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d4}, [r7], r2
    vld1.s16        {d7}, [r6], r11
    vld1.s16        {d5}, [r7]
    vld1.s16        {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q5, q5, #6
    vrshr.s16       q6, q6, #6
    vrshr.s16       q7, q7, #6
    vrshr.s16       q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q5, q5, d5
    vaddw.u8        q6, q6, d6
    vaddw.u8        q7, q7, d7
    vaddw.u8        q4, q4, d4
    ; clip pixel
    vqmovun.s16     d5, q5
    vqmovun.s16     d6, q6
    vqmovun.s16     d7, q7
    vqmovun.s16     d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d5}, [r7], r11
    vst1.16         {d6}, [r6], r2
    vst1.16         {d7}, [r6]
    vst1.16         {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d4}, [r7], r2
    vld1.s16        {d7}, [r6], r11
    vld1.s16        {d5}, [r7]
    vld1.s16        {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q5, q5, #6
    vrshr.s16       q6, q6, #6
    vrshr.s16       q7, q7, #6
    vrshr.s16       q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q5, q5, d5
    vaddw.u8        q6, q6, d6
    vaddw.u8        q7, q7, d7
    vaddw.u8        q4, q4, d4
    ; clip pixel
    vqmovun.s16     d5, q5
    vqmovun.s16     d6, q6
    vqmovun.s16     d7, q7
    vqmovun.s16     d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d5}, [r7], r11
    vst1.16         {d6}, [r6], r2
    vst1.16         {d7}, [r6]!
    vst1.16         {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ;           consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ;           additions/subtractions before the multiplies.
    ; generate the constants
    ;   generate scalar constants
    mov             r8,  #$first_constant  & 0xFF00
    mov             r12, #$second_constant & 0xFF00
    add             r8,  #$first_constant  & 0x00FF
    add             r12, #$second_constant & 0x00FF
    ;   generate vector constants
    vdup.16         d30, r8
    vdup.16         d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8,  $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9,  $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediates (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32  q8, q8, q10
    vsub.s32  q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediates (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32  q11, q12, q11
    vadd.s32  q10, q10, q15
    ; (used) four for intermediates (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8,  #14
    vqrshrn.s32 $reg2, q9,  #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e. four d registers)
    MEND
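    ; --------------------------------------------------------------------------
    ; A hedged C model of one lane of DO_BUTTERFLY: the classic rotation
    ; butterfly followed by dct_const_round_shift() (the vqrshrn #14 above,
    ; which additionally saturates to int16; saturation is omitted in this
    ; sketch). butterfly() is an illustrative name only:
    ;
    ;   #include <stdint.h>
    ;
    ;   static int16_t round_shift_14(int32_t x) {
    ;     return (int16_t)((x + (1 << 13)) >> 14);  // dct_const_round_shift
    ;   }
    ;
    ;   static void butterfly(int16_t in1, int16_t in2, int16_t c1, int16_t c2,
    ;                         int16_t *out1, int16_t *out2) {
    ;     *out1 = round_shift_14((int32_t)in1 * c1 - (int32_t)in2 * c2);
    ;     *out2 = round_shift_14((int32_t)in1 * c2 + (int32_t)in2 * c1);
    ;   }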
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
    ; --------------------------------------------------------------------------

;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
;   r0  int16_t *input,
;   r1  uint8_t *dest,
;   r2  int dest_stride)
; loop counters
;   r4  bands loop counter
;   r5  pass loop counter
;   r8  transpose loop counter
; combine-add pointers
;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
    305 
    306 |vp9_idct32x32_1024_add_neon| PROC
    307     ; This function does one pass of idct32x32 transform.
    308     ;
    309     ; This is done by transposing the input and then doing a 1d transform on
    310     ; columns. In the first pass, the transposed columns are the original
    311     ; rows. In the second pass, after the transposition, the colums are the
    312     ; original columns.
    313     ; The 1d transform is done by looping over bands of eight columns (the
    314     ; idct32_bands loop). For each band, the transform input transposition
    315     ; is done on demand, one band of four 8x8 matrices at a time. The four
    316     ; matrices are transposed by pairs (the idct32_transpose_pair loop).
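    ;
    ; A hedged, compile-only C outline of this control flow (transpose_8x32
    ; and idct32_1d_band are hypothetical stand-ins for the inline code
    ; below, not real helpers of this file):
    ;
    ;   #include <stdint.h>
    ;   void transpose_8x32(const int16_t *src, int16_t *tbuf);
    ;   void idct32_1d_band(const int16_t *tbuf, int16_t *dst);
    ;
    ;   void idct32x32_outline(const int16_t *input,
    ;                          int16_t *pass1, int16_t *pass2) {
    ;     int16_t tbuf[32 * 8];                          // transpose_buffer
    ;     for (int pass = 0; pass < 2; pass++) {
    ;       const int16_t *src = pass ? pass1 : input;
    ;       int16_t *dst = pass ? pass2 : pass1;
    ;       for (int band = 0; band < 4; band++) {
    ;         transpose_8x32(src + band * 8 * 32, tbuf); // 8 rows -> 32x8
    ;         idct32_1d_band(tbuf, dst + band * 8);      // 8 output columns
    ;       }
    ;     }
    ;     // The real second pass combine-adds into dest on the fly
    ;     // (STORE_COMBINE_* macros) instead of storing all of pass2 first.
    ;   }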
    push  {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer into which 8 lines are transposed before transforming them
    ;   int16_t transpose_buffer[32 * 8];
    ;   at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ;   int16_t pass1[32 * 32];
    ;   at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ;   int16_t pass2[32 * 32];
    ;   at sp + [2048, 4095]
    sub sp, sp, #512+2048+2048
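    ; Equivalently, the scratch area just reserved can be pictured as this
    ; hedged C view (idct32_scratch is an illustrative name, not used by the
    ; code):
    ;
    ;   #include <stdint.h>
    ;   typedef struct {
    ;     int16_t pass1[32 * 32];            // sp + [0, 2047]
    ;     int16_t pass2[32 * 32];            // sp + [2048, 4095]
    ;     int16_t transpose_buffer[32 * 8];  // sp + [4096, 4607]
    ;   } idct32_scratch;                    // 2048 + 2048 + 512 bytes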

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6,  r2, r2, lsl #5
    rsb r9,  r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
      ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
      ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0          ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4          ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2          ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
    ; adjusted to 32 because of the two post-increments.
    vld1.s16        {q8},  [r3]!
    vld1.s16        {q0},  [r3]!
    add r3, #32
    vld1.s16        {q9},  [r3]!
    vld1.s16        {q1},  [r3]!
    add r3, #32
    vld1.s16        {q10}, [r3]!
    vld1.s16        {q2},  [r3]!
    add r3, #32
    vld1.s16        {q11}, [r3]!
    vld1.s16        {q3},  [r3]!
    add r3, #32
    vld1.s16        {q12}, [r3]!
    vld1.s16        {q4},  [r3]!
    add r3, #32
    vld1.s16        {q13}, [r3]!
    vld1.s16        {q5},  [r3]!
    add r3, #32
    vld1.s16        {q14}, [r3]!
    vld1.s16        {q6},  [r3]!
    add r3, #32
    vld1.s16        {q15}, [r3]!
    vld1.s16        {q7},  [r3]!

    ; Transpose the two 8x8 16bit data matrices.
    vswp            d17, d24
    vswp            d23, d30
    vswp            d21, d28
    vswp            d19, d26
    vswp            d1,  d8
    vswp            d7,  d14
    vswp            d5,  d12
    vswp            d3,  d10
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15
    vtrn.32         q0,  q2
    vtrn.32         q1,  q3
    vtrn.32         q4,  q6
    vtrn.32         q5,  q7
    vtrn.16         q8,  q9
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    vtrn.16         q0,  q1
    vtrn.16         q2,  q3
    vtrn.16         q4,  q5
    vtrn.16         q6,  q7
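
    ; The vswp/vtrn.32/vtrn.16 sequence above is the standard NEON 8x8 16-bit
    ; transpose: swap the off-diagonal 4x4 blocks (vswp), transpose 2x2
    ; blocks of 32-bit pairs (vtrn.32), then transpose the 16-bit elements
    ; within each pair (vtrn.16). A plain C reference of the net effect on
    ; one 8x8 matrix:
    ;
    ;   #include <stdint.h>
    ;
    ;   static void transpose_8x8(int16_t m[8][8]) {
    ;     for (int i = 0; i < 8; i++) {
    ;       for (int j = i + 1; j < 8; j++) {
    ;         int16_t t = m[i][j];
    ;         m[i][j] = m[j][i];
    ;         m[j][i] = t;
    ;       }
    ;     }
    ;   }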

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16        {q8},  [r0]!
    vst1.16        {q9},  [r0]!
    vst1.16        {q10}, [r0]!
    vst1.16        {q11}, [r0]!
    vst1.16        {q12}, [r0]!
    vst1.16        {q13}, [r0]!
    vst1.16        {q14}, [r0]!
    vst1.16        {q15}, [r0]!
    vst1.16        {q0},  [r0]!
    vst1.16        {q1},  [r0]!
    vst1.16        {q2},  [r0]!
    vst1.16        {q3},  [r0]!
    vst1.16        {q4},  [r0]!
    vst1.16        {q5},  [r0]!
    vst1.16        {q6},  [r0]!
    vst1.16        {q7},  [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ;   go back by 7*32 for the seven lines moved fully by read and add
    ;   go back by 32 for the eighth line, which was only read
    ;   advance by 16*2 to go to the next pair
    sub r3,  r3,  #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculations through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C, and then the final calculations are done using results from
    ; blocks A and B, which have been combined at the end of block B.
    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] =  step1b[16][i] + step1b[17][i];
    ;step2[17] =  step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] =  step1b[30][i] + step1b[31][i];
    vadd.s16  q4, q0, q1
    vsub.s16  q13, q0, q1
    vadd.s16  q6, q2, q3
    vsub.s16  q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] =  step1b[18][i] + step1b[19][i];
    ;step2[28] =  step1b[28][i] + step1b[29][i];
    ;step2[29] =  step1b[28][i] - step1b[29][i];
    vsub.s16  q13, q3, q2
    vadd.s16  q3,  q3, q2
    vsub.s16  q14, q1, q0
    vadd.s16  q2,  q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16  q8,  q4, q2
    vadd.s16  q9,  q5, q0
    vadd.s16  q10, q7, q1
    vadd.s16  q15, q6, q3
    vsub.s16  q13, q5, q0
    vsub.s16  q14, q7, q1
    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16  q13, q4, q2
    vsub.s16  q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] =  step1b[20][i] + step1b[21][i];
    ;step2[21] =  step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] =  step1b[26][i] + step1b[27][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] =  step1b[22][i] + step1b[23][i];
    ;step2[24] =  step1b[24][i] + step1b[25][i];
    ;step2[25] =  step1b[24][i] - step1b[25][i];
    vsub.s16  q14, q4, q5
    vadd.s16  q5, q4, q5
    vsub.s16  q13, q6, q7
    vadd.s16  q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16  q10, q7, q1
    vadd.s16  q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16  q12, q6, q2
    vadd.s16  q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16  q8,  q14, q11
    vadd.s16  q9,  q13, q10
    vsub.s16  q13, q13, q10
    vsub.s16  q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16  q8,  q9,  q12
    vadd.s16  q10, q14, q15
    vsub.s16  q14, q14, q15
    vadd.s16  q12, q9,  q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop  {q13} ; [23]
    vpop  {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16  q14, q5, q0
    vsub.s16  q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16  q14,  q7, q1
    vsub.s16  q13,  q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16  q8,  q14, q1
    vadd.s16  q9,  q13, q6
    vsub.s16  q13, q13, q6
    vsub.s16  q1,  q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16  q14, q8, q5
    vadd.s16  q10, q8, q5
    vadd.s16  q11, q9, q0
    vsub.s16  q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
    ;step1[9]  = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16  q14, q4, q5
    vadd.s16  q5, q4, q5
    vsub.s16  q13, q6, q7
    vadd.s16  q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8]  = step1b[8][i] + step1b[11][i];
    ;step2[9]  = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16  q8,  q0, q5
    vadd.s16  q9,  q1, q7
    vsub.s16  q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16  q14, q3, q4
    vadd.s16  q10, q3, q4
    vadd.s16  q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16  q13, q0, q5
    vsub.s16  q14,  q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16  q4, q7, q6
    vsub.s16  q7, q7, q6
    vsub.s16  q6, q5, q14
    vadd.s16  q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16  q8,  q4, q2
    vadd.s16  q9,  q5, q3
    vadd.s16  q10, q6, q1
    vadd.s16  q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16  q12, q7, q0
    vsub.s16  q13, q6, q1
    vsub.s16  q14, q5, q3
    vsub.s16  q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16  q2, q8, q1
    vadd.s16  q3, q9, q0
    vsub.s16  q4, q9, q0
    vsub.s16  q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1

    cmp r5, #0
    bgt idct32_bands_end_2nd_pass
    995 
    996 idct32_bands_end_1st_pass
    997     STORE_IN_OUTPUT 17, 16, 17, q6, q7
    998     STORE_IN_OUTPUT 17, 14, 15, q8, q9
    999     ; --------------------------------------------------------------------------
   1000     ; part of final stage
   1001     ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
   1002     ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
   1003     ;output[30 * 32] = step1b[1][i] - step1b[30][i];
   1004     ;output[31 * 32] = step1b[0][i] - step1b[31][i];
   1005     LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
   1006     vadd.s16  q4, q2, q1
   1007     vadd.s16  q5, q3, q0
   1008     vsub.s16  q6, q3, q0
   1009     vsub.s16  q7, q2, q1
   1010     STORE_IN_OUTPUT 31, 30, 31, q6, q7
   1011     STORE_IN_OUTPUT 31,  0,  1, q4, q5
   1012     ; --------------------------------------------------------------------------
   1013     ; part of stage 7
   1014     ;step1[2] = step1b[2][i] + step1b[13][i];
   1015     ;step1[3] = step1b[3][i] + step1b[12][i];
   1016     ;step1[12] = step1b[3][i] - step1b[12][i];
   1017     ;step1[13] = step1b[2][i] - step1b[13][i];
   1018     LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
   1019     vadd.s16  q2, q10, q1
   1020     vadd.s16  q3, q11, q0
   1021     vsub.s16  q4, q11, q0
   1022     vsub.s16  q5, q10, q1
   1023     ; --------------------------------------------------------------------------
   1024     ; part of final stage
   1025     ;output[12 * 32] = step1b[12][i] + step1b[19][i];
   1026     ;output[13 * 32] = step1b[13][i] + step1b[18][i];
   1027     ;output[18 * 32] = step1b[13][i] - step1b[18][i];
   1028     ;output[19 * 32] = step1b[12][i] - step1b[19][i];
   1029     LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
   1030     vadd.s16  q8, q4, q1
   1031     vadd.s16  q9, q5, q0
   1032     vsub.s16  q6, q5, q0
   1033     vsub.s16  q7, q4, q1
   1034     STORE_IN_OUTPUT 19, 18, 19, q6, q7
   1035     STORE_IN_OUTPUT 19, 12, 13, q8, q9
   1036     ; --------------------------------------------------------------------------
   1037     ; part of final stage
   1038     ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
   1039     ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
   1040     ;output[28 * 32] = step1b[3][i] - step1b[28][i];
   1041     ;output[29 * 32] = step1b[2][i] - step1b[29][i];
   1042     LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
   1043     vadd.s16  q4, q2, q1
   1044     vadd.s16  q5, q3, q0
   1045     vsub.s16  q6, q3, q0
   1046     vsub.s16  q7, q2, q1
   1047     STORE_IN_OUTPUT 29, 28, 29, q6, q7
   1048     STORE_IN_OUTPUT 29,  2,  3, q4, q5
   1049     ; --------------------------------------------------------------------------
   1050     ; part of stage 7
   1051     ;step1[4] = step1b[4][i] + step1b[11][i];
   1052     ;step1[5] = step1b[5][i] + step1b[10][i];
   1053     ;step1[10] = step1b[5][i] - step1b[10][i];
   1054     ;step1[11] = step1b[4][i] - step1b[11][i];
   1055     LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
   1056     vadd.s16  q2, q12, q1
   1057     vadd.s16  q3, q13, q0
   1058     vsub.s16  q4, q13, q0
   1059     vsub.s16  q5, q12, q1
   1060     ; --------------------------------------------------------------------------
   1061     ; part of final stage
   1062     ;output[10 * 32] = step1b[10][i] + step1b[21][i];
   1063     ;output[11 * 32] = step1b[11][i] + step1b[20][i];
   1064     ;output[20 * 32] = step1b[11][i] - step1b[20][i];
   1065     ;output[21 * 32] = step1b[10][i] - step1b[21][i];
   1066     LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
   1067     vadd.s16  q8, q4, q1
   1068     vadd.s16  q9, q5, q0
   1069     vsub.s16  q6, q5, q0
   1070     vsub.s16  q7, q4, q1
   1071     STORE_IN_OUTPUT 21, 20, 21, q6, q7
   1072     STORE_IN_OUTPUT 21, 10, 11, q8, q9
   1073     ; --------------------------------------------------------------------------
   1074     ; part of final stage
   1075     ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
   1076     ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
   1077     ;output[26 * 32] = step1b[5][i] - step1b[26][i];
   1078     ;output[27 * 32] = step1b[4][i] - step1b[27][i];
   1079     LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
   1080     vadd.s16  q4, q2, q1
   1081     vadd.s16  q5, q3, q0
   1082     vsub.s16  q6, q3, q0
   1083     vsub.s16  q7, q2, q1
   1084     STORE_IN_OUTPUT 27, 26, 27, q6, q7
   1085     STORE_IN_OUTPUT 27,  4,  5, q4, q5
   1086     ; --------------------------------------------------------------------------
   1087     ; part of stage 7
   1088     ;step1[6] = step1b[6][i] + step1b[9][i];
   1089     ;step1[7] = step1b[7][i] + step1b[8][i];
   1090     ;step1[8] = step1b[7][i] - step1b[8][i];
   1091     ;step1[9] = step1b[6][i] - step1b[9][i];
   1092     LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
   1093     vadd.s16  q2, q14, q1
   1094     vadd.s16  q3, q15, q0
   1095     vsub.s16  q4, q15, q0
   1096     vsub.s16  q5, q14, q1
   1097     ; --------------------------------------------------------------------------
   1098     ; part of final stage
   1099     ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
   1100     ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
   1101     ;output[22 * 32] = step1b[9][i] - step1b[22][i];
   1102     ;output[23 * 32] = step1b[8][i] - step1b[23][i];
   1103     LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
   1104     vadd.s16  q8, q4, q1
   1105     vadd.s16  q9, q5, q0
   1106     vsub.s16  q6, q5, q0
   1107     vsub.s16  q7, q4, q1
   1108     STORE_IN_OUTPUT 23, 22, 23, q6, q7
   1109     STORE_IN_OUTPUT 23, 8, 9, q8, q9
   1110     ; --------------------------------------------------------------------------
   1111     ; part of final stage
   1112     ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
   1113     ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
   1114     ;output[24 * 32] = step1b[7][i] - step1b[24][i];
   1115     ;output[25 * 32] = step1b[6][i] - step1b[25][i];
   1116     LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
   1117     vadd.s16  q4, q2, q1
   1118     vadd.s16  q5, q3, q0
   1119     vsub.s16  q6, q3, q0
   1120     vsub.s16  q7, q2, q1
   1121     STORE_IN_OUTPUT 25, 24, 25, q6, q7
   1122     STORE_IN_OUTPUT 25,  6,  7, q4, q5

    ; restore r0 by removing the last offset from the last
    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;     operation (STORE_IN_OUTPUT 25,  6,  7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ;   advance by 8 lines (8*32*2)
    ;   go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
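    ; note: in bytes, r1 moves by -(7*32*2 - 8*2) = -(448 - 16) = -432,
    ;     i.e. back to row 0 of the band and over to the next 8 int16
    ;     columns; r3 moves by 8*32*2 - 32*2 = 512 - 64 = 448, i.e. down 8
    ;     input rows minus the two row pairs already consumed by the loop.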

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass 2 is the result of pass 1, so remove the offset of
    ;   32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop
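    ; note: dataflow of the two passes: pass 1 reads the transposed input
    ;     through r0 and accumulates pass1[32 * 32] through r1; pass 2 then
    ;     reads pass1 through r3 and writes pass2[32 * 32] at sp + 2048.
    ;     r5 is presumably tested at the top of the band epilogue so that
    ;     the second pass takes the idct32_bands_end_2nd_pass path below.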

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
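    ; note: in this second-pass epilogue the STORE_IN_OUTPUT calls of pass 1
    ;     are replaced by STORE_COMBINE_* macros which, per their definitions
    ;     earlier in the file, combine the final values with the destination
    ;     pixels (via r6/r7/r9/r10 and dest_stride in r2) instead of
    ;     spilling them back to the intermediate buffer.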
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
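    ; note: per the pseudocode above, this butterfly yields rows 0, 1, 30
    ;     and 31, the outermost rows of the block, hence the "extreme"
    ;     store; the "center" stores handle rows closer to the middle
    ;     (e.g. 12/13 and 18/19 below).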
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16  q2, q10, q1
    vadd.s16  q3, q11, q0
    vsub.s16  q4, q11, q0
    vsub.s16  q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16  q2, q12, q1
    vadd.s16  q3, q13, q0
    vsub.s16  q4, q13, q0
    vsub.s16  q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16  q2, q14, q1
    vadd.s16  q3, q15, q0
    vsub.s16  q4, q15, q0
    vsub.s16  q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for the next band pass by
    ;     adding/removing dest_stride * 8. The actual increment by eight
    ;     rows is taken care of within the _LAST macros.
    add r6,  r6,  r2, lsl #3
    add r9,  r9,  r2, lsl #3
    sub r7,  r7,  r2, lsl #3
    sub r10, r10, r2, lsl #3
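    ; note the directions: r6 and r9 step forward by 8 * dest_stride while
    ;     r7 and r10 step back, consistent with the center/extreme stores
    ;     writing rows from both ends of the 32x32 destination block toward
    ;     the middle.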

    ; restore r0 by removing the last offset from the last
    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ;   advance by 8 lines (8*32*2)
    ;   go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
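    ; note: in bytes, r1 moves by -(25*32*2 - 8*2) = -(1600 - 16) = -1584,
    ;     i.e. back to row 0 of the band and over to the next 8 int16
    ;     columns; r3 again advances by 512 - 64 = 448 bytes.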

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; restore the stack pointer, releasing the scratch buffers
    add sp, sp, #512+2048+2048
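    ; note: the 512+2048+2048 = 4608 bytes presumably cover the scratch
    ;     layout used above: a 32x8 int16 transposed band buffer
    ;     (32*8*2 = 512 bytes) plus two 32x32 int16 pass buffers
    ;     (32*32*2 = 2048 bytes each).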
    vpop {d8-d15}
    pop  {r4-r11}
    bx              lr
    ENDP  ; |vp9_idct32x32_1024_add_neon|
    END