@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@*******************************************************************************
@* @file
@*  ih264_resi_trans_quant_a9.s
@*
@* @brief
@*  Contains function definitions for residue calculation, forward transform
@*  and quantization
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  ih264_resi_trans_quant_4x4_a9
@*  ih264_resi_trans_quant_8x8_a9
@*  ih264_resi_trans_quant_chroma_4x4_a9
@*  ih264_hadamard_quant_4x4_a9
@*  ih264_hadamard_quant_2x2_uv_a9
@*
@* @remarks
@*  None
@*
@*******************************************************************************


.text
.p2align 2
@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_4x4_a9
@* Description       : This function computes the residue, performs the forward
@*                     4x4 core transform (cf4) of H264 and quantizes the result
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to pred buffer
@                       R2 :pointer to dst buffer
@                       R3 :source stride
@                       STACK : pred stride,
@                               pointer to scaling matrix,
@                               pointer to threshold matrix,
@                               qbits,
@                               rounding factor,
@                               pointer to store nnz
@                               pointer to store the unquantized dc value
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 40 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         1 12 2013    100633      First version
@         20 1 2014    100633      Changed the API, optimization
@
@*****************************************************************************

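@ Editor's note: the C sketch below is illustrative only; the function name and
@ variable names (resi_trans_quant_4x4_sketch, dc_out, round_fact, ...) are
@ placeholders and not the library's API. It is meant to mirror the arithmetic
@ the NEON code performs: residue, forward 4x4 core transform, quantization,
@ storing the unquantized DC value, and the nnz count.
@
@   void resi_trans_quant_4x4_sketch(const unsigned char *src, int src_strd,
@                                    const unsigned char *pred, int pred_strd,
@                                    short *dst, const unsigned short *scale,
@                                    unsigned int qbits, unsigned int round_fact,
@                                    unsigned char *nnz, short *dc_out)
@   {
@       short resi[4][4], tmp[4][4], out[4][4];
@       int i, j, zeros = 0;
@       for (i = 0; i < 4; i++)                        /* residue             */
@           for (j = 0; j < 4; j++)
@               resi[i][j] = (short)(src[i*src_strd + j] - pred[i*pred_strd + j]);
@       for (i = 0; i < 4; i++) {                      /* horizontal pass     */
@           short x0 = resi[i][0] + resi[i][3], x1 = resi[i][1] + resi[i][2];
@           short x2 = resi[i][1] - resi[i][2], x3 = resi[i][0] - resi[i][3];
@           tmp[i][0] = x0 + x1;          tmp[i][1] = (x3 << 1) + x2;
@           tmp[i][2] = x0 - x1;          tmp[i][3] = x3 - (x2 << 1);
@       }
@       for (j = 0; j < 4; j++) {                      /* vertical pass       */
@           short x0 = tmp[0][j] + tmp[3][j], x1 = tmp[1][j] + tmp[2][j];
@           short x2 = tmp[1][j] - tmp[2][j], x3 = tmp[0][j] - tmp[3][j];
@           out[0][j] = x0 + x1;          out[1][j] = (x3 << 1) + x2;
@           out[2][j] = x0 - x1;          out[3][j] = x3 - (x2 << 1);
@       }
@       *dc_out = out[0][0];                           /* unquantized DC      */
@       for (i = 0; i < 4; i++)                        /* quantize            */
@           for (j = 0; j < 4; j++) {
@               int v = out[i][j];
@               int q = (((v < 0 ? -v : v) * scale[i*4 + j]) + round_fact) >> qbits;
@               dst[i*4 + j] = (short)(v < 0 ? -q : q);
@               zeros += (q == 0);
@           }
@       *nnz = (unsigned char)(16 - zeros);
@   }
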
    .global ih264_resi_trans_quant_4x4_a9
ih264_resi_trans_quant_4x4_a9:

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :source stride
    @STACK  :pred stride
    @       :scale matrix
    @       :threshold matrix
    @       :qbits
    @       :round factor
    @       :nnz
    @       :pointer to store the unquantized dc value

    push          {r4-r12, lr}          @push all the variables first

    add           r11, sp, #40          @point r11 past the 10 saved registers to reach the stack arguments
    ldmfd         r11, {r4-r10}         @load the stack arguments into registers

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :source stride
    @R4     :pred stride
    @R5     :scale matrix
    @R6     :threshold matrix
    @R7     :qbits
    @R8     :round factor
    @R9     :nnz
    @R10    :pointer to store the unquantized dc value

    vpush         {d8-d15}

    mov           r11, #0
    sub           r7, r11, r7           @negate the qbit value so VSHL can be used for the right shift

    @------------Function loading done----------------;

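@ Editor's note (assumption, for clarity): NEON has no variable right-shift, so
@ the negated qbits value is used later with VSHL. Each quantized coefficient is
@ therefore effectively computed as, in C-like terms,
@
@   q = (abs(coeff) * scale + round_factor) >> qbits;   /* vmlal + vshl by -qbits */
@
@ with the sign restored afterwards by the VCLT/VBSL pair.
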
    vld1.u8       d30, [r0], r3         @load first 8 pix src row 1

    vld1.u8       d31, [r1], r4         @load first 8 pix pred row 1

    vld1.u8       d28, [r0], r3         @load first 8 pix src row 2

    vld1.u8       d29, [r1], r4         @load first 8 pix pred row 2

    vld1.u8       d26, [r0], r3         @load first 8 pix src row 3

    vld1.u8       d27, [r1], r4         @load first 8 pix pred row 3
    vsubl.u8      q0, d30, d31          @find residue row 1

    vld1.u8       d24, [r0], r3         @load first 8 pix src row 4

    vld1.u8       d25, [r1], r4         @load first 8 pix pred row 4
    vsubl.u8      q1, d28, d29          @find residue row 2

    vsubl.u8      q2, d26, d27          @find residue row 3
    vsubl.u8      q3, d24, d25          @find residue row 4

    vtrn.16       d0, d2                @T12
    vtrn.16       d4, d6                @T23
    vtrn.32       d0, d4                @T13
    vtrn.32       d2, d6                @T14

    vadd.s16      d8 , d0, d6           @x0 = x4+x7
    vadd.s16      d9 , d2, d4           @x1 = x5+x6
    vsub.s16      d10, d2, d4           @x2 = x5-x6
    vsub.s16      d11, d0, d6           @x3 = x4-x7

    vshl.s16      d12, d10, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d13, d11, #1          @U_SHIFT(x3,1,shft)

    vadd.s16      d14, d8, d9           @x4 = x0 + x1;
    vsub.s16      d16, d8, d9           @x6 = x0 - x1;
    vadd.s16      d15, d13, d10         @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d17, d11, d12         @x7 = x3 - U_SHIFT(x2,1,shft);

    @take the transpose again to perform the vertical transform
    vtrn.16       d14, d15              @T12
    vtrn.16       d16, d17              @T23
    vtrn.32       d14, d16              @T13
    vtrn.32       d15, d17              @T24

    @now do the vertical transform
    @same code as the horizontal pass
    vadd.s16      d18, d14, d17         @x0 = x4+x7
    vadd.s16      d19, d15, d16         @x1 = x5+x6
    vsub.s16      d20, d15, d16         @x2 = x5-x6
    vsub.s16      d21, d14, d17         @x3 = x4-x7

    vshl.s16      d22, d20, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d23, d21, #1          @U_SHIFT(x3,1,shft)

    vdup.s32      q4, r8                @Load rounding value row 1

    vadd.s16      d24, d18, d19         @x5 = x0 + x1;
    vsub.s16      d26, d18, d19         @x7 = x0 - x1;
    vadd.s16      d25, d23, d20         @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d27, d21, d22         @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32      q10, r7               @Load qbit values

    vst1.s16      d24[0], [r10]         @Store the dc value to the alternate dc address

@core transform is done for the 4x4 block
    vld1.s16      {q14-q15}, [r5]       @load the scaling values

    vabs.s16      q0, q12               @Abs val of rows 1,2

    vabs.s16      q1, q13               @Abs val of rows 3,4

    vmov.s32      q5, q4                @copy round fact for row 2

    vmov.s32      q6, q4                @copy round fact for row 3
    vclt.s16      q2, q12, #0           @Get the sign of rows 1,2

    vmov.s32      q7, q4                @copy round fact for row 4
    vclt.s16      q3, q13, #0           @Get the sign of rows 3,4

    vmlal.s16     q4, d0, d28           @Multiply and add row 1
    vmlal.s16     q5, d1, d29           @Multiply and add row 2
    vmlal.s16     q6, d2, d30           @Multiply and add row 3
    vmlal.s16     q7, d3, d31           @Multiply and add row 4

    vshl.s32      q11, q4, q10          @Shift row 1
    vshl.s32      q12, q5, q10          @Shift row 2
    vshl.s32      q13, q6, q10          @Shift row 3
    vshl.s32      q14, q7, q10          @Shift row 4

    vmovn.s32     d30, q11              @Narrow row 1
    vmovn.s32     d31, q12              @Narrow row 2
    vmovn.s32     d0 , q13              @Narrow row 3
    vmovn.s32     d1 , q14              @Narrow row 4

    vneg.s16      q1, q15               @Get negative of rows 1,2
    vneg.s16      q4, q0                @Get negative of rows 3,4

    vceq.s16      q5, q15, #0           @I  compare with zero rows 1,2
    vceq.s16      q6, q0 , #0           @I  compare with zero rows 3,4

    vbsl.s16      q2, q1, q15           @Restore sign of rows 1,2
    vbsl.s16      q3, q4, q0            @Restore sign of rows 3,4


    vmovn.u16     d14, q5               @I  Narrow the comparison for rows 1,2
    vmovn.u16     d15, q6               @I  Narrow the comparison for rows 3,4

    vshr.u8       q8, q7, #7            @I  Reduce the comparison result to a single bit per coefficient [ keep the value for later use ]

    vpadd.u8      d18, d16, d17         @I Pair add nnz 1
    vpadd.u8      d20, d18, d19         @I Pair add nnz 2
    vpadd.u8      d22, d20, d21         @I Pair add nnz 3
    vpadd.u8      d24, d22, d23         @I Pair add nnz 4
    vst1.s16      {q2-q3}, [r2]         @Store the block

    vmov.u8       d25, #16              @I Get max nnz
    vsub.u8       d26, d25, d24         @I nnz = 16 - zero count

    vst1.u8       d26[0], [r9]          @I  Write nnz

    vpop          {d8-d15}
    pop           {r4-r12, pc}


@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_chroma_4x4_a9
@* Description       : This function does residue calculation, forward transform
@*                     and quantization for a 4x4 chroma block.
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to pred buffer
@                       R2 :pointer to dst buffer
@                       R3 :source stride
@                       STACK : pred stride,
@                               pointer to scaling matrix,
@                               pointer to threshold matrix,
@                               qbits,
@                               rounding factor,
@                               pointer to store nnz
@                               pointer to store unquantized dc values
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 40 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         11 2 2015    100664      First version
@
@*****************************************************************************

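@ Editor's note (assumption): the chroma source and prediction buffers are
@ interleaved UV; the VLD2 loads below de-interleave them so that only one
@ plane is transformed per call. In C-like terms the residue step is roughly
@ (placeholder names, not the library's reference code):
@
@   for (i = 0; i < 4; i++)
@       for (j = 0; j < 4; j++)          /* step of 2 skips the other plane */
@           resi[i][j] = (short)(src[i*src_strd + 2*j] - pred[i*pred_strd + 2*j]);
@
@ The transform and quantization that follow are identical to the luma 4x4
@ routine above.
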
    .global ih264_resi_trans_quant_chroma_4x4_a9
ih264_resi_trans_quant_chroma_4x4_a9:

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :source stride
    @STACK  :pred stride
    @       :scale matrix
    @       :threshold matrix
    @       :qbits
    @       :round factor
    @       :nnz
    @       :pu1_dc_alt_addr
    push          {r4-r12, lr}          @push all the variables first

    add           r11, sp, #40          @point r11 past the 10 saved registers to reach the stack arguments
    ldmfd         r11, {r4-r10}         @load the stack arguments into registers

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :source stride
    @R4     :pred stride
    @R5     :scale matrix
    @R6     :threshold matrix
    @R7     :qbits
    @R8     :round factor
    @R9     :nnz
    @R10    :pu1_dc_alt_addr
    vpush         {d8-d15}
    mov           r11, #0
    sub           r7, r11, r7           @negate the qbit value so VSHL can be used for the right shift

    @------------Function loading done----------------;

    vld2.u8       {d10, d11}, [r0], r3  @load first 8 pix src row 1

    vld2.u8       {d11, d12}, [r1], r4  @load first 8 pix pred row 1

    vld2.u8       {d28, d29}, [r0], r3  @load first 8 pix src row 2

    vld2.u8       {d29, d30}, [r1], r4  @load first 8 pix pred row 2

    vld2.u8       {d25, d26}, [r0], r3  @load first 8 pix src row 3

    vld2.u8       {d26, d27}, [r1], r4  @load first 8 pix pred row 3
    vsubl.u8      q0, d10, d11          @find residue row 1

    vld2.u8       {d22, d23}, [r0], r3  @load first 8 pix src row 4

    vld2.u8       {d23, d24}, [r1], r4  @load first 8 pix pred row 4
    vsubl.u8      q1, d28, d29          @find residue row 2

    vsubl.u8      q2, d25, d26          @find residue row 3
    vsubl.u8      q3, d22, d23          @find residue row 4

    vtrn.16       d0, d2                @T12
    vtrn.16       d4, d6                @T23
    vtrn.32       d0, d4                @T13
    vtrn.32       d2, d6                @T14

    vadd.s16      d8 , d0, d6           @x0 = x4+x7
    vadd.s16      d9 , d2, d4           @x1 = x5+x6
    vsub.s16      d10, d2, d4           @x2 = x5-x6
    vsub.s16      d11, d0, d6           @x3 = x4-x7

    vshl.s16      d12, d10, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d13, d11, #1          @U_SHIFT(x3,1,shft)

    vadd.s16      d14, d8, d9           @x4 = x0 + x1;
    vsub.s16      d16, d8, d9           @x6 = x0 - x1;
    vadd.s16      d15, d13, d10         @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d17, d11, d12         @x7 = x3 - U_SHIFT(x2,1,shft);

    @take the transpose again to perform the vertical transform
    vtrn.16       d14, d15              @T12
    vtrn.16       d16, d17              @T23
    vtrn.32       d14, d16              @T13
    vtrn.32       d15, d17              @T24

    @now do the vertical transform
    @same code as the horizontal pass
    vadd.s16      d18, d14, d17         @x0 = x4+x7
    vadd.s16      d19, d15, d16         @x1 = x5+x6
    vsub.s16      d20, d15, d16         @x2 = x5-x6
    vsub.s16      d21, d14, d17         @x3 = x4-x7

    vshl.s16      d22, d20, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d23, d21, #1          @U_SHIFT(x3,1,shft)

    vdup.s32      q4, r8                @Load rounding value row 1

    vadd.s16      d24, d18, d19         @x5 = x0 + x1;
    vsub.s16      d26, d18, d19         @x7 = x0 - x1;
    vadd.s16      d25, d23, d20         @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d27, d21, d22         @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32      q10, r7               @Load qbit values

    vst1.s16      d24[0], [r10]         @Store the unquantized dc value to the alternate dc address

@core transform is done for the 4x4 block
    vld1.s16      {q14-q15}, [r5]       @load the scaling values

    vabs.s16      q0, q12               @Abs val of rows 1,2

    vabs.s16      q1, q13               @Abs val of rows 3,4

    vmov.s32      q5, q4                @copy round fact for row 2

    vmov.s32      q6, q4                @copy round fact for row 3
    vclt.s16      q2, q12, #0           @Get the sign of rows 1,2

    vmov.s32      q7, q4                @copy round fact for row 4
    vclt.s16      q3, q13, #0           @Get the sign of rows 3,4

    vmlal.s16     q4, d0, d28           @Multiply and add row 1
    vmlal.s16     q5, d1, d29           @Multiply and add row 2
    vmlal.s16     q6, d2, d30           @Multiply and add row 3
    vmlal.s16     q7, d3, d31           @Multiply and add row 4

    vshl.s32      q11, q4, q10          @Shift row 1
    vshl.s32      q12, q5, q10          @Shift row 2
    vshl.s32      q13, q6, q10          @Shift row 3
    vshl.s32      q14, q7, q10          @Shift row 4

    vmovn.s32     d30, q11              @Narrow row 1
    vmovn.s32     d31, q12              @Narrow row 2
    vmovn.s32     d0 , q13              @Narrow row 3
    vmovn.s32     d1 , q14              @Narrow row 4

    vneg.s16      q1, q15               @Get negative of rows 1,2
    vneg.s16      q4, q0                @Get negative of rows 3,4

    vceq.s16      q5, q15, #0           @I  compare with zero rows 1,2
    vceq.s16      q6, q0 , #0           @I  compare with zero rows 3,4

    vbsl.s16      q2, q1, q15           @Restore sign of rows 1,2
    vbsl.s16      q3, q4, q0            @Restore sign of rows 3,4

    vmovn.u16     d14, q5               @I  Narrow the comparison for rows 1,2
    vmovn.u16     d15, q6               @I  Narrow the comparison for rows 3,4

    vshr.u8       q8, q7, #7            @I  Reduce the comparison result to a single bit per coefficient [ keep the value for later use ]

    vpadd.u8      d18, d16, d17         @I Pair add nnz 1
    vpadd.u8      d20, d18, d19         @I Pair add nnz 2
    vpadd.u8      d22, d20, d21         @I Pair add nnz 3
    vpadd.u8      d24, d22, d23         @I Pair add nnz 4
    vst1.s16      {q2-q3}, [r2]         @Store the block

    vmov.u8       d25, #16              @I Get max nnz
    vsub.u8       d26, d25, d24         @I nnz = 16 - zero count

    vst1.u8       d26[0], [r9]          @I  Write nnz

    vpop          {d8-d15}
    pop           {r4-r12, pc}


@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_4x4_a9
@* Description       : This function does the forward hadamard transform and
@*                     quantization for the luma dc block
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to dst buffer
@                       R2 :pu2_scale_matrix
@                       R3 :pu2_threshold_matrix
@                       STACK : u4_qbits
@                               u4_round_factor
@                               pu1_nnz
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 0 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         20 2 2015    100633      First version
@
@*****************************************************************************
@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                           const UWORD16 *pu2_scale_matrix,
@                           const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                           UWORD32 u4_round_factor, UWORD8 *pu1_nnz
@                           )
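@ Editor's note: an illustrative C sketch (placeholder names, not the library's
@ reference code) of the 4x4 hadamard transform and quantization of the luma DC
@ block performed below. Each transformed value is halved before quantization,
@ matching the ">> 1" comments, and only pu2_scale_matrix[0] is used.
@
@   void hadamard_quant_4x4_sketch(const short *src, short *dst,
@                                  unsigned short scale0, unsigned int qbits,
@                                  unsigned int round_fact, unsigned char *nnz)
@   {
@       int tmp[4][4], out[4][4], i, j, zeros = 0;
@       for (i = 0; i < 4; i++) {                      /* horizontal pass */
@           int x0 = src[i*4 + 0] + src[i*4 + 3];
@           int x1 = src[i*4 + 1] + src[i*4 + 2];
@           int x2 = src[i*4 + 1] - src[i*4 + 2];
@           int x3 = src[i*4 + 0] - src[i*4 + 3];
@           tmp[i][0] = x0 + x1;  tmp[i][1] = x3 + x2;
@           tmp[i][2] = x0 - x1;  tmp[i][3] = x3 - x2;
@       }
@       for (j = 0; j < 4; j++) {                      /* vertical pass   */
@           int x0 = tmp[0][j] + tmp[3][j];
@           int x1 = tmp[1][j] + tmp[2][j];
@           int x2 = tmp[1][j] - tmp[2][j];
@           int x3 = tmp[0][j] - tmp[3][j];
@           out[0][j] = (x0 + x1) >> 1;  out[1][j] = (x3 + x2) >> 1;
@           out[2][j] = (x0 - x1) >> 1;  out[3][j] = (x3 - x2) >> 1;
@       }
@       for (i = 0; i < 4; i++)                        /* quantize with scale[0] */
@           for (j = 0; j < 4; j++) {
@               int v = out[i][j];
@               int q = (((v < 0 ? -v : v) * scale0) + round_fact) >> qbits;
@               dst[i*4 + j] = (short)(v < 0 ? -q : q);
@               zeros += (q == 0);
@           }
@       *nnz = (unsigned char)(16 - zeros);
@   }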
    .global ih264_hadamard_quant_4x4_a9
ih264_hadamard_quant_4x4_a9:

@Register usage
@   r0 : src
@   r1 : dst
@   r2 : *pu2_scale_matrix
@   r3 : *pu2_threshold_matrix

    vld4.s16      {d0, d1, d2, d3}, [r0]! @Load 4x4 block
    vpush         {d8-d15}

    vld1.u16      d30[0], [r2]          @load pu2_scale_matrix[0]

    vaddl.s16     q3, d0, d3            @x0 = x4 + x7;
    vaddl.s16     q4, d1, d2            @x1 = x5 + x6;
    vsubl.s16     q5, d1, d2            @x2 = x5 - x6;
    vsubl.s16     q6, d0, d3            @x3 = x4 - x7;

    vdup.u16      d30, d30[0]           @pu2_scale_matrix[0]

    vadd.s32      q7, q3, q4            @pi2_dst[0] = x0 + x1;
    vadd.s32      q8, q6, q5            @pi2_dst[1] = x3 + x2;
    add           r3, sp, #68           @Get address of u4_round_factor
    vsub.s32      q9, q3, q4            @pi2_dst[2] = x0 - x1;
    vsub.s32      q10, q6, q5           @pi2_dst[3] = x3 - x2;

    vtrn.s32      q7, q8                @transpose 4x4 block
    vtrn.s32      q9, q10
    vld1.s32      d0[0], [r3]           @load u4_round_factor
    vswp          d15, d18
    vswp          d17, d20

    add           r3, sp, #64           @Get address of u4_qbits
    vadd.s32      q11, q7, q10          @x0 = x4 + x7;
    vadd.s32      q12, q8, q9           @x1 = x5 + x6;
    vld1.s32      d31[0], [r3]          @load u4_qbits
    vsub.s32      q13, q8, q9           @x2 = x5 - x6;
    vsub.s32      q14, q7, q10          @x3 = x4 - x7;

    vdup.s32      q7, d0[0]             @u4_round_factor

    vadd.s32      q0, q11, q12          @(x0 + x1)
    vadd.s32      q1, q14, q13          @(x3 + x2)
    vsub.s32      q2, q11, q12          @(x0 - x1)
    vsub.s32      q3, q14, q13          @(x3 - x2)

    vdup.s32      q11, d31[0]           @u4_qbits

    vshrn.s32     d0, q0, #1            @i4_value = (x0 + x1) >> 1;
    vshrn.s32     d1, q1, #1            @i4_value = (x3 + x2) >> 1;
    vshrn.s32     d2, q2, #1            @i4_value = (x0 - x1) >> 1;
    vshrn.s32     d3, q3, #1            @i4_value = (x3 - x2) >> 1;

    vabs.s16      q5, q0                @Abs val of rows 1,2
    vabs.s16      q6, q1                @Abs val of rows 3,4

    vmov.s32      q8, q7                @Get the round fact
    vmov.s32      q9, q7                @copy round fact
    vmov.s32      q10, q7               @copy round fact

    vclt.s16      q3, q0, #0            @get the sign of rows 1,2
    vclt.s16      q4, q1, #0            @get the sign of rows 3,4

    vneg.s32      q11, q11              @-u4_qbits, for the right shift via VSHL

    vmlal.u16     q7, d10, d30          @round + abs * scale, row 1
    vmlal.u16     q8, d11, d30          @round + abs * scale, row 2
    vmlal.u16     q9, d12, d30          @round + abs * scale, row 3
    vmlal.u16     q10, d13, d30         @round + abs * scale, row 4

    vshl.u32      q7, q7, q11           @>> u4_qbits, row 1
    vshl.u32      q8, q8, q11           @>> u4_qbits, row 2
    vshl.u32      q9, q9, q11           @>> u4_qbits, row 3
    vshl.u32      q10, q10, q11         @>> u4_qbits, row 4

    vqmovn.u32    d22, q7               @narrow quantized row 1
    vqmovn.u32    d23, q8               @narrow quantized row 2
    vqmovn.u32    d24, q9               @narrow quantized row 3
    vqmovn.u32    d25, q10              @narrow quantized row 4

    vneg.s16      q13, q11              @negated quantized values, rows 1,2
    vneg.s16      q14, q12              @negated quantized values, rows 3,4

    vbsl.s16      q3, q13, q11          @restore sign, rows 1,2
    vbsl.s16      q4, q14, q12          @restore sign, rows 3,4

    vceq.s16      q5, q11, #0           @compare quantized rows 1,2 with zero
    vceq.s16      q6, q12, #0           @compare quantized rows 3,4 with zero

    vst1.s16      {q3}, [r1]!           @store rows 1,2

    vshrn.u16     d14, q5, #8           @narrow the zero flags, rows 1,2
    vshrn.u16     d15, q6, #8           @narrow the zero flags, rows 3,4

    ldr           r3, [sp, #72]         @Load *pu1_nnz

    vshr.u8       q7, q7, #7            @reduce the zero flags to a single bit

    vst1.s16      {q4}, [r1]!           @store rows 3,4

    vadd.u8       d16, d14, d15         @combine the zero flags of all rows
    vmov.u8       d20, #16              @max nnz
    vpadd.u8      d17, d16, d16         @sum up the zero count
    vpadd.u8      d18, d17, d17
    vpadd.u8      d19, d18, d18
    vsub.u8       d20, d20, d19         @nnz = 16 - zero count
    vst1.u8       d20[0], [r3]          @store nnz

    vpop          {d8-d15}
    bx            lr


@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_2x2_uv_a9
@* Description       : This function does the forward hadamard transform and
@*                     quantization for the chroma dc blocks of both planes
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to dst buffer
@                       R2 :pu2_scale_matrix
@                       R3 :pu2_threshold_matrix
@                       STACK : u4_qbits
@                               u4_round_factor
@                               pu1_nnz
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 0 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         20 2 2015    100633      First version
@
@*****************************************************************************
@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                             const UWORD16 *pu2_scale_matrix,
@                             const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                             UWORD32 u4_round_factor, UWORD8 *pu1_nnz
@                             )

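@ Editor's note: an illustrative C sketch (placeholder names, not the library's
@ reference code) of the 2x2 hadamard transform and quantization for one chroma
@ plane; the NEON code below handles the U and V DC blocks together and writes
@ a separate nnz byte for each plane.
@
@   void hadamard_quant_2x2_sketch(const short *dc, short *dst,
@                                  unsigned short scale0, unsigned int qbits,
@                                  unsigned int round_fact, unsigned char *nnz)
@   {
@       int c[4], i, zeros = 0;
@       c[0] = dc[0] + dc[1] + dc[2] + dc[3];
@       c[1] = dc[0] - dc[1] + dc[2] - dc[3];
@       c[2] = dc[0] + dc[1] - dc[2] - dc[3];
@       c[3] = dc[0] - dc[1] - dc[2] + dc[3];
@       for (i = 0; i < 4; i++) {                      /* quantize with scale[0] */
@           int q = (((c[i] < 0 ? -c[i] : c[i]) * scale0) + round_fact) >> qbits;
@           dst[i] = (short)(c[i] < 0 ? -q : q);
@           zeros += (q == 0);
@       }
@       *nnz = (unsigned char)(4 - zeros);
@   }
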
    .global ih264_hadamard_quant_2x2_uv_a9
ih264_hadamard_quant_2x2_uv_a9:

    vpush         {d8-d15}
    vld2.s16      {d0-d1}, [r0]         @load src

    add           r3, sp, #68           @Get address of u4_round_factor

    vaddl.s16     q3, d0, d1            @x0 = x4 + x5;  x2 = x6 + x7;
    vld1.u16      d30[0], [r2]          @load pu2_scale_matrix[0]
    vsubl.s16     q4, d0, d1            @x1 = x4 - x5;  x3 = x6 - x7;

    add           r0, sp, #64           @Get address of u4_qbits
    vld1.s32      d28[0], [r3]          @load u4_round_factor
    vtrn.s32      q3, q4                @q1 -> x0 x1, q2 -> x2 x3

    vadd.s32      q0, q3, q4            @ (x0 + x2) (x1 + x3)  (y0 + y2); (y1 + y3);
    vld1.s32      d24[0], [r0]          @load u4_qbits
    vsub.s32      q1, q3, q4            @ (x0 - x2) (x1 - x3)  (y0 - y2); (y1 - y3);

    vdup.u16      d30, d30[0]           @pu2_scale_matrix

    vabs.s32      q2, q0
    vabs.s32      q3, q1

    vdup.s32      q14, d28[0]           @u4_round_factor

    vmovl.u16     q15, d30              @pu2_scale_matrix

    vclt.s32      q4, q0, #0            @get the sign row 1,2
    vdup.s32      q12, d24[0]           @u4_qbits
    vclt.s32      q5, q1, #0

    vqmovn.u32    d8, q4
    vqmovn.s32    d9, q5

    vmov.s32      q13, q14              @Get the round fact
    vneg.s32      q12, q12              @-u4_qbits, for the right shift via VSHL

    vmla.u32      q13, q2, q15          @round + abs * scale
    vmla.u32      q14, q3, q15          @round + abs * scale

    vshl.u32      q13, q13, q12         @>>qbits
    vshl.u32      q14, q14, q12         @>>qbits

    vqmovn.u32    d10, q13
    vqmovn.u32    d11, q14

    vneg.s16      q6, q5

    vbsl.s16      q4, q6, q5            @*sign

    vtrn.s32      d8, d9

    vceq.s16      q7, q4, #0            @Compute nnz

    vshrn.u16     d14, q7, #8           @reduce nnz comparison to 1 bit

    ldr           r3, [sp, #72]         @Load *pu1_nnz
    vshr.u8       d14, d14, #7          @reduce nnz comparison to 1 bit
    vmov.u8       d20, #4               @Since the zeros are counted, subtract from 4 to get nnz
    vpadd.u8      d17, d14, d14         @Sum up the zero flags

    vst1.s16      {q4}, [r1]!           @Store the block

    vpadd.u8      d17, d17, d17         @Sum up the zero flags
    vsub.u8       d20, d20, d17         @4 - number of zeros
    vst1.u16      d20[0], [r3]          @store nnz

    vpop          {d8-d15}
    bx            lr
