Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///*****************************************************************************/
     21 ///**
     22 //*******************************************************************************
     23 //* @file
     24 //*  ih264_resi_trans_quant_av8.c
     25 //*
     26 //* @brief
     27 //*  contains function definitions for residual and forward trans
     28 //*
     29 //* @author
     30 //*  ittiam
     31 //*
     32 //* @par list of functions:
     33 //*    ih264_resi_trans_quant_4x4_av8
     34 //*    ih264_resi_trans_quant_8x8_av8
     35 //*    ih264_resi_trans_quant_chroma_4x4_av8
     36 //* @remarks
     37 //*  none
     38 //*
     39 //*******************************************************************************
     40 .include "ih264_neon_macros.s"
     41 .text
     42 .p2align 2
     43 //*****************************************************************************
     44 //*
     45 //* function name     : ih264_resi_trans_quant_4x4
     46 //* description       : this function does cf4 of h264
     47 //*
     48 //* arguments         :   x0 :pointer to src buffer
     49 //                        x1 :pointer to pred buffer
     50 //                        x2 :pointer to dst buffer
     51 //                        x3 :source stride
     52 //                        x4 :pred stride,
     53 //                        x5 :pointer to scaling matrix,
     54 //                        x6 :pointer to threshold matrix,
     55 //                        x7 :qbits,
     56 //                        stack   rounding factor,
     57 //                                pointer to store nnz
     58 //                                pointer to store non quantized dc value
     59 //                        (x5..x7 corrected to match the register usage in the code)
     60 // values returned   : none
     61 //
     62 // register usage    :
     63 // stack usage       : 64 bytes
     64 // cycles            :
     65 // interruptibility  : interruptible
     66 //
     67 // known limitations
     68 //   \assumptions    :
     69 //
     70 // revision history  :
     71 //         dd mm yyyy    author(s)   changes
     72 //         1 12 2013    100633      first version
     73 //         20 1 2014    100633      changes the api, optimization
     74 //
     75 //*****************************************************************************
     76 
     77     .global ih264_resi_trans_quant_4x4_av8
     78 ih264_resi_trans_quant_4x4_av8:
     79 
            //--------------------------------------------------------------------
            // computes residue (src - pred) for a 4x4 luma block, applies the
            // forward h264 4x4 core transform (horizontal pass, transpose,
            // vertical pass), then quantizes each coefficient as
            //     q = sign(c) * ((abs(c) * scale + round) >> qbits)
            // writes the quantized block to [x2], the un-quantized dc term to
            // the alternate dc address, and nnz (non-zero count) to the nnz ptr.
            //--------------------------------------------------------------------
     80     //x0     :pointer to src buffer
     81     //x1     :pointer to pred buffer
     82     //x2     :pointer to dst buffer
     83     //x3     :source stride
     84     //x4     :pred stride
     85     //x5     :pointer to scale matrix (read below; earlier "dst stride" label was stale)
     86     //x6     :pointer to threshold matrix (not referenced in this implementation)
     87     //x7     :qbits
     88     //stack  :round factor
     89     //       :pointer to nnz
     90     //       :pointer to store non quantized dc value
     91     //       :(stack offsets below are relative to sp after push_v_regs)
     92     push_v_regs
     93     //x0     :pointer to src buffer
     94     //x1     :pointer to pred buffer
     95     //x2     :pointer to dst buffer
     96     //x3     :source stride
     97     //x4     :pred stride
     98     //x5     :scale matrix,
     99     //x6     :threshold matrix
    100     //x7     :qbits
    101     //x8        :round factor
    102     //x9        :nnz
    103     //x10       :pointer to store non quantized dc value
    104 
    105     ldr       w8, [sp, #64]             //load round factor (push_v_regs pushed 64 bytes of v-regs)
    106     ldr       x10, [sp, #80]            //load address for non quant dc val
    107     neg       x7, x7                    //negate qbits so the sshl below performs a right shift (>> qbits)
    108     ldr       x9, [sp, #72]             //load address for nnz
    109 
    110     //------------function loading done----------------;
    111 
    112     ld1       {v30.8b}, [x0], x3        //load first 8 pix src  row 1
    113     ld1       {v31.8b}, [x1], x4        //load first 8 pix pred row 1
    114     ld1       {v28.8b}, [x0], x3        //load first 8 pix src  row 2
    115     ld1       {v29.8b}, [x1], x4        //load first 8 pix pred row 2
    116     ld1       {v26.8b}, [x0], x3        //load first 8 pix src  row 3
    117     ld1       {v27.8b}, [x1], x4        //load first 8 pix pred row 3
    118     ld1       {v24.8b}, [x0]            //load first 8 pix src row 4
    119     ld1       {v25.8b}, [x1]            //load first 8 pix pred row 4
    120 
    121     usubl     v0.8h, v30.8b, v31.8b     //find residue row 1
    122     usubl     v2.8h, v28.8b, v29.8b     //find residue row 2
    123     usubl     v4.8h, v26.8b, v27.8b     //find residue row 3
    124     usubl     v6.8h, v24.8b, v25.8b     //find residue row 4
    125 
            //4x4 transpose of the residue (only the low 4h of each row is the block)
    126     trn1      v1.4h, v0.4h, v2.4h
    127     trn2      v3.4h, v0.4h, v2.4h       //t12
    128     trn1      v5.4h, v4.4h, v6.4h
    129     trn2      v7.4h, v4.4h, v6.4h       //t23
    130 
    131     trn1      v0.2s, v1.2s, v5.2s
    132     trn2      v4.2s, v1.2s, v5.2s       //t13
    133     trn1      v2.2s, v3.2s, v7.2s
    134     trn2      v6.2s, v3.2s, v7.2s       //t14
    135 
            //horizontal butterfly
    136     add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
    137     add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
    138     sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
    139     sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7
    140 
    141     shl       v12.4h, v10.4h, #1        //u_shift(x2,1,shft)
    142     shl       v13.4h, v11.4h, #1        //u_shift(x3,1,shft)
    143 
    144     add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
    145     sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
    146     add       v15.4h, v13.4h, v10.4h    //x5 = u_shift(x3,1,shft) + x2;
    147     sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - u_shift(x2,1,shft);
    148 
    149     //taking transpose again so as to make do vert transform
    150     trn1      v0.4h, v14.4h, v15.4h
    151     trn2      v1.4h, v14.4h, v15.4h     //t12
    152     trn1      v2.4h, v16.4h, v17.4h
    153     trn2      v3.4h, v16.4h, v17.4h     //t23
    154 
    155     trn1      v14.2s, v0.2s, v2.2s
    156     trn2      v16.2s, v0.2s, v2.2s      //t13
    157     trn1      v15.2s, v1.2s, v3.2s
    158     trn2      v17.2s, v1.2s, v3.2s      //t24
    159 
    160     //let us do vertical transform
    161     //same code as horiz
    162     add       v18.4h, v14.4h , v17.4h   //x0 = x4+x7
    163     add       v19.4h, v15.4h , v16.4h   //x1 = x5+x6
    164     sub       v20.4h, v15.4h , v16.4h   //x2 = x5-x6
    165     sub       v21.4h, v14.4h , v17.4h   //x3 = x4-x7
    166 
    167     shl       v22.4h, v20.4h, #1        //u_shift(x2,1,shft)
    168     shl       v23.4h, v21.4h, #1        //u_shift(x3,1,shft)
    169 
    170     dup       v8.4s, w8                 //load rounding value (NOTE(review): v8 is never read after this; looks redundant)
    171 
    172     add       v24.4h, v18.4h , v19.4h   //x5 = x0 + x1;
    173     sub       v26.4h, v18.4h , v19.4h   //x7 = x0 - x1;
    174     add       v25.4h, v23.4h , v20.4h   //x6 = u_shift(x3,1,shft) + x2;
    175     sub       v27.4h, v21.4h , v22.4h   //x8 = x3 - u_shift(x2,1,shft);
    176 
    177     dup       v23.4s, w8                //load round factor values
    178 
    179     st1       {v24.h}[0], [x10]         //store the dc value to alternate dc address
    180 //core transform is done for 4x8 block 1
    181     ld1       {v28.4h-v31.4h}, [x5]     //load the scaling values
    182 
            //quantization: q = sign(c) * ((abs(c)*scale + round) >> qbits)
    183     abs       v0.4h, v24.4h             //abs val of row 1
    184     abs       v1.4h, v25.4h             //abs val of row 2
    185     abs       v2.4h, v26.4h             //abs val of row 3
    186     abs       v3.4h, v27.4h             //abs val of row 4
    187 
    188     cmgt      v4.4h, v24.4h, #0         //sign mask row 1 (all ones where coeff > 0)
    189     cmgt      v5.4h, v25.4h, #0         //sign mask row 2
    190     cmgt      v6.4h, v26.4h, #0         //sign mask row 3
    191     cmgt      v7.4h, v27.4h, #0         //sign mask row 4
    192 
    193     smull     v0.4s, v0.4h, v28.4h      //abs(coeff) * scale, row 1
    194     smull     v1.4s, v1.4h, v29.4h      //abs(coeff) * scale, row 2
    195     smull     v2.4s, v2.4h, v30.4h      //abs(coeff) * scale, row 3
    196     smull     v3.4s, v3.4h, v31.4h      //abs(coeff) * scale, row 4
    197 
    198     add       v20.4s, v0.4s, v23.4s     //add round factor row 1
    199     add       v21.4s, v1.4s, v23.4s     //add round factor row 2
    200     add       v22.4s, v2.4s, v23.4s     //add round factor row 3
    201     add       v23.4s, v3.4s, v23.4s     //add round factor row 4
    202 
    203     dup       v24.4s, w7                //-qbits (negated earlier)
    204 
    205     sshl      v20.4s, v20.4s, v24.4s    //shift row 1 (negative shift count => right shift by qbits)
    206     sshl      v21.4s, v21.4s, v24.4s    //shift row 2
    207     sshl      v22.4s, v22.4s, v24.4s    //shift row 3
    208     sshl      v23.4s, v23.4s, v24.4s    //shift row 4
    209 
    210     xtn       v20.4h, v20.4s            //narrow row 1
    211     xtn       v21.4h, v21.4s            //narrow row 2
    212     xtn       v22.4h, v22.4s            //narrow row 3
    213     xtn       v23.4h, v23.4s            //narrow row 4
    214 
    215     neg       v24.8h, v20.8h            //get negative
    216     neg       v25.8h, v21.8h            //get negative
    217     neg       v26.8h, v22.8h            //get negative
    218     neg       v27.8h, v23.8h            //get negative
    219 
    220     //compare with zero for computing nnz
    221     cmeq      v0.4h, v20.4h, #0
    222     cmeq      v1.4h, v21.4h, #0
    223     cmeq      v2.4h, v22.4h, #0
    224     cmeq      v3.4h, v23.4h, #0
    225 
    226     bsl       v4.8b, v20.8b, v24.8b     //restore sign of row 1 (sign mask selects +q or -q)
    227     bsl       v5.8b, v21.8b, v25.8b     //restore sign of row 2
    228     bsl       v6.8b, v22.8b, v26.8b     //restore sign of row 3
    229     bsl       v7.8b, v23.8b, v27.8b     //restore sign of row 4
    230 
    231     //narrow the comparison result
    232     mov       v0.d[1], v2.d[0]
    233     mov       v1.d[1], v3.d[0]
    234 
    235     xtn       v0.8b, v0.8h
    236     xtn       v1.8b, v1.8h
    237 
    238     ushr      v0.8b, v0.8b, #7          //reduce comparison result to a single bit (1 per zero coeff), rows 1,3
    239     ushr      v1.8b, v1.8b, #7          //reduce comparison result to a single bit (1 per zero coeff), rows 2,4
    240 
    241     add       v0.8b, v0.8b, v1.8b       //pair add nnz 1
    242     addp      v0.8b, v0.8b, v0.8b       //pair add nnz 1
    243     addp      v0.8b, v0.8b, v0.8b       //pair add nnz 1
    244     addp      v0.8b, v0.8b, v0.8b       //b[0] = total count of zero coefficients
    245 
    246     st1       {v4.4h-v7.4h}, [x2]       //store blk
    247 
    248     movi      v25.8b, #16               //get max nnz
    249     sub       v26.8b, v25.8b , v0.8b    //nnz = 16 - zero count
    250     st1       {v26.b}[0], [x9]          //write nnz
    251 
    252     pop_v_regs
    253     ret
    254 
    255 
    256 //*****************************************************************************
    257 //*
    258 //* function name     : ih264_resi_trans_quant_chroma_4x4
    259 //* description       : this function does residue calculation, forward transform
    260 //*                        and quantization for 4x4 chroma block.
    261 //*
    262 //* arguments         :   x0 :pointer to src buffer
    263 //                        x1 :pointer to pred buffer
    264 //                        x2 :pointer to dst buffer
    265 //                        x3 :source stride
    266 //                        x4 :pred stride,
    267 //                        x5 :pointer to scaling matrix,
    268 //                        x6 :pointer to threshold matrix,
    269 //                        x7 :qbits,
    270 //                        stack     rounding factor,
    271 //                                  pointer to store nnz
    272 //                                  pointer to store unquantized dc values
    273 //                        (x5..x7 corrected to match the register usage in the code)
    274 // values returned   : none
    275 //
    276 // register usage    :
    277 // stack usage       : 64 bytes
    278 // cycles            :
    279 // interruptibility  : interruptible
    280 //
    281 // known limitations
    282 //   \assumptions    :
    283 //
    284 // revision history  :
    285 //         dd mm yyyy    author(s)   changes
    286 //         11 2 2015    100664      first version
    287 //         25 2 2015    100633      first av8 version
    288 //*****************************************************************************
    289 
    290     .global ih264_resi_trans_quant_chroma_4x4_av8
    291 ih264_resi_trans_quant_chroma_4x4_av8:
    292 
            //--------------------------------------------------------------------
            // same as ih264_resi_trans_quant_4x4_av8 but for an interleaved
            // chroma plane: the loaded rows are de-interleaved (uzp1) before the
            // residue is computed. residue -> forward 4x4 core transform ->
            // quantization -> nnz / un-quantized dc output.
            //--------------------------------------------------------------------
    293     //x0     :pointer to src buffer
    294     //x1     :pointer to pred buffer
    295     //x2     :pointer to dst buffer
    296     //x3     :source stride
    297     //x4     :pred stride (earlier comment said these were on the stack; registers are used below)
    298     //x5     :scale matrix,
    299     //x6     :threshold matrix (not referenced in this implementation)
    300     //x7     :qbits
    301     //stack     :round factor
    302     //          :nnz
    303     //          :pu1_dc_alt_addr
    304     push_v_regs
    305     //x0     :pointer to src buffer
    306     //x1     :pointer to pred buffer
    307     //x2     :pointer to dst buffer
    308     //x3     :source stride
    309     //x4     :pred stride
    310     //x5     :scale matrix,
    311     //x6     :threshold matrix
    312     //x7     :qbits
    313     //x8        :round factor
    314     //x9        :nnz
    315     //x10       :pointer to store non quantized dc value
    316 
    317     ldr       w8, [sp, #64]             //load round factor (push_v_regs pushed 64 bytes of v-regs)
    318     ldr       x10, [sp, #80]            //load address for non quant dc val
    319     neg       x7, x7                    //negate qbits so the sshl below performs a right shift (>> qbits)
    320     ldr       x9, [sp, #72]             //load address for nnz
    321     //------------function loading done----------------;
    322 
    323     ld1       {v30.8b}, [x0], x3        //load first 8 pix src  row 1
    324     ld1       {v31.8b}, [x1], x4        //load first 8 pix pred row 1
    325     ld1       {v28.8b}, [x0], x3        //load first 8 pix src  row 2
    326     ld1       {v29.8b}, [x1], x4        //load first 8 pix pred row 2
    327     ld1       {v26.8b}, [x0], x3        //load first 8 pix src  row 3
    328     ld1       {v27.8b}, [x1], x4        //load first 8 pix pred row 3
    329     ld1       {v24.8b}, [x0]            //load first 8 pix src row 4
    330     ld1       {v25.8b}, [x1]            //load first 8 pix pred row 4
    331 
    332 
    333     //deinterleave the loaded values (keep every other byte: one chroma plane)
    334     uzp1      v30.8b, v30.8b, v30.8b
    335     uzp1      v31.8b, v31.8b, v31.8b
    336     uzp1      v28.8b, v28.8b, v28.8b
    337     uzp1      v29.8b, v29.8b, v29.8b
    338     uzp1      v26.8b, v26.8b, v26.8b
    339     uzp1      v27.8b, v27.8b, v27.8b
    340     uzp1      v24.8b, v24.8b, v24.8b
    341     uzp1      v25.8b, v25.8b, v25.8b
    342     //this deinterleaving is the only difference between chroma and luma functions
    343 
    344     usubl     v0.8h, v30.8b, v31.8b     //find residue row 1
    345     usubl     v2.8h, v28.8b, v29.8b     //find residue row 2
    346     usubl     v4.8h, v26.8b, v27.8b     //find residue row 3
    347     usubl     v6.8h, v24.8b, v25.8b     //find residue row 4
    348 
            //4x4 transpose of the residue (only the low 4h of each row is the block)
    349     trn1      v1.4h, v0.4h, v2.4h
    350     trn2      v3.4h, v0.4h, v2.4h       //t12
    351     trn1      v5.4h, v4.4h, v6.4h
    352     trn2      v7.4h, v4.4h, v6.4h       //t23
    353 
    354     trn1      v0.2s, v1.2s, v5.2s
    355     trn2      v4.2s, v1.2s, v5.2s       //t13
    356     trn1      v2.2s, v3.2s, v7.2s
    357     trn2      v6.2s, v3.2s, v7.2s       //t14
    358 
            //horizontal butterfly
    359     add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
    360     add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
    361     sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
    362     sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7
    363 
    364     shl       v12.4h, v10.4h, #1        //u_shift(x2,1,shft)
    365     shl       v13.4h, v11.4h, #1        //u_shift(x3,1,shft)
    366 
    367     add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
    368     sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
    369     add       v15.4h, v13.4h, v10.4h    //x5 = u_shift(x3,1,shft) + x2;
    370     sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - u_shift(x2,1,shft);
    371 
    372     //taking transpose again so as to make do vert transform
    373     trn1      v0.4h, v14.4h, v15.4h
    374     trn2      v1.4h, v14.4h, v15.4h     //t12
    375     trn1      v2.4h, v16.4h, v17.4h
    376     trn2      v3.4h, v16.4h, v17.4h     //t23
    377 
    378     trn1      v14.2s, v0.2s, v2.2s
    379     trn2      v16.2s, v0.2s, v2.2s      //t13
    380     trn1      v15.2s, v1.2s, v3.2s
    381     trn2      v17.2s, v1.2s, v3.2s      //t24
    382 
    383     //let us do vertical transform
    384     //same code as horiz
    385     add       v18.4h, v14.4h , v17.4h   //x0 = x4+x7
    386     add       v19.4h, v15.4h , v16.4h   //x1 = x5+x6
    387     sub       v20.4h, v15.4h , v16.4h   //x2 = x5-x6
    388     sub       v21.4h, v14.4h , v17.4h   //x3 = x4-x7
    389 
    390     shl       v22.4h, v20.4h, #1        //u_shift(x2,1,shft)
    391     shl       v23.4h, v21.4h, #1        //u_shift(x3,1,shft)
    392 
    393     dup       v8.4s, w8                 //load rounding value (NOTE(review): v8 is never read after this; looks redundant)
    394 
    395     add       v24.4h, v18.4h , v19.4h   //x5 = x0 + x1;
    396     sub       v26.4h, v18.4h , v19.4h   //x7 = x0 - x1;
    397     add       v25.4h, v23.4h , v20.4h   //x6 = u_shift(x3,1,shft) + x2;
    398     sub       v27.4h, v21.4h , v22.4h   //x8 = x3 - u_shift(x2,1,shft);
    399 
    400     dup       v23.4s, w8                //load round factor values
    401 
    402     st1       {v24.h}[0], [x10]         //store the dc value to alternate dc address
    403 //core transform is done for 4x8 block 1
    404     ld1       {v28.4h-v31.4h}, [x5]     //load the scaling values
    405 
            //quantization: q = sign(c) * ((abs(c)*scale + round) >> qbits)
    406     abs       v0.4h, v24.4h             //abs val of row 1
    407     abs       v1.4h, v25.4h             //abs val of row 2
    408     abs       v2.4h, v26.4h             //abs val of row 3
    409     abs       v3.4h, v27.4h             //abs val of row 4
    410 
    411     cmgt      v4.4h, v24.4h, #0         //sign mask row 1 (all ones where coeff > 0)
    412     cmgt      v5.4h, v25.4h, #0         //sign mask row 2
    413     cmgt      v6.4h, v26.4h, #0         //sign mask row 3
    414     cmgt      v7.4h, v27.4h, #0         //sign mask row 4
    415 
    416     smull     v0.4s, v0.4h, v28.4h      //abs(coeff) * scale, row 1
    417     smull     v1.4s, v1.4h, v29.4h      //abs(coeff) * scale, row 2
    418     smull     v2.4s, v2.4h, v30.4h      //abs(coeff) * scale, row 3
    419     smull     v3.4s, v3.4h, v31.4h      //abs(coeff) * scale, row 4
    420 
    421     add       v20.4s, v0.4s, v23.4s     //add round factor row 1
    422     add       v21.4s, v1.4s, v23.4s     //add round factor row 2
    423     add       v22.4s, v2.4s, v23.4s     //add round factor row 3
    424     add       v23.4s, v3.4s, v23.4s     //add round factor row 4
    425 
    426     dup       v24.4s, w7                //-qbits (negated earlier)
    427 
    428     sshl      v20.4s, v20.4s, v24.4s    //shift row 1 (negative shift count => right shift by qbits)
    429     sshl      v21.4s, v21.4s, v24.4s    //shift row 2
    430     sshl      v22.4s, v22.4s, v24.4s    //shift row 3
    431     sshl      v23.4s, v23.4s, v24.4s    //shift row 4
    432 
    433     xtn       v20.4h, v20.4s            //narrow row 1
    434     xtn       v21.4h, v21.4s            //narrow row 2
    435     xtn       v22.4h, v22.4s            //narrow row 3
    436     xtn       v23.4h, v23.4s            //narrow row 4
    437 
    438     neg       v24.8h, v20.8h            //get negative
    439     neg       v25.8h, v21.8h            //get negative
    440     neg       v26.8h, v22.8h            //get negative
    441     neg       v27.8h, v23.8h            //get negative
    442 
    443     //compare with zero for computing nnz
    444     cmeq      v0.4h, v20.4h, #0
    445     cmeq      v1.4h, v21.4h, #0
    446     cmeq      v2.4h, v22.4h, #0
    447     cmeq      v3.4h, v23.4h, #0
    448 
    449     bsl       v4.8b, v20.8b, v24.8b     //restore sign of row 1 (sign mask selects +q or -q)
    450     bsl       v5.8b, v21.8b, v25.8b     //restore sign of row 2
    451     bsl       v6.8b, v22.8b, v26.8b     //restore sign of row 3
    452     bsl       v7.8b, v23.8b, v27.8b     //restore sign of row 4
    453 
    454     //narrow the comparison result
    455     mov       v0.d[1], v2.d[0]
    456     mov       v1.d[1], v3.d[0]
    457 
    458     xtn       v0.8b, v0.8h
    459     xtn       v1.8b, v1.8h
    460 
    461     ushr      v0.8b, v0.8b, #7          //reduce comparison result to a single bit (1 per zero coeff), rows 1,3
    462     ushr      v1.8b, v1.8b, #7          //reduce comparison result to a single bit (1 per zero coeff), rows 2,4
    463 
    464     add       v0.8b, v0.8b, v1.8b       //pair add nnz 1
    465     addp      v0.8b, v0.8b, v0.8b       //pair add nnz 1
    466     addp      v0.8b, v0.8b, v0.8b       //pair add nnz 1
    467     addp      v0.8b, v0.8b, v0.8b       //b[0] = total count of zero coefficients
    468 
    469     st1       {v4.4h-v7.4h}, [x2]       //store blk
    470 
    471     movi      v25.8b, #16               //get max nnz
    472     sub       v26.8b, v25.8b , v0.8b    //nnz = 16 - zero count
    473     st1       {v26.b}[0], [x9]          //write nnz
    474 
    475     pop_v_regs
    476     ret
    477 
    478 
    479 //*****************************************************************************
    480 //*
    481 //* function name     : ih264_hadamard_quant_4x4_av8
    482 //* description       : this function does forward hadamard transform and
    483 //*                     quantization for luma dc block
    484 //*
    485 //* arguments         :  x0 :pointer to src buffer
    486 //                       x1 :pointer to dst buffer
    487 //                       x2 :pu2_scale_matrix
    488 //                       x3 :pu2_threshold_matrix
    489 //                       x4 :u4_qbits
    490 //                       x5 :u4_round_factor
    491 //                       x6 :pu1_nnz
    492 // values returned   : none
    493 //
    494 // register usage    :
    495 // stack usage       : 0 bytes
    496 // cycles            : around
    497 // interruptibility  : interruptible
    498 //
    499 // known limitations
    500 //   \assumptions    :
    501 //
    502 // revision history  :
    503 //         dd mm yyyy    author(s)   changes
    504 //         20 2 2015    100633      first version
    505 //
    506 //*****************************************************************************
    507 //ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
    508 //                           const uword16 *pu2_scale_matrix,
    509 //                           const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
    510 //                           uword32 u4_round_factor,uword8  *pu1_nnz
    511 //                           )
    512     .global ih264_hadamard_quant_4x4_av8
    513 ih264_hadamard_quant_4x4_av8:
    514 
    515 //x0 :pointer to src buffer
    516 //x1 :pointer to dst buffer
    517 //x2 :pu2_scale_matrix
    518 //x3 :pu2_threshold_matrix
    519 //x4 :u4_qbits
    520 //x5 :u4_round_factor
    521 //x6 :pu1_nnz
    522 
            //--------------------------------------------------------------------
            // forward 4x4 hadamard transform of the luma dc block, then
            // quantization with pu2_scale_matrix[0]:
            //     q = sign(h) * ((abs(h) * scale + round) >> qbits)
            // (result saturated to 16 bit by uqxtn). x3 (threshold matrix) is
            // not referenced in this implementation.
            //--------------------------------------------------------------------
    523     push_v_regs
    524 
    525     ld4       {v0.4h-v3.4h}, [x0]       //load 4x4 block (ld4 de-interleaves: v0..v3 = every 4th coeff)
    526     ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
    527 
    528     saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7;
    529     saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6;
    530     ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6;
    531     ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7;
    532 
    533     dup       v30.8h, v30.h[0]          //pu2_scale_matrix[0]
    534 
    535     add       v14.4s, v4.4s, v5.4s      //pi2_dst[0] = x0 + x1;
    536     add       v15.4s, v7.4s, v6.4s      //pi2_dst[1] = x3 + x2;
    537     sub       v16.4s, v4.4s, v5.4s      //pi2_dst[2] = x0 - x1;
    538     sub       v17.4s, v7.4s, v6.4s      //pi2_dst[3] = x3 - x2;
    539 
    540     //transpose 4x4 block
    541     trn1      v18.4s, v14.4s, v15.4s
    542     trn2      v19.4s, v14.4s, v15.4s
    543     trn1      v20.4s, v16.4s, v17.4s
    544     trn2      v21.4s, v16.4s, v17.4s
    545 
    546     trn1      v14.2d, v18.2d, v20.2d
    547     trn2      v16.2d, v18.2d, v20.2d
    548     trn1      v15.2d, v19.2d, v21.2d
    549     trn2      v17.2d, v19.2d, v21.2d
    550     //end transpose
    551 
            //second (vertical) butterfly pass
    552     add       v18.4s, v14.4s, v17.4s    //x0 = x4 + x7;
    553     add       v19.4s, v15.4s, v16.4s    //x1 = x5 + x6;
    554     sub       v20.4s, v15.4s, v16.4s    //x2 = x5 - x6;
    555     sub       v21.4s, v14.4s, v17.4s    //x3 = x4 - x7;
    556 
    557     dup       v14.4s, w5                //round factor (umlal below accumulates abs*scale into these)
    558     dup       v15.4s, v14.s[0]
    559     dup       v16.4s, v14.s[0]
    560     dup       v17.4s, v14.s[0]
    561 
    562     add       v22.4s, v18.4s, v19.4s    //(x0 + x1)
    563     add       v23.4s, v21.4s, v20.4s    //(x3 + x2)
    564     sub       v24.4s, v18.4s, v19.4s    //(x0 - x1)
    565     sub       v25.4s, v21.4s, v20.4s    //(x3 - x2)
    566 
    567     shrn      v0.4h, v22.4s, #1         //i4_value = (x0 + x1) >> 1;
    568     shrn2     v0.8h, v23.4s, #1         //i4_value = (x3 + x2) >> 1;
    569     shrn      v1.4h, v24.4s, #1         //i4_value = (x0 - x1) >> 1;
    570     shrn2     v1.8h, v25.4s, #1         //i4_value = (x3 - x2) >> 1;
    571 
    572     abs       v2.8h, v0.8h
    573     abs       v3.8h, v1.8h
    574 
    575     cmgt      v4.8h, v0.8h, #0          //get the sign row 1,2
    576     cmgt      v5.8h, v1.8h, #0          //get the sign row 3,4
    577 
    578     neg       w4, w4                    //-u4_qbits
    579     dup       v22.4s, w4                //load  -u4_qbits (negative count => ushl shifts right)
    580 
    581     umlal     v14.4s, v2.4h, v30.4h     //round + abs(coeff)*scale, rows 1..4
    582     umlal2    v15.4s, v2.8h, v30.8h
    583     umlal     v16.4s, v3.4h, v30.4h
    584     umlal2    v17.4s, v3.8h, v30.8h
    585 
    586     ushl      v14.4s, v14.4s, v22.4s    //>> u4_qbits
    587     ushl      v15.4s, v15.4s, v22.4s
    588     ushl      v16.4s, v16.4s, v22.4s
    589     ushl      v17.4s, v17.4s, v22.4s
    590 
    591     uqxtn     v14.4h, v14.4s            //saturating narrow to 16 bit
    592     uqxtn2    v14.8h, v15.4s
    593     uqxtn     v16.4h, v16.4s
    594     uqxtn2    v16.8h, v17.4s
    595 
    596     neg       v15.8h, v14.8h
    597     neg       v17.8h, v16.8h
    598 
    599     bsl       v4.16b, v14.16b, v15.16b  //restore sign, rows 1,2 (mask selects +q or -q)
    600     bsl       v5.16b, v16.16b, v17.16b  //restore sign, rows 3,4
    601 
    602     cmeq      v0.8h, v14.8h, #0         //zero-coefficient masks for nnz
    603     cmeq      v1.8h, v16.8h, #0
    604 
    605     st1       {v4.8h-v5.8h}, [x1]       //store quantized block
    606 
    607     movi      v20.8b, #16               //max nnz
    608 
    609     xtn       v2.8b, v0.8h
    610     xtn       v3.8b, v1.8h
    611 
    612     ushr      v2.8b, v2.8b, #7          //reduce to a single bit (1 per zero coeff)
    613     ushr      v3.8b, v3.8b, #7
    614 
    615     add       v2.8b, v2.8b, v3.8b
    616     addp      v2.8b, v2.8b, v2.8b
    617     addp      v2.8b, v2.8b, v2.8b
    618     addp      v2.8b, v2.8b, v2.8b       //b[0] = total count of zero coefficients
    619     sub       v20.8b, v20.8b, v2.8b     //nnz = 16 - zero count
    620     st1       {v20.b}[0], [x6]          //write nnz
    621 
    622     pop_v_regs
    623     ret
    624 
    625 
    626 //*****************************************************************************
    627 //*
    628 //* function name     : ih264_hadamard_quant_2x2_uv
    629 //* description       : this function does forward hadamard transform and
    630 //*                     quantization for dc block of chroma for both planes
    631 //*
    632 //* arguments         :  x0 :pointer to src buffer
    633 //                       x1 :pointer to dst buffer
    634 //                       x2 :pu2_scale_matrix
    635 //                       x3 :pu2_threshold_matrix
    636 //                       x4 :u4_qbits
    637 //                       x5 :u4_round_factor
    638 //                       x6 :pu1_nnz
    639 // values returned   : none
    640 //
    641 // register usage    :
    642 // stack usage       : 0 bytes
    643 // cycles            : around
    644 // interruptibility  : interruptible
    645 //
    646 // known limitations
    647 //   \assumptions    :
    648 //
    649 // revision history  :
    650 //         dd mm yyyy    author(s)   changes
    651 //         20 2 2015    100633      first version
    652 //
    653 //*****************************************************************************
    654 // ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
    655 //                             const uword16 *pu2_scale_matrix,
    656 //                             const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
    657 //                             uword32 u4_round_factor,uword8  *pu1_nnz
    658 //                             )
    659 
    660     .global ih264_hadamard_quant_2x2_uv_av8
    661 ih264_hadamard_quant_2x2_uv_av8:
    662 
            //--------------------------------------------------------------------
            // 2x2 hadamard transform + quantization of the chroma dc
            // coefficients for both planes (u and v) at once.
            //   x0 :pi2_src (interleaved u/v dc values, de-interleaved by ld2)
            //   x1 :pi2_dst
            //   x2 :pu2_scale_matrix (only element [0] is used)
            //   x3 :pu2_threshold_matrix (not referenced here)
            //   x4 :u4_qbits
            //   x5 :u4_round_factor
            //   x6 :pu1_nnz (2 bytes written: one per plane)
            // fixes vs previous revision: removed the duplicated
            // "st1 {v4.8h}, [x1]" (same data stored twice to the same address);
            // unified the mixed uqxtn/sqxtn2 pair to sqxtn/sqxtn2 (both map the
            // cmgt masks 0/-1 to 0/0xffff, so behavior is unchanged).
            //--------------------------------------------------------------------
    663     push_v_regs
    664 
    665     ld2       {v0.4h-v1.4h}, [x0]       //load src (de-interleaved: v0 = u/v lane 0, v1 = lane 1)
    666 
    667     ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
    668     dup       v30.4h, v30.h[0]          //broadcast pu2_scale_matrix[0]
    669     uxtl      v30.4s, v30.4h            //widen scale to 32 bit
    670 
    671     neg       w4, w4                    //-u4_qbits so ushl below shifts right
    672     dup       v24.4s, w4                //-u4_qbits
    673 
    674     dup       v25.4s, w5                //round factor (mla below accumulates abs*scale into it)
    675     dup       v26.4s, v25.s[0]
    676 
    677     saddl     v2.4s, v0.4h, v1.4h       //x0 = x4 + x5;, x2 = x6 + x7;
    678     ssubl     v3.4s, v0.4h, v1.4h       //x1 = x4 - x5;  x3 = x6 - x7;
    679 
    680     trn1      v4.4s, v2.4s, v3.4s
    681     trn2      v5.4s, v2.4s, v3.4s       //q1 -> x0 x1, q2 -> x2 x3
    682 
    683     add       v0.4s, v4.4s , v5.4s      // (x0 + x2) (x1 + x3)  (y0 + y2); (y1 + y3);
    684     sub       v1.4s, v4.4s , v5.4s      // (x0 - x2) (x1 - x3)  (y0 - y2); (y1 - y3);
    685 
    686     abs       v2.4s, v0.4s
    687     abs       v3.4s, v1.4s
    688 
    689     cmgt      v4.4s, v0.4s, #0          //sign masks (all ones where coeff > 0)
    690     cmgt      v5.4s, v1.4s, #0
    691 
    692     sqxtn     v4.4h, v4.4s              //narrow both sign masks into v4.8h
    693     sqxtn2    v4.8h, v5.4s              //(0 -> 0, -1 -> 0xffff)
    694 
    695     mla       v25.4s, v2.4s, v30.4s     //round + abs(coeff)*scale
    696     mla       v26.4s, v3.4s, v30.4s
    697 
    698     ushl      v2.4s, v25.4s, v24.4s     //>>qbit (negative shift count)
    699     ushl      v3.4s, v26.4s, v24.4s     //>>qbit
    700 
    701     uqxtn     v2.4h, v2.4s              //saturating narrow to 16 bit
    702     uqxtn2    v2.8h, v3.4s
    703 
    704     neg       v5.8h, v2.8h
    705 
    706     bsl       v4.16b, v2.16b, v5.16b    //*sign (mask selects +q or -q)
    707 
    708     //rearrange such that each plane's coeffs are contiguous
    709     mov       v5.s[0], v4.s[1]
    710     mov       v4.s[1], v4.s[2]
    711     mov       v4.s[2], v5.s[0]
    712 
    713     cmeq      v5.8h, v4.8h, #0          //compute nnz
    714     xtn       v5.8b, v5.8h              //reduce nnz comparison to 1 bit
    715     ushr      v5.8b, v5.8b, #7          //1 per zero coefficient
    716     movi      v20.8b, #4                //max nnz per 2x2 plane
    717     addp      v5.8b, v5.8b, v5.8b       //sum up zeros
    718     addp      v5.8b, v5.8b, v5.8b       //b[0] = u-plane zeros, b[1] = v-plane zeros
    719 
    720     st1       {v4.8h}, [x1]             //store the block (duplicate store removed)
    721 
    723     sub       v20.8b, v20.8b, v5.8b     //nnz = 4 - numzeros, per plane
    724 
    725     st1       {v20.h}[0], [x6]          //store nnz (2 bytes: u and v)
    726 
    727     pop_v_regs
    728     ret
    729 
    730 
    731 
    732