Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_filters_planar.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for planar intra prediction.
     25 //* functions are coded in armv8 (aarch64) neon assembly, written in
     26 //* gnu assembler syntax
     27 //*
     28 //*
     29 //* @author
     30 //*  akshaya mukund
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    luma intraprediction filter for planar input
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients
     62 //*
     63 //* @param[in] nt
     64 //*  size of transform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
     78 //                                  word32 src_strd,
     79 //                                  uword8* pu1_dst,
     80 //                                  word32 dst_strd,
     81 //                                  word32 nt,
     82 //                                  word32 mode,
     83 //                   word32 pi1_coeff)
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
     89 
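     90 //x4 => nt
     91 //x5 => mode (not read by this function)
     92 //note: unlike the arm32 version, nt and mode arrive in registers here,
     93 //      not on the stack; pi1_coeff is not read by this function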
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_luma_planar_av8
    102 .extern gau1_ihevc_planar_factor
    103 .extern gau1_ihevc_planar_factor_1
    104 
    105 .type ihevc_intra_pred_luma_planar_av8, %function
    106 
    107 ihevc_intra_pred_luma_planar_av8:
    108 
    109     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    110 
    111     stp         x19, x20,[sp,#-16]!
    112 
    113     adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
    114     ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
    115 
    116     clz         w5,w4
    117     sub         x20, x5, #32
    118     neg         x5, x20
    119     dup         v29.8h,w5
    120     neg         v29.8h, v29.8h              //shr value (so vneg)
    121     dup         v2.8b,w4                    //nt
    122     dup         v16.8h,w4                   //nt
    123 
    124     sub         x6, x4, #1                  //nt-1
    125     add         x6, x6, x0
    126     ldr         w7,  [x6]
    127     sxtw        x7,w7
    128     dup         v0.8b,w7                    //src[nt-1]
    129 
    130     add         x6, x4, x4,lsl #1           //3nt
    131     add         x6, x6, #1                  //3nt + 1
    132     add         x6, x6, x0
    133     ldr         w7,  [x6]
    134     sxtw        x7,w7
    135     dup         v1.8b,w7                    //src[3nt+1]
    136 
    137     add         x6, x4, x4                  //2nt
    138     add         x14, x6, #1                 //2nt+1
    139     sub         x6, x6, #1                  //2nt-1
    140     add         x6, x6, x0                  //&src[2nt-1]
    141     add         x14, x14, x0                //&src[2nt+1]
    142 
    143     mov         x8, #1                      //row+1 (row is first 0)
    144     sub         x9, x4, x8                  //nt-1-row (row is first 0)
    145 
    146     dup         v5.8b,w8                    //row + 1
    147     dup         v6.8b,w9                    //nt - 1 - row
    148     mov         v7.8b, v5.8b                //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
    149 
    150     add         x12, x11, #1                //coeffs (to be reloaded after every row)
    151     mov         x1, x4                      //nt (row counter) (dec after every row)
    152     mov         x5, x2                      //dst (to be reloaded after every row and inc by dst_strd)
    153     mov         x10, #8                     //increment for the coeffs
    154     mov         x0, x14                     //&src[2nt+1] (to be reloaded after every row)
    155 
    156     cmp         x4, #4
    157     beq         tf_sz_4
    158 
    159 //@ ========== ***************** =====================
    160 prolog:
    161 tf_sz_8_16_32:
    162 
    163     mov         x7, x4                      //column counter (set to no of cols)
    164     lsr         x9, x4, #3                  //divide nt by 8
    165     mul         x7, x7, x9                  //multiply width * height
    166     adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
    167     ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
    168     sub         x6, x6, #7
    169     mov         x8, x2
    170     lsl         x9, x3, #3                  //4*stride
    171     sub         x20, x9, #8                 //8-4*stride
    172     neg         x9, x20
    173     mov         x10, x4                     //nt
    174     sub         x10, x10, #8                //nt - 8
    175 
    176 col_loop_8_16_32:
    177 
    178     ld1         {v17.8b},[x12]              //(1-8)load 8 coeffs [col+1]
    179     dup         v27.8h,w4                   //(1)
    180     ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
    181     sub         v19.8b,  v2.8b ,  v17.8b    //(1-8)[nt-1-col]
    182 
    183 
    184     umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
    185 
    186     ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
    187     umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]
    188 
    189     dup         v20.8b, v4.b[7]             //(1)
    190     umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
    191 
    192     dup         v21.8b, v4.b[6]             //(2)
    193     umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]
    194 
    195     dup         v30.8h,w4                   //(2)
    196     add         v5.8b,  v5.8b ,  v7.8b      //(1)
    197 
    198     sub         v6.8b,  v6.8b ,  v7.8b      //(1)
    199 
    200     dup         v22.8b, v4.b[5]             //(3)
    201     umlal       v30.8h, v5.8b, v0.8b        //(2)
    202 
    203     dup         v28.8h,w4                   //(3)
    204     umlal       v30.8h, v17.8b, v1.8b       //(2)
    205 
    206     umlal       v30.8h, v6.8b, v3.8b        //(2)
    207     umlal       v30.8h, v19.8b, v21.8b      //(2)
    208 
    209     sshl        v27.8h, v27.8h, v29.8h      //(1)shr
    210 
    211     add         v5.8b,  v5.8b ,  v7.8b      //(2)
    212     sub         v6.8b,  v6.8b ,  v7.8b      //(2)
    213 
    214     xtn         v27.8b,  v27.8h             //(1)
    215     umlal       v28.8h, v5.8b, v0.8b        //(3)
    216 
    217     dup         v23.8b, v4.b[4]             //(4)
    218     umlal       v28.8h, v17.8b, v1.8b       //(3)
    219 
    220     dup         v25.8h,w4                   //(4)
    221     umlal       v28.8h, v6.8b, v3.8b        //(3)
    222 
    223     st1         {v27.8b},[x2], x3           //(1)str 8 values
    224     umlal       v28.8h, v19.8b, v22.8b      //(3)
    225 
    226     sshl        v30.8h, v30.8h, v29.8h      //(2)shr
    227 
    228     add         v5.8b,  v5.8b ,  v7.8b      //(3)
    229     sub         v6.8b,  v6.8b ,  v7.8b      //(3)
    230 
    231     xtn         v30.8b,  v30.8h             //(2)
    232     umlal       v25.8h, v5.8b, v0.8b        //(4)
    233 
    234     dup         v20.8b, v4.b[3]             //(5)
    235     umlal       v25.8h, v17.8b, v1.8b       //(4)
    236 
    237     dup         v16.8h,w4                   //(5)
    238     umlal       v25.8h, v6.8b, v3.8b        //(4)
    239 
    240     st1         {v30.8b},[x2], x3           //(2)str 8 values
    241     umlal       v25.8h, v19.8b, v23.8b      //(4)
    242 
    243     sshl        v28.8h, v28.8h, v29.8h      //(3)shr
    244 
    245     add         v5.8b,  v5.8b ,  v7.8b      //(4)
    246     sub         v6.8b,  v6.8b ,  v7.8b      //(4)
    247 
    248     xtn         v28.8b,  v28.8h             //(3)
    249     umlal       v16.8h, v5.8b, v0.8b        //(5)
    250 
    251     dup         v21.8b, v4.b[2]             //(6)
    252     umlal       v16.8h, v17.8b, v1.8b       //(5)
    253 
    254     dup         v18.8h,w4                   //(6)
    255     umlal       v16.8h, v6.8b, v3.8b        //(5)
    256 
    257     st1         {v28.8b},[x2], x3           //(3)str 8 values
    258     umlal       v16.8h, v19.8b, v20.8b      //(5)
    259 
    260     sshl        v25.8h, v25.8h, v29.8h      //(4)shr
    261     add         v5.8b,  v5.8b ,  v7.8b      //(5)
    262     sub         v6.8b,  v6.8b ,  v7.8b      //(5)
    263 
    264     xtn         v25.8b,  v25.8h             //(4)
    265     umlal       v18.8h, v5.8b, v0.8b        //(6)
    266 
    267     dup         v22.8b, v4.b[1]             //(7)
    268     umlal       v18.8h, v17.8b, v1.8b       //(6)
    269 
    270     dup         v26.8h,w4                   //(7)
    271     umlal       v18.8h, v6.8b, v3.8b        //(6)
    272 
    273     st1         {v25.8b},[x2], x3           //(4)str 8 values
    274     umlal       v18.8h, v19.8b, v21.8b      //(6)
    275 
    276     sshl        v16.8h, v16.8h, v29.8h      //(5)shr
    277 
    278     add         v5.8b,  v5.8b ,  v7.8b      //(6)
    279     sub         v6.8b,  v6.8b ,  v7.8b      //(6)
    280 
    281     xtn         v16.8b,  v16.8h             //(5)
    282     umlal       v26.8h, v5.8b, v0.8b        //(7)
    283 
    284     dup         v23.8b, v4.b[0]             //(8)
    285     umlal       v26.8h, v17.8b, v1.8b       //(7)
    286 
    287     dup         v24.8h,w4                   //(8)
    288     umlal       v26.8h, v6.8b, v3.8b        //(7)
    289 
    290     st1         {v16.8b},[x2], x3           //(5)str 8 values
    291     umlal       v26.8h, v19.8b, v22.8b      //(7)
    292 
    293     sshl        v18.8h, v18.8h, v29.8h      //(6)shr
    294 
    295     add         v5.8b,  v5.8b ,  v7.8b      //(7)
    296     sub         v6.8b,  v6.8b ,  v7.8b      //(7)
    297 
    298     xtn         v18.8b,  v18.8h             //(6)
    299     umlal       v24.8h, v5.8b, v0.8b        //(8)
    300 
    301 
    302     umlal       v24.8h, v17.8b, v1.8b       //(8)
    303 
    304     umlal       v24.8h, v6.8b, v3.8b        //(8)
    305 
    306     st1         {v18.8b},[x2], x3           //(6)str 8 values
    307     umlal       v24.8h, v19.8b, v23.8b      //(8)
    308 
    309     sshl        v26.8h, v26.8h, v29.8h      //(7)shr
    310 
    311     subs        x7, x7, #8
    312 
    313     beq         epilog
    314 
    315     subs        x1, x1, #8                  //row counter
    316     add         x20, x12, #8                //col inc
    317     csel        x12, x20, x12,gt
    318     add         x20, x14, #8                //also for col inc
    319     csel        x14, x20, x14,gt
    320     csel        x1, x4, x1,le               //nt reloaded (refresh the value)
    321     add         x20, x11, #1                //x12 reset
    322     csel        x12, x20, x12,le
    323 
    324     csel        x14, x0, x14,le             //x14 reset
    325     ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
    326 
    327     sub         x20, x6, #8                 //for next set of rows
    328     csel        x6, x20, x6,le
    329     ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
    330 
    331     add         x20, x5, #8
    332     csel        x5, x20, x5,le
    333     dup         v27.8h,w4                   //(1n)(1)
    334 
    335     ld1         {v5.8b},[x5]
    336 
    337     ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    338     sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]
    339 
    340     dup         v20.8b, v4.b[7]             //(1n)(1)
    341     sub         v6.8b,  v2.8b ,  v5.8b
    342 
    343     beq         epilog
    344 
    345 kernel_plnr:
    346 
    347     cmp         x1, #0                      // (cond loop)
    348     sshl        v24.8h, v24.8h, v29.8h      //(8)shr
    349 
    350     xtn         v26.8b,  v26.8h             //(7)
    351     umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]
    352 
    353     xtn         v24.8b,  v24.8h             //(8)
    354     umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]
    355 
    356     dup         v21.8b, v4.b[6]             //(2)
    357     umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]
    358 
    359     dup         v30.8h,w4                   //(2)
    360     umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]
    361 
    362     st1         {v26.8b},[x2], x3           //(7)str 8 values
    363     add         v5.8b,  v5.8b ,  v7.8b      //(1)
    364 
    365     st1         {v24.8b},[x2], x3           //(8)str 8 values
    366     sub         v6.8b,  v6.8b ,  v7.8b      //(1)
    367 
    368     add         x20, x2, x9                 //since more cols to fill, dst + 8 - 6*strd (cond loop)
    369     csel        x2, x20, x2,gt
    370     umlal       v30.8h, v5.8b, v0.8b        //(2)
    371 
    372     sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
    373     csel        x2, x20, x2,le
    374     umlal       v30.8h, v17.8b, v1.8b       //(2)
    375 
    376     dup         v22.8b, v4.b[5]             //(3)
    377     umlal       v30.8h, v6.8b, v3.8b        //(2)
    378 
    379     dup         v28.8h,w4                   //(3)
    380     umlal       v30.8h, v19.8b, v21.8b      //(2)
    381 
    382     sshl        v27.8h, v27.8h, v29.8h      //(1)shr
    383 
    384     add         v5.8b,  v5.8b ,  v7.8b      //(2)
    385     csel        x1, x4, x1,le               //nt reloaded (refresh the value)    (cond loop)
    386 
    387     sub         v6.8b,  v6.8b ,  v7.8b      //(2)
    388     subs        x1, x1, #8                  //row counter (loop)
    389 
    390     xtn         v27.8b,  v27.8h             //(1)
    391     umlal       v28.8h, v5.8b, v0.8b        //(3)
    392 
    393     dup         v23.8b, v4.b[4]             //(4)
    394     umlal       v28.8h, v17.8b, v1.8b       //(3)
    395 
    396     dup         v25.8h,w4                   //(4)
    397     umlal       v28.8h, v6.8b, v3.8b        //(3)
    398 
    399     st1         {v27.8b},[x2], x3           //(1)str 8 values
    400     umlal       v28.8h, v19.8b, v22.8b      //(3)
    401 
    402     sshl        v30.8h, v30.8h, v29.8h      //(2)shr
    403 
    404     add         v5.8b,  v5.8b ,  v7.8b      //(3)
    405 
    406     sub         v6.8b,  v6.8b ,  v7.8b      //(3)
    407 
    408     xtn         v30.8b,  v30.8h             //(2)
    409     umlal       v25.8h, v5.8b, v0.8b        //(4)
    410 
    411     dup         v20.8b, v4.b[3]             //(5)
    412     umlal       v25.8h, v17.8b, v1.8b       //(4)
    413 
    414     dup         v16.8h,w4                   //(5)
    415     umlal       v25.8h, v6.8b, v3.8b        //(4)
    416 
    417     st1         {v30.8b},[x2], x3           //(2)str 8 values
    418     umlal       v25.8h, v19.8b, v23.8b      //(4)
    419 
    420     sshl        v28.8h, v28.8h, v29.8h      //(3)shr
    421 
    422     add         v5.8b,  v5.8b ,  v7.8b      //(4)
    423 
    424     sub         v6.8b,  v6.8b ,  v7.8b      //(4)
    425 
    426     xtn         v28.8b,  v28.8h             //(3)
    427     umlal       v16.8h, v5.8b, v0.8b        //(5)
    428 
    429     dup         v21.8b, v4.b[2]             //(6)
    430     umlal       v16.8h, v17.8b, v1.8b       //(5)
    431 
    432     dup         v18.8h,w4                   //(6)
    433     umlal       v16.8h, v6.8b, v3.8b        //(5)
    434 
    435     st1         {v28.8b},[x2], x3           //(3)str 8 values
    436     umlal       v16.8h, v19.8b, v20.8b      //(5)
    437 
    438     add         x20, x11, #1                //x12 reset (cond loop)
    439     csel        x12, x20, x12,le
    440     sshl        v25.8h, v25.8h, v29.8h      //(4)shr
    441 
    442     add         x20, x12, #8                //col inc (cond loop)
    443     csel        x12, x20, x12,gt
    444     add         v5.8b,  v5.8b ,  v7.8b      //(5)
    445 
    446     add         x20, x14, #8                //also for col inc (cond loop)
    447     csel        x14, x20, x14,gt
    448     sub         v6.8b,  v6.8b ,  v7.8b      //(5)
    449 
    450     xtn         v25.8b,  v25.8h             //(4)
    451     umlal       v18.8h, v5.8b, v0.8b        //(6)
    452 
    453     dup         v22.8b, v4.b[1]             //(7)
    454     umlal       v18.8h, v17.8b, v1.8b       //(6)
    455 
    456     dup         v26.8h,w4                   //(7)
    457     umlal       v18.8h, v6.8b, v3.8b        //(6)
    458 
    459     st1         {v25.8b},[x2], x3           //(4)str 8 values
    460     umlal       v18.8h, v19.8b, v21.8b      //(6)
    461 
    462     csel        x14, x0, x14,le             //x14 reset (cond loop)
    463     sshl        v16.8h, v16.8h, v29.8h      //(5)shr
    464 
    465     sub         x20, x6, #8                 //for next set of rows (cond loop)
    466     csel        x6, x20, x6,le
    467     add         v5.8b,  v5.8b ,  v7.8b      //(6)
    468 
    469     add         x20, x5, #8                 // (cond loop)
    470     csel        x5, x20, x5,le
    471     sub         v6.8b,  v6.8b ,  v7.8b      //(6)
    472 
    473     xtn         v16.8b,  v16.8h             //(5)
    474     umlal       v26.8h, v5.8b, v0.8b        //(7)
    475 
    476     dup         v23.8b, v4.b[0]             //(8)
    477     umlal       v26.8h, v17.8b, v1.8b       //(7)
    478 
    479     dup         v24.8h,w4                   //(8)
    480     umlal       v26.8h, v6.8b, v3.8b        //(7)
    481 
    482     st1         {v16.8b},[x2], x3           //(5)str 8 values
    483     umlal       v26.8h, v19.8b, v22.8b      //(7)
    484 
    485     ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    486     sshl        v18.8h, v18.8h, v29.8h      //(6)shr
    487 
    488     add         v5.8b,  v5.8b ,  v7.8b      //(7)
    489 
    490     sub         v6.8b,  v6.8b ,  v7.8b      //(7)
    491 
    492     xtn         v18.8b,  v18.8h             //(6)
    493     umlal       v24.8h, v5.8b, v0.8b        //(8)
    494 
    495     ld1         {v5.8b},[x5]                //(row+1 value)
    496     umlal       v24.8h, v17.8b, v1.8b       //(8)
    497 
    498     dup         v20.8b, v4.b[7]             //(1n)(1)
    499     umlal       v24.8h, v6.8b, v3.8b        //(8)
    500 
    501     st1         {v18.8b},[x2], x3           //(6)str 8 values
    502     umlal       v24.8h, v19.8b, v23.8b      //(8)
    503 
    504     ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
    505     sub         v6.8b,  v2.8b ,  v5.8b      //(nt-1-row) value
    506 
    507     subs        x7, x7, #8                  //col counter
    508 
    509     ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
    510     sshl        v26.8h, v26.8h, v29.8h      //(7)shr
    511 
    512     dup         v27.8h,w4                   //(1n)(1)
    513     sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]
    514 
    515     bne         kernel_plnr
    516 
    517 epilog:
    518 
    519     xtn         v26.8b,  v26.8h             //(7)
    520     st1         {v26.8b},[x2], x3           //(7)str 8 values
    521 
    522     sshl        v24.8h, v24.8h, v29.8h      //(8)shr
    523     xtn         v24.8b,  v24.8h             //(8)
    524     st1         {v24.8b},[x2], x3           //(8)str 8 values
    525 
    526 //@ ========== ***************** =====================
    527 
    528     beq         end_loop
    529 
    530 tf_sz_4:
    531     ld1         {v25.8b},[x14]              //load src[2nt+1+col]
    532     ld1         {v17.8b},[x12], x10         //load 8 coeffs [col+1]
    533 loop_sz_4:
    534     mov         x10, #4                     //reduce inc to #4 for 4x4
    535     ldr         w7,  [x6], #-1              //src[2nt-1-row] (dec to take into account row)
    536     sxtw        x7,w7
    537     dup         v4.8b,w7                    //src[2nt-1-row]
    538 
    539     sub         v19.8b,  v2.8b ,  v17.8b    //[nt-1-col]
    540 
    541     umull       v27.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    542     umlal       v27.8h, v6.8b, v25.8b       //(nt-1-row)    *    src[2nt+1+col]
    543     umlal       v27.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    544     umlal       v27.8h, v19.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    545 //    vadd.i16    q6, q6, q8            @add (nt)
    546 //    vshl.s16     q6, q6, q7            @shr
    547 //    vmovn.i16     d12, q6
    548     rshrn       v27.8b, v27.8h,#3
    549     st1         {v27.s}[0],[x2], x3
    550 
    551     add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
    552     sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
    553     subs        x1, x1, #1
    554 
    555     bne         loop_sz_4
    556 
    557 end_loop:
    558     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    559     ldp         x19, x20,[sp],#16
    560 
    561     ret
    562 
    563 
    564 
    565 
    566 
    567 
    568 
    569 
    570