///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevc_deblk_luma_vert.s
//*
//* //brief
//*  contains the function definition for deblocking luma samples across a
//*  vertical edge. the function is hand-written aarch64 neon assembly that
//*  uses gnu-assembler-style directives.
//*
//* //author
//*  anand s
//*
//* //par list of functions:
//*  - ihevc_deblk_luma_vert_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

     40 .text
     41 .align 4
     42 
     43 
     44 
     45 .extern gai4_ihevc_tc_table
     46 .extern gai4_ihevc_beta_table
     47 
     48 .globl ihevc_deblk_luma_vert_av8
     49 
     50 .type ihevc_deblk_luma_vert_av8, %function
     51 
     52 ihevc_deblk_luma_vert_av8:
     53 
     54     sxtw        x5,w5
     55     sxtw        x6,w6
     56     stp         d8,d9,[sp,#-16]!
     57     stp         d10,d11,[sp,#-16]!
     58     stp         d12,d13,[sp,#-16]!
     59     stp         d14,d15,[sp,#-16]!
     60     stp         x19, x20,[sp,#-16]!
     61     stp         x21, x22,[sp,#-16]!
     62     mov         x21,x7
     63     ldr         w22,[sp,#96]
     64     add         x3,x3,x4
     65     add         x3,x3,#1
     66     asr         x3,x3,#1
     67     add         x7,x3,x5,lsl #1
     68     add         x3,x3,x6,lsl #1
     69     cmp         x7,#0x33
     70     mov         x20,#0x33
     71     csel        x7, x20, x7,gt
     72     bgt         l1.56
     73     cmp         x7,#0x0
     74     mov         x20,#0x0
     75     csel        x7, x20, x7,lt              // x7 has the beta_index value
     76 l1.56:
     77 
     78 //     bic      x2,x2,#1
     79     asr         x2,x2,#1
     80 
     81     add         x3,x3,x2,lsl #1
     82     cmp         x3,#0x35
     83     mov         x20,#0x35
     84     csel        x3, x20, x3,gt
     85     bgt         l1.88
     86     cmp         x3,#0x0
     87     mov         x20,#0x0
     88     csel        x3, x20, x3,lt              // x3 has the tc_index value
     89 
     90 //    qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
     91 //    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
     92 //    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//
     93 
     94 l1.88:
     95     adrp        x2, :got:gai4_ihevc_beta_table
     96     ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
     97 
     98     movi        v18.8b, #0x2
     99     adrp        x4, :got:gai4_ihevc_tc_table
    100     ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
    101 
    102     ldr         w5,[x2,x7,lsl #2]           // beta
    103     movi        v16.8h, #0x2
    104     ldr         w6,[x4,x3,lsl #2]           // tc
    105     lsl         x8,x6,#1
    106     cmp         x6,#0
    107     dup         v19.8b,w8
    108     sub         x7,x0,#4
    109     movi        v23.8b, #0x3
    110     beq         l1.964
    111 
    112 
    113     sub         x19,x0,#3
    114     ld1         {v15.8b},[x7],x1
    115     ldrb        w8,[x19]                    // -3 value
    116     ld1         {v1.8b},[x7],x1
    117     ldrb        w10,[x19,#1]                //-2 value
    118     ld1         {v29.8b},[x7],x1
    119     ldrb        w11,[x19,#2]                //-1 value
    120     ld1         {v0.8b},[x7]
    121     ldrb        w12,[x0,#0]                 // 0 value
    122     ldrb        w9,[x0,#1]                  // 1 value
    123     trn1        v24.8b,v15.8b,v1.8b
    124     trn2        v1.8b,v15.8b,v1.8b
    125     ldrb        w2,[x0,#2]                  // 2 value
    126     trn1        v2.8b,v29.8b,v0.8b
    127     trn2        v0.8b,v29.8b,v0.8b
    128     add         x12,x12,x2
    129     subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    130     csneg       x9,x9,x9,pl
    131 //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
    132     mov         v29.8b,v24.8b
    133     trn1        v24.4h,v29.4h,v2.4h
    134     trn2        v2.4h,v29.4h,v2.4h
    135     add         x8,x8,x11
    136     mov         v15.8b,v1.8b
    137     trn1        v1.4h,v15.4h,v0.4h
    138     trn2        v0.4h,v15.4h,v0.4h
    139     subs        x8,x8,x10,lsl #1
    140     csneg       x8,x8,x8,pl
    141 //  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//
    142 
    143 
    144 
    145     add         x14,x1,x1,lsl #1
    146     add         x14,x0,x14
    147 
    148     sub         x19,x14,#3
    149     dup         v4.2s, v24.s[1]
    150     ldrb        w2,[x19]                    // -2 value
    151     dup         v7.2s, v2.s[1]
    152     ldrb        w10,[x19,#1]                // -2 value
    153     dup         v3.2s, v2.s[0]
    154     ldrb        w11,[x19,#2]                // -1 value
    155     dup         v5.2s, v1.s[1]
    156     ldrb        w12,[x14,#0]                // 0 value
    157     dup         v6.2s, v1.s[0]
    158     ldrb        w3,[x14,#1]                 // 1 value
    159     dup         v2.2s, v0.s[0]
    160     ldrb        w4,[x14,#2]                 // 2 value
    161 
    162 
    163     add         x12,x12,x4
    164     subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
    165     csneg       x12,x12,x12,pl
    166 //    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//
    167 
    168 
    169     add         x2,x2,x11
    170     subs        x11,x2,x10,lsl #1
    171     csneg       x11,x11,x11,pl              // dp3 value is stored in x8
    172 //    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )//
    173 
    174 
    175 
    176     add         x3,x8,x9                    // x3 has the d0 value
    177     add         x4,x11,x12                  // x4 has the d3 value
    178 
    179 
    180 //    d0 = dp0 + dq0//
    181 //    d3 = dp3 + dq3//
    182 
    183     add         x14,x8,x11                  // x13 has the value dp
    184     add         x12,x12,x9                  // x12 has the value  dq
    185 //    dp = dp0 + dp3//
    186 //   dq = dq0 + dq3//
    187 
    188     add         x11, x3, x4                 // x3 has the value d
    189 
    190 //   d = d0 + d3//
    191 
    192 
    193     cmp         x11,x5
    194     dup         v22.2s, v0.s[1]
    195     bge         l1.964
    196 
    197 //    if(d < beta)
    198 
    199 
    200     // registers which cannont be altered : x3,x4 x5,x6,x12,x13,x0,x1,x11
    201 
    202     // registers for use: x2,x7,x8,x9,x10,
    203     uqsub       v30.8b,v7.8b,v19.8b
    204     asr         x10,x5,#2
    205     uqadd       v31.8b,v7.8b,v19.8b
    206     cmp         x10,x3,lsl #1
    207     uaddl       v0.8h,v5.8b,v4.8b
    208     ble         l1.336
    209 
    210     sub         x19,x0,4
    211     ldrb        w2,[x19]
    212     uaddw       v0.8h,  v0.8h ,  v2.8b
    213     ldrb        w7,[x19,#3]
    214     umull       v20.8h, v7.8b, v23.8b
    215     ldrb        w3,[x0,#0]
    216     umlal       v20.8h, v22.8b, v18.8b
    217     ldrb        w8,[x0,#3]
    218 //   ubfx   x7,x2,#24,#8           // has the -1 value
    219 //  and    x2,#0xff               // has the -4 value
    220 //  ubfx   x8,x3,#24,#8           // has the 3 value
    221 //  and    x3,#0xff               // x4 has the 0 value
    222 
    223     add         v20.8h,  v20.8h ,  v0.8h
    224     subs        x8,x8,x3
    225     rshrn       v22.8b,v20.8h,#3
    226     csneg       x8,x8,x8,pl
    227     subs        x2,x2,x7
    228     umin        v21.8b,  v22.8b ,  v31.8b
    229     csneg       x2,x2,x2,pl
    230     umax        v22.8b,  v21.8b ,  v30.8b
    231     add         x8,x8,x2
    232     uaddl       v20.8h,v7.8b,v3.8b
    233     cmp         x8,x5,asr #3
    234     mla         v20.8h, v0.8h, v16.8h
    235     bge         l1.336
    236     uaddw       v0.8h,  v0.8h ,  v7.8b
    237     subs        x7,x3,x7
    238     rshrn       v20.8b,v20.8h,#3
    239     csneg       x7,x7,x7,pl
    240     rshrn       v0.8b,v0.8h,#2
    241     mov         x10,#5
    242     uqadd       v30.8b,v5.8b,v19.8b
    243     mul         x10, x10, x6
    244     uqsub       v31.8b,v5.8b,v19.8b
    245     add         x10, x10,#1
    246     cmp         x7,x10,asr #1
    247     bge         l1.336
    248 
    249 
    250 //        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    251 //            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    252 
    253 
    254     asr         x10,x5,#2
    255     uqsub       v25.8b,v4.8b,v19.8b
    256     cmp         x10,x4,lsl #1
    257     uqadd       v21.8b,v4.8b,v19.8b
    258     ble         l1.336
    259     umin        v26.8b,  v20.8b ,  v21.8b
    260     add         x4,x1,x1,lsl #1
    261     add         x4,x4,x0
    262     umax        v20.8b,  v26.8b ,  v25.8b
    263     sub         x19,x4,#4
    264     ldrb        w2,[x19]
    265     umin        v19.8b,  v0.8b ,  v30.8b
    266     ldrb        w7,[x19,#3]
    267     umax        v21.8b,  v19.8b ,  v31.8b
    268     ldrb        w3,[x4,#0]
    269     lsl         x10,x6,#1
    270     ldrb        w8,[x4,#3]
    271 //   ubfx   x7,x2,#24,#8           // has the -1 value
    272 //  and    x2,#0xff               // has the -4 value
    273 //  ubfx   x8,x3,#24,#8           // has the 3 value
    274 //  and    x3,#0xff               // x4 has the 0 value
    275     uaddl       v0.8h,v2.8b,v3.8b
    276     dup         v19.8b,w10
    277     subs        x8,x8,x3
    278     uaddw       v0.8h,  v0.8h ,  v4.8b
    279     csneg       x8,x8,x8,pl
    280     uqadd       v30.8b,v2.8b,v19.8b
    281     subs        x2,x2,x7
    282     uqsub       v31.8b,v2.8b,v19.8b
    283     csneg       x2,x2,x2,pl
    284     uaddl       v26.8h,v5.8b,v6.8b
    285     add         x8,x8,x2
    286     mla         v26.8h, v0.8h, v16.8h
    287     cmp         x8,x5,asr #3
    288     bge         l1.336
    289     rshrn       v26.8b,v26.8h,#3
    290     subs        x7,x3,x7
    291     uqadd       v27.8b,v3.8b,v19.8b
    292     csneg       x7,x7,x7,pl
    293     uqsub       v28.8b,v3.8b,v19.8b
    294     mov         x10,#5
    295     umin        v16.8b,  v26.8b ,  v30.8b
    296     mul         x10, x10, x6
    297     add         x10, x10,#1
    298     cmp         x7,x10,asr #1
    299     umax        v26.8b,  v16.8b ,  v31.8b
    300     bge         l1.336
    301     uqadd       v30.8b,v6.8b,v19.8b
    302 
    303     mov         x2,#2
    304     mov         x4,x21
    305     uqsub       v31.8b,v6.8b,v19.8b
    306     mov         x5,x22
    307     b           end_dep_deq_decision
    308 // x2 has the value of de
    309 // x6 has teh value of tc
    310 // x5 has the value of beta
    311 // x14 has the value of dp
    312 // x12 has the value of dq
    313 // x0 has the value of source address
    314 // x1 has the src stride
    315 
    316 l1.336:
    317     mov         x2,#1
    318 l1.424:
    319     mov         x11,x5
    320     mov         x4,x21
    321     mov         x5,x22
    322 
    323     cmp         x6,#1
    324     mov         x20,#0
    325     csel        x9, x20, x9,eq
    326     mov         x20,#0
    327     csel        x10, x20, x10,eq
    328     beq         end_dep_deq_decision
    329 
    330     and         x7,x4,x5
    331 
    332     cmp         x7,#1
    333     beq         both_flags_set
    334     cmp         x4,#0
    335     beq         set_flag_dep_zero
    336 
    337 
    338     add         x8,x11,x11,asr #1
    339     mov         x10,#0
    340     asr         x8,x8,#3
    341     cmp         x8,x14
    342     mov         x20,#1
    343     csel        x9, x20, x9,gt
    344     mov         x20,#0
    345     csel        x9, x20, x9,le
    346     b           end_dep_deq_decision
    347 set_flag_dep_zero:
    348 
    349     add         x8,x11,x11,asr #1
    350     mov         x9,#0
    351     asr         x8,x8,#3
    352     cmp         x8,x12
    353     mov         x20,#1
    354     csel        x10, x20, x10,gt
    355     mov         x20,#0
    356     csel        x10, x20, x10,le
    357     b           end_dep_deq_decision
    358 
    359 both_flags_set:
    360     add         x8,x11,x11,asr #1
    361     asr         x8,x8,#3
    362     cmp         x8,x14
    363     mov         x20,#1
    364     csel        x9, x20, x9,gt
    365     mov         x20,#0
    366     csel        x9, x20, x9,le
    367     cmp         x8,x12
    368     mov         x20,#1
    369     csel        x10, x20, x10,gt
    370     mov         x20,#0
    371     csel        x10, x20, x10,le
    372 end_dep_deq_decision:
    373 
    374 //x0=source address
    375 //x1=stride
    376 // x2 =de
    377 // x4=flag p
    378 //x5= flag q
    379 //x6 =tc
    380 // x9 =dep
    381 // x10=deq
    382 //    b    l1.964
    383 
    384 
    385     cmp         x2,#2
    386 // x4 has the value of de
    387     bne         l1.968
    388 
    389     cmp         x5,#0
    390     beq         l1.780
    391 // x5 has the flag of q
    392 
    393     add         x3,x0,#2
    394     st1         {v22.b}[0],[x3],x1
    395 
    396     st1         {v22.b}[1],[x3],x1
    397 
    398     st1         {v22.b}[2],[x3],x1
    399 
    400     st1         {v22.b}[3],[x3]
    401     add         x3,x0,x1
    402     mov         v29.8b,v20.8b
    403     trn1        v20.8b,v29.8b,v21.8b
    404     trn2        v21.8b,v29.8b,v21.8b
    405 
    406     st1         {v20.h}[0],[x0]
    407     st1         {v21.h}[0],[x3],x1
    408     st1         {v20.h}[1],[x3],x1
    409     st1         {v21.h}[1],[x3]
    410 
    411 
    412 l1.780:
    413     cmp         x4,#0
    414     beq         l1.964
    415     // x4 has the flag p
    416 
    417 
    418     dup         v7.2s, v24.s[0]
    419     sub         x3,x0,#1
    420     uaddw       v16.8h,  v0.8h ,  v6.8b
    421     add         x7,x3,x1
    422     rshrn       v2.8b,v16.8h,#2
    423     st1         {v26.b}[0],[x3]
    424     sub         x0,x0,#3
    425     umin        v16.8b,  v2.8b ,  v27.8b
    426     st1         {v26.b}[1],[x7],x1
    427     umull       v2.8h, v6.8b, v23.8b
    428     umlal       v2.8h, v7.8b, v18.8b
    429     st1         {v26.b}[2],[x7],x1
    430     umax        v5.8b,  v16.8b ,  v28.8b
    431     st1         {v26.b}[3],[x7]
    432     add         v0.8h,  v2.8h ,  v0.8h
    433     rshrn       v0.8b,v0.8h,#3
    434 
    435 
    436     umin        v1.8b,  v0.8b ,  v30.8b
    437     umax        v0.8b,  v1.8b ,  v31.8b
    438 
    439     mov         v29.8b,v0.8b
    440     trn1        v0.8b,v29.8b,v5.8b
    441     trn2        v5.8b,v29.8b,v5.8b
    442     st1         {v0.h}[0],[x0],x1
    443     st1         {v5.h}[0],[x0],x1
    444     st1         {v0.h}[1],[x0],x1
    445     st1         {v5.h}[1],[x0]
    446 l1.964:
    447     ldp         x21, x22,[sp],#16
    448     ldp         x19, x20,[sp],#16
    449     ldp         d14,d15,[sp],#16
    450     ldp         d12,d13,[sp],#16
    451     ldp         d10,d11,[sp],#16
    452     ldp         d8,d9,[sp],#16
    453     ret
    454 
    455 l1.968:
    456 
    457 
    458     movi        v0.8h, #0x9
    459     neg         x11, x6
    460     cmp         x4,#0
    461     // checks for the flag p
    462     movi        v16.8h, #0x3
    463     movi        v24.8b, #0x1
    464 
    465 
    466     dup         v30.8b,w11
    467     and         x11,x6,#0xff
    468     dup         v31.8b,w11
    469 
    470     usubl       v18.8h,v4.8b,v2.8b
    471     mul         v18.8h, v18.8h, v0.8h
    472     usubl       v0.8h,v5.8b,v3.8b
    473 
    474 
    475 
    476     mul         v16.8h, v0.8h, v16.8h
    477     sub         v16.8h,  v18.8h ,  v16.8h
    478     srshr       v16.8h,v16.8h,#4
    479 //   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//
    480 
    481     abs         v0.8h, v16.8h
    482     xtn         v0.8b,  v0.8h
    483     // storing the absolute values of delta in d0
    484 
    485     sqxtn       v16.8b,v16.8h
    486     // storing the clipped values of delta in d16
    487 
    488     movi        v1.8b, #0xa
    489     dup         v21.8b,w11
    490     mul         v1.8b, v1.8b, v21.8b
    491     // d1 stores the value (10 * tc)
    492 
    493 //if(abs(delta) < 10 * tc)
    494 
    495     smin        v18.8b,  v16.8b ,  v31.8b
    496     smax        v20.8b,  v18.8b ,  v30.8b
    497 
    498 // delta = clip3(delta, -tc, tc)//
    499     sxtl        v16.8h, v20.8b
    500     uxtl        v18.8h, v2.8b
    501     add         v18.8h,  v18.8h ,  v16.8h
    502 
    503     sqxtun      v22.8b, v18.8h
    504     uxtl        v18.8h, v4.8b
    505     sub         v16.8h,  v18.8h ,  v16.8h
    506     sqxtun      v23.8b, v16.8h
    507 // tmp_p0 = clip_u8(pu1_src[-1] + delta)//
    508 //  tmp_q0 = clip_u8(pu1_src[0] - delta)//
    509     beq         l1.1272
    510 
    511 
    512 
    513     cmp         x9,#1
    514     bne         l1.1212
    515 // checks for the flag dep
    516 
    517     asr         x3,x6,#1
    518 
    519 
    520     uaddl       v16.8h,v6.8b,v2.8b
    521     uaddw       v16.8h,  v16.8h ,  v24.8b
    522     dup         v18.8b,w3
    523     sub         x20,x3,#0
    524     neg         x3, x20
    525     dup         v19.8b,w3
    526     ushr        v16.8h,v16.8h,#1
    527     xtn         v16.8b,  v16.8h
    528 
    529     usubl       v16.8h,v16.8b,v3.8b
    530     saddw       v16.8h,  v16.8h ,  v20.8b
    531     sshr        v16.8h,v16.8h,#1
    532     sqxtn       v16.8b,v16.8h
    533 
    534     smin        v17.8b,  v16.8b ,  v18.8b
    535     smax        v16.8b,  v19.8b ,  v17.8b
    536 
    537 
    538 
    539 
    540     uxtl        v18.8h, v3.8b
    541     sxtl        v16.8h, v16.8b
    542     add         v16.8h,  v18.8h ,  v16.8h
    543 
    544     sqxtun      v16.8b, v16.8h
    545     mov         v30.8b,v3.8b
    546     cmhs        v3.8b,v0.8b,v1.8b
    547 
    548 
    549     bsl         v3.8b,v30.8b,v16.8b
    550 l1.1212:
    551     dup         v16.8b,w11
    552     sub         x12,x0,#3
    553     sub         x3,x0,#1
    554 //     smul v16.8b, v16.8b, v1.8b
    555     mov         v29.8b,v6.8b
    556     trn1        v6.8b,v29.8b,v3.8b
    557     trn2        v3.8b,v29.8b,v3.8b
    558     st1         {v6.h}[0],[x12],x1
    559     cmhs        v16.8b,v0.8b,v1.8b
    560     st1         {v3.h}[0],[x12],x1
    561     bsl         v16.8b,v2.8b,v22.8b
    562     st1         {v16.b}[0],[x3],x1
    563     st1         {v16.b}[1],[x3],x1
    564     st1         {v6.h}[1],[x12],x1
    565     st1         {v16.b}[2],[x3],x1
    566     st1         {v3.h}[1],[x12]
    567     st1         {v16.b}[3],[x3]
    568 l1.1272:
    569     cmp         x5,#0
    570     beq         l1.964
    571     // checks for the flag q
    572     cmp         x10,#1
    573     bne         l1.1412
    574     // checks for the flag deq
    575     mov         v2.8b,v7.8b
    576     asr         x3,x6,#1
    577 
    578     dup         v6.8b,w3
    579     sub         x20,x3,#0
    580     neg         x3, x20
    581     dup         v16.8b,w3
    582     uaddl       v2.8h,v2.8b,v4.8b
    583     uaddw       v2.8h,  v2.8h ,  v24.8b
    584     ushr        v2.8h,v2.8h,#1
    585     xtn         v2.8b,  v2.8h
    586 
    587     usubl       v2.8h,v2.8b,v5.8b
    588     ssubw       v2.8h,  v2.8h ,  v20.8b
    589     sshr        v2.8h,v2.8h,#1
    590     sqxtn       v3.8b,v2.8h
    591 
    592     smin        v2.8b,  v3.8b ,  v6.8b
    593     smax        v3.8b,  v16.8b ,  v2.8b
    594     //  dup  v6.8b,w2
    595     //   smul v6.8b, v6.8b, v1.8b
    596 
    597 
    598 
    599     uxtl        v16.8h, v5.8b
    600     sxtl        v2.8h, v3.8b
    601     add         v2.8h,  v16.8h ,  v2.8h
    602     sqxtun      v3.8b, v2.8h
    603     mov         v30.8b,v5.8b
    604     cmhs        v5.8b,v0.8b,v1.8b
    605 
    606 
    607     bsl         v5.8b,v30.8b,v3.8b
    608 l1.1412:
    609     //  dup  v2.8b,w2
    610     add         x3,x0,#2
    611     add         x11,x3,x1
    612     //   smul v1.8b, v2.8b, v1.8b
    613     st1         {v7.b}[0],[x3]
    614     st1         {v7.b}[1],[x11],x1
    615     st1         {v7.b}[2],[x11],x1
    616     cmhs        v0.8b,v0.8b,v1.8b
    617     st1         {v7.b}[3],[x11]
    618     bsl         v0.8b,v4.8b,v23.8b
    619     mov         v29.8b,v0.8b
    620     trn1        v0.8b,v29.8b,v5.8b
    621     trn2        v5.8b,v29.8b,v5.8b
    622     st1         {v0.h}[0],[x0],x1
    623     st1         {v5.h}[0],[x0],x1
    624     st1         {v0.h}[1],[x0],x1
    625     st1         {v5.h}[1],[x0]
    626 
    627     ldp         x21, x22,[sp],#16
    628     ldp         x19, x20,[sp],#16
    629     ldp         d14,d15,[sp],#16
    630     ldp         d12,d13,[sp],#16
    631     ldp         d10,d11,[sp],#16
    632     ldp         d8,d9,[sp],#16
    633     ret
    634 
    635 
    636