Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     /* Function-definition helpers, expanded by the C preprocessor.
      *   ENTRY(f)   -- select .text, align, export f via .globl, tag f as a
      *                 function symbol and define the label f.
      *   PRIVATE(f) -- as ENTRY but with no .globl, so f is not exported.
      *   END(f)     -- set the symbol size of f to the distance from f to here.
      */
     17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
     18 #define PRIVATE(f) .text; .align 4; .type f,#function; f:
     19 #define END(f) .size f, .-f;
     20 
     /* FRACTION_BITS -- fixed-point shift used when narrowing accumulators: the
      *                  fetch macro shifts by 16 - FRACTION_BITS and the hconv
      *                  macros finish with a shift by FRACTION_BITS.
      * MAX_R         -- default value of the max_r argument of the fetch macro.
      */
     21 .set FRACTION_BITS, 7
     22 .set MAX_R, 25
     23 
     24 
     25 /* A quick way of making a line of code conditional on some other condition.
     26  * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
     27  * `ifcc`:
     28  */
     /* zzz is the whole statement to emit.  It is assembled only if the symbol
      * cc is non-zero at the point where ifcc is expanded, so cc must have been
      * set before the first use.
      */
     29 .macro ifcc zzz:vararg
     30 .if cc
     31             \zzz
     32 .endif
     33 .endm
     34 
     35 /* Fetch 16 columns of bytes (regardless of image format), convolve these
     36  * vertically, and leave them in the register file.  If working near the top or
     37  * bottom of an image then clamp the addressing while loading the data in.
     38  *
     39  * The convolution is fully unrolled for windows up to max_r, with the
     40  * outermost edges calculated first.  This way it's possible to branch directly
     41  * into the relevant part of the code for an arbitrary convolution radius.  Two
     42  * variants of the loop are produced; one eliminates the clamping code for a
     43  * slight speed advantage.
     44  *
     45  * Where the macro is called with reg set to anything other than x12, that
     46  * register is taken to contain a pre-calculated pointer into one of the two loops.
     47  *
     48  * Input:
     49  *      x1 -- src
     50  *      x2 -- pitch
     51  *      x5 -- r
     52  *      x6 -- rup
     53  *      x7 -- rdn
     54  *      x12 -- switch index (NOTE(review): overwritten below when reg=x12)
     55  *      v0-v3 -- coefficient table
     56  *      x13 = -pitch
     57  *      x15 = top-row in
     58  *      x19 = bottom-row in
     59  * Output:
     60  *      x1 += 16, x15 += 16, x19 += 16
     61  *      v10,v11 -- 16 convolved columns
     62  * Modifies:
     63  *      x10 = upper row pointer
     64  *      x11 = lower row pointer
     65  *      v12-v15 = temporary sums, v16 = scratch for the widened row sum
         *      reg (when it is x12) and, in the clamping variant, the condition flags
     66  */
     67 .macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
        // cc = 1 only for the default reg (x12): the entry address is then
        // computed here.  For any other reg the ifcc lines below are dropped and
        // the register is branched to exactly as the caller left it.
     68   .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
     69 
     70             ld1         {v15.16b}, [x1], #16
     71             mov         x10, x15
     72 
     73             uxtl        v14.8h, v15.8b
     74 //            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
     75             uxtl2       v15.8h, v15.16b
     76   .if \max_r < 16 // approximate
     77     ifcc    adr         \reg, 1f
     78   .else
     79     ifcc    adrp        \reg, 1f
     80     ifcc    add         \reg, \reg, #:lo12:1f
     81   .endif
     82 
        // Centre tap: seed the four 32-bit accumulators v12-v15 with the widened
        // source row times v0.h[0].  Interleaved with that, the entry address is
        // formed as (label 1) - r*64 + r*8 = (label 1) - r*56, i.e. r loop bodies
        // of 56 bytes (14 instructions) back from the end of the clamping list.
        // Any change to the size of the clamping loop body invalidates this.
     83             umull       v12.4s, v14.4h, v0.h[0]
     84     ifcc    sub         \reg, \reg, x5, LSL #6
     85             umull2      v13.4s, v14.8h, v0.h[0]
     86             mov         x11, x19
     87             umull       v14.4s, v15.4h, v0.h[0]
     88     ifcc    add         \reg, \reg, x5, LSL #3
     89             umull2      v15.4s, v15.8h, v0.h[0]
     90             br          \reg
     91 
        // Emit the unrolled tap list twice: first with row clamping (cc = 1),
        // then without it (cc = 0).  Within each list taps run from the largest
        // index not exceeding max_r down to 1.
     92   .irp rowclamp, 1, 0
     93     .set cc, \rowclamp
     94     .align 4
     95     .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
     96         .set i, \dreg * 8 + \lane
     97         .if 0 < i && i <= \max_r
            // Tap i: load one row through each pointer (x10 steps by pitch, x11 by
            // -pitch).  In the clamping variant the pointer is then reset to
            // x15 / x19 whenever rup / rdn is below i (unsigned compare).  The two
            // rows are added, widened to 16 bits, and accumulated times
            // coefficient i.
     98             ld1         {v10.16b}, [x10], x2
     99     ifcc    cmp         x6, #i
    100             ld1         {v11.16b}, [x11], x13
    101     ifcc    csel        x10, x15, x10, lo
    102             uaddl       v16.8h, v10.8b, v11.8b
    103     ifcc    cmp         x7, #i
    104             uaddl2      v11.8h, v10.16b, v11.16b
    105     ifcc    csel        x11, x19, x11, lo
    106             umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
    107             umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
    108 //            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
            // This nop (and the one below) occupies the slot of the disabled
            // prefetch and counts toward the 56-byte body size assumed above.
    109 nop
    110             umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
    111 //            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
    112 nop
    113             umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
    114         .endif
    115     .endr ; .endr ; .endr
        // Label 1 (and labelc) marks the end of the clamping list; the branch
        // skips the no-clamp list, which ends at label 2 (and labelnc).
    116     .if \rowclamp == 1
    117         1: \labelc :
    118             b           2f
    119     .else
    120         2: \labelnc :
    121     .endif
    122   .endr
    123 
        // Narrow the 32-bit sums to 16 bits with rounding and saturation into
        // v10/v11, and advance the top and bottom row pointers by 16 bytes.
    124             uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
    125             add         x15, x15, #16
    126             uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
    127             add         x19, x19, #16
    128             uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
    129             uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
    130 .endm /*}}}*/
    131 
    132 /* Some portion of the convolution window (as much as will fit, and all of it
    133  * for the uchar1 cases) is kept in the register file to avoid unnecessary
    134  * memory accesses.  This forces the horizontal loops to be unrolled because
    135  * there's no indexed addressing into the register file.
    136  *
    137  * As in the fetch macro, the operations are ordered from outside to inside, so
    138  * that jumping into the middle of the block bypasses the unwanted window taps.
    139  *
    140  * There are several variants of the macro because of the fixed offsets of the
    141  * taps -- the wider the maximum radius the further the centre tap is from the
    142  * most recently fetched data.  This means that pre-filling the window requires
    143  * more data that won't be used and it means that rotating the window involves
    144  * more mov operations.
    145  *
    146  * When the buffer gets too big the buffer at [x9] is used.
    147  *
    148  * Input:
    149  *      v16-v31,v4-v11 -- convolution window
    150  *      x9 -- pointer to additional convolution window data
    151  * Output:
    152  *      x9 -- updated buffer pointer (if used)
    153  *      v15.8b (d15) -- result to be stored
    154  * Modifies:
    155  *      x12 -- temp buffer pointer
    156  *      v12-v13 -- temporaries for load and vext operations.
    157  *      v14-v15 -- intermediate sums
    158  */
    /* NOTE(review): presumably the radii that have a dedicated hconv1_N variant
     * (hconv1_8 and hconv1_16 are defined in this file); the code that consumes
     * this list is not visible here -- confirm how it is expanded.
     */
    159 #define TUNED_LIST1 8, 16
    /* hconv1_8 -- horizontal convolution step with taps 1..8 unrolled.
     *
     * Reads the window v8, v9 (centre), v10 and the radius in x5.  The halfword
     * table at 100: holds offsets relative to itself; x5 selects the entry and
     * control enters the tap list at label 101..108, then runs the remaining
     * taps down to 1.  Entry 0 is -4, which is the address of the br itself.
     * NOTE(review): that looks like a deliberate trap for r == 0, and x5 is not
     * range-checked against the table -- confirm callers guarantee 1..8.
     *
     * Tap k scales by v0.h[k] (v1.h[0] for k = 8) the two vectors lying k
     * halfwords before and after the centre.
     *
     * Result: v15.8b.  Afterwards the window is rotated: v8 = v9, v9 = v10,
     * v10 = v11.  Clobbers x12, x16 and v12-v15.
     */
    160 .macro hconv1_8/*{{{*/
            // Centre tap seeds the two 32-bit accumulators.
    161             umull       v14.4s, v9.4h, v0.h[0]
    162             umull2      v15.4s, v9.8h, v0.h[0]
    163 
            // Computed branch: x12 = table base + signed halfword entry number x5.
    164             adr         x16, 100f
    165             ldrsh       x12, [x16, x5, LSL #1]
    166             add         x12, x12, x16
    167             br          x12
    168    100:     .hword -4
    169             .hword 101f-100b
    170             .hword 102f-100b
    171             .hword 103f-100b
    172             .hword 104f-100b
    173             .hword 105f-100b
    174             .hword 106f-100b
    175             .hword 107f-100b
    176             .hword 108f-100b
    177             .align      4
            // Tap 8 is a whole register either side of the centre, so no ext.
    178     108:    umlal       v14.4s, v8.4h, v1.h[0]
    179             umlal2      v15.4s, v8.8h, v1.h[0]
    180             umlal       v14.4s, v10.4h, v1.h[0]
    181             umlal2      v15.4s, v10.8h, v1.h[0]
    182     107:    ext         v12.16b, v8.16b, v9.16b, #1*2
    183             ext         v13.16b, v9.16b, v10.16b, #7*2
    184             umlal       v14.4s, v12.4h, v0.h[7]
    185             umlal2      v15.4s, v12.8h, v0.h[7]
    186             umlal       v14.4s, v13.4h, v0.h[7]
    187             umlal2      v15.4s, v13.8h, v0.h[7]
    188     106:    ext         v12.16b, v8.16b, v9.16b, #2*2
    189             ext         v13.16b, v9.16b, v10.16b, #6*2
    190             umlal       v14.4s, v12.4h, v0.h[6]
    191             umlal2      v15.4s, v12.8h, v0.h[6]
    192             umlal       v14.4s, v13.4h, v0.h[6]
    193             umlal2      v15.4s, v13.8h, v0.h[6]
    194     105:    ext         v12.16b, v8.16b, v9.16b, #3*2
    195             ext         v13.16b, v9.16b, v10.16b, #5*2
    196             umlal       v14.4s, v12.4h, v0.h[5]
    197             umlal2      v15.4s, v12.8h, v0.h[5]
    198             umlal       v14.4s, v13.4h, v0.h[5]
    199             umlal2      v15.4s, v13.8h, v0.h[5]
            // Tap 4: a shift of four halfwords is the high half of one register
            // followed by the low half of the next, so the high/low forms of
            // umlal stand in for the two ext instructions left commented out.
    200     104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
    201             //ext         v13.16b, v9.16b, v10.16b, #4*2
    202             umlal2      v14.4s, v8.8h, v0.h[4]
    203             umlal       v15.4s, v9.4h, v0.h[4]
    204             umlal2      v14.4s, v9.8h, v0.h[4]
    205             umlal       v15.4s, v10.4h, v0.h[4]
    206     103:    ext         v12.16b, v8.16b, v9.16b, #5*2
    207             ext         v13.16b, v9.16b, v10.16b, #3*2
    208             umlal       v14.4s, v12.4h, v0.h[3]
    209             umlal2      v15.4s, v12.8h, v0.h[3]
    210             umlal       v14.4s, v13.4h, v0.h[3]
    211             umlal2      v15.4s, v13.8h, v0.h[3]
    212     102:    ext         v12.16b, v8.16b, v9.16b, #6*2
    213             ext         v13.16b, v9.16b, v10.16b, #2*2
    214             umlal       v14.4s, v12.4h, v0.h[2]
    215             umlal2      v15.4s, v12.8h, v0.h[2]
    216             umlal       v14.4s, v13.4h, v0.h[2]
    217             umlal2      v15.4s, v13.8h, v0.h[2]
    218     101:    ext         v12.16b, v8.16b, v9.16b, #7*2
    219             ext         v13.16b, v9.16b, v10.16b, #1*2
    220             umlal       v14.4s, v12.4h, v0.h[1]
    221             umlal2      v15.4s, v12.8h, v0.h[1]
    222             umlal       v14.4s, v13.4h, v0.h[1]
    223             umlal2      v15.4s, v13.8h, v0.h[1]
    224 
            // Narrow 32 -> 16 -> 8 bits with rounding and saturation; result in v15.8b.
    225             uqrshrn     v14.4h, v14.4s, #16
    226             uqrshrn2    v14.8h, v15.4s, #16
    227             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    228 
            // Rotate the window by one register, taking in v11.
    229             mov         v8.16b, v9.16b
    230             mov         v9.16b, v10.16b
    231             mov         v10.16b, v11.16b
    232 .endm/*}}}*/
    233 
    /* hconv1_16 -- horizontal convolution step with taps 1..16 unrolled.
     *
     * Reads the window v6, v7, v8 (centre), v9, v10 and the radius in x5.  The
     * halfword table at 100: holds offsets relative to itself; x5 selects the
     * entry and control enters the tap list at label 101..116, then runs the
     * remaining taps down to 1.  Entry 0 is -4, the address of the br itself.
     * NOTE(review): that looks like a deliberate trap for r == 0, and x5 is not
     * range-checked against the table -- confirm callers guarantee 1..16.
     *
     * Tap k scales by coefficient k (v0.h[k] for k below 8, v1.h[k-8] for
     * 8..15, v2.h[0] for 16) the two vectors lying k halfwords before and
     * after the centre.
     *
     * Result: v15.8b.  Afterwards the window is rotated one register down
     * (v6 = v7 ... v10 = v11).  Clobbers x12, x16 and v12-v15.
     */
    234 .macro hconv1_16/*{{{*/
            // Centre tap seeds the two 32-bit accumulators.
    235             umull       v14.4s, v8.4h, v0.h[0]
    236             umull2      v15.4s, v8.8h, v0.h[0]
    237 
            // Computed branch: x12 = table base + signed halfword entry number x5.
    238             adr         x16, 100f
    239             ldrsh       x12, [x16, x5, LSL #1]
    240             add         x12, x12, x16
    241             br          x12
    242    100:     .hword -4
    243             .hword 101f-100b
    244             .hword 102f-100b
    245             .hword 103f-100b
    246             .hword 104f-100b
    247             .hword 105f-100b
    248             .hword 106f-100b
    249             .hword 107f-100b
    250             .hword 108f-100b
    251             .hword 109f-100b
    252             .hword 110f-100b
    253             .hword 111f-100b
    254             .hword 112f-100b
    255             .hword 113f-100b
    256             .hword 114f-100b
    257             .hword 115f-100b
    258             .hword 116f-100b
    259             .align 4
            // Taps 16 and 8 are whole registers away from the centre, so the
            // registers are used directly and the ext lines stay commented out.
    260     116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
    261             //ext         v13.16b, v10.16b, v11.16b, #0*2
    262             umlal       v14.4s, v6.4h, v2.h[0]
    263             umlal2      v15.4s, v6.8h, v2.h[0]
    264             umlal       v14.4s, v10.4h, v2.h[0]
    265             umlal2      v15.4s, v10.8h, v2.h[0]
    266     115:    ext         v12.16b, v6.16b, v7.16b, #1*2
    267             ext         v13.16b, v9.16b, v10.16b, #7*2
    268             umlal       v14.4s, v12.4h, v1.h[7]
    269             umlal2      v15.4s, v12.8h, v1.h[7]
    270             umlal       v14.4s, v13.4h, v1.h[7]
    271             umlal2      v15.4s, v13.8h, v1.h[7]
    272     114:    ext         v12.16b, v6.16b, v7.16b, #2*2
    273             ext         v13.16b, v9.16b, v10.16b, #6*2
    274             umlal       v14.4s, v12.4h, v1.h[6]
    275             umlal2      v15.4s, v12.8h, v1.h[6]
    276             umlal       v14.4s, v13.4h, v1.h[6]
    277             umlal2      v15.4s, v13.8h, v1.h[6]
    278     113:    ext         v12.16b, v6.16b, v7.16b, #3*2
    279             ext         v13.16b, v9.16b, v10.16b, #5*2
    280             umlal       v14.4s, v12.4h, v1.h[5]
    281             umlal2      v15.4s, v12.8h, v1.h[5]
    282             umlal       v14.4s, v13.4h, v1.h[5]
    283             umlal2      v15.4s, v13.8h, v1.h[5]
            // Taps 12 and 4 sit half a register off: the high half of one register
            // and the low half of the next are fed straight to umlal2/umlal.
    284     112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
    285             //ext         v13.16b, v9.16b, v10.16b, #4*2
    286             umlal2      v14.4s, v6.8h, v1.h[4]
    287             umlal       v15.4s, v7.4h, v1.h[4]
    288             umlal2      v14.4s, v9.8h, v1.h[4]
    289             umlal       v15.4s, v10.4h, v1.h[4]
    290     111:    ext         v12.16b, v6.16b, v7.16b, #5*2
    291             ext         v13.16b, v9.16b, v10.16b, #3*2
    292             umlal       v14.4s, v12.4h, v1.h[3]
    293             umlal2      v15.4s, v12.8h, v1.h[3]
    294             umlal       v14.4s, v13.4h, v1.h[3]
    295             umlal2      v15.4s, v13.8h, v1.h[3]
    296     110:    ext         v12.16b, v6.16b, v7.16b, #6*2
    297             ext         v13.16b, v9.16b, v10.16b, #2*2
    298             umlal       v14.4s, v12.4h, v1.h[2]
    299             umlal2      v15.4s, v12.8h, v1.h[2]
    300             umlal       v14.4s, v13.4h, v1.h[2]
    301             umlal2      v15.4s, v13.8h, v1.h[2]
    302     109:    ext         v12.16b, v6.16b, v7.16b, #7*2
    303             ext         v13.16b, v9.16b, v10.16b, #1*2
    304             umlal       v14.4s, v12.4h, v1.h[1]
    305             umlal2      v15.4s, v12.8h, v1.h[1]
    306             umlal       v14.4s, v13.4h, v1.h[1]
    307             umlal2      v15.4s, v13.8h, v1.h[1]
    308     108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
    309             //ext         v13.16b, v9.16b, v10.16b, #0*2
    310             umlal       v14.4s, v7.4h, v1.h[0]
    311             umlal2      v15.4s, v7.8h, v1.h[0]
    312             umlal       v14.4s, v9.4h, v1.h[0]
    313             umlal2      v15.4s, v9.8h, v1.h[0]
    314     107:    ext         v12.16b, v7.16b, v8.16b, #1*2
    315             ext         v13.16b, v8.16b, v9.16b, #7*2
    316             umlal       v14.4s, v12.4h, v0.h[7]
    317             umlal2      v15.4s, v12.8h, v0.h[7]
    318             umlal       v14.4s, v13.4h, v0.h[7]
    319             umlal2      v15.4s, v13.8h, v0.h[7]
    320     106:    ext         v12.16b, v7.16b, v8.16b, #2*2
    321             ext         v13.16b, v8.16b, v9.16b, #6*2
    322             umlal       v14.4s, v12.4h, v0.h[6]
    323             umlal2      v15.4s, v12.8h, v0.h[6]
    324             umlal       v14.4s, v13.4h, v0.h[6]
    325             umlal2      v15.4s, v13.8h, v0.h[6]
    326     105:    ext         v12.16b, v7.16b, v8.16b, #3*2
    327             ext         v13.16b, v8.16b, v9.16b, #5*2
    328             umlal       v14.4s, v12.4h, v0.h[5]
    329             umlal2      v15.4s, v12.8h, v0.h[5]
    330             umlal       v14.4s, v13.4h, v0.h[5]
    331             umlal2      v15.4s, v13.8h, v0.h[5]
    332     104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
    333             //ext         v13.16b, v8.16b, v9.16b, #4*2
    334             umlal2      v14.4s, v7.8h, v0.h[4]
    335             umlal       v15.4s, v8.4h, v0.h[4]
    336             umlal2      v14.4s, v8.8h, v0.h[4]
    337             umlal       v15.4s, v9.4h, v0.h[4]
    338     103:    ext         v12.16b, v7.16b, v8.16b, #5*2
    339             ext         v13.16b, v8.16b, v9.16b, #3*2
    340             umlal       v14.4s, v12.4h, v0.h[3]
    341             umlal2      v15.4s, v12.8h, v0.h[3]
    342             umlal       v14.4s, v13.4h, v0.h[3]
    343             umlal2      v15.4s, v13.8h, v0.h[3]
    344     102:    ext         v12.16b, v7.16b, v8.16b, #6*2
    345             ext         v13.16b, v8.16b, v9.16b, #2*2
    346             umlal       v14.4s, v12.4h, v0.h[2]
    347             umlal2      v15.4s, v12.8h, v0.h[2]
    348             umlal       v14.4s, v13.4h, v0.h[2]
    349             umlal2      v15.4s, v13.8h, v0.h[2]
    350     101:    ext         v12.16b, v7.16b, v8.16b, #7*2
    351             ext         v13.16b, v8.16b, v9.16b, #1*2
    352             umlal       v14.4s, v12.4h, v0.h[1]
    353             umlal2      v15.4s, v12.8h, v0.h[1]
    354             umlal       v14.4s, v13.4h, v0.h[1]
    355             umlal2      v15.4s, v13.8h, v0.h[1]
    356 
            // Narrow 32 -> 16 -> 8 bits with rounding and saturation; result in v15.8b.
    357             uqrshrn     v14.4h, v14.4s, #16
    358             uqrshrn2    v14.8h, v15.4s, #16
    359             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    360 
            // Rotate the window by one register, taking in v11.
    361             mov         v6.16b, v7.16b
    362             mov         v7.16b, v8.16b
    363             mov         v8.16b, v9.16b
    364             mov         v9.16b, v10.16b
    365             mov         v10.16b, v11.16b
    366 .endm/*}}}*/
    367 
    /* hconv1_25 -- horizontal convolution step with taps 1..25 unrolled.
     *
     * Reads the window v31, v4-v11 and the radius in x5.  The centre vector is
     * not register aligned here: it starts at halfword 7 of v6 and is formed
     * with ext from v6:v7.  Every tap is therefore built with ext as well.
     *
     * The halfword table at 100: holds offsets relative to itself; x5 selects
     * the entry and control enters the tap list at label 101..125, then runs
     * the remaining taps down to 1.  Entry 0 is -4, the address of the br itself.
     * NOTE(review): that looks like a deliberate trap for r == 0, and x5 is not
     * range-checked against the table -- confirm callers guarantee 1..25.
     *
     * Tap k uses coefficient k, held in halfword k mod 8 of register v0..v3
     * numbered k / 8 (so tap 25 is v3.h[1]).
     *
     * Result: v15.8b.  Afterwards the window is rotated one register down
     * (v31 = v4, v4 = v5 ... v10 = v11).  Clobbers x12, x16 and v12-v15.
     */
    368 .macro hconv1_25/*{{{*/
            // Centre tap: extract the centre vector, then seed the accumulators.
    369             ext         v12.16b, v6.16b, v7.16b, #7*2
    370             umull       v14.4s, v12.4h, v0.h[0]
    371             umull2      v15.4s, v12.8h, v0.h[0]
    372 
            // Computed branch: x12 = table base + signed halfword entry number x5.
    373             adr         x16, 100f
    374             ldrsh       x12, [x16, x5, LSL #1]
    375             add         x12, x12, x16
    376             br          x12
    377    100:     .hword -4
    378             .hword 101f-100b
    379             .hword 102f-100b
    380             .hword 103f-100b
    381             .hword 104f-100b
    382             .hword 105f-100b
    383             .hword 106f-100b
    384             .hword 107f-100b
    385             .hword 108f-100b
    386             .hword 109f-100b
    387             .hword 110f-100b
    388             .hword 111f-100b
    389             .hword 112f-100b
    390             .hword 113f-100b
    391             .hword 114f-100b
    392             .hword 115f-100b
    393             .hword 116f-100b
    394             .hword 117f-100b
    395             .hword 118f-100b
    396             .hword 119f-100b
    397             .hword 120f-100b
    398             .hword 121f-100b
    399             .hword 122f-100b
    400             .hword 123f-100b
    401             .hword 124f-100b
    402             .hword 125f-100b
    403             .align 4
            // In each tap v12 is the vector k halfwords before the centre and v13
            // the vector k halfwords after it; both are scaled by coefficient k.
    404     125:    ext         v12.16b, v31.16b, v4.16b, #6*2
    405             ext         v13.16b, v10.16b, v11.16b, #0*2
    406             umlal       v14.4s, v12.4h, v3.h[1]
    407             umlal2      v15.4s, v12.8h, v3.h[1]
    408             umlal       v14.4s, v13.4h, v3.h[1]
    409             umlal2      v15.4s, v13.8h, v3.h[1]
    410     124:    ext         v12.16b, v31.16b, v4.16b, #7*2
    411             ext         v13.16b, v9.16b, v10.16b, #7*2
    412             umlal       v14.4s, v12.4h, v3.h[0]
    413             umlal2      v15.4s, v12.8h, v3.h[0]
    414             umlal       v14.4s, v13.4h, v3.h[0]
    415             umlal2      v15.4s, v13.8h, v3.h[0]
    416     123:    ext         v12.16b, v4.16b, v5.16b, #0*2
    417             ext         v13.16b, v9.16b, v10.16b, #6*2
    418             umlal       v14.4s, v12.4h, v2.h[7]
    419             umlal2      v15.4s, v12.8h, v2.h[7]
    420             umlal       v14.4s, v13.4h, v2.h[7]
    421             umlal2      v15.4s, v13.8h, v2.h[7]
    422     122:    ext         v12.16b, v4.16b, v5.16b, #1*2
    423             ext         v13.16b, v9.16b, v10.16b, #5*2
    424             umlal       v14.4s, v12.4h, v2.h[6]
    425             umlal2      v15.4s, v12.8h, v2.h[6]
    426             umlal       v14.4s, v13.4h, v2.h[6]
    427             umlal2      v15.4s, v13.8h, v2.h[6]
    428     121:    ext         v12.16b, v4.16b, v5.16b, #2*2
    429             ext         v13.16b, v9.16b, v10.16b, #4*2
    430             umlal       v14.4s, v12.4h, v2.h[5]
    431             umlal2      v15.4s, v12.8h, v2.h[5]
    432             umlal       v14.4s, v13.4h, v2.h[5]
    433             umlal2      v15.4s, v13.8h, v2.h[5]
    434     120:    ext         v12.16b, v4.16b, v5.16b, #3*2
    435             ext         v13.16b, v9.16b, v10.16b, #3*2
    436             umlal       v14.4s, v12.4h, v2.h[4]
    437             umlal2      v15.4s, v12.8h, v2.h[4]
    438             umlal       v14.4s, v13.4h, v2.h[4]
    439             umlal2      v15.4s, v13.8h, v2.h[4]
    440     119:    ext         v12.16b, v4.16b, v5.16b, #4*2
    441             ext         v13.16b, v9.16b, v10.16b, #2*2
    442             umlal       v14.4s, v12.4h, v2.h[3]
    443             umlal2      v15.4s, v12.8h, v2.h[3]
    444             umlal       v14.4s, v13.4h, v2.h[3]
    445             umlal2      v15.4s, v13.8h, v2.h[3]
    446     118:    ext         v12.16b, v4.16b, v5.16b, #5*2
    447             ext         v13.16b, v9.16b, v10.16b, #1*2
    448             umlal       v14.4s, v12.4h, v2.h[2]
    449             umlal2      v15.4s, v12.8h, v2.h[2]
    450             umlal       v14.4s, v13.4h, v2.h[2]
    451             umlal2      v15.4s, v13.8h, v2.h[2]
    452     117:    ext         v12.16b, v4.16b, v5.16b, #6*2
    453             ext         v13.16b, v9.16b, v10.16b, #0*2
    454             umlal       v14.4s, v12.4h, v2.h[1]
    455             umlal2      v15.4s, v12.8h, v2.h[1]
    456             umlal       v14.4s, v13.4h, v2.h[1]
    457             umlal2      v15.4s, v13.8h, v2.h[1]
    458     116:    ext         v12.16b, v4.16b, v5.16b, #7*2
    459             ext         v13.16b, v8.16b, v9.16b, #7*2
    460             umlal       v14.4s, v12.4h, v2.h[0]
    461             umlal2      v15.4s, v12.8h, v2.h[0]
    462             umlal       v14.4s, v13.4h, v2.h[0]
    463             umlal2      v15.4s, v13.8h, v2.h[0]
    464     115:    ext         v12.16b, v5.16b, v6.16b, #0*2
    465             ext         v13.16b, v8.16b, v9.16b, #6*2
    466             umlal       v14.4s, v12.4h, v1.h[7]
    467             umlal2      v15.4s, v12.8h, v1.h[7]
    468             umlal       v14.4s, v13.4h, v1.h[7]
    469             umlal2      v15.4s, v13.8h, v1.h[7]
    470     114:    ext         v12.16b, v5.16b, v6.16b, #1*2
    471             ext         v13.16b, v8.16b, v9.16b, #5*2
    472             umlal       v14.4s, v12.4h, v1.h[6]
    473             umlal2      v15.4s, v12.8h, v1.h[6]
    474             umlal       v14.4s, v13.4h, v1.h[6]
    475             umlal2      v15.4s, v13.8h, v1.h[6]
    476     113:    ext         v12.16b, v5.16b, v6.16b, #2*2
    477             ext         v13.16b, v8.16b, v9.16b, #4*2
    478             umlal       v14.4s, v12.4h, v1.h[5]
    479             umlal2      v15.4s, v12.8h, v1.h[5]
    480             umlal       v14.4s, v13.4h, v1.h[5]
    481             umlal2      v15.4s, v13.8h, v1.h[5]
    482     112:    ext         v12.16b, v5.16b, v6.16b, #3*2
    483             ext         v13.16b, v8.16b, v9.16b, #3*2
    484             umlal       v14.4s, v12.4h, v1.h[4]
    485             umlal2      v15.4s, v12.8h, v1.h[4]
    486             umlal       v14.4s, v13.4h, v1.h[4]
    487             umlal2      v15.4s, v13.8h, v1.h[4]
    488     111:    ext         v12.16b, v5.16b, v6.16b, #4*2
    489             ext         v13.16b, v8.16b, v9.16b, #2*2
    490             umlal       v14.4s, v12.4h, v1.h[3]
    491             umlal2      v15.4s, v12.8h, v1.h[3]
    492             umlal       v14.4s, v13.4h, v1.h[3]
    493             umlal2      v15.4s, v13.8h, v1.h[3]
    494     110:    ext         v12.16b, v5.16b, v6.16b, #5*2
    495             ext         v13.16b, v8.16b, v9.16b, #1*2
    496             umlal       v14.4s, v12.4h, v1.h[2]
    497             umlal2      v15.4s, v12.8h, v1.h[2]
    498             umlal       v14.4s, v13.4h, v1.h[2]
    499             umlal2      v15.4s, v13.8h, v1.h[2]
    500     109:    ext         v12.16b, v5.16b, v6.16b, #6*2
    501             ext         v13.16b, v8.16b, v9.16b, #0*2
    502             umlal       v14.4s, v12.4h, v1.h[1]
    503             umlal2      v15.4s, v12.8h, v1.h[1]
    504             umlal       v14.4s, v13.4h, v1.h[1]
    505             umlal2      v15.4s, v13.8h, v1.h[1]
    506     108:    ext         v12.16b, v5.16b, v6.16b, #7*2
    507             ext         v13.16b, v7.16b, v8.16b, #7*2
    508             umlal       v14.4s, v12.4h, v1.h[0]
    509             umlal2      v15.4s, v12.8h, v1.h[0]
    510             umlal       v14.4s, v13.4h, v1.h[0]
    511             umlal2      v15.4s, v13.8h, v1.h[0]
    512     107:    ext         v12.16b, v6.16b, v7.16b, #0*2
    513             ext         v13.16b, v7.16b, v8.16b, #6*2
    514             umlal       v14.4s, v12.4h, v0.h[7]
    515             umlal2      v15.4s, v12.8h, v0.h[7]
    516             umlal       v14.4s, v13.4h, v0.h[7]
    517             umlal2      v15.4s, v13.8h, v0.h[7]
    518     106:    ext         v12.16b, v6.16b, v7.16b, #1*2
    519             ext         v13.16b, v7.16b, v8.16b, #5*2
    520             umlal       v14.4s, v12.4h, v0.h[6]
    521             umlal2      v15.4s, v12.8h, v0.h[6]
    522             umlal       v14.4s, v13.4h, v0.h[6]
    523             umlal2      v15.4s, v13.8h, v0.h[6]
    524     105:    ext         v12.16b, v6.16b, v7.16b, #2*2
    525             ext         v13.16b, v7.16b, v8.16b, #4*2
    526             umlal       v14.4s, v12.4h, v0.h[5]
    527             umlal2      v15.4s, v12.8h, v0.h[5]
    528             umlal       v14.4s, v13.4h, v0.h[5]
    529             umlal2      v15.4s, v13.8h, v0.h[5]
    530     104:    ext         v12.16b, v6.16b, v7.16b, #3*2
    531             ext         v13.16b, v7.16b, v8.16b, #3*2
    532             umlal       v14.4s, v12.4h, v0.h[4]
    533             umlal2      v15.4s, v12.8h, v0.h[4]
    534             umlal       v14.4s, v13.4h, v0.h[4]
    535             umlal2      v15.4s, v13.8h, v0.h[4]
    536     103:    ext         v12.16b, v6.16b, v7.16b, #4*2
    537             ext         v13.16b, v7.16b, v8.16b, #2*2
    538             umlal       v14.4s, v12.4h, v0.h[3]
    539             umlal2      v15.4s, v12.8h, v0.h[3]
    540             umlal       v14.4s, v13.4h, v0.h[3]
    541             umlal2      v15.4s, v13.8h, v0.h[3]
    542     102:    ext         v12.16b, v6.16b, v7.16b, #5*2
    543             ext         v13.16b, v7.16b, v8.16b, #1*2
    544             umlal       v14.4s, v12.4h, v0.h[2]
    545             umlal2      v15.4s, v12.8h, v0.h[2]
    546             umlal       v14.4s, v13.4h, v0.h[2]
    547             umlal2      v15.4s, v13.8h, v0.h[2]
    548     101:    ext         v12.16b, v6.16b, v7.16b, #6*2
    549             ext         v13.16b, v7.16b, v8.16b, #0*2
    550             umlal       v14.4s, v12.4h, v0.h[1]
    551             umlal2      v15.4s, v12.8h, v0.h[1]
    552             umlal       v14.4s, v13.4h, v0.h[1]
    553             umlal2      v15.4s, v13.8h, v0.h[1]
    554 
            // Narrow 32 -> 16 -> 8 bits with rounding and saturation; result in v15.8b.
    555             uqrshrn     v14.4h, v14.4s, #16
    556             uqrshrn2    v14.8h, v15.4s, #16
    557             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    558 
            // Rotate the window by one register, taking in v11.
    559             mov         v31.16b, v4.16b
    560             mov         v4.16b, v5.16b
    561             mov         v5.16b, v6.16b
    562             mov         v6.16b, v7.16b
    563             mov         v7.16b, v8.16b
    564             mov         v8.16b, v9.16b
    565             mov         v9.16b, v10.16b
    566             mov         v10.16b, v11.16b
    567 .endm/*}}}*/
    568 
    /* NOTE(review): presumably the radii that have a dedicated hconv4_N variant
     * (hconv4_6 and hconv4_12 are defined in this file); the code that consumes
     * this list is not visible here -- confirm how it is expanded.
     */
    569 #define TUNED_LIST4 6, 12, 20
    /* hconv4_6 -- horizontal convolution step with taps 1..6 unrolled, where
     * successive taps are four halfwords (half a register) apart.
     * NOTE(review): presumably the four-channel counterpart of hconv1_N, with
     * one tap per four-halfword pixel -- confirm against the callers.
     *
     * Reads the window v4-v10 (centre v7) and the radius in x5.  The halfword
     * table at 100: holds offsets relative to itself; x5 selects the entry and
     * control enters the tap list at label 101..106, then runs the remaining
     * taps down to 1.  Entry 0 is -4, the address of the br itself.
     * NOTE(review): that looks like a deliberate trap for r == 0, and x5 is not
     * range-checked against the table -- confirm callers guarantee 1..6.
     *
     * Because the tap spacing is half a register no ext is needed: even taps
     * use whole registers, odd taps pair the high half of one register
     * (umlal2 into v14) with the low half of the next (umlal into v15).
     *
     * Result: v15.8b.  Afterwards the window is rotated one register down
     * (v4 = v5 ... v10 = v11).  Clobbers x12, x16, v14 and v15.
     */
    570 .macro hconv4_6/*{{{*/
            // Centre tap seeds the two 32-bit accumulators.
    571             umull       v14.4s, v7.4h, v0.h[0]
    572             umull2      v15.4s, v7.8h, v0.h[0]
    573 
            // Computed branch: x12 = table base + signed halfword entry number x5.
    574             adr         x16, 100f
    575             ldrsh       x12, [x16, x5, LSL #1]
    576             add         x12, x12, x16
    577             br          x12
    578    100:     .hword -4
    579             .hword 101f-100b
    580             .hword 102f-100b
    581             .hword 103f-100b
    582             .hword 104f-100b
    583             .hword 105f-100b
    584             .hword 106f-100b
    585             .align      4
    586     106:    umlal       v14.4s, v4.4h,  v0.h[6]
    587             umlal2      v15.4s, v4.8h,  v0.h[6]
    588             umlal       v14.4s, v10.4h, v0.h[6]
    589             umlal2      v15.4s, v10.8h, v0.h[6]
    590     105:    umlal2      v14.4s, v4.8h,  v0.h[5]
    591             umlal       v15.4s, v5.4h, v0.h[5]
    592             umlal2      v14.4s, v9.8h, v0.h[5]
    593             umlal       v15.4s, v10.4h, v0.h[5]
    594     104:    umlal       v14.4s, v5.4h, v0.h[4]
    595             umlal2      v15.4s, v5.8h, v0.h[4]
    596             umlal       v14.4s, v9.4h, v0.h[4]
    597             umlal2      v15.4s, v9.8h, v0.h[4]
    598     103:    umlal2      v14.4s, v5.8h, v0.h[3]
    599             umlal       v15.4s, v6.4h, v0.h[3]
    600             umlal2      v14.4s, v8.8h, v0.h[3]
    601             umlal       v15.4s, v9.4h, v0.h[3]
    602     102:    umlal       v14.4s, v6.4h, v0.h[2]
    603             umlal2      v15.4s, v6.8h, v0.h[2]
    604             umlal       v14.4s, v8.4h, v0.h[2]
    605             umlal2      v15.4s, v8.8h, v0.h[2]
    606     101:    umlal2      v14.4s, v6.8h, v0.h[1]
    607             umlal       v15.4s, v7.4h, v0.h[1]
    608             umlal2      v14.4s, v7.8h, v0.h[1]
    609             umlal       v15.4s, v8.4h, v0.h[1]
    610 
            // Narrow 32 -> 16 -> 8 bits with rounding and saturation; result in v15.8b.
    611             uqrshrn     v14.4h, v14.4s, #16
    612             uqrshrn2    v14.8h, v15.4s, #16
    613             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    614 
            // Rotate the window by one register, taking in v11.
    615             mov         v4.16b, v5.16b
    616             mov         v5.16b, v6.16b
    617             mov         v6.16b, v7.16b
    618             mov         v7.16b, v8.16b
    619             mov         v8.16b, v9.16b
    620             mov         v9.16b, v10.16b
    621             mov         v10.16b, v11.16b
    622 .endm/*}}}*/
    623 
    624 .macro hconv4_12/*{{{*/
                    /* Horizontal pass of the step=4 (four 16-bit lanes per pixel)
                     * variant, unrolled for radii 1..12.  Each expansion yields two
                     * output pixels: 32-bit sums for the first accumulate in v14 and
                     * for the second in v15; the narrowed 8-byte result ends up in
                     * v15.8b.
                     *
                     * Reads:   x5 (jump-table index; the register maps elsewhere in
                     *          this file call it r), v0.h[0..7] and v1.h[0..4] (one
                     *          multiplier per tap), v26-v31 and v4-v11 (window).
                     * Writes:  v14, v15, x12, x16, and every window register
                     *          (v26 <- v27 ... v10 <- v11).
                     */
                    /* Centre tap: low and high halves of v4 times v0.h[0]. */
    625             umull       v14.4s, v4.4h, v0.h[0]
    626             umull2      v15.4s, v4.8h, v0.h[0]
    627 
                    /* Computed branch: load the signed halfword at table index x5 and
                     * jump to (100 + offset), i.e. to label 1<x5>.  Execution then
                     * falls through every lower-numbered tap down to 101.
                     * Entry 0 is -4, which is the address of the br itself.
                     * NOTE(review): x5 == 0 would spin here -- presumably a zero
                     * radius is never passed; confirm against the callers.
                     */
    628             adr         x16, 100f
    629             ldrsh       x12, [x16, x5, LSL #1]
    630             add         x12, x12, x16
    631             br          x12
    632    100:     .hword -4
    633             .hword 101f-100b
    634             .hword 102f-100b
    635             .hword 103f-100b
    636             .hword 104f-100b
    637             .hword 105f-100b
    638             .hword 106f-100b
    639             .hword 107f-100b
    640             .hword 108f-100b
    641             .hword 109f-100b
    642             .hword 110f-100b
    643             .hword 111f-100b
    644             .hword 112f-100b
    645             .align 4
                    /* Tap n (label 1nn) adds, for each of the two outputs, the pixel n
                     * positions before and the pixel n positions after its centre,
                     * both scaled by the same multiplier.  Odd taps straddle register
                     * halves, hence the alternating umlal/umlal2 pattern.
                     */
    646     112:    umlal       v14.4s, v26.4h, v1.h[4]
    647             umlal2      v15.4s, v26.8h, v1.h[4]
    648             umlal       v14.4s, v10.4h, v1.h[4]
    649             umlal2      v15.4s, v10.8h, v1.h[4]
    650     111:    umlal2      v14.4s, v26.8h, v1.h[3]
    651             umlal       v15.4s, v27.4h, v1.h[3]
    652             umlal2      v14.4s, v9.8h, v1.h[3]
    653             umlal       v15.4s, v10.4h, v1.h[3]
    654     110:    umlal       v14.4s, v27.4h, v1.h[2]
    655             umlal2      v15.4s, v27.8h, v1.h[2]
    656             umlal       v14.4s, v9.4h, v1.h[2]
    657             umlal2      v15.4s, v9.8h, v1.h[2]
    658     109:    umlal2      v14.4s, v27.8h, v1.h[1]
    659             umlal       v15.4s, v28.4h, v1.h[1]
    660             umlal2      v14.4s, v8.8h, v1.h[1]
    661             umlal       v15.4s, v9.4h, v1.h[1]
    662     108:    umlal       v14.4s, v28.4h, v1.h[0]
    663             umlal2      v15.4s, v28.8h, v1.h[0]
    664             umlal       v14.4s, v8.4h, v1.h[0]
    665             umlal2      v15.4s, v8.8h, v1.h[0]
    666     107:    umlal2      v14.4s, v28.8h, v0.h[7]
    667             umlal       v15.4s, v29.4h, v0.h[7]
    668             umlal2      v14.4s, v7.8h, v0.h[7]
    669             umlal       v15.4s, v8.4h, v0.h[7]
    670     106:    umlal       v14.4s, v29.4h, v0.h[6]
    671             umlal2      v15.4s, v29.8h, v0.h[6]
    672             umlal       v14.4s, v7.4h, v0.h[6]
    673             umlal2      v15.4s, v7.8h, v0.h[6]
    674     105:    umlal2      v14.4s, v29.8h, v0.h[5]
    675             umlal       v15.4s, v30.4h, v0.h[5]
    676             umlal2      v14.4s, v6.8h, v0.h[5]
    677             umlal       v15.4s, v7.4h, v0.h[5]
    678     104:    umlal       v14.4s, v30.4h, v0.h[4]
    679             umlal2      v15.4s, v30.8h, v0.h[4]
    680             umlal       v14.4s, v6.4h, v0.h[4]
    681             umlal2      v15.4s, v6.8h, v0.h[4]
    682     103:    umlal2      v14.4s, v30.8h, v0.h[3]
    683             umlal       v15.4s, v31.4h, v0.h[3]
    684             umlal2      v14.4s, v5.8h, v0.h[3]
    685             umlal       v15.4s, v6.4h, v0.h[3]
    686     102:    umlal       v14.4s, v31.4h, v0.h[2]
    687             umlal2      v15.4s, v31.8h, v0.h[2]
    688             umlal       v14.4s, v5.4h, v0.h[2]
    689             umlal2      v15.4s, v5.8h, v0.h[2]
    690     101:    umlal2      v14.4s, v31.8h, v0.h[1]
    691             umlal       v15.4s, v4.4h,  v0.h[1]
    692             umlal2      v14.4s, v4.8h,  v0.h[1]
    693             umlal       v15.4s, v5.4h, v0.h[1]
    694 
                    /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
                     * (shift 16), then 16 -> 8 bits (shift FRACTION_BITS).
                     */
    695             uqrshrn     v14.4h, v14.4s, #16
    696             uqrshrn2    v14.8h, v15.4s, #16
    697             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    698 
                    /* Slide the window along by one register (eight halfwords). */
    699             mov         v26.16b, v27.16b
    700             mov         v27.16b, v28.16b
    701             mov         v28.16b, v29.16b
    702             mov         v29.16b, v30.16b
    703             mov         v30.16b, v31.16b
    704             mov         v31.16b, v4.16b
    705             mov         v4.16b, v5.16b
    706             mov         v5.16b, v6.16b
    707             mov         v6.16b, v7.16b
    708             mov         v7.16b, v8.16b
    709             mov         v8.16b, v9.16b
    710             mov         v9.16b, v10.16b
    711             mov         v10.16b, v11.16b
    712 .endm/*}}}*/
    713 
    714 .macro hconv4_20/*{{{*/
                    /* Horizontal pass of the step=4 (four 16-bit lanes per pixel)
                     * variant, unrolled for radii 1..20.  Each expansion yields two
                     * output pixels: sums for the first accumulate in v14, for the
                     * second in v15; the narrowed 8-byte result ends up in v15.8b.
                     *
                     * Reads:   x5 (jump-table index; the register maps elsewhere in
                     *          this file call it r), v0.h[0..7], v1.h[0..7] and
                     *          v2.h[0..4] (one multiplier per tap), v18-v31 and
                     *          v4-v11 (window; the centre pixel pair lives in v28).
                     * Writes:  v14, v15, x12, x16, and every window register
                     *          (v18 <- v19 ... v10 <- v11).
                     */
                    /* Centre tap: low and high halves of v28 times v0.h[0]. */
    715             umull       v14.4s, v28.4h, v0.h[0]
    716             umull2      v15.4s, v28.8h, v0.h[0]
    717 
                    /* Computed branch into the unrolled chain: jump to 100 plus the
                     * signed halfword at table index x5, i.e. label 1<x5>, then fall
                     * through all lower taps.  Entry 0 (-4) is the address of the br
                     * itself.  NOTE(review): presumably x5 == 0 never reaches this
                     * macro -- confirm.
                     */
    718             adr         x16, 100f
    719             ldrsh       x12, [x16, x5, LSL #1]
    720             add         x12, x12, x16
    721             br          x12
    722    100:     .hword -4
    723             .hword 101f-100b
    724             .hword 102f-100b
    725             .hword 103f-100b
    726             .hword 104f-100b
    727             .hword 105f-100b
    728             .hword 106f-100b
    729             .hword 107f-100b
    730             .hword 108f-100b
    731             .hword 109f-100b
    732             .hword 110f-100b
    733             .hword 111f-100b
    734             .hword 112f-100b
    735             .hword 113f-100b
    736             .hword 114f-100b
    737             .hword 115f-100b
    738             .hword 116f-100b
    739             .hword 117f-100b
    740             .hword 118f-100b
    741             .hword 119f-100b
    742             .hword 120f-100b
    743             .align 4
    744 
                    /* Tap n (label 1nn) adds, for each of the two outputs, the pixel n
                     * positions before and the pixel n positions after its centre,
                     * both scaled by the same multiplier.
                     */
    745     120:    umlal       v14.4s, v18.4h, v2.h[4]
    746             umlal2      v15.4s, v18.8h, v2.h[4]
    747             umlal       v14.4s, v10.4h, v2.h[4]
    748             umlal2      v15.4s, v10.8h, v2.h[4]
    749     119:    umlal2      v14.4s, v18.8h, v2.h[3]
    750             umlal       v15.4s, v19.4h, v2.h[3]
    751             umlal2      v14.4s, v9.8h,  v2.h[3]
    752             umlal       v15.4s, v10.4h, v2.h[3]
    753     118:    umlal       v14.4s, v19.4h, v2.h[2]
    754             umlal2      v15.4s, v19.8h, v2.h[2]
    755             umlal       v14.4s, v9.4h,  v2.h[2]
    756             umlal2      v15.4s, v9.8h,  v2.h[2]
    757     117:    umlal2      v14.4s, v19.8h, v2.h[1]
    758             umlal       v15.4s, v20.4h, v2.h[1]
    759             umlal2      v14.4s, v8.8h,  v2.h[1]
    760             umlal       v15.4s, v9.4h,  v2.h[1]
    761     116:    umlal       v14.4s, v20.4h, v2.h[0]
    762             umlal2      v15.4s, v20.8h, v2.h[0]
    763             umlal       v14.4s, v8.4h,  v2.h[0]
    764             umlal2      v15.4s, v8.8h,  v2.h[0]
    765     115:    umlal2      v14.4s, v20.8h, v1.h[7]
    766             umlal       v15.4s, v21.4h, v1.h[7]
    767             umlal2      v14.4s, v7.8h,  v1.h[7]
    768             umlal       v15.4s, v8.4h,  v1.h[7]
    769     114:    umlal       v14.4s, v21.4h, v1.h[6]
    770             umlal2      v15.4s, v21.8h, v1.h[6]
    771             umlal       v14.4s, v7.4h,  v1.h[6]
    772             umlal2      v15.4s, v7.8h,  v1.h[6]
    773     113:    umlal2      v14.4s, v21.8h, v1.h[5]
    774             umlal       v15.4s, v22.4h, v1.h[5]
    775             umlal2      v14.4s, v6.8h,  v1.h[5]
    776             umlal       v15.4s, v7.4h,  v1.h[5]
    777     112:    umlal       v14.4s, v22.4h, v1.h[4]
    778             umlal2      v15.4s, v22.8h, v1.h[4]
    779             umlal       v14.4s, v6.4h,  v1.h[4]
    780             umlal2      v15.4s, v6.8h,  v1.h[4]
    781     111:    umlal2      v14.4s, v22.8h, v1.h[3]
    782             umlal       v15.4s, v23.4h, v1.h[3]
    783             umlal2      v14.4s, v5.8h,  v1.h[3]
    784             umlal       v15.4s, v6.4h,  v1.h[3]
    785     110:    umlal       v14.4s, v23.4h, v1.h[2]
    786             umlal2      v15.4s, v23.8h, v1.h[2]
    787             umlal       v14.4s, v5.4h,  v1.h[2]
    788             umlal2      v15.4s, v5.8h,  v1.h[2]
    789     109:    umlal2      v14.4s, v23.8h, v1.h[1]
    790             umlal       v15.4s, v24.4h, v1.h[1]
    791             umlal2      v14.4s, v4.8h,  v1.h[1]
    792             umlal       v15.4s, v5.4h,  v1.h[1]
    793     108:    umlal       v14.4s, v24.4h, v1.h[0]
    794             umlal2      v15.4s, v24.8h, v1.h[0]
    795             umlal       v14.4s, v4.4h,  v1.h[0]
    796             umlal2      v15.4s, v4.8h,  v1.h[0]
    797     107:    umlal2      v14.4s, v24.8h, v0.h[7]
    798             umlal       v15.4s, v25.4h, v0.h[7]
    799             umlal2      v14.4s, v31.8h, v0.h[7]
    800             umlal       v15.4s, v4.4h,  v0.h[7]
    801     106:    umlal       v14.4s, v25.4h, v0.h[6]
    802             umlal2      v15.4s, v25.8h, v0.h[6]
    803             umlal       v14.4s, v31.4h, v0.h[6]
    804             umlal2      v15.4s, v31.8h, v0.h[6]
    805     105:    umlal2      v14.4s, v25.8h, v0.h[5]
    806             umlal       v15.4s, v26.4h, v0.h[5]
    807             umlal2      v14.4s, v30.8h, v0.h[5]
    808             umlal       v15.4s, v31.4h, v0.h[5]
    809     104:    umlal       v14.4s, v26.4h, v0.h[4]
    810             umlal2      v15.4s, v26.8h, v0.h[4]
    811             umlal       v14.4s, v30.4h, v0.h[4]
    812             umlal2      v15.4s, v30.8h, v0.h[4]
    813     103:    umlal2      v14.4s, v26.8h, v0.h[3]
    814             umlal       v15.4s, v27.4h, v0.h[3]
    815             umlal2      v14.4s, v29.8h, v0.h[3]
    816             umlal       v15.4s, v30.4h, v0.h[3]
    817     102:    umlal       v14.4s, v27.4h, v0.h[2]
    818             umlal2      v15.4s, v27.8h, v0.h[2]
    819             umlal       v14.4s, v29.4h, v0.h[2]
    820             umlal2      v15.4s, v29.8h, v0.h[2]
    821     101:    umlal2      v14.4s, v27.8h, v0.h[1]
    822             umlal       v15.4s, v28.4h, v0.h[1]
    823             umlal2      v14.4s, v28.8h, v0.h[1]
    824             umlal       v15.4s, v29.4h, v0.h[1]
    825 
                    /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
                     * (shift 16), then 16 -> 8 bits (shift FRACTION_BITS).
                     */
    826             uqrshrn     v14.4h, v14.4s, #16
    827             uqrshrn2    v14.8h, v15.4s, #16
    828             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
    829 
                    /* Slide the window along by one register (eight halfwords). */
    830             mov         v18.16b, v19.16b
    831             mov         v19.16b, v20.16b
    832             mov         v20.16b, v21.16b
    833             mov         v21.16b, v22.16b
    834             mov         v22.16b, v23.16b
    835             mov         v23.16b, v24.16b
    836             mov         v24.16b, v25.16b
    837             mov         v25.16b, v26.16b
    838             mov         v26.16b, v27.16b
    839             mov         v27.16b, v28.16b
    840             mov         v28.16b, v29.16b
    841             mov         v29.16b, v30.16b
    842             mov         v30.16b, v31.16b
    843             mov         v31.16b, v4.16b
    844             mov         v4.16b, v5.16b
    845             mov         v5.16b, v6.16b
    846             mov         v6.16b, v7.16b
    847             mov         v7.16b, v8.16b
    848             mov         v8.16b, v9.16b
    849             mov         v9.16b, v10.16b
    850             mov         v10.16b, v11.16b
    851 .endm/*}}}*/
    852 
    853 .macro hconv4_25/*{{{*/
                    /* Horizontal pass of the step=4 (four 16-bit lanes per pixel)
                     * variant, unrolled for radii 1..25.  Each expansion yields two
                     * output pixels: sums for the first accumulate in v14, for the
                     * second in v15; the narrowed 8-byte result ends up in v15.8b.
                     *
                     * The window does not fit in registers at this size: the oldest
                     * 64 bytes are read back from memory at x9, and bit 6 of every
                     * derived address is cleared so that accesses wrap.  This relies
                     * on x9 pointing into the 64-byte region whose base has its low
                     * seven bits clear, as convolve4_* arranges.
                     *
                     * Reads:   x5 (jump-table index; the register maps elsewhere in
                     *          this file call it r), x9, v0, v1, v2, v3.h[0..1]
                     *          (one multiplier per tap), v17-v31 and v4-v11.
                     * Writes:  v12, v13, v14, v15, x12, x16, 16 bytes at [x9], x9
                     *          (advanced by 16 and wrapped), and every window
                     *          register (v17 <- v18 ... v10 <- v11).
                     */
                    /* Centre tap: here the two centre pixels straddle registers --
                     * the high half of v25 and the low half of v26.
                     */
    854             umull2      v14.4s, v25.8h, v0.h[0]
    855             umull       v15.4s, v26.4h, v0.h[0]
    856 
                    /* Computed branch into the unrolled chain: jump to 100 plus the
                     * signed halfword at table index x5, i.e. label 1<x5>, then fall
                     * through all lower taps.  Entry 0 (-4) is the address of the br
                     * itself.  NOTE(review): presumably x5 == 0 never reaches this
                     * macro -- confirm.
                     */
    857             adr         x16, 100f
    858             ldrsh       x12, [x16, x5, LSL #1]
    859             add         x12, x12, x16
    860             br          x12
    861    100:     .hword -4
    862             .hword 101f-100b
    863             .hword 102f-100b
    864             .hword 103f-100b
    865             .hword 104f-100b
    866             .hword 105f-100b
    867             .hword 106f-100b
    868             .hword 107f-100b
    869             .hword 108f-100b
    870             .hword 109f-100b
    871             .hword 110f-100b
    872             .hword 111f-100b
    873             .hword 112f-100b
    874             .hword 113f-100b
    875             .hword 114f-100b
    876             .hword 115f-100b
    877             .hword 116f-100b
    878             .hword 117f-100b
    879             .hword 118f-100b
    880             .hword 119f-100b
    881             .hword 120f-100b
    882             .hword 121f-100b
    883             .hword 122f-100b
    884             .hword 123f-100b
    885             .hword 124f-100b
    886             .hword 125f-100b
    887             .align 4
    888 
                    /* Taps 25..18 take their left-hand operands from memory.  Odd
                     * taps read one aligned 16-byte slot into v12.  Even taps need
                     * the upper half of one slot and the lower half of the next, so
                     * they load two 8-byte halves (v12, v13), re-wrapping the address
                     * between the loads.
                     */
    889     125:    ld1         {v12.8h}, [x9]
    890             umlal       v14.4s, v12.4h, v3.h[1]
    891             umlal2      v15.4s, v12.8h, v3.h[1]
    892             umlal       v14.4s, v10.4h, v3.h[1]
    893             umlal2      v15.4s, v10.8h, v3.h[1]
    894     124:    add         x12, x9, #0x08
    895             bic         x12, x12, #0x40
    896             ld1         {v12.4h}, [x12], #8
    897             bic         x12, x12, #0x40
    898             ld1         {v13.4h}, [x12]
    899             umlal       v14.4s, v12.4h, v3.h[0]
    900             umlal       v15.4s, v13.4h, v3.h[0]
    901             umlal2      v14.4s, v9.8h,  v3.h[0]
    902             umlal       v15.4s, v10.4h, v3.h[0]
    903     123:    add         x12, x9, #0x10
    904             bic         x12, x12, #0x40
    905             ld1         {v12.8h}, [x12]
    906             umlal       v14.4s, v12.4h, v2.h[7]
    907             umlal2      v15.4s, v12.8h, v2.h[7]
    908             umlal       v14.4s, v9.4h,  v2.h[7]
    909             umlal2      v15.4s, v9.8h,  v2.h[7]
    910     122:    add         x12, x9, #0x18
    911             bic         x12, x12, #0x40
    912             ld1         {v12.4h}, [x12], #8
    913             bic         x12, x12, #0x40
    914             ld1         {v13.4h}, [x12]
    915             umlal       v14.4s, v12.4h, v2.h[6]
    916             umlal       v15.4s, v13.4h, v2.h[6]
    917             umlal2      v14.4s, v8.8h,  v2.h[6]
    918             umlal       v15.4s, v9.4h,  v2.h[6]
    919     121:    add         x12, x9, #0x20
    920             bic         x12, x12, #0x40
    921             ld1         {v12.8h}, [x12]
    922             umlal       v14.4s, v12.4h, v2.h[5]
    923             umlal2      v15.4s, v12.8h, v2.h[5]
    924             umlal       v14.4s, v8.4h,  v2.h[5]
    925             umlal2      v15.4s, v8.8h,  v2.h[5]
    926     120:    add         x12, x9, #0x28
    927             bic         x12, x12, #0x40
    928             ld1         {v12.4h}, [x12], #8
    929             bic         x12, x12, #0x40
    930             ld1         {v13.4h}, [x12]
    931             umlal       v14.4s, v12.4h, v2.h[4]
    932             umlal       v15.4s, v13.4h, v2.h[4]
    933             umlal2      v14.4s, v7.8h,  v2.h[4]
    934             umlal       v15.4s, v8.4h,  v2.h[4]
    935     119:    add         x12, x9, #0x30
    936             bic         x12, x12, #0x40
    937             ld1         {v12.8h}, [x12]
    938             umlal       v14.4s, v12.4h, v2.h[3]
    939             umlal2      v15.4s, v12.8h, v2.h[3]
    940             umlal       v14.4s, v7.4h,  v2.h[3]
    941             umlal2      v15.4s, v7.8h,  v2.h[3]
                    /* Tap 18 needs only the last 8 bytes from memory; its neighbour
                     * for the second output is already in the low half of v17.
                     */
    942     118:    add         x12, x9, #0x38
    943             bic         x12, x12, #0x40
    944             ld1         {v12.4h}, [x12]
    945             umlal       v14.4s, v12.4h, v2.h[2]
    946             umlal       v15.4s, v17.4h, v2.h[2]
    947             umlal2      v14.4s, v6.8h,  v2.h[2]
    948             umlal       v15.4s, v7.4h,  v2.h[2]
                    /* Taps 17..1 work entirely from registers. */
    949     117:    umlal       v14.4s, v17.4h, v2.h[1]
    950             umlal2      v15.4s, v17.8h, v2.h[1]
    951             umlal       v14.4s, v6.4h,  v2.h[1]
    952             umlal2      v15.4s, v6.8h,  v2.h[1]
    953     116:    umlal2      v14.4s, v17.8h, v2.h[0]
    954             umlal       v15.4s, v18.4h, v2.h[0]
    955             umlal2      v14.4s, v5.8h,  v2.h[0]
    956             umlal       v15.4s, v6.4h,  v2.h[0]
    957     115:    umlal       v14.4s, v18.4h, v1.h[7]
    958             umlal2      v15.4s, v18.8h, v1.h[7]
    959             umlal       v14.4s, v5.4h,  v1.h[7]
    960             umlal2      v15.4s, v5.8h,  v1.h[7]
    961     114:    umlal2      v14.4s, v18.8h, v1.h[6]
    962             umlal       v15.4s, v19.4h, v1.h[6]
    963             umlal2      v14.4s, v4.8h,  v1.h[6]
    964             umlal       v15.4s, v5.4h,  v1.h[6]
    965     113:    umlal       v14.4s, v19.4h, v1.h[5]
    966             umlal2      v15.4s, v19.8h, v1.h[5]
    967             umlal       v14.4s, v4.4h,  v1.h[5]
    968             umlal2      v15.4s, v4.8h,  v1.h[5]
    969     112:    umlal2      v14.4s, v19.8h, v1.h[4]
    970             umlal       v15.4s, v20.4h, v1.h[4]
    971             umlal2      v14.4s, v31.8h, v1.h[4]
    972             umlal       v15.4s, v4.4h,  v1.h[4]
    973     111:    umlal       v14.4s, v20.4h, v1.h[3]
    974             umlal2      v15.4s, v20.8h, v1.h[3]
    975             umlal       v14.4s, v31.4h, v1.h[3]
    976             umlal2      v15.4s, v31.8h, v1.h[3]
    977     110:    umlal2      v14.4s, v20.8h, v1.h[2]
    978             umlal       v15.4s, v21.4h, v1.h[2]
    979             umlal2      v14.4s, v30.8h, v1.h[2]
    980             umlal       v15.4s, v31.4h, v1.h[2]
    981     109:    umlal       v14.4s, v21.4h, v1.h[1]
    982             umlal2      v15.4s, v21.8h, v1.h[1]
    983             umlal       v14.4s, v30.4h, v1.h[1]
    984             umlal2      v15.4s, v30.8h, v1.h[1]
    985     108:    umlal2      v14.4s, v21.8h, v1.h[0]
    986             umlal       v15.4s, v22.4h, v1.h[0]
    987             umlal2      v14.4s, v29.8h, v1.h[0]
    988             umlal       v15.4s, v30.4h, v1.h[0]
    989     107:    umlal       v14.4s, v22.4h, v0.h[7]
    990             umlal2      v15.4s, v22.8h, v0.h[7]
    991             umlal       v14.4s, v29.4h, v0.h[7]
    992             umlal2      v15.4s, v29.8h, v0.h[7]
    993     106:    umlal2      v14.4s, v22.8h, v0.h[6]
    994             umlal       v15.4s, v23.4h, v0.h[6]
    995             umlal2      v14.4s, v28.8h, v0.h[6]
    996             umlal       v15.4s, v29.4h, v0.h[6]
    997     105:    umlal       v14.4s, v23.4h, v0.h[5]
    998             umlal2      v15.4s, v23.8h, v0.h[5]
    999             umlal       v14.4s, v28.4h, v0.h[5]
   1000             umlal2      v15.4s, v28.8h, v0.h[5]
   1001     104:    umlal2      v14.4s, v23.8h, v0.h[4]
   1002             umlal       v15.4s, v24.4h, v0.h[4]
   1003             umlal2      v14.4s, v27.8h, v0.h[4]
   1004             umlal       v15.4s, v28.4h, v0.h[4]
   1005     103:    umlal       v14.4s, v24.4h, v0.h[3]
   1006             umlal2      v15.4s, v24.8h, v0.h[3]
   1007             umlal       v14.4s, v27.4h, v0.h[3]
   1008             umlal2      v15.4s, v27.8h, v0.h[3]
   1009     102:    umlal2      v14.4s, v24.8h, v0.h[2]
   1010             umlal       v15.4s, v25.4h, v0.h[2]
   1011             umlal2      v14.4s, v26.8h, v0.h[2]
   1012             umlal       v15.4s, v27.4h, v0.h[2]
   1013     101:    umlal       v14.4s, v25.4h, v0.h[1]
   1014             umlal2      v15.4s, v25.8h, v0.h[1]
   1015             umlal       v14.4s, v26.4h, v0.h[1]
   1016             umlal2      v15.4s, v26.8h, v0.h[1]
   1017 
                    /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
                     * (shift 16), then 16 -> 8 bits (shift FRACTION_BITS).
                     */
   1018             uqrshrn     v14.4h, v14.4s, #16
   1019             uqrshrn2    v14.8h, v15.4s, #16
   1020             uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
   1021 
                    /* Retire the oldest register into the ring at x9 (overwriting the
                     * slot that has just aged out), advance and wrap x9, then slide
                     * the register part of the window along by one.
                     */
   1022             st1         {v17.16b}, [x9], #16
   1023             bic         x9, x9, #0x40
   1024             mov         v17.16b, v18.16b
   1025             mov         v18.16b, v19.16b
   1026             mov         v19.16b, v20.16b
   1027             mov         v20.16b, v21.16b
   1028             mov         v21.16b, v22.16b
   1029             mov         v22.16b, v23.16b
   1030             mov         v23.16b, v24.16b
   1031             mov         v24.16b, v25.16b
   1032             mov         v25.16b, v26.16b
   1033             mov         v26.16b, v27.16b
   1034             mov         v27.16b, v28.16b
   1035             mov         v28.16b, v29.16b
   1036             mov         v29.16b, v30.16b
   1037             mov         v30.16b, v31.16b
   1038             mov         v31.16b, v4.16b
   1039             mov         v4.16b, v5.16b
   1040             mov         v5.16b, v6.16b
   1041             mov         v6.16b, v7.16b
   1042             mov         v7.16b, v8.16b
   1043             mov         v8.16b, v9.16b
   1044             mov         v9.16b, v10.16b
   1045             mov         v10.16b, v11.16b
   1046 .endm/*}}}*/
   1047 
   1048 /* Dedicated function wrapper for the fetch macro, for the cases where
   1049  * performance isn't that important, to keep code size down.
   1050  */
   1051 PRIVATE(fetch_generic_asm)
                    /* Out-of-line copy of the fetch macro with default arguments,
                     * reached with bl from prefetch and mainloop.  x10/x11 are saved
                     * and restored around the expansion, so callers see them
                     * unchanged.  NOTE(review): presumably fetch uses them as
                     * scratch; its definition is outside this section -- confirm.
                     * The 16-byte push keeps sp 16-byte aligned.
                     */
   1052             stp         x10, x11, [sp, #-16]!
   1053             fetch
   1054             ldp         x10, x11, [sp], #16
   1055             ret
   1056 END(fetch_generic_asm)
   1057 
   1058 /* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
   1059  * across to fill the rest of the register pair.  Used for filling the right
   1060  * hand edge of the window when starting too close to the right hand edge of
   1061  * the image.
   1062  * Also returns a dup-ed copy of the last element in v12 for the tail-fill
   1063  * case (this happens incidentally in common path, but must be done
   1064  * deliberately in the fast-out path).
   1065  */
   1066 PRIVATE(prefetch_clampright1)
                    /* In:   v10, v11 (sixteen halfwords), x11 (stop index).
                     * Out:  v10, v11 with element (x11 & 15) - 1 replicated over all
                     *       later elements; v12 = that element in every lane.
                     *       When (x11 & 15) == 0, v10/v11 are left untouched and
                     *       v12 = v11.h[7] in every lane.
                     * Clobbers: x12, flags, 64 bytes of scratch below sp.
                     */
   1067             ands        x12, x11, #15
   1068             beq         1f
   1069             sub         x12, x12, #1
                    /* Spill the pair, then overwrite from the chosen halfword onwards
                     * with 32 bytes of its broadcast.  The furthest write ends at
                     * byte 14*2 + 32 = 60, inside the 64-byte scratch area.
                     */
   1070             sub         sp, sp, #64
   1071             st1         {v10.8h,v11.8h}, [sp]
   1072             add         x12, sp, x12, LSL #1
   1073             ld1r        {v12.8h}, [x12]
   1074             st1         {v12.8h}, [x12], #16
   1075             st1         {v12.8h}, [x12]
   1076             ld1         {v10.8h,v11.8h}, [sp]
   1077             add         sp, sp, #64
   1078             ret
                    /* Fast-out path: nothing to fill, but v12 must still be set. */
   1079 1:          dup         v12.8h, v11.h[7]
   1080             ret
   1081 END(prefetch_clampright1)
   1082 
   1083 PRIVATE(prefetch_clampright4)
                    /* Four-halfword-per-pixel counterpart of prefetch_clampright1.
                     * In:   v10, v11 (sixteen halfwords), x11 (stop index).
                     * Out:  v10, v11 with the 8-byte group starting at halfword
                     *       (x11 & 15) - 4 replicated over everything after it;
                     *       v12 = that group in both 64-bit lanes.
                     *       When (x11 & 15) == 0, v10/v11 are left untouched and
                     *       v12 = v11.d[1] in both lanes.
                     * Clobbers: x12, flags, 64 bytes of scratch below sp.
                     * NOTE(review): assumes (x11 & 15) is 0 or at least 4; a value of
                     * 1..3 would index below sp -- confirm x11 is always a multiple
                     * of 4 on this path.
                     */
   1084             ands        x12, x11, #15
   1085             beq         1f
   1086             sub         x12, x12, #4
                    /* Spill, broadcast the selected 8 bytes over the following
                     * 32 bytes of the spill area, and reload.
                     */
   1087             sub         sp, sp, #64
   1088             st1         {v10.8h,v11.8h}, [sp]
   1089             add         x12, sp, x12, LSL #1
   1090             ld1r        {v12.2d}, [x12]
   1091             st1         {v12.8h}, [x12], #16
   1092             st1         {v12.8h}, [x12]
   1093             ld1         {v10.8h,v11.8h}, [sp]
   1094             add         sp, sp, #64
   1095             ret
                    /* Fast-out path: nothing to fill, but v12 must still be set. */
   1096 1:          dup         v12.2d, v11.d[1]
   1097             ret
   1098 END(prefetch_clampright4)
   1099 
   1100 
   1101 /* Helpers for prefetch, below.
   1102  */
   1103 .macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
  /* Deliver one 32-byte chunk (source registers \qsa, \qsb) to its place in
   * the window.  \store selects the destination:
   *   2 -- both halves go to memory at [x9], advancing x9 by 32;
   *   1 -- x9 is wrapped (bit 6 cleared), \qsa goes to [x9] (x9 += 16) and
   *        \qsb is copied into register \qb;
   *   0 -- \qsa and \qsb are copied into registers \qa and \qb.
   * Any other value emits nothing.  \qsb_hi is accepted but not referenced
   * in this body.
   */
   1104   .if \store == 2
    /* Identical source names are stored one at a time.  NOTE(review):
     * presumably because a two-register st1 list cannot name the same
     * register twice -- confirm.
     */
   1105     .ifc \qsa,\qsb
   1106             st1         {\qsa}, [x9], #16
   1107             st1         {\qsb}, [x9], #16
   1108     .else
   1109             st1         {\qsa,\qsb}, [x9], #32
   1110     .endif
   1111   .elseif \store == 1
   1112             bic         x9, x9, #0x40
   1113             st1         {\qsa}, [x9], #16
   1114             mov         \qb, \qsb
   1115   .elseif \store == 0
   1116             mov         \qa, \qsa
   1117             mov         \qb, \qsb
   1118   .endif
   1119 .endm
   1120 
   1121 .macro prefetch_one  qa, qb, rem, c, store=0, step=1
/* Fill one 16-halfword slot of the window.  The slot starts at index
 * i = (need - 16) - \rem, where `need` is the symbol set by the prefetch
 * macro; slots with i < 0 emit no code at all.  \c is accepted but not
 * referenced in this body.
 *
 * Successive expansions are chained through the numeric labels: every
 * forward reference (1f, 2f, 4f) resolves into the NEXT expansion, and the
 * expansion with \rem == 0 defines all four labels on a closing nop.  The
 * macro must therefore be invoked with strictly decreasing \rem, ending
 * with 0.
 *   1: slot lies wholly before the fill start (x10) -> emit the left-hand
 *      pad held in v9, continue at the next slot's 1.
 *   2: slot lies wholly before the fill stop (x11)  -> emit v10/v11, fetch
 *      the next 16 columns, continue at the next slot's 2.
 *   3: slot contains the fill stop -> clamp v10/v11 to the right-hand edge,
 *      emit them, and leave through 4.
 *   4: `b 4f+4` lands one instruction past the next slot's label 4, i.e. on
 *      its v12 pad output, so every remaining slot is filled with the pad
 *      value.  In the final expansion 4f+4 is the instruction after the nop.
 */
   1122 .set i, (need - 16) - \rem
   1123 .if i >= 0
   1124 1:          cmp         x10, #i+16
   1125             blo         2f
   1126             prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
   1127             b           1f
   1128 2:          cmp         x11, #i+16
   1129             bls         3f
   1130             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
   1131             bl          fetch_generic_asm
   1132             b           2f
   1133 3:          bl          prefetch_clampright\step
   1134             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
   1135 4:          b           4f+4
   1136            //v12 contains pad word from prefetch_clampright call
   1137             prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
   1138   .if \rem > 0
   1139             b           4f+4
   1140   .else
   1141 1:
   1142 2:
   1143 3:
   1144 4:          nop
   1145   .endif
   1146 .endif
   1147 .endm
   1148 
   1149 /* Fill the convolution window with context data.  The aim here is to load
   1150  * exactly rlf + rrt columns, and in the main loop to read as many columns as
   1151  * will be written.  This is complicated by the need to handle cases when the
   1152  * input starts very close to the left or right (or both) edges of the image,
   1153  * and where these do not fall on 16-byte boundaries.
   1154  *
   1155  * Input:
   1156  *      x1 -- src
   1157  *      x2 -- pitch
   1158  *      x3 -- count
   1159  *      x4 -- inlen
   1160  *      x5 -- r
   1161  *      x6 -- rup
   1162  *      x7 -- rdn
   1163  *      x8 -- rlf
   1164  *      x9 -- buffer (if needed)
   1165  *      x13 = -pitch
   1166  *      x15 = top-row in
   1167  *      x19 = bottom-row in
   1168  * Output:
   1169  *      x1 += rlf + min(count, rrt)
   1170  * Modifies:
   1171  *      x10 -- fill start index in the window
   1172  *      x11 -- fill stop index in the window
   1173  *      x12 -- scratch
   1174  */
   1175 .macro prefetch step=1, max_r=25
/* `need` is the window size in halfwords: 2 * max_r * step, rounded up to a
 * multiple of 16.  prefetch_one reads this symbol.
 */
   1176 .set need, ((\max_r + \max_r) * \step + 15) & ~15
  /* x10 = index in the window where real data starts:
   * need - max_r*step - rlf (rlf scaled by 4 when step != 1).
   */
   1177   .if \step == 1
   1178             mov         x10, #need - (\max_r * \step)
   1179             sub         x10, x10, x8
   1180   .else
   1181             mov         x10, #need - (\max_r * \step)
   1182             sub         x10, x10, x8, LSL #2
   1183   .endif
            /* x11 = min(x10 + inlen, need): index where real data stops. */
   1184             add         x11, x10, x4
   1185             subs        x11, x11, #need
   1186             csel        x11, xzr, x11, hi
   1187             add         x11, x11, #need
   1188 
            /* Fetch the first 16 columns and build the left-hand pad in v9 by
             * replicating the first element (first four halfwords if step != 1).
             */
   1189             bl          fetch_generic_asm
   1190   .if \step == 1
   1191             dup         v9.8h, v10.h[0]
   1192   .else
   1193             dup         v9.2d, v10.d[0]
   1194   .endif
            /* If the start index is not a multiple of 16, shift v10/v11 right by
             * (x10 & 15) halfwords, pulling pad values in from the left.  This is
             * done through the stack: two copies of v9 are laid out directly
             * below the spilled pair and the pair is reloaded from an address
             * 2 * (x10 & 15) bytes lower.
             */
   1195             ands        x12, x10, #15
   1196             beq         2f
   1197             sub         sp, sp, #32
   1198             st1         {v10.8h,v11.8h}, [sp]
   1199             sub         x12, sp, x12, LSL #1
   1200             sub         sp, sp, #16
   1201             st1         {v9.8h}, [sp]
   1202             sub         sp, sp, #16
   1203             st1         {v9.8h}, [sp]
   1204             ld1         {v10.8h,v11.8h}, [x12]
   1205             add         sp, sp, #64
            /* Net effect of the next seven instructions: the three source
             * pointers move back by (x10 & 15) and x10 is rounded down to a
             * multiple of 16.
             */
   1206             sub         x1, x1, x10
   1207             sub         x15, x15, x10
   1208             sub         x19, x19, x10
   1209             bic         x10, x10, #15
   1210             add         x1, x1, x10
   1211             add         x15, x15, x10
   1212             add         x19, x19, x10
   1213 2:
            /* Fill the window slot by slot, oldest first.  The \rem values must
             * stay in strictly decreasing order and end with 0, because the
             * prefetch_one expansions are chained through their local labels.
             */
   1214   .if \step > 1
   1215             /* it's only in the uchar2 and uchar4 cases where the register file
   1216              * is insufficient (given MAX_R <= 25).
   1217              */
   1218             prefetch_one xx, xx, 192, c=\max_r, step=\step, store=2
   1219             prefetch_one xx, xx, 176, c=\max_r, step=\step, store=2
   1220             prefetch_one xx,      v17.16b, 160, c=\max_r, step=\step, store=1
   1221             prefetch_one v18.16b, v19.16b, 144, c=\max_r, step=\step, store=0
   1222             prefetch_one v20.16b, v21.16b, 128, c=\max_r, step=\step, store=0
   1223             prefetch_one v22.16b, v23.16b, 112, c=\max_r, step=\step, store=0
   1224             prefetch_one v24.16b, v25.16b,  96, c=\max_r, step=\step, store=0
   1225             prefetch_one v26.16b, v27.16b,  80, c=\max_r, step=\step, store=0
   1226             prefetch_one v28.16b, v29.16b,  64, c=\max_r, step=\step, store=0
   1227   .endif
   1228             prefetch_one v30.16b, v31.16b,  48, c=\max_r, step=\step, store=0
   1229             prefetch_one v4.16b,  v5.16b,   32, c=\max_r, step=\step, store=0
   1230             prefetch_one v6.16b,  v7.16b,   16, c=\max_r, step=\step, store=0
   1231             prefetch_one v8.16b,  v9.16b,    0, c=\max_r, step=\step, store=0
   1232 
            /* Account for what was consumed: inlen -= rlf (scaled by 4 when
             * step != 1) + max_r*step, clamped at zero.  x10 is reused as
             * scratch here.
             */
   1233   .if \step == 1
   1234             add         x10, x8, #\max_r * \step
   1235   .else
   1236             lsl         x10, x8, #2
   1237             add         x10, x10, #\max_r * \step
   1238   .endif
   1239             subs        x4, x4, x10
   1240             csel        x4, xzr, x4, lo
   1241 .endm
   1242 
   1243 /* The main loop.
   1244  *
   1245  * Input:
   1246  *      x0 = dst
   1247  *      x1 = src
   1248  *      x2 = pitch
   1249  *      x3 = count
   1250  *      x4 = inlen
   1251  *      x5 = r
   1252  *      x6 = rup
   1253  *      x7 = rdn
   1254  *      x9 = buffer
   1255  *      x13 = -pitch
   1256  *      x15 = top-row in
   1257  *      x19 = bottom-row in
   1258  * Modifies
   1259  *      x8 = fetch code pointer
   1260  */
   1261 .macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            /* Choose the entry point into the fetch code and keep it in x8.
             * Default: \labelnc - 40*r (r<<5 plus r<<3).  This is kept only when
             * r == rup and r == rdn (the ccmp forces "not equal" when the first
             * compare fails).
             * NOTE(review): the 40- and 56-byte strides presumably match the
             * per-row code size of the two fetch variants, which are defined
             * outside this section -- confirm before touching fetch.
             */
   1262             adrp        x8, \labelnc
   1263             add         x8, x8, #:lo12:\labelnc
   1264             sub         x8, x8, x5, LSL #5
   1265             sub         x8, x8, x5, LSL #3
   1266             cmp         x5, x6
   1267             ccmp        x5, x7, #0, eq
   1268             beq         5f
   1269 
   1270             /* if (r != rup || r != rdn) then the address-clamping table should
   1271              * be used rather than the short-cut version.
   1272              */
            /* Clamping entry point: \labelc - 56*r (r<<6 minus r<<3). */
   1273             adrp        x8, \labelc
   1274             add         x8, x8, #:lo12:\labelc
   1275             sub         x8, x8, x5, LSL #6
   1276             add         x8, x8, x5, LSL #3
   1277             b           5f
   1278             .align  4
            /* Steady state: one fetch feeds two \core expansions, each of which
             * leaves 8 output bytes in v15; 16 bytes are written per iteration.
             */
   1279 3:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
   1280 
   1281             /* For each call to fetch two are made to \core.  It would be
   1282              * preferable to have twice the work done in \core.
   1283              */
   1284             \core
   1285             st1         {v15.8b}, [x0], #8
   1286             \core
   1287             st1         {v15.8b}, [x0], #8
   1288 
   1289             sub         x3, x3, #16
            /* Loop while at least 16 more input columns remain in x4. */
   1290 5:          subs        x4, x4, #16
   1291             bhs         3b
            /* Undo the last subtraction; x4 is now the leftover input, 0..15. */
   1292             adds        x4, x4, #16
   1293             bne         1f
            /* No input left: feed the window with copies of the last element of
             * v9 (last four halfwords when step != 1).
             */
   1294   .if \step==1
   1295             dup         v10.8h, v9.h[7]
   1296             dup         v11.8h, v9.h[7]
   1297   .else
   1298             dup         v10.2d, v9.d[1]
   1299             dup         v11.2d, v9.d[1]
   1300   .endif
   1301             b           4f
   1302 
            /* Partial chunk: move the three source pointers back by (16 - x4) so
             * that a full 16-column fetch ends exactly at the end of the input.
             */
   1303 1:          sub         x1, x1, #16
   1304             sub         x15, x15, #16
   1305             sub         x19, x19, #16
   1306             add         x1, x1, x4
   1307             add         x15, x15, x4
   1308             add         x19, x19, x4
   1309             bl          fetch_generic_asm
   1310 
            /* v12 = right-hand pad (last element of what was just fetched). */
   1311   .if \step==1
   1312             dup         v12.8h, v11.h[7]
   1313   .else
   1314             dup         v12.2d, v11.d[1]
   1315   .endif
            /* Drop the (16 - x4) leading halfwords, which overlap data fetched
             * earlier, by shifting v10:v11:v12 left.  The low four bits of -x4
             * equal 16 - x4, and each set bit selects a shift of 8, 4, 2 or 1
             * halfwords.
             */
   1316             sub         x4, xzr, x4
   1317             tbz         x4, #3, 1f
   1318             mov         v10.16b, v11.16b
   1319             mov         v11.16b, v12.16b
   1320 1:          tbz         x4, #2, 1f
   1321             ext         v10.16b, v10.16b, v11.16b, #4*2
   1322             ext         v11.16b, v11.16b, v12.16b, #4*2
   1323 1:          tbz         x4, #1, 1f
   1324             ext         v10.16b, v10.16b, v11.16b, #2*2
   1325             ext         v11.16b, v11.16b, v12.16b, #2*2
   1326 1:          tbz         x4, #0, 4f
   1327             ext         v10.16b, v10.16b, v11.16b, #1*2
   1328             ext         v11.16b, v11.16b, v12.16b, #1*2
            /* Drain: keep producing 8 output bytes at a time until x3 is used up,
             * refilling v11 with its own last element after each step.
             */
   1329 4:          cbz         x3, 5f
   1330 3:          \core
   1331   .if \step==1
   1332             dup         v11.8h, v11.h[7]
   1333   .else
   1334             dup         v11.2d, v11.d[1]
   1335   .endif
   1336             subs        x3, x3, #8
   1337             blo         4f
   1338             st1         {v15.8b}, [x0], #8
   1339             beq         5f
   1340             b           3b
            /* Fewer than 8 bytes remain.  Subtracting 8 left the low three bits
             * of x3 unchanged, so they still give the remaining byte count;
             * write 4, 2 and 1 bytes as selected, rotating v15 after each store.
             */
   1341 4:          tbz         x3, #2, 1f
   1342             st1         {v15.s}[0], [x0], #4
   1343             ext         v15.8b, v15.8b, v15.8b, #4
   1344 1:          tbz         x3, #1, 1f
   1345             st1         {v15.h}[0], [x0], #2
   1346             ext         v15.8b, v15.8b, v15.8b, #2
   1347 1:          tbz         x3, #0, 5f
   1348             st1         {v15.b}[0], [x0], #1
   1349             ext         v15.8b, v15.8b, v15.8b, #1
   1350 5:          nop
   1351 .endm
   1352 
   1353 .irep r, TUNED_LIST1, 25
        /* Emit one private routine convolve1_<r> for every radius named in
         * TUNED_LIST1, plus one for radius 25.  Each routine simply wraps the
         * prefetch and mainloop macros (defined elsewhere in this file),
         * instantiated with step=1 and max_r=<r>.
         *
         * rsdIntrinsicBlurU1_K reaches these routines with a plain branch after
         * loading x30 by hand, so the final ret resumes at the epilogue of
         * that function.
         */
   1354 PRIVATE(convolve1_\r)
   1355             stp         x29,x30, [sp, #-16]!  // keep x29 and the return address on the stack
   1356 
            /* NOTE(review): prefetch presumably primes the working registers
             * for the main loop -- confirm against the macro definition.
             */
   1357             prefetch    step=1, max_r=\r
   1358 
            /* hconv1_<r> is passed as the per-iteration core; labelc/labelnc
             * supply label names that are unique to this radius.
             */
   1359             mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
   1360 
   1361             ldp         x29,x30, [sp], #16    // restore x29 and the return address
   1362             ret
   1363 END(convolve1_\r)
   1364 .endr
   1365 
   1366 .irep r, TUNED_LIST4, 25
        /* Emit one private routine convolve4_<r> for every radius named in
         * TUNED_LIST4, plus one for radius 25.  Each routine realigns the stack
         * to obtain a scratch buffer, then wraps the prefetch and mainloop
         * macros (defined elsewhere in this file), instantiated with step=4
         * and max_r=<r>.
         *
         * rsdIntrinsicBlurU4_K reaches these routines with a plain branch after
         * loading x30 by hand, so the final ret resumes at the epilogue of
         * that function.
         */
   1367 PRIVATE(convolve4_\r)
   1368             sub         x12, sp, #0x040       // x12 = incoming sp - 0x40
   1369             bic         x9, x12, #0x07f       // x9 = x12 rounded down to a multiple of 128
   1370             mov         sp, x9                // sp drops to the aligned address
   1371             stp         x12,x30, [sp, #-16]!  // save x12 (needed to undo the realignment) and the return address
   1372 
   1373             /* x9 now points to a buffer on the stack whose address has the low
   1374              * 7 bits clear.  This allows easy address calculation in the
   1375              * wrap-around cases.
            *
            * At least 0x40 bytes lie between x9 and the stack pointer this
            * routine was entered with; the saved x12/x30 pair sits in the 16
            * bytes directly below x9.
   1376              */
   1377 
   1378 
   1379             prefetch    step=4, max_r=\r
   1380 
            /* hconv4_<r> is passed as the per-iteration core; labelc/labelnc
             * supply label names that are unique to this radius.
             */
   1381             mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
   1382 
            /* The pair is read back from [sp] without writeback, so sp must
             * still hold the value left by the stp above when control gets here.
             */
   1383             ldp         x12,x30, [sp]
   1384             add         sp, x12, #0x40        // sp = (incoming sp - 0x40) + 0x40, i.e. the incoming sp
   1385             ret
   1386 END(convolve4_\r)
   1387 .endr
   1388 
   1389 /* void rsdIntrinsicBlurU1_K(
   1390  *                  void *out,      // x0
   1391  *                  void *in,       // x1
   1392  *                  size_t w,       // x2
   1393  *                  size_t h,       // x3
   1394  *                  size_t p,       // x4
   1395  *                  size_t x,       // x5
   1396  *                  size_t y,       // x6
   1397  *                  size_t count,   // x7
   1398  *                  size_t r,       // [sp]
   1399  *                  uint16_t *tab); // [sp,#8]
         *
         * Public entry point.  Saves x19, x30 and the low halves of v8-v15,
         * rearranges the arguments into the register layout listed below, then
         * branches to the convolve1_<r> routine selected by r.  That routine
         * returns to the epilogue at the end of this function.
         *
         * x and count are added to the in pointer unscaled here (the U4 entry
         * point scales them by 4).
         *
         * Registers at the point of dispatch:
         *      x0  = out (untouched)
         *      x1  = in + x - x8
         *      x2  = p (pitch)
         *      x3  = count
         *      x4  = x8 + x9 + count
         *      x5  = r (loaded as a 32-bit value, zero-extended)
         *      x6  = min(y, r)
         *      x7  = min(h - y - 1, r)
         *      x8  = min(x, r)
         *      x9  = min(w - x - count, r)
         *      x12 = tab + 64 bytes
         *      x13 = -pitch
         *      x15 = x1 - pitch * x6
         *      x19 = x1 + pitch * x7
         *      x30 = address of the epilogue
         *      v0-v3 = the first 32 halfwords of tab
         * All four min() operations are unsigned comparisons.
         *
         * NOTE(review): x6/x7 and x8/x9 look like the number of rows
         * above/below and columns left/right that may be read without leaving
         * the image, capped at the radius -- confirm against the macros that
         * consume them.
   1400  */
   1401 ENTRY(rsdIntrinsicBlurU1_K)
   1402             stp         x19,x30, [sp, #-16]!      // save callee-saved x19 and the return address
   1403             sub         x8, sp, #32               // x8 = upper half of the 64-byte area reserved next
   1404             sub         sp, sp, #64               // reserve 64 bytes for the vector saves
   1405             st1         {v8.1d - v11.1d}, [sp]    // low 64 bits of v8-v11
   1406             st1         {v12.1d - v15.1d}, [x8]   // low 64 bits of v12-v15
   1407             mov         x8, x5        // x
   1408             ldr         w5, [sp,#80]  // r  (80 = 64 + 16, i.e. the first stack argument)
   1409             sub         x9, x2, x8                // x9 = w - x
   1410             sub         x10, x3, x6               // x10 = h - y
   1411             mov         x2, x4        // pitch
   1412             mov         x3, x7        // count
   1413             sub         x7, x10, #1               // x7 = h - y - 1
   1414             sub         x9, x9, x3                // x9 = w - x - count
   1415 
   1416             ldr         x12, [sp, #88] // tab  (the second stack argument)
   1417 
   1418             add         x1, x1, x8                // x1 = in + x
   1419 
            /* Clamp each of x6..x9 to at most r: hs selects x5 whenever the
             * value is unsigned-greater-or-equal to r.
             */
   1420             cmp         x6, x5
   1421             csel        x6, x5, x6, hs            // x6 = min(y, r)
   1422             cmp         x7, x5
   1423             csel        x7, x5, x7, hs            // x7 = min(h - y - 1, r)
   1424             cmp         x8, x5
   1425             csel        x8, x5, x8, hs            // x8 = min(x, r)
   1426             cmp         x9, x5
   1427             csel        x9, x5, x9, hs            // x9 = min(w - x - count, r)
   1428 
   1429             add         x4, x8, x9
   1430             add         x4, x4, x3                // x4 = x8 + x9 + count
   1431 
   1432             sub         x1, x1, x8                // step the input pointer back by x8
   1433 
   1434             sub         x13, xzr, x2              // x13 = -pitch
   1435             msub        x15, x2, x6, x1           // x15 = x1 - pitch * x6
   1436             madd        x19, x2, x7, x1           // x19 = x1 + pitch * x7
   1437 
   1438             ld1         {v0.8h,v1.8h}, [x12], #32 // first 16 halfwords of tab
   1439             ld1         {v2.8h,v3.8h}, [x12], #32 // next 16 halfwords; x12 ends 64 bytes past tab
   1440 
            /* Dispatch: branch (not call) to the first listed radius that r
             * does not exceed; anything larger falls through to convolve1_25.
             * x30 is set by hand so that the routine returns to 1: below.
             */
   1441             adr         x30, 1f
   1442   .irep r, TUNED_LIST1
   1443             cmp         x5, #\r
   1444             bls         convolve1_\r
   1445   .endr
   1446             b           convolve1_25
   1447 
            /* Epilogue: undo the saves made on entry, in reverse order. */
   1448 1:          ld1         {v8.1d - v11.1d}, [sp], #32
   1449             ld1         {v12.1d - v15.1d}, [sp], #32
   1450             ldp         x19,x30, [sp], #16
   1451             ret
   1452 END(rsdIntrinsicBlurU1_K)
   1453 
   1454 /* void rsdIntrinsicBlurU4_K(
   1455  *                  void *out,      // x0
   1456  *                  void *in,       // x1
   1457  *                  size_t w,       // x2
   1458  *                  size_t h,       // x3
   1459  *                  size_t p,       // x4
   1460  *                  size_t x,       // x5
   1461  *                  size_t y,       // x6
   1462  *                  size_t count,   // x7
   1463  *                  size_t r,       // [sp]
   1464  *                  uint16_t *tab); // [sp,#8]
         *
         * Public entry point.  Saves x19, x30 and the low halves of v8-v15,
         * rearranges the arguments into the register layout listed below, then
         * branches to the convolve4_<r> routine selected by r.  That routine
         * returns to the epilogue at the end of this function.
         *
         * x and count are multiplied by 4 wherever they are turned into byte
         * offsets or byte counts (the U1 entry point uses them unscaled).
         *
         * Registers at the point of dispatch:
         *      x0  = out (untouched)
         *      x1  = in + 4 * x - 4 * x8
         *      x2  = p (pitch)
         *      x3  = 4 * count
         *      x4  = 4 * (x8 + x9 + count)
         *      x5  = r (loaded as a 32-bit value, zero-extended)
         *      x6  = min(y, r)
         *      x7  = min(h - y - 1, r)
         *      x8  = min(x, r)               (not scaled)
         *      x9  = min(w - x - count, r)   (not scaled)
         *      x12 = tab + 64 bytes
         *      x13 = -pitch
         *      x15 = x1 - pitch * x6
         *      x19 = x1 + pitch * x7
         *      x30 = address of the epilogue
         *      v0-v3 = the first 32 halfwords of tab
         * All four min() operations are unsigned comparisons.
         *
         * NOTE(review): x6/x7 and x8/x9 look like the number of rows
         * above/below and columns left/right that may be read without leaving
         * the image, capped at the radius -- confirm against the macros that
         * consume them.
   1465  */
   1466 ENTRY(rsdIntrinsicBlurU4_K)
   1467             stp         x19,x30, [sp, #-16]!      // save callee-saved x19 and the return address
   1468             sub         x8, sp, #32               // x8 = upper half of the 64-byte area reserved next
   1469             sub         sp, sp, #64               // reserve 64 bytes for the vector saves
   1470             st1         {v8.1d - v11.1d}, [sp]    // low 64 bits of v8-v11
   1471             st1         {v12.1d - v15.1d}, [x8]   // low 64 bits of v12-v15
   1472             mov         x8, x5        // x
   1473             ldr         w5, [sp,#80]  // r  (80 = 64 + 16, i.e. the first stack argument)
   1474             sub         x9, x2, x8                // x9 = w - x
   1475             sub         x10, x3, x6               // x10 = h - y
   1476             mov         x2, x4        // pitch
   1477             mov         x3, x7        // count
   1478             sub         x7, x10, #1               // x7 = h - y - 1
   1479             sub         x9, x9, x3                // x9 = w - x - count
   1480 
   1481             ldr         x12, [sp, #88]            // tab (the second stack argument)
   1482 
   1483             add         x1, x1, x8, LSL #2        // x1 = in + 4 * x
   1484 
            /* Clamp each of x6..x9 to at most r: hs selects x5 whenever the
             * value is unsigned-greater-or-equal to r.
             */
   1485             cmp         x6, x5
   1486             csel        x6, x5, x6, hs            // x6 = min(y, r)
   1487             cmp         x7, x5
   1488             csel        x7, x5, x7, hs            // x7 = min(h - y - 1, r)
   1489             cmp         x8, x5
   1490             csel        x8, x5, x8, hs            // x8 = min(x, r)
   1491             cmp         x9, x5
   1492             csel        x9, x5, x9, hs            // x9 = min(w - x - count, r)
   1493 
   1494             lsl         x3, x3, #2                // x3 = 4 * count
   1495             add         x4, x8, x9
   1496             add         x4, x3, x4, LSL #2        // x4 = 4 * count + 4 * (x8 + x9)
   1497 
   1498             sub         x1, x1, x8, LSL #2        // step the input pointer back by 4 * x8 bytes
   1499 
   1500             sub         x13, xzr, x2              // x13 = -pitch
   1501             msub        x15, x2, x6, x1           // x15 = x1 - pitch * x6
   1502             madd        x19, x2, x7, x1           // x19 = x1 + pitch * x7
   1503 
   1504             ld1         {v0.8h,v1.8h}, [x12], #32 // first 16 halfwords of tab
   1505             ld1         {v2.8h,v3.8h}, [x12], #32 // next 16 halfwords; x12 ends 64 bytes past tab
   1506 
            /* Dispatch: branch (not call) to the first listed radius that r
             * does not exceed; anything larger falls through to convolve4_25.
             * x30 is set by hand so that the routine returns to 1: below.
             */
   1507             adr         x30, 1f
   1508   .irep r, TUNED_LIST4
   1509             cmp         x5, #\r
   1510             bls         convolve4_\r
   1511   .endr
   1512             b           convolve4_25
   1513 
            /* Epilogue: undo the saves made on entry, in reverse order. */
   1514 1:          ld1         {v8.1d - v11.1d}, [sp], #32
   1515             ld1         {v12.1d - v15.1d}, [sp], #32
   1516             ldp         x19,x30, [sp], #16
   1517             ret
   1518 END(rsdIntrinsicBlurU4_K)
   1519