/* cpu_ref — AArch64 NEON blur intrinsic (code-viewer navigation header removed) */
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
/* Linkage helpers: ENTRY opens a globally visible function symbol, PRIVATE a
 * file-local one, and END records the symbol's size for the ELF symbol table.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

/* Uncomment to emit prfm prefetch hints in the vertical fetch loops (see
 * VERTPLD below).
 */
//#define ARCH_ARM64_USE_BLUR_PRELOAD

/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8 bit data (the integer
 * part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
/* Largest supported blur radius; the unrolled tap loops are sized to this. */
.set MAX_R, 25
     29 
     30 
     31 /* A quick way of making a line of code conditional on some other condition.
     32  * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
     33  * `ifcc`:
     34  */
.macro ifcc zzz:vararg
.if cc              // cc is set by the caller (e.g. the fetch macro below)
            \zzz    // emit the guarded statement only when cc != 0
.endif
.endm
     40 
     41 /* It's not always clear that prefetching is beneficial and this needs further
     42  * testing on different cores, so it's made switchable here.
     43  */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
#else
/* A nop (same 4-byte encoding size as prfm) rather than nothing: the unrolled
 * loop bodies below are entered by computed branch, so their byte size must
 * not change with this option.
 */
#define VERTPLD(...) nop
#endif
     49 
     50 /* Fetch 16 columns of bytes (regardless of image format), convolve these
     51  * vertically, and leave them in the register file.  If working near the top or
     52  * bottom of an image then clamp the addressing while loading the data in.
     53  *
     54  * The convolution is fully unrolled for windows up to max_r, with the
     55  * outermost edges calculated first.  This way it's possible to branch directly
     56  * into the relevant part of the code for an arbitrary convolution radius.  Two
     57  * variants of the loop are produced; one eliminates the clamping code for a
     58  * slight speed advantage.
     59  *
     60  * Where the macro is called with reg=x, the specified register is taken to
     61  * contain a pre-calculated pointer into one of the two loops.
     62  *
     63  * Input:
     64  *      x1 -- src
     65  *      x2 -- pitch
     66  *      x5 -- r
     67  *      x6 -- rup (r, unless clipped to top of source image)
     68  *      x7 -- rdn (r, unless clipped to bottom of source image)
     69  *      x12 -- switch index
     70  *      v0-v3 -- coefficient table
     71  *      x13 = -pitch
     72  *      x15 = top-row in
     73  *      x19 = bottom-row in
     74  * Output:
     75  *      x1 += 16
     76  *      v10,v11 -- 16 convolved columns
     77  * Modifies:
     78  *      x10 = upper row pointer
     79  *      x11 = lower row pointer
     80  *      v12-v15 = temporary sums
     81  */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* With the default reg=x12 the switch address is computed inline here
   * (cc=1).  Any other register is taken to hold a pre-calculated entry
   * pointer, and the address-calculation lines are suppressed (cc=0).
   */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16    // fetch 16 centre-row bytes; x1 += 16
            mov         x10, x15                // x10 = upper row pointer (top-row in)

            uxtl        v14.8h, v15.8b          // widen the 16 bytes to 16-bit lanes
            VERTPLD(x1, #16)
            uxtl2       v15.8h, v15.16b
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            /* Start the column sums with the centre tap (coefficient v0.h[0]). */
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6  // entry = 1f - r*56: each clamped tap
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19                // x11 = lower row pointer (bottom-row in)
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3  // ...body is 14 insns (64-8 bytes)
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg                    // jump to the r'th tap of the loop

  /* This version of the vertical fetch loop body is used away from the edges
   * of the source image.  The pointers start at the top and bottom source rows
   * and work their way towards the centre on each iteration.  This way the
   * number of taps used can be controlled by jumping directly into the middle
   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address of
   * the initial iteration must be updated accordingly.
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2    // upper row; step down by pitch
            ld1         {v11.16b}, [x11], x13   // lower row; step up (x13 = -pitch)
            uaddl       v16.8h, v10.8b, v11.8b  // symmetric taps share one coefficient
            uaddl2      v11.8h, v10.16b, v11.16b
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of the
   * source image, where one or both of the accesses may start with a clamped
   * value, and the row addresses only begin to change after some number of
   * iterations before the end.
   * If the loop body changes size then the code which calculates the address of
   * the initial iteration must be updated accordingly.
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            cmp         x6, #\i                 // tap above the clipped top edge?
            ld1         {v11.16b}, [x11], x13
            csel        x10, x15, x10, lo       // if so, hold x10 at the top row
            uaddl       v16.8h, v10.8b, v11.8b
            cmp         x7, #\i                 // tap below the clipped bottom edge?
            uaddl2      v11.8h, v10.16b, v11.16b
            csel        x11, x19, x11, lo       // if so, hold x11 at the bottom row
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.  Taps are emitted outermost-first so
   * that entering at tap r runs taps r..1 and falls through to the exit.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  /* Narrow the 32-bit sums back to 16 bits, keeping FRACTION_BITS of
   * fraction, and advance the clamp-row pointers past the 16 columns done.
   */
  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
    235 
    236 /* Some portion of the convolution window (as much as will fit, and all of it
    237  * for the uchar1 cases) is kept in the register file to avoid unnecessary
    238  * memory accesses.  This forces the horizontal loops to be unrolled because
    239  * there's no indexed addressing into the register file.
    240  *
    241  * As in the fetch macro, the operations are ordered from outside to inside, so
    242  * that jumping into the middle of the block bypasses the unwanted window taps.
    243  *
 * There are several variants of the macro because of the fixed offsets of the
    245  * taps -- the wider the maximum radius the further the centre tap is from the
    246  * most recently fetched data.  This means that pre-filling the window requires
    247  * more data that won't be used and it means that rotating the window involves
    248  * more mov operations.
    249  *
    250  * When the buffer gets too big the buffer at [x9] is used.
    251  *
    252  * Input:
 *      v16-v31,v4-v11 -- convolution window
    254  *      x9 -- pointer to additional convolution window data
    255  * Output:
    256  *      x9 -- updated buffer pointer (if used)
    257  *      d31 -- result to be stored
    258  * Modifies:
    259  *      x12 -- temp buffer pointer
    260  *      v12-v13 -- temporaries for load and vext operations.
    261  *      v14-v15 -- intermediate sums
    262  */
#define TUNED_LIST1 8, 16
/* Horizontal convolution for uchar1 data, radii 1..8.  The sliding window of
 * widened source columns is held in v8-v11; the 8 convolved result bytes are
 * produced in v15.8b.  x5 = radius; clobbers x12, x16, v12-v15.
 */
.macro hconv1_8/*{{{*/

.rodata
    /* Jump table of signed halfword offsets, relative to label 100 below and
     * indexed by the radius in x5.  Entering at case 10r applies the taps at
     * distance r and falls through the smaller distances down to 101.
     * (Index 0, offset -4, corresponds to radius 0 — presumably never taken;
     * TODO confirm against callers.)
     */
    200:    .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .align      4
.text
            /* Start the sums with the centre tap. */
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]  // x12 = table[radius]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2   // window shifted left by tap
            ext         v13.16b, v9.16b, v10.16b, #7*2  // window shifted right by tap
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            /* An 8-lane (half-register) shift needs no ext at all. */
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums to bytes, dropping the fraction with rounding. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the window along by one register for the next iteration. */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    342 
/* Horizontal convolution for uchar1 data, radii 1..16.  The sliding window of
 * widened source columns is held in v6-v11; the 8 convolved result bytes are
 * produced in v15.8b.  x5 = radius; clobbers x12, x16, v12-v15.
 */
.macro hconv1_16/*{{{*/
.rodata
   /* Branch-offset table relative to label 100, indexed by radius (x5).
    * Case 10r applies the taps at distance r and falls through to 101.
    */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .align 4

.text
            /* Start the sums with the centre tap (window centre is in v8). */
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]  // x12 = table[radius]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            /* Zero-lane shift: use the source registers directly. */
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            /* Half-register shift: no ext needed. */
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums to bytes, dropping the fraction with rounding. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the window along by one register for the next iteration. */
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    481 
/* Horizontal convolution for uchar1 data, radii 1..25 (MAX_R).  The sliding
 * window of widened source columns is held in v31 and v4-v11; the 8 convolved
 * result bytes are produced in v15.8b.  x5 = radius; clobbers x12, x16,
 * v12-v15.
 */
.macro hconv1_25/*{{{*/
.rodata
   /* Branch-offset table relative to label 100, indexed by radius (x5).
    * Case 10r/1xx applies the taps at distance r and falls through to 101.
    */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .hword 121f-100f
            .hword 122f-100f
            .hword 123f-100f
            .hword 124f-100f
            .hword 125f-100f
            .align 4
.text
            /* Centre tap: the window centre sits 7 halfwords into v6:v7. */
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]  // x12 = table[radius]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums to bytes, dropping the fraction with rounding. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the window along by one register for the next iteration. */
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    686 
    687 #define TUNED_LIST4 6, 12, 20
/* Horizontal convolution pass for 4-channel 16-bit intermediate data, tuned
 * for a maximum radius of 6.  On entry v0 holds the filter coefficients,
 * v4..v10 hold the sliding window of vertically-convolved data (centre in
 * v7), and x5 selects how many symmetric tap pairs to accumulate.  The
 * result, narrowed back to 8 bits with rounding and saturation, is left in
 * v15.8b, and the window is slid along by one register.
 */
.macro hconv4_6/*{{{*/
.rodata
            /* Signed halfword branch offsets, relative to label 100, of the
             * per-radius entry points 101..106 below; indexed by x5 so the
             * taps outside the requested radius are skipped.  Each case
             * falls through into the next smaller radius.  The first entry
             * (-4, for x5 == 0) lands on the instruction before the `br` —
             * presumably never taken; TODO(review): confirm x5 >= 1 here.
             */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .align      4
.text
            /* Start the 32-bit accumulators with the centre tap. */
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            /* Load the offset for x5 taps and jump into the tap chain. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    106:    umlal       v14.4s, v4.4h,  v0.h[6]
            umlal2      v15.4s, v4.8h,  v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
    104:    umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
    103:    umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
    102:    umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
    101:    umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            /* Narrow with rounding and saturation: drop 16 fractional bits
             * from the 32-bit accumulators, then the remaining
             * FRACTION_BITS to get back to 8-bit pixels.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one register (8 halfwords). */
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    745 
/* Horizontal convolution pass for 4-channel 16-bit intermediate data, tuned
 * for a maximum radius of 12.  Same scheme as hconv4_6, but the sliding
 * window spans v26..v31 and v4..v10 (centre in v4).  Coefficients are in
 * v0/v1; x5 selects how many symmetric tap pairs to accumulate.  Output is
 * left in v15.8b and the window is slid along by one register.
 */
.macro hconv4_12/*{{{*/
.rodata
            /* Branch-offset table relative to label 100, indexed by x5.
             * The leading -4 entry (x5 == 0) lands before the `br` and is
             * presumably never taken — TODO(review): confirm x5 >= 1.
             */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .align 4
.text
            /* Centre tap seeds the 32-bit accumulators. */
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            /* Dispatch into the unrolled tap chain; each case falls
             * through to the next smaller radius.
             */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    112:    umlal       v14.4s, v26.4h, v1.h[4]
            umlal2      v15.4s, v26.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
    111:    umlal2      v14.4s, v26.8h, v1.h[3]
            umlal       v15.4s, v27.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
    110:    umlal       v14.4s, v27.4h, v1.h[2]
            umlal2      v15.4s, v27.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
    109:    umlal2      v14.4s, v27.8h, v1.h[1]
            umlal       v15.4s, v28.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
    108:    umlal       v14.4s, v28.4h, v1.h[0]
            umlal2      v15.4s, v28.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
    107:    umlal2      v14.4s, v28.8h, v0.h[7]
            umlal       v15.4s, v29.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
    106:    umlal       v14.4s, v29.4h, v0.h[6]
            umlal2      v15.4s, v29.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
    105:    umlal2      v14.4s, v29.8h, v0.h[5]
            umlal       v15.4s, v30.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
    104:    umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
    103:    umlal2      v14.4s, v30.8h, v0.h[3]
            umlal       v15.4s, v31.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
    102:    umlal       v14.4s, v31.4h, v0.h[2]
            umlal2      v15.4s, v31.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
    101:    umlal2      v14.4s, v31.8h, v0.h[1]
            umlal       v15.4s, v4.4h,  v0.h[1]
            umlal2      v14.4s, v4.8h,  v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            /* Round/saturate down to 8-bit output in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the register window along by one register. */
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    839 
/* Horizontal convolution pass for 4-channel 16-bit intermediate data, tuned
 * for a maximum radius of 20.  Same scheme as hconv4_6/hconv4_12, with the
 * sliding window spanning v18..v31 and v4..v10 (centre in v28) and
 * coefficients in v0/v1/v2.  x5 selects how many symmetric tap pairs to
 * accumulate; output is left in v15.8b and the window slides by one
 * register.
 */
.macro hconv4_20/*{{{*/
.rodata
            /* Branch-offset table relative to label 100, indexed by x5.
             * The leading -4 entry (x5 == 0) lands before the `br` and is
             * presumably never taken — TODO(review): confirm x5 >= 1.
             */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .align 4
.text
            /* Centre tap seeds the 32-bit accumulators. */
            umull       v14.4s, v28.4h, v0.h[0]
            umull2      v15.4s, v28.8h, v0.h[0]

            /* Dispatch into the unrolled tap chain; each case falls
             * through to the next smaller radius.
             */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    120:    umlal       v14.4s, v18.4h, v2.h[4]
            umlal2      v15.4s, v18.8h, v2.h[4]
            umlal       v14.4s, v10.4h, v2.h[4]
            umlal2      v15.4s, v10.8h, v2.h[4]
    119:    umlal2      v14.4s, v18.8h, v2.h[3]
            umlal       v15.4s, v19.4h, v2.h[3]
            umlal2      v14.4s, v9.8h,  v2.h[3]
            umlal       v15.4s, v10.4h, v2.h[3]
    118:    umlal       v14.4s, v19.4h, v2.h[2]
            umlal2      v15.4s, v19.8h, v2.h[2]
            umlal       v14.4s, v9.4h,  v2.h[2]
            umlal2      v15.4s, v9.8h,  v2.h[2]
    117:    umlal2      v14.4s, v19.8h, v2.h[1]
            umlal       v15.4s, v20.4h, v2.h[1]
            umlal2      v14.4s, v8.8h,  v2.h[1]
            umlal       v15.4s, v9.4h,  v2.h[1]
    116:    umlal       v14.4s, v20.4h, v2.h[0]
            umlal2      v15.4s, v20.8h, v2.h[0]
            umlal       v14.4s, v8.4h,  v2.h[0]
            umlal2      v15.4s, v8.8h,  v2.h[0]
    115:    umlal2      v14.4s, v20.8h, v1.h[7]
            umlal       v15.4s, v21.4h, v1.h[7]
            umlal2      v14.4s, v7.8h,  v1.h[7]
            umlal       v15.4s, v8.4h,  v1.h[7]
    114:    umlal       v14.4s, v21.4h, v1.h[6]
            umlal2      v15.4s, v21.8h, v1.h[6]
            umlal       v14.4s, v7.4h,  v1.h[6]
            umlal2      v15.4s, v7.8h,  v1.h[6]
    113:    umlal2      v14.4s, v21.8h, v1.h[5]
            umlal       v15.4s, v22.4h, v1.h[5]
            umlal2      v14.4s, v6.8h,  v1.h[5]
            umlal       v15.4s, v7.4h,  v1.h[5]
    112:    umlal       v14.4s, v22.4h, v1.h[4]
            umlal2      v15.4s, v22.8h, v1.h[4]
            umlal       v14.4s, v6.4h,  v1.h[4]
            umlal2      v15.4s, v6.8h,  v1.h[4]
    111:    umlal2      v14.4s, v22.8h, v1.h[3]
            umlal       v15.4s, v23.4h, v1.h[3]
            umlal2      v14.4s, v5.8h,  v1.h[3]
            umlal       v15.4s, v6.4h,  v1.h[3]
    110:    umlal       v14.4s, v23.4h, v1.h[2]
            umlal2      v15.4s, v23.8h, v1.h[2]
            umlal       v14.4s, v5.4h,  v1.h[2]
            umlal2      v15.4s, v5.8h,  v1.h[2]
    109:    umlal2      v14.4s, v23.8h, v1.h[1]
            umlal       v15.4s, v24.4h, v1.h[1]
            umlal2      v14.4s, v4.8h,  v1.h[1]
            umlal       v15.4s, v5.4h,  v1.h[1]
    108:    umlal       v14.4s, v24.4h, v1.h[0]
            umlal2      v15.4s, v24.8h, v1.h[0]
            umlal       v14.4s, v4.4h,  v1.h[0]
            umlal2      v15.4s, v4.8h,  v1.h[0]
    107:    umlal2      v14.4s, v24.8h, v0.h[7]
            umlal       v15.4s, v25.4h, v0.h[7]
            umlal2      v14.4s, v31.8h, v0.h[7]
            umlal       v15.4s, v4.4h,  v0.h[7]
    106:    umlal       v14.4s, v25.4h, v0.h[6]
            umlal2      v15.4s, v25.8h, v0.h[6]
            umlal       v14.4s, v31.4h, v0.h[6]
            umlal2      v15.4s, v31.8h, v0.h[6]
    105:    umlal2      v14.4s, v25.8h, v0.h[5]
            umlal       v15.4s, v26.4h, v0.h[5]
            umlal2      v14.4s, v30.8h, v0.h[5]
            umlal       v15.4s, v31.4h, v0.h[5]
    104:    umlal       v14.4s, v26.4h, v0.h[4]
            umlal2      v15.4s, v26.8h, v0.h[4]
            umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
    103:    umlal2      v14.4s, v26.8h, v0.h[3]
            umlal       v15.4s, v27.4h, v0.h[3]
            umlal2      v14.4s, v29.8h, v0.h[3]
            umlal       v15.4s, v30.4h, v0.h[3]
    102:    umlal       v14.4s, v27.4h, v0.h[2]
            umlal2      v15.4s, v27.8h, v0.h[2]
            umlal       v14.4s, v29.4h, v0.h[2]
            umlal2      v15.4s, v29.8h, v0.h[2]
    101:    umlal2      v14.4s, v27.8h, v0.h[1]
            umlal       v15.4s, v28.4h, v0.h[1]
            umlal2      v14.4s, v28.8h, v0.h[1]
            umlal       v15.4s, v29.4h, v0.h[1]

            /* Round/saturate down to 8-bit output in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the register window along by one register. */
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
    981 
/* Horizontal convolution pass for 4-channel 16-bit intermediate data, tuned
 * for the maximum radius of 25.  The window no longer fits in the register
 * file: v17..v31 and v4..v10 hold the most recent data (the centre tap
 * straddles the v25/v26 boundary — note the umull2/umull pair below), and
 * the oldest part of the window lives in a memory ring buffer indexed by
 * x9.  Offsets into the ring are wrapped with `bic ..., #0x40`, which
 * assumes the buffer is 64 bytes and suitably aligned so that clearing
 * bit 6 implements the wrap — not visible here; TODO(review): confirm.
 * Output is left in v15.8b, then the oldest register (v17) is pushed into
 * the ring and the register window slides along by one.
 */
.macro hconv4_25/*{{{*/
.rodata
            /* Branch-offset table relative to label 100, indexed by x5.
             * The leading -4 entry (x5 == 0) lands before the `br` and is
             * presumably never taken — TODO(review): confirm x5 >= 1.
             */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .hword 121f-100f
            .hword 122f-100f
            .hword 123f-100f
            .hword 124f-100f
            .hword 125f-100f
            .align 4
.text
            /* Centre tap seeds the accumulators; it straddles a register
             * boundary, hence the umull2/umull split.
             */
            umull2      v14.4s, v25.8h, v0.h[0]
            umull       v15.4s, v26.4h, v0.h[0]

            /* Dispatch into the unrolled tap chain; each case falls
             * through to the next smaller radius.
             */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
            /* Cases 125..118 fetch the oldest taps from the x9 ring
             * buffer into v12/v13 before accumulating.
             */
    125:    ld1         {v12.8h}, [x9]
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v10.4h, v3.h[1]
            umlal2      v15.4s, v10.8h, v3.h[1]
    124:    add         x12, x9, #0x08
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal       v15.4s, v13.4h, v3.h[0]
            umlal2      v14.4s, v9.8h,  v3.h[0]
            umlal       v15.4s, v10.4h, v3.h[0]
    123:    add         x12, x9, #0x10
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v9.4h,  v2.h[7]
            umlal2      v15.4s, v9.8h,  v2.h[7]
    122:    add         x12, x9, #0x18
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal       v15.4s, v13.4h, v2.h[6]
            umlal2      v14.4s, v8.8h,  v2.h[6]
            umlal       v15.4s, v9.4h,  v2.h[6]
    121:    add         x12, x9, #0x20
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v8.4h,  v2.h[5]
            umlal2      v15.4s, v8.8h,  v2.h[5]
    120:    add         x12, x9, #0x28
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal       v15.4s, v13.4h, v2.h[4]
            umlal2      v14.4s, v7.8h,  v2.h[4]
            umlal       v15.4s, v8.4h,  v2.h[4]
    119:    add         x12, x9, #0x30
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v7.4h,  v2.h[3]
            umlal2      v15.4s, v7.8h,  v2.h[3]
    118:    add         x12, x9, #0x38
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal       v15.4s, v17.4h, v2.h[2]
            umlal2      v14.4s, v6.8h,  v2.h[2]
            umlal       v15.4s, v7.4h,  v2.h[2]
            /* From 117 down the whole window is in registers. */
    117:    umlal       v14.4s, v17.4h, v2.h[1]
            umlal2      v15.4s, v17.8h, v2.h[1]
            umlal       v14.4s, v6.4h,  v2.h[1]
            umlal2      v15.4s, v6.8h,  v2.h[1]
    116:    umlal2      v14.4s, v17.8h, v2.h[0]
            umlal       v15.4s, v18.4h, v2.h[0]
            umlal2      v14.4s, v5.8h,  v2.h[0]
            umlal       v15.4s, v6.4h,  v2.h[0]
    115:    umlal       v14.4s, v18.4h, v1.h[7]
            umlal2      v15.4s, v18.8h, v1.h[7]
            umlal       v14.4s, v5.4h,  v1.h[7]
            umlal2      v15.4s, v5.8h,  v1.h[7]
    114:    umlal2      v14.4s, v18.8h, v1.h[6]
            umlal       v15.4s, v19.4h, v1.h[6]
            umlal2      v14.4s, v4.8h,  v1.h[6]
            umlal       v15.4s, v5.4h,  v1.h[6]
    113:    umlal       v14.4s, v19.4h, v1.h[5]
            umlal2      v15.4s, v19.8h, v1.h[5]
            umlal       v14.4s, v4.4h,  v1.h[5]
            umlal2      v15.4s, v4.8h,  v1.h[5]
    112:    umlal2      v14.4s, v19.8h, v1.h[4]
            umlal       v15.4s, v20.4h, v1.h[4]
            umlal2      v14.4s, v31.8h, v1.h[4]
            umlal       v15.4s, v4.4h,  v1.h[4]
    111:    umlal       v14.4s, v20.4h, v1.h[3]
            umlal2      v15.4s, v20.8h, v1.h[3]
            umlal       v14.4s, v31.4h, v1.h[3]
            umlal2      v15.4s, v31.8h, v1.h[3]
    110:    umlal2      v14.4s, v20.8h, v1.h[2]
            umlal       v15.4s, v21.4h, v1.h[2]
            umlal2      v14.4s, v30.8h, v1.h[2]
            umlal       v15.4s, v31.4h, v1.h[2]
    109:    umlal       v14.4s, v21.4h, v1.h[1]
            umlal2      v15.4s, v21.8h, v1.h[1]
            umlal       v14.4s, v30.4h, v1.h[1]
            umlal2      v15.4s, v30.8h, v1.h[1]
    108:    umlal2      v14.4s, v21.8h, v1.h[0]
            umlal       v15.4s, v22.4h, v1.h[0]
            umlal2      v14.4s, v29.8h, v1.h[0]
            umlal       v15.4s, v30.4h, v1.h[0]
    107:    umlal       v14.4s, v22.4h, v0.h[7]
            umlal2      v15.4s, v22.8h, v0.h[7]
            umlal       v14.4s, v29.4h, v0.h[7]
            umlal2      v15.4s, v29.8h, v0.h[7]
    106:    umlal2      v14.4s, v22.8h, v0.h[6]
            umlal       v15.4s, v23.4h, v0.h[6]
            umlal2      v14.4s, v28.8h, v0.h[6]
            umlal       v15.4s, v29.4h, v0.h[6]
    105:    umlal       v14.4s, v23.4h, v0.h[5]
            umlal2      v15.4s, v23.8h, v0.h[5]
            umlal       v14.4s, v28.4h, v0.h[5]
            umlal2      v15.4s, v28.8h, v0.h[5]
    104:    umlal2      v14.4s, v23.8h, v0.h[4]
            umlal       v15.4s, v24.4h, v0.h[4]
            umlal2      v14.4s, v27.8h, v0.h[4]
            umlal       v15.4s, v28.4h, v0.h[4]
    103:    umlal       v14.4s, v24.4h, v0.h[3]
            umlal2      v15.4s, v24.8h, v0.h[3]
            umlal       v14.4s, v27.4h, v0.h[3]
            umlal2      v15.4s, v27.8h, v0.h[3]
    102:    umlal2      v14.4s, v24.8h, v0.h[2]
            umlal       v15.4s, v25.4h, v0.h[2]
            umlal2      v14.4s, v26.8h, v0.h[2]
            umlal       v15.4s, v27.4h, v0.h[2]
    101:    umlal       v14.4s, v25.4h, v0.h[1]
            umlal2      v15.4s, v25.8h, v0.h[1]
            umlal       v14.4s, v26.4h, v0.h[1]
            umlal2      v15.4s, v26.8h, v0.h[1]

            /* Round/saturate down to 8-bit output in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Push the oldest register into the ring buffer (with wrap),
             * then slide the register window along by one.
             */
            st1         {v17.16b}, [x9], #16
            bic         x9, x9, #0x40
            mov         v17.16b, v18.16b
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
   1179 
   1180 /* Dedicated function wrapper for the fetch macro, for the cases where
   1181  * performance isn't that important, to keep code size down.
   1182  */
PRIVATE(fetch_generic_asm)
            /* Save x10/x11 so callers see them unchanged across the
             * `fetch` macro expansion (which presumably advances them —
             * its definition is not visible here).
             */
            stp         x10, x11, [sp, #-16]!
            fetch
            ldp         x10, x11, [sp], #16
            ret
END(fetch_generic_asm)
   1189 
   1190 
   1191 /* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
   1192  * beyond that limit, and filling the rest of the vector with the last legal
   1193  * pixel.
   1194  * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
   1195  * Note: This function can read beyond the right edge of input if the image is
   1196  * narrower than 16 bytes.
   1197  */
PRIVATE(fetch_clampleft1)
            stp         x29, x30, [sp, #-16]!   // x30 (LR) is clobbered by the bl below
            bl          fetch_generic_asm
            /* Replicate the first legal pixel (one halfword per channel
             * here: 1-channel data) across v8/v9 for left-edge padding.
             */
            dup         v8.8h, v10.h[0]
            dup         v9.8h, v10.h[0]
            ands        x12, x10, #15           // x12 = columns of left padding needed
            beq         1f                      // aligned: no padding required
            /* Rewind the pointers/counters by the pad amount (x1/x15/x19
             * are maintained by the caller — roles not visible here).
             */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            /* Build the padded window via the stack: store v8..v11
             * contiguously, then reload v10/v11 from sp+32-2*x12, i.e.
             * shifted so x12 replicated edge pixels fill the left side.
             */
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft1)
   1218 
PRIVATE(fetch_clampleft4)
            stp         x29, x30, [sp, #-16]!   // x30 (LR) is clobbered by the bl below
            bl          fetch_generic_asm
            /* 4-channel variant: the first legal pixel is 64 bits (four
             * halfwords), so replicate at doubleword granularity.
             */
            dup         v8.2d, v10.d[0]
            dup         v9.2d, v10.d[0]
            ands        x12, x10, #15           // x12 = columns of left padding needed
            beq         1f                      // aligned: no padding required
            /* Rewind the caller-maintained pointers/counters (roles of
             * x1/x15/x19 not visible here) by the pad amount.
             */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            /* Shift the fetched data right via the stack, pulling the
             * replicated edge pixels in on the left (see fetch_clampleft1).
             */
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft4)
   1239 
   1240 /* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
   1241  * reading memory beyond that limit, and filling the rest of the vector with
   1242  * the last legal pixel.
   1243  * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
   1244  * Note: This function can read beyond the left edge of input if the image is
   1245  * narrower than 16 bytes.
   1246  */
PRIVATE(fetch_clampright1)
            stp         x29, x30, [sp, #-16]!   // x30 (LR) is clobbered by the bl below
            sub         x12, xzr, x11           // x12 = -x11
            ands        x12, x12, #15           // x12 = (16 - (x11 & 15)) & 15 = columns of right padding
            beq         1f                      // full 16 columns available: no padding
            /* Rewind the caller-maintained pointers (roles of x1/x15/x19
             * not visible here) so the fetch stays inside the image.
             */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            /* Replicate the last legal pixel into v12/v13 for padding. */
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            sub         x12, xzr, x11
            and         x12, x12, #15
            /* Shift the fetched data left via the stack, pulling the
             * replicated edge pixels in on the right.
             */
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
            /* Fast path: no padding needed; still deliver v12/v13 filled
             * with the last legal pixel, as the contract requires.
             */
1:          bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright1)
   1273 
PRIVATE(fetch_clampright4)
            stp         x29, x30, [sp, #-16]!   // x30 (LR) is clobbered by the bl below
            sub         x12, xzr, x11           // x12 = -x11
            ands        x12, x12, #15           // x12 = (16 - (x11 & 15)) & 15 = columns of right padding
            beq         1f                      // full 16 columns available: no padding
            /* Rewind the caller-maintained pointers (roles of x1/x15/x19
             * not visible here) so the fetch stays inside the image.
             */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            /* 4-channel variant: the last legal pixel is 64 bits, so
             * replicate at doubleword granularity into v12/v13.
             */
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            sub         x12, xzr, x11
            and         x12, x12, #15
            /* Shift the fetched data left via the stack, pulling the
             * replicated edge pixels in on the right.
             */
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
            /* Fast path: no padding needed; still deliver v12/v13 filled
             * with the last legal pixel, as the contract requires.
             */
1:          bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright4)
   1300 
   1301 /* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
   1302  * value across to fill the rest of the register pair.  Used for filling the
   1303  * right hand edge of the window when reading too close to the right hand edge
   1304  * of the image.
   1305  * Also returns a dup-ed copy of the last element in v12 for the tail-fill
   1306  * case (this happens incidentally in common path, but must be done
   1307  * deliberately in the fast-out path).
   1308  */
PRIVATE(prefill_sweepright1)
            ands        x12, x11, #15           // x12 = index just past the last valid column (0 => 16)
            beq         1f                      // all 16 columns valid: fast path
            sub         x12, x12, #1            // x12 = index of last valid halfword
            /* Sweep via the stack: spill v10/v11, replicate the last
             * valid halfword into v12/v13, then overwrite everything
             * from that element onward with the replicated value before
             * reloading v10/v11.
             */
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1
            ld1r        {v12.8h}, [x12]
            ld1r        {v13.8h}, [x12]
            st1         {v12.8h,v13.8h}, [x12]
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
            /* Fast path: nothing to sweep; just provide the dup-ed last
             * element in v12/v13 for the tail-fill case.
             */
1:          dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ret
END(prefill_sweepright1)
   1326 
   1327 PRIVATE(prefill_sweepright4)
   1328             ands        x12, x11, #15
   1329             beq         1f
   1330             sub         x12, x12, #4
   1331             sub         sp, sp, #64
   1332             st1         {v10.8h,v11.8h}, [sp]
   1333             add         x12, sp, x12, LSL #1
   1334             ld1r        {v12.2d}, [x12]
   1335             st1         {v13.8h}, [x12]
   1336             ld1         {v10.8h,v11.8h}, [sp]
   1337             add         sp, sp, #64
   1338             ret
   1339 1:          dup         v12.2d, v11.d[1]
   1340             dup         v13.2d, v11.d[1]
   1341             ret
   1342 END(prefill_sweepright4)
   1343 
   1344 /* The main loop keeps a sliding window of data that has already been convolved
   1345  * in the vertical axis for the current line.  This usually stays in the
   1346  * register file, but spills to memory for large windows.  The first thing that
   1347  * needs to be done at start-up is to fill this window with image data, taking
   1348  * into account the padding needed if the left or right edges of the image fall
   1349  * within this window.
   1350  */
   1351 
/* Because the window is in the register file, writes to it cannot be indexed
   1353  * by another register.  Consequently the fill loops are unrolled to address
   1354  * the registers directly.  This macro distinguishes between writes to the
   1355  * register file and writes to the spill buffer (indicated by a destination
   1356  * register named xx).
   1357  */
/* Emit one 32-byte chunk of the window to its destination: either a pair of
 * named SIMD registers (\ra/\rb) or the spill buffer (destination name `xx`).
 *   \ra,\rb   -- destination registers, or xx for the spill buffer
 *   \sra,\srb -- source registers holding the data to store
 * All dispatch happens at assembly time via .ifc string comparison.
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
            st1         {\sra,\srb}, [x9], #32  // both halves go to the spill buffer
    .else
            // Boundary case: first half spills, second half lands in a register.
            bic         x9, x9, #0x40           // wrap pointer back into the 0x40-byte, 0x80-aligned buffer
            st1         {\sra}, [x9], #16
            mov         \rb, \srb
    .endif
  .else
    // Pure register destination; skip moves that would be no-ops.
    .ifnc \ra,\sra
            mov         \ra, \sra
    .endif
    .ifnc \rb,\srb
            mov         \rb, \srb
    .endif
  .endif
.endm
   1376 
   1377 /* This macro provides the list of registers representing the window, and the
   1378  * cases where the register file is too small and a spill buffer is used
   1379  * instead.
   1380  * Since several specialisations of each function are generated, this also
   1381  * culls superfluous iterations, and sets the variable `i` for subsequent
   1382  * macros indicating the current index into the window.
   1383  */
/* Unroll \macro once per 16-byte chunk of the window, from chunk 13 down to 1,
 * pairing each position with its destination registers (or `xx` for the spill
 * buffer).  Positions beyond the configured windowsize are culled at assembly
 * time; `i` is set to the chunk's byte index into the window for use by the
 * expanded macro.
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)     // current byte index into the window
\label\macro\line:
            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
            // Chunks 13..11 overflow the register file and go to the spill buffer.
            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
\label\macro\()0:
            b           \label\()_end           // window fully filled by this stage
  .purgem ifneeded
.endm
   1409 
   1410 /* These macros represent the possible stages of filling the window.
   1411  * Each macro is unrolled enough times that it can fill the entire window
   1412  * itself, but normally it will have to hand control to subsequent macros
   1413  * part-way through and this is done using labels named \next and \after, where
   1414  * \next is the next macro starting at the same window position and \after is
   1415  * the next macro starting after the current window position.
   1416  */
   1417 
   1418 /* leftfill: v8 and v9 contain the left padding value.  While the window
   1419  * extends outside of the image on the left-hand side, and at least 16 more
   1420  * padding values are needed in the window, store v8 and v9 into the window.
   1421  * Otherwise skip forward to storing image data.
   1422  */
.macro prefill_leftfill, next, after, ra, rb, step
            cmp         x10, #i+16              // does legal data begin before the end of this chunk?
            blo         \next                   // yes: hand over to the leftedge stage
            prefill_out \ra, \rb, v8.16b, v9.16b    // no: whole chunk is left padding
.endm
   1428 
   1429 /* leftedge: The very first non-fill or partial-fill chunk from the image is
   1430  * already loaded (as it was used to calculate the left padding value), so
   1431  * store it here, and then drop into the regular load/store cycle in the next
   1432  * macro.
   1433  */
.macro prefill_leftedge, next, after, ra, rb, step
1:          prefill_out \ra, \rb, v10.16b, v11.16b  // store the already-fetched first chunk
            b           \after                  // then drop into the regular fetch cycle
.endm
   1438 
   1439 /* dofetch: Copy chunks of the image into the window without any complications
   1440  * from edge conditions.
   1441  */
.macro prefill_dofetch, next, after, ra, rb, step
            cmp         x11, #i+16              // does legal data end within this chunk?
            bls         \next                   // yes: hand over to the rightedge stage
            bl          fetch_generic_asm       // fetch 16 more convolved columns into v10/v11
            prefill_out \ra, \rb, v10.16b, v11.16b
.endm
   1448 
   1449 /* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
   1450  * the right-hand edge of the image.  In that case sweep the last valid pixel
   1451  * across the rest of the chunk, and in either case prepare padding data in v12
   1452  * and v13 for the next macro.  This is done in fetch_clampright.
   1453  * This only happens once before going on to the next macro.
   1454  * Sometimes leftedge also covers the rightedge case, in which case this has
   1455  * to be skipped altogether.
   1456  */
.macro prefill_rightedge, next, after, ra, rb, step
            cmp         x11, #i                 // any legal data left for this chunk?
            bls         \next                   // no: rightfill only from here on
            bl          fetch_clampright\step   // fetch, clamp, and prepare padding in v12/v13
            prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after                  // rightedge happens at most once
.endm
   1464 
   1465 /* rightfill: The rest of the window is simply filled with right padding from
   1466  * v12 and v13.
   1467  */
.macro prefill_rightfill, next, after, ra, rb, step
            prefill_out \ra, \rb, v12.16b, v13.16b  // pure right padding prepared by fetch_clampright/sweepright
.endm
   1471 
   1472 /* Here all of the macros above are unrolled and laid out in the proper order.
   1473  */
/* Lay out the five fill stages in order; each stage's unrolled copy at a given
 * window position chains to the next stage at the same or following position.
 */
.macro prefill_body, max_r, step, label
            prefill_list leftfill,  leftedge,   \max_r, \step, \label
            prefill_list leftedge,  dofetch,    \max_r, \step, \label
            prefill_list dofetch,   rightedge,  \max_r, \step, \label
            prefill_list rightedge, rightfill,  \max_r, \step, \label
            prefill_list rightfill, oops,       \max_r, \step, \label
\label\()_end:
.endm
   1482 
   1483 
   1484 /* Fill the convolution window with context data.  The aim here is to load
   1485  * exactly 2*r columns, and in the main loop to read as many columns as will be
   1486  * written.  This is complicated by the window being divided into chunks at
   1487  * register boundaries, and the need to handle cases when the input starts very
   1488  * close to the left or right (or both) edges of the image and the need to fill
   1489  * the spaces that leaves with left and right edge padding values.
   1490  *
   1491  * Input:
   1492  *      x1 -- src
   1493  *      x2 -- pitch
   1494  *      x3 -- count
   1495  *      x4 -- available image data right of src pointer
   1496  *      x5 -- r
   1497  *      x6 -- rup
   1498  *      x7 -- rdn
   1499  *      x8 -- available image data left of src pointer
   1500  *      x9 -- buffer (if needed)
   1501  *      x13 = -pitch
   1502  *      x15 = top-row in
   1503  *      x19 = bottom-row in
   1504  * Output:
   1505  *      x4 -= min(inlen, count + windowsize - centertap)
   1506  *      x1 += min(inlen, count + windowsize - centertap)
   1507  *      x15 += min(inlen, count + windowsize - centertap)
   1508  *      x19 += min(inlen, count + windowsize - centertap)
   1509  * Modifies:
   1510  *      x10 -- fill start index in the window
   1511  *      x11 -- fill stop index in the window
   1512  *      x12 -- scratch
   1513  */
.macro prefill step=1, max_r=25, label=xx
.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)   // 2*r columns, rounded up to a 16-byte chunk
.set centertap, (windowsize - \max_r * \step)               // window index of the source pointer's column
            mov         x10, #centertap
            subs        x10, x10, x8            // fill start = centertap - available-left, clamped at 0
            csel        x10, xzr, x10, lo

            subs        x11, x4, #windowsize - centertap    // fill stop = available-right + centertap, capped
            csel        x11, xzr, x11, hs       //   at windowsize when plenty of data remains
            add         x11, x11, #windowsize

            /* x10 indicates where in the window legal image data begins.
             * x11 indicates where in the window legal image data ends.
             * When starting near the centre of a large image these would be
             * zero and windowsize respectively, but when starting near the
             * edges this can change.
             * When starting on the leftmost pixel, x10 will be centertap.
             * When starting on the rightmost pixel, x11 will be centertap+1.
             */

            /* x4 indicates how much data there is between the current pointers
             * and the right edge of the image.  The pointers currently point
             * to the data needed at centertap.  The subsequent code will
             * consume (windowsize - x10) data, but only the data from
             * centertap to windowsize comes out of x4's budget.
             */
1:          subs        x4, x4, #windowsize - centertap
            csel        x4, xzr, x4, lo

            /* And the pointers need to rewind to the start of the window.
             */
            sub         x1, x1, #centertap
            sub         x15, x15, #centertap
            sub         x19, x19, #centertap

            /* Unless x8 indicated that there wasn't that much data available.
             */
            add         x1, x1, x10
            add         x15, x15, x10
            add         x19, x19, x10

            /* Get the first chunk, and add padding to align it to the window
             * if necessary.
             */
            bl          fetch_clampleft\step

            /* Sometimes the start and the end of the window are in the same
             * chunk.  In that case both ends need filler at the outset.
             */
            sub         x12, x11, #1            // (x10 ^ (x11-1)) < 16 iff start and last valid
            eor         x12,  x10, x12          //   element share one 16-byte chunk
            cmp         x12, #16
            bhs         1f
            bl          prefill_sweepright\step

            /* Iterate through all the points in the window and fill them in
             * with padding or image data as needed.
             */
1:          prefill_body \max_r, \step, \label
.endm
   1574 
   1575 /* The main body of the convolve functions.  Having already pre-filled the
   1576  * convolution window with 2*r input values, the logic settles into a regular
   1577  * pattern of reading and writing at a 1:1 rate until either input or output
   1578  * expires.  The input leads the output by r values, so when processing all the
   1579  * way to the right-hand edge, or within r pixels of that edge, the input will
   1580  * run out first.  In the case of very narrow images, or sub-windows starting
   1581  * near the right edge, the input may already have run out while the
   1582  * convolution window was being filled and this loop will start with a
   1583  * zero-length input.
   1584  *
   1585  * Once the input runs out, the rest of the output must be processed by padding
   1586  * the remainder of the window with pad value from the last valid pixel from
   1587  * the source.
   1588  *
   1589  * Input:
   1590  *      x0 = dst
   1591  *      x1 = src
   1592  *      x2 = pitch
   1593  *      x3 = count
   1594  *      x4 = inlen
   1595  *      x5 = r
   1596  *      x6 = rup
   1597  *      x7 = rdn
   1598  *      x9 = buffer
   1599  *      x13 = -pitch
   1600  *      x15 = top-row in
   1601  *      x19 = bottom-row in
   1602  * Modifies
   1603  *      x8 = fetch code pointer
   1604  */
.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""

            /* If x4 >= x3 then there's no need for clipping.  The main loop
             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
             * no greater than x3 and use x4 for the loop.
             * However, if x4 comes out of the loop with less than 16 bytes
             * left, a partial read would be necessary to avoid reading beyond
             * the end of the image.  To avoid this, clamp x4 to the next
             * multiple of 16, which is still sufficient to force it out of the
             * loop but doesn't imply a rewind.
             */
            add         x12, x3, #15
            bic         x12, x12, #15
            cmp         x4, x12
            csel        x4, x12, x4, hi

            /* First calculate the entry-point into the internal fetch logic.
             * This is done so the same function can service several kernel
             * sizes.  The entry point is offset backwards from \labelnc in
             * proportion to r (x5); the per-r code sizes fix the multipliers.
             */
            adrp        x8, \labelnc
            add         x8, x8, #:lo12:\labelnc
            sub         x8, x8, x5, LSL #5
            sub         x8, x8, x5, LSL #3
            cmp         x5, x6
            ccmp        x5, x7, #0, eq
            beq         5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
             */
            adrp        x8, \labelc
            add         x8, x8, #:lo12:\labelc
            sub         x8, x8, x5, LSL #6
            add         x8, x8, x5, LSL #3
            b           5f

            /* Main loop: consume 16 input columns and produce 16 output bytes
             * per iteration, until either the input budget (x4) or the output
             * budget (x3) runs out.
             */
            .align  4
3:          /* first perform a vertical convolution from memory to get the next
             * 16 taps of the horizontal window into the register file...
             */
            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* ...then perform a horizontal convolution on that window to
             * produce eight output bytes, and slide the window along.
             * This has to be done twice to match the 16-way vertical pass.
             * It would be preferable to have twice the work done in \core, but
             * that would demand yet another variant on those macros and would
             * perturb the register allocation severely.
             */
            \core
            st1         {v15.8b}, [x0], #8
            \core
            st1         {v15.8b}, [x0], #8

            sub         x3, x3, #16
5:          subs        x4, x4, #16
            bhi         3b
            /* Here there's 16 or fewer bytes available before the edge of the
             * source image.  x4 holds that count minus 16 (because it was
             * decremented before the first iteration ran).  The last read may
             * not be a whole chunk, and beyond that a fill value must be used.
             *
             * Of course, none of that matters if there's no more output to
             * produce...
             */
            cbz         x3, 5f

            /* Oh well. */
            adds        x4, x4, #16
            bne         1f
            // Input exhausted exactly at a chunk boundary: pad from the last
            // valid pixel, which is at the top of v9.
  .if \step==1
            dup         v10.8h, v9.h[7]
            dup         v11.8h, v9.h[7]
  .else
            dup         v10.2d, v9.d[1]
            dup         v11.2d, v9.d[1]
  .endif
            b           3f

            /* To avoid reading past end of input, rewind pointers by (16-x4)
             * to ensure that they're exactly 16 bytes from the edge.
             */
1:          mov         x11, x4
            bl          fetch_clampright\step
            /* Now to put this padding to use, perform any remaining
             * iterations.  This is done at half the rate of the main loop,
             * because there's no longer pressure from a 16-lane window filler.
             */
3:          \core
  .if \step==1
            dup         v11.8h, v11.h[7]        // keep extending the padding value
  .else
            dup         v11.2d, v11.d[1]
  .endif
            subs        x3, x3, #8
            blo         4f                      // fewer than 8 outputs remain: piecewise store
            st1         {v15.8b}, [x0], #8
            bne         3b
            b           5f

            /* If the final iteration contained 0 < l < 8 values, then perform
             * a piecewise store of the final vector.
             */
4:          tbz         x3, #2, 1f
            st1         {v15.s}[0], [x0], #4
            ext         v15.8b, v15.8b, v15.8b, #4
1:          tbz         x3, #1, 1f
            st1         {v15.h}[0], [x0], #2
            ext         v15.8b, v15.8b, v15.8b, #2
1:          tbz         x3, #0, 5f
            st1         {v15.b}[0], [x0], #1
            ext         v15.8b, v15.8b, v15.8b, #1
5:          mov         x0, #0
.endm
   1721 
   1722 
/* Generate one single-channel convolve function per tuned radius.  Each
 * instance pre-fills the window for its maximum radius, then runs the shared
 * main loop with the matching horizontal-convolve core.
 */
.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
            stp         x29,x30, [sp, #-16]!    // save frame/link regs (bl is used internally)

            prefill     step=1, max_r=\r, label=.Lcnv1_\r

            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp         x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr
   1735 
/* Generate one four-channel convolve function per tuned radius.  Unlike the
 * single-channel case this also carves out an aligned spill buffer for window
 * chunks that do not fit in the register file.
 */
.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
            sub         x9, sp, #0x40           // candidate buffer just below current sp
            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!    // frame + buffer + alignment slack
            bic         x9, x9, #0x7f           // align buffer to 128 bytes

            /* x9 now points to a 0x40 byte buffer on the stack whose address
             * has the low 7 bits clear.  This allows easy address calculation
             * in the wrap-around cases.
             */

            prefill     step=4, max_r=\r, label=.Lcnv4_\r

            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
            ret
END(convolve4_\r)
.endr
   1755 
   1756 /* void rsdIntrinsicBlurU1_K(
   1757  *                  void *out,      // x0
   1758  *                  void *in,       // x1
   1759  *                  size_t w,       // x2
   1760  *                  size_t h,       // x3
   1761  *                  size_t p,       // x4
   1762  *                  size_t x,       // x5
   1763  *                  size_t y,       // x6
   1764  *                  size_t count,   // x7
   1765  *                  size_t r,       // [sp]
   1766  *                  uint16_t *tab); // [sp,#8]
   1767  */
ENTRY(rsdIntrinsicBlurU1_K)
            stp         x19,x30, [sp, #-16]!    // save callee-saved x19 and the link register
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]  // save low halves of callee-saved v8-v15
            st1         {v12.1d - v15.1d}, [x8]
            mov         x8, x5          // x
            ldr         w5, [sp,#80]    // r
            sub         x9, x2, x8      // w - x
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            mov         x3, x7          // count
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88] // tab

            add         x1, x1, x8      // src += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)

            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // x15 = src - rup * pitch (top row in)
            madd        x19, x2, x7, x1 // x19 = src + rdn * pitch (bottom row in)

            ld1         {v0.8h,v1.8h}, [x12], #32   // load the 32-entry coefficient table
            ld1         {v2.8h,v3.8h}, [x12], #32

            adr         x30, 1f         // set return address so convolve1_* returns to the epilogue
  .irp r, TUNED_LIST1
            cmp         x5, #\r
            bls         convolve1_\r    // tail-dispatch to the smallest variant covering r
  .endr
            b           convolve1_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore v8-v15 low halves
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)
   1811 
   1812 /* void rsdIntrinsicBlurU4_K(
   1813  *                  void *out,      // x0
   1814  *                  void *in,       // x1
   1815  *                  size_t w,       // x2
   1816  *                  size_t h,       // x3
   1817  *                  size_t p,       // x4
   1818  *                  size_t x,       // x5
   1819  *                  size_t y,       // x6
   1820  *                  size_t count,   // x7
   1821  *                  size_t r,       // [sp]
   1822  *                  uint16_t *tab); // [sp,#8]
   1823  */
ENTRY(rsdIntrinsicBlurU4_K)
            stp         x19,x30, [sp, #-16]!    // save callee-saved x19 and the link register
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]  // save low halves of callee-saved v8-v15
            st1         {v12.1d - v15.1d}, [x8]
            lsl         x8, x5, #2      // x, scaled to bytes (4 bytes per pixel)
            lsl         x2, x2, #2      // w, scaled to bytes
            ldr         w5, [sp,#80]    // r
            sub         x9, x2, x8      // (w - x) in bytes
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            lsl         x3, x7, #2      // count, scaled to bytes
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88]  // tab

            add         x1, x1, x8      // in += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)


            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // x15 = in - rup * pitch (top row in)
            madd        x19, x2, x7, x1 // x19 = in + rdn * pitch (bottom row in)

            ld1         {v0.8h,v1.8h}, [x12], #32   // load the 32-entry coefficient table
            ld1         {v2.8h,v3.8h}, [x12], #32

            adr         x30, 1f         // set return address so convolve4_* returns to the epilogue
  .irp r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r    // tail-dispatch to the smallest variant covering r
  .endr
            b           convolve4_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore v8-v15 low halves
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)
   1869