Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
     18 #define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
     19 #define END(f) .fnend; .size f, .-f;
     20 
     21 #define ARCH_ARM_USE_BLUR_PRELOAD
     22 
     23 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
     24 .arm
     25 
     26 /* Number of fractional bits to preserve in intermediate results.  The
     27  * intermediate storage is 16-bit, and we started with 8 bit data (the integer
     28  * part), so this should be between 0 and 8.
     29  */
     30 .set FRACTION_BITS, 7
     31 
     32 .set MAX_R, 25
     33 
     34 
     35 /* A quick way of making a line of code conditional on some other condition.
     36  * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
     37  * `ifcc`:
     38  */
     39 .macro ifcc zzz:vararg
     40 .if cc
     41             \zzz
     42 .endif
     43 .endm
     44 
     45 /* It's not always clear that prefetching is beneficial and this needs further
     46  * testing on different cores, so it's made switchable here.
     47  */
     48 #if defined(ARCH_ARM_USE_BLUR_PRELOAD)
     49 #define VERTPLD(...) pld [__VA_ARGS__]
     50 #else
     51 #define VERTPLD(...) nop
     52 #endif
     53 
     54 /* Fetch 16 columns of bytes (regardless of image format), convolve these
     55  * vertically, and leave them in the register file.  If working near the top or
     56  * bottom of an image then clamp the addressing while loading the data in.
     57  *
     58  * The convolution is fully unrolled for windows up to max_r, with the
     59  * outermost edges calculated first.  This way it's possible to branch directly
     60  * into the relevant part of the code for an arbitrary convolution radius.  Two
     61  * variants of the loop are produced; one eliminates the clamping code for a
     62  * slight speed advantage.
     63  *
     64  * Where the macro is called with reg=x, the specified register is taken to
     65  * contain a pre-calculated pointer into one of the two loops.
     66  *
     67  * Input:
     68  *      r1 -- src
     69  *      r2 -- pitch
     70  *      r5 -- r
     71  *      r6 -- rup (r, unless clipped to top of source image)
     72  *      r7 -- rdn (r, unless clipped to bottom of source image)
     73  *      r12 -- switch index
     74  *      q0-q3 -- coefficient table
     75  * Output:
     76  *      r1 += 16
     77  *      q10,q11 -- 16 convolved columns
     78  * Modifies:
     79  *      r10 = upper row pointer
     80  *      r11 = lower row pointer
     81  *      q12-q15 = temporary sums
     82  */
     83 .macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
     84   .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
     85 
     86             vld1.8      {d30,d31}, [r1]
     87             mls         r10, r2, r6, r1
     88 
     89             vmovl.u8    q14, d30
     90             VERTPLD(r1, #32)
     91             vmovl.u8    q15, d31
     92   .if \max_r < 16 // approximate
     93     ifcc    adr         \reg, 1f
     94   .else
     95     ifcc    ldr         \reg, 2f
     96 1:  ifcc    add         \reg, \reg, pc
     97   .endif
     98 
     99             vmull.u16   q12, d28, d0[0]
    100     ifcc    sub         \reg, r5, LSL #6
    101             vmull.u16   q13, d29, d0[0]
    102             mla         r11, r2, r7, r1
    103             vmull.u16   q14, d30, d0[0]
    104             add         r1, r1, #16
    105             vmull.u16   q15, d31, d0[0]
    106             bx          \reg
    107 
    108      ifcc   .align 2
    109   2: ifcc   .word       1f-1b-8
    110 
    111   /* This version of the vertical fetch loop body is used away from the edges
    112    * of the source image.  The pointers start at the top and bottom source rows
    113    * and work their way towards the centre on each iteration.  This way the
    114    * number of taps used can be controlled by jumping directly into the middle
    115    * of the loop and running to completion.
    116    * If the loop body changes size then the code which caculates the address of
    117    * the initial iteration must be updated to accordingly.
    118    */
    119   .macro vertfetch_noclamp i, dreg
    120     .if 0 < \i && \i <= \max_r
    121             vld1.8      {d20,d21}, [r10], r2
    122             vld1.8      {d22,d23}, [r11]
    123             sub         r11, r11, r2
    124             vswp        d21, d22
    125             VERTPLD(r10, #32)
    126             vaddl.u8    q10, d20, d21
    127             vaddl.u8    q11, d22, d23
    128             vmlal.u16   q12, d20, \dreg
    129             VERTPLD(r11, #32)
    130             vmlal.u16   q13, d21, \dreg
    131             vmlal.u16   q14, d22, \dreg
    132             vmlal.u16   q15, d23, \dreg
    133     .endif
    134   .endm
    135 
    136   /* This version of the vertical fetch loop body is used near the edges of the
    137    * source image, where one or both of the accesses may start with a clamped
    138    * value, and the row addresses only begin to change after some number of
    139    * iterations before the end.
    140    * If the loop body changes size then the code which caculates the address of
    141    * the initial iteration must be updated to accordingly.
    142    */
    143   .macro vertfetch_clamped i, dreg
    144     .if 0 < \i && \i <= \max_r
    145             vld1.8      {d20,d21}, [r10]
    146             vld1.8      {d22,d23}, [r11]
    147             cmp         r6, #\i
    148             vswp        d21, d22
    149             VERTPLD(r10, #32)
    150             vaddl.u8    q10, d20, d21
    151             addhs       r10, r10, r2
    152             vaddl.u8    q11, d22, d23
    153             cmp         r7, #\i
    154             vmlal.u16   q12, d20, \dreg
    155             VERTPLD(r11, #32)
    156             vmlal.u16   q13, d21, \dreg
    157             subhs       r11, r11, r2
    158             vmlal.u16   q14, d22, \dreg
    159             nop
    160             vmlal.u16   q15, d23, \dreg
    161     .endif
    162   .endm
    163 
    164   /* Entry into this unrolled loop is computed as a negative index from
    165    * \labelc at the end of the block.
    166    */
    167   .align 4
    168   vertfetch_clamped 27, d6[3]
    169   vertfetch_clamped 26, d6[2]
    170   vertfetch_clamped 25, d6[1]
    171   vertfetch_clamped 24, d6[0]
    172   vertfetch_clamped 23, d5[3]
    173   vertfetch_clamped 22, d5[2]
    174   vertfetch_clamped 21, d5[1]
    175   vertfetch_clamped 20, d5[0]
    176   vertfetch_clamped 19, d4[3]
    177   vertfetch_clamped 18, d4[2]
    178   vertfetch_clamped 17, d4[1]
    179   vertfetch_clamped 16, d4[0]
    180   vertfetch_clamped 15, d3[3]
    181   vertfetch_clamped 14, d3[2]
    182   vertfetch_clamped 13, d3[1]
    183   vertfetch_clamped 12, d3[0]
    184   vertfetch_clamped 11, d2[3]
    185   vertfetch_clamped 10, d2[2]
    186   vertfetch_clamped  9, d2[1]
    187   vertfetch_clamped  8, d2[0]
    188   vertfetch_clamped  7, d1[3]
    189   vertfetch_clamped  6, d1[2]
    190   vertfetch_clamped  5, d1[1]
    191   vertfetch_clamped  4, d1[0]
    192   vertfetch_clamped  3, d0[3]
    193   vertfetch_clamped  2, d0[2]
    194   vertfetch_clamped  1, d0[1]
    195   vertfetch_clamped  0, d0[0]
    196   1:
    197   \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
    198 
    199   /* Entry into this unrolled loop is computed as a negative index from
    200    * \labelnc at the end of the block.
    201    */
    202   .align 4
    203   vertfetch_noclamp 27, d6[3]
    204   vertfetch_noclamp 26, d6[2]
    205   vertfetch_noclamp 25, d6[1]
    206   vertfetch_noclamp 24, d6[0]
    207   vertfetch_noclamp 23, d5[3]
    208   vertfetch_noclamp 22, d5[2]
    209   vertfetch_noclamp 21, d5[1]
    210   vertfetch_noclamp 20, d5[0]
    211   vertfetch_noclamp 19, d4[3]
    212   vertfetch_noclamp 18, d4[2]
    213   vertfetch_noclamp 17, d4[1]
    214   vertfetch_noclamp 16, d4[0]
    215   vertfetch_noclamp 15, d3[3]
    216   vertfetch_noclamp 14, d3[2]
    217   vertfetch_noclamp 13, d3[1]
    218   vertfetch_noclamp 12, d3[0]
    219   vertfetch_noclamp 11, d2[3]
    220   vertfetch_noclamp 10, d2[2]
    221   vertfetch_noclamp  9, d2[1]
    222   vertfetch_noclamp  8, d2[0]
    223   vertfetch_noclamp  7, d1[3]
    224   vertfetch_noclamp  6, d1[2]
    225   vertfetch_noclamp  5, d1[1]
    226   vertfetch_noclamp  4, d1[0]
    227   vertfetch_noclamp  3, d0[3]
    228   vertfetch_noclamp  2, d0[2]
    229   vertfetch_noclamp  1, d0[1]
    230   vertfetch_noclamp  0, d0[0]
    231   \labelnc :
    232 
    233   .purgem vertfetch_clamped
    234   .purgem vertfetch_noclamp
    235 
    236   2:        vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
    237             vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
    238             vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
    239             vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
    240 .endm /*}}}*/
    241 
    242 /* Some portion of the convolution window (as much as will fit, and all of it
    243  * for the uchar1 cases) is kept in the register file to avoid unnecessary
    244  * memory accesses.  This forces the horizontal loops to be unrolled because
    245  * there's no indexed addressing into the register file.
    246  *
    247  * As in the fetch macro, the operations are ordered from outside to inside, so
    248  * that jumping into the middle of the block bypasses the unwanted window taps.
    249  *
    250  * There are several variants of the macro because of the fixed offets of the
    251  * taps -- the wider the maximum radius the further the centre tap is from the
    252  * most recently fetched data.  This means that pre-filling the window requires
    253  * more data that won't be used and it means that rotating the window involves
    254  * more mov operations.
    255  *
    256  * When the buffer gets too big the buffer at [r9] is used.
    257  *
    258  * Input:
    259  *      q4-q11 -- convoltion window
    260  *      r9 -- pointer to additional convolution window data
    261  * Output:
    262  *      r9 -- updated buffer pointer (if used)
    263  *      d31 -- result to be stored
    264  * Modifies:
    265  *      r12 -- temp buffer pointer
    266  *      q12-q13 -- temporaries for load and vext operations.
    267  *      q14-q15 -- intermediate sums
    268  */
    269 #define TUNED_LIST1 8, 16
    270 .macro hconv1_8/*{{{*/
    271             vmull.u16   q14, d18, d0[0]
    272             vmull.u16   q15, d19, d0[0]
    273 
    274             ldr         r12, [pc, r5, LSL #2]
    275             add         pc, pc, r12
    276             bkpt
    277     100:    .word 101f-100b
    278             .word 102f-100b
    279             .word 103f-100b
    280             .word 104f-100b
    281             .word 105f-100b
    282             .word 106f-100b
    283             .word 107f-100b
    284             .word 108f-100b
    285     108:    vmlal.u16   q14, d16, d2[0]
    286             vmlal.u16   q15, d17, d2[0]
    287             vmlal.u16   q14, d20, d2[0]
    288             vmlal.u16   q15, d21, d2[0]
    289     107:    vext.u16    q12, q8, q9, #1
    290             vext.u16    q13, q9, q10, #7
    291             vmlal.u16   q14, d24, d1[3]
    292             vmlal.u16   q15, d25, d1[3]
    293             vmlal.u16   q14, d26, d1[3]
    294             vmlal.u16   q15, d27, d1[3]
    295     106:    vext.u16    q12, q8, q9, #2
    296             vext.u16    q13, q9, q10, #6
    297             vmlal.u16   q14, d24, d1[2]
    298             vmlal.u16   q15, d25, d1[2]
    299             vmlal.u16   q14, d26, d1[2]
    300             vmlal.u16   q15, d27, d1[2]
    301     105:    vext.u16    q12, q8, q9, #3
    302             vext.u16    q13, q9, q10, #5
    303             vmlal.u16   q14, d24, d1[1]
    304             vmlal.u16   q15, d25, d1[1]
    305             vmlal.u16   q14, d26, d1[1]
    306             vmlal.u16   q15, d27, d1[1]
    307     104:    //vext.u16    q12, q8, q9, #4
    308             //vext.u16    q13, q9, q10, #4
    309             vmlal.u16   q14, d17, d1[0]
    310             vmlal.u16   q15, d18, d1[0]
    311             vmlal.u16   q14, d19, d1[0]
    312             vmlal.u16   q15, d20, d1[0]
    313     103:    vext.u16    q12, q8, q9, #5
    314             vext.u16    q13, q9, q10, #3
    315             vmlal.u16   q14, d24, d0[3]
    316             vmlal.u16   q15, d25, d0[3]
    317             vmlal.u16   q14, d26, d0[3]
    318             vmlal.u16   q15, d27, d0[3]
    319     102:    vext.u16    q12, q8, q9, #6
    320             vext.u16    q13, q9, q10, #2
    321             vmlal.u16   q14, d24, d0[2]
    322             vmlal.u16   q15, d25, d0[2]
    323             vmlal.u16   q14, d26, d0[2]
    324             vmlal.u16   q15, d27, d0[2]
    325     101:    vext.u16    q12, q8, q9, #7
    326             vext.u16    q13, q9, q10, #1
    327             vmlal.u16   q14, d24, d0[1]
    328             vmlal.u16   q15, d25, d0[1]
    329             vmlal.u16   q14, d26, d0[1]
    330             vmlal.u16   q15, d27, d0[1]
    331 
    332             vqrshrn.u32 d28, q14, #16
    333             vqrshrn.u32 d29, q15, #16
    334             vqrshrn.u16 d31, q14, #FRACTION_BITS
    335 
    336             vmov        q8, q9
    337             vmov        q9, q10
    338             vmov        q10, q11
    339 .endm/*}}}*/
    340 
    341 .macro hconv1_16/*{{{*/
    342             vmull.u16   q14, d16, d0[0]
    343             vmull.u16   q15, d17, d0[0]
    344 
    345             ldr         r12, [pc, r5, LSL #2]
    346             add         pc, pc, r12
    347             bkpt
    348     100:    .word 101f-100b
    349             .word 102f-100b
    350             .word 103f-100b
    351             .word 104f-100b
    352             .word 105f-100b
    353             .word 106f-100b
    354             .word 107f-100b
    355             .word 108f-100b
    356             .word 109f-100b
    357             .word 110f-100b
    358             .word 111f-100b
    359             .word 112f-100b
    360             .word 113f-100b
    361             .word 114f-100b
    362             .word 115f-100b
    363             .word 116f-100b
    364     116:    //vext.u16    q12, q6, q7, #0
    365             //vext.u16    q13, q10, q11, #0
    366             vmlal.u16   q14, d12, d4[0]
    367             vmlal.u16   q15, d13, d4[0]
    368             vmlal.u16   q14, d20, d4[0]
    369             vmlal.u16   q15, d21, d4[0]
    370     115:    vext.u16    q12, q6, q7, #1
    371             vext.u16    q13, q9, q10, #7
    372             vmlal.u16   q14, d24, d3[3]
    373             vmlal.u16   q15, d25, d3[3]
    374             vmlal.u16   q14, d26, d3[3]
    375             vmlal.u16   q15, d27, d3[3]
    376     114:    vext.u16    q12, q6, q7, #2
    377             vext.u16    q13, q9, q10, #6
    378             vmlal.u16   q14, d24, d3[2]
    379             vmlal.u16   q15, d25, d3[2]
    380             vmlal.u16   q14, d26, d3[2]
    381             vmlal.u16   q15, d27, d3[2]
    382     113:    vext.u16    q12, q6, q7, #3
    383             vext.u16    q13, q9, q10, #5
    384             vmlal.u16   q14, d24, d3[1]
    385             vmlal.u16   q15, d25, d3[1]
    386             vmlal.u16   q14, d26, d3[1]
    387             vmlal.u16   q15, d27, d3[1]
    388     112:    //vext.u16    q12, q6, q7, #4
    389             //vext.u16    q13, q9, q10, #4
    390             vmlal.u16   q14, d13, d3[0]
    391             vmlal.u16   q15, d14, d3[0]
    392             vmlal.u16   q14, d19, d3[0]
    393             vmlal.u16   q15, d20, d3[0]
    394     111:    vext.u16    q12, q6, q7, #5
    395             vext.u16    q13, q9, q10, #3
    396             vmlal.u16   q14, d24, d2[3]
    397             vmlal.u16   q15, d25, d2[3]
    398             vmlal.u16   q14, d26, d2[3]
    399             vmlal.u16   q15, d27, d2[3]
    400     110:    vext.u16    q12, q6, q7, #6
    401             vext.u16    q13, q9, q10, #2
    402             vmlal.u16   q14, d24, d2[2]
    403             vmlal.u16   q15, d25, d2[2]
    404             vmlal.u16   q14, d26, d2[2]
    405             vmlal.u16   q15, d27, d2[2]
    406     109:    vext.u16    q12, q6, q7, #7
    407             vext.u16    q13, q9, q10, #1
    408             vmlal.u16   q14, d24, d2[1]
    409             vmlal.u16   q15, d25, d2[1]
    410             vmlal.u16   q14, d26, d2[1]
    411             vmlal.u16   q15, d27, d2[1]
    412     108:    //vext.u16    q12, q7, q8, #0
    413             //vext.u16    q13, q9, q10, #0
    414             vmlal.u16   q14, d14, d2[0]
    415             vmlal.u16   q15, d15, d2[0]
    416             vmlal.u16   q14, d18, d2[0]
    417             vmlal.u16   q15, d19, d2[0]
    418     107:    vext.u16    q12, q7, q8, #1
    419             vext.u16    q13, q8, q9, #7
    420             vmlal.u16   q14, d24, d1[3]
    421             vmlal.u16   q15, d25, d1[3]
    422             vmlal.u16   q14, d26, d1[3]
    423             vmlal.u16   q15, d27, d1[3]
    424     106:    vext.u16    q12, q7, q8, #2
    425             vext.u16    q13, q8, q9, #6
    426             vmlal.u16   q14, d24, d1[2]
    427             vmlal.u16   q15, d25, d1[2]
    428             vmlal.u16   q14, d26, d1[2]
    429             vmlal.u16   q15, d27, d1[2]
    430     105:    vext.u16    q12, q7, q8, #3
    431             vext.u16    q13, q8, q9, #5
    432             vmlal.u16   q14, d24, d1[1]
    433             vmlal.u16   q15, d25, d1[1]
    434             vmlal.u16   q14, d26, d1[1]
    435             vmlal.u16   q15, d27, d1[1]
    436     104:    //vext.u16    q12, q7, q8, #4
    437             //vext.u16    q13, q8, q9, #4
    438             vmlal.u16   q14, d15, d1[0]
    439             vmlal.u16   q15, d16, d1[0]
    440             vmlal.u16   q14, d17, d1[0]
    441             vmlal.u16   q15, d18, d1[0]
    442     103:    vext.u16    q12, q7, q8, #5
    443             vext.u16    q13, q8, q9, #3
    444             vmlal.u16   q14, d24, d0[3]
    445             vmlal.u16   q15, d25, d0[3]
    446             vmlal.u16   q14, d26, d0[3]
    447             vmlal.u16   q15, d27, d0[3]
    448     102:    vext.u16    q12, q7, q8, #6
    449             vext.u16    q13, q8, q9, #2
    450             vmlal.u16   q14, d24, d0[2]
    451             vmlal.u16   q15, d25, d0[2]
    452             vmlal.u16   q14, d26, d0[2]
    453             vmlal.u16   q15, d27, d0[2]
    454     101:    vext.u16    q12, q7, q8, #7
    455             vext.u16    q13, q8, q9, #1
    456             vmlal.u16   q14, d24, d0[1]
    457             vmlal.u16   q15, d25, d0[1]
    458             vmlal.u16   q14, d26, d0[1]
    459             vmlal.u16   q15, d27, d0[1]
    460 
    461             vqrshrn.u32 d28, q14, #16
    462             vqrshrn.u32 d29, q15, #16
    463             vqrshrn.u16 d31, q14, #FRACTION_BITS
    464 
    465             vmov        q6, q7
    466             vmov        q7, q8
    467             vmov        q8, q9
    468             vmov        q9, q10
    469             vmov        q10, q11
    470 .endm/*}}}*/
    471 
    472 .macro hconv1_25/*{{{*/
    473             vext.u16    q12, q6, q7, #7
    474             vmull.u16   q14, d24, d0[0]
    475             vmull.u16   q15, d25, d0[0]
    476 
    477             ldr         r12, [pc, r5, LSL #2]
    478             add         pc, pc, r12
    479             bkpt
    480     100:    .word 101f-100b
    481             .word 102f-100b
    482             .word 103f-100b
    483             .word 104f-100b
    484             .word 105f-100b
    485             .word 106f-100b
    486             .word 107f-100b
    487             .word 108f-100b
    488             .word 109f-100b
    489             .word 110f-100b
    490             .word 111f-100b
    491             .word 112f-100b
    492             .word 113f-100b
    493             .word 114f-100b
    494             .word 115f-100b
    495             .word 116f-100b
    496             .word 117f-100b
    497             .word 118f-100b
    498             .word 119f-100b
    499             .word 120f-100b
    500             .word 121f-100b
    501             .word 122f-100b
    502             .word 123f-100b
    503             .word 124f-100b
    504             .word 125f-100b
    505     125:    vext.u16    q12, q3, q4, #6
    506             vext.u16    q13, q10, q11, #0
    507             vmlal.u16   q14, d24, d6[1]
    508             vmlal.u16   q15, d25, d6[1]
    509             vmlal.u16   q14, d26, d6[1]
    510             vmlal.u16   q15, d27, d6[1]
    511     124:    vext.u16    q12, q3, q4, #7
    512             vext.u16    q13, q9, q10, #7
    513             vmlal.u16   q14, d24, d6[0]
    514             vmlal.u16   q15, d25, d6[0]
    515             vmlal.u16   q14, d26, d6[0]
    516             vmlal.u16   q15, d27, d6[0]
    517     123:    vext.u16    q12, q4, q5, #0
    518             vext.u16    q13, q9, q10, #6
    519             vmlal.u16   q14, d24, d5[3]
    520             vmlal.u16   q15, d25, d5[3]
    521             vmlal.u16   q14, d26, d5[3]
    522             vmlal.u16   q15, d27, d5[3]
    523     122:    vext.u16    q12, q4, q5, #1
    524             vext.u16    q13, q9, q10, #5
    525             vmlal.u16   q14, d24, d5[2]
    526             vmlal.u16   q15, d25, d5[2]
    527             vmlal.u16   q14, d26, d5[2]
    528             vmlal.u16   q15, d27, d5[2]
    529     121:    vext.u16    q12, q4, q5, #2
    530             vext.u16    q13, q9, q10, #4
    531             vmlal.u16   q14, d24, d5[1]
    532             vmlal.u16   q15, d25, d5[1]
    533             vmlal.u16   q14, d26, d5[1]
    534             vmlal.u16   q15, d27, d5[1]
    535     120:    vext.u16    q12, q4, q5, #3
    536             vext.u16    q13, q9, q10, #3
    537             vmlal.u16   q14, d24, d5[0]
    538             vmlal.u16   q15, d25, d5[0]
    539             vmlal.u16   q14, d26, d5[0]
    540             vmlal.u16   q15, d27, d5[0]
    541     119:    vext.u16    q12, q4, q5, #4
    542             vext.u16    q13, q9, q10, #2
    543             vmlal.u16   q14, d24, d4[3]
    544             vmlal.u16   q15, d25, d4[3]
    545             vmlal.u16   q14, d26, d4[3]
    546             vmlal.u16   q15, d27, d4[3]
    547     118:    vext.u16    q12, q4, q5, #5
    548             vext.u16    q13, q9, q10, #1
    549             vmlal.u16   q14, d24, d4[2]
    550             vmlal.u16   q15, d25, d4[2]
    551             vmlal.u16   q14, d26, d4[2]
    552             vmlal.u16   q15, d27, d4[2]
    553     117:    vext.u16    q12, q4, q5, #6
    554             vext.u16    q13, q9, q10, #0
    555             vmlal.u16   q14, d24, d4[1]
    556             vmlal.u16   q15, d25, d4[1]
    557             vmlal.u16   q14, d26, d4[1]
    558             vmlal.u16   q15, d27, d4[1]
    559     116:    vext.u16    q12, q4, q5, #7
    560             vext.u16    q13, q8, q9, #7
    561             vmlal.u16   q14, d24, d4[0]
    562             vmlal.u16   q15, d25, d4[0]
    563             vmlal.u16   q14, d26, d4[0]
    564             vmlal.u16   q15, d27, d4[0]
    565     115:    vext.u16    q12, q5, q6, #0
    566             vext.u16    q13, q8, q9, #6
    567             vmlal.u16   q14, d24, d3[3]
    568             vmlal.u16   q15, d25, d3[3]
    569             vmlal.u16   q14, d26, d3[3]
    570             vmlal.u16   q15, d27, d3[3]
    571     114:    vext.u16    q12, q5, q6, #1
    572             vext.u16    q13, q8, q9, #5
    573             vmlal.u16   q14, d24, d3[2]
    574             vmlal.u16   q15, d25, d3[2]
    575             vmlal.u16   q14, d26, d3[2]
    576             vmlal.u16   q15, d27, d3[2]
    577     113:    vext.u16    q12, q5, q6, #2
    578             vext.u16    q13, q8, q9, #4
    579             vmlal.u16   q14, d24, d3[1]
    580             vmlal.u16   q15, d25, d3[1]
    581             vmlal.u16   q14, d26, d3[1]
    582             vmlal.u16   q15, d27, d3[1]
    583     112:    vext.u16    q12, q5, q6, #3
    584             vext.u16    q13, q8, q9, #3
    585             vmlal.u16   q14, d24, d3[0]
    586             vmlal.u16   q15, d25, d3[0]
    587             vmlal.u16   q14, d26, d3[0]
    588             vmlal.u16   q15, d27, d3[0]
    589     111:    vext.u16    q12, q5, q6, #4
    590             vext.u16    q13, q8, q9, #2
    591             vmlal.u16   q14, d24, d2[3]
    592             vmlal.u16   q15, d25, d2[3]
    593             vmlal.u16   q14, d26, d2[3]
    594             vmlal.u16   q15, d27, d2[3]
    595     110:    vext.u16    q12, q5, q6, #5
    596             vext.u16    q13, q8, q9, #1
    597             vmlal.u16   q14, d24, d2[2]
    598             vmlal.u16   q15, d25, d2[2]
    599             vmlal.u16   q14, d26, d2[2]
    600             vmlal.u16   q15, d27, d2[2]
    601     109:    vext.u16    q12, q5, q6, #6
    602             vext.u16    q13, q8, q9, #0
    603             vmlal.u16   q14, d24, d2[1]
    604             vmlal.u16   q15, d25, d2[1]
    605             vmlal.u16   q14, d26, d2[1]
    606             vmlal.u16   q15, d27, d2[1]
    607     108:    vext.u16    q12, q5, q6, #7
    608             vext.u16    q13, q7, q8, #7
    609             vmlal.u16   q14, d24, d2[0]
    610             vmlal.u16   q15, d25, d2[0]
    611             vmlal.u16   q14, d26, d2[0]
    612             vmlal.u16   q15, d27, d2[0]
    613     107:    vext.u16    q12, q6, q7, #0
    614             vext.u16    q13, q7, q8, #6
    615             vmlal.u16   q14, d24, d1[3]
    616             vmlal.u16   q15, d25, d1[3]
    617             vmlal.u16   q14, d26, d1[3]
    618             vmlal.u16   q15, d27, d1[3]
    619     106:    vext.u16    q12, q6, q7, #1
    620             vext.u16    q13, q7, q8, #5
    621             vmlal.u16   q14, d24, d1[2]
    622             vmlal.u16   q15, d25, d1[2]
    623             vmlal.u16   q14, d26, d1[2]
    624             vmlal.u16   q15, d27, d1[2]
    625     105:    vext.u16    q12, q6, q7, #2
    626             vext.u16    q13, q7, q8, #4
    627             vmlal.u16   q14, d24, d1[1]
    628             vmlal.u16   q15, d25, d1[1]
    629             vmlal.u16   q14, d26, d1[1]
    630             vmlal.u16   q15, d27, d1[1]
    631     104:    vext.u16    q12, q6, q7, #3
    632             vext.u16    q13, q7, q8, #3
    633             vmlal.u16   q14, d24, d1[0]
    634             vmlal.u16   q15, d25, d1[0]
    635             vmlal.u16   q14, d26, d1[0]
    636             vmlal.u16   q15, d27, d1[0]
    637     103:    vext.u16    q12, q6, q7, #4
    638             vext.u16    q13, q7, q8, #2
    639             vmlal.u16   q14, d24, d0[3]
    640             vmlal.u16   q15, d25, d0[3]
    641             vmlal.u16   q14, d26, d0[3]
    642             vmlal.u16   q15, d27, d0[3]
    643     102:    vext.u16    q12, q6, q7, #5
    644             vext.u16    q13, q7, q8, #1
    645             vmlal.u16   q14, d24, d0[2]
    646             vmlal.u16   q15, d25, d0[2]
    647             vmlal.u16   q14, d26, d0[2]
    648             vmlal.u16   q15, d27, d0[2]
    649     101:    vext.u16    q12, q6, q7, #6
    650             vext.u16    q13, q7, q8, #0
    651             vmlal.u16   q14, d24, d0[1]
    652             vmlal.u16   q15, d25, d0[1]
    653             vmlal.u16   q14, d26, d0[1]
    654             vmlal.u16   q15, d27, d0[1]
    655 
    656             vqrshrn.u32 d28, q14, #16
    657             vqrshrn.u32 d29, q15, #16
    658             vqrshrn.u16 d31, q14, #FRACTION_BITS
    659 
    660             vmov        d7, d9
    661             vmov        q4, q5
    662             vmov        q5, q6
    663             vmov        q6, q7
    664             vmov        q7, q8
    665             vmov        q8, q9
    666             vmov        q9, q10
    667             vmov        q10, q11
    668 .endm/*}}}*/
    669 
    670 #define TUNED_LIST4 6, 12
    671 .macro hconv4_6/*{{{*/
    672             vmull.u16   q14, d14, d0[0]
    673             vmull.u16   q15, d15, d0[0]
    674 
    675             ldr         r12, [pc, r5, LSL #2]
    676             add         pc, pc, r12
    677             bkpt
    678     100:    .word 101f-100b
    679             .word 102f-100b
    680             .word 103f-100b
    681             .word 104f-100b
    682             .word 105f-100b
    683             .word 106f-100b
    684     106:    vmlal.u16   q14, d8,  d1[2]
    685             vmlal.u16   q15, d9,  d1[2]
    686             vmlal.u16   q14, d20, d1[2]
    687             vmlal.u16   q15, d21, d1[2]
    688     105:    vmlal.u16   q14, d9,  d1[1]
    689             vmlal.u16   q15, d10, d1[1]
    690             vmlal.u16   q14, d19, d1[1]
    691             vmlal.u16   q15, d20, d1[1]
    692     104:    vmlal.u16   q14, d10, d1[0]
    693             vmlal.u16   q15, d11, d1[0]
    694             vmlal.u16   q14, d18, d1[0]
    695             vmlal.u16   q15, d19, d1[0]
    696     103:    vmlal.u16   q14, d11, d0[3]
    697             vmlal.u16   q15, d12, d0[3]
    698             vmlal.u16   q14, d17, d0[3]
    699             vmlal.u16   q15, d18, d0[3]
    700     102:    vmlal.u16   q14, d12, d0[2]
    701             vmlal.u16   q15, d13, d0[2]
    702             vmlal.u16   q14, d16, d0[2]
    703             vmlal.u16   q15, d17, d0[2]
    704     101:    vmlal.u16   q14, d13, d0[1]
    705             vmlal.u16   q15, d14, d0[1]
    706             vmlal.u16   q14, d15, d0[1]
    707             vmlal.u16   q15, d16, d0[1]
    708 
    709             vqrshrn.u32 d28, q14, #16
    710             vqrshrn.u32 d29, q15, #16
    711             vqrshrn.u16 d31, q14, #FRACTION_BITS
    712 
    713             vmov        q4, q5
    714             vmov        q5, q6
    715             vmov        q6, q7
    716             vmov        q7, q8
    717             vmov        q8, q9
    718             vmov        q9, q10
    719             vmov        q10, q11
    720 .endm/*}}}*/
    721 
    722 .macro hconv4_12/*{{{*/
    723             vmull.u16   q14, d8, d0[0]
    724             vmull.u16   q15, d9, d0[0]
    725 
    726             ldr         r12, [pc, r5, LSL #2]
    727             add         pc, pc, r12
    728             bkpt
    729     100:    .word 101f-100b
    730             .word 102f-100b
    731             .word 103f-100b
    732             .word 104f-100b
    733             .word 105f-100b
    734             .word 106f-100b
    735             .word 107f-100b
    736             .word 108f-100b
    737             .word 109f-100b
    738             .word 110f-100b
    739             .word 111f-100b
    740             .word 112f-100b
    741     112:    add         r12, r9, #0x1a0
    742             bic         r12, r12, #0x200
    743             vld1.u16    {d24,d25}, [r12:128]
    744             vmlal.u16   q14, d24, d3[0]
    745             vmlal.u16   q15, d25, d3[0]
    746             vmlal.u16   q14, d20, d3[0]
    747             vmlal.u16   q15, d21, d3[0]
    748     111:    add         r12, r9, #0x1a8
    749             bic         r12, r12, #0x200
    750             vld1.u16    {d24}, [r12:64]!
    751             bic         r12, r12, #0x200
    752             vld1.u16    {d25}, [r12:64]
    753             vmlal.u16   q14, d24, d2[3]
    754             vmlal.u16   q15, d25, d2[3]
    755             vmlal.u16   q14, d19, d2[3]
    756             vmlal.u16   q15, d20, d2[3]
    757     110:    add         r12, r9, #0x1b0
    758             bic         r12, r12, #0x200
    759             vld1.u16    {d24,d25}, [r12:128]
    760             vmlal.u16   q14, d24, d2[2]
    761             vmlal.u16   q15, d25, d2[2]
    762             vmlal.u16   q14, d18, d2[2]
    763             vmlal.u16   q15, d19, d2[2]
    764     109:    add         r12, r9, #0x1b8
    765             bic         r12, r12, #0x200
    766             vld1.u16    {d24}, [r12:64]!
    767             bic         r12, r12, #0x200
    768             vld1.u16    {d25}, [r12:64]
    769             vmlal.u16   q14, d24, d2[1]
    770             vmlal.u16   q15, d25, d2[1]
    771             vmlal.u16   q14, d17, d2[1]
    772             vmlal.u16   q15, d18, d2[1]
    773     108:    add         r12, r9, #0x1c0
    774             bic         r12, r12, #0x200
    775             vld1.u16    {d24,d25}, [r12:128]
    776             vmlal.u16   q14, d24, d2[0]
    777             vmlal.u16   q15, d25, d2[0]
    778             vmlal.u16   q14, d16, d2[0]
    779             vmlal.u16   q15, d17, d2[0]
    780     107:    add         r12, r9, #0x1c8
    781             bic         r12, r12, #0x200
    782             vld1.u16    {d24}, [r12:64]!
    783             bic         r12, r12, #0x200
    784             vld1.u16    {d25}, [r12:64]
    785             vmlal.u16   q14, d24, d1[3]
    786             vmlal.u16   q15, d25, d1[3]
    787             vmlal.u16   q14, d15, d1[3]
    788             vmlal.u16   q15, d16, d1[3]
    789     106:    add         r12, r9, #0x1d0
    790             bic         r12, r12, #0x200
    791             vld1.u16    {d24,d25}, [r12:128]
    792             vmlal.u16   q14, d24, d1[2]
    793             vmlal.u16   q15, d25, d1[2]
    794             vmlal.u16   q14, d14, d1[2]
    795             vmlal.u16   q15, d15, d1[2]
    796     105:    add         r12, r9, #0x1d8
    797             bic         r12, r12, #0x200
    798             vld1.u16    {d24}, [r12:64]!
    799             bic         r12, r12, #0x200
    800             vld1.u16    {d25}, [r12:64]
    801             vmlal.u16   q14, d24, d1[1]
    802             vmlal.u16   q15, d25, d1[1]
    803             vmlal.u16   q14, d13, d1[1]
    804             vmlal.u16   q15, d14, d1[1]
    805     104:    add         r12, r9, #0x1e0
    806             bic         r12, r12, #0x200
    807             vld1.u16    {d24,d25}, [r12:128]
    808             vmlal.u16   q14, d24, d1[0]
    809             vmlal.u16   q15, d25, d1[0]
    810             vmlal.u16   q14, d12, d1[0]
    811             vmlal.u16   q15, d13, d1[0]
    812     103:    add         r12, r9, #0x1e8
    813             bic         r12, r12, #0x200
    814             vld1.u16    {d24}, [r12:64]!
    815             bic         r12, r12, #0x200
    816             vld1.u16    {d25}, [r12:64]
    817             vmlal.u16   q14, d24, d0[3]
    818             vmlal.u16   q15, d25, d0[3]
    819             vmlal.u16   q14, d11, d0[3]
    820             vmlal.u16   q15, d12, d0[3]
    821     102:    add         r12, r9, #0x1f0
    822             bic         r12, r12, #0x200
    823             vld1.u16    {d24,d25}, [r12:128]
    824             vmlal.u16   q14, d24, d0[2]
    825             vmlal.u16   q15, d25, d0[2]
    826             vmlal.u16   q14, d10, d0[2]
    827             vmlal.u16   q15, d11, d0[2]
    828     101:    add         r12, r9, #0x1f8
    829             bic         r12, r12, #0x200
    830             vld1.u16    {d24}, [r12:64]
    831             vmlal.u16   q14, d24, d0[1]
    832             vmlal.u16   q15, d8,  d0[1]
    833             vmlal.u16   q14, d9,  d0[1]
    834             vmlal.u16   q15, d10, d0[1]
    835 
    836             vqrshrn.u32 d28, q14, #16
    837             vqrshrn.u32 d29, q15, #16
    838             vqrshrn.u16 d31, q14, #FRACTION_BITS
    839 
    840             vst1.u8     {q4}, [r9:128]!
    841             bic         r9, r9, #0x200
    842             vmov        q4, q5
    843             vmov        q5, q6
    844             vmov        q6, q7
    845             vmov        q7, q8
    846             vmov        q8, q9
    847             vmov        q9, q10
    848             vmov        q10, q11
    849 .endm/*}}}*/
    850 
    851 .macro hconv4_25/*{{{*/
    852             add         r12, r9, #0x198
    853             bic         r12, r12, #0x200
    854             vld1.u16    {d24}, [r12:64]!
    855             bic         r12, r12, #0x200
    856             vld1.u16    {d25}, [r12:64]
    857             vmull.u16   q14, d24, d0[0]
    858             vmull.u16   q15, d25, d0[0]
    859 
    860             ldr         r12, [pc, r5, LSL #2]
    861             add         pc, pc, r12
    862             bkpt
    863     100:    .word 101f-100b
    864             .word 102f-100b
    865             .word 103f-100b
    866             .word 104f-100b
    867             .word 105f-100b
    868             .word 106f-100b
    869             .word 107f-100b
    870             .word 108f-100b
    871             .word 109f-100b
    872             .word 110f-100b
    873             .word 111f-100b
    874             .word 112f-100b
    875             .word 113f-100b
    876             .word 114f-100b
    877             .word 115f-100b
    878             .word 116f-100b
    879             .word 117f-100b
    880             .word 118f-100b
    881             .word 119f-100b
    882             .word 120f-100b
    883             .word 121f-100b
    884             .word 122f-100b
    885             .word 123f-100b
    886             .word 124f-100b
    887             .word 125f-100b
    888     125:    add         r12, r9, #0x0d0
    889             bic         r12, r12, #0x200
    890             vld1.u16    {d24,d25}, [r12:128]
    891             vmlal.u16   q14, d24, d6[1]
    892             vmlal.u16   q15, d25, d6[1]
    893             vmlal.u16   q14, d20, d6[1]
    894             vmlal.u16   q15, d21, d6[1]
    895     124:    add         r12, r9, #0x0d8
    896             bic         r12, r12, #0x200
    897             vld1.u16    {d24}, [r12:64]!
    898             bic         r12, r12, #0x200
    899             vld1.u16    {d25}, [r12]
    900             vmlal.u16   q14, d24, d6[0]
    901             vmlal.u16   q15, d25, d6[0]
    902             vmlal.u16   q14, d19, d6[0]
    903             vmlal.u16   q15, d20, d6[0]
    904     123:    add         r12, r9, #0x0e0
    905             bic         r12, r12, #0x200
    906             vld1.u16    {d24,d25}, [r12:128]
    907             vmlal.u16   q14, d24, d5[3]
    908             vmlal.u16   q15, d25, d5[3]
    909             vmlal.u16   q14, d18, d5[3]
    910             vmlal.u16   q15, d19, d5[3]
    911     122:    add         r12, r9, #0x0e8
    912             bic         r12, r12, #0x200
    913             vld1.u16    {d24}, [r12:64]!
    914             bic         r12, r12, #0x200
    915             vld1.u16    {d25}, [r12]
    916             vmlal.u16   q14, d24, d5[2]
    917             vmlal.u16   q15, d25, d5[2]
    918             vmlal.u16   q14, d17, d5[2]
    919             vmlal.u16   q15, d18, d5[2]
    920     121:    add         r12, r9, #0x0f0
    921             bic         r12, r12, #0x200
    922             vld1.u16    {d24,d25}, [r12:128]
    923             vmlal.u16   q14, d24, d5[1]
    924             vmlal.u16   q15, d25, d5[1]
    925             vmlal.u16   q14, d16, d5[1]
    926             vmlal.u16   q15, d17, d5[1]
    927     120:    add         r12, r9, #0x0f8
    928             bic         r12, r12, #0x200
    929             vld1.u16    {d24}, [r12:64]!
    930             bic         r12, r12, #0x200
    931             vld1.u16    {d25}, [r12]
    932             vmlal.u16   q14, d24, d5[0]
    933             vmlal.u16   q15, d25, d5[0]
    934             vmlal.u16   q14, d15, d5[0]
    935             vmlal.u16   q15, d16, d5[0]
    936     119:    add         r12, r9, #0x100
    937             bic         r12, r12, #0x200
    938             vld1.u16    {d24,d25}, [r12:128]
    939             vmlal.u16   q14, d24, d4[3]
    940             vmlal.u16   q15, d25, d4[3]
    941             vmlal.u16   q14, d14, d4[3]
    942             vmlal.u16   q15, d15, d4[3]
    943     118:    add         r12, r9, #0x108
    944             bic         r12, r12, #0x200
    945             vld1.u16    {d24}, [r12:64]!
    946             bic         r12, r12, #0x200
    947             vld1.u16    {d25}, [r12]
    948             vmlal.u16   q14, d24, d4[2]
    949             vmlal.u16   q15, d25, d4[2]
    950             vmlal.u16   q14, d13, d4[2]
    951             vmlal.u16   q15, d14, d4[2]
    952     117:    add         r12, r9, #0x110
    953             bic         r12, r12, #0x200
    954             vld1.u16    {d24,d25}, [r12:128]
    955             vmlal.u16   q14, d24, d4[1]
    956             vmlal.u16   q15, d25, d4[1]
    957             vmlal.u16   q14, d12, d4[1]
    958             vmlal.u16   q15, d13, d4[1]
    959     116:    add         r12, r9, #0x118
    960             bic         r12, r12, #0x200
    961             vld1.u16    {d24}, [r12:64]!
    962             bic         r12, r12, #0x200
    963             vld1.u16    {d25}, [r12]
    964             vmlal.u16   q14, d24, d4[0]
    965             vmlal.u16   q15, d25, d4[0]
    966             vmlal.u16   q14, d11, d4[0]
    967             vmlal.u16   q15, d12, d4[0]
    968     115:    add         r12, r9, #0x120
    969             bic         r12, r12, #0x200
    970             vld1.u16    {d24,d25}, [r12:128]
    971             vmlal.u16   q14, d24, d3[3]
    972             vmlal.u16   q15, d25, d3[3]
    973             vmlal.u16   q14, d10, d3[3]
    974             vmlal.u16   q15, d11, d3[3]
    975     114:    add         r12, r9, #0x128
    976             bic         r12, r12, #0x200
    977             vld1.u16    {d24}, [r12:64]!
    978             bic         r12, r12, #0x200
    979             vld1.u16    {d25}, [r12]
    980             vmlal.u16   q14, d24, d3[2]
    981             vmlal.u16   q15, d25, d3[2]
    982             vmlal.u16   q14, d9,  d3[2]
    983             vmlal.u16   q15, d10, d3[2]
    984     113:    add         r12, r9, #0x130
    985             bic         r12, r12, #0x200
    986             vld1.u16    {d24,d25}, [r12:128]
    987             vmlal.u16   q14, d24, d3[1]
    988             vmlal.u16   q15, d25, d3[1]
    989             vmlal.u16   q14, d8,  d3[1]
    990             vmlal.u16   q15, d9,  d3[1]
    991     112:    add         r12, r9, #0x138
    992             bic         r12, r12, #0x200
    993             vld1.u16    {d24}, [r12:64]!
    994             bic         r12, r12, #0x200
    995             vld1.u16    {d25}, [r12]
    996                                             add         r12, r9, #0x1f8
    997                                             bic         r12, r12, #0x200
    998                                             vld1.u16    {d26}, [r12:64]
    999             vmlal.u16   q14, d24, d3[0]
   1000             vmlal.u16   q15, d25, d3[0]
   1001             vmlal.u16   q14, d26, d3[0]   @ Could be d7, without the load, right?
   1002             vmlal.u16   q15, d8,  d3[0]
   1003     111:    add         r12, r9, #0x140
   1004             bic         r12, r12, #0x200
   1005             vld1.u16    {d24,d25}, [r12:128]
   1006                                             add         r12, r9, #0x1f0
   1007                                             bic         r12, r12, #0x200
   1008                                             vld1.u16    {d26,d27}, [r12:128]
   1009             vmlal.u16   q14, d24, d2[3]
   1010             vmlal.u16   q15, d25, d2[3]
   1011             vmlal.u16   q14, d26, d2[3]
   1012             vmlal.u16   q15, d27, d2[3]
   1013     110:    add         r12, r9, #0x148
   1014             bic         r12, r12, #0x200
   1015             vld1.u16    {d24}, [r12:64]!
   1016             bic         r12, r12, #0x200
   1017             vld1.u16    {d25}, [r12]
   1018                                             add         r12, r9, #0x1e8
   1019                                             bic         r12, r12, #0x200
   1020                                             vld1.u16    {d26}, [r12:64]!
   1021                                             bic         r12, r12, #0x200
   1022                                             vld1.u16    {d27}, [r12:64]
   1023             vmlal.u16   q14, d24, d2[2]
   1024             vmlal.u16   q15, d25, d2[2]
   1025             vmlal.u16   q14, d26, d2[2]
   1026             vmlal.u16   q15, d27, d2[2]
   1027     109:    add         r12, r9, #0x150
   1028             bic         r12, r12, #0x200
   1029             vld1.u16    {d24,d25}, [r12:128]
   1030                                             add         r12, r9, #0x1e0
   1031                                             bic         r12, r12, #0x200
   1032                                             vld1.u16    {d26,d27}, [r12:128]
   1033             vmlal.u16   q14, d24, d2[1]
   1034             vmlal.u16   q15, d25, d2[1]
   1035             vmlal.u16   q14, d26, d2[1]
   1036             vmlal.u16   q15, d27, d2[1]
   1037     108:    add         r12, r9, #0x158
   1038             bic         r12, r12, #0x200
   1039             vld1.u16    {d24}, [r12:64]!
   1040             bic         r12, r12, #0x200
   1041             vld1.u16    {d25}, [r12]
   1042                                             add         r12, r9, #0x1d8
   1043                                             bic         r12, r12, #0x200
   1044                                             vld1.u16    {d26}, [r12:64]!
   1045                                             bic         r12, r12, #0x200
   1046                                             vld1.u16    {d27}, [r12:64]
   1047             vmlal.u16   q14, d24, d2[0]
   1048             vmlal.u16   q15, d25, d2[0]
   1049             vmlal.u16   q14, d26, d2[0]
   1050             vmlal.u16   q15, d27, d2[0]
   1051     107:    add         r12, r9, #0x160
   1052             bic         r12, r12, #0x200
   1053             vld1.u16    {d24,d25}, [r12:128]
   1054                                             add         r12, r9, #0x1d0
   1055                                             bic         r12, r12, #0x200
   1056                                             vld1.u16    {d26,d27}, [r12:128]
   1057             vmlal.u16   q14, d24, d1[3]
   1058             vmlal.u16   q15, d25, d1[3]
   1059             vmlal.u16   q14, d26, d1[3]
   1060             vmlal.u16   q15, d27, d1[3]
   1061     106:    add         r12, r9, #0x168
   1062             bic         r12, r12, #0x200
   1063             vld1.u16    {d24}, [r12:64]!
   1064             bic         r12, r12, #0x200
   1065             vld1.u16    {d25}, [r12]
   1066                                             add         r12, r9, #0x1c8
   1067                                             bic         r12, r12, #0x200
   1068                                             vld1.u16    {d26}, [r12:64]!
   1069                                             bic         r12, r12, #0x200
   1070                                             vld1.u16    {d27}, [r12:64]
   1071             vmlal.u16   q14, d24, d1[2]
   1072             vmlal.u16   q15, d25, d1[2]
   1073             vmlal.u16   q14, d26, d1[2]
   1074             vmlal.u16   q15, d27, d1[2]
   1075     105:    add         r12, r9, #0x170
   1076             bic         r12, r12, #0x200
   1077             vld1.u16    {d24,d25}, [r12:128]
   1078                                             add         r12, r9, #0x1c0
   1079                                             bic         r12, r12, #0x200
   1080                                             vld1.u16    {d26,d27}, [r12:128]
   1081             vmlal.u16   q14, d24, d1[1]
   1082             vmlal.u16   q15, d25, d1[1]
   1083             vmlal.u16   q14, d26, d1[1]
   1084             vmlal.u16   q15, d27, d1[1]
   1085     104:    add         r12, r9, #0x178
   1086             bic         r12, r12, #0x200
   1087             vld1.u16    {d24}, [r12:64]!
   1088             bic         r12, r12, #0x200
   1089             vld1.u16    {d25}, [r12]
   1090                                             add         r12, r9, #0x1b8
   1091                                             bic         r12, r12, #0x200
   1092                                             vld1.u16    {d26}, [r12:64]!
   1093                                             bic         r12, r12, #0x200
   1094                                             vld1.u16    {d27}, [r12:64]
   1095             vmlal.u16   q14, d24, d1[0]
   1096             vmlal.u16   q15, d25, d1[0]
   1097             vmlal.u16   q14, d26, d1[0]
   1098             vmlal.u16   q15, d27, d1[0]
   1099     103:    add         r12, r9, #0x180
   1100             bic         r12, r12, #0x200
   1101             vld1.u16    {d24,d25}, [r12:128]
   1102                                             add         r12, r9, #0x1b0
   1103                                             bic         r12, r12, #0x200
   1104                                             vld1.u16    {d26,d27}, [r12:128]
   1105             vmlal.u16   q14, d24, d0[3]
   1106             vmlal.u16   q15, d25, d0[3]
   1107             vmlal.u16   q14, d26, d0[3]
   1108             vmlal.u16   q15, d27, d0[3]
   1109     102:    add         r12, r9, #0x188
   1110             bic         r12, r12, #0x200
   1111             vld1.u16    {d24}, [r12:64]!
   1112             bic         r12, r12, #0x200
   1113             vld1.u16    {d25}, [r12]
   1114                                             add         r12, r9, #0x1a8
   1115                                             bic         r12, r12, #0x200
   1116                                             vld1.u16    {d26}, [r12:64]!
   1117                                             bic         r12, r12, #0x200
   1118                                             vld1.u16    {d27}, [r12:64]
   1119             vmlal.u16   q14, d24, d0[2]
   1120             vmlal.u16   q15, d25, d0[2]
   1121             vmlal.u16   q14, d26, d0[2]
   1122             vmlal.u16   q15, d27, d0[2]
   1123     101:    add         r12, r9, #0x190
   1124             bic         r12, r12, #0x200
   1125             vld1.u16    {d24,d25}, [r12:128]!
   1126             bic         r12, r12, #0x200
   1127             vld1.u16    {d26,d27}, [r12:128]
   1128             vmlal.u16   q14, d24, d0[1]
   1129             vmlal.u16   q15, d25, d0[1]
   1130             vmlal.u16   q14, d26, d0[1]
   1131             vmlal.u16   q15, d27, d0[1]
   1132 
   1133             vqrshrn.u32 d28, q14, #16
   1134             vqrshrn.u32 d29, q15, #16
   1135             vqrshrn.u16 d31, q14, #FRACTION_BITS
   1136 
   1137             vst1.u8     {q4}, [r9:128]!
   1138             bic         r9, r9, #0x200
   1139             vmov        q4, q5
   1140             vmov        q5, q6
   1141             vmov        q6, q7
   1142             vmov        q7, q8
   1143             vmov        q8, q9
   1144             vmov        q9, q10
   1145             vmov        q10, q11
   1146 .endm/*}}}*/
   1147 
   1148 /* Dedicated function wrapper for the fetch macro, for the cases where
   1149  * performance isn't that important, to keep code size down.
   1150  */
   1151 PRIVATE(fetch_generic_asm)
   1152             push        {r10,r11}
   1153             fetch
   1154             pop         {r10,r11}
   1155             bx          lr
   1156 END(fetch_generic_asm)
   1157 
   1158 
   1159 /* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory
   1160  * beyond that limit, and filling the rest of the vector with the last legal
   1161  * pixel.
   1162  * Result is in q10 and q11.  q8 and q9 are filled with the first legal pixel.
   1163  * Note: This function can read beyond the right edge of input if the image is
   1164  * narrower than 16 bytes.
   1165  */
   1166 PRIVATE(fetch_clampleft1)
   1167             push        {r12,lr}
   1168             bl          fetch_generic_asm
   1169             vdup.u16    q8, d20[0]
   1170             vdup.u16    q9, d20[0]
   1171             ands        r12, r10, #15
   1172             beq         1f
   1173             sub         r1, r1, r12
   1174             sub         r10, r10, r12
   1175             sub         sp, sp, #32
   1176             vst1.u16    {q10,q11}, [sp]
   1177             sub         r12, sp, r12, LSL #1
   1178             sub         sp, sp, #32
   1179             vst1.u16    {q8,q9}, [sp]
   1180             vld1.u16    {q10,q11}, [r12]
   1181             add         sp, sp, #64
   1182 1:          pop         {r12,pc}
   1183 END(fetch_clampleft1)
   1184 
   1185 PRIVATE(fetch_clampleft4)
   1186             push        {r12,lr}
   1187             bl          fetch_generic_asm
   1188             vmov.u16    d16, d20
   1189             vmov.u16    d17, d20
   1190             vmov.u16    d18, d20
   1191             vmov.u16    d19, d20
   1192             ands        r12, r10, #15
   1193             beq         1f
   1194             sub         r1, r1, r12
   1195             sub         r10, r10, r12
   1196             sub         sp, sp, #32
   1197             vst1.u16    {q10-q11}, [sp]
   1198             sub         r12, sp, r12, LSL #1
   1199             sub         sp, sp, #32
   1200             vst1.u16    {q8,q9}, [sp]
   1201             vld1.u16    {q10,q11}, [r12]
   1202             add         sp, sp, #64
   1203 1:          pop         {r12,pc}
   1204 END(fetch_clampleft4)
   1205 
   1206 /* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding
   1207  * reading memory beyond that limit, and filling the rest of the vector with
   1208  * the last legal pixel.
   1209  * Result is in q10 and q11.  q12 and q13 are filled with the last legal pixel.
   1210  * Note: This function can read beyond the left edge of input if the image is
   1211  * narrower than 16 bytes.
   1212  */
   1213 PRIVATE(fetch_clampright1)
   1214             push        {r12, lr}
   1215             rsb         r12, r11, #0
   1216             ands        r12, r12, #15
   1217             beq         1f
   1218             sub         r1, r1, r12
   1219             bl          fetch_generic_asm
   1220             vdup.u16    q12, d23[3]
   1221             vdup.u16    q13, d23[3]
   1222             rsb         r12, r11, #0
   1223             and         r12, r12, #15
   1224             sub         sp, sp, #32
   1225             vst1.u16    {q12,q13}, [sp]
   1226             sub         sp, sp, #32
   1227             add         r12, sp, r12, LSL #1
   1228             vst1.u16    {q10,q11}, [sp]
   1229             vld1.u16    {q10,q11}, [r12]
   1230             add         sp, sp, #64
   1231             pop         {r12,pc}
   1232 1:          bl          fetch_generic_asm
   1233             vdup.u16    q12, d23[3]
   1234             vdup.u16    q13, d23[3]
   1235             pop         {r12,pc}
   1236 END(fetch_clampright1)
   1237 
   1238 PRIVATE(fetch_clampright4)
   1239             push        {r12, lr}
   1240             rsb         r12, r11, #0
   1241             ands        r12, r12, #15
   1242             beq         1f
   1243             sub         r1, r1, r12
   1244             bl          fetch_generic_asm
   1245             vmov.u16    d24, d23
   1246             vmov.u16    d25, d23
   1247             vmov.u16    d26, d23
   1248             vmov.u16    d27, d23
   1249             rsb         r12, r11, #0
   1250             and         r12, r12, #15
   1251             sub         sp, sp, #32
   1252             vst1.u16    {q12-q13}, [sp]
   1253             sub         sp, sp, #32
   1254             add         r12, sp, r12, LSL #1
   1255             vst1.u16    {q10,q11}, [sp]
   1256             vld1.u16    {q10,q11}, [r12]
   1257             add         sp, sp, #64
   1258             pop         {r12,pc}
   1259 1:          bl          fetch_generic_asm
   1260             vmov.u16    d24, d23
   1261             vmov.u16    d25, d23
   1262             vmov.u16    d26, d23
   1263             vmov.u16    d27, d23
   1264             pop         {r12,pc}
   1265 END(fetch_clampright4)
   1266 
   1267 /* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th
   1268  * value across to fill the rest of the register pair.  Used for filling the
   1269  * right hand edge of the window when reading too close to the right hand edge
   1270  * of the image.
   1271  * Also returns a dup-ed copy of the last element in q12 for the tail-fill
   1272  * case (this happens incidentally in common path, but must be done
   1273  * deliberately in the fast-out path).
   1274  */
   1275 PRIVATE(prefill_sweepright1)
   1276             ands        r12, r11, #15
   1277             beq         1f
   1278             sub         r12, r12, #1
   1279             sub         sp, sp, #64
   1280             vst1.u16    {q10,q11}, [sp]
   1281             add         r12, sp, r12, LSL #1
   1282             vld1.u16    {d24[],d25[]}, [r12]
   1283             vld1.u16    {d26[],d27[]}, [r12]
   1284             vst1.u16    {q12,q13}, [r12]
   1285             vld1.u16    {q10,q11}, [sp]
   1286             add         sp, sp, #64
   1287             bx          lr
   1288 1:          vdup.u16    q12, d23[3]
   1289             vdup.u16    q13, d23[3]
   1290             bx          lr
   1291 END(prefill_sweepright1)
   1292 
   1293 PRIVATE(prefill_sweepright4)
   1294             ands        r12, r11, #15
   1295             beq         1f
   1296             sub         r12, r12, #4
   1297             sub         sp, sp, #64
   1298             vst1.u16    {q10,q11}, [sp]
   1299             add         r12, sp, r12, LSL #1
   1300             vld1.u64    {d24}, [r12]
   1301             vld1.u64    {d25}, [r12]
   1302             vld1.u64    {d26}, [r12]
   1303             vld1.u64    {d27}, [r12]
   1304             vst1.u16    {q12,q13}, [r12]
   1305             vld1.u16    {q10,q11}, [sp]
   1306             add         sp, sp, #64
   1307             bx          lr
   1308 1:          vmov.u16    d24, d23
   1309             vmov.u16    d25, d23
   1310             vmov.u16    d26, d23
   1311             vmov.u16    d27, d23
   1312             bx          lr
   1313 END(prefill_sweepright4)
   1314 
   1315 /* The main loop keeps a sliding window of data that has already been convolved
   1316  * in the vertical axis for the current line.  This usually stays in the
   1317  * register file, but spills to memory for large windows.  The first thing that
   1318  * needs to be done at start-up is to fill this window with image data, taking
   1319  * into account the padding needed if the left or right edges of the image fall
   1320  * within this window.
   1321  */
   1322 
   1323 /* Because the window is in the register file writes to it cannot be indexed
   1324  * by another register.  Consequently the fill loops are unrolled to address
   1325  * the registers directly.  This macro distinguishes between writes to the
   1326  * register file and writes to the spill buffer (indicated by a destination
   1327  * register named xx).
   1328  */
   1329 .macro prefill_out ra, rb, sra, srb, srb_hi
   1330   .ifc \ra,xx
   1331     .ifc \rb,xx
   1332             vst1.u16    {\sra,\srb}, [r9:128]!
   1333     .else
   1334             /* this case is used only for the last tap of uchar1 r=25 */
   1335             /* discard \sra */
   1336             vmov.u16    \rb, \srb_hi
   1337     .endif
   1338   .else
   1339     .ifnc \ra,\sra
   1340             vmov.u16    \ra, \sra
   1341     .endif
   1342     .ifnc \rb,\srb
   1343             vmov.u16    \rb, \srb
   1344     .endif
   1345   .endif
   1346 .endm
   1347 
   1348 /* This macro provides the list of registers representing the window, and the
   1349  * cases where the register file is too small and a spill buffer is used
   1350  * instead.
   1351  * Since several specialisations of each function are generated, this also
   1352  * culls superfluous iterations, and sets the variable `i` for subsequent
   1353  * macros indicating the current index into the window.
   1354  */
   1355 .macro prefill_list, macro, nextmacro, max_r, step, label
   1356   .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
   1357     .if windowsize >= (\line * 16)
   1358       .set i, windowsize - (\line * 16)
   1359 \label\macro\line:
   1360             prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
   1361     .endif
   1362   .endm
   1363   .if \step > 1
   1364             ifneeded \macro \nextmacro, 13, 12, xx, xx,  \step, \label
   1365             ifneeded \macro \nextmacro, 12, 11, xx, xx,  \step, \label
   1366             ifneeded \macro \nextmacro, 11, 10, xx, xx,  \step, \label
   1367             ifneeded \macro \nextmacro, 10,  9, xx, xx,  \step, \label
   1368             ifneeded \macro \nextmacro,  9,  8, xx, xx,  \step, \label
   1369             ifneeded \macro \nextmacro,  8,  7, xx, xx,  \step, \label
   1370             ifneeded \macro \nextmacro,  7,  6, xx, xx,  \step, \label
   1371             ifneeded \macro \nextmacro,  6,  5, xx, xx,  \step, \label
   1372             ifneeded \macro \nextmacro,  5,  4, xx, xx,  \step, \label
   1373             ifneeded \macro \nextmacro,  4,  3, xx, xx,  \step, \label
   1374   .else
   1375             /* q3 normally contains the coefficient table, but it's not fully
   1376              * used.  In the uchar1, r=25 case the other half of q3 is used for
   1377              * the last two window taps to avoid falling out to memory.
   1378              */
   1379             ifneeded \macro \nextmacro,  4,  3, xx, d7,   \step, \label
   1380   .endif
   1381             ifneeded \macro \nextmacro,  3,  2, q4, q5,   \step, \label
   1382             ifneeded \macro \nextmacro,  2,  1, q6, q7,   \step, \label
   1383             ifneeded \macro \nextmacro,  1,  0, q8, q9,   \step, \label
   1384 
   1385 \label\macro\()0:
   1386             b           \label\()_end
   1387   .purgem ifneeded
   1388 .endm
   1389 
   1390 /* These macros represent the possible stages of filling the window.
   1391  * Each macro is unrolled enough times that it can fill the entire window
   1392  * itself, but normally it will have to hand control to subsequent macros
   1393  * part-way through and this is done using labels named \next and \after, where
   1394  * \next is the next macro starting at the same window position and \after is
   1395  * the next macro starting after the current window position.
   1396  */
   1397 
   1398 /* leftfill: v8 and v9 contain the left padding value.  While the window
   1399  * extends outside of the image on the left-hand side, and at least 16 more
   1400  * padding values are needed in the window, store v8 and v9 into the window.
   1401  * Otherwise skip forward to storing image data.
   1402  */
   1403 .macro prefill_leftfill, next, after, ra, rb, step
   1404             cmp         r10, #i+16
   1405             blo         \next
   1406             prefill_out \ra, \rb, q8, q9, d19
   1407 .endm
   1408 
   1409 /* leftedge: The very first non-fill or partial-fill chunk from the image is
   1410  * already loaded (as it was used to calculate the left padding value), so
   1411  * store it here, and then drop into the regular load/store cycle in the next
   1412  * macro.
   1413  */
   1414 .macro prefill_leftedge, next, after, ra, rb, step
   1415 1:          prefill_out \ra, \rb, q10, q11, d23
   1416             b           \after
   1417 .endm
   1418 
   1419 /* dofetch: Copy chunks of the image into the window without any complications
   1420  * from edge conditions.
   1421  */
   1422 .macro prefill_dofetch, next, after, ra, rb, step
   1423             cmp         r11, #i+16
   1424             bls         \next
   1425             bl          fetch_generic_asm
   1426             prefill_out \ra, \rb, q10, q11, d23
   1427 .endm
   1428 
   1429 /* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
   1430  * the right-hand edge of the image.  In that case sweep the last valid pixel
   1431  * across the rest of the chunk, and in either case prepare padding data in v12
   1432  * and v13 for the next macro.  This is done in fetch_clampright.
   1433  * This only happens once before going on to the next macro.
   1434  * Sometimes leftedge also covers the rightedge case, in which case this has
   1435  * to be skipped altogether.
   1436  */
   1437 .macro prefill_rightedge, next, after, ra, rb, step
   1438             cmp         r11, #i
   1439             bls         \next
   1440             bl          fetch_clampright\step
   1441             prefill_out \ra, \rb, q10, q11, d23
   1442             b           \after
   1443 .endm
   1444 
   1445 /* rightfill: The rest of the window is simply filled with right padding from
   1446  * v12 and v13.
   1447  */
   1448 .macro prefill_rightfill, next, after, ra, rb, step
   1449             prefill_out \ra, \rb, q12, q13, d25
   1450 .endm
   1451 
   1452 /* Here all of the macros above are unrolled and laid out in the proper order.
   1453  */
   1454 .macro prefill_body, max_r, step, label
   1455             prefill_list leftfill,  leftedge,   \max_r, \step, \label
   1456             prefill_list leftedge,  dofetch,    \max_r, \step, \label
   1457             prefill_list dofetch,   rightedge,  \max_r, \step, \label
   1458             prefill_list rightedge, rightfill,  \max_r, \step, \label
   1459             prefill_list rightfill, oops,       \max_r, \step, \label
   1460 \label\()_end:
   1461 .endm
   1462 
   1463 /* Fill the convolution window with context data.  The aim here is to load
   1464  * exactly 2*r columns, and in the main loop to read as many columns as will be
   1465  * written.  This is complicated by the window being divided into chunks at
   1466  * register boundaries, and the need to handle cases when the input starts very
   1467  * close to the left or right (or both) edges of the image and the need to fill
   1468  * the spaces that leaves with left and right edge padding values.
   1469  *
   1470  * Input:
   1471  *      r1 -- src
   1472  *      r2 -- pitch
   1473  *      r3 -- count
   1474  *      r4 -- available image data right of src pointer
   1475  *      r5 -- r
   1476  *      r6 -- rup
   1477  *      r7 -- rdn
   1478  *      r8 -- available image data left of src pointer
   1479  *      r9 -- buffer (if needed)
   1480  * Output:
   1481  *      r4 -= min(inlen, count + windowsize - centertap)
   1482  *      r1 += min(inlen, count + windowsize - centertap)
   1483  * Modifies:
   1484  *      r10 -- fill start index in the window
   1485  *      r11 -- fill stop index in the window
   1486  *      r12 -- scratch
   1487  */
   1488 .macro prefill step=1, max_r=25, label=xx
   1489 .set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
   1490 .set centertap, (windowsize - \max_r * \step)
   1491             mov         r10, #centertap
   1492             subs        r10, r10, r8
   1493             movlo       r10, #0
   1494 
   1495             subs        r11, r4, #windowsize - centertap
   1496             movhs       r11, #0
   1497             add         r11, r11, #windowsize
   1498 
   1499             /* r10 indicates where in the window legal image data begins.
   1500              * r11 indicates where in the window legal image date ends.
   1501              * When starting near the centre of a large image these would be
   1502              * zero and windowsize respectively, but when starting near the
   1503              * edges this can change.
   1504              * When starting on the leftmost pixel, r10 will be centertap.
   1505              * When starting on the rightmost pixel, r11 will be centertap+1.
   1506              */
   1507 
   1508             /* r4 indicates how much data there is between the current pointers
   1509              * and the right edge of the image.  The pointers currently point
   1510              * to the data needed at centertap.  The subsequent code will
   1511              * consume (windowsize - r10) data, but only the data from
   1512              * centertap to windowsize comes out of r4's budget.
   1513              */
   1514 1:          subs        r4, r4, #windowsize - centertap
   1515             movlo       r4, #0
   1516 
   1517             /* And the pointers need to rewind to the start of the window.
   1518              */
   1519             sub         r1, r1, #centertap
   1520 
   1521             /* Unless x8 indicated that there wasn't that much data available.
   1522              */
   1523             add         r1, r1, r10
   1524 
   1525 
   1526             /* Get the first chunk, and add padding to align it to the window
   1527              * if necessary.
   1528              */
   1529             bl          fetch_clampleft\step
   1530 
   1531             /* Sometimes the start and the end of the window are in the same
   1532              * chunk.  In that case both ends need filler at the outset.
   1533              */
   1534             sub         r12, r11, #1
   1535             eor         r12,  r10, r12
   1536             cmp         r12, #16
   1537             bllo        prefill_sweepright\step
   1538 
   1539             /* Iterate through all the points in the window and fill them in
   1540              * with padding or image data as needed.
   1541              */
   1542             prefill_body \max_r, \step, \label
   1543 .endm
   1544 
   1545 /* The main body of the convolve functions.  Having already pre-filled the
   1546  * convolution window with 2*r input values, the logic settles into a regular
   1547  * pattern of reading and writing at a 1:1 rate until either input or output
   1548  * expires.  The input leads the output by r values, so when processing all the
   1549  * way to the right-hand edge, or within r pixels of that edge, the input will
   1550  * run out first.  In the case of very narrow images, or sub-windows starting
   1551  * near the right edge, the input may already have run out while the
   1552  * convolution window was being filled and this loop will start with a
   1553  * zero-length input.
   1554  *
   1555  * Once the input runs out, the rest of the output must be processed by padding
   1556  * the remainder of the window with pad value from the last valid pixel from
   1557  * the source.
   1558  *
   1559  * Input:
   1560  *      r0 = dst
   1561  *      r1 = src
   1562  *      r2 = pitch
   1563  *      r3 = count
   1564  *      r4 = inlen
   1565  *      r5 = r
   1566  *      r6 = rup
   1567  *      r7 = rdn
   1568  *      r9 = buffer
   1569  * Modifies
   1570  *      r8 = fetch code pointer
   1571  */
   1572 .macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
   1573 
   1574             /* If x4 >= x3 then there's no need for clipping.  The main loop
   1575              * needs to exit when either x3 or x4 runs out, so clamp x4 to be
   1576              * no greater than x3 and use x4 for the loop.
   1577              * However, if x4 comes out of the loop with less than 16 bytes
   1578              * left, a partial read would be necessary to avoid reading beyond
   1579              * the end of the image.  To avoid this, clamp x4 to the next
   1580              * multiple of 16, which is still sufficient to force it out of the
   1581              * loop but doesn't imply a rewind.
   1582              */
   1583             add         r12, r3, #15
   1584             bic         r12, r12, #15
   1585             cmp         r4, r12
   1586             movhi       r4, r12
   1587 
   1588             /* First calculate the entry-point into the internal fetch logic.
   1589              * This is done so the same function can service several kernel
   1590              * sizes.
   1591              */
   1592             ldr         r8, 3f
   1593 1:          add         r8, r8, pc
   1594             sub         r8, r5, LSL #5
   1595             sub         r8, r5, LSL #4
   1596             cmp         r5, r6
   1597             cmpeq       r5, r7
   1598             beq         5f
   1599 
   1600             /* if (r != rup || r != rdn) then the address-clamping table should
   1601              * be used rather than the short-cut version.
   1602              */
   1603             ldr         r8, 3f+4
   1604 2:          add         r8, r8, pc
   1605             sub         r8, r5, LSL #6
   1606             b           5f
   1607             .align 3
   1608 3:          .word       \labelnc-1b-8
   1609             .word       \labelc-2b-8
   1610 
   1611             /* Main loop: ... */
   1612             .align 4
   1613 3:          /* first perform a vertical convolution from memory to get the next
   1614              * 16 taps of the horizontal window into the register file...
   1615              */
   1616             fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8
   1617 
   1618             /* ...then perform a horizontal convolution on that window to
   1619              * produce eight output bytes, and slide the window along.
   1620              * This has to be done twice to match the 16-way vertical pass.
   1621              * It would be preferable to have twice the work done in \core, but
   1622              * that would demand yet another variant on those macros and would
   1623              * perturb the register allocation severely.
   1624              */
   1625             \core
   1626             vst1.u8     {d31}, [r0]!
   1627             \core
   1628             vst1.u8     {d31}, [r0]!
   1629 
   1630             sub         r3, r3, #16
   1631 5:          subs        r4, r4, #16
   1632             bhi         3b
   1633             /* Here there's 16 or fewer bytes available before the edge of the
   1634              * source image.  x4 holds that count minus 16 (because it was
   1635              * decremented before the first iteration ran).  The last read may
   1636              * not be a whole chunk, and beyond that a fill value must be used.
   1637              *
   1638              * Of course, none of that matters if there's no more output to
   1639              * produce...
   1640              */
   1641             cmp         r3, #0
   1642             beq         5f
   1643 
   1644             /* Oh well. */
   1645             adds        r4, r4, #16
   1646             bne         1f
   1647   .if \step==1
   1648             vdup.u16    q10, d19[3]
   1649             vdup.u16    q11, d19[3]
   1650   .else
   1651             vmov.u64    d20, d19
   1652             vmov.u64    d21, d19
   1653             vmov.u64    d22, d19
   1654             vmov.u64    d23, d19
   1655   .endif
   1656             b           3f
   1657 
   1658             /* To avoid reading past end of input, rewind pointers by (16-r4)
   1659              * to ensure that they're exactly 16 bytes from the edge.
   1660              */
   1661 1:          mov         r11, r4
   1662             bl          fetch_clampright\step
   1663             /* Now to put this padding to use, perform any remaining
   1664              * iterations.  This is done at half the rate of the main loop,
   1665              * because there's no longer pressure from a 16-lane window filler.
   1666              */
   1667 3:          \core
   1668   .if \step==1
   1669             vdup.u16    q11, d23[3]
   1670   .else
   1671             vmov.u64    d22, d23
   1672   .endif
   1673             subs        r3, r3, #8
   1674             blo         4f
   1675             vst1.u8     {d31}, [r0]!
   1676             bne         3b
   1677             b           5f
   1678 
   1679             /* If the final iteration contained 0 < l < 8 values, then perform
   1680              * a piecewise store of the final vector.
   1681              */
   1682 4:          tst         r3, #4
   1683             beq         1f
   1684             vst1.u32    {d31[0]}, [r0]!
   1685             vext.u8     d31, d31, d31, #4
   1686 1:          tst         r3, #2
   1687             beq         1f
   1688             vst1.u16    {d31[0]}, [r0]!
   1689             vext.u8     d31, d31, d31, #2
   1690 1:          tst         r3, #1
   1691             beq         5f
   1692             vst1.u8     {d31[0]}, [r0]!
   1693             vext.u8     d31, d31, d31, #1
   1694 5:          mov         r0, #0
   1695 .endm
   1696 
   1697 .irp r, TUNED_LIST1, 25
   1698 PRIVATE(convolve1_\r)
   1699             push        {r12,lr}
   1700 
   1701             prefill     step=1, max_r=\r, label=.Lcnv1_\r
   1702 
   1703             conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
   1704 
   1705             pop         {r12,pc}
   1706 END(convolve1_\r)
   1707 .endr
   1708 
   1709 .irp r, TUNED_LIST4, 25
   1710 PRIVATE(convolve4_\r)
   1711             push        {r12,lr}
   1712             sub         r9, sp, #0x200
   1713             sub         sp, sp, #0x200 + 0x400
   1714             bic         r9, r9, #0x3fc
   1715 
   1716             /* r9 now points to a 0x200 byte buffer on the stack whose address
   1717              * has the low 10 bits clear.  This allows easy address calculation
   1718              * in the wrap-around cases.
   1719              */
   1720 
   1721             prefill     step=4, max_r=\r, label=.Lcnv4_\r
   1722 
   1723             conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
   1724 
   1725             add         sp, sp, #0x200 + 0x400
   1726             pop         {r12,pc}
   1727 END(convolve4_\r)
   1728 .endr
   1729 
   1730 /* void rsdIntrinsicBlurU1_K(
   1731  *                  void *out,      // r0
   1732  *                  void *in,       // r1
   1733  *                  size_t w,       // r2
   1734  *                  size_t h,       // r3
   1735  *                  size_t p,       // [sp]
   1736  *                  size_t x,       // [sp,#4]
   1737  *                  size_t y,       // [sp,#8]
   1738  *                  size_t count,   // [sp,#12]
   1739  *                  size_t r,       // [sp,#16]
   1740  *                  uint16_t *tab); // [sp,#20]
   1741  */
   1742 ENTRY(rsdIntrinsicBlurU1_K)
   1743             push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
   1744             vpush       {d8-d15}
   1745             ldr         r6, [sp,#112]   // y
   1746             ldr         r8, [sp,#108]   // x
   1747             ldr         r5, [sp,#120]   // r
   1748             sub         r4, r2, r8      // inlen = w - x
   1749             sub         r7, r3, r6      // h - y
   1750             ldr         r2, [sp,#104]   // pitch
   1751             ldr         r3, [sp,#116]   // count
   1752             sub         r7, r7, #1      // h - y - 1
   1753 
   1754             ldr         r12, [sp,#124]
   1755 
   1756             add         r1, r1, r8      // src += x
   1757 
   1758             cmp         r6, r5
   1759             movhi       r6, r5          // rup = min(r, y)
   1760             cmp         r7, r5
   1761             movhi       r7, r5          // rdn = min(r, h - y - 1)
   1762 
   1763             vld1.u16    {d0,d1,d2,d3}, [r12]!
   1764             vld1.u16    {d4,d5,d6}, [r12]!
   1765 
   1766             adr         lr, 1f
   1767   .irp r, TUNED_LIST1
   1768             cmp         r5, #\r
   1769             bls         convolve1_\r
   1770   .endr
   1771             b           convolve1_25
   1772 
   1773 1:          vpop        {d8-d15}
   1774             pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
   1775 END(rsdIntrinsicBlurU1_K)
   1776 
   1777 /* void rsdIntrinsicBlurU4_K(
   1778  *                  void *out,      // r0
   1779  *                  void *in,       // r1
   1780  *                  size_t w,       // r2
   1781  *                  size_t h,       // r3
   1782  *                  size_t p,       // [sp]
   1783  *                  size_t x,       // [sp,#4]
   1784  *                  size_t y,       // [sp,#8]
   1785  *                  size_t count,   // [sp,#12]
   1786  *                  size_t r,       // [sp,#16]
   1787  *                  uint16_t *tab); // [sp,#20]
   1788  */
   1789 ENTRY(rsdIntrinsicBlurU4_K)
   1790             push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
   1791             vpush       {d8-d15}
   1792             ldr         r6, [sp,#112]   // y
   1793             ldr         r8, [sp,#108]   // x
   1794             ldr         r5, [sp,#120]   // r
   1795             lsl         r8, r8, #2
   1796             rsb         r4, r8, r2, LSL #2 // inlen = (w - x)
   1797             sub         r7, r3, r6      // h - y
   1798             ldr         r2, [sp,#104]   // pitch
   1799             ldr         r3, [sp,#116]   // count
   1800             sub         r7, r7, #1      // h - y - 1
   1801             lsl         r3, r3, #2      // count
   1802 
   1803             ldr         r12, [sp,#124]
   1804 
   1805             add         r1, r1, r8      // in += x
   1806 
   1807             cmp         r6, r5
   1808             movhi       r6, r5          // rup = min(r, y)
   1809             cmp         r7, r5
   1810             movhi       r7, r5          // rdn = min(r, h - y - 1)
   1811 
   1812             vld1.u16    {d0,d1,d2,d3}, [r12]!
   1813             vld1.u16    {d4,d5,d6}, [r12]!
   1814 
   1815             adr         lr, 1f
   1816   .irp r, TUNED_LIST4
   1817             cmp         r5, #\r
   1818             bls         convolve4_\r
   1819   .endr
   1820             b           convolve4_25
   1821 
   1822 1:          vpop        {d8-d15}
   1823             pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
   1824 END(rsdIntrinsicBlurU4_K)
   1825