Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2013-2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
      17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
         /* ENTRY(f), defined on the line above, switches to .text, aligns, exports
          * f as a global symbol typed as a function, and emits the label f itself.
          * END(f), defined on the line below, records the size of symbol f as the
          * distance from f to the current location.
          */
      18 #define END(f) .size f, .-f;
      19 
         /* BLEND_LIST(X) is an X-macro list.  Each X(d, n) entry pairs a numeric
          * blend slot d with a name n.  Later in this file the list is expanded
          * with different BLEND_X definitions: to emit one blend_line_<n> function
          * per entry, to compute tablesize, and to build the offset table indexed
          * by d.  The table builder pads the gap before each entry with d-off zero
          * halfwords, so the entries have to stay in ascending order of d.
          */
      20 #define BLEND_LIST(X) \
      21     X(0, CLEAR) \
      22     X(1, SRC) \
      23     X(2, DST) \
      24     X(3, SRC_OVER) \
      25     X(4, DST_OVER) \
      26     X(5, SRC_IN) \
      27     X(6, DST_IN) \
      28     X(7, SRC_OUT) \
      29     X(8, DST_OUT) \
      30     X(9, SRC_ATOP) \
      31     X(10, DST_ATOP) \
      32     X(11, XOR) \
      33     X(14, MULTIPLY) \
      34     X(21, DIFFERENCE) \
      35     X(34, ADD) \
      36     X(35, SUBTRACT)
     37 
     38 /* For every blend operation supported, define a macro with just the arithmetic
     39  * component.  The rest can be handled later on.
     40  *
      41  * At entry v0-v3 contain the RGBA data from the destination buffer, and v8-v11
      42  * contain the data from the source buffer.  Both have already been split out
      43  * into one colour component per register (if necessary).  v3 and v11 contain
      44  * the alpha components.
     45  *
     46  * At the same time as defining the assembly macro, define a corresponding
     47  * preprocessor macro indicating any other requirements.
     48  *    zipped=0 -- The macro does not require the RGBA components to be
     49  *                separated.
     50  *    lddst=0  -- The macro does not require data from the destination buffer.
     51  *    ldsrc=0  -- The macro does not require data from the source buffer.
     52  *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
     53  *                inserted without any surrounding load/store or loop code.
     54  */
     55 
      56 #define params_CLEAR zipped=0, lddst=0, ldsrc=0
         /* CLEAR: write zero to every byte of the output registers v0-v3.
          * Neither buffer needs to be loaded (lddst=0, ldsrc=0) and the component
          * layout does not matter (zipped=0).
          */
      57 .macro blend_kernel_CLEAR
      58         movi    v0.16b, #0
      59         movi    v1.16b, #0
      60         movi    v2.16b, #0
      61         movi    v3.16b, #0
      62 .endm
     63 
      64 #define params_SRC zipped=0, lddst=0
         /* SRC: copy the source registers v8-v11 unchanged into the output
          * registers v0-v3.  The destination is not loaded (lddst=0) and the data
          * does not need to be split into components (zipped=0).
          */
      65 .macro blend_kernel_SRC
      66         mov     v0.16b, v8.16b
      67         mov     v1.16b, v9.16b
      68         mov     v2.16b, v10.16b
      69         mov     v3.16b, v11.16b
      70 .endm
     71 
      72 #define params_DST nowrap=1
         /* DST: the destination is left as it is, so the kernel is empty.
          * nowrap=1 makes wrap_line emit no load, store or loop code around it.
          */
      73 .macro blend_kernel_DST
      74         /* nop */
      75 .endm
     76 
      77 #define params_SRC_OVER zipped=1
         /* SRC_OVER: every destination component (v0-v3, alpha included) is
          * multiplied by the inverted source alpha, scaled back to 8 bits, and the
          * source component (v8-v11) is then added with unsigned saturation.
          *
          * The scaling of each 16-bit product t is
          *     r = (t + 128) >> 8;  result = (t + r + 128) >> 8
          * which is the usual rounded approximation of t / 255.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v7 and v12-v15 as scratch; v8-v11 are only read.
          */
      78 .macro blend_kernel_SRC_OVER
                 /* v7 = bitwise inverse of the source alpha, i.e. 255 - alpha */
      79         mvn         v7.16b, v11.16b
      80 
                 /* 16-bit products: high halves go to v12-v15, low halves
                  * replace v0-v3.
                  */
      81         umull2      v12.8h, v7.16b, v0.16b
      82         umull       v0.8h,  v7.8b,  v0.8b
      83         umull2      v13.8h, v7.16b, v1.16b
      84         umull       v1.8h,  v7.8b,  v1.8b
      85         umull2      v14.8h, v7.16b, v2.16b
      86         umull       v2.8h,  v7.8b,  v2.8b
      87         umull2      v15.8h, v7.16b, v3.16b
      88         umull       v3.8h,  v7.8b,  v3.8b
      89 
                 /* r = rounded product >> 8, narrowed into v4-v7.  v7 can be
                  * reused here because all the multiplies are done.
                  */
      90         rshrn       v4.8b,  v0.8h,  #8
      91         rshrn2      v4.16b, v12.8h, #8
      92         rshrn       v5.8b,  v1.8h,  #8
      93         rshrn2      v5.16b, v13.8h, #8
      94         rshrn       v6.8b,  v2.8h,  #8
      95         rshrn2      v6.16b, v14.8h, #8
      96         rshrn       v7.8b,  v3.8h,  #8
      97         rshrn2      v7.16b, v15.8h, #8
      98 
                 /* product += r (widening add) */
      99         uaddw       v0.8h,  v0.8h,  v4.8b
     100         uaddw2      v12.8h, v12.8h, v4.16b
     101         uaddw       v1.8h,  v1.8h,  v5.8b
     102         uaddw2      v13.8h, v13.8h, v5.16b
     103         uaddw       v2.8h,  v2.8h,  v6.8b
     104         uaddw2      v14.8h, v14.8h, v6.16b
     105         uaddw       v3.8h,  v3.8h,  v7.8b
     106         uaddw2      v15.8h, v15.8h, v7.16b
     107 
                 /* final rounded >> 8, repacking both halves into v0-v3 */
     108         rshrn       v0.8b,  v0.8h,  #8
     109         rshrn2      v0.16b, v12.8h, #8
     110         rshrn       v1.8b,  v1.8h,  #8
     111         rshrn2      v1.16b, v13.8h, #8
     112         rshrn       v2.8b,  v2.8h,  #8
     113         rshrn2      v2.16b, v14.8h, #8
     114         rshrn       v3.8b,  v3.8h,  #8
     115         rshrn2      v3.16b, v15.8h, #8
     116 
                 /* add the source with unsigned saturation */
     117         uqadd       v0.16b, v0.16b, v8.16b
     118         uqadd       v1.16b, v1.16b, v9.16b
     119         uqadd       v2.16b, v2.16b, v10.16b
     120         uqadd       v3.16b, v3.16b, v11.16b
     121 .endm
    122 
     123 #define params_DST_OVER zipped=1
         /* DST_OVER: every source component (v8-v11, alpha included) is
          * multiplied by the inverted destination alpha, scaled back to 8 bits,
          * and added to the destination component (v0-v3) with unsigned
          * saturation.
          *
          * The scaling of each 16-bit product t is
          *     r = (t + 128) >> 8;  result = (t + r + 128) >> 8
          * which is the usual rounded approximation of t / 255.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v7 and v12-v15 as scratch, and v8-v11 with the scaled
          * source.
          */
     124 .macro blend_kernel_DST_OVER
                 /* v7 = bitwise inverse of the destination alpha */
     125         mvn         v7.16b, v3.16b
     126 
                 /* 16-bit products: high halves go to v12-v15, low halves
                  * replace v8-v11.
                  */
     127         umull2      v12.8h, v7.16b, v8.16b
     128         umull       v8.8h,  v7.8b,  v8.8b
     129         umull2      v13.8h, v7.16b, v9.16b
     130         umull       v9.8h,  v7.8b,  v9.8b
     131         umull2      v14.8h, v7.16b, v10.16b
     132         umull       v10.8h, v7.8b,  v10.8b
     133         umull2      v15.8h, v7.16b, v11.16b
     134         umull       v11.8h, v7.8b,  v11.8b
     135 
                 /* r = rounded product >> 8, narrowed into v4-v7 */
     136         rshrn       v4.8b,  v8.8h,  #8
     137         rshrn2      v4.16b, v12.8h, #8
     138         rshrn       v5.8b,  v9.8h,  #8
     139         rshrn2      v5.16b, v13.8h, #8
     140         rshrn       v6.8b,  v10.8h, #8
     141         rshrn2      v6.16b, v14.8h, #8
     142         rshrn       v7.8b,  v11.8h, #8
     143         rshrn2      v7.16b, v15.8h, #8
     144 
                 /* product += r (widening add) */
     145         uaddw       v8.8h,  v8.8h,  v4.8b
     146         uaddw2      v12.8h, v12.8h, v4.16b
     147         uaddw       v9.8h,  v9.8h,  v5.8b
     148         uaddw2      v13.8h, v13.8h, v5.16b
     149         uaddw       v10.8h, v10.8h, v6.8b
     150         uaddw2      v14.8h, v14.8h, v6.16b
     151         uaddw       v11.8h, v11.8h, v7.8b
     152         uaddw2      v15.8h, v15.8h, v7.16b
     153 
                 /* final rounded >> 8, repacking both halves into v8-v11 */
     154         rshrn       v8.8b,  v8.8h,  #8
     155         rshrn2      v8.16b, v12.8h, #8
     156         rshrn       v9.8b,  v9.8h,  #8
     157         rshrn2      v9.16b, v13.8h, #8
     158         rshrn       v10.8b,  v10.8h, #8
     159         rshrn2      v10.16b, v14.8h, #8
     160         rshrn       v11.8b,  v11.8h, #8
     161         rshrn2      v11.16b, v15.8h, #8
     162 
                 /* add the scaled source to the destination with saturation */
     163         uqadd       v0.16b, v0.16b, v8.16b
     164         uqadd       v1.16b, v1.16b, v9.16b
     165         uqadd       v2.16b, v2.16b, v10.16b
     166         uqadd       v3.16b, v3.16b, v11.16b
     167 .endm
    168 
     169 #define params_SRC_IN zipped=1
         /* SRC_IN: the output (v0-v3) is each source component (v8-v11, alpha
          * included) multiplied by the destination alpha held in v3 and scaled
          * back to 8 bits.
          *
          * The scaling of each 16-bit product t is
          *     r = (t + 128) >> 8;  result = (t + r + 128) >> 8
          * which is the usual rounded approximation of t / 255.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v7 and v12-v15 as scratch; v8-v11 are only read.
          * blend_kernel_SRC_OUT expands this macro after inverting v3.
          */
     170 .macro blend_kernel_SRC_IN
                 /* 16-bit products.  v3 is the multiplier for every product, so
                  * it is the last register to be overwritten.
                  */
     171         umull2      v12.8h, v3.16b, v8.16b
     172         umull       v0.8h,  v3.8b,  v8.8b
     173         umull2      v13.8h, v3.16b, v9.16b
     174         umull       v1.8h,  v3.8b,  v9.8b
     175         umull2      v14.8h, v3.16b, v10.16b
     176         umull       v2.8h,  v3.8b,  v10.8b
     177         umull2      v15.8h, v3.16b, v11.16b
     178         umull       v3.8h,  v3.8b,  v11.8b
     179 
                 /* r = rounded product >> 8, narrowed into v4-v7 */
     180         rshrn       v4.8b,  v0.8h,  #8
     181         rshrn2      v4.16b, v12.8h, #8
     182         rshrn       v5.8b,  v1.8h,  #8
     183         rshrn2      v5.16b, v13.8h, #8
     184         rshrn       v6.8b,  v2.8h,  #8
     185         rshrn2      v6.16b, v14.8h, #8
     186         rshrn       v7.8b,  v3.8h,  #8
     187         rshrn2      v7.16b, v15.8h, #8
     188 
                 /* product += r (widening add) */
     189         uaddw       v0.8h,  v0.8h,  v4.8b
     190         uaddw2      v12.8h, v12.8h, v4.16b
     191         uaddw       v1.8h,  v1.8h,  v5.8b
     192         uaddw2      v13.8h, v13.8h, v5.16b
     193         uaddw       v2.8h,  v2.8h,  v6.8b
     194         uaddw2      v14.8h, v14.8h, v6.16b
     195         uaddw       v3.8h,  v3.8h,  v7.8b
     196         uaddw2      v15.8h, v15.8h, v7.16b
     197 
                 /* final rounded >> 8, repacking both halves into v0-v3 */
     198         rshrn       v0.8b,  v0.8h,  #8
     199         rshrn2      v0.16b, v12.8h, #8
     200         rshrn       v1.8b,  v1.8h,  #8
     201         rshrn2      v1.16b, v13.8h, #8
     202         rshrn       v2.8b,  v2.8h,  #8
     203         rshrn2      v2.16b, v14.8h, #8
     204         rshrn       v3.8b,  v3.8h,  #8
     205         rshrn2      v3.16b, v15.8h, #8
     206 .endm
    207 
     208 #define params_DST_IN zipped=1
         /* DST_IN: the output (v0-v3) is each destination component (v0-v3,
          * alpha included) multiplied by the source alpha held in v11 and scaled
          * back to 8 bits.
          *
          * The scaling of each 16-bit product t is
          *     r = (t + 128) >> 8;  result = (t + r + 128) >> 8
          * which is the usual rounded approximation of t / 255.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v7 and v12-v15 as scratch; v8-v10 are not used and v11
          * is only read.  blend_kernel_DST_OUT expands this macro after
          * inverting v11.
          */
     209 .macro blend_kernel_DST_IN
                 /* 16-bit products: high halves go to v12-v15, low halves
                  * replace v0-v3.
                  */
     210         umull2      v12.8h, v0.16b, v11.16b
     211         umull       v0.8h,  v0.8b,  v11.8b
     212         umull2      v13.8h, v1.16b, v11.16b
     213         umull       v1.8h,  v1.8b,  v11.8b
     214         umull2      v14.8h, v2.16b, v11.16b
     215         umull       v2.8h,  v2.8b,  v11.8b
     216         umull2      v15.8h, v3.16b, v11.16b
     217         umull       v3.8h,  v3.8b,  v11.8b
     218 
                 /* r = rounded product >> 8, narrowed into v4-v7 */
     219         rshrn       v4.8b,  v0.8h,  #8
     220         rshrn2      v4.16b, v12.8h, #8
     221         rshrn       v5.8b,  v1.8h,  #8
     222         rshrn2      v5.16b, v13.8h, #8
     223         rshrn       v6.8b,  v2.8h,  #8
     224         rshrn2      v6.16b, v14.8h, #8
     225         rshrn       v7.8b,  v3.8h,  #8
     226         rshrn2      v7.16b, v15.8h, #8
     227 
                 /* product += r (widening add) */
     228         uaddw       v0.8h,  v0.8h,  v4.8b
     229         uaddw2      v12.8h, v12.8h, v4.16b
     230         uaddw       v1.8h,  v1.8h,  v5.8b
     231         uaddw2      v13.8h, v13.8h, v5.16b
     232         uaddw       v2.8h,  v2.8h,  v6.8b
     233         uaddw2      v14.8h, v14.8h, v6.16b
     234         uaddw       v3.8h,  v3.8h,  v7.8b
     235         uaddw2      v15.8h, v15.8h, v7.16b
     236 
                 /* final rounded >> 8, repacking both halves into v0-v3 */
     237         rshrn       v0.8b,  v0.8h,  #8
     238         rshrn2      v0.16b, v12.8h, #8
     239         rshrn       v1.8b,  v1.8h,  #8
     240         rshrn2      v1.16b, v13.8h, #8
     241         rshrn       v2.8b,  v2.8h,  #8
     242         rshrn2      v2.16b, v14.8h, #8
     243         rshrn       v3.8b,  v3.8h,  #8
     244         rshrn2      v3.16b, v15.8h, #8
     245 .endm
    246 
     247 #define params_SRC_OUT zipped=1
         /* SRC_OUT: same arithmetic as SRC_IN, but with the destination alpha in
          * v3 inverted first, so the source is scaled by (255 - destination
          * alpha).  Register usage is that of blend_kernel_SRC_IN.
          */
     248 .macro blend_kernel_SRC_OUT
     249         mvn         v3.16b, v3.16b
     250         blend_kernel_SRC_IN
     251 .endm
    252 
    253 
     254 #define params_DST_OUT zipped=1
         /* DST_OUT: same arithmetic as DST_IN, but with the source alpha in v11
          * inverted first, so the destination is scaled by (255 - source alpha).
          * Register usage is that of blend_kernel_DST_IN, and v11 is left
          * inverted.
          */
     255 .macro blend_kernel_DST_OUT
     256         mvn         v11.16b, v11.16b
     257         blend_kernel_DST_IN
     258 .endm
    259 
     260 #define params_SRC_ATOP zipped=1
         /* SRC_ATOP: for the three colour components only (v0-v2), compute
          *     destination * (255 - source alpha) + source * destination alpha
          * as a 16-bit sum and scale it back to 8 bits.  v3 is never written, so
          * the output alpha is the destination alpha.
          *
          * The two products are summed with 16-bit unsigned saturation; the sum s
          * is then scaled as  (s + ((s + 128) >> 8) + 128) >> 8, with saturation
          * applied at the add and at the final narrowing.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v6, v8-v10 and v12-v14; v11 is left inverted.
          */
     261 .macro blend_kernel_SRC_ATOP
                 /* v11 = bitwise inverse of the source alpha */
     262         mvn         v11.16b, v11.16b
     263 
                 /* destination colour * inverted source alpha:
                  * high halves in v12-v14, low halves replace v0-v2.
                  */
     264         umull2      v12.8h, v11.16b, v0.16b
     265         umull       v0.8h,  v11.8b,  v0.8b
     266         umull2      v13.8h, v11.16b, v1.16b
     267         umull       v1.8h,  v11.8b,  v1.8b
     268         umull2      v14.8h, v11.16b, v2.16b
     269         umull       v2.8h,  v11.8b,  v2.8b
     270 
                 /* source colour * destination alpha:
                  * high halves in v4-v6, low halves replace v8-v10.
                  */
     271         umull2      v4.8h,  v3.16b, v8.16b
     272         umull       v8.8h,  v3.8b,  v8.8b
     273         umull2      v5.8h,  v3.16b, v9.16b
     274         umull       v9.8h,  v3.8b,  v9.8b
     275         umull2      v6.8h,  v3.16b, v10.16b
     276         umull       v10.8h, v3.8b,  v10.8b
     277 
                 /* sum the two products with 16-bit saturation */
     278         uqadd       v12.8h, v12.8h, v4.8h
     279         uqadd       v0.8h,  v0.8h,  v8.8h
     280         uqadd       v13.8h, v13.8h, v5.8h
     281         uqadd       v1.8h,  v1.8h,  v9.8h
     282         uqadd       v14.8h, v14.8h, v6.8h
     283         uqadd       v2.8h,  v2.8h,  v10.8h
     284 
                 /* r = rounded sum >> 8, kept as 16-bit lanes */
     285         urshr       v8.8h,  v0.8h,  #8
     286         urshr       v4.8h,  v12.8h, #8
     287         urshr       v9.8h,  v1.8h,  #8
     288         urshr       v5.8h,  v13.8h, #8
     289         urshr       v10.8h, v2.8h,  #8
     290         urshr       v6.8h,  v14.8h, #8
     291 
                 /* sum += r, saturating */
     292         uqadd       v0.8h,  v0.8h,  v8.8h
     293         uqadd       v12.8h, v12.8h, v4.8h
     294         uqadd       v1.8h,  v1.8h,  v9.8h
     295         uqadd       v13.8h, v13.8h, v5.8h
     296         uqadd       v2.8h,  v2.8h,  v10.8h
     297         uqadd       v14.8h, v14.8h, v6.8h
     298 
                 /* final rounded >> 8 with saturating narrow into v0-v2 */
     299         uqrshrn     v0.8b,  v0.8h,  #8
     300         uqrshrn2    v0.16b, v12.8h, #8
     301         uqrshrn     v1.8b,  v1.8h,  #8
     302         uqrshrn2    v1.16b, v13.8h, #8
     303         uqrshrn     v2.8b,  v2.8h,  #8
     304         uqrshrn2    v2.16b, v14.8h, #8
     305 .endm
    306 
     307 #define params_DST_ATOP zipped=1
         /* DST_ATOP: for the three colour components (v0-v2), compute
          *     destination * source alpha + source * (255 - destination alpha)
          * as a 16-bit sum and scale it back to 8 bits.  The output alpha (v3) is
          * a copy of the source alpha (v11).
          *
          * The two products are summed with 16-bit unsigned saturation; the sum s
          * is then scaled as  (s + ((s + 128) >> 8) + 128) >> 8, with saturation
          * applied at the add and at the final narrowing.
          *
          * Requires one component per register (zipped=1).
          * Overwrites v4-v6, v8-v10 and v12-v14; v11 is only read.
          */
     308 .macro blend_kernel_DST_ATOP
                 /* v3 = bitwise inverse of the destination alpha */
     309         mvn         v3.16b, v3.16b
     310 
                 /* destination colour * source alpha:
                  * high halves in v12-v14, low halves replace v0-v2.
                  */
     311         umull2      v12.8h, v11.16b, v0.16b
     312         umull       v0.8h,  v11.8b,  v0.8b
     313         umull2      v13.8h, v11.16b, v1.16b
     314         umull       v1.8h,  v11.8b,  v1.8b
     315         umull2      v14.8h, v11.16b, v2.16b
     316         umull       v2.8h,  v11.8b,  v2.8b
     317 
                 /* source colour * inverted destination alpha:
                  * high halves in v4-v6, low halves replace v8-v10.
                  */
     318         umull2      v4.8h,  v3.16b, v8.16b
     319         umull       v8.8h,  v3.8b,  v8.8b
     320         umull2      v5.8h,  v3.16b, v9.16b
     321         umull       v9.8h,  v3.8b,  v9.8b
     322         umull2      v6.8h,  v3.16b, v10.16b
     323         umull       v10.8h, v3.8b,  v10.8b
     324 
                 /* sum the two products with 16-bit saturation */
     325         uqadd       v12.8h, v12.8h, v4.8h
     326         uqadd       v0.8h,  v0.8h,  v8.8h
     327         uqadd       v13.8h, v13.8h, v5.8h
     328         uqadd       v1.8h,  v1.8h,  v9.8h
     329         uqadd       v14.8h, v14.8h, v6.8h
     330         uqadd       v2.8h,  v2.8h,  v10.8h
     331 
                 /* r = rounded sum >> 8, kept as 16-bit lanes */
     332         urshr       v8.8h,  v0.8h,  #8
     333         urshr       v4.8h,  v12.8h, #8
     334         urshr       v9.8h,  v1.8h,  #8
     335         urshr       v5.8h,  v13.8h, #8
     336         urshr       v10.8h, v2.8h,  #8
     337         urshr       v6.8h,  v14.8h, #8
     338 
                 /* sum += r, saturating */
     339         uqadd       v0.8h,  v0.8h,  v8.8h
     340         uqadd       v12.8h, v12.8h, v4.8h
     341         uqadd       v1.8h,  v1.8h,  v9.8h
     342         uqadd       v13.8h, v13.8h, v5.8h
     343         uqadd       v2.8h,  v2.8h,  v10.8h
     344         uqadd       v14.8h, v14.8h, v6.8h
     345 
                 /* final rounded >> 8 with saturating narrow into v0-v2 */
     346         uqrshrn     v0.8b,  v0.8h,  #8
     347         uqrshrn2    v0.16b, v12.8h, #8
     348         uqrshrn     v1.8b,  v1.8h,  #8
     349         uqrshrn2    v1.16b, v13.8h, #8
     350         uqrshrn     v2.8b,  v2.8h,  #8
     351         uqrshrn2    v2.16b, v14.8h, #8
     352 
                 /* output alpha = source alpha */
     353         mov         v3.16b, v11.16b
     354 .endm
    355 
     356 #define params_MULTIPLY zipped=0
         /* MULTIPLY: every destination byte (v0-v3) is multiplied by the source
          * byte in the same position (v8-v11) and scaled back to 8 bits.  The
          * operation is purely byte-wise, so the components do not need to be
          * separated (zipped=0).
          *
          * The scaling of each 16-bit product t is
          *     r = (t + 128) >> 8;  result = (t + r + 128) >> 8
          * which is the usual rounded approximation of t / 255.
          *
          * Overwrites v4-v7 and v12-v15 as scratch; v8-v11 are only read.
          */
     357 .macro blend_kernel_MULTIPLY
                 /* 16-bit products: high halves go to v12-v15, low halves
                  * replace v0-v3.
                  */
     358         umull2      v12.8h, v0.16b, v8.16b
     359         umull       v0.8h,  v0.8b,  v8.8b
     360         umull2      v13.8h, v1.16b, v9.16b
     361         umull       v1.8h,  v1.8b,  v9.8b
     362         umull2      v14.8h, v2.16b, v10.16b
     363         umull       v2.8h,  v2.8b,  v10.8b
     364         umull2      v15.8h, v3.16b, v11.16b
     365         umull       v3.8h,  v3.8b,  v11.8b
     366 
                 /* r = rounded product >> 8, narrowed into v4-v7 */
     367         rshrn       v4.8b,  v0.8h,  #8
     368         rshrn2      v4.16b, v12.8h, #8
     369         rshrn       v5.8b,  v1.8h,  #8
     370         rshrn2      v5.16b, v13.8h, #8
     371         rshrn       v6.8b,  v2.8h,  #8
     372         rshrn2      v6.16b, v14.8h, #8
     373         rshrn       v7.8b,  v3.8h,  #8
     374         rshrn2      v7.16b, v15.8h, #8
     375 
                 /* product += r (widening add) */
     376         uaddw       v0.8h,  v0.8h,  v4.8b
     377         uaddw2      v12.8h, v12.8h, v4.16b
     378         uaddw       v1.8h,  v1.8h,  v5.8b
     379         uaddw2      v13.8h, v13.8h, v5.16b
     380         uaddw       v2.8h,  v2.8h,  v6.8b
     381         uaddw2      v14.8h, v14.8h, v6.16b
     382         uaddw       v3.8h,  v3.8h,  v7.8b
     383         uaddw2      v15.8h, v15.8h, v7.16b
     384 
                 /* final rounded >> 8, repacking both halves into v0-v3 */
     385         rshrn       v0.8b,  v0.8h,  #8
     386         rshrn2      v0.16b, v12.8h, #8
     387         rshrn       v1.8b,  v1.8h,  #8
     388         rshrn2      v1.16b, v13.8h, #8
     389         rshrn       v2.8b,  v2.8h,  #8
     390         rshrn2      v2.16b, v14.8h, #8
     391         rshrn       v3.8b,  v3.8h,  #8
     392         rshrn2      v3.16b, v15.8h, #8
     393 .endm
    394 
     395 #define params_ADD zipped=0
         /* ADD: byte-wise unsigned saturating add of the source (v8-v11) into the
          * destination (v0-v3).  No component separation is needed (zipped=0).
          */
     396 .macro blend_kernel_ADD
     397         uqadd    v0.16b, v0.16b, v8.16b
     398         uqadd    v1.16b, v1.16b, v9.16b
     399         uqadd    v2.16b, v2.16b, v10.16b
     400         uqadd    v3.16b, v3.16b, v11.16b
     401 .endm
    402 
     403 #define params_SUBTRACT zipped=0
         /* SUBTRACT: byte-wise unsigned saturating subtract of the source
          * (v8-v11) from the destination (v0-v3); results that would go below
          * zero become zero.  No component separation is needed (zipped=0).
          */
     404 .macro blend_kernel_SUBTRACT
     405         uqsub    v0.16b, v0.16b, v8.16b
     406         uqsub    v1.16b, v1.16b, v9.16b
     407         uqsub    v2.16b, v2.16b, v10.16b
     408         uqsub    v3.16b, v3.16b, v11.16b
     409 .endm
    410 
     411 #define params_DIFFERENCE zipped=0
         /* DIFFERENCE: byte-wise unsigned absolute difference between the
          * destination (v0-v3) and the source (v8-v11), written to v0-v3.
          * No component separation is needed (zipped=0).
          */
     412 .macro blend_kernel_DIFFERENCE
     413         uabd    v0.16b, v0.16b, v8.16b
     414         uabd    v1.16b, v1.16b, v9.16b
     415         uabd    v2.16b, v2.16b, v10.16b
     416         uabd    v3.16b, v3.16b, v11.16b
     417 .endm
    418 
     419 #define params_XOR zipped=0
         /* XOR: bitwise exclusive-or of the source (v8-v11) into the destination
          * (v0-v3).  No component separation is needed (zipped=0).
          */
     420 .macro blend_kernel_XOR
     421         eor     v0.16b, v0.16b, v8.16b
     422         eor     v1.16b, v1.16b, v9.16b
     423         eor     v2.16b, v2.16b, v10.16b
     424         eor     v3.16b, v3.16b, v11.16b
     425 .endm
    426 
    427 
    428 /* Define the wrapper code which will load and store the data, iterate the
    429  * correct number of times, and safely handle the remainder at the end of the
    430  * loop.  Various sections of assembly code are dropped or substituted for
    431  * simpler operations if they're not needed.
    432  */
     433 .macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
         /* Expands to the complete body of one blend_line_XX function, ending in
          * its own ret.
          *
          * Macro arguments:
          *   kernel -- name of the blend_kernel_XX macro to expand.
          *   nowrap -- if non-zero, expand only the kernel, with no load/store,
          *             loop or register save/restore code.
          *   zipped -- if non-zero, use ld4/st4 so that each register holds one
          *             component; otherwise use ld1/st1.
          *   lddst  -- if non-zero, load destination data into v0-v3.
          *   ldsrc  -- if non-zero, load source data into v8-v11.
          *   pld    -- guards the prefetch code, which is currently compiled out
          *             with #if 0.
          *
          * Registers in the generated code:
          *   x0 -- destination pointer; read when lddst is set, always written,
          *         advanced as data is stored.
          *   x1 -- source pointer, advanced as data is loaded.
          *   x2 -- number of bytes to process.
          *   x3 -- scratch, used to address the upper half of the save area.
          * The low 64 bits of v8-v15 are saved on the stack and restored before
          * returning.  x0 is set to 0 on return.
          */
     434 .if \nowrap
     435         \kernel
     436 .else
                 /* Reserve 64 bytes of stack and save the low halves of v8-v11
                  * at [sp] and of v12-v15 at [sp, #32] (addressed through x3).
                  */
     437         sub     x3, sp, #32
     438         sub     sp, sp, #64
     439         st1     {v8.1d - v11.1d}, [sp]
     440         st1     {v12.1d - v15.1d}, [x3]
                 /* Pre-decrement the byte count and enter the loop at its test,
                  * so a count below 64 skips the main loop entirely.
                  */
     441         subs    x2, x2, #64
     442         b       2f
     443 .align 4
                 /* Main loop: 64 bytes per iteration. */
     444 1:
     445   .if \lddst
     446     .if \zipped
     447         ld4     {v0.16b - v3.16b}, [x0]
     448     .else
     449         ld1     {v0.16b - v3.16b}, [x0]
     450     .endif
     451   .endif
     452   .if \ldsrc
     453     .if \zipped
     454         ld4     {v8.16b - v11.16b}, [x1], #64
     455     .else
     456         ld1     {v8.16b - v11.16b}, [x1], #64
     457     .endif
     458   .endif
     459   .if \pld
     460 #if 0 /* TODO: test this on real hardware */
     461     .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
     462     .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
     463 #endif
     464   .endif
     465 
     466         \kernel
     467 
                 /* The flags set here are consumed by the bge at label 2; the
                  * store in between does not alter them.
                  */
     468         subs    x2, x2, #64
     469   .if \zipped
     470         st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
     471   .else
     472         st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
     473   .endif
     474 
     475 2:      bge     1b
                 /* Undo the pre-decrement: x2 is now the number of leftover
                  * bytes.  If there are none, go straight to the restore code.
                  */
     476         adds    x2, x2, #64
     477         beq     2f
     478 
     479         /* To handle the tail portion of the data (something less than 64
     480          * bytes) load small power-of-two chunks into working registers.  It
     481          * doesn't matter where they end up in the register; the same process
     482          * will store them back out using the same positions and the operations
     483          * don't require data to interact with its neighbours.
     484          */
                 /* Zero the working registers first so that lanes which are not
                  * loaded below hold defined values.
                  */
     485         movi    v0.16b, #0
     486         movi    v1.16b, #0
     487         movi    v2.16b, #0
     488         movi    v3.16b, #0
     489 
     490         movi    v8.16b, #0
     491         movi    v9.16b, #0
     492         movi    v10.16b, #0
     493         movi    v11.16b, #0
     494 
                 /* Bits 5..0 of x2 select chunks of 32, 16, 8, 4, 2 and 1 bytes. */
     495         tbz     x2, #5, 1f
     496   .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
     497   .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
     498 1:      tbz     x2, #4, 1f
     499   .if \lddst ; ld1     {v1.16b}, [x0], #16  ; .endif
     500   .if \ldsrc ; ld1     {v9.16b}, [x1], #16  ; .endif
     501 1:      tbz     x2, #3, 1f
     502   .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
     503   .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
     504 1:      tbz     x2, #2, 1f
     505   .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
     506   .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
     507 1:      tbz     x2, #1, 1f
     508   .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
     509   .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
     510 1:      tbz     x2, #0, 1f
     511   .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
     512   .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
     513 1:
                 /* The destination loads advanced x0 by x2 bytes in total;
                  * rewind it so the stores below start at the same address.
                  */
     514   .if \lddst ; sub     x0, x0, x2           ; .endif
     515 
     516 .if \zipped
     517         /* One small impediment in the process above is that some of the load
     518          * operations can't perform byte-wise structure deinterleaving at the
     519          * same time as loading only part of a register.  So the data is loaded
     520          * linearly and unpacked manually at this point.
     521          */
                 /* Two rounds of uzp1/uzp2 gather every fourth byte, leaving one
                  * component per register in v0-v3 (scratch: v4-v7).
                  */
     522         uzp1    v4.16b, v0.16b, v1.16b
     523         uzp2    v5.16b, v0.16b, v1.16b
     524         uzp1    v6.16b, v2.16b, v3.16b
     525         uzp2    v7.16b, v2.16b, v3.16b
     526         uzp1    v0.16b, v4.16b, v6.16b
     527         uzp2    v2.16b, v4.16b, v6.16b
     528         uzp1    v1.16b, v5.16b, v7.16b
     529         uzp2    v3.16b, v5.16b, v7.16b
     530 
                 /* Same unpacking for the source data in v8-v11. */
     531         uzp1    v4.16b, v8.16b, v9.16b
     532         uzp2    v5.16b, v8.16b, v9.16b
     533         uzp1    v6.16b, v10.16b, v11.16b
     534         uzp2    v7.16b, v10.16b, v11.16b
     535         uzp1    v8.16b, v4.16b, v6.16b
     536         uzp2    v10.16b, v4.16b, v6.16b
     537         uzp1    v9.16b, v5.16b, v7.16b
     538         uzp2    v11.16b, v5.16b, v7.16b
     539 
     540         \kernel
     541 
                 /* Re-interleave the result back into linear byte order. */
     542         zip1    v4.16b, v0.16b, v2.16b
     543         zip2    v6.16b, v0.16b, v2.16b
     544         zip1    v5.16b, v1.16b, v3.16b
     545         zip2    v7.16b, v1.16b, v3.16b
     546         zip1    v0.16b, v4.16b, v5.16b
     547         zip2    v1.16b, v4.16b, v5.16b
     548         zip1    v2.16b, v6.16b, v7.16b
     549         zip2    v3.16b, v6.16b, v7.16b
     550   .else
     551         \kernel
     552   .endif
     553 
                 /* Store the tail using the same chunk order and register lanes
                  * that were used for loading.
                  */
     554         tbz     x2, #5, 1f
     555         st1     {v2.16b,v3.16b}, [x0], #32
     556 1:      tbz     x2, #4, 1f
     557         st1     {v1.16b}, [x0], #16
     558 1:      tbz     x2, #3, 1f
     559         st1     {v0.d}[1], [x0], #8
     560 1:      tbz     x2, #2, 1f
     561         st1     {v0.s}[1], [x0], #4
     562 1:      tbz     x2, #1, 1f
     563         st1     {v0.h}[1], [x0], #2
     564 1:      tbz     x2, #0, 2f
     565         st1     {v0.b}[1], [x0], #1
                 /* Restore the saved halves of v8-v15 and release the 64 bytes
                  * of stack.
                  */
     566 2:      ld1     {v8.1d - v11.1d}, [sp], #32
     567         ld1     {v12.1d - v15.1d}, [sp], #32
     568 .endif
                 /* Return 0. */
     569         mov     x0, #0
     570         ret
     571 .endm
    572 
    573 
    574 /* produce list of blend_line_XX() functions; each function uses the wrap_line
    575  * macro, passing it the name of the operation macro it wants along with
    576  * optional parameters to remove unnecessary operations.
    577  */
     578 #define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
         /* The expansion below emits one global function blend_line_<n> per
          * BLEND_LIST entry.  params_<n> is itself a preprocessor macro, so it
          * expands to the keyword arguments passed on to wrap_line.
          */
     579     BLEND_LIST(BLEND_X)
     580 #undef BLEND_X
     581 
         /* Every expansion below reassigns tablesize, so its final value is the
          * index of the last BLEND_LIST entry plus one, i.e. the number of
          * entries in the offset table.  rsdIntrinsicBlend_K uses it for its slot
          * range check.
          */
     582 #define BLEND_X(d, n) .set tablesize, d+1 ;
     583     BLEND_LIST(BLEND_X)
     584 #undef BLEND_X
    585 
    586 /*  int rsdIntrinsicBlend_K(
    587  *          uchar4 *out,        // x0
    588  *          uchar4 const *in,   // x1
    589  *          int slot,           // x2
    590  *          size_t xstart,      // x3
    591  *          size_t xend);       // x4
    592  */
    593 ENTRY(rsdIntrinsicBlend_K)
    594     adr     x5, 2f
    595     cmp     w2, tablesize >> 1
    596     bhs     1f
    597     ldrsh   x6, [x5, w2, uxtw #1]
    598     add     x0, x0, w3, uxtw #2
    599     add     x1, x1, w3, uxtw #2
    600     sub     w2, w4, w3
    601     ubfiz   x2, x2, #2, #32 /* TODO: fix */
    602     cbz     x6, 1f
    603     add     x6, x5, x6
    604     br      x6
    605 1:  mov     x0, #-1
    606     ret
    607 
    608 2:
    609 .set off,0
    610 #define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
    611         BLEND_LIST(BLEND_X)
    612 #undef BLEND_X
    613 3:
    614 
    615 END(rsdIntrinsicBlend_K)
    616