Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     /* ENTRY(f): switch to .text, apply .align 4, export f as a global symbol
      * of type function and place the label f at the current location.
      * END(f): record the size of f as the distance from the label f to the
      * point where END is used.
      */
     17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
     18 #define END(f) .size f, .-f;
     19 
     20 
     /* vmxx_f32 i, mask, opd, opa, opb
      * Conditionally emit one float multiply term of a sum of products.
      *   - Emits nothing unless bit `mask` is set in `i`.
      *   - If any bit below `mask` is also set in `i`, an earlier term has
      *     already written opd, so accumulate with fmla.
      *   - Otherwise this is the first term, so initialise opd with fmul.
      */
     21 .macro vmxx_f32 i, mask, opd, opa, opb
     22   .if (\i) & \mask
     23     .if (\i) & (\mask - 1)
     24         fmla            \opd, \opa, \opb
     25     .else
     26         fmul            \opd, \opa, \opb
     27     .endif
     28   .endif
     29 .endm
     30 
     /* vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
      * Conditionally emit the float add term.
      *   - Emits nothing unless bit `mask` is set in `i`.
      *   - If any bit below `mask` is set in `i`, emit fadd opd, opa, opb.
      *   - Otherwise nothing has been computed yet, so emit a plain register
      *     move instead.  The move takes its own operand spellings
      *     (stupidsyntax1/2); the callers in this file pass the .16b views of
      *     the same registers they pass as opd and opb.
      */
     31 .macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
     32   .if (\i) & \mask
     33     .if (\i) & (\mask - 1)
     34         fadd            \opd, \opa, \opb
     35     .else
     36         mov             \stupidsyntax1, \stupidsyntax2
     37     .endif
     38   .endif
     39 .endm
     40 
     /* vmxx_s16 i, mask, opd, opa, opb
      * Conditionally emit one widening signed 16-bit multiply term.
      *   - Emits nothing unless bit `mask` is set in `i`.
      *   - If any bit below `mask` OR bit 16 is set in `i`, accumulate into
      *     opd with smlal.  Bit 16 counts as "already initialised" because
      *     the integer column routines preload the accumulator with dup when
      *     that bit is set.
      *   - Otherwise initialise opd with smull.
      */
     41 .macro vmxx_s16 i, mask, opd, opa, opb
     42   .if (\i) & \mask
     43     .if (\i) & (\mask - 1 + 16)
     44         smlal           \opd, \opa, \opb
     45     .else
     46         smull           \opd, \opa, \opb
     47     .endif
     48   .endif
     49 .endm
     50 
     /* vmxx2_s16 i, mask, opd, opa, opb
      * Same selection logic as vmxx_s16, but emits the "2" forms
      * (smlal2 / smull2), which take their 16-bit inputs from the upper half
      * of opa.
      *   - Emits nothing unless bit `mask` is set in `i`.
      *   - Accumulates (smlal2) if any bit below `mask` or bit 16 is set in
      *     `i`; otherwise initialises opd (smull2).
      */
     51 .macro vmxx2_s16 i, mask, opd, opa, opb
     52   .if (\i) & \mask
     53     .if (\i) & (\mask - 1 + 16)
     54         smlal2          \opd, \opa, \opb
     55     .else
     56         smull2          \opd, \opa, \opb
     57     .endif
     58   .endif
     59 .endm
     60 
     61 /* x0 = dst
     62  * x1 = src
     63  * x2 = count
     64  * x3 = params
     65  * x4 = column0_fn
     66  * x5 = column1_fn
     67  * x6 = column2_fn
     68  * x7 = column3_fn
     69  * x8 = store_fn
     70  * x9 = load_fn
     71  */
     /* Instantiate every column routine in this .irp body once per value of i
      * in 0..15.  Each instantiation defines two labels per routine: one
      * suffixed with i, assembled for mask i, and one suffixed with "n" and i,
      * assembled for mask i^31 (the complement within five bits, so bit 16 is
      * always set there and never set in the plain variant).
      */
     72 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     73 
     /* Integer column 0 for mask i.
      * Mask bits 1, 2, 4, 8 select which of the 16-bit inputs v12..v15 are
      * multiplied by v0.h[0], v0.h[4], v1.h[0], v1.h[4] respectively.  v6
      * collects the products of the low four lanes, v7 those of the high four
      * lanes.  The 32-bit sums are narrowed with a signed-to-unsigned
      * saturating shift right by 8 into v8.8h, then control passes to x5
      * (column1_fn in the register table above).
      * The bit-16 test can never be true for i in 0..15, so the dup preload is
      * never emitted in this variant.
      * NOTE(review): for i == 0 no multiply is emitted at all, so v8 is built
      * from whatever v6/v7 already hold - presumably that instance is never
      * selected for a live output; confirm against the dispatcher.
      */
     74 .align 6
     75 colormatrix_int_col0_\i:
     76       .if \i & 16
     77             dup         v6.4s, v4.s[0]
     78             dup         v7.4s, v4.s[0]
     79       .endif
     80             vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
     81             vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
     82             vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
     83             vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
     84             vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
     85             vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
     86             vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
     87             vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
     88             sqshrun     v8.4h, v6.4s, #8
     89             sqshrun2    v8.8h, v7.4s, #8
     90             br          x5
     91 
     /* Integer column 0 for mask i^31 (bit 16 always set).
      * v6 and v7 are first preloaded with v4.s[0] broadcast to every lane;
      * the selected products of v12..v15 with v0.h[0], v0.h[4], v1.h[0],
      * v1.h[4] are then accumulated on top (low four lanes in v6, high four
      * in v7).  The result is narrowed with a saturating shift right by 8
      * into v8.8h before branching to x5 (column1_fn).
      * This label has no .align of its own; it follows directly after the
      * plain variant.
      */
     92 colormatrix_int_col0_n\i:
     93       .if (\i^31) & 16
     94             dup         v6.4s, v4.s[0]
     95             dup         v7.4s, v4.s[0]
     96       .endif
     97             vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
     98             vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
     99             vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
    100             vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
    101             vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
    102             vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
    103             vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
    104             vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
    105             sqshrun     v8.4h, v6.4s, #8
    106             sqshrun2    v8.8h, v7.4s, #8
    107             br          x5
    108 
    /* Integer column 1 for mask i.
     * Multiplies the selected inputs v12..v15 by v0.h[1], v0.h[5], v1.h[1],
     * v1.h[5], summing the low four lanes in v6 and the high four in v7.
     * The sums are narrowed with a saturating shift right by 8 into v9.8h,
     * then control passes to x6 (column2_fn).
     * The bit-16 dup preload is never emitted for i in 0..15.
     */
    109 .align 6
    110 colormatrix_int_col1_\i:
    111       .if \i & 16
    112             dup         v6.4s, v4.s[1]
    113             dup         v7.4s, v4.s[1]
    114       .endif
    115             vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
    116             vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
    117             vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
    118             vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
    119             vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
    120             vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
    121             vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
    122             vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
    123             sqshrun     v9.4h, v6.4s, #8
    124             sqshrun2    v9.8h, v7.4s, #8
    125             br          x6
    126 
    /* Integer column 1 for mask i^31 (bit 16 always set).
     * Preloads v6/v7 with v4.s[1] in every lane, accumulates the selected
     * products of v12..v15 with v0.h[1], v0.h[5], v1.h[1], v1.h[5], narrows
     * with a saturating shift right by 8 into v9.8h, then branches to x6
     * (column2_fn).
     */
    127 colormatrix_int_col1_n\i:
    128       .if (\i^31) & 16
    129             dup         v6.4s, v4.s[1]
    130             dup         v7.4s, v4.s[1]
    131       .endif
    132             vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
    133             vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
    134             vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
    135             vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
    136             vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
    137             vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
    138             vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
    139             vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
    140             sqshrun     v9.4h, v6.4s, #8
    141             sqshrun2    v9.8h, v7.4s, #8
    142             br          x6
    143 
    /* Integer column 2 for mask i.
     * Multiplies the selected inputs v12..v15 by v0.h[2], v0.h[6], v1.h[2],
     * v1.h[6], summing the low four lanes in v6 and the high four in v7.
     * The sums are narrowed with a saturating shift right by 8 into v10.8h,
     * then control passes to x7 (column3_fn).
     * The bit-16 dup preload is never emitted for i in 0..15.
     */
    144 .align 6
    145 colormatrix_int_col2_\i:
    146       .if \i & 16
    147             dup         v6.4s, v4.s[2]
    148             dup         v7.4s, v4.s[2]
    149       .endif
    150             vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
    151             vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
    152             vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
    153             vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
    154             vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
    155             vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
    156             vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
    157             vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
    158             sqshrun     v10.4h, v6.4s, #8
    159             sqshrun2    v10.8h, v7.4s, #8
    160             br          x7
    161 
    /* Integer column 2 for mask i^31 (bit 16 always set).
     * Preloads v6/v7 with v4.s[2] in every lane, accumulates the selected
     * products of v12..v15 with v0.h[2], v0.h[6], v1.h[2], v1.h[6], narrows
     * with a saturating shift right by 8 into v10.8h, then branches to x7
     * (column3_fn).
     */
    162 colormatrix_int_col2_n\i:
    163       .if (\i^31) & 16
    164             dup         v6.4s, v4.s[2]
    165             dup         v7.4s, v4.s[2]
    166       .endif
    167             vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
    168             vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
    169             vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
    170             vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
    171             vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
    172             vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
    173             vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
    174             vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
    175             sqshrun     v10.4h, v6.4s, #8
    176             sqshrun2    v10.8h, v7.4s, #8
    177             br          x7
    178 
    /* Integer column 3 for mask i.
     * Multiplies the selected inputs v12..v15 by v0.h[3], v0.h[7], v1.h[3],
     * v1.h[7], summing the low four lanes in v6 and the high four in v7.
     * The sums are narrowed with a saturating shift right by 8 into v11.8h,
     * then control passes to x8 (store_fn).
     * The bit-16 dup preload is never emitted for i in 0..15.
     */
    179 .align 6
    180 colormatrix_int_col3_\i:
    181       .if \i & 16
    182             dup         v6.4s, v4.s[3]
    183             dup         v7.4s, v4.s[3]
    184       .endif
    185             vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
    186             vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
    187             vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
    188             vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
    189             vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
    190             vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
    191             vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
    192             vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
    193             sqshrun     v11.4h, v6.4s, #8
    194             sqshrun2    v11.8h, v7.4s, #8
    195             br          x8
    196 
    /* Integer column 3 for mask i^31 (bit 16 always set).
     * Preloads v6/v7 with v4.s[3] in every lane, accumulates the selected
     * products of v12..v15 with v0.h[3], v0.h[7], v1.h[3], v1.h[7], narrows
     * with a saturating shift right by 8 into v11.8h, then branches to x8
     * (store_fn).
     */
    197 colormatrix_int_col3_n\i:
    198       .if (\i^31) & 16
    199             dup         v6.4s, v4.s[3]
    200             dup         v7.4s, v4.s[3]
    201       .endif
    202             vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
    203             vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
    204             vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
    205             vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
    206             vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
    207             vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
    208             vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
    209             vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
    210             sqshrun     v11.4h, v6.4s, #8
    211             sqshrun2    v11.8h, v7.4s, #8
    212             br          x8
    213 
    /* Float column 0 for mask i.
     * Computes the column twice, once per group of four lanes:
     *   v8  from inputs v12..v15, v16 from inputs v20..v23,
     * each input scaled by lane 0 of v0, v1, v2, v3 respectively, with mask
     * bits 1, 2, 4, 8 selecting which inputs take part.  The bit-16 add of
     * v4 is never emitted for i in 0..15.  Ends by branching to x5
     * (column1_fn).
     * NOTE(review): for i == 0 only the branch is emitted, so v8/v16 are
     * left untouched - presumably intended; confirm against the dispatcher.
     */
    214 .align 5
    215 colormatrix_float_col0_\i:
    216             vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
    217             vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
    218             vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
    219             vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
    220             vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
    221             vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
    222             vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
    223             vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
    224             vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
    225             vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
    226             br          x5
    227 
    /* Float column 0 for mask i^31 (bit 16 always set).
     * v8 (inputs v12..v15) and v16 (inputs v20..v23) receive the selected
     * products with lane 0 of v0..v3, and v4 is then added to each.  When no
     * product is selected (i == 15) the add degenerates to a plain copy of
     * v4 into v8 and v16.  Ends by branching to x5 (column1_fn).
     */
    228 .align 4
    229 colormatrix_float_col0_n\i:
    230             vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
    231             vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
    232             vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
    233             vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
    234             vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
    235             vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
    236             vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
    237             vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
    238             vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
    239             vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
    240             br          x5
    241 
    /* Float column 1 for mask i.
     * v9 (inputs v12..v15) and v17 (inputs v20..v23) receive the selected
     * products with lane 1 of v0, v1, v2, v3.  The bit-16 add of v5 is never
     * emitted for i in 0..15.  Ends by branching to x6 (column2_fn).
     */
    242 .align 5
    243 colormatrix_float_col1_\i:
    244             vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
    245             vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
    246             vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
    247             vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
    248             vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
    249             vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
    250             vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
    251             vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
    252             vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
    253             vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
    254             br          x6
    255 
    /* Float column 1 for mask i^31 (bit 16 always set).
     * v9 (inputs v12..v15) and v17 (inputs v20..v23) receive the selected
     * products with lane 1 of v0..v3, then v5 is added to each (or simply
     * copied when no product is selected).  Ends by branching to x6
     * (column2_fn).
     */
    256 .align 4
    257 colormatrix_float_col1_n\i:
    258             vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
    259             vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
    260             vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
    261             vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
    262             vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
    263             vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
    264             vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
    265             vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
    266             vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
    267             vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
    268             br          x6
    269 
    /* Float column 2 for mask i.
     * v10 (inputs v12..v15) and v18 (inputs v20..v23) receive the selected
     * products with lane 2 of v0, v1, v2, v3.  The bit-16 add of v6 is never
     * emitted for i in 0..15.  Ends by branching to x7 (column3_fn).
     */
    270 .align 5
    271 colormatrix_float_col2_\i:
    272             vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
    273             vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
    274             vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
    275             vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
    276             vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
    277             vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
    278             vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
    279             vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
    280             vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
    281             vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
    282             br          x7
    283 
    /* Float column 2 for mask i^31 (bit 16 always set).
     * v10 (inputs v12..v15) and v18 (inputs v20..v23) receive the selected
     * products with lane 2 of v0..v3, then v6 is added to each (or simply
     * copied when no product is selected).  Ends by branching to x7
     * (column3_fn).
     */
    284 .align 4
    285 colormatrix_float_col2_n\i:
    286             vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
    287             vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
    288             vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
    289             vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
    290             vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
    291             vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
    292             vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
    293             vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
    294             vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
    295             vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
    296             br          x7
    297 
    /* Float column 3 for mask i.
     * v11 (inputs v12..v15) and v19 (inputs v20..v23) receive the selected
     * products with lane 3 of v0, v1, v2, v3.  The bit-16 add of v7 is never
     * emitted for i in 0..15.  Ends by branching to x8 (store_fn).
     */
    298 .align 5
    299 colormatrix_float_col3_\i:
    300             vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
    301             vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
    302             vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
    303             vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
    304             vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
    305             vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
    306             vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
    307             vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
    308             vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
    309             vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
    310             br          x8
    311 
    /* Float column 3 for mask i^31 (bit 16 always set).
     * v11 (inputs v12..v15) and v19 (inputs v20..v23) receive the selected
     * products with lane 3 of v0..v3, then v7 is added to each (or simply
     * copied when no product is selected).  Ends by branching to x8
     * (store_fn).
     */
    312 .align 4
    313 colormatrix_float_col3_n\i:
    314             vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
    315             vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
    316             vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
    317             vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
    318             vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
    319             vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
    320             vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
    321             vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
    322             vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
    323             vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
    324             br          x8
    325 
    326 .endr
    327 
    /* Load eight 4-channel byte pixels for the float path.
     * ld4 de-interleaves 32 bytes from [x1] (post-incremented by 32) into one
     * register per channel.  Each channel is widened 8 -> 16 -> 32 bits and
     * converted from unsigned integer to float: pixels 0..3 end up in
     * v12..v15 and pixels 4..7 in v20..v23.  Ends by branching to x4
     * (column0_fn).
     */
    328 .align 6
    329 colormatrix_float_ldu4:
    330             ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
    331             uxtl        v20.8h, v20.8b
    332             uxtl        v21.8h, v21.8b
    333             uxtl        v22.8h, v22.8b
    334             uxtl        v23.8h, v23.8b
    /* Extract the low halves first: the uxtl2 forms below overwrite their
     * own source registers. */
    335             uxtl        v12.4s, v20.4h
    336             uxtl        v13.4s, v21.4h
    337             uxtl        v14.4s, v22.4h
    338             uxtl        v15.4s, v23.4h
    339             uxtl2       v20.4s, v20.8h
    340             uxtl2       v21.4s, v21.8h
    341             uxtl2       v22.4s, v22.8h
    342             uxtl2       v23.4s, v23.8h
    343             ucvtf       v12.4s, v12.4s
    344             ucvtf       v13.4s, v13.4s
    345             ucvtf       v14.4s, v14.4s
    346             ucvtf       v15.4s, v15.4s
    347             ucvtf       v20.4s, v20.4s
    348             ucvtf       v21.4s, v21.4s
    349             ucvtf       v22.4s, v22.4s
    350             ucvtf       v23.4s, v23.4s
    351             br          x4
    352 
    /* Load eight 4-channel byte pixels for the integer path.
     * ld4 de-interleaves 32 bytes from [x1] (post-incremented by 32) into
     * v12..v15, one register per channel, and each is zero-extended to eight
     * 16-bit lanes.  Ends by branching to x4 (column0_fn).
     */
    353 .align 5
    354 colormatrix_int_ldu4:
    355             ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
    356             uxtl        v12.8h, v12.8b
    357             uxtl        v13.8h, v13.8b
    358             uxtl        v14.8h, v14.8b
    359             uxtl        v15.8h, v15.8b
    360             br          x4
    361 
    /* Load eight byte pixels stored four bytes apiece, using only the first
     * three channels, for the float path.
     * The ld4 still reads all 32 bytes (x1 post-incremented by 32), but only
     * v20..v22 are widened and converted: pixels 0..3 end up as floats in
     * v12..v14 and pixels 4..7 in v20..v22.  v23 keeps the raw fourth-channel
     * bytes and v15 is not written.  Ends by branching to x4 (column0_fn).
     */
    362 .align 6
    363 colormatrix_float_ldu3:
    364             ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
    365             uxtl        v20.8h, v20.8b
    366             uxtl        v21.8h, v21.8b
    367             uxtl        v22.8h, v22.8b
    368             uxtl        v12.4s, v20.4h
    369             uxtl        v13.4s, v21.4h
    370             uxtl        v14.4s, v22.4h
    371             uxtl2       v20.4s, v20.8h
    372             uxtl2       v21.4s, v21.8h
    373             uxtl2       v22.4s, v22.8h
    374             ucvtf       v12.4s, v12.4s
    375             ucvtf       v13.4s, v13.4s
    376             ucvtf       v14.4s, v14.4s
    377             ucvtf       v20.4s, v20.4s
    378             ucvtf       v21.4s, v21.4s
    379             ucvtf       v22.4s, v22.4s
    380             br          x4
    381 
    /* Load eight byte pixels stored four bytes apiece, using only the first
     * three channels, for the integer path.
     * The ld4 reads all 32 bytes (x1 post-incremented by 32); v12..v14 are
     * zero-extended to 16-bit lanes while v15 is left holding the raw
     * fourth-channel bytes.  Ends by branching to x4 (column0_fn).
     */
    382 colormatrix_int_ldu3:
    383             ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
    384             uxtl        v12.8h, v12.8b
    385             uxtl        v13.8h, v13.8b
    386             uxtl        v14.8h, v14.8b
    387             br          x4
    388 
    /* Load eight single-channel byte pixels for the float path.
     * Reads 8 bytes from [x1] (post-incremented by 8), widens them
     * 8 -> 16 -> 32 bits and converts to float: pixels 0..3 in v12,
     * pixels 4..7 in v20.  Ends by branching to x4 (column0_fn).
     */
    389 .align 5
    390 colormatrix_float_ldu1:
    391             ld1         {v20.8b}, [x1], #8
    392             uxtl        v20.8h, v20.8b
    393             uxtl        v12.4s, v20.4h
    394             uxtl2       v20.4s, v20.8h
    395             ucvtf       v12.4s, v12.4s
    396             ucvtf       v20.4s, v20.4s
    397             br          x4
    398 
    /* Load eight 2-channel byte pixels for the float path.
     * ld2 de-interleaves 16 bytes from [x1] (post-incremented by 16); each
     * channel is widened 8 -> 16 -> 32 bits and converted to float:
     * pixels 0..3 in v12/v13, pixels 4..7 in v20/v21.  Ends by branching to
     * x4 (column0_fn).
     */
    399 .align 6
    400 colormatrix_float_ldu2:
    401             ld2         {v20.8b,v21.8b}, [x1], #16
    402             uxtl        v20.8h, v20.8b
    403             uxtl        v21.8h, v21.8b
    404             uxtl        v12.4s, v20.4h
    405             uxtl        v13.4s, v21.4h
    406             uxtl2       v20.4s, v20.8h
    407             uxtl2       v21.4s, v21.8h
    408             ucvtf       v12.4s, v12.4s
    409             ucvtf       v13.4s, v13.4s
    410             ucvtf       v20.4s, v20.4s
    411             ucvtf       v21.4s, v21.4s
    412             br          x4
    413 
    /* Load eight 2-channel byte pixels for the integer path.
     * ld2 de-interleaves 16 bytes from [x1] (post-incremented by 16) into
     * v12/v13, which are then zero-extended to eight 16-bit lanes each.
     * Ends by branching to x4 (column0_fn).
     */
    414 .align 4
    415 colormatrix_int_ldu2:
    416             ld2         {v12.8b,v13.8b}, [x1], #16
    417             uxtl        v12.8h, v12.8b
    418             uxtl        v13.8h, v13.8b
    419             br          x4
    420 
    /* Convert eight 4-channel float results to bytes and store them.
     * Inputs: v8..v11 (pixels 0..3) and v16..v19 (pixels 4..7).
     * fcvtzs with one fractional bit followed by a rounding narrow by one
     * bit rounds to the nearest integer, with unsigned saturation to 16
     * bits; uqxtn then saturates to 8 bits.  st4 interleaves the four
     * channels into 32 bytes at [x0] (post-incremented by 32).
     * x2 is decremented by 8 before the store; the store does not change the
     * flags, so blo still tests that subtraction and leaves for
     * colormatrix_float_end (defined outside this excerpt) when x2 was below
     * 8.  Otherwise control continues at x9 (load_fn).
     */
    421 .align 6
    422 colormatrix_float_stu4:
    423             fcvtzs      v24.4s, v8.4s, #1
    424             fcvtzs      v25.4s, v9.4s, #1
    425             fcvtzs      v26.4s, v10.4s, #1
    426             fcvtzs      v27.4s, v11.4s, #1
    427             fcvtzs      v28.4s, v16.4s, #1
    428             fcvtzs      v29.4s, v17.4s, #1
    429             fcvtzs      v30.4s, v18.4s, #1
    430             fcvtzs      v31.4s, v19.4s, #1
    431             sqrshrun    v24.4h, v24.4s, #1
    432             sqrshrun    v25.4h, v25.4s, #1
    433             sqrshrun    v26.4h, v26.4s, #1
    434             sqrshrun    v27.4h, v27.4s, #1
    435             sqrshrun2   v24.8h, v28.4s, #1
    436             sqrshrun2   v25.8h, v29.4s, #1
    437             sqrshrun2   v26.8h, v30.4s, #1
    438             sqrshrun2   v27.8h, v31.4s, #1
    439             uqxtn       v24.8b, v24.8h
    440             uqxtn       v25.8b, v25.8h
    441             uqxtn       v26.8b, v26.8h
    442             uqxtn       v27.8b, v27.8h
    443             subs        x2, x2, #8
    444             st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
    445             blo         colormatrix_float_end
    446             br          x9
    447 
    /* Narrow eight 4-channel integer results to bytes and store them.
     * v8..v11 (16-bit lanes) are narrowed with unsigned saturation into
     * v12..v15 and interleaved into 32 bytes at [x0] (post-incremented by
     * 32).  x2 is decremented by 8; if it was below 8 the routine leaves for
     * colormatrix_int_end (defined outside this excerpt), otherwise control
     * continues at x9 (load_fn).
     */
    448 .align 5
    449 colormatrix_int_stu4:
    450             uqxtn       v12.8b, v8.8h
    451             uqxtn       v13.8b, v9.8h
    452             uqxtn       v14.8b, v10.8h
    453             uqxtn       v15.8b, v11.8h
    454             subs        x2, x2, #8
    455             st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
    456             blo         colormatrix_int_end
    457             br          x9
    458 
    /* Convert eight 3-channel float results to bytes and store them as
     * four-byte pixels with a zero fourth byte.
     * Inputs: v8..v10 (pixels 0..3) and v16..v18 (pixels 4..7).  The values
     * are rounded to integers (fcvtzs with one fractional bit, then a
     * rounding narrow by one bit), saturated to 8 bits, and interleaved
     * together with an all-zero v27 into 32 bytes at [x0] (post-incremented
     * by 32).  x2 is decremented by 8; if it was below 8 the routine leaves
     * for colormatrix_float_end (defined outside this excerpt), otherwise
     * control continues at x9 (load_fn).
     */
    459 .align 6
    460 colormatrix_float_stu3:
    461             fcvtzs      v24.4s, v8.4s, #1
    462             fcvtzs      v25.4s, v9.4s, #1
    463             fcvtzs      v26.4s, v10.4s, #1
    464             fcvtzs      v28.4s, v16.4s, #1
    465             fcvtzs      v29.4s, v17.4s, #1
    466             fcvtzs      v30.4s, v18.4s, #1
    467             sqrshrun    v24.4h, v24.4s, #1
    468             sqrshrun    v25.4h, v25.4s, #1
    469             sqrshrun    v26.4h, v26.4s, #1
    470             sqrshrun2   v24.8h, v28.4s, #1
    471             sqrshrun2   v25.8h, v29.4s, #1
    472             sqrshrun2   v26.8h, v30.4s, #1
    473             uqxtn       v24.8b, v24.8h
    474             uqxtn       v25.8b, v25.8h
    475             uqxtn       v26.8b, v26.8h
    476             movi        v27.8b, #0
    477             subs        x2, x2, #8
    478             st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
    479             blo         colormatrix_float_end
    480             br          x9
    481 
    /* Load eight single-channel byte pixels for the integer path.
     * Reads 8 bytes from [x1] (post-incremented by 8) into v12 and
     * zero-extends them to eight 16-bit lanes.  Ends by branching to x4
     * (column0_fn).
     */
    482 .align 4
    483 colormatrix_int_ldu1:
    484             ld1         {v12.8b}, [x1], #8
    485             uxtl        v12.8h, v12.8b
    486             br          x4
    487 
    /* Narrow eight 3-channel integer results to bytes and store them as
     * four-byte pixels with a zero fourth byte.
     * v8..v10 are narrowed with unsigned saturation into v12..v14, v15 is
     * cleared, and the four registers are interleaved into 32 bytes at [x0]
     * (post-incremented by 32).  x2 is decremented by 8; if it was below 8
     * the routine leaves for colormatrix_int_end (defined outside this
     * excerpt), otherwise control continues at x9 (load_fn).
     */
    488 .align 5
    489 colormatrix_int_stu3:
    490             uqxtn       v12.8b, v8.8h
    491             uqxtn       v13.8b, v9.8h
    492             uqxtn       v14.8b, v10.8h
    493             movi        v15.8b, #0
    494             subs        x2, x2, #8
    495             st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
    496             blo         colormatrix_int_end
    497             br          x9
    498 
    /* Convert eight 2-channel float results to bytes and store them.
     * Inputs: v8/v9 (pixels 0..3) and v16/v17 (pixels 4..7).  The values are
     * rounded to integers (fcvtzs with one fractional bit, then a rounding
     * narrow by one bit), saturated to 8 bits, and interleaved into 16 bytes
     * at [x0] (post-incremented by 16).  x2 is decremented by 8; if it was
     * below 8 the routine leaves for colormatrix_float_end (defined outside
     * this excerpt), otherwise control continues at x9 (load_fn).
     */
    499 .align 6
    500 colormatrix_float_stu2:
    501             fcvtzs      v24.4s, v8.4s, #1
    502             fcvtzs      v25.4s, v9.4s, #1
    503             fcvtzs      v28.4s, v16.4s, #1
    504             fcvtzs      v29.4s, v17.4s, #1
    505             sqrshrun    v24.4h, v24.4s, #1
    506             sqrshrun    v25.4h, v25.4s, #1
    507             sqrshrun2   v24.8h, v28.4s, #1
    508             sqrshrun2   v25.8h, v29.4s, #1
    509             uqxtn       v24.8b, v24.8h
    510             uqxtn       v25.8b, v25.8h
    511             subs        x2, x2, #8
    512             st2         {v24.8b,v25.8b}, [x0], #16
    513             blo         colormatrix_float_end
    514             br          x9
    515 
    /* Narrow eight 2-channel integer results to bytes and store them.
     * v8/v9 are narrowed with unsigned saturation into v12/v13 and
     * interleaved into 16 bytes at [x0] (post-incremented by 16).  x2 is
     * decremented by 8; if it was below 8 the routine leaves for
     * colormatrix_int_end (defined outside this excerpt), otherwise control
     * continues at x9 (load_fn).
     */
    516 .align 5
    517 colormatrix_int_stu2:
    518             uqxtn       v12.8b, v8.8h
    519             uqxtn       v13.8b, v9.8h
    520             subs        x2, x2, #8
    521             st2         {v12.8b,v13.8b}, [x0], #16
    522             blo         colormatrix_int_end
    523             br          x9
    524 
    /* Narrow eight single-channel integer results to bytes and store them.
     * v8 is narrowed with unsigned saturation into v12 and written as 8
     * bytes at [x0] (post-incremented by 8).  x2 is decremented by 8; if it
     * was below 8 the routine leaves for colormatrix_int_end (defined
     * outside this excerpt), otherwise control continues at x9 (load_fn).
     */
    525 .align 5
    526 colormatrix_int_stu1:
    527             uqxtn       v12.8b, v8.8h
    528             subs        x2, x2, #8
    529             st1         {v12.8b}, [x0], #8
    530             blo         colormatrix_int_end
    531             br          x9
    532 
    /* Load eight float pixels stored four floats apiece.
     * Two ld4 loads of 64 bytes each (x1 post-incremented by 128 in total)
     * de-interleave pixels 0..3 into v12..v15 and pixels 4..7 into v20..v23.
     * The instruction sequence is the same as colormatrix_float_ldf4: the
     * fourth channel is loaded as well.  Ends by branching to x4
     * (column0_fn).
     */
    533 colormatrix_float_ldf3:
    534             ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
    535             ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
    536             br          x4
    537 
    /* Convert eight single-channel float results to bytes and store them.
     * Inputs: v8 (pixels 0..3) and v16 (pixels 4..7).  The values are
     * rounded to integers (fcvtzs with one fractional bit, then a rounding
     * narrow by one bit), saturated to 8 bits, and written as 8 bytes at
     * [x0] (post-incremented by 8).  x2 is decremented by 8; if it was below
     * 8 the routine leaves for colormatrix_float_end (defined outside this
     * excerpt), otherwise control continues at x9 (load_fn).
     */
    538 .align 6
    539 colormatrix_float_stu1:
    540             fcvtzs      v24.4s, v8.4s, #1
    541             fcvtzs      v28.4s, v16.4s, #1
    542             sqrshrun    v24.4h, v24.4s, #1
    543             sqrshrun2   v24.8h, v28.4s, #1
    544             uqxtn       v24.8b, v24.8h
    545             subs        x2, x2, #8
    546             st1         {v24.8b}, [x0], #8
    547             blo         colormatrix_float_end
    548             br          x9
    549 
    /* Store eight 3-channel float results as four-float pixels with a zero
     * fourth element.
     * v11 and v19 are cleared so the fourth stored channel is zero, then two
     * st4 stores of 64 bytes each write pixels 0..3 (v8..v11) and pixels
     * 4..7 (v16..v19) to [x0], post-incrementing it by 128 in total.  The
     * subs sits between the stores; stores do not change the flags, so blo
     * still tests that subtraction and leaves for colormatrix_float_end
     * (defined outside this excerpt) when x2 was below 8.  Otherwise control
     * continues at x9 (load_fn).
     */
    550 colormatrix_float_stf3:
    551             movi        v11.16b, #0
    552             st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
    553             movi        v19.16b, #0
    554             subs        x2, x2, #8
    555             st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
    556             blo         colormatrix_float_end
    557             br          x9
    558 
    /* Store eight 4-channel float results.
     * Two st4 stores of 64 bytes each interleave pixels 0..3 (v8..v11) and
     * pixels 4..7 (v16..v19) to [x0], post-incrementing it by 128 in total.
     * x2 is decremented by 8; if it was below 8 the routine leaves for
     * colormatrix_float_end (defined outside this excerpt), otherwise
     * control continues at x9 (load_fn).
     */
    559 .align 5
    560 colormatrix_float_stf4:
    561             st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
    562             subs        x2, x2, #8
    563             st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
    564             blo         colormatrix_float_end
    565             br          x9
    566 
    /* Load eight 4-channel float pixels.
     * Two ld4 loads of 64 bytes each (x1 post-incremented by 128 in total)
     * de-interleave pixels 0..3 into v12..v15 and pixels 4..7 into v20..v23.
     * Ends by branching to x4 (column0_fn).
     */
    567 colormatrix_float_ldf4:
    568             ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
    569             ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
    570             br          x4
    571 
    /* Store eight 2-channel float results.
     * Two st2 stores of 32 bytes each interleave pixels 0..3 (v8/v9) and
     * pixels 4..7 (v16/v17) to [x0], post-incrementing it by 64 in total.
     * x2 is decremented by 8; if it was below 8 the routine leaves for
     * colormatrix_float_end (defined outside this excerpt), otherwise
     * control continues at x9 (load_fn).
     */
    572 .align 5
    573 colormatrix_float_stf2:
    574             st2         {v8.4s, v9.4s}, [x0], #32
    575             subs        x2, x2, #8
    576             st2         {v16.4s, v17.4s}, [x0], #32
    577             blo         colormatrix_float_end
    578             br          x9
    579 
    /* Load eight 2-channel float pixels.
     * Two ld2 loads of 32 bytes each (x1 post-incremented by 64 in total)
     * de-interleave pixels 0..3 into v12/v13 and pixels 4..7 into v20/v21.
     * Ends by branching to x4 (column0_fn).
     */
    580 colormatrix_float_ldf2:
    581             ld2         {v12.4s,v13.4s}, [x1], #32
    582             ld2         {v20.4s,v21.4s}, [x1], #32
    583             br          x4
    584 
    /* Store eight single-channel float results.
     * v8 (pixels 0..3) and v16 (pixels 4..7) are written as 16 bytes each to
     * [x0], post-incrementing it by 32 in total.  x2 is decremented by 8; if
     * it was below 8 the routine leaves for colormatrix_float_end (defined
     * outside this excerpt), otherwise control continues at x9 (load_fn).
     */
    585 .align 5
    586 colormatrix_float_stf1:
    587             st1         {v8.4s}, [x0], #16
    588             subs        x2, x2, #8
    589             st1         {v16.4s}, [x0], #16
    590             blo         colormatrix_float_end
    591             br          x9
    592 
    /* Load eight single-channel float pixels.
     * Two 16-byte loads (x1 post-incremented by 32 in total) place pixels
     * 0..3 in v12 and pixels 4..7 in v20.  Ends by branching to x4
     * (column0_fn).
     */
    593 colormatrix_float_ldf1:
    594             ld1         {v12.4s}, [x1], #16
    595             ld1         {v20.4s}, [x1], #16
    596             br          x4
    597 
    /* Partial store of single-channel integer results (fewer than eight
     * pixels), driven by the low three bits of x2.
     * After narrowing v8 into v12, byte lane k holds pixel lane k:
     *   bit 2 of x2 -> store lanes 4..7 (4 bytes),
     *   bit 1       -> store lanes 2..3 (2 bytes),
     *   bit 0       -> store lane 1     (1 byte).
     * This is the same lane placement colormatrix_int_ldu1_end uses when
     * loading.  x0 is post-incremented after each store.  Ends by branching
     * to colormatrix_int_realend (defined outside this excerpt).
     */
    598 colormatrix_int_stu1_end:
    599             uqxtn       v12.8b, v8.8h
    600             tbz         x2, #2, 1f
    601             st1         {v12.s}[1], [x0], #4
    602 1:          tbz         x2, #1, 1f
    603             st1         {v12.h}[1], [x0], #2
    604 1:          tbz         x2, #0, 1f
    605             st1         {v12.b}[1], [x0], #1
    606 1:          b           colormatrix_int_realend
    607 
    /* Partial store of 2-channel integer results (fewer than eight pixels),
     * driven by the low three bits of x2.
     * v8/v9 are narrowed to bytes and zip1 interleaves them so that pixel
     * lane k occupies bytes 2k and 2k+1 of v12:
     *   bit 2 of x2 -> store pixel lanes 4..7 (8 bytes),
     *   bit 1       -> store pixel lanes 2..3 (4 bytes),
     *   bit 0       -> store pixel lane 1     (2 bytes).
     * x0 is post-incremented after each store.  Ends by branching to
     * colormatrix_int_realend (defined outside this excerpt).
     */
    608 colormatrix_int_stu2_end:
    609             uqxtn       v12.8b, v8.8h
    610             uqxtn       v13.8b, v9.8h
    611             zip1        v12.16b, v12.16b, v13.16b
    612             tbz         x2, #2, 1f
    613             st1         {v12.d}[1], [x0], #8
    614 1:          tbz         x2, #1, 1f
    615             st1         {v12.s}[1], [x0], #4
    616 1:          tbz         x2, #0, 1f
    617             st1         {v12.h}[1], [x0], #2
    618 1:          b           colormatrix_int_realend
    619 
    /* Partial store of 3-channel integer results (fewer than eight pixels)
     * as four-byte pixels with a zero fourth byte, driven by the low three
     * bits of x2.
     * v8..v10 are narrowed into v12..v14 and v15 is cleared; each lane store
     * writes one 4-byte pixel and post-increments x0 by 4:
     *   bit 2 of x2 -> store pixel lanes 4..7,
     *   bit 1       -> store pixel lanes 2..3,
     *   bit 0       -> store pixel lane 1.
     * Ends by branching to colormatrix_int_realend (defined outside this
     * excerpt).
     */
    620 colormatrix_int_stu3_end:
    621             uqxtn       v12.8b, v8.8h
    622             uqxtn       v13.8b, v9.8h
    623             uqxtn       v14.8b, v10.8h
    624             movi        v15.8b, #0
    625             tbz         x2, #2, 1f
    626             st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    627             st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    628             st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    629             st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
    630 1:          tbz         x2, #1, 1f
    631             st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    632             st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
    633 1:          tbz         x2, #0, 1f
    634             st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
    635 1:          b           colormatrix_int_realend
    636 
    /* Partial store of 4-channel integer results (fewer than eight pixels),
     * driven by the low three bits of x2.
     * v8..v11 are narrowed into v12..v15; each lane store writes one 4-byte
     * pixel and post-increments x0 by 4:
     *   bit 2 of x2 -> store pixel lanes 4..7,
     *   bit 1       -> store pixel lanes 2..3,
     *   bit 0       -> store pixel lane 1.
     * This is the same lane placement colormatrix_int_ldu4_end uses when
     * loading.  Ends by branching to colormatrix_int_realend (defined
     * outside this excerpt).
     */
    637 colormatrix_int_stu4_end:
    638             uqxtn       v12.8b, v8.8h
    639             uqxtn       v13.8b, v9.8h
    640             uqxtn       v14.8b, v10.8h
    641             uqxtn       v15.8b, v11.8h
    642             tbz         x2, #2, 1f
    643             st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    644             st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    645             st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    646             st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
    647 1:          tbz         x2, #1, 1f
    648             st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    649             st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
    650 1:          tbz         x2, #0, 1f
    651             st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
    652 1:          b           colormatrix_int_realend
    653 
    654 
    /* Partial load of single-channel byte pixels (fewer than eight), driven
     * by the low three bits of x2.
     * The bytes are gathered into the upper half of v15:
     *   bit 2 of x2 -> 4 bytes into bytes 12..15,
     *   bit 1       -> 2 bytes into bytes 10..11,
     *   bit 0       -> 1 byte  into byte 9.
     * uxtl2 then widens that upper half, so the pixels land in lanes 4..7,
     * 2..3 and 1 of v12.8h.  Lanes that were not loaded carry whatever v15
     * held before.  x1 is post-incremented after each load.  Ends by
     * branching to x4 (column0_fn).
     */
    655 colormatrix_int_ldu1_end:
    656             tbz         x2, #2, 1f
    657             ld1         {v15.s}[3], [x1], #4
    658 1:          tbz         x2, #1, 1f
    659             ld1         {v15.h}[5], [x1], #2
    660 1:          tbz         x2, #0, 1f
    661             ld1         {v15.b}[9], [x1], #1
    662 1:          uxtl2       v12.8h, v15.16b
    663             br          x4
    664 
    /* Partial load of 2-channel byte pixels (fewer than eight), driven by
     * the low three bits of x2.
     * Pixel lane k is placed in bytes 2k and 2k+1 of v15:
     *   bit 2 of x2 -> 8 bytes into pixel lanes 4..7,
     *   bit 1       -> 4 bytes into pixel lanes 2..3,
     *   bit 0       -> 2 bytes into pixel lane 1.
     * uzp1/uzp2 then split the even (first channel) and odd (second channel)
     * bytes, which are zero-extended into v12.8h and v13.8h.  v14 is used as
     * scratch.  Lanes that were not loaded carry whatever v15 held before.
     * Ends by branching to x4 (column0_fn).
     */
    665 colormatrix_int_ldu2_end:
    666             tbz         x2, #2, 1f
    667             ld1         {v15.d}[1], [x1], #8
    668 1:          tbz         x2, #1, 1f
    669             ld1         {v15.s}[1], [x1], #4
    670 1:          tbz         x2, #0, 1f
    671             ld1         {v15.h}[1], [x1], #2
    672 1:          uzp1        v14.16b, v15.16b, v15.16b
    673             uzp2        v15.16b, v15.16b, v15.16b
    674             uxtl        v12.8h, v14.8b
    675             uxtl        v13.8h, v15.8b
    676             br          x4
    677 
    /* Partial load of byte pixels stored four bytes apiece (fewer than
     * eight), using only the first three channels, driven by the low three
     * bits of x2.
     * Each lane load reads one 4-byte pixel and post-increments x1 by 4:
     *   bit 2 of x2 -> load pixel lanes 4..7,
     *   bit 1       -> load pixel lanes 2..3,
     *   bit 0       -> load pixel lane 1.
     * Only v12..v14 are then zero-extended to 16-bit lanes; v15 keeps the
     * raw fourth-channel bytes.  Lanes that were not loaded carry whatever
     * the registers held before.  Ends by branching to x4 (column0_fn).
     */
    678 colormatrix_int_ldu3_end:
    679             tbz         x2, #2, 1f
    680             ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
    681             ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
    682             ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
    683             ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
    684 1:          tbz         x2, #1, 1f
    685             ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
    686             ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
    687 1:          tbz         x2, #0, 1f
    688             ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
    689 1:          uxtl        v12.8h, v12.8b
    690             uxtl        v13.8h, v13.8b
    691             uxtl        v14.8h, v14.8b
    692             br          x4
    693 
    /* Partial load of 4-channel byte pixels (fewer than eight), driven by
     * the low three bits of x2.
     * Each lane load reads one 4-byte pixel and post-increments x1 by 4:
     *   bit 2 of x2 -> load pixel lanes 4..7,
     *   bit 1       -> load pixel lanes 2..3,
     *   bit 0       -> load pixel lane 1.
     * All four channel registers v12..v15 are then zero-extended to 16-bit
     * lanes.  Lanes that were not loaded carry whatever the registers held
     * before.  Ends by branching to x4 (column0_fn).
     */
    694 colormatrix_int_ldu4_end:
    695             tbz         x2, #2, 1f
    696             ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
    697             ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
    698             ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
    699             ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
    700 1:          tbz         x2, #1, 1f
    701             ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
    702             ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
    703 1:          tbz         x2, #0, 1f
    704             ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
    705 1:          uxtl        v12.8h, v12.8b
    706             uxtl        v13.8h, v13.8b
    707             uxtl        v14.8h, v14.8b
    708             uxtl        v15.8h, v15.8b
    709             br          x4
    710 
    711 colormatrix_float_stu1_end:                 /* tail store: float results in v8 (elements 0-3) and v16 (elements 4-7) -> one byte each at [x0] */
    712             fcvtzs      v12.4s, v8.4s, #1   /* float -> signed fixed point with one fraction bit (truncating) */
    713             fcvtzs      v13.4s, v16.4s, #1
    714             sqrshrun    v12.4h, v12.4s, #1  /* rounding shift drops the fraction bit; saturate to unsigned 16 bits */
    715             sqrshrun2   v12.8h, v13.4s, #1
    716             uqxtn       v12.8b, v12.8h      /* saturating narrow to unsigned bytes */
    717             tbz         x2, #2, 1f          /* bit 2 of x2 set: store bytes 4-7 */
    718             st1         {v12.s}[1], [x0], #4
    719 1:          tbz         x2, #1, 1f          /* bit 1 set: store bytes 2-3 */
    720             st1         {v12.h}[1], [x0], #2
    721 1:          tbz         x2, #0, 1f          /* bit 0 set: store byte 1 */
    722             st1         {v12.b}[1], [x0], #1
    723 1:          b           colormatrix_float_realend
    724 
    725 colormatrix_float_stu2_end:                 /* tail store: two float result vectors per half (v8/v9, v16/v17) -> interleaved byte pairs at [x0] */
    726             fcvtzs      v12.4s, v8.4s, #1   /* float -> signed fixed point with one fraction bit (truncating) */
    727             fcvtzs      v13.4s, v9.4s, #1
    728             fcvtzs      v14.4s, v16.4s, #1
    729             fcvtzs      v15.4s, v17.4s, #1
    730             sqrshrun    v12.4h, v12.4s, #1  /* rounding shift drops the fraction bit; saturate to unsigned 16 bits */
    731             sqrshrun    v13.4h, v13.4s, #1
    732             sqrshrun    v14.4h, v14.4s, #1
    733             sqrshrun    v15.4h, v15.4s, #1
    734             zip1        v12.8h, v12.8h, v13.8h  /* interleave the two vectors for elements 0-3 */
    735             zip1        v13.8h, v14.8h, v15.8h  /* interleave the two vectors for elements 4-7 */
    736             uqxtn       v12.8b, v12.8h      /* saturating narrow to bytes: low half = elements 0-3, */
    737             uqxtn2      v12.16b, v13.8h     /* high half = elements 4-7 */
    738             tbz         x2, #2, 1f          /* bit 2 of x2 set: store the four pairs held in the upper 8 bytes */
    739             st1         {v12.d}[1], [x0], #8
    740 1:          tbz         x2, #1, 1f          /* bit 1 set: store the pairs in byte positions 4-7 */
    741             st1         {v12.s}[1], [x0], #4
    742 1:          tbz         x2, #0, 1f          /* bit 0 set: store the pair in byte positions 2-3 */
    743             st1         {v12.h}[1], [x0], #2
    744 1:          b           colormatrix_float_realend
    745 
    746 colormatrix_float_stu3_end:                 /* tail store: three float result vectors per half (v8-v10, v16-v18) -> four-byte groups with a zero fourth byte */
    747             fcvtzs      v24.4s, v8.4s, #1   /* float -> signed fixed point with one fraction bit (truncating) */
    748             fcvtzs      v25.4s, v9.4s, #1
    749             fcvtzs      v26.4s, v10.4s, #1
    750             fcvtzs      v28.4s, v16.4s, #1
    751             fcvtzs      v29.4s, v17.4s, #1
    752             fcvtzs      v30.4s, v18.4s, #1
    753             sqrshrun    v24.4h, v24.4s, #1  /* rounding shift drops the fraction bit; saturate to unsigned 16 bits (elements 0-3) */
    754             sqrshrun    v25.4h, v25.4s, #1
    755             sqrshrun    v26.4h, v26.4s, #1
    756             sqrshrun2   v24.8h, v28.4s, #1  /* same for elements 4-7, written to the upper halves */
    757             sqrshrun2   v25.8h, v29.4s, #1
    758             sqrshrun2   v26.8h, v30.4s, #1
    759             uqxtn       v12.8b, v24.8h      /* saturating narrow to unsigned bytes */
    760             uqxtn       v13.8b, v25.8h
    761             uqxtn       v14.8b, v26.8h
    762             movi        v15.8b, #0          /* fourth byte of every stored group is zero */
    763             tbz         x2, #2, 1f          /* bit 2 of x2 set: store the groups in byte lanes 4-7 */
    764             st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    765             st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    766             st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    767             st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
    768 1:          tbz         x2, #1, 1f          /* bit 1 set: store the groups in byte lanes 2-3 */
    769             st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    770             st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
    771 1:          tbz         x2, #0, 1f          /* bit 0 set: store the group in byte lane 1 */
    772             st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
    773 1:          b           colormatrix_float_realend
    774 
    775 colormatrix_float_stu4_end:                 /* tail store: four float result vectors per half (v8-v11, v16-v19) -> four-byte groups at [x0] */
    776             fcvtzs      v24.4s, v8.4s, #1   /* float -> signed fixed point with one fraction bit (truncating) */
    777             fcvtzs      v25.4s, v9.4s, #1
    778             fcvtzs      v26.4s, v10.4s, #1
    779             fcvtzs      v27.4s, v11.4s, #1
    780             fcvtzs      v28.4s, v16.4s, #1
    781             fcvtzs      v29.4s, v17.4s, #1
    782             fcvtzs      v30.4s, v18.4s, #1
    783             fcvtzs      v31.4s, v19.4s, #1
    784             sqrshrun    v24.4h, v24.4s, #1  /* rounding shift drops the fraction bit; saturate to unsigned 16 bits (elements 0-3) */
    785             sqrshrun    v25.4h, v25.4s, #1
    786             sqrshrun    v26.4h, v26.4s, #1
    787             sqrshrun    v27.4h, v27.4s, #1
    788             sqrshrun2   v24.8h, v28.4s, #1  /* same for elements 4-7, written to the upper halves */
    789             sqrshrun2   v25.8h, v29.4s, #1
    790             sqrshrun2   v26.8h, v30.4s, #1
    791             sqrshrun2   v27.8h, v31.4s, #1
    792             uqxtn       v12.8b, v24.8h      /* saturating narrow to unsigned bytes */
    793             uqxtn       v13.8b, v25.8h
    794             uqxtn       v14.8b, v26.8h
    795             uqxtn       v15.8b, v27.8h
    796             tbz         x2, #2, 1f          /* bit 2 of x2 set: store the groups in byte lanes 4-7 */
    797             st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    798             st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    799             st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    800             st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
    801 1:          tbz         x2, #1, 1f          /* bit 1 set: store the groups in byte lanes 2-3 */
    802             st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    803             st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
    804 1:          tbz         x2, #0, 1f          /* bit 0 set: store the group in byte lane 1 */
    805             st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
    806 1:          b           colormatrix_float_realend
    807 
    808 colormatrix_float_stf1_end:                 /* tail store of 32-bit lanes, unconverted: v16 = elements 4-7, v8 = elements 0-3 */
    809             tbz         x2, #2, 1f          /* bit 2 of x2 set: store all four lanes of v16 (16 bytes) */
    810             st1         {v16.4s}, [x0], #16
    811 1:          tbz         x2, #1, 1f          /* bit 1 set: store lanes 2-3 of v8 (8 bytes) */
    812             st1         {v8.d}[1], [x0], #8
    813 1:          tbz         x2, #0, 1f          /* bit 0 set: store lane 1 of v8 (4 bytes) */
    814             st1         {v8.s}[1], [x0], #4
    815 1:          b           colormatrix_float_realend
    816 
    817 colormatrix_float_stf2_end:                 /* tail store of interleaved pairs of 32-bit lanes: v16/v17 = elements 4-7, v8/v9 = elements 0-3 */
    818             tbz         x2, #2, 1f          /* bit 2 of x2 set: store four interleaved pairs (32 bytes) */
    819             st2         {v16.4s, v17.4s}, [x0], #32
    820 1:          tbz         x2, #1, 1f          /* bit 1 set: store the pairs from lanes 2 and 3 */
    821             st2         {v8.s,v9.s}[2], [x0], #8
    822             st2         {v8.s,v9.s}[3], [x0], #8
    823 1:          tbz         x2, #0, 1f          /* bit 0 set: store the pair from lane 1 */
    824             st2         {v8.s,v9.s}[1], [x0], #8
    825 1:          b           colormatrix_float_realend
    826 
    827 colormatrix_float_stf3_end:                 /* three-vector variant: zero the fourth vector of each half, then share the four-vector store below */
    828             movi        v11.16b, #0
    829             movi        v19.16b, #0         /* falls through into colormatrix_float_stf4_end */
    830 colormatrix_float_stf4_end:                 /* tail store of interleaved groups of four 32-bit lanes: v16-v19 = elements 4-7, v8-v11 = elements 0-3 */
    831             tbz         x2, #2, 1f          /* bit 2 of x2 set: store four interleaved groups (64 bytes) */
    832             st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
    833 1:          tbz         x2, #1, 1f          /* bit 1 set: store the groups from lanes 2 and 3 */
    834             st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
    835             st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
    836 1:          tbz         x2, #0, 1f          /* bit 0 set: store the group from lane 1 */
    837             st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
    838 1:          b           colormatrix_float_realend
    839 
    840 colormatrix_float_ldu1_end:                 /* tail load: up to 7 single bytes from [x1] -> floats in v12 (elements 0-3) and v20 (elements 4-7) */
    841             tbz         x2, #2, 1f          /* bit 2 of x2 set: load bytes 4-7 */
    842             ld1         {v15.s}[1], [x1], #4
    843 1:          tbz         x2, #1, 1f          /* bit 1 set: load bytes 2-3 */
    844             ld1         {v15.h}[1], [x1], #2
    845 1:          tbz         x2, #0, 1f          /* bit 0 set: load byte 1 */
    846             ld1         {v15.b}[1], [x1], #1
    847 1:          uxtl        v15.8h, v15.8b      /* zero-extend bytes -> 16 bits */
    848             uxtl        v12.4s, v15.4h      /* elements 0-3 -> 32 bits */
    849             uxtl2       v20.4s, v15.8h      /* elements 4-7 -> 32 bits */
    850             ucvtf       v12.4s, v12.4s      /* unsigned integer -> float */
    851             ucvtf       v20.4s, v20.4s
    852             br          x4                  /* continue at the routine whose address is held in x4 */
    853 
    854 colormatrix_float_ldu2_end:                 /* tail load: up to 7 interleaved byte pairs from [x1] -> floats in v12/v13 (elements 0-3) and v20/v21 (elements 4-7) */
    855             tbz         x2, #2, 1f          /* bit 2 of x2 set: load the four pairs for the upper 8 bytes */
    856             ld1         {v15.d}[1], [x1], #8
    857 1:          tbz         x2, #1, 1f          /* bit 1 set: load two pairs into byte positions 4-7 */
    858             ld1         {v15.s}[1], [x1], #4
    859 1:          tbz         x2, #0, 1f          /* bit 0 set: load one pair into byte positions 2-3 */
    860             ld1         {v15.h}[1], [x1], #2
    861 1:          uxtl        v14.8h, v15.8b      /* zero-extend bytes -> 16 bits: v14 = pairs 0-3, */
    862             uxtl2       v15.8h, v15.16b     /* v15 = pairs 4-7 */
    863             uzp1        v12.8h, v14.8h, v14.8h  /* even halfwords = first member of each pair */
    864             uzp2        v13.8h, v14.8h, v14.8h  /* odd halfwords = second member of each pair */
    865             uzp1        v20.8h, v15.8h, v15.8h
    866             uzp2        v21.8h, v15.8h, v15.8h
    867             uxtl        v12.4s, v12.4h      /* widen to 32 bits */
    868             uxtl        v13.4s, v13.4h
    869             uxtl        v20.4s, v20.4h
    870             uxtl        v21.4s, v21.4h
    871             ucvtf       v12.4s, v12.4s      /* unsigned integer -> float */
    872             ucvtf       v13.4s, v13.4s
    873             ucvtf       v20.4s, v20.4s
    874             ucvtf       v21.4s, v21.4s
    875             br          x4                  /* continue at the routine whose address is held in x4 */
    876 
    877 colormatrix_float_ldu3_end:                 /* tail load: up to 7 four-byte groups from [x1]; first three bytes of each -> floats in v12-v14 and v20-v22 */
    878             tbz         x2, #2, 1f          /* bit 2 of x2 set: load the groups for byte lanes 4-7 */
    879             ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
    880             ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
    881             ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
    882             ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
    883 1:          tbz         x2, #1, 1f          /* bit 1 set: load the groups for byte lanes 2-3 */
    884             ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
    885             ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
    886 1:          tbz         x2, #0, 1f          /* bit 0 set: load the group for byte lane 1 */
    887             ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
    888 1:          uxtl        v20.8h, v20.8b      /* zero-extend bytes -> 16 bits; v23 is loaded but not converted */
    889             uxtl        v21.8h, v21.8b
    890             uxtl        v22.8h, v22.8b
    891             uxtl        v12.4s, v20.4h      /* elements 0-3 -> 32 bits in v12-v14 */
    892             uxtl        v13.4s, v21.4h
    893             uxtl        v14.4s, v22.4h
    894             uxtl2       v20.4s, v20.8h      /* elements 4-7 -> 32 bits, in place in v20-v22 */
    895             uxtl2       v21.4s, v21.8h
    896             uxtl2       v22.4s, v22.8h
    897             ucvtf       v12.4s, v12.4s      /* unsigned integer -> float */
    898             ucvtf       v13.4s, v13.4s
    899             ucvtf       v14.4s, v14.4s
    900             ucvtf       v20.4s, v20.4s
    901             ucvtf       v21.4s, v21.4s
    902             ucvtf       v22.4s, v22.4s
    903             br          x4                  /* continue at the routine whose address is held in x4 */
    904 
    905 colormatrix_float_ldu4_end:                 /* tail load: up to 7 four-byte groups from [x1] -> floats in v12-v15 (elements 0-3) and v20-v23 (elements 4-7) */
    906             tbz         x2, #2, 1f          /* bit 2 of x2 set: load the groups for byte lanes 4-7 */
    907             ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
    908             ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
    909             ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
    910             ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
    911 1:          tbz         x2, #1, 1f          /* bit 1 set: load the groups for byte lanes 2-3 */
    912             ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
    913             ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
    914 1:          tbz         x2, #0, 1f          /* bit 0 set: load the group for byte lane 1 */
    915             ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
    916 1:          uxtl        v20.8h, v20.8b      /* zero-extend bytes -> 16 bits */
    917             uxtl        v21.8h, v21.8b
    918             uxtl        v22.8h, v22.8b
    919             uxtl        v23.8h, v23.8b
    920             uxtl        v12.4s, v20.4h      /* elements 0-3 -> 32 bits in v12-v15 */
    921             uxtl        v13.4s, v21.4h
    922             uxtl        v14.4s, v22.4h
    923             uxtl        v15.4s, v23.4h
    924             uxtl2       v20.4s, v20.8h      /* elements 4-7 -> 32 bits, in place in v20-v23 */
    925             uxtl2       v21.4s, v21.8h
    926             uxtl2       v22.4s, v22.8h
    927             uxtl2       v23.4s, v23.8h
    928             ucvtf       v12.4s, v12.4s      /* unsigned integer -> float */
    929             ucvtf       v13.4s, v13.4s
    930             ucvtf       v14.4s, v14.4s
    931             ucvtf       v15.4s, v15.4s
    932             ucvtf       v20.4s, v20.4s
    933             ucvtf       v21.4s, v21.4s
    934             ucvtf       v22.4s, v22.4s
    935             ucvtf       v23.4s, v23.4s
    936             br          x4                  /* continue at the routine whose address is held in x4 */
    937 
    938 colormatrix_float_ldf1_end:                 /* tail load of 32-bit lanes, no conversion: v20 = elements 4-7, v12 = elements 0-3 */
    939             tbz         x2, #2, 1f          /* bit 2 of x2 set: load four lanes (16 bytes) into v20 */
    940             ld1         {v20.4s}, [x1], #16
    941 1:          tbz         x2, #1, 1f          /* bit 1 set: load lanes 2-3 of v12 (8 bytes) */
    942             ld1         {v12.d}[1], [x1], #8
    943 1:          tbz         x2, #0, 1f          /* bit 0 set: load lane 1 of v12 (4 bytes) */
    944             ld1         {v12.s}[1], [x1], #4
    945 1:          br          x4                  /* continue at the routine whose address is held in x4 */
    946 
    947 colormatrix_float_ldf2_end:                 /* tail load of interleaved pairs of 32-bit lanes: v20/v21 = elements 4-7, v12/v13 = elements 0-3 */
    948             tbz         x2, #2, 1f          /* bit 2 of x2 set: load four de-interleaved pairs (32 bytes) */
    949             ld2         {v20.4s,v21.4s}, [x1], #32
    950 1:          tbz         x2, #1, 1f          /* bit 1 set: load the pairs for lanes 2 and 3 */
    951             ld2         {v12.s,v13.s}[2], [x1], #8
    952             ld2         {v12.s,v13.s}[3], [x1], #8
    953 1:          tbz         x2, #0, 1f          /* bit 0 set: load the pair for lane 1 */
    954             ld2         {v12.s,v13.s}[1], [x1], #8
    955 1:          br          x4                  /* continue at the routine whose address is held in x4 */
    956 
    957 colormatrix_float_ldf3_end:                 /* the three-vector variant shares the four-vector load code below */
    958 colormatrix_float_ldf4_end:                 /* tail load of interleaved groups of four 32-bit lanes: v20-v23 = elements 4-7, v12-v15 = elements 0-3 */
    959             tbz         x2, #2, 1f          /* bit 2 of x2 set: load four de-interleaved groups (64 bytes) */
    960             ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
    961 1:          tbz         x2, #1, 1f          /* bit 1 set: load the groups for lanes 2 and 3 */
    962             ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
    963             ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
    964 1:          tbz         x2, #0, 1f          /* bit 0 set: load the group for lane 1 */
    965             ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
    966 1:          br          x4                  /* continue at the routine whose address is held in x4 */
    967 
    968 /* void rsdIntrinsicColorMatrix_int_K(
    969  *          void *out,              // x0
    970  *          void const *in,         // x1
    971  *          size_t count,           // x2
    972  *          fntab_t const *fns,     // x3
    973  *          int16_t const *mult,    // x4
    974  *          int32_t const *add);    // x5
         *
         * Entry point of the fixed-point kernel.  Saves d8-d15, loads sixteen
         * 16-bit multipliers into v0-v1 and four 32-bit addends into v4, reads
         * six routine addresses fns[0..5] into x4-x9 and dispatches through x9.
         * When fns was prepared by rsdIntrinsicColorMatrixSetup_int_K, fns[0..3]
         * are the column routines and fns[4..7] are the store, load, tail-store
         * and tail-load routines respectively.
    975  */
    976 ENTRY(rsdIntrinsicColorMatrix_int_K)
    977             sub         x7, sp, #32         /* x7 = upper half of the 64-byte save area reserved below */
    978             sub         sp, sp, #64
    979             st1         {v8.1d-v11.1d}, [sp]    /* save the low 64 bits of v8-v15 */
    980             st1         {v12.1d-v15.1d}, [x7]
    981 
    982             ld1         {v0.8h,v1.8h}, [x4], #32    /* 16 halfword multipliers */
    983             ld1         {v4.4s}, [x5], #16          /* 4 word addends */
    984 
    985             ldp         x4,x5, [x3],#16     /* x4-x7 = fns[0..3] */
    986             ldp         x6,x7, [x3],#16
    987             ldp         x8,x9, [x3],#16     /* x8 = fns[4], x9 = fns[5]; x3 now points at fns[6] */
    988 
    989             dup         v12.4s, v4.s[0]     /* broadcast each addend across a vector */
    990             dup         v13.4s, v4.s[1]
    991             dup         v14.4s, v4.s[2]
    992             dup         v15.4s, v4.s[3]
    993             sqshrun     v8.4h, v12.4s, #8   /* v8-v11: each addend >> 8, saturated to unsigned 16 bits, */
    994             sqshrun2    v8.8h, v12.4s, #8   /* replicated in all eight halfword lanes */
    995             sqshrun     v9.4h, v13.4s, #8
    996             sqshrun2    v9.8h, v13.4s, #8
    997             sqshrun     v10.4h, v14.4s, #8
    998             sqshrun2    v10.8h, v14.4s, #8
    999             sqshrun     v11.4h, v15.4s, #8
   1000             sqshrun2    v11.8h, v15.4s, #8
   1001 
   1002             subs        x2, x2, #8          /* fewer than 8 elements: go straight to the tail handling */
   1003             blo         colormatrix_int_end
   1004             br          x9                  /* otherwise enter the routine from fns[5] */
   1005 
   1006 colormatrix_int_end:
   1007             adds        x2, x2, #8          /* undo the bias of 8 to recover the leftover count */
   1008             bls         colormatrix_int_realend     /* taken when the sum is zero (or the add produced no carry) */
   1009             mov         x16, x8             /* remember the fns[4] address */
   1010             ldp         x8, x9, [x3], #16   /* x8 = fns[6], x9 = fns[7] */
   1011             cmp         x4, x16             /* any of x4-x7 that held the fns[4] address now gets fns[6] instead */
   1012             csel        x4, x8, x4, eq
   1013             cmp         x5, x16
   1014             csel        x5, x8, x5, eq
   1015             cmp         x6, x16
   1016             csel        x6, x8, x6, eq
   1017             cmp         x7, x16
   1018             csel        x7, x8, x7, eq
   1019             br          x9                  /* enter the routine from fns[7] */
   1020 
   1021 colormatrix_int_realend:
   1022             ld1         {v8.1d-v11.1d}, [sp], #32   /* restore d8-d15; the two post-increments release the 64-byte area */
   1023             ld1         {v12.1d-v15.1d}, [sp], #32
   1024             ret
   1025 END(rsdIntrinsicColorMatrix_int_K)
   1026 
   1027 /* void rsdIntrinsicColorMatrixSetup_int_K(
   1028  *          fntab_t const *fns, // x0
   1029  *          uint32_t mask,      // x1
   1030  *          int dt,             // x2
   1031  *          int st);            // x3
         *
         * Fills the eight-entry routine table at fns (the table is written,
         * despite the const in the prototype above):
         *   fns[0..3]  column routines chosen from successive 5-bit fields of mask
         *   fns[4]     store routine for dt        fns[5]  load routine for st
         *   fns[6]     tail store routine for dt   fns[7]  tail load routine for st
         * dt and st index the halfword-offset tables at 2: and 3: below; no range
         * check is performed on them.  x0 is back at its entry value on return.
   1032  */
   1033 ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
   1034             adr         x7, 2f
   1035             add         x4, x7, w2, sxtw #2 /* dt is a 32-bit int: AAPCS64 leaves the top half of x2 unspecified, so extend w2 explicitly */
   1036             ldrsh       x2, [x4], #2        /* offset of the store routine, relative to 2: */
   1037             ldrsh       x4, [x4]            /* offset of its _end (tail) variant */
   1038             add         x2, x2, x7
   1039             add         x4, x4, x7
   1040             adr         x7, 3f
   1041             add         x5, x7, w3, sxtw #2 /* st is a 32-bit int as well: extend w3 explicitly */
   1042             ldrsh       x3, [x5], #2        /* offset of the load routine, relative to 3: */
   1043             ldrsh       x5, [x5]            /* offset of its _end (tail) variant */
   1044             add         x3, x3, x7
   1045             add         x5, x5, x7
   1046             stp         x2, x3, [x0, #32]   /* fns[4], fns[5] */
   1047             stp         x4, x5, [x0, #48]   /* fns[6], fns[7] */
   1048 
   1049 /* For each column function, if the matrix is all zeroes then write NULL,
   1050  * otherwise look up the appropriate function and store that. */
   1051 
   1052             mov         x3, #4              /* four columns */
   1053             adr         x7, 4f
   1054 1:          ands        x2, x1, #15         /* low four bits of this column's field all clear? */
   1055             beq         9f                  /* yes: x2 is zero and is stored as the NULL entry */
   1056             and         x2, x1, #31         /* five-bit table index */
   1057             lsl         x2, x2, #3          /* each index owns four halfwords (one per column) */
   1058             ldrsh       x2, [x7, x2]
   1059             add         x2, x2, x7
   1060 9:          str         x2, [x0], #8
   1061             lsr         x1, x1, #5          /* next column's field */
   1062             add         x7, x7, #2          /* next column's halfword; the -2/-4/-6 in the table compensate */
   1063             subs        x3, x3, #1
   1064             bne         1b
   1065 
   1066 /* For every NULL entry, copy the non-NULL entry that follows it, or the store
   1067  * function. */
   1068 
   1069             ldr         x2, [x0]            /* x0 now points at fns[4], the store routine */
   1070             mov         x3, #4
   1071 1:          ldr         x1, [x0, #-8]!      /* walk back through fns[3]..fns[0] */
   1072             cmp         x1, #0
   1073             csel        x2, x1, x2, ne      /* keep a non-NULL entry, otherwise reuse its successor */
   1074             str         x2, [x0]
   1075             subs        x3, x3, #1
   1076             bne         1b
   1077             ret
   1078 
   1079             .align 4
   1080 2:          .hword      colormatrix_int_stu1-2b
   1081             .hword      colormatrix_int_stu1_end-2b
   1082             .hword      colormatrix_int_stu2-2b
   1083             .hword      colormatrix_int_stu2_end-2b
   1084             .hword      colormatrix_int_stu3-2b
   1085             .hword      colormatrix_int_stu3_end-2b
   1086             .hword      colormatrix_int_stu4-2b
   1087             .hword      colormatrix_int_stu4_end-2b
   1088 3:          .hword      colormatrix_int_ldu1-3b
   1089             .hword      colormatrix_int_ldu1_end-3b
   1090             .hword      colormatrix_int_ldu2-3b
   1091             .hword      colormatrix_int_ldu2_end-3b
   1092             .hword      colormatrix_int_ldu3-3b
   1093             .hword      colormatrix_int_ldu3_end-3b
   1094             .hword      colormatrix_int_ldu4-3b
   1095             .hword      colormatrix_int_ldu4_end-3b
   1096 4:
   1097 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1098             .hword      colormatrix_int_col0_\i-4b
   1099             .hword      colormatrix_int_col1_\i-4b-2
   1100             .hword      colormatrix_int_col2_\i-4b-4
   1101             .hword      colormatrix_int_col3_\i-4b-6
   1102 .endr
   1103 .irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
   1104             .hword      colormatrix_int_col0_n\i-4b
   1105             .hword      colormatrix_int_col1_n\i-4b-2
   1106             .hword      colormatrix_int_col2_n\i-4b-4
   1107             .hword      colormatrix_int_col3_n\i-4b-6
   1108 .endr
   1109 END(rsdIntrinsicColorMatrixSetup_int_K)
   1110 
   1111 
   1112 /* void rsdIntrinsicColorMatrix_float_K(
   1113  *          void *out,              // x0
   1114  *          void const *in,         // x1
   1115  *          size_t count,           // x2
   1116  *          fntab_t const *fns,     // x3
   1117  *          float const *mult,      // x4
   1118  *          float const *add);      // x5
         *
         * Entry point of the floating-point kernel.  Saves d8-d15, loads sixteen
         * float multipliers into v0-v3, broadcasts the four float addends into
         * v4-v7, reads six routine addresses fns[0..5] into x4-x9 and dispatches
         * through x9.  When fns was prepared by
         * rsdIntrinsicColorMatrixSetup_float_K, fns[0..3] are the column routines
         * and fns[4..7] are the store, load, tail-store and tail-load routines.
   1119  */
   1120 ENTRY(rsdIntrinsicColorMatrix_float_K)
   1121             sub         x7, sp, #32         /* x7 = upper half of the 64-byte save area reserved below */
   1122             sub         sp, sp, #64
   1123             st1         {v8.1d-v11.1d}, [sp]    /* save the low 64 bits of v8-v15 */
   1124             st1         {v12.1d-v15.1d}, [x7]
   1125 
   1126             ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64    /* 16 float multipliers */
   1127             ld1r        {v4.4s}, [x5], #4   /* each addend replicated across one vector */
   1128             ld1r        {v5.4s}, [x5], #4
   1129             ld1r        {v6.4s}, [x5], #4
   1130             ld1r        {v7.4s}, [x5], #4
   1131 
   1132             ldp         x4,x5, [x3], #16    /* x4-x7 = fns[0..3] */
   1133             ldp         x6,x7, [x3], #16
   1134             ldp         x8,x9, [x3], #16    /* x8 = fns[4], x9 = fns[5]; x3 now points at fns[6] */
   1135 
   1136             mov         v8.16b, v4.16b      /* seed v8-v11 with the broadcast addends */
   1137             mov         v9.16b, v5.16b
   1138             mov         v10.16b, v6.16b
   1139             mov         v11.16b, v7.16b
   1140 
   1141             mov         v16.16b, v4.16b     /* and v16-v19 likewise */
   1142             mov         v17.16b, v5.16b
   1143             mov         v18.16b, v6.16b
   1144             mov         v19.16b, v7.16b
   1145 
   1146             subs        x2, x2, #8          /* fewer than 8 elements: go straight to the tail handling */
   1147             blo         colormatrix_float_end
   1148             br          x9                  /* otherwise enter the routine from fns[5] */
   1149 
   1150 colormatrix_float_end:
   1151             adds        x2, x2, #8          /* undo the bias of 8 to recover the leftover count */
   1152             bls         colormatrix_float_realend   /* nothing left: use this function's own epilogue rather than the int kernel's */
   1153             mov         x16, x8             /* remember the fns[4] address */
   1154             ldp         x8,x9, [x3], #16    /* x8 = fns[6], x9 = fns[7] */
   1155             cmp         x4, x16             /* any of x4-x7 that held the fns[4] address now gets fns[6] instead */
   1156             csel        x4, x8, x4, eq
   1157             cmp         x5, x16
   1158             csel        x5, x8, x5, eq
   1159             cmp         x6, x16
   1160             csel        x6, x8, x6, eq
   1161             cmp         x7, x16
   1162             csel        x7, x8, x7, eq
   1163             br          x9                  /* enter the routine from fns[7] */
   1164 
   1165 colormatrix_float_realend:
   1166             ld1         {v8.1d-v11.1d}, [sp], #32   /* restore d8-d15; the two post-increments release the 64-byte area */
   1167             ld1         {v12.1d-v15.1d}, [sp], #32
   1168             ret
   1169 END(rsdIntrinsicColorMatrix_float_K)
   1170 
   1171 /* void rsdIntrinsicColorMatrixSetup_float_K(
   1172  *          fntab_t const *fns, // x0
   1173  *          uint32_t mask,      // x1
   1174  *          int dt,             // x2
   1175  *          int st);            // x3
         *
         * Fills the eight-entry routine table at fns (the table is written,
         * despite the const in the prototype above):
         *   fns[0..3]  column routines chosen from successive 5-bit fields of mask
         *   fns[4]     store routine for dt        fns[5]  load routine for st
         *   fns[6]     tail store routine for dt   fns[7]  tail load routine for st
         * dt and st index the halfword-offset tables at 2: and 3: below (eight
         * routine pairs each); no range check is performed on them.  x0 is back
         * at its entry value on return.
   1176  */
   1177 ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
   1178             adr         x7, 2f
   1179             add         x4, x7, w2, sxtw #2 /* dt is a 32-bit int: AAPCS64 leaves the top half of x2 unspecified, so extend w2 explicitly */
   1180             ldrsh       x2, [x4], #2        /* offset of the store routine, relative to 2: */
   1181             ldrsh       x4, [x4]            /* offset of its _end (tail) variant */
   1182             add         x2, x2, x7
   1183             add         x4, x4, x7
   1184             adr         x7, 3f
   1185             add         x5, x7, w3, sxtw #2 /* st is a 32-bit int as well: extend w3 explicitly */
   1186             ldrsh       x3, [x5], #2        /* offset of the load routine, relative to 3: */
   1187             ldrsh       x5, [x5]            /* offset of its _end (tail) variant */
   1188             add         x3, x3, x7
   1189             add         x5, x5, x7
   1190             stp         x2, x3, [x0, #32]   /* fns[4], fns[5] */
   1191             stp         x4, x5, [x0, #48]   /* fns[6], fns[7] */
   1192 
   1193 /* For each column function, if the matrix is all zeroes then write NULL,
   1194  * otherwise look up the appropriate function and store that. */
   1195 
   1196             mov         x3, #4              /* four columns */
   1197             adr         x7, 4f
   1198 1:          ands        x2, x1, #15         /* low four bits of this column's field all clear? */
   1199             beq         9f                  /* yes: x2 is zero and is stored as the NULL entry */
   1200             and         x2, x1, #31         /* five-bit table index */
   1201             lsl         x2, x2, #3          /* each index owns four halfwords (one per column) */
   1202             ldrsh       x2, [x7, x2]
   1203             add         x2, x2, x7
   1204 9:          str         x2, [x0], #8
   1205             lsr         x1, x1, #5          /* next column's field */
   1206             add         x7, x7, #2          /* next column's halfword; the -2/-4/-6 in the table compensate */
   1207             subs        x3, x3, #1
   1208             bne         1b
   1209 
   1210 /* For every NULL entry, copy the non-NULL entry that follows it, or the store
   1211  * function. */
   1212 
   1213             ldr         x2, [x0]            /* x0 now points at fns[4], the store routine */
   1214             mov         x3, #4
   1215 1:          ldr         x1, [x0, #-8]!      /* walk back through fns[3]..fns[0] */
   1216             cmp         x1, #0
   1217             csel        x2, x1, x2, ne      /* keep a non-NULL entry, otherwise reuse its successor */
   1218             str         x2, [x0]
   1219             subs        x3, x3, #1
   1220             bne         1b
   1221             ret
   1222 
   1223             .align 4
   1224 2:          .hword      colormatrix_float_stu1-2b
   1225             .hword      colormatrix_float_stu1_end-2b
   1226             .hword      colormatrix_float_stu2-2b
   1227             .hword      colormatrix_float_stu2_end-2b
   1228             .hword      colormatrix_float_stu3-2b
   1229             .hword      colormatrix_float_stu3_end-2b
   1230             .hword      colormatrix_float_stu4-2b
   1231             .hword      colormatrix_float_stu4_end-2b
   1232             .hword      colormatrix_float_stf1-2b
   1233             .hword      colormatrix_float_stf1_end-2b
   1234             .hword      colormatrix_float_stf2-2b
   1235             .hword      colormatrix_float_stf2_end-2b
   1236             .hword      colormatrix_float_stf3-2b
   1237             .hword      colormatrix_float_stf3_end-2b
   1238             .hword      colormatrix_float_stf4-2b
   1239             .hword      colormatrix_float_stf4_end-2b
   1240 3:          .hword      colormatrix_float_ldu1-3b
   1241             .hword      colormatrix_float_ldu1_end-3b
   1242             .hword      colormatrix_float_ldu2-3b
   1243             .hword      colormatrix_float_ldu2_end-3b
   1244             .hword      colormatrix_float_ldu3-3b
   1245             .hword      colormatrix_float_ldu3_end-3b
   1246             .hword      colormatrix_float_ldu4-3b
   1247             .hword      colormatrix_float_ldu4_end-3b
   1248             .hword      colormatrix_float_ldf1-3b
   1249             .hword      colormatrix_float_ldf1_end-3b
   1250             .hword      colormatrix_float_ldf2-3b
   1251             .hword      colormatrix_float_ldf2_end-3b
   1252             .hword      colormatrix_float_ldf3-3b
   1253             .hword      colormatrix_float_ldf3_end-3b
   1254             .hword      colormatrix_float_ldf4-3b
   1255             .hword      colormatrix_float_ldf4_end-3b
   1256 4:
   1257 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1258             .hword      colormatrix_float_col0_\i-4b
   1259             .hword      colormatrix_float_col1_\i-4b-2
   1260             .hword      colormatrix_float_col2_\i-4b-4
   1261             .hword      colormatrix_float_col3_\i-4b-6
   1262 .endr
   1263 .irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
   1264             .hword      colormatrix_float_col0_n\i-4b
   1265             .hword      colormatrix_float_col1_n\i-4b-2
   1266             .hword      colormatrix_float_col2_n\i-4b-4
   1267             .hword      colormatrix_float_col3_n\i-4b-6
   1268 .endr
   1269 END(rsdIntrinsicColorMatrixSetup_float_K)
   1270