Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012,2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18         x0 = dst
     19         x1 = y0 base pointer
     20         x2 = y1 base pointer
     21         x3 = y2 base pointer
     22         x4 = coeffs
     23         x5 = length / 2
     24 */
     25 
     26 #define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
     27 #define END(f) .size f, .-f;
     28 
     29 ENTRY(rsdIntrinsicConvolve3x3_K)
     30         sub             x6, sp, #64
     31         sub             sp, sp, #64
     32         st1             {v8.1d-v11.1d}, [x6], #32
     33         st1             {v12.1d-v15.1d}, [x6]
     34 
     35         /* Load the coefficients in the v0, v1 registers */
     36         ld1     {v0.8h, v1.8h}, [x4]
     37 
     38         /* Load the frequently used immediate in a register */
     39         mov x4, #8
     40 
     41 1:
     42         /* Load and post-increase the address by x4=#8 */
     43         ld1     {v13.16b}, [x1], x4
     44         ld1     {v14.16b}, [x2], x4
     45         ld1     {v15.16b}, [x3], x4
     46 
     47         /* Signal memory for data that will be used in the loop after the next */
     48 //        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
     49 //        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
     50 //        prfm        PLDL1KEEP,[x3, x4] // TODO: test this
     51 
     52         uxtl      v2.8h, v13.8b
     53         uxtl2     v3.8h, v13.16b
     54         uxtl      v4.8h, v14.8b
     55         uxtl2     v5.8h, v14.16b
     56         uxtl      v6.8h, v15.8b
     57         uxtl2     v7.8h, v15.16b
     58 
     59 /*
     60         The two pixel source array is
     61         v2,  v2hi,  v3lo,  v3hi
     62         v4,  v4hi,  v5lo, v5hi
     63         v6, v6hi, v7lo, v7hi
     64 */
     65 
     66         smull     v8.4s, v2.4h, v0.h[0]
     67         smull2    v9.4s, v2.8h, v0.h[0]
     68         smlal2    v8.4s, v2.8h, v0.h[1]
     69         smlal     v9.4s, v3.4h, v0.h[1]
     70         smlal     v8.4s, v3.4h, v0.h[2]
     71         smlal2    v9.4s, v3.8h, v0.h[2]
     72         smlal     v8.4s, v4.4h, v0.h[3]
     73         smlal2    v9.4s, v4.8h, v0.h[3]
     74         smlal2    v8.4s, v4.8h, v0.h[4]
     75         smlal     v9.4s, v5.4h, v0.h[4]
     76         smlal     v8.4s, v5.4h, v0.h[5]
     77         smlal2    v9.4s, v5.8h, v0.h[5]
     78         smlal     v8.4s, v6.4h, v0.h[6]
     79         smlal2    v9.4s, v6.8h, v0.h[6]
     80         smlal2    v8.4s, v6.8h, v0.h[7]
     81         smlal     v9.4s, v7.4h, v0.h[7]
     82         smlal     v8.4s, v7.4h, v1.h[0]
     83         smlal2    v9.4s, v7.8h, v1.h[0]
     84 
     85         shrn      v8.4h, v8.4s, #8
     86         shrn2     v8.8h, v9.4s, #8
     87 
     88         sqxtun      v8.8b, v8.8h
     89         st1         {v8.8b}, [x0], #8
     90 
     91         /* Are we done yet? */
     92         subs x5, x5, #1
     93         bne 1b
     94 
     95         /* We're done, bye! */
     96         ld1             {v8.1d-v11.1d}, [sp], #32
     97         ld1             {v12.1d-v15.1d}, [sp], #32
     98         ret
     99 END(rsdIntrinsicConvolve3x3_K)
    100 
    101 
    102 /* Convolve 5x5 */
    103 
    104 /*
    105         x0 = dst
    106         x1 = y0 base pointer
    107         x2 = y1 base pointer
    108         x3 = y2 base pointer
    109         x4 = y3 base pointer
    110         x5 = y4 base pointer
    111         x6 = coeffs
    112         x7 = length
    113 */
    114 ENTRY(rsdIntrinsicConvolve5x5_K)
    115         sub         x8, sp, #64
    116         sub         sp, sp, #64
    117         st1         {v8.1d-v11.1d}, [x8], #32
    118         st1         {v12.1d-v15.1d}, [x8]
    119 
    120         /* Create the coefficients vector  */
    121         ld1         {v0.8h-v2.8h}, [x6], #48
    122         ld1         {v3.4h}, [x6], #8
    123 
    124         movi      v15.4s, #0x7f
    125 
    126         /* Load the frequently used immediate in a register */
    127         mov     x6, #8
    128 
    129 1:
    130         /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
    131         ld1     {v9.8b-v11.8b}, [x1], x6      //  y0 ( y - 2 )
    132         ld1     {v12.8b-v14.8b}, [x2], x6      //  y0 ( y - 1 )
    133 
    134         /* Signal memory for data that will be used in the loop after the next */
    135 //        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
    136 //        prfm        PLDL1KEEP,[x2, x6] // TODO: test this
    137 
    138         /* Promoting the 8bit channels to 16bit */
    139         uxtl      v9.8h,  v9.8b
    140         uxtl      v10.8h, v10.8b
    141         uxtl      v11.8h, v11.8b
    142         uxtl      v12.8h, v12.8b
    143         uxtl      v13.8h, v13.8b
    144         uxtl      v14.8h, v14.8b
    145 
    146 /*
    147         v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
    148         v12,  v12hi
    149 */
    150         smull     v4.4s, v9.4h, v0.h[0]
    151         smull2    v5.4s, v9.8h, v0.h[0]
    152         smlal2    v4.4s, v9.8h, v0.h[1]
    153         smlal     v5.4s, v10.4h, v0.h[1]
    154         smlal     v4.4s, v10.4h, v0.h[2]
    155         smlal2    v5.4s, v10.8h, v0.h[2]
    156         smlal2    v4.4s, v10.8h, v0.h[3]
    157         smlal     v5.4s, v11.4h, v0.h[3]
    158         smlal     v4.4s, v11.4h, v0.h[4]
    159         smlal2    v5.4s, v11.8h, v0.h[4]
    160 
    161         smlal     v4.4s, v12.4h, v0.h[5]
    162         smlal2    v5.4s, v12.8h, v0.h[5]
    163         smlal2    v4.4s, v12.8h, v0.h[6]
    164         smlal     v5.4s, v13.4h, v0.h[6]
    165         smlal     v4.4s, v13.4h, v0.h[7]
    166         smlal2    v5.4s, v13.8h, v0.h[7]
    167         smlal2    v4.4s, v13.8h, v1.h[0]
    168         smlal     v5.4s, v14.4h, v1.h[0]
    169         smlal     v4.4s, v14.4h, v1.h[1]
    170         smlal2    v5.4s, v14.8h, v1.h[1]
    171 
    172         /* Next 2 rows */
    173         /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
    174         ld1     {v9.8b-v11.8b}, [x3], x6      //  y0 ( y )
    175         ld1     {v12.8b-v14.8b}, [x4], x6      //  y0 ( y + 1 )
    176 
    177         /* Signal memory for data that will be used in the loop after the next */
    178 //        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
    179 //        prfm        PLDL1KEEP,[x4, x6] // TODO: test this
    180 
    181         /* Promoting the 8bit channels to 16bit */
    182         uxtl      v9.8h,  v9.8b
    183         uxtl      v10.8h, v10.8b
    184         uxtl      v11.8h, v11.8b
    185         uxtl      v12.8h, v12.8b
    186         uxtl      v13.8h, v13.8b
    187         uxtl      v14.8h, v14.8b
    188 
    189 /*
    190         v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
    191         v12,  v12hi
    192 */
    193         smlal     v4.4s, v9.4h, v1.h[2]
    194         smlal2    v5.4s, v9.8h, v1.h[2]
    195         smlal2    v4.4s, v9.8h, v1.h[3]
    196         smlal     v5.4s, v10.4h, v1.h[3]
    197         smlal     v4.4s, v10.4h, v1.h[4]
    198         smlal2    v5.4s, v10.8h, v1.h[4]
    199         smlal2    v4.4s, v10.8h, v1.h[5]
    200         smlal     v5.4s, v11.4h, v1.h[5]
    201         smlal     v4.4s, v11.4h, v1.h[6]
    202         smlal2    v5.4s, v11.8h, v1.h[6]
    203 
    204         smlal     v4.4s, v12.4h, v1.h[7]
    205         smlal2    v5.4s, v12.8h, v1.h[7]
    206         smlal2    v4.4s, v12.8h, v2.h[0]
    207         smlal     v5.4s, v13.4h, v2.h[0]
    208         smlal     v4.4s, v13.4h, v2.h[1]
    209         smlal2    v5.4s, v13.8h, v2.h[1]
    210         smlal2    v4.4s, v13.8h, v2.h[2]
    211         smlal     v5.4s, v14.4h, v2.h[2]
    212         smlal     v4.4s, v14.4h, v2.h[3]
    213         smlal2    v5.4s, v14.8h, v2.h[3]
    214 
    215         /* Last row */
    216         /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
    217         ld1     {v9.8b- v11.8b}, [x5], x6      //  y0 ( y + 2 )
    218 
    219         /* Signal memory for data that will be used in the loop after the next */
    220 //        prfm        PLDL1KEEP,[x5, x6] // TODO: test this
    221 
    222         /* Promoting the 8bit channels to 16bit */
    223         uxtl      v9.8h,  v9.8b
    224         uxtl      v10.8h, v10.8b
    225         uxtl      v11.8h, v11.8b
    226 
    227 /*
    228         v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
    229         v12,  v12hi
    230 */
    231 
    232         smlal     v4.4s, v9.4h, v2.h[4]
    233         smlal2    v5.4s, v9.8h, v2.h[4]
    234         smlal2    v4.4s, v9.8h, v2.h[5]
    235         smlal     v5.4s, v10.4h, v2.h[5]
    236         smlal     v4.4s, v10.4h, v2.h[6]
    237         smlal2    v5.4s, v10.8h, v2.h[6]
    238         smlal2    v4.4s, v10.8h, v2.h[7]
    239         smlal     v5.4s, v11.4h, v2.h[7]
    240         smlal     v4.4s, v11.4h, v3.h[0]
    241         smlal2    v5.4s, v11.8h, v3.h[0]
    242 
    243         add      v4.4s, v4.4s, v15.4s
    244         add      v5.4s, v5.4s, v15.4s
    245 
    246 /*      Narrow it to a d-reg 32 -> 16 bit */
    247         rshrn      v4.4h, v4.4s, #8
    248         rshrn2     v4.8h, v5.4s, #8
    249 
    250 
    251 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
    252         sqxtun      v4.8b, v4.8h
    253 
    254         st1     {v4.8b}, [x0], #8        // return the output and increase the address of x0
    255 
    256         /* Are we done? */
    257         subs x7, x7, #1
    258         bne 1b
    259 
    260         /* Yup, bye */
    261         ld1         {v8.1d-v11.1d}, [sp], #32
    262         ld1         {v12.1d-v15.1d}, [sp], #32
    263         ret
    264 
    265 END(rsdIntrinsicConvolve5x5_K)
    266