Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
/*
        Convolve 3x3

        r0       = dst
        r1       = y0 base pointer
        r2       = y1 base pointer
        r3       = y2 base pointer
        [sp]     = coeffs
        [sp, #4] = length / 2  (loop count; each iteration writes 8 bytes)
*/
     25 
/*
 * ENTRY(f): switch to the .text section, emit the alignment directive,
 *           export f as a global symbol of type function, define the label f
 *           and open an unwind region with .fnstart.
 * END(f):   close the unwind region with .fnend and record the size of f as
 *           the distance from its label to the current location.
 *
 * Every ENTRY(f) must be paired with a matching END(f).
 */
#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;
     28 
     29 ENTRY(rsdIntrinsicConvolve3x3_K)
     30         push            {r4-r8, r10, r11, lr}
     31         vpush           {q4-q7}
     32 
     33         /* Get the coeffs pointer from the stack and load the
     34            coefficients in the q0, q1 NEON registers */
     35         ldr r4, [sp, #32+64]
     36         vld1.16 {q0, q1}, [r4]
     37 
     38         /* Get count from the stack */
     39         ldr r4, [sp, #36+64]
     40 
     41         /* Load the frequently used immediate in a register */
     42         mov r5, #8
     43 
     44 1:
     45         /* Load and post-increase the address by r5=#8 */
     46         vld1.8 {q13}, [r1], r5
     47         vld1.8 {q14}, [r2], r5
     48         vld1.8 {q15}, [r3], r5
     49 
     50         /* Signal memory for data that will be used in the loop after the next */
     51         pld         [r1, r5]
     52         pld         [r2, r5]
     53         pld         [r3, r5]
     54 
     55         vmovl.u8 q2, d26
     56         vmovl.u8 q3, d27
     57         vmovl.u8 q4, d28
     58         vmovl.u8 q5, d29
     59         vmovl.u8 q6, d30
     60         vmovl.u8 q7, d31
     61 
     62 /*
     63         The two pixel source array is
     64         d4,  d5,  d6,  d7
     65         d8,  d9,  d10, d11
     66         d12, d13, d14, d15
     67 */
     68 
     69         vmull.s16 q8, d4, d0[0]
     70         vmlal.s16 q8, d5, d0[1]
     71         vmlal.s16 q8, d6, d0[2]
     72         vmlal.s16 q8, d8, d0[3]
     73         vmlal.s16 q8, d9, d1[0]
     74         vmlal.s16 q8, d10, d1[1]
     75         vmlal.s16 q8, d12, d1[2]
     76         vmlal.s16 q8, d13, d1[3]
     77         vmlal.s16 q8, d14, d2[0]
     78 
     79         vmull.s16 q9, d5, d0[0]
     80         vmlal.s16 q9, d6, d0[1]
     81         vmlal.s16 q9, d7, d0[2]
     82         vmlal.s16 q9, d9, d0[3]
     83         vmlal.s16 q9, d10, d1[0]
     84         vmlal.s16 q9, d11, d1[1]
     85         vmlal.s16 q9, d13, d1[2]
     86         vmlal.s16 q9, d14, d1[3]
     87         vmlal.s16 q9, d15, d2[0]
     88 
     89         vshrn.i32 d16, q8, #8
     90         vshrn.i32 d17, q9, #8
     91 
     92         vqmovun.s16 d16, q8
     93         vst1.8 d16, [r0]!
     94 
     95         /* Are we done yet? */
     96         subs r4, r4, #1
     97         bne 1b
     98 
     99         /* We're done, bye! */
    100         vpop            {q4-q7}
    101         pop             {r4-r8, r10, r11, lr}
    102         bx              lr
    103 END(rsdIntrinsicConvolve3x3_K)
    104 
    105 
    106 /* Convolve 5x5 */
    107 
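/*
        r0        = dst
        r1        = y0 base pointer
        r2        = y1 base pointer
        r3        = y2 base pointer
        [sp]      = y3 base pointer  (loaded into r4)
        [sp, #4]  = y4 base pointer  (loaded into r5)
        [sp, #8]  = coeffs           (loaded into r6 while the coefficients are read)
        [sp, #12] = length           (loop count, loaded into r6; each iteration writes 8 bytes)

        r7 holds the constant 8 used as the per-iteration pointer step.
*/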
    118 ENTRY(rsdIntrinsicConvolve5x5_K)
    119         push        {r4-r7, lr}
    120         vpush       {q4-q7}
    121 
    122         /* load y3 in r4 */
    123         ldr     r4, [sp, #20 + 64]
    124 
    125         /* load y4 in r5 */
    126         ldr     r5, [sp, #24 + 64]
    127 
    128         /* Load the coefficients pointer */
    129         ldr     r6, [sp, #28 + 64]
    130 
    131         /* Create the coefficients vector */
    132         vld1.16     {d0, d1, d2, d3}, [r6]!
    133         vld1.16     {d4, d5, d6}, [r6]
    134 
    135         vmov.u32  q15, #0x7f
    136 
    137         /* load the count */
    138         ldr     r6, [sp, #32 + 64]
    139 
    140         /* Load the frequently used immediate in a register */
    141         mov     r7, #8
    142 
    143 1:
    144         /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
    145         vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
    146         vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )
    147 
    148         /* Signal memory for data that will be used in the loop after the next */
    149         pld         [r1, r7]
    150         pld         [r2, r7]
    151 
    152         /* Promoting the 8bit channels to 16bit */
    153         vmovl.u8 q9,  d24
    154         vmovl.u8 q10, d25
    155         vmovl.u8 q11, d26
    156         vmovl.u8 q12, d27
    157         vmovl.u8 q13, d28
    158         vmovl.u8 q14, d29
    159 
    160 /*
    161         d18,  d19,  d20, d21, d22, d23,
    162         d24,  d25
    163 */
    164         vmull.s16 q4, d18, d0[0]
    165         vmlal.s16 q4, d19, d0[1]
    166         vmlal.s16 q4, d20, d0[2]
    167         vmlal.s16 q4, d21, d0[3]
    168         vmlal.s16 q4, d22, d1[0]
    169 
    170         vmlal.s16 q4, d24, d1[1]
    171         vmlal.s16 q4, d25, d1[2]
    172         vmlal.s16 q4, d26, d1[3]
    173         vmlal.s16 q4, d27, d2[0]
    174         vmlal.s16 q4, d28, d2[1]
    175 
    176         vmull.s16 q5, d19, d0[0]
    177         vmlal.s16 q5, d20, d0[1]
    178         vmlal.s16 q5, d21, d0[2]
    179         vmlal.s16 q5, d22, d0[3]
    180         vmlal.s16 q5, d23, d1[0]
    181 
    182         vmlal.s16 q5, d25, d1[1]
    183         vmlal.s16 q5, d26, d1[2]
    184         vmlal.s16 q5, d27, d1[3]
    185         vmlal.s16 q5, d28, d2[0]
    186         vmlal.s16 q5, d29, d2[1]
    187 
    188 
    189         /* Next 2 rows */
    190         /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
    191         vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
    192         vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )
    193 
    194         /* Signal memory for data that will be used in the loop after the next */
    195         pld         [r3, r7]
    196         pld         [r4, r7]
    197 
    198         /* Promoting the 8bit channels to 16bit */
    199         vmovl.u8 q9,  d24
    200         vmovl.u8 q10, d25
    201         vmovl.u8 q11, d26
    202         vmovl.u8 q12, d27
    203         vmovl.u8 q13, d28
    204         vmovl.u8 q14, d29
    205 
    206 /*
    207         d18,  d19,  d20, d21, d22, d23,
    208         d24,  d25
    209 */
    210         vmlal.s16 q4, d18, d2[2]
    211         vmlal.s16 q4, d19, d2[3]
    212         vmlal.s16 q4, d20, d3[0]
    213         vmlal.s16 q4, d21, d3[1]
    214         vmlal.s16 q4, d22, d3[2]
    215 
    216         vmlal.s16 q4, d24, d3[3]
    217         vmlal.s16 q4, d25, d4[0]
    218         vmlal.s16 q4, d26, d4[1]
    219         vmlal.s16 q4, d27, d4[2]
    220         vmlal.s16 q4, d28, d4[3]
    221 
    222         vmlal.s16 q5, d19, d2[2]
    223         vmlal.s16 q5, d20, d2[3]
    224         vmlal.s16 q5, d21, d3[0]
    225         vmlal.s16 q5, d22, d3[1]
    226         vmlal.s16 q5, d23, d3[2]
    227 
    228         vmlal.s16 q5, d25, d3[3]
    229         vmlal.s16 q5, d26, d4[0]
    230         vmlal.s16 q5, d27, d4[1]
    231         vmlal.s16 q5, d28, d4[2]
    232         vmlal.s16 q5, d29, d4[3]
    233 
    234         /* Last row */
    235         /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
    236         vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )
    237 
    238         /* Signal memory for data that will be used in the loop after the next */
    239         pld         [r5, r7]
    240 
    241         /* Promoting the 8bit channels to 16bit */
    242         vmovl.u8 q9,  d24
    243         vmovl.u8 q10, d25
    244         vmovl.u8 q11, d26
    245 
    246 /*
    247         d18,  d19,  d20, d21, d22, d23,
    248         d24,  d25
    249 */
    250 
    251         vmlal.s16 q4, d18, d5[0]
    252         vmlal.s16 q4, d19, d5[1]
    253         vmlal.s16 q4, d20, d5[2]
    254         vmlal.s16 q4, d21, d5[3]
    255         vmlal.s16 q4, d22, d6[0]
    256 
    257         vmlal.s16 q5, d19, d5[0]
    258         vmlal.s16 q5, d20, d5[1]
    259         vmlal.s16 q5, d21, d5[2]
    260         vmlal.s16 q5, d22, d5[3]
    261         vmlal.s16 q5, d23, d6[0]
    262 
    263 
    264 
    265         vadd.i32 q4, q4, q15
    266         vadd.i32 q5, q5, q15
    267 
    268 /*      Narrow it to a d-reg 32 -> 16 bit */
    269         vrshrn.i32 d8, q4, #8
    270         vrshrn.i32 d9, q5, #8
    271 
    272 
    273 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
    274         vqmovun.s16 d8, q4
    275 
    276         vst1.8 d8, [r0]!           @ return the output and increase the address of r0
    277 
    278         /* Are we done? */
    279         subs r6, r6, #1
    280         bne 1b
    281 
    282         /* Yup, bye */
    283         vpop        {q4-q7}
    284         pop         {r4-r7, lr}
    285         bx          lr
    286 
    287 END(rsdIntrinsicConvolve5x5_K)
    288