/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* 3x3 convolution kernel.
   Arguments (AAPCS: first four in registers, the rest on the stack):
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        sp = coeffs
        sp = length / 2

   Each loop iteration consumes 8 bytes from each of the three source rows
   and produces 8 output bytes (two 32-bit pixels), so the count argument is
   the pixel length divided by 2.
*/

/* Function entry/exit helpers: place the symbol in .text, export it with an
   ELF function type, and open/close an EHABI unwind region (.fnstart/.fnend). */
#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

ENTRY(rsdIntrinsicConvolve3x3_K)
        /* Save callee-saved GPRs (8 regs = 32 bytes) and q4-q7 (64 bytes).
           The stack-argument offsets below are relative to these two pushes. */
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* Get the coeffs pointer from the stack and load the
           coefficients in the q0, q1 NEON registers.
           #32+64 skips the pushed GPRs and q-registers above.
           The 9 taps are consumed as scalar lanes d0[0]..d1[3], d2[0];
           they appear to be fixed-point with 8 fractional bits (see the
           #8 narrowing shift below) -- NOTE(review): confirm against the
           C-side coefficient setup. */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get count from the stack (second stack argument). */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register
           (used both as the post-increment step and the prefetch offset). */
        mov r5, #8

1:
        /* Load 16 bytes from each row and post-increase the address by r5=#8.
           Adjacent output windows overlap, so only 8 bytes are consumed. */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next. */
        pld [r1, r5]
        pld [r2, r5]
        pld [r3, r5]

        /* Promote the u8 samples to 16 bit: row y0 -> q2/q3,
           row y1 -> q4/q5, row y2 -> q6/q7. */
        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

        /*
        The two pixel source array is
            d4,  d5,  d6,  d7
            d8,  d9,  d10, d11
            d12, d13, d14, d15
        Each d-register holds one pixel (4 x 16-bit channels); the second
        output pixel uses the same rows shifted by one d-register.
        */

        /* Accumulate pixel 0 into q8: 3 taps per row, coefficients
           c0..c8 taken from scalar lanes d0[0]..d2[0]. */
        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        /* Accumulate pixel 1 into q9: same taps shifted right by one pixel. */
        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        /* Narrow the 32-bit accumulators to 16 bit, dropping the 8
           fractional bits (truncating shift; unlike the 5x5 kernel below,
           no rounding bias is applied here). */
        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        /* Saturate 16 -> u8 and store the two output pixels,
           post-incrementing the destination pointer. */
        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! Restore saved registers and return. */
        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/* Logical arguments (the first four arrive in registers; y3, y4, coeffs
   and length are fetched from the stack into r4-r6 below):
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        r4 = y3 base pointer
        r5 = y4 base pointer
        r6 = coeffs
        r7 = length
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* Save callee-saved GPRs (5 regs = 20 bytes) and q4-q7 (64 bytes);
           stack-argument offsets below account for both pushes. */
        push {r4-r7, lr}
        vpush {q4-q7}

        /* load y3 in r4 */
        ldr r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr r6, [sp, #28 + 64]

        /* Create the coefficients vector: 25 taps consumed as scalar lanes
           d0[0]..d6[0] (28 halfwords loaded, the last three unused). */
        vld1.16 {d0, d1, d2, d3}, [r6]!
        vld1.16 {d4, d5, d6}, [r6]

        /* Bias of 0x7f added to each 32-bit accumulator before the
           narrowing shift below. */
        vmov.u32 q15, #0x7f

        /* load the count */
        ldr r6, [sp, #32 + 64]

        /* Load the frequently used immediate in a register
           (post-increment step and prefetch offset). */
        mov r7, #8

1:
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8.
           24 bytes (6 pixels) are loaded per row, but only 8 bytes (2 pixels)
           are consumed: two adjacent 5-wide windows span 6 pixels. */
        vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 )
        vld1.8 {d27, d28, d29}, [r2], r7 @ y0 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next. */
        pld [r1, r7]
        pld [r2, r7]

        /* Promoting the 8bit channels to 16bit:
           row (y-2) -> d18..d23, row (y-1) -> d24..d29. */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

        /*
        d18, d19, d20, d21, d22, d23,
        d24, d25
        */

        /* q4 = pixel 0 accumulator: 5 taps of row (y-2), coefficients
           d0[0]..d1[0], then 5 taps of row (y-1), coefficients d1[1]..d2[1]. */
        vmull.s16 q4, d18, d0[0]
        vmlal.s16 q4, d19, d0[1]
        vmlal.s16 q4, d20, d0[2]
        vmlal.s16 q4, d21, d0[3]
        vmlal.s16 q4, d22, d1[0]

        vmlal.s16 q4, d24, d1[1]
        vmlal.s16 q4, d25, d1[2]
        vmlal.s16 q4, d26, d1[3]
        vmlal.s16 q4, d27, d2[0]
        vmlal.s16 q4, d28, d2[1]

        /* q5 = pixel 1 accumulator: same taps shifted right by one pixel. */
        vmull.s16 q5, d19, d0[0]
        vmlal.s16 q5, d20, d0[1]
        vmlal.s16 q5, d21, d0[2]
        vmlal.s16 q5, d22, d0[3]
        vmlal.s16 q5, d23, d1[0]

        vmlal.s16 q5, d25, d1[1]
        vmlal.s16 q5, d26, d1[2]
        vmlal.s16 q5, d27, d1[3]
        vmlal.s16 q5, d28, d2[0]
        vmlal.s16 q5, d29, d2[1]


        /* Next 2 rows */
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8.
           d24..d29 are reused; the (y-2)/(y-1) samples they held are no
           longer needed. */
        vld1.8 {d24, d25, d26}, [r3], r7 @ y0 ( y )
        vld1.8 {d27, d28, d29}, [r4], r7 @ y0 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next. */
        pld [r3, r7]
        pld [r4, r7]

        /* Promoting the 8bit channels to 16bit:
           row (y) -> d18..d23, row (y+1) -> d24..d29. */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

        /*
        d18, d19, d20, d21, d22, d23,
        d24, d25
        */

        /* Rows (y) and (y+1): coefficients d2[2]..d3[2] and d3[3]..d4[3]. */
        vmlal.s16 q4, d18, d2[2]
        vmlal.s16 q4, d19, d2[3]
        vmlal.s16 q4, d20, d3[0]
        vmlal.s16 q4, d21, d3[1]
        vmlal.s16 q4, d22, d3[2]

        vmlal.s16 q4, d24, d3[3]
        vmlal.s16 q4, d25, d4[0]
        vmlal.s16 q4, d26, d4[1]
        vmlal.s16 q4, d27, d4[2]
        vmlal.s16 q4, d28, d4[3]

        vmlal.s16 q5, d19, d2[2]
        vmlal.s16 q5, d20, d2[3]
        vmlal.s16 q5, d21, d3[0]
        vmlal.s16 q5, d22, d3[1]
        vmlal.s16 q5, d23, d3[2]

        vmlal.s16 q5, d25, d3[3]
        vmlal.s16 q5, d26, d4[0]
        vmlal.s16 q5, d27, d4[1]
        vmlal.s16 q5, d28, d4[2]
        vmlal.s16 q5, d29, d4[3]

        /* Last row */
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r5], r7 @ y0 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next. */
        pld [r5, r7]

        /* Promoting the 8bit channels to 16bit: row (y+2) -> d18..d23. */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26

        /*
        d18, d19, d20, d21, d22, d23,
        d24, d25
        */

        /* Row (y+2): the final 5 coefficients d5[0]..d6[0]. */
        vmlal.s16 q4, d18, d5[0]
        vmlal.s16 q4, d19, d5[1]
        vmlal.s16 q4, d20, d5[2]
        vmlal.s16 q4, d21, d5[3]
        vmlal.s16 q4, d22, d6[0]

        vmlal.s16 q5, d19, d5[0]
        vmlal.s16 q5, d20, d5[1]
        vmlal.s16 q5, d21, d5[2]
        vmlal.s16 q5, d22, d5[3]
        vmlal.s16 q5, d23, d6[0]



        /* Apply the 0x7f bias, then drop the 8 fractional bits with a
           rounding narrowing shift. NOTE(review): the bias plus vrshrn's
           own rounding increment effectively adds 0xFF before the shift --
           confirm this matches the reference (non-NEON) path. */
        vadd.i32 q4, q4, q15
        vadd.i32 q5, q5, q15

        /* Narrow it to a d-reg 32 -> 16 bit */
        vrshrn.i32 d8, q4, #8
        vrshrn.i32 d9, q5, #8


        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16 d8, q4

        vst1.8 d8, [r0]! @ return the output and increase the address of r0

        /* Are we done? */
        subs r6, r6, #1
        bne 1b

        /* Yup, bye. Restore saved registers and return. */
        vpop {q4-q7}
        pop {r4-r7, lr}
        bx lr

END(rsdIntrinsicConvolve5x5_K)