1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart 18 #define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart 19 #define END(f) .fnend; .size f, .-f; 20 21 #define ARCH_ARM_USE_BLUR_PRELOAD 22 23 .eabi_attribute 25,1 @Tag_ABI_align8_preserved 24 .arm 25 26 /* Number of fractional bits to preserve in intermediate results. The 27 * intermediate storage is 16-bit, and we started with 8 bit data (the integer 28 * part), so this should be between 0 and 8. 29 */ 30 .set FRACTION_BITS, 7 31 32 .set MAX_R, 25 33 34 35 /* A quick way of making a line of code conditional on some other condition. 36 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with 37 * `ifcc`: 38 */ 39 .macro ifcc zzz:vararg 40 .if cc 41 \zzz 42 .endif 43 .endm 44 45 /* It's not always clear that prefetching is beneficial and this needs further 46 * testing on different cores, so it's made switchable here. 47 */ 48 #if defined(ARCH_ARM_USE_BLUR_PRELOAD) 49 #define VERTPLD(...) pld [__VA_ARGS__] 50 #else 51 #define VERTPLD(...) nop 52 #endif 53 54 /* Fetch 16 columns of bytes (regardless of image format), convolve these 55 * vertically, and leave them in the register file. If working near the top or 56 * bottom of an image then clamp the addressing while loading the data in. 57 * 58 * The convolution is fully unrolled for windows up to max_r, with the 59 * outermost edges calculated first. This way it's possible to branch directly 60 * into the relevant part of the code for an arbitrary convolution radius. Two 61 * variants of the loop are produced; one eliminates the clamping code for a 62 * slight speed advantage. 63 * 64 * Where the macro is called with reg=x, the specified register is taken to 65 * contain a pre-calculated pointer into one of the two loops. 
 *
 * Input:
 *      r1 -- src
 *      r2 -- pitch
 *      r5 -- r
 *      r6 -- rup (r, unless clipped to top of source image)
 *      r7 -- rdn (r, unless clipped to bottom of source image)
 *      r12 -- switch index
 *      q0-q3 -- coefficient table
 * Output:
 *      r1 += 16
 *      q10,q11 -- 16 convolved columns
 * Modifies:
 *      r10 = upper row pointer
 *      r11 = lower row pointer
 *      q12-q15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
  .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            vld1.8      {d30,d31}, [r1]
            mls         r10, r2, r6, r1

            vmovl.u8    q14, d30
            VERTPLD(r1, #32)
            vmovl.u8    q15, d31
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    ldr         \reg, 2f
1:  ifcc    add         \reg, \reg, pc
  .endif

            vmull.u16   q12, d28, d0[0]
    ifcc    sub         \reg, r5, LSL #6
            vmull.u16   q13, d29, d0[0]
            mla         r11, r2, r7, r1
            vmull.u16   q14, d30, d0[0]
            add         r1, r1, #16
            vmull.u16   q15, d31, d0[0]
            bx          \reg

    ifcc    .align 2
2:  ifcc    .word       1f-1b-8

/* This version of the vertical fetch loop body is used away from the edges
 * of the source image.  The pointers start at the top and bottom source rows
 * and work their way towards the centre on each iteration.  This way the
 * number of taps used can be controlled by jumping directly into the middle
 * of the loop and running to completion.
 * If the loop body changes size then the code which calculates the address of
 * the initial iteration must be updated accordingly.
 */
.macro vertfetch_noclamp i, dreg
  .if 0 < \i && \i <= \max_r
            vld1.8      {d20,d21}, [r10], r2
            vld1.8      {d22,d23}, [r11]
            sub         r11, r11, r2
            vswp        d21, d22
            VERTPLD(r10, #32)
            vaddl.u8    q10, d20, d21
            vaddl.u8    q11, d22, d23
            vmlal.u16   q12, d20, \dreg
            VERTPLD(r11, #32)
            vmlal.u16   q13, d21, \dreg
            vmlal.u16   q14, d22, \dreg
            vmlal.u16   q15, d23, \dreg
  .endif
.endm

/* This version of the vertical fetch loop body is used near the edges of the
 * source image, where one or both of the accesses may start with a clamped
 * value, and the row addresses only begin to change after some number of
 * iterations before the end.
 * If the loop body changes size then the code which calculates the address of
 * the initial iteration must be updated accordingly.
 */
.macro vertfetch_clamped i, dreg
  .if 0 < \i && \i <= \max_r
            vld1.8      {d20,d21}, [r10]
            vld1.8      {d22,d23}, [r11]
            cmp         r6, #\i
            vswp        d21, d22
            VERTPLD(r10, #32)
            vaddl.u8    q10, d20, d21
            addhs       r10, r10, r2
            vaddl.u8    q11, d22, d23
            cmp         r7, #\i
            vmlal.u16   q12, d20, \dreg
            VERTPLD(r11, #32)
            vmlal.u16   q13, d21, \dreg
            subhs       r11, r11, r2
            vmlal.u16   q14, d22, \dreg
            nop
            vmlal.u16   q15, d23, \dreg
  .endif
.endm

/* Entry into this unrolled loop is computed as a negative index from
 * \labelc at the end of the block.
 */
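/* For reference, a minimal C sketch of what one unrolled tap chain below
 * computes for a single column, before the final rounding/narrowing step and
 * with the edge clamping of the "clamped" variant omitted.  The function and
 * parameter names are illustrative only; coeff[0] is the centre tap, matching
 * the layout of the q0-q3 table used above:
 *
 *      #include <stdint.h>
 *      static uint32_t column_sum(const uint8_t *col, int pitch, int r,
 *                                 const uint16_t *coeff)
 *      {
 *          uint32_t acc = col[0] * (uint32_t)coeff[0];
 *          for (int i = r; i >= 1; i--)                // the code below unrolls
 *              acc += (col[-i * pitch] + col[i * pitch])   // this loop, entering
 *                     * (uint32_t)coeff[i];                // it at i == r
 *          return acc;
 *      }
 */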
            .align 4
            vertfetch_clamped 27, d6[3]
            vertfetch_clamped 26, d6[2]
            vertfetch_clamped 25, d6[1]
            vertfetch_clamped 24, d6[0]
            vertfetch_clamped 23, d5[3]
            vertfetch_clamped 22, d5[2]
            vertfetch_clamped 21, d5[1]
            vertfetch_clamped 20, d5[0]
            vertfetch_clamped 19, d4[3]
            vertfetch_clamped 18, d4[2]
            vertfetch_clamped 17, d4[1]
            vertfetch_clamped 16, d4[0]
            vertfetch_clamped 15, d3[3]
            vertfetch_clamped 14, d3[2]
            vertfetch_clamped 13, d3[1]
            vertfetch_clamped 12, d3[0]
            vertfetch_clamped 11, d2[3]
            vertfetch_clamped 10, d2[2]
            vertfetch_clamped  9, d2[1]
            vertfetch_clamped  8, d2[0]
            vertfetch_clamped  7, d1[3]
            vertfetch_clamped  6, d1[2]
            vertfetch_clamped  5, d1[1]
            vertfetch_clamped  4, d1[0]
            vertfetch_clamped  3, d0[3]
            vertfetch_clamped  2, d0[2]
            vertfetch_clamped  1, d0[1]
            vertfetch_clamped  0, d0[0]
1:
            \labelc :   b 2f    /* done with clamped loop, skip over non-clamped loop */

/* Entry into this unrolled loop is computed as a negative index from
 * \labelnc at the end of the block.
 */
            .align 4
            vertfetch_noclamp 27, d6[3]
            vertfetch_noclamp 26, d6[2]
            vertfetch_noclamp 25, d6[1]
            vertfetch_noclamp 24, d6[0]
            vertfetch_noclamp 23, d5[3]
            vertfetch_noclamp 22, d5[2]
            vertfetch_noclamp 21, d5[1]
            vertfetch_noclamp 20, d5[0]
            vertfetch_noclamp 19, d4[3]
            vertfetch_noclamp 18, d4[2]
            vertfetch_noclamp 17, d4[1]
            vertfetch_noclamp 16, d4[0]
            vertfetch_noclamp 15, d3[3]
            vertfetch_noclamp 14, d3[2]
            vertfetch_noclamp 13, d3[1]
            vertfetch_noclamp 12, d3[0]
            vertfetch_noclamp 11, d2[3]
            vertfetch_noclamp 10, d2[2]
            vertfetch_noclamp  9, d2[1]
            vertfetch_noclamp  8, d2[0]
            vertfetch_noclamp  7, d1[3]
            vertfetch_noclamp  6, d1[2]
            vertfetch_noclamp  5, d1[1]
            vertfetch_noclamp  4, d1[0]
            vertfetch_noclamp  3, d0[3]
            vertfetch_noclamp  2, d0[2]
            vertfetch_noclamp  1, d0[1]
            vertfetch_noclamp  0, d0[0]
            \labelnc :

            .purgem vertfetch_clamped
            .purgem vertfetch_noclamp

2:          vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
            vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
            vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
            vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the window gets too big the buffer at [r9] is used.
 *
 * Input:
 *      q4-q11 -- convolution window
 *      r9 -- pointer to additional convolution window data
 * Output:
 *      r9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      r12 -- temp buffer pointer
 *      q12-q13 -- temporaries for load and vext operations.
267 * q14-q15 -- intermediate sums 268 */ 269 #define TUNED_LIST1 8, 16 270 .macro hconv1_8/*{{{*/ 271 vmull.u16 q14, d18, d0[0] 272 vmull.u16 q15, d19, d0[0] 273 274 ldr r12, [pc, r5, LSL #2] 275 add pc, pc, r12 276 bkpt 277 100: .word 101f-100b 278 .word 102f-100b 279 .word 103f-100b 280 .word 104f-100b 281 .word 105f-100b 282 .word 106f-100b 283 .word 107f-100b 284 .word 108f-100b 285 108: vmlal.u16 q14, d16, d2[0] 286 vmlal.u16 q15, d17, d2[0] 287 vmlal.u16 q14, d20, d2[0] 288 vmlal.u16 q15, d21, d2[0] 289 107: vext.u16 q12, q8, q9, #1 290 vext.u16 q13, q9, q10, #7 291 vmlal.u16 q14, d24, d1[3] 292 vmlal.u16 q15, d25, d1[3] 293 vmlal.u16 q14, d26, d1[3] 294 vmlal.u16 q15, d27, d1[3] 295 106: vext.u16 q12, q8, q9, #2 296 vext.u16 q13, q9, q10, #6 297 vmlal.u16 q14, d24, d1[2] 298 vmlal.u16 q15, d25, d1[2] 299 vmlal.u16 q14, d26, d1[2] 300 vmlal.u16 q15, d27, d1[2] 301 105: vext.u16 q12, q8, q9, #3 302 vext.u16 q13, q9, q10, #5 303 vmlal.u16 q14, d24, d1[1] 304 vmlal.u16 q15, d25, d1[1] 305 vmlal.u16 q14, d26, d1[1] 306 vmlal.u16 q15, d27, d1[1] 307 104: //vext.u16 q12, q8, q9, #4 308 //vext.u16 q13, q9, q10, #4 309 vmlal.u16 q14, d17, d1[0] 310 vmlal.u16 q15, d18, d1[0] 311 vmlal.u16 q14, d19, d1[0] 312 vmlal.u16 q15, d20, d1[0] 313 103: vext.u16 q12, q8, q9, #5 314 vext.u16 q13, q9, q10, #3 315 vmlal.u16 q14, d24, d0[3] 316 vmlal.u16 q15, d25, d0[3] 317 vmlal.u16 q14, d26, d0[3] 318 vmlal.u16 q15, d27, d0[3] 319 102: vext.u16 q12, q8, q9, #6 320 vext.u16 q13, q9, q10, #2 321 vmlal.u16 q14, d24, d0[2] 322 vmlal.u16 q15, d25, d0[2] 323 vmlal.u16 q14, d26, d0[2] 324 vmlal.u16 q15, d27, d0[2] 325 101: vext.u16 q12, q8, q9, #7 326 vext.u16 q13, q9, q10, #1 327 vmlal.u16 q14, d24, d0[1] 328 vmlal.u16 q15, d25, d0[1] 329 vmlal.u16 q14, d26, d0[1] 330 vmlal.u16 q15, d27, d0[1] 331 332 vqrshrn.u32 d28, q14, #16 333 vqrshrn.u32 d29, q15, #16 334 vqrshrn.u16 d31, q14, #FRACTION_BITS 335 336 vmov q8, q9 337 vmov q9, q10 338 vmov q10, q11 339 .endm/*}}}*/ 340 341 .macro hconv1_16/*{{{*/ 342 vmull.u16 q14, d16, d0[0] 343 vmull.u16 q15, d17, d0[0] 344 345 ldr r12, [pc, r5, LSL #2] 346 add pc, pc, r12 347 bkpt 348 100: .word 101f-100b 349 .word 102f-100b 350 .word 103f-100b 351 .word 104f-100b 352 .word 105f-100b 353 .word 106f-100b 354 .word 107f-100b 355 .word 108f-100b 356 .word 109f-100b 357 .word 110f-100b 358 .word 111f-100b 359 .word 112f-100b 360 .word 113f-100b 361 .word 114f-100b 362 .word 115f-100b 363 .word 116f-100b 364 116: //vext.u16 q12, q6, q7, #0 365 //vext.u16 q13, q10, q11, #0 366 vmlal.u16 q14, d12, d4[0] 367 vmlal.u16 q15, d13, d4[0] 368 vmlal.u16 q14, d20, d4[0] 369 vmlal.u16 q15, d21, d4[0] 370 115: vext.u16 q12, q6, q7, #1 371 vext.u16 q13, q9, q10, #7 372 vmlal.u16 q14, d24, d3[3] 373 vmlal.u16 q15, d25, d3[3] 374 vmlal.u16 q14, d26, d3[3] 375 vmlal.u16 q15, d27, d3[3] 376 114: vext.u16 q12, q6, q7, #2 377 vext.u16 q13, q9, q10, #6 378 vmlal.u16 q14, d24, d3[2] 379 vmlal.u16 q15, d25, d3[2] 380 vmlal.u16 q14, d26, d3[2] 381 vmlal.u16 q15, d27, d3[2] 382 113: vext.u16 q12, q6, q7, #3 383 vext.u16 q13, q9, q10, #5 384 vmlal.u16 q14, d24, d3[1] 385 vmlal.u16 q15, d25, d3[1] 386 vmlal.u16 q14, d26, d3[1] 387 vmlal.u16 q15, d27, d3[1] 388 112: //vext.u16 q12, q6, q7, #4 389 //vext.u16 q13, q9, q10, #4 390 vmlal.u16 q14, d13, d3[0] 391 vmlal.u16 q15, d14, d3[0] 392 vmlal.u16 q14, d19, d3[0] 393 vmlal.u16 q15, d20, d3[0] 394 111: vext.u16 q12, q6, q7, #5 395 vext.u16 q13, q9, q10, #3 396 vmlal.u16 q14, d24, d2[3] 397 vmlal.u16 q15, d25, d2[3] 398 vmlal.u16 q14, d26, d2[3] 399 
vmlal.u16 q15, d27, d2[3] 400 110: vext.u16 q12, q6, q7, #6 401 vext.u16 q13, q9, q10, #2 402 vmlal.u16 q14, d24, d2[2] 403 vmlal.u16 q15, d25, d2[2] 404 vmlal.u16 q14, d26, d2[2] 405 vmlal.u16 q15, d27, d2[2] 406 109: vext.u16 q12, q6, q7, #7 407 vext.u16 q13, q9, q10, #1 408 vmlal.u16 q14, d24, d2[1] 409 vmlal.u16 q15, d25, d2[1] 410 vmlal.u16 q14, d26, d2[1] 411 vmlal.u16 q15, d27, d2[1] 412 108: //vext.u16 q12, q7, q8, #0 413 //vext.u16 q13, q9, q10, #0 414 vmlal.u16 q14, d14, d2[0] 415 vmlal.u16 q15, d15, d2[0] 416 vmlal.u16 q14, d18, d2[0] 417 vmlal.u16 q15, d19, d2[0] 418 107: vext.u16 q12, q7, q8, #1 419 vext.u16 q13, q8, q9, #7 420 vmlal.u16 q14, d24, d1[3] 421 vmlal.u16 q15, d25, d1[3] 422 vmlal.u16 q14, d26, d1[3] 423 vmlal.u16 q15, d27, d1[3] 424 106: vext.u16 q12, q7, q8, #2 425 vext.u16 q13, q8, q9, #6 426 vmlal.u16 q14, d24, d1[2] 427 vmlal.u16 q15, d25, d1[2] 428 vmlal.u16 q14, d26, d1[2] 429 vmlal.u16 q15, d27, d1[2] 430 105: vext.u16 q12, q7, q8, #3 431 vext.u16 q13, q8, q9, #5 432 vmlal.u16 q14, d24, d1[1] 433 vmlal.u16 q15, d25, d1[1] 434 vmlal.u16 q14, d26, d1[1] 435 vmlal.u16 q15, d27, d1[1] 436 104: //vext.u16 q12, q7, q8, #4 437 //vext.u16 q13, q8, q9, #4 438 vmlal.u16 q14, d15, d1[0] 439 vmlal.u16 q15, d16, d1[0] 440 vmlal.u16 q14, d17, d1[0] 441 vmlal.u16 q15, d18, d1[0] 442 103: vext.u16 q12, q7, q8, #5 443 vext.u16 q13, q8, q9, #3 444 vmlal.u16 q14, d24, d0[3] 445 vmlal.u16 q15, d25, d0[3] 446 vmlal.u16 q14, d26, d0[3] 447 vmlal.u16 q15, d27, d0[3] 448 102: vext.u16 q12, q7, q8, #6 449 vext.u16 q13, q8, q9, #2 450 vmlal.u16 q14, d24, d0[2] 451 vmlal.u16 q15, d25, d0[2] 452 vmlal.u16 q14, d26, d0[2] 453 vmlal.u16 q15, d27, d0[2] 454 101: vext.u16 q12, q7, q8, #7 455 vext.u16 q13, q8, q9, #1 456 vmlal.u16 q14, d24, d0[1] 457 vmlal.u16 q15, d25, d0[1] 458 vmlal.u16 q14, d26, d0[1] 459 vmlal.u16 q15, d27, d0[1] 460 461 vqrshrn.u32 d28, q14, #16 462 vqrshrn.u32 d29, q15, #16 463 vqrshrn.u16 d31, q14, #FRACTION_BITS 464 465 vmov q6, q7 466 vmov q7, q8 467 vmov q8, q9 468 vmov q9, q10 469 vmov q10, q11 470 .endm/*}}}*/ 471 472 .macro hconv1_25/*{{{*/ 473 vext.u16 q12, q6, q7, #7 474 vmull.u16 q14, d24, d0[0] 475 vmull.u16 q15, d25, d0[0] 476 477 ldr r12, [pc, r5, LSL #2] 478 add pc, pc, r12 479 bkpt 480 100: .word 101f-100b 481 .word 102f-100b 482 .word 103f-100b 483 .word 104f-100b 484 .word 105f-100b 485 .word 106f-100b 486 .word 107f-100b 487 .word 108f-100b 488 .word 109f-100b 489 .word 110f-100b 490 .word 111f-100b 491 .word 112f-100b 492 .word 113f-100b 493 .word 114f-100b 494 .word 115f-100b 495 .word 116f-100b 496 .word 117f-100b 497 .word 118f-100b 498 .word 119f-100b 499 .word 120f-100b 500 .word 121f-100b 501 .word 122f-100b 502 .word 123f-100b 503 .word 124f-100b 504 .word 125f-100b 505 125: vext.u16 q12, q3, q4, #6 506 vext.u16 q13, q10, q11, #0 507 vmlal.u16 q14, d24, d6[1] 508 vmlal.u16 q15, d25, d6[1] 509 vmlal.u16 q14, d26, d6[1] 510 vmlal.u16 q15, d27, d6[1] 511 124: vext.u16 q12, q3, q4, #7 512 vext.u16 q13, q9, q10, #7 513 vmlal.u16 q14, d24, d6[0] 514 vmlal.u16 q15, d25, d6[0] 515 vmlal.u16 q14, d26, d6[0] 516 vmlal.u16 q15, d27, d6[0] 517 123: vext.u16 q12, q4, q5, #0 518 vext.u16 q13, q9, q10, #6 519 vmlal.u16 q14, d24, d5[3] 520 vmlal.u16 q15, d25, d5[3] 521 vmlal.u16 q14, d26, d5[3] 522 vmlal.u16 q15, d27, d5[3] 523 122: vext.u16 q12, q4, q5, #1 524 vext.u16 q13, q9, q10, #5 525 vmlal.u16 q14, d24, d5[2] 526 vmlal.u16 q15, d25, d5[2] 527 vmlal.u16 q14, d26, d5[2] 528 vmlal.u16 q15, d27, d5[2] 529 121: vext.u16 q12, q4, q5, #2 530 vext.u16 q13, 
q9, q10, #4 531 vmlal.u16 q14, d24, d5[1] 532 vmlal.u16 q15, d25, d5[1] 533 vmlal.u16 q14, d26, d5[1] 534 vmlal.u16 q15, d27, d5[1] 535 120: vext.u16 q12, q4, q5, #3 536 vext.u16 q13, q9, q10, #3 537 vmlal.u16 q14, d24, d5[0] 538 vmlal.u16 q15, d25, d5[0] 539 vmlal.u16 q14, d26, d5[0] 540 vmlal.u16 q15, d27, d5[0] 541 119: vext.u16 q12, q4, q5, #4 542 vext.u16 q13, q9, q10, #2 543 vmlal.u16 q14, d24, d4[3] 544 vmlal.u16 q15, d25, d4[3] 545 vmlal.u16 q14, d26, d4[3] 546 vmlal.u16 q15, d27, d4[3] 547 118: vext.u16 q12, q4, q5, #5 548 vext.u16 q13, q9, q10, #1 549 vmlal.u16 q14, d24, d4[2] 550 vmlal.u16 q15, d25, d4[2] 551 vmlal.u16 q14, d26, d4[2] 552 vmlal.u16 q15, d27, d4[2] 553 117: vext.u16 q12, q4, q5, #6 554 vext.u16 q13, q9, q10, #0 555 vmlal.u16 q14, d24, d4[1] 556 vmlal.u16 q15, d25, d4[1] 557 vmlal.u16 q14, d26, d4[1] 558 vmlal.u16 q15, d27, d4[1] 559 116: vext.u16 q12, q4, q5, #7 560 vext.u16 q13, q8, q9, #7 561 vmlal.u16 q14, d24, d4[0] 562 vmlal.u16 q15, d25, d4[0] 563 vmlal.u16 q14, d26, d4[0] 564 vmlal.u16 q15, d27, d4[0] 565 115: vext.u16 q12, q5, q6, #0 566 vext.u16 q13, q8, q9, #6 567 vmlal.u16 q14, d24, d3[3] 568 vmlal.u16 q15, d25, d3[3] 569 vmlal.u16 q14, d26, d3[3] 570 vmlal.u16 q15, d27, d3[3] 571 114: vext.u16 q12, q5, q6, #1 572 vext.u16 q13, q8, q9, #5 573 vmlal.u16 q14, d24, d3[2] 574 vmlal.u16 q15, d25, d3[2] 575 vmlal.u16 q14, d26, d3[2] 576 vmlal.u16 q15, d27, d3[2] 577 113: vext.u16 q12, q5, q6, #2 578 vext.u16 q13, q8, q9, #4 579 vmlal.u16 q14, d24, d3[1] 580 vmlal.u16 q15, d25, d3[1] 581 vmlal.u16 q14, d26, d3[1] 582 vmlal.u16 q15, d27, d3[1] 583 112: vext.u16 q12, q5, q6, #3 584 vext.u16 q13, q8, q9, #3 585 vmlal.u16 q14, d24, d3[0] 586 vmlal.u16 q15, d25, d3[0] 587 vmlal.u16 q14, d26, d3[0] 588 vmlal.u16 q15, d27, d3[0] 589 111: vext.u16 q12, q5, q6, #4 590 vext.u16 q13, q8, q9, #2 591 vmlal.u16 q14, d24, d2[3] 592 vmlal.u16 q15, d25, d2[3] 593 vmlal.u16 q14, d26, d2[3] 594 vmlal.u16 q15, d27, d2[3] 595 110: vext.u16 q12, q5, q6, #5 596 vext.u16 q13, q8, q9, #1 597 vmlal.u16 q14, d24, d2[2] 598 vmlal.u16 q15, d25, d2[2] 599 vmlal.u16 q14, d26, d2[2] 600 vmlal.u16 q15, d27, d2[2] 601 109: vext.u16 q12, q5, q6, #6 602 vext.u16 q13, q8, q9, #0 603 vmlal.u16 q14, d24, d2[1] 604 vmlal.u16 q15, d25, d2[1] 605 vmlal.u16 q14, d26, d2[1] 606 vmlal.u16 q15, d27, d2[1] 607 108: vext.u16 q12, q5, q6, #7 608 vext.u16 q13, q7, q8, #7 609 vmlal.u16 q14, d24, d2[0] 610 vmlal.u16 q15, d25, d2[0] 611 vmlal.u16 q14, d26, d2[0] 612 vmlal.u16 q15, d27, d2[0] 613 107: vext.u16 q12, q6, q7, #0 614 vext.u16 q13, q7, q8, #6 615 vmlal.u16 q14, d24, d1[3] 616 vmlal.u16 q15, d25, d1[3] 617 vmlal.u16 q14, d26, d1[3] 618 vmlal.u16 q15, d27, d1[3] 619 106: vext.u16 q12, q6, q7, #1 620 vext.u16 q13, q7, q8, #5 621 vmlal.u16 q14, d24, d1[2] 622 vmlal.u16 q15, d25, d1[2] 623 vmlal.u16 q14, d26, d1[2] 624 vmlal.u16 q15, d27, d1[2] 625 105: vext.u16 q12, q6, q7, #2 626 vext.u16 q13, q7, q8, #4 627 vmlal.u16 q14, d24, d1[1] 628 vmlal.u16 q15, d25, d1[1] 629 vmlal.u16 q14, d26, d1[1] 630 vmlal.u16 q15, d27, d1[1] 631 104: vext.u16 q12, q6, q7, #3 632 vext.u16 q13, q7, q8, #3 633 vmlal.u16 q14, d24, d1[0] 634 vmlal.u16 q15, d25, d1[0] 635 vmlal.u16 q14, d26, d1[0] 636 vmlal.u16 q15, d27, d1[0] 637 103: vext.u16 q12, q6, q7, #4 638 vext.u16 q13, q7, q8, #2 639 vmlal.u16 q14, d24, d0[3] 640 vmlal.u16 q15, d25, d0[3] 641 vmlal.u16 q14, d26, d0[3] 642 vmlal.u16 q15, d27, d0[3] 643 102: vext.u16 q12, q6, q7, #5 644 vext.u16 q13, q7, q8, #1 645 vmlal.u16 q14, d24, d0[2] 646 vmlal.u16 q15, d25, d0[2] 
647 vmlal.u16 q14, d26, d0[2] 648 vmlal.u16 q15, d27, d0[2] 649 101: vext.u16 q12, q6, q7, #6 650 vext.u16 q13, q7, q8, #0 651 vmlal.u16 q14, d24, d0[1] 652 vmlal.u16 q15, d25, d0[1] 653 vmlal.u16 q14, d26, d0[1] 654 vmlal.u16 q15, d27, d0[1] 655 656 vqrshrn.u32 d28, q14, #16 657 vqrshrn.u32 d29, q15, #16 658 vqrshrn.u16 d31, q14, #FRACTION_BITS 659 660 vmov d7, d9 661 vmov q4, q5 662 vmov q5, q6 663 vmov q6, q7 664 vmov q7, q8 665 vmov q8, q9 666 vmov q9, q10 667 vmov q10, q11 668 .endm/*}}}*/ 669 670 #define TUNED_LIST4 6, 12 671 .macro hconv4_6/*{{{*/ 672 vmull.u16 q14, d14, d0[0] 673 vmull.u16 q15, d15, d0[0] 674 675 ldr r12, [pc, r5, LSL #2] 676 add pc, pc, r12 677 bkpt 678 100: .word 101f-100b 679 .word 102f-100b 680 .word 103f-100b 681 .word 104f-100b 682 .word 105f-100b 683 .word 106f-100b 684 106: vmlal.u16 q14, d8, d1[2] 685 vmlal.u16 q15, d9, d1[2] 686 vmlal.u16 q14, d20, d1[2] 687 vmlal.u16 q15, d21, d1[2] 688 105: vmlal.u16 q14, d9, d1[1] 689 vmlal.u16 q15, d10, d1[1] 690 vmlal.u16 q14, d19, d1[1] 691 vmlal.u16 q15, d20, d1[1] 692 104: vmlal.u16 q14, d10, d1[0] 693 vmlal.u16 q15, d11, d1[0] 694 vmlal.u16 q14, d18, d1[0] 695 vmlal.u16 q15, d19, d1[0] 696 103: vmlal.u16 q14, d11, d0[3] 697 vmlal.u16 q15, d12, d0[3] 698 vmlal.u16 q14, d17, d0[3] 699 vmlal.u16 q15, d18, d0[3] 700 102: vmlal.u16 q14, d12, d0[2] 701 vmlal.u16 q15, d13, d0[2] 702 vmlal.u16 q14, d16, d0[2] 703 vmlal.u16 q15, d17, d0[2] 704 101: vmlal.u16 q14, d13, d0[1] 705 vmlal.u16 q15, d14, d0[1] 706 vmlal.u16 q14, d15, d0[1] 707 vmlal.u16 q15, d16, d0[1] 708 709 vqrshrn.u32 d28, q14, #16 710 vqrshrn.u32 d29, q15, #16 711 vqrshrn.u16 d31, q14, #FRACTION_BITS 712 713 vmov q4, q5 714 vmov q5, q6 715 vmov q6, q7 716 vmov q7, q8 717 vmov q8, q9 718 vmov q9, q10 719 vmov q10, q11 720 .endm/*}}}*/ 721 722 .macro hconv4_12/*{{{*/ 723 vmull.u16 q14, d8, d0[0] 724 vmull.u16 q15, d9, d0[0] 725 726 ldr r12, [pc, r5, LSL #2] 727 add pc, pc, r12 728 bkpt 729 100: .word 101f-100b 730 .word 102f-100b 731 .word 103f-100b 732 .word 104f-100b 733 .word 105f-100b 734 .word 106f-100b 735 .word 107f-100b 736 .word 108f-100b 737 .word 109f-100b 738 .word 110f-100b 739 .word 111f-100b 740 .word 112f-100b 741 112: add r12, r9, #0x1a0 742 bic r12, r12, #0x200 743 vld1.u16 {d24,d25}, [r12:128] 744 vmlal.u16 q14, d24, d3[0] 745 vmlal.u16 q15, d25, d3[0] 746 vmlal.u16 q14, d20, d3[0] 747 vmlal.u16 q15, d21, d3[0] 748 111: add r12, r9, #0x1a8 749 bic r12, r12, #0x200 750 vld1.u16 {d24}, [r12:64]! 751 bic r12, r12, #0x200 752 vld1.u16 {d25}, [r12:64] 753 vmlal.u16 q14, d24, d2[3] 754 vmlal.u16 q15, d25, d2[3] 755 vmlal.u16 q14, d19, d2[3] 756 vmlal.u16 q15, d20, d2[3] 757 110: add r12, r9, #0x1b0 758 bic r12, r12, #0x200 759 vld1.u16 {d24,d25}, [r12:128] 760 vmlal.u16 q14, d24, d2[2] 761 vmlal.u16 q15, d25, d2[2] 762 vmlal.u16 q14, d18, d2[2] 763 vmlal.u16 q15, d19, d2[2] 764 109: add r12, r9, #0x1b8 765 bic r12, r12, #0x200 766 vld1.u16 {d24}, [r12:64]! 767 bic r12, r12, #0x200 768 vld1.u16 {d25}, [r12:64] 769 vmlal.u16 q14, d24, d2[1] 770 vmlal.u16 q15, d25, d2[1] 771 vmlal.u16 q14, d17, d2[1] 772 vmlal.u16 q15, d18, d2[1] 773 108: add r12, r9, #0x1c0 774 bic r12, r12, #0x200 775 vld1.u16 {d24,d25}, [r12:128] 776 vmlal.u16 q14, d24, d2[0] 777 vmlal.u16 q15, d25, d2[0] 778 vmlal.u16 q14, d16, d2[0] 779 vmlal.u16 q15, d17, d2[0] 780 107: add r12, r9, #0x1c8 781 bic r12, r12, #0x200 782 vld1.u16 {d24}, [r12:64]! 
783 bic r12, r12, #0x200 784 vld1.u16 {d25}, [r12:64] 785 vmlal.u16 q14, d24, d1[3] 786 vmlal.u16 q15, d25, d1[3] 787 vmlal.u16 q14, d15, d1[3] 788 vmlal.u16 q15, d16, d1[3] 789 106: add r12, r9, #0x1d0 790 bic r12, r12, #0x200 791 vld1.u16 {d24,d25}, [r12:128] 792 vmlal.u16 q14, d24, d1[2] 793 vmlal.u16 q15, d25, d1[2] 794 vmlal.u16 q14, d14, d1[2] 795 vmlal.u16 q15, d15, d1[2] 796 105: add r12, r9, #0x1d8 797 bic r12, r12, #0x200 798 vld1.u16 {d24}, [r12:64]! 799 bic r12, r12, #0x200 800 vld1.u16 {d25}, [r12:64] 801 vmlal.u16 q14, d24, d1[1] 802 vmlal.u16 q15, d25, d1[1] 803 vmlal.u16 q14, d13, d1[1] 804 vmlal.u16 q15, d14, d1[1] 805 104: add r12, r9, #0x1e0 806 bic r12, r12, #0x200 807 vld1.u16 {d24,d25}, [r12:128] 808 vmlal.u16 q14, d24, d1[0] 809 vmlal.u16 q15, d25, d1[0] 810 vmlal.u16 q14, d12, d1[0] 811 vmlal.u16 q15, d13, d1[0] 812 103: add r12, r9, #0x1e8 813 bic r12, r12, #0x200 814 vld1.u16 {d24}, [r12:64]! 815 bic r12, r12, #0x200 816 vld1.u16 {d25}, [r12:64] 817 vmlal.u16 q14, d24, d0[3] 818 vmlal.u16 q15, d25, d0[3] 819 vmlal.u16 q14, d11, d0[3] 820 vmlal.u16 q15, d12, d0[3] 821 102: add r12, r9, #0x1f0 822 bic r12, r12, #0x200 823 vld1.u16 {d24,d25}, [r12:128] 824 vmlal.u16 q14, d24, d0[2] 825 vmlal.u16 q15, d25, d0[2] 826 vmlal.u16 q14, d10, d0[2] 827 vmlal.u16 q15, d11, d0[2] 828 101: add r12, r9, #0x1f8 829 bic r12, r12, #0x200 830 vld1.u16 {d24}, [r12:64] 831 vmlal.u16 q14, d24, d0[1] 832 vmlal.u16 q15, d8, d0[1] 833 vmlal.u16 q14, d9, d0[1] 834 vmlal.u16 q15, d10, d0[1] 835 836 vqrshrn.u32 d28, q14, #16 837 vqrshrn.u32 d29, q15, #16 838 vqrshrn.u16 d31, q14, #FRACTION_BITS 839 840 vst1.u8 {q4}, [r9:128]! 841 bic r9, r9, #0x200 842 vmov q4, q5 843 vmov q5, q6 844 vmov q6, q7 845 vmov q7, q8 846 vmov q8, q9 847 vmov q9, q10 848 vmov q10, q11 849 .endm/*}}}*/ 850 851 .macro hconv4_25/*{{{*/ 852 add r12, r9, #0x198 853 bic r12, r12, #0x200 854 vld1.u16 {d24}, [r12:64]! 855 bic r12, r12, #0x200 856 vld1.u16 {d25}, [r12:64] 857 vmull.u16 q14, d24, d0[0] 858 vmull.u16 q15, d25, d0[0] 859 860 ldr r12, [pc, r5, LSL #2] 861 add pc, pc, r12 862 bkpt 863 100: .word 101f-100b 864 .word 102f-100b 865 .word 103f-100b 866 .word 104f-100b 867 .word 105f-100b 868 .word 106f-100b 869 .word 107f-100b 870 .word 108f-100b 871 .word 109f-100b 872 .word 110f-100b 873 .word 111f-100b 874 .word 112f-100b 875 .word 113f-100b 876 .word 114f-100b 877 .word 115f-100b 878 .word 116f-100b 879 .word 117f-100b 880 .word 118f-100b 881 .word 119f-100b 882 .word 120f-100b 883 .word 121f-100b 884 .word 122f-100b 885 .word 123f-100b 886 .word 124f-100b 887 .word 125f-100b 888 125: add r12, r9, #0x0d0 889 bic r12, r12, #0x200 890 vld1.u16 {d24,d25}, [r12:128] 891 vmlal.u16 q14, d24, d6[1] 892 vmlal.u16 q15, d25, d6[1] 893 vmlal.u16 q14, d20, d6[1] 894 vmlal.u16 q15, d21, d6[1] 895 124: add r12, r9, #0x0d8 896 bic r12, r12, #0x200 897 vld1.u16 {d24}, [r12:64]! 898 bic r12, r12, #0x200 899 vld1.u16 {d25}, [r12] 900 vmlal.u16 q14, d24, d6[0] 901 vmlal.u16 q15, d25, d6[0] 902 vmlal.u16 q14, d19, d6[0] 903 vmlal.u16 q15, d20, d6[0] 904 123: add r12, r9, #0x0e0 905 bic r12, r12, #0x200 906 vld1.u16 {d24,d25}, [r12:128] 907 vmlal.u16 q14, d24, d5[3] 908 vmlal.u16 q15, d25, d5[3] 909 vmlal.u16 q14, d18, d5[3] 910 vmlal.u16 q15, d19, d5[3] 911 122: add r12, r9, #0x0e8 912 bic r12, r12, #0x200 913 vld1.u16 {d24}, [r12:64]! 
914 bic r12, r12, #0x200 915 vld1.u16 {d25}, [r12] 916 vmlal.u16 q14, d24, d5[2] 917 vmlal.u16 q15, d25, d5[2] 918 vmlal.u16 q14, d17, d5[2] 919 vmlal.u16 q15, d18, d5[2] 920 121: add r12, r9, #0x0f0 921 bic r12, r12, #0x200 922 vld1.u16 {d24,d25}, [r12:128] 923 vmlal.u16 q14, d24, d5[1] 924 vmlal.u16 q15, d25, d5[1] 925 vmlal.u16 q14, d16, d5[1] 926 vmlal.u16 q15, d17, d5[1] 927 120: add r12, r9, #0x0f8 928 bic r12, r12, #0x200 929 vld1.u16 {d24}, [r12:64]! 930 bic r12, r12, #0x200 931 vld1.u16 {d25}, [r12] 932 vmlal.u16 q14, d24, d5[0] 933 vmlal.u16 q15, d25, d5[0] 934 vmlal.u16 q14, d15, d5[0] 935 vmlal.u16 q15, d16, d5[0] 936 119: add r12, r9, #0x100 937 bic r12, r12, #0x200 938 vld1.u16 {d24,d25}, [r12:128] 939 vmlal.u16 q14, d24, d4[3] 940 vmlal.u16 q15, d25, d4[3] 941 vmlal.u16 q14, d14, d4[3] 942 vmlal.u16 q15, d15, d4[3] 943 118: add r12, r9, #0x108 944 bic r12, r12, #0x200 945 vld1.u16 {d24}, [r12:64]! 946 bic r12, r12, #0x200 947 vld1.u16 {d25}, [r12] 948 vmlal.u16 q14, d24, d4[2] 949 vmlal.u16 q15, d25, d4[2] 950 vmlal.u16 q14, d13, d4[2] 951 vmlal.u16 q15, d14, d4[2] 952 117: add r12, r9, #0x110 953 bic r12, r12, #0x200 954 vld1.u16 {d24,d25}, [r12:128] 955 vmlal.u16 q14, d24, d4[1] 956 vmlal.u16 q15, d25, d4[1] 957 vmlal.u16 q14, d12, d4[1] 958 vmlal.u16 q15, d13, d4[1] 959 116: add r12, r9, #0x118 960 bic r12, r12, #0x200 961 vld1.u16 {d24}, [r12:64]! 962 bic r12, r12, #0x200 963 vld1.u16 {d25}, [r12] 964 vmlal.u16 q14, d24, d4[0] 965 vmlal.u16 q15, d25, d4[0] 966 vmlal.u16 q14, d11, d4[0] 967 vmlal.u16 q15, d12, d4[0] 968 115: add r12, r9, #0x120 969 bic r12, r12, #0x200 970 vld1.u16 {d24,d25}, [r12:128] 971 vmlal.u16 q14, d24, d3[3] 972 vmlal.u16 q15, d25, d3[3] 973 vmlal.u16 q14, d10, d3[3] 974 vmlal.u16 q15, d11, d3[3] 975 114: add r12, r9, #0x128 976 bic r12, r12, #0x200 977 vld1.u16 {d24}, [r12:64]! 978 bic r12, r12, #0x200 979 vld1.u16 {d25}, [r12] 980 vmlal.u16 q14, d24, d3[2] 981 vmlal.u16 q15, d25, d3[2] 982 vmlal.u16 q14, d9, d3[2] 983 vmlal.u16 q15, d10, d3[2] 984 113: add r12, r9, #0x130 985 bic r12, r12, #0x200 986 vld1.u16 {d24,d25}, [r12:128] 987 vmlal.u16 q14, d24, d3[1] 988 vmlal.u16 q15, d25, d3[1] 989 vmlal.u16 q14, d8, d3[1] 990 vmlal.u16 q15, d9, d3[1] 991 112: add r12, r9, #0x138 992 bic r12, r12, #0x200 993 vld1.u16 {d24}, [r12:64]! 994 bic r12, r12, #0x200 995 vld1.u16 {d25}, [r12] 996 add r12, r9, #0x1f8 997 bic r12, r12, #0x200 998 vld1.u16 {d26}, [r12:64] 999 vmlal.u16 q14, d24, d3[0] 1000 vmlal.u16 q15, d25, d3[0] 1001 vmlal.u16 q14, d26, d3[0] @ Could be d7, without the load, right? 1002 vmlal.u16 q15, d8, d3[0] 1003 111: add r12, r9, #0x140 1004 bic r12, r12, #0x200 1005 vld1.u16 {d24,d25}, [r12:128] 1006 add r12, r9, #0x1f0 1007 bic r12, r12, #0x200 1008 vld1.u16 {d26,d27}, [r12:128] 1009 vmlal.u16 q14, d24, d2[3] 1010 vmlal.u16 q15, d25, d2[3] 1011 vmlal.u16 q14, d26, d2[3] 1012 vmlal.u16 q15, d27, d2[3] 1013 110: add r12, r9, #0x148 1014 bic r12, r12, #0x200 1015 vld1.u16 {d24}, [r12:64]! 1016 bic r12, r12, #0x200 1017 vld1.u16 {d25}, [r12] 1018 add r12, r9, #0x1e8 1019 bic r12, r12, #0x200 1020 vld1.u16 {d26}, [r12:64]! 
1021 bic r12, r12, #0x200 1022 vld1.u16 {d27}, [r12:64] 1023 vmlal.u16 q14, d24, d2[2] 1024 vmlal.u16 q15, d25, d2[2] 1025 vmlal.u16 q14, d26, d2[2] 1026 vmlal.u16 q15, d27, d2[2] 1027 109: add r12, r9, #0x150 1028 bic r12, r12, #0x200 1029 vld1.u16 {d24,d25}, [r12:128] 1030 add r12, r9, #0x1e0 1031 bic r12, r12, #0x200 1032 vld1.u16 {d26,d27}, [r12:128] 1033 vmlal.u16 q14, d24, d2[1] 1034 vmlal.u16 q15, d25, d2[1] 1035 vmlal.u16 q14, d26, d2[1] 1036 vmlal.u16 q15, d27, d2[1] 1037 108: add r12, r9, #0x158 1038 bic r12, r12, #0x200 1039 vld1.u16 {d24}, [r12:64]! 1040 bic r12, r12, #0x200 1041 vld1.u16 {d25}, [r12] 1042 add r12, r9, #0x1d8 1043 bic r12, r12, #0x200 1044 vld1.u16 {d26}, [r12:64]! 1045 bic r12, r12, #0x200 1046 vld1.u16 {d27}, [r12:64] 1047 vmlal.u16 q14, d24, d2[0] 1048 vmlal.u16 q15, d25, d2[0] 1049 vmlal.u16 q14, d26, d2[0] 1050 vmlal.u16 q15, d27, d2[0] 1051 107: add r12, r9, #0x160 1052 bic r12, r12, #0x200 1053 vld1.u16 {d24,d25}, [r12:128] 1054 add r12, r9, #0x1d0 1055 bic r12, r12, #0x200 1056 vld1.u16 {d26,d27}, [r12:128] 1057 vmlal.u16 q14, d24, d1[3] 1058 vmlal.u16 q15, d25, d1[3] 1059 vmlal.u16 q14, d26, d1[3] 1060 vmlal.u16 q15, d27, d1[3] 1061 106: add r12, r9, #0x168 1062 bic r12, r12, #0x200 1063 vld1.u16 {d24}, [r12:64]! 1064 bic r12, r12, #0x200 1065 vld1.u16 {d25}, [r12] 1066 add r12, r9, #0x1c8 1067 bic r12, r12, #0x200 1068 vld1.u16 {d26}, [r12:64]! 1069 bic r12, r12, #0x200 1070 vld1.u16 {d27}, [r12:64] 1071 vmlal.u16 q14, d24, d1[2] 1072 vmlal.u16 q15, d25, d1[2] 1073 vmlal.u16 q14, d26, d1[2] 1074 vmlal.u16 q15, d27, d1[2] 1075 105: add r12, r9, #0x170 1076 bic r12, r12, #0x200 1077 vld1.u16 {d24,d25}, [r12:128] 1078 add r12, r9, #0x1c0 1079 bic r12, r12, #0x200 1080 vld1.u16 {d26,d27}, [r12:128] 1081 vmlal.u16 q14, d24, d1[1] 1082 vmlal.u16 q15, d25, d1[1] 1083 vmlal.u16 q14, d26, d1[1] 1084 vmlal.u16 q15, d27, d1[1] 1085 104: add r12, r9, #0x178 1086 bic r12, r12, #0x200 1087 vld1.u16 {d24}, [r12:64]! 1088 bic r12, r12, #0x200 1089 vld1.u16 {d25}, [r12] 1090 add r12, r9, #0x1b8 1091 bic r12, r12, #0x200 1092 vld1.u16 {d26}, [r12:64]! 1093 bic r12, r12, #0x200 1094 vld1.u16 {d27}, [r12:64] 1095 vmlal.u16 q14, d24, d1[0] 1096 vmlal.u16 q15, d25, d1[0] 1097 vmlal.u16 q14, d26, d1[0] 1098 vmlal.u16 q15, d27, d1[0] 1099 103: add r12, r9, #0x180 1100 bic r12, r12, #0x200 1101 vld1.u16 {d24,d25}, [r12:128] 1102 add r12, r9, #0x1b0 1103 bic r12, r12, #0x200 1104 vld1.u16 {d26,d27}, [r12:128] 1105 vmlal.u16 q14, d24, d0[3] 1106 vmlal.u16 q15, d25, d0[3] 1107 vmlal.u16 q14, d26, d0[3] 1108 vmlal.u16 q15, d27, d0[3] 1109 102: add r12, r9, #0x188 1110 bic r12, r12, #0x200 1111 vld1.u16 {d24}, [r12:64]! 1112 bic r12, r12, #0x200 1113 vld1.u16 {d25}, [r12] 1114 add r12, r9, #0x1a8 1115 bic r12, r12, #0x200 1116 vld1.u16 {d26}, [r12:64]! 1117 bic r12, r12, #0x200 1118 vld1.u16 {d27}, [r12:64] 1119 vmlal.u16 q14, d24, d0[2] 1120 vmlal.u16 q15, d25, d0[2] 1121 vmlal.u16 q14, d26, d0[2] 1122 vmlal.u16 q15, d27, d0[2] 1123 101: add r12, r9, #0x190 1124 bic r12, r12, #0x200 1125 vld1.u16 {d24,d25}, [r12:128]! 1126 bic r12, r12, #0x200 1127 vld1.u16 {d26,d27}, [r12:128] 1128 vmlal.u16 q14, d24, d0[1] 1129 vmlal.u16 q15, d25, d0[1] 1130 vmlal.u16 q14, d26, d0[1] 1131 vmlal.u16 q15, d27, d0[1] 1132 1133 vqrshrn.u32 d28, q14, #16 1134 vqrshrn.u32 d29, q15, #16 1135 vqrshrn.u16 d31, q14, #FRACTION_BITS 1136 1137 vst1.u8 {q4}, [r9:128]! 
1138 bic r9, r9, #0x200 1139 vmov q4, q5 1140 vmov q5, q6 1141 vmov q6, q7 1142 vmov q7, q8 1143 vmov q8, q9 1144 vmov q9, q10 1145 vmov q10, q11 1146 .endm/*}}}*/ 1147 1148 /* Dedicated function wrapper for the fetch macro, for the cases where 1149 * performance isn't that important, to keep code size down. 1150 */ 1151 PRIVATE(fetch_generic_asm) 1152 push {r10,r11} 1153 fetch 1154 pop {r10,r11} 1155 bx lr 1156 END(fetch_generic_asm) 1157 1158 1159 /* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory 1160 * beyond that limit, and filling the rest of the vector with the last legal 1161 * pixel. 1162 * Result is in q10 and q11. q8 and q9 are filled with the first legal pixel. 1163 * Note: This function can read beyond the right edge of input if the image is 1164 * narrower than 16 bytes. 1165 */ 1166 PRIVATE(fetch_clampleft1) 1167 push {r12,lr} 1168 bl fetch_generic_asm 1169 vdup.u16 q8, d20[0] 1170 vdup.u16 q9, d20[0] 1171 ands r12, r10, #15 1172 beq 1f 1173 sub r1, r1, r12 1174 sub r10, r10, r12 1175 sub sp, sp, #32 1176 vst1.u16 {q10,q11}, [sp] 1177 sub r12, sp, r12, LSL #1 1178 sub sp, sp, #32 1179 vst1.u16 {q8,q9}, [sp] 1180 vld1.u16 {q10,q11}, [r12] 1181 add sp, sp, #64 1182 1: pop {r12,pc} 1183 END(fetch_clampleft1) 1184 1185 PRIVATE(fetch_clampleft4) 1186 push {r12,lr} 1187 bl fetch_generic_asm 1188 vmov.u16 d16, d20 1189 vmov.u16 d17, d20 1190 vmov.u16 d18, d20 1191 vmov.u16 d19, d20 1192 ands r12, r10, #15 1193 beq 1f 1194 sub r1, r1, r12 1195 sub r10, r10, r12 1196 sub sp, sp, #32 1197 vst1.u16 {q10-q11}, [sp] 1198 sub r12, sp, r12, LSL #1 1199 sub sp, sp, #32 1200 vst1.u16 {q8,q9}, [sp] 1201 vld1.u16 {q10,q11}, [r12] 1202 add sp, sp, #64 1203 1: pop {r12,pc} 1204 END(fetch_clampleft4) 1205 1206 /* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding 1207 * reading memory beyond that limit, and filling the rest of the vector with 1208 * the last legal pixel. 1209 * Result is in q10 and q11. q12 and q13 are filled with the last legal pixel. 1210 * Note: This function can read beyond the left edge of input if the image is 1211 * narrower than 16 bytes. 1212 */ 1213 PRIVATE(fetch_clampright1) 1214 push {r12, lr} 1215 rsb r12, r11, #0 1216 ands r12, r12, #15 1217 beq 1f 1218 sub r1, r1, r12 1219 bl fetch_generic_asm 1220 vdup.u16 q12, d23[3] 1221 vdup.u16 q13, d23[3] 1222 rsb r12, r11, #0 1223 and r12, r12, #15 1224 sub sp, sp, #32 1225 vst1.u16 {q12,q13}, [sp] 1226 sub sp, sp, #32 1227 add r12, sp, r12, LSL #1 1228 vst1.u16 {q10,q11}, [sp] 1229 vld1.u16 {q10,q11}, [r12] 1230 add sp, sp, #64 1231 pop {r12,pc} 1232 1: bl fetch_generic_asm 1233 vdup.u16 q12, d23[3] 1234 vdup.u16 q13, d23[3] 1235 pop {r12,pc} 1236 END(fetch_clampright1) 1237 1238 PRIVATE(fetch_clampright4) 1239 push {r12, lr} 1240 rsb r12, r11, #0 1241 ands r12, r12, #15 1242 beq 1f 1243 sub r1, r1, r12 1244 bl fetch_generic_asm 1245 vmov.u16 d24, d23 1246 vmov.u16 d25, d23 1247 vmov.u16 d26, d23 1248 vmov.u16 d27, d23 1249 rsb r12, r11, #0 1250 and r12, r12, #15 1251 sub sp, sp, #32 1252 vst1.u16 {q12-q13}, [sp] 1253 sub sp, sp, #32 1254 add r12, sp, r12, LSL #1 1255 vst1.u16 {q10,q11}, [sp] 1256 vld1.u16 {q10,q11}, [r12] 1257 add sp, sp, #64 1258 pop {r12,pc} 1259 1: bl fetch_generic_asm 1260 vmov.u16 d24, d23 1261 vmov.u16 d25, d23 1262 vmov.u16 d26, d23 1263 vmov.u16 d27, d23 1264 pop {r12,pc} 1265 END(fetch_clampright4) 1266 1267 /* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th 1268 * value across to fill the rest of the register pair. 
Used for filling the 1269 * right hand edge of the window when reading too close to the right hand edge 1270 * of the image. 1271 * Also returns a dup-ed copy of the last element in q12 for the tail-fill 1272 * case (this happens incidentally in common path, but must be done 1273 * deliberately in the fast-out path). 1274 */ 1275 PRIVATE(prefill_sweepright1) 1276 ands r12, r11, #15 1277 beq 1f 1278 sub r12, r12, #1 1279 sub sp, sp, #64 1280 vst1.u16 {q10,q11}, [sp] 1281 add r12, sp, r12, LSL #1 1282 vld1.u16 {d24[],d25[]}, [r12] 1283 vld1.u16 {d26[],d27[]}, [r12] 1284 vst1.u16 {q12,q13}, [r12] 1285 vld1.u16 {q10,q11}, [sp] 1286 add sp, sp, #64 1287 bx lr 1288 1: vdup.u16 q12, d23[3] 1289 vdup.u16 q13, d23[3] 1290 bx lr 1291 END(prefill_sweepright1) 1292 1293 PRIVATE(prefill_sweepright4) 1294 ands r12, r11, #15 1295 beq 1f 1296 sub r12, r12, #4 1297 sub sp, sp, #64 1298 vst1.u16 {q10,q11}, [sp] 1299 add r12, sp, r12, LSL #1 1300 vld1.u64 {d24}, [r12] 1301 vld1.u64 {d25}, [r12] 1302 vld1.u64 {d26}, [r12] 1303 vld1.u64 {d27}, [r12] 1304 vst1.u16 {q12,q13}, [r12] 1305 vld1.u16 {q10,q11}, [sp] 1306 add sp, sp, #64 1307 bx lr 1308 1: vmov.u16 d24, d23 1309 vmov.u16 d25, d23 1310 vmov.u16 d26, d23 1311 vmov.u16 d27, d23 1312 bx lr 1313 END(prefill_sweepright4) 1314 1315 /* The main loop keeps a sliding window of data that has already been convolved 1316 * in the vertical axis for the current line. This usually stays in the 1317 * register file, but spills to memory for large windows. The first thing that 1318 * needs to be done at start-up is to fill this window with image data, taking 1319 * into account the padding needed if the left or right edges of the image fall 1320 * within this window. 1321 */ 1322 1323 /* Because the window is in the register file writes to it cannot be indexed 1324 * by another register. Consequently the fill loops are unrolled to address 1325 * the registers directly. This macro distinguishes between writes to the 1326 * register file and writes to the spill buffer (indicated by a destination 1327 * register named xx). 1328 */ 1329 .macro prefill_out ra, rb, sra, srb, srb_hi 1330 .ifc \ra,xx 1331 .ifc \rb,xx 1332 vst1.u16 {\sra,\srb}, [r9:128]! 1333 .else 1334 /* this case is used only for the last tap of uchar1 r=25 */ 1335 /* discard \sra */ 1336 vmov.u16 \rb, \srb_hi 1337 .endif 1338 .else 1339 .ifnc \ra,\sra 1340 vmov.u16 \ra, \sra 1341 .endif 1342 .ifnc \rb,\srb 1343 vmov.u16 \rb, \srb 1344 .endif 1345 .endif 1346 .endm 1347 1348 /* This macro provides the list of registers representing the window, and the 1349 * cases where the register file is too small and a spill buffer is used 1350 * instead. 1351 * Since several specialisations of each function are generated, this also 1352 * culls superfluous iterations, and sets the variable `i` for subsequent 1353 * macros indicating the current index into the window. 
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)
\label\macro\line:
      prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
  .if \step > 1
    ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label
    ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label
    ifneeded \macro \nextmacro, 11, 10, xx, xx, \step, \label
    ifneeded \macro \nextmacro, 10,  9, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  9,  8, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  8,  7, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  7,  6, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  6,  5, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  5,  4, xx, xx, \step, \label
    ifneeded \macro \nextmacro,  4,  3, xx, xx, \step, \label
  .else
    /* q3 normally contains the coefficient table, but it's not fully
     * used.  In the uchar1, r=25 case the other half of q3 is used for
     * the last two window taps to avoid falling out to memory.
     */
    ifneeded \macro \nextmacro,  4,  3, xx, d7, \step, \label
  .endif
    ifneeded \macro \nextmacro,  3,  2, q4, q5, \step, \label
    ifneeded \macro \nextmacro,  2,  1, q6, q7, \step, \label
    ifneeded \macro \nextmacro,  1,  0, q8, q9, \step, \label

\label\macro\()0:
            b           \label\()_end
  .purgem ifneeded
.endm

/* These macros represent the possible stages of filling the window.
 * Each macro is unrolled enough times that it can fill the entire window
 * itself, but normally it will have to hand control to subsequent macros
 * part-way through and this is done using labels named \next and \after, where
 * \next is the next macro starting at the same window position and \after is
 * the next macro starting after the current window position.
 */

/* leftfill: q8 and q9 contain the left padding value.  While the window
 * extends outside of the image on the left-hand side, and at least 16 more
 * padding values are needed in the window, store q8 and q9 into the window.
 * Otherwise skip forward to storing image data.
 */
.macro prefill_leftfill, next, after, ra, rb, step
            cmp         r10, #i+16
            blo         \next
            prefill_out \ra, \rb, q8, q9, d19
.endm

/* leftedge: The very first non-fill or partial-fill chunk from the image is
 * already loaded (as it was used to calculate the left padding value), so
 * store it here, and then drop into the regular load/store cycle in the next
 * macro.
 */
.macro prefill_leftedge, next, after, ra, rb, step
1:          prefill_out \ra, \rb, q10, q11, d23
            b           \after
.endm

/* dofetch: Copy chunks of the image into the window without any complications
 * from edge conditions.
 */
.macro prefill_dofetch, next, after, ra, rb, step
            cmp         r11, #i+16
            bls         \next
            bl          fetch_generic_asm
            prefill_out \ra, \rb, q10, q11, d23
.endm

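/* Taken together with the rightedge and rightfill stages below, the fill
 * stages amount to the following per-chunk choice.  A hedged C sketch with
 * illustrative names only: i is the chunk's window index, win_lo/win_hi are
 * the r10/r11 fill bounds described in the prefill macro further down, and the
 * leftedge case (first chunk already in registers) is omitted for brevity:
 *
 *      #include <string.h>
 *      static void fill_chunk(unsigned char chunk[16], int i,
 *                             int win_lo, int win_hi,
 *                             const unsigned char lpad[16],
 *                             const unsigned char rpad[16],
 *                             const unsigned char *img)  // next 16 image columns
 *      {
 *          if (win_lo >= i + 16) {
 *              memcpy(chunk, lpad, 16);               // still left of the image
 *          } else if (win_hi > i + 16) {
 *              memcpy(chunk, img, 16);                // fully inside the image
 *          } else if (win_hi > i) {                   // last partial chunk:
 *              memcpy(chunk, img, win_hi - i);        // copy what's legal and
 *              memset(chunk + (win_hi - i), rpad[0],  // clamp the remainder to
 *                     16 - (win_hi - i));             // the last legal pixel
 *          } else {
 *              memcpy(chunk, rpad, 16);               // past the right edge
 *          }
 *      }
 */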
/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond
 * the right-hand edge of the image.  In that case sweep the last valid pixel
 * across the rest of the chunk, and in either case prepare padding data in q12
 * and q13 for the next macro.  This is done in fetch_clampright.
 * This only happens once before going on to the next macro.
 * Sometimes leftedge also covers the rightedge case, in which case this has
 * to be skipped altogether.
 */
.macro prefill_rightedge, next, after, ra, rb, step
            cmp         r11, #i
            bls         \next
            bl          fetch_clampright\step
            prefill_out \ra, \rb, q10, q11, d23
            b           \after
.endm

/* rightfill: The rest of the window is simply filled with right padding from
 * q12 and q13.
 */
.macro prefill_rightfill, next, after, ra, rb, step
            prefill_out \ra, \rb, q12, q13, d25
.endm

/* Here all of the macros above are unrolled and laid out in the proper order.
 */
.macro prefill_body, max_r, step, label
            prefill_list leftfill,  leftedge,  \max_r, \step, \label
            prefill_list leftedge,  dofetch,   \max_r, \step, \label
            prefill_list dofetch,   rightedge, \max_r, \step, \label
            prefill_list rightedge, rightfill, \max_r, \step, \label
            prefill_list rightfill, oops,      \max_r, \step, \label
\label\()_end:
.endm

/* Fill the convolution window with context data.  The aim here is to load
 * exactly 2*r columns, and in the main loop to read as many columns as will be
 * written.  This is complicated by the window being divided into chunks at
 * register boundaries, and the need to handle cases when the input starts very
 * close to the left or right (or both) edges of the image and the need to fill
 * the spaces that leaves with left and right edge padding values.
 *
 * Input:
 *      r1 -- src
 *      r2 -- pitch
 *      r3 -- count
 *      r4 -- available image data right of src pointer
 *      r5 -- r
 *      r6 -- rup
 *      r7 -- rdn
 *      r8 -- available image data left of src pointer
 *      r9 -- buffer (if needed)
 * Output:
 *      r4 -= min(inlen, count + windowsize - centertap)
 *      r1 += min(inlen, count + windowsize - centertap)
 * Modifies:
 *      r10 -- fill start index in the window
 *      r11 -- fill stop index in the window
 *      r12 -- scratch
 */
.macro prefill step=1, max_r=25, label=xx
  .set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
  .set centertap, (windowsize - \max_r * \step)
            mov         r10, #centertap
            subs        r10, r10, r8
            movlo       r10, #0

            subs        r11, r4, #windowsize - centertap
            movhs       r11, #0
            add         r11, r11, #windowsize

            /* r10 indicates where in the window legal image data begins.
             * r11 indicates where in the window legal image data ends.
             * When starting near the centre of a large image these would be
             * zero and windowsize respectively, but when starting near the
             * edges this can change.
             * When starting on the leftmost pixel, r10 will be centertap.
             * When starting on the rightmost pixel, r11 will be centertap+1.
             */

            /* r4 indicates how much data there is between the current pointers
             * and the right edge of the image.  The pointers currently point
             * to the data needed at centertap.  The subsequent code will
             * consume (windowsize - r10) data, but only the data from
             * centertap to windowsize comes out of r4's budget.
             */
1:          subs        r4, r4, #windowsize - centertap
            movlo       r4, #0

            /* And the pointers need to rewind to the start of the window.
             */
            sub         r1, r1, #centertap

            /* Unless r8 indicated that there wasn't that much data available.
             */
            add         r1, r1, r10

            /* Get the first chunk, and add padding to align it to the window
             * if necessary.
             */
            bl          fetch_clampleft\step

            /* Sometimes the start and the end of the window are in the same
             * chunk.  In that case both ends need filler at the outset.
             */
            sub         r12, r11, #1
            eor         r12, r10, r12
            cmp         r12, #16
            bllo        prefill_sweepright\step

            /* Iterate through all the points in the window and fill them in
             * with padding or image data as needed.
             */
            prefill_body \max_r, \step, \label
.endm
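/* The bound computation at the head of the prefill macro above, restated as a
 * hedged C sketch.  Names are illustrative; avail_left and avail_right stand
 * for r8 and r4 (data available to the left/right of the source pointer):
 *
 *      static void prefill_bounds(int max_r, int step,
 *                                 int avail_left, int avail_right,
 *                                 int *lo, int *hi)        // r10, r11
 *      {
 *          int windowsize = (2 * max_r * step + 15) & ~15; // round up to 16
 *          int centertap  = windowsize - max_r * step;
 *          *lo = centertap - avail_left;                   // legal data begins
 *          if (*lo < 0) *lo = 0;
 *          int spare = avail_right - (windowsize - centertap);
 *          *hi = windowsize + (spare < 0 ? spare : 0);     // legal data ends
 *      }
 */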
/* The main body of the convolve functions.  Having already pre-filled the
 * convolution window with 2*r input values, the logic settles into a regular
 * pattern of reading and writing at a 1:1 rate until either input or output
 * expires.  The input leads the output by r values, so when processing all the
 * way to the right-hand edge, or within r pixels of that edge, the input will
 * run out first.  In the case of very narrow images, or sub-windows starting
 * near the right edge, the input may already have run out while the
 * convolution window was being filled and this loop will start with a
 * zero-length input.
 *
 * Once the input runs out, the rest of the output must be processed by padding
 * the remainder of the window with pad value from the last valid pixel from
 * the source.
 *
 * Input:
 *      r0 = dst
 *      r1 = src
 *      r2 = pitch
 *      r3 = count
 *      r4 = inlen
 *      r5 = r
 *      r6 = rup
 *      r7 = rdn
 *      r9 = buffer
 * Modifies
 *      r8 = fetch code pointer
 */
.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""

            /* If r4 >= r3 then there's no need for clipping.  The main loop
             * needs to exit when either r3 or r4 runs out, so clamp r4 to be
             * no greater than r3 and use r4 for the loop.
             * However, if r4 comes out of the loop with less than 16 bytes
             * left, a partial read would be necessary to avoid reading beyond
             * the end of the image.  To avoid this, clamp r4 to the next
             * multiple of 16, which is still sufficient to force it out of the
             * loop but doesn't imply a rewind.
             */
            add         r12, r3, #15
            bic         r12, r12, #15
            cmp         r4, r12
            movhi       r4, r12

            /* First calculate the entry-point into the internal fetch logic.
             * This is done so the same function can service several kernel
             * sizes.
             */
            ldr         r8, 3f
1:          add         r8, r8, pc
            sub         r8, r5, LSL #5
            sub         r8, r5, LSL #4
            cmp         r5, r6
            cmpeq       r5, r7
            beq         5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
             */
            ldr         r8, 3f+4
2:          add         r8, r8, pc
            sub         r8, r5, LSL #6
            b           5f
            .align 3
3:          .word       \labelnc-1b-8
            .word       \labelc-2b-8

            /* Main loop: ... */
            .align 4
3:          /* first perform a vertical convolution from memory to get the next
             * 16 taps of the horizontal window into the register file...
             */
            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8

            /* ...then perform a horizontal convolution on that window to
             * produce eight output bytes, and slide the window along.
             * This has to be done twice to match the 16-way vertical pass.
             * It would be preferable to have twice the work done in \core, but
             * that would demand yet another variant on those macros and would
             * perturb the register allocation severely.
             */
            \core
            vst1.u8     {d31}, [r0]!
            \core
            vst1.u8     {d31}, [r0]!
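            /* Each pass through the loop consumes 16 input columns and emits
             * 16 output bytes; the counters below step r3 (count) and r4 (the
             * clamped inlen) down in lock-step, and the loop exits when the
             * clamped r4 runs out, which also covers count running out.
             */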
1629 1630 sub r3, r3, #16 1631 5: subs r4, r4, #16 1632 bhi 3b 1633 /* Here there's 16 or fewer bytes available before the edge of the 1634 * source image. x4 holds that count minus 16 (because it was 1635 * decremented before the first iteration ran). The last read may 1636 * not be a whole chunk, and beyond that a fill value must be used. 1637 * 1638 * Of course, none of that matters if there's no more output to 1639 * produce... 1640 */ 1641 cmp r3, #0 1642 beq 5f 1643 1644 /* Oh well. */ 1645 adds r4, r4, #16 1646 bne 1f 1647 .if \step==1 1648 vdup.u16 q10, d19[3] 1649 vdup.u16 q11, d19[3] 1650 .else 1651 vmov.u64 d20, d19 1652 vmov.u64 d21, d19 1653 vmov.u64 d22, d19 1654 vmov.u64 d23, d19 1655 .endif 1656 b 3f 1657 1658 /* To avoid reading past end of input, rewind pointers by (16-r4) 1659 * to ensure that they're exactly 16 bytes from the edge. 1660 */ 1661 1: mov r11, r4 1662 bl fetch_clampright\step 1663 /* Now to put this padding to use, perform any remaining 1664 * iterations. This is done at half the rate of the main loop, 1665 * because there's no longer pressure from a 16-lane window filler. 1666 */ 1667 3: \core 1668 .if \step==1 1669 vdup.u16 q11, d23[3] 1670 .else 1671 vmov.u64 d22, d23 1672 .endif 1673 subs r3, r3, #8 1674 blo 4f 1675 vst1.u8 {d31}, [r0]! 1676 bne 3b 1677 b 5f 1678 1679 /* If the final iteration contained 0 < l < 8 values, then perform 1680 * a piecewise store of the final vector. 1681 */ 1682 4: tst r3, #4 1683 beq 1f 1684 vst1.u32 {d31[0]}, [r0]! 1685 vext.u8 d31, d31, d31, #4 1686 1: tst r3, #2 1687 beq 1f 1688 vst1.u16 {d31[0]}, [r0]! 1689 vext.u8 d31, d31, d31, #2 1690 1: tst r3, #1 1691 beq 5f 1692 vst1.u8 {d31[0]}, [r0]! 1693 vext.u8 d31, d31, d31, #1 1694 5: mov r0, #0 1695 .endm 1696 1697 .irp r, TUNED_LIST1, 25 1698 PRIVATE(convolve1_\r) 1699 push {r12,lr} 1700 1701 prefill step=1, max_r=\r, label=.Lcnv1_\r 1702 1703 conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r 1704 1705 pop {r12,pc} 1706 END(convolve1_\r) 1707 .endr 1708 1709 .irp r, TUNED_LIST4, 25 1710 PRIVATE(convolve4_\r) 1711 push {r12,lr} 1712 sub r9, sp, #0x200 1713 sub sp, sp, #0x200 + 0x400 1714 bic r9, r9, #0x3fc 1715 1716 /* r9 now points to a 0x200 byte buffer on the stack whose address 1717 * has the low 10 bits clear. This allows easy address calculation 1718 * in the wrap-around cases. 1719 */ 1720 1721 prefill step=4, max_r=\r, label=.Lcnv4_\r 1722 1723 conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r 1724 1725 add sp, sp, #0x200 + 0x400 1726 pop {r12,pc} 1727 END(convolve4_\r) 1728 .endr 1729 1730 /* void rsdIntrinsicBlurU1_K( 1731 * void *out, // r0 1732 * void *in, // r1 1733 * size_t w, // r2 1734 * size_t h, // r3 1735 * size_t p, // [sp] 1736 * size_t x, // [sp,#4] 1737 * size_t y, // [sp,#8] 1738 * size_t count, // [sp,#12] 1739 * size_t r, // [sp,#16] 1740 * uint16_t *tab); // [sp,#20] 1741 */ 1742 ENTRY(rsdIntrinsicBlurU1_K) 1743 push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 1744 vpush {d8-d15} 1745 ldr r6, [sp,#112] // y 1746 ldr r8, [sp,#108] // x 1747 ldr r5, [sp,#120] // r 1748 sub r4, r2, r8 // inlen = w - x 1749 sub r7, r3, r6 // h - y 1750 ldr r2, [sp,#104] // pitch 1751 ldr r3, [sp,#116] // count 1752 sub r7, r7, #1 // h - y - 1 1753 1754 ldr r12, [sp,#124] 1755 1756 add r1, r1, r8 // src += x 1757 1758 cmp r6, r5 1759 movhi r6, r5 // rup = min(r, y) 1760 cmp r7, r5 1761 movhi r7, r5 // rdn = min(r, h - y - 1) 1762 1763 vld1.u16 {d0,d1,d2,d3}, [r12]! 1764 vld1.u16 {d4,d5,d6}, [r12]! 
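/* Radius dispatch, sketched in C for reference.  TUNED_LIST1 is 8, 16, so the
 * specialised kernels cover r <= 8 and r <= 16, and anything larger falls back
 * to the general 25-tap variant.  The names mirror the labels used below; this
 * is only an illustration of the branch chain, not a callable interface:
 *
 *      if (r <= 8)       convolve1_8();
 *      else if (r <= 16) convolve1_16();
 *      else              convolve1_25();
 *
 * The `adr lr, 1f` below makes whichever convolve1_* is chosen return straight
 * to the common epilogue at 1:.
 */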
1765 1766 adr lr, 1f 1767 .irp r, TUNED_LIST1 1768 cmp r5, #\r 1769 bls convolve1_\r 1770 .endr 1771 b convolve1_25 1772 1773 1: vpop {d8-d15} 1774 pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 1775 END(rsdIntrinsicBlurU1_K) 1776 1777 /* void rsdIntrinsicBlurU4_K( 1778 * void *out, // r0 1779 * void *in, // r1 1780 * size_t w, // r2 1781 * size_t h, // r3 1782 * size_t p, // [sp] 1783 * size_t x, // [sp,#4] 1784 * size_t y, // [sp,#8] 1785 * size_t count, // [sp,#12] 1786 * size_t r, // [sp,#16] 1787 * uint16_t *tab); // [sp,#20] 1788 */ 1789 ENTRY(rsdIntrinsicBlurU4_K) 1790 push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 1791 vpush {d8-d15} 1792 ldr r6, [sp,#112] // y 1793 ldr r8, [sp,#108] // x 1794 ldr r5, [sp,#120] // r 1795 lsl r8, r8, #2 1796 rsb r4, r8, r2, LSL #2 // inlen = (w - x) 1797 sub r7, r3, r6 // h - y 1798 ldr r2, [sp,#104] // pitch 1799 ldr r3, [sp,#116] // count 1800 sub r7, r7, #1 // h - y - 1 1801 lsl r3, r3, #2 // count 1802 1803 ldr r12, [sp,#124] 1804 1805 add r1, r1, r8 // in += x 1806 1807 cmp r6, r5 1808 movhi r6, r5 // rup = min(r, y) 1809 cmp r7, r5 1810 movhi r7, r5 // rdn = min(r, h - y - 1) 1811 1812 vld1.u16 {d0,d1,d2,d3}, [r12]! 1813 vld1.u16 {d4,d5,d6}, [r12]! 1814 1815 adr lr, 1f 1816 .irp r, TUNED_LIST4 1817 cmp r5, #\r 1818 bls convolve4_\r 1819 .endr 1820 b convolve4_25 1821 1822 1: vpop {d8-d15} 1823 pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 1824 END(rsdIntrinsicBlurU4_K) 1825
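/* Note on the spill-buffer addressing used by the hconv4_* macros: the window
 * buffer set up in convolve4_* is 0x200 bytes long and its address has the low
 * ten bits clear, so an offset that runs past the end of the buffer can be
 * wrapped with a single bit-clear rather than a compare and subtract.  A
 * hedged C sketch of the same trick (illustrative only, not part of this
 * file's interface):
 *
 *      #include <stdint.h>
 *      // buf must have (uintptr_t)buf % 0x400 == 0, as arranged on the stack
 *      // in convolve4_*; byte_offset stays below 0x400.
 *      static uint16_t *wrap_window(uint16_t *buf, uintptr_t byte_offset)
 *      {
 *          uintptr_t p = (uintptr_t)buf + byte_offset;   // add r12, r9, #off
 *          return (uint16_t *)(p & ~(uintptr_t)0x200);   // bic r12, r12, #0x200
 *      }
 */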