/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point precision after vertical pass -- 16-bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7      /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1 << VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3[0..3], leaving the results in
 * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * overshoot.
 */
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
        ld1         {v8.8b}, [x4], #8
        ld1         {v9.8b}, [x5], #8
        ld1         {v10.8b}, [x6], #8
        ld1         {v11.8b}, [x7], #8
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umull2      v13.4s, v9.8h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlsl2      v13.4s, v8.8h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlal2      v13.4s, v10.8h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
        umlsl2      v13.4s, v11.8h, v3.h[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm
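/* Illustrative only (not part of the build): a rough scalar C model of what
 * the vert8 macro above computes for one column, under the assumptions
 * documented above -- four rows of unsigned 8-bit input, 16-bit y weights
 * with taps 0 and 3 applied negatively, result narrowed to VERTBITS fraction
 * bits.  The names (vert8_model, yw, r0..r3) are invented for this sketch.
 *
 *      #include <stdint.h>
 *
 *      #define VERTBITS 14
 *
 *      // One column of the vertical pass: a 4-tap filter over rows r0..r3,
 *      // with the narrowing and saturation standing in for sqshrn.
 *      static int16_t vert8_model(const uint8_t *r0, const uint8_t *r1,
 *                                 const uint8_t *r2, const uint8_t *r3,
 *                                 const uint16_t yw[4], int i)
 *      {
 *          int64_t acc = (int64_t)r1[i] * yw[1]
 *                      - (int64_t)r0[i] * yw[0]
 *                      + (int64_t)r2[i] * yw[2]
 *                      - (int64_t)r3[i] * yw[3];
 *          acc >>= 8 + (16 - VERTBITS);    // assumes arithmetic shift
 *          if (acc >  32767) acc =  32767;
 *          if (acc < -32768) acc = -32768;
 *          return (int16_t)acc;
 *      }
 */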
/* As above, but only four 16-bit results into v12hi.
 */
.macro vert4, dst=v12.8h
        ld1         {v8.s}[0], [x4], #4
        ld1         {v9.s}[0], [x5], #4
        ld1         {v10.s}[0], [x6], #4
        ld1         {v11.s}[0], [x7], #4
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
.ifc \dst,v12.8h
        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
.else
        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (x1), and the threshold value for when the count will be
 * one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values are packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         x2, x0, #VECSHIFT
        mov         x0, #(CHUNKSIZE << 16) - 1
        add         x0, x0, x2
        udiv        x1, x0, x2
        msub        x0, x1, x2, x0
        add         x0, x0, x1, LSL #32
        ret
END(rsdIntrinsicResize_oscctl_K)

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2

/* void rsdIntrinsicResizeB1_K(
 *          uint8_t * restrict dst,          // x0
 *          size_t count,                    // x1
 *          uint32_t xf,                     // x2
 *          uint32_t xinc,                   // x3
 *          uint8_t const * restrict srcn,   // x4
 *          uint8_t const * restrict src0,   // x5
 *          uint8_t const * restrict src1,   // x6
 *          uint8_t const * restrict src2,   // x7
 *          size_t xclip,                    // [sp,#0]  -> [sp,#64] -> x12
 *          size_t avail,                    // [sp,#8]  -> [sp,#72] -> x11
 *          uint64_t osc_ctl,                // [sp,#16] -> [sp,#80] -> x10
 *          int32 const *yr,                 // [sp,#24] -> [sp,#88] -> v4 (copied to v3 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
        sub         x8, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x8]

        /* align the working buffer on the stack to make it easy to use bit
         * twiddling for address calculations.
         */
        sub         x12, sp, #BUFFER_SIZE
        bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1

        ldr         x8, [sp,#88]            // yr
        adr         x9, 8f
        ld1         {v4.4s}, [x8]
        ld1         {v5.8h}, [x9]
        sqxtun      v4.4h, v4.4s            // yr
        dup         v6.8h, w2
        dup         v7.8h, w3
        mla         v6.8h, v5.8h, v7.8h     // vxf
        shl         v7.8h, v7.8h, #VECSHIFT // vxinc

        /* Compute starting condition for oscillator used to compute ahead
         * of time how many iterations are possible before needing to
         * refill the working buffer.  This is based on the fixed-point
         * index of the last element in the vector of pixels processed in
         * each iteration, counting up until it would overflow.
         */
        sub         x8, x2, x3
        lsl         x9, x3, #VECSHIFT
        add         x8, x8, x9
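/* Illustrative only (not part of the build): a rough C model of the oscillator
 * control value produced by rsdIntrinsicResize_oscctl_K above, and of how the
 * packed value is consumed further down (sub/lsr #32) to get the per-chunk
 * iteration count.  Function names are invented for this sketch; CHUNKSIZE and
 * VECSIZE are the values set at the top of this file.
 *
 *      #include <stdint.h>
 *
 *      // Pack the per-chunk iteration quotient (high word) and the remainder
 *      // threshold (low word).  xinc is 16.16 fixed point.
 *      static uint64_t oscctl_model(uint32_t xinc)
 *      {
 *          uint64_t step = (uint64_t)xinc << 3;            // VECSIZE == 8
 *          uint64_t t = ((uint64_t)128 << 16) - 1 + step;  // CHUNKSIZE == 128, as 16.16
 *          return ((t / step) << 32) | (t % step);
 *      }
 *
 *      // Iterations available from one refilled chunk, given the 16.16 index
 *      // (x8) of the last pixel of the next vector: the subtraction borrows
 *      // one from the high word exactly when x8 exceeds the remainder.
 *      static uint32_t chunk_iters_model(uint64_t osc_ctl, uint64_t x8)
 *      {
 *          return (uint32_t)((osc_ctl - x8) >> 32);
 *      }
 */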
        ldr         x10, [sp,#80]           // osc_ctl
        ldp         x13,x11, [sp,#64]       // xclip, avail

        mov         x18, sp
        mov         sp, x12

        /* x4-x7 contain pointers to the four lines of input to be
         * convolved.  These pointers have been clamped vertically and
         * horizontally (which is why it's not a simple row/stride pair),
         * and the xclip argument (now in x13) indicates how many pixels
         * from true the x position of the pointer is.  This value should
         * be 0, 1, or 2 only.
         *
         * Start by placing four pixels worth of input at the far end of
         * the buffer.  As many as two of these may be clipped, so four
         * pixels are fetched, and then the first pixel is duplicated and
         * the data shifted according to xclip.  The source pointers are
         * then also adjusted according to xclip so that subsequent fetches
         * match.
         */
        mov         v3.8b, v4.8b            /* make y coeffs available for vert4 and vert8 macros */
        sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
        add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
        add         x14, x14, #4 * COMPONENT_COUNT * 2
.if \comp == 1
        vert4       v12.4h
        dup         v11.4h, v12.h[0]
        st1         {v11.4h,v12.4h}, [x12]
        ld1         {v12.4h}, [x14]
        st1         {v12.4h}, [x15]
.elseif \comp == 2
        vert8
        dup         v11.4s, v12.s[0]
        st1         {v11.8h,v12.8h}, [x12]
        ld1         {v12.8h}, [x14]
        st1         {v12.8h}, [x15]
.elseif \comp == 4
        vert8       v14.4h, v14.8h
        vert8       v15.4h, v15.8h
        dup         v12.2d, v14.d[0]
        dup         v13.2d, v14.d[0]
        st1         {v12.8h,v13.8h}, [x12], #32
        st1         {v14.8h,v15.8h}, [x12]
        sub         x12, x12, #32
        ld1         {v11.8h,v12.8h}, [x14]
        st1         {v11.8h,v12.8h}, [x15]
.endif
        /* Count off four pixels into the working buffer.
         */
        sub         x11, x11, #4
        /* Incoming pointers were to the first _legal_ pixel.  Four pixels
         * were read unconditionally, but some may have been discarded by
         * xclip, so we rewind the pointers to compensate.
         */
        sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
        sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
        sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
        sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)

        /* First tap starts where we just pre-filled, at the end of the
         * buffer.
         */
        add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16

        /* Use overflowing arithmetic to implement wraparound array
         * indexing.
         */
        lsl         x2, x2, #(47 - CHUNKSHIFT)
        lsl         x3, x3, #(47 - CHUNKSHIFT)
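/* Illustrative only (not part of the build): a rough C model of the
 * wraparound-indexing trick above, assuming xf_scaled and xinc_scaled are the
 * 16.16 x position and increment pre-shifted left by (47 - CHUNKSHIFT), as in
 * the two lsl instructions.  The integer pixel index then lives in the top
 * CHUNKSHIFT+1 bits of the register, so stepping the position wraps modulo
 * the 2*CHUNKSIZE-entry scratch buffer for free.  Names are invented here.
 *
 *      #include <stdint.h>
 *
 *      static int16_t *tap_pointer_model(uint64_t *xf_scaled,
 *                                        uint64_t xinc_scaled,
 *                                        int16_t *scratch,
 *                                        int component_shift)
 *      {
 *          uint64_t idx = *xf_scaled >> (63 - 7);      // CHUNKSHIFT == 7
 *          *xf_scaled += xinc_scaled;                  // overflow gives the wrap
 *          return (int16_t *)((uintptr_t)scratch + (idx << (component_shift + 1)));
 *      }
 */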
        /* Start of outermost loop.
         * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
         * number of iterations of the inner loop that can be performed and
         * get into that.
         *
         * The fill is complicated by the possibility of running out of
         * input before the scratch buffer is filled.  If this isn't a risk
         * then it's handled by the simple loop at 2:, otherwise the
         * horrible loop at 3:.
         */
1:      mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
        subs        x11, x11, #CHUNKSIZE
        bge         2f                      /* if at least CHUNKSIZE are available... */
        add         x11, x11, #CHUNKSIZE    /* if they're not... */
        b           4f
        /* ..just sneaking a literal in here after this unconditional branch.. */
8:      .hword      0, 1, 2, 3, 4, 5, 6, 7
        /* basic fill loop, processing 8 bytes at a time until there are
         * fewer than eight bytes available.
         */
3:      vert8
        sub         x11, x11, #8 / COMPONENT_COUNT
        st1         {v12.8h}, [x12], #16
4:      cmp         x11, #8 / COMPONENT_COUNT - 1
        bgt         3b
.if \comp == 4
        blt         3f
        /* The last pixel (four bytes) if necessary */
        vert4
.else
        cmp         x11, #1
        blt         3f
        /* The last pixels if necessary */
        sub         x4, x4, #8
        sub         x5, x5, #8
        sub         x6, x6, #8
        sub         x7, x7, #8
        add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
        add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
        add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
        add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
        vert8
        sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
        sub         sp, sp, #32
        sub         x11, x11, #16
.if \comp == 1
        dup         v13.8h, v12.h[7]
.elseif \comp == 2
        dup         v13.4s, v12.s[3]
.endif
        st1         {v12.8h,v13.8h}, [sp]
        ld1         {v12.8h}, [x11]
        add         sp, sp, #32
        b           4f
.endif
        /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
        dup         v12.8h, v12.h[7]
.elseif \comp == 2
        dup         v12.4s, v12.s[3]
.elseif \comp == 4
        dup         v12.2d, v12.d[1]
.endif
4:      st1         {v12.8h}, [x12], #16
        tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
        bne         3b
        b           4f

        .align 4
2:      /* Quickly pull a chunk of data into the working buffer.
         */
        vert8
        st1         {v12.8h}, [x12], #16
        vert8
        st1         {v12.8h}, [x12], #16
        tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
        bne         2b
        cmp         x11, #0
        bne         3f
4:      /* if we end with 0 pixels left we'll have nothing handy to spread
         * across to the right, so we rewind a bit.
         */
        mov         x11, #1
        sub         x4, x4, #COMPONENT_COUNT
        sub         x5, x5, #COMPONENT_COUNT
        sub         x6, x6, #COMPONENT_COUNT
        sub         x7, x7, #COMPONENT_COUNT
3:      /* copy four taps (width of cubic window) to far end for overflow
         * address handling
         */
        sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
        eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
        ld1         {v14.4h}, [x13]
.elseif \comp == 2
        ld1         {v14.8h}, [x13]
.elseif \comp == 4
        ld1         {v14.8h,v15.8h}, [x13]
.endif
        add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
        st1         {v14.4h}, [x13]
.elseif \comp == 2
        st1         {v14.8h}, [x13]
.elseif \comp == 4
        st1         {v14.8h,v15.8h}, [x13]
.endif
        /* The high 32 bits of x10 contain the maximum possible iteration
         * count, but if x8 is greater than the low 32 bits of x10 then
         * this indicates that the count must be reduced by one for this
         * iteration to avoid reading past the end of the available data.
         */
        sub         x13, x10, x8
        lsr         x13, x13, #32

        madd        x8, x13, x9, x8
        sub         x8, x8, #(CHUNKSIZE << 16)

        /* prefer to count pixels, rather than vectors, to clarify the tail
         * store case on exit.
         */
        lsl         x13, x13, #VECSHIFT
        cmp         x13, x1
        csel        x13, x1, x13, gt

        sub         x1, x1, x13

        lsl         x13, x13, #COMPONENT_SHIFT

        mov         w14, #0x8000
        movi        v30.8h, #3
        dup         v31.8h, w14

        cmp         x13, #0
        bgt         3f
        cmp         x1, #0
        bgt         1b      /* an extreme case where we shouldn't use code in this structure */
        b           9f

        .align 4
2:      /* Inner loop continues here, but starts at 3:, see end of loop
         * below for explanation.
         */
.if LOOP_OUTPUT_SIZE == 4
        st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
        st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
        st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
        st1         {v8.16b,v9.16b}, [x0], #32
.endif
        /* Inner loop:  here the four x coefficients for each tap are
         * calculated in vector code, and the addresses are calculated in
         * scalar code, and these calculations are interleaved.
         */
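/* Illustrative only: for reference, the conventional cubic (Catmull-Rom)
 * interpolation weights that the Q15 polynomial evaluation below corresponds
 * to, for a fraction x in [0,1):
 *
 *      w0(x) = -0.5*x^3 +     x^2 - 0.5*x
 *      w1(x) =  1.5*x^3 - 2.5*x^2         + 1
 *      w2(x) = -1.5*x^3 + 2.0*x^2 + 0.5*x
 *      w3(x) =  0.5*x^3 - 0.5*x^2
 *
 * A rough floating-point model (the name is invented for this sketch):
 *
 *      static void cubic_weights_model(float x, float w[4])
 *      {
 *          float x2 = x * x, x3 = x2 * x;
 *          w[0] = -0.5f*x3 +      x2 - 0.5f*x;
 *          w[1] =  1.5f*x3 - 2.5f*x2          + 1.0f;
 *          w[2] = -1.5f*x3 + 2.0f*x2 + 0.5f*x;
 *          w[3] =  0.5f*x3 - 0.5f*x2;
 *      }
 *
 * The vector code below evaluates fixed-point equivalents into v0-v3, with
 * the signs of the negative terms folded into the choice of multiply-
 * accumulate versus multiply-subtract in the per-format sections further
 * down.
 */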
3:      ushr        v8.8h, v6.8h, #1        // sxf
        lsr         x14, x2, #(63 - CHUNKSHIFT)
        sqrdmulh    v9.8h, v8.8h, v8.8h     // sxf**2
        add         x2, x2, x3
        sqrdmulh    v10.8h, v9.8h, v8.8h    // sxf**3
        lsr         x15, x2, #(63 - CHUNKSHIFT)
        sshll       v11.4s, v9.4h, #2
        sshll2      v12.4s, v9.8h, #2
        add         x2, x2, x3
        smlsl       v11.4s, v10.4h, v30.4h
        smlsl2      v12.4s, v10.8h, v30.8h
        lsr         x16, x2, #(63 - CHUNKSHIFT)

        shadd       v0.8h, v10.8h, v8.8h
        add         x2, x2, x3
        sub         v0.8h, v9.8h, v0.8h
        lsr         x17, x2, #(63 - CHUNKSHIFT)

        saddw       v1.4s, v11.4s, v9.4h
        saddw2      v13.4s, v12.4s, v9.8h
        add         x2, x2, x3
        shrn        v1.4h, v1.4s, #1
        shrn2       v1.8h, v13.4s, #1
        add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
        sub         v1.8h, v1.8h, v31.8h
        add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)

        saddw       v2.4s, v11.4s, v8.4h
        saddw2      v13.4s, v12.4s, v8.8h
        add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
        shrn        v2.4h, v2.4s, #1
        shrn2       v2.8h, v13.4s, #1
        add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
        neg         v2.8h, v2.8h

        shsub       v3.8h, v10.8h, v9.8h

        /* increment the x fractional parts (overflow is ignored, as the
         * scalar arithmetic shadows this addition with full precision).
         */
        add         v6.8h, v6.8h, v7.8h

        /* At this point we have four pointers in x14-x17, pointing to the
         * four taps in the scratch buffer that must be convolved together
         * to produce an output pixel (one output pixel per pointer).
         * These pointers usually overlap, but their spacing is irregular
         * so resolving the redundancy through L1 is a pragmatic solution.
         *
         * The scratch buffer is made of signed 16-bit data, holding over
         * some extra precision, and overshoot, from the vertical pass.
         *
         * We also have the 16-bit unsigned fixed-point weights for each
         * of the four taps in v0 - v3.  That's eight pixels worth of
         * coefficients when we have only four pointers, so calculations
         * for four more pixels are interleaved with the fetch and permute
         * code for each variant in the following code.
         *
         * The data arrangement is less than ideal for any pixel format,
         * but permuting loads help to mitigate most of the problems.
         *
         * Note also that the two outside taps of a bicubic are negative,
         * but these coefficients are unsigned.  The sign is hard-coded by
         * use of multiply-and-subtract operations.
         */
.if \comp == 1
        /* The uchar1 case.
         * Issue one lanewise ld4.h to load four consecutive pixels from
         * one pointer (one pixel) into four different registers; then load
         * four consecutive s16 values from the next pointer (pixel) into
         * the next lane of those four registers, etc., so that we finish
         * with v12 - v15 representing the four taps, and each lane
         * representing a separate pixel.
         *
         * The first ld4 uses a splat to avoid any false dependency on
         * the previous state of the register.
         */
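        /* Illustrative only: a rough scalar C model of one output pixel of
         * the uchar1 code that follows, assuming tap[] points at four
         * consecutive s16 samples in the scratch buffer and w[] holds the
         * four weights as laid out in v0-v3 (w[1] and w[2] applied by
         * subtraction, as noted above).  The intermediate saturation of
         * sqrshrn is omitted for brevity, and the names are invented here.
         *
         *      #include <stdint.h>
         *
         *      static uint8_t hpixel_model(const int16_t tap[4],
         *                                  const int16_t w[4])
         *      {
         *          int32_t acc = (int32_t)tap[0] * w[0]
         *                      - (int32_t)tap[1] * w[1]
         *                      - (int32_t)tap[2] * w[2]
         *                      + (int32_t)tap[3] * w[3];
         *          int32_t t = (acc + (1 << 14)) >> 15;    // sqrshrn #15
         *          int32_t v = (t + (1 << 5)) >> 6;        // sqrshrun #(VERTBITS - 8)
         *          if (v < 0)   v = 0;
         *          if (v > 255) v = 255;
         *          return (uint8_t)v;
         *      }
         */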
        ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
        lsr         x14, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
        add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
        lsr         x15, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
        add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
        lsr         x16, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
        add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
        lsr         x17, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
        add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
        ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
        ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
        ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]

        smull       v8.4s, v12.4h, v0.4h
        smull2      v9.4s, v12.8h, v0.8h
        smlsl       v8.4s, v13.4h, v1.4h
        smlsl2      v9.4s, v13.8h, v1.8h
        smlsl       v8.4s, v14.4h, v2.4h
        smlsl2      v9.4s, v14.8h, v2.8h
        smlal       v8.4s, v15.4h, v3.4h
        smlal2      v9.4s, v15.8h, v3.8h

        subs        x13, x13, #LOOP_OUTPUT_SIZE

        sqrshrn     v8.4h, v8.4s, #15
        sqrshrn2    v8.8h, v9.4s, #15

        sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
.elseif \comp == 2
        /* The uchar2 case:
         * This time load pairs of values into adjacent lanes in v12 - v15
         * by aliasing them as u32 data; leaving room for only four pixels,
         * so the process has to be done twice.  This also means that the
         * coefficient registers fail to align with the coefficient data
         * (eight separate pixels), so that has to be doubled-up to match.
         */
        ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
        lsr         x14, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
        add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
        lsr         x15, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
        add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
        lsr         x16, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
        add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
        lsr         x17, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3

        /* double-up coefficients to align with component pairs */
        zip1        v16.8h, v0.8h, v0.8h
        add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
        zip1        v17.8h, v1.8h, v1.8h
        zip1        v18.8h, v2.8h, v2.8h
        zip1        v19.8h, v3.8h, v3.8h

        smull       v8.4s, v12.4h, v16.4h
        smull2      v9.4s, v12.8h, v16.8h
        smlsl       v8.4s, v13.4h, v17.4h
        smlsl2      v9.4s, v13.8h, v17.8h
        smlsl       v8.4s, v14.4h, v18.4h
        smlsl2      v9.4s, v14.8h, v18.8h
        smlal       v8.4s, v15.4h, v19.4h
        smlal2      v9.4s, v15.8h, v19.8h

        sqrshrn     v8.4h, v8.4s, #15
        sqrshrn2    v8.8h, v9.4s, #15

        ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
        ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
        ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
        ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]

        /* double-up coefficients to align with component pairs */
        zip2        v16.8h, v0.8h, v0.8h
        zip2        v17.8h, v1.8h, v1.8h
        zip2        v18.8h, v2.8h, v2.8h
        zip2        v19.8h, v3.8h, v3.8h

        smull       v10.4s, v12.4h, v16.4h
        smull2      v11.4s, v12.8h, v16.8h
        smlsl       v10.4s, v13.4h, v17.4h
        smlsl2      v11.4s, v13.8h, v17.8h
        smlsl       v10.4s, v14.4h, v18.4h
        smlsl2      v11.4s, v14.8h, v18.8h
        smlal       v10.4s, v15.4h, v19.4h
        smlal2      v11.4s, v15.8h, v19.8h

        subs        x13, x13, #LOOP_OUTPUT_SIZE

        sqrshrn     v9.4h, v10.4s, #15
        sqrshrn2    v9.8h, v11.4s, #15

        sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
        sqrshrun2   v8.16b, v9.8h, #VERTBITS - 8
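        /* Illustrative only: the zip1/zip2 "double-up" above amounts to the
         * following in C -- each per-pixel weight is repeated so that it
         * lines up with the two interleaved components of its pixel.  The
         * name is invented for this sketch.
         *
         *      #include <stdint.h>
         *
         *      // Expand 8 per-pixel weights into 16 per-component weights.
         *      static void double_up_model(const int16_t w[8], int16_t out[16])
         *      {
         *          for (int i = 0; i < 8; i++) {
         *              out[2 * i]     = w[i];
         *              out[2 * i + 1] = w[i];
         *          }
         *      }
         */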
.elseif \comp == 4
        /* The uchar4 case.
         * This case is comparatively painless because four s16s are the
         * smallest addressable unit for a vmul-by-scalar.  Rather than
         * permute the data, simply arrange the multiplies to suit the way
         * the data comes in.  That's a lot of data, though, so things
         * progress in pairs of pixels at a time.
         */
        ld1         {v12.8h,v13.8h}, [x14]
        lsr         x14, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld1         {v14.8h,v15.8h}, [x15]
        add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
        lsr         x15, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3

        smull       v8.4s, v12.4h, v0.h[0]
        smull       v9.4s, v14.4h, v0.h[1]
        smlsl2      v8.4s, v12.8h, v1.h[0]
        smlsl2      v9.4s, v14.8h, v1.h[1]
        smlsl       v8.4s, v13.4h, v2.h[0]
        smlsl       v9.4s, v15.4h, v2.h[1]
        smlal2      v8.4s, v13.8h, v3.h[0]
        smlal2      v9.4s, v15.8h, v3.h[1]

        /* And two more... */
        ld1         {v12.8h,v13.8h}, [x16]
        add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
        lsr         x16, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3
        ld1         {v14.8h,v15.8h}, [x17]
        add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
        lsr         x17, x2, #(63 - CHUNKSHIFT)
        add         x2, x2, x3

        sqrshrn     v8.4h, v8.4s, #15
        add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
        sqrshrn2    v8.8h, v9.4s, #15

        smull       v10.4s, v12.4h, v0.h[2]
        smull       v11.4s, v14.4h, v0.h[3]
        smlsl2      v10.4s, v12.8h, v1.h[2]
        smlsl2      v11.4s, v14.8h, v1.h[3]
        smlsl       v10.4s, v13.4h, v2.h[2]
        smlsl       v11.4s, v15.4h, v2.h[3]
        smlal2      v10.4s, v13.8h, v3.h[2]
        smlal2      v11.4s, v15.8h, v3.h[3]

        sqrshrn     v9.4h, v10.4s, #15
        sqrshrn2    v9.8h, v11.4s, #15

        sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
        sqrshrun2   v8.16b, v9.8h, #VERTBITS - 8

        /* And two more... */
        ld1         {v12.8h,v13.8h}, [x14]
        ld1         {v14.8h,v15.8h}, [x15]

        smull       v10.4s, v12.4h, v0.h[4]
        smull       v11.4s, v14.4h, v0.h[5]
        smlsl2      v10.4s, v12.8h, v1.h[4]
        smlsl2      v11.4s, v14.8h, v1.h[5]
        smlsl       v10.4s, v13.4h, v2.h[4]
        smlsl       v11.4s, v15.4h, v2.h[5]
        smlal2      v10.4s, v13.8h, v3.h[4]
        smlal2      v11.4s, v15.8h, v3.h[5]

        /* And two more... */
        ld1         {v12.8h,v13.8h}, [x16]
        ld1         {v14.8h,v15.8h}, [x17]

        subs        x13, x13, #LOOP_OUTPUT_SIZE

        sqrshrn     v9.4h, v10.4s, #15
        sqrshrn2    v9.8h, v11.4s, #15

        smull       v10.4s, v12.4h, v0.h[6]
        smull       v11.4s, v14.4h, v0.h[7]
        smlsl2      v10.4s, v12.8h, v1.h[6]
        smlsl2      v11.4s, v14.8h, v1.h[7]
        smlsl       v10.4s, v13.4h, v2.h[6]
        smlsl       v11.4s, v15.4h, v2.h[7]
        smlal2      v10.4s, v13.8h, v3.h[6]
        smlal2      v11.4s, v15.8h, v3.h[7]

        sqrshrn     v10.4h, v10.4s, #15
        sqrshrn2    v10.8h, v11.4s, #15

        sqrshrun    v9.8b, v9.8h, #VERTBITS - 8
        sqrshrun2   v9.16b, v10.8h, #VERTBITS - 8
.endif
        bgt         2b      /* continue inner loop */
        /* The inner loop has already been limited to ensure that none of
         * the earlier iterations could overfill the output, so the store
         * appears within the loop but after the conditional branch (at the
         * top).  At the end, provided it won't overfill, perform the final
         * store here.  If it would, then break out to the tricky tail case
         * instead.
         */
        blt         1f
        /* Store the amount of data appropriate to the configuration of the
         * instance being assembled.
         */
.if LOOP_OUTPUT_SIZE == 4
        st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
        st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
        st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
        st1         {v8.16b,v9.16b}, [x0], #32
.endif
        b           1b      /* resume outer loop */
        /* Partial tail store case:
         * Different versions of the code need different subsets of the
         * following partial stores.  Here the number of components and the
         * size of the chunk of data produced by each inner loop iteration
         * are tested to figure out whether or not each phrase is relevant.
         */
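        /* Illustrative only: ignoring the conditional assembly and the
         * shuffling of data down v8/v9, the cascade below is roughly
         * equivalent to storing the remaining bytes one power of two at a
         * time, largest first.  The name and parameters are invented for
         * this sketch.
         *
         *      #include <stddef.h>
         *      #include <stdint.h>
         *      #include <string.h>
         *
         *      static void tail_store_model(uint8_t **dst,
         *                                   const uint8_t *pixels,
         *                                   size_t remaining)
         *      {
         *          for (size_t bit = 16; bit >= 1; bit >>= 1) {
         *              if (remaining & bit) {
         *                  memcpy(*dst, pixels, bit);
         *                  pixels += bit;
         *                  *dst += bit;
         *              }
         *          }
         *      }
         */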
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:      tst         x13, #16
        beq         1f
        st1         {v8.16b}, [x0], #16
        mov         v8.16b, v9.16b
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:      tst         x13, #8
        beq         1f
        st1         {v8.8b}, [x0], #8
        ext         v8.16b, v8.16b, v8.16b, #8
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:      tst         x13, #4
        beq         1f
        st1         {v8.s}[0], [x0], #4
        ext         v8.8b, v8.8b, v8.8b, #4
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:      tst         x13, #2
        beq         1f
        st1         {v8.h}[0], [x0], #2
        ext         v8.8b, v8.8b, v8.8b, #2
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:      tst         x13, #1
        beq         1f
        st1         {v8.b}[0], [x0], #1
.endif
1:
9:      mov         sp, x18
        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicResizeB\comp\()_K)
.endr