1 /* 2 * Copyright 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka (at) nokia.com) 24 */ 25 26 /* 27 * This file contains implementations of NEON optimized pixel processing 28 * functions. There is no full and detailed tutorial, but some functions 29 * (those which are exposing some new or interesting features) are 30 * extensively commented and can be used as examples. 31 * 32 * You may want to have a look at the comments for following functions: 33 * - pixman_composite_over_8888_0565_asm_neon 34 * - pixman_composite_over_n_8_0565_asm_neon 35 */ 36 37 /* Prevent the stack from becoming executable for no reason... 
*/
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fallback
 * to simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
 * performs OVER compositing operation. Function fast_composite_over_8888_0565
 * from pixman-fast-path.c does the same in C and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * Template macro quite conveniently takes care of emitting all the necessary
 * code for memory reading and writing (including quite tricky cases of
 * handling unaligned leading/trailing pixels), so we only need to deal with
 * the data in NEON registers.
 *
 * NEON registers allocation in general is recommended to be the following:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following registers allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5 - contain loaded destination pixels (they are needed)
 * d28, d29 - place for storing the result (destination pixels)
 */

/*
 * Step one. We need to have some code to do some arithmetics on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. Common template macro can optionally
 * make our life a bit easier by doing R, G, B, A color components
 * deinterleaving for 32bpp pixel formats (and this feature is used in
 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
 * actually use d0 register for blue channel (a vector of eight 8-bit
 * values), d1 register for green, d2 for red and d3 for alpha. This
 * simple conversion can be also done with a few NEON instructions:
 *
 * Packed to planar conversion:
 * vuzp.8 d0, d1
 * vuzp.8 d2, d3
 * vuzp.8 d1, d3
 * vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 * vzip.8 d0, d2
 * vzip.8 d1, d3
 * vzip.8 d2, d3
 * vzip.8 d0, d1
 *
 * But pixel can be loaded directly in planar format using VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable, that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6     /* dst_r * (255 - src_a) */
    vmull.u8    q11, d3, d7     /* dst_g * (255 - src_a) */
    vmull.u8    q12, d3, d30    /* dst_b * (255 - src_a) */
    /* divide by 255 with rounding: (x + ((x + 128) >> 8) + 128) >> 8,
       done here as vrshr + vraddhn on the 16-bit products */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* add src channels with saturation */
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we got almost everything that we need. Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code, which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *   head
 *   while (...) {
 *     tail
 *     head
 *   }
 *   tail
 *
 * It may look a bit weird, but this setup allows to hide instruction
 * latencies better and also utilize dual-issue capability more
 * efficiently (make pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Now it also got some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetics.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, destination buffer is already
 * 16 bytes aligned here (which is good for performance).
 *
 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * destination buffer (DST_R and DST_W).
 *
 * Another new thing is 'cache_preload' macro. It is used for prefetching
 * data into CPU L2 cache and improve performance when dealing with large
 * images which are far larger than cache size. It uses one argument
 * (actually two, but they need to be the same here) - number of pixels
 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
 * details about this macro. Moreover, if good performance is needed
 * the code from this macro needs to be copied into '*_tail_head' macro
 * and mixed with the rest of code for optimal instructions scheduling.
 * We are actually doing it below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
 * and 'cache_preload' macro) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
 * macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using 'generate_composite_function' macro
 * to put all the stuff together. We are specifying the name of the function
 * which we want to get, number of bits per pixel for the source, mask and
 * destination (0 if unused, like mask in this case). Next come some bit
 * flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written, for write-only buffer we would use
 *                             FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for 32bpp format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum what can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros, these are 'default_init',
 * 'default_cleanup' here which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else) followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON registers allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * OVER with a solid (constant) source on an r5g6b5 destination. Same math
 * as over_8888_0565 above, but the planar source color in d0-d3 is set up
 * once by the init macro, which also pre-inverts source alpha in d3, so
 * the per-block head needs no 'vmvn'.
 */
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

/* Load the solid source color from the stack and replicate its b/g/r/a
   components into d0/d1/d2/d3 (planar form expected by the head macro). */
.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/* SRC operator: convert a8r8g8b8 (planar d0-d2, alpha ignored) to r5g6b5 */
.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8      /* green */
    vshll.u8    q14, d2, #8     /* red   */
    vshll.u8    q9, d0, #8      /* blue  */
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    /* pack r/g/b into 16-bit 565 words in q14 */
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

/* SRC operator: expand r5g6b5 to planar 8888, forcing alpha (d31) to 255 */
.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8    /* red   */
    vshrn.u16   d29, q0, #3    /* green */
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255      /* alpha = opaque */
    vsri.u8     d30, d30, #5   /* replicate high bits into low ones */
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2    /* blue  */
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

/* ADD operator for 8bpp: dst = saturate(src + dst), 32 pixels per block */
.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

/* ADD for 32bpp: reuses the add_8_8 head/tail (same saturating byte adds),
   only the load/store width and per-block pixel counts differ */
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OUT_REVERSE operator: dst = dst * (255 - src_alpha) / 255 */
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* divide the 16-bit products by 255 with rounding */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER operator (premultiplied): dst = src + dst * (255 - src_alpha) / 255,
   i.e. OUT_REVERSE followed by a saturating add of the source */
.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved
source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    /* add the (constant) source with saturation */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
        vrshr.u16   q2, q10, #8
        vrshr.u16   q3, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q2, q10
        vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vqadd.u8    q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vqadd.u8    q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Load the solid source from the stack, replicate its components into
   planar d0-d3 and keep the inverted source alpha in d24 */
.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER_REVERSE with solid source: destination pixels are loaded into
   d0-d3 (note the swapped src/dst base registers in the generator call
   below), the solid 'source' sits in d4-d7, and d22 holds the inverted
   *destination* alpha */
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

/* Load the solid color from the stack into planar d4-d7 */
.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/* OVER of a8r8g8b8 source with an a8 mask (in d24) onto an r5g6b5
   destination: source channels in d8-d11 are first multiplied by the
   mask (IN), then blended over the 565 destination */
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8    q1,  d24, d9
    vmull.u8    q6,  d24, d10
    vmull.u8    q7,  d24, d11
        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
        vshrn.u16   d7,  q2, #3
        vsli.u16    q2,  q2, #5
    vrshr.u16   q8,  q0, #8     /* IN for SRC pixels (part2) */
    vrshr.u16   q9,  q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q9
    vraddhn.u16 d2,  q6, q10
    vraddhn.u16 d3,  q7, q11
        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
        vsri.u8     d7,  d7, #6
    vmvn.8      d3,  d3
        vshrn.u16   d30, q2, #2
    vmull.u8    q8,  d3, d6     /* now do alpha blending */
    vmull.u8    q9,  d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8,  #8
    vrshr.u16   q11, q9,  #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8,  q13
    vraddhn.u16 d27, q9,  q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2,  d16
    /* 1 cycle bubble */
    vqadd.u8    q9,  q0,  q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8,  #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9,  #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
        vshrn.u16   d6,  q2,  #8
    fetch_mask_pixblock
        vshrn.u16   d7,  q2,  #3
    fetch_src_pixblock
    vmull.u8    q6,  d24, d10
        vrshr.u16   q13, q8,  #8
        vrshr.u16   q11, q9,  #8
        vrshr.u16   q15, q10, #8
        vraddhn.u16 d16, q8,  q13
        vraddhn.u16 d27, q9,  q11
        vraddhn.u16 d26, q10, q15
        vqadd.u8    d16, d2,  d16
    vmull.u8    q1,  d24, d9
        vqadd.u8    q9,  q0,  q13
        vshll.u8    q14, d16, #8
    vmull.u8    q0,  d24, d8
        vshll.u8    q8,  d19, #8
        vshll.u8    q9,  d18, #8
        vsri.u16    q14, q8,  #5
    vmull.u8    q7,  d24, d11
        vsri.u16    q14, q9,  #11

    cache_preload 8, 8

    vsli.u16    q2,  q2,  #5
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q9,  q1,  #8
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q11, q7,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q9
    vraddhn.u16 d2,  q6,  q10
    vraddhn.u16 d3,  q7,  q11
    vsri.u8     d6,  d6,  #5
    vsri.u8     d7,  d7,  #6
    vmvn.8      d3,  d3
    vshrn.u16   d30, q2,  #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8,  d3,  d6
    vmull.u8    q9,  d3,  d7
    vmull.u8    q10, d3,  d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization of solid mask.
 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save d8-d15 registers which are callee saved according
 * to ABI. These registers are restored from 'cleanup' macro. All the
 * other NEON registers are caller saved, so can be clobbered freely
 * without introducing any problems.
954 */ 955 .macro pixman_composite_over_n_8_0565_init 956 add DUMMY, sp, #ARGS_STACK_OFFSET 957 vpush {d8-d15} 958 vld1.32 {d11[0]}, [DUMMY] 959 vdup.8 d8, d11[0] 960 vdup.8 d9, d11[1] 961 vdup.8 d10, d11[2] 962 vdup.8 d11, d11[3] 963 .endm 964 965 .macro pixman_composite_over_n_8_0565_cleanup 966 vpop {d8-d15} 967 .endm 968 969 generate_composite_function \ 970 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ 971 FLAG_DST_READWRITE, \ 972 8, /* number of pixels, processed in a single block */ \ 973 5, /* prefetch distance */ \ 974 pixman_composite_over_n_8_0565_init, \ 975 pixman_composite_over_n_8_0565_cleanup, \ 976 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 977 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 978 pixman_composite_over_8888_8_0565_process_pixblock_tail_head 979 980 /******************************************************************************/ 981 982 .macro pixman_composite_over_8888_n_0565_init 983 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 984 vpush {d8-d15} 985 vld1.32 {d24[0]}, [DUMMY] 986 vdup.8 d24, d24[3] 987 .endm 988 989 .macro pixman_composite_over_8888_n_0565_cleanup 990 vpop {d8-d15} 991 .endm 992 993 generate_composite_function \ 994 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ 995 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 996 8, /* number of pixels, processed in a single block */ \ 997 5, /* prefetch distance */ \ 998 pixman_composite_over_8888_n_0565_init, \ 999 pixman_composite_over_8888_n_0565_cleanup, \ 1000 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1001 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1002 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 1003 28, /* dst_w_basereg */ \ 1004 4, /* dst_r_basereg */ \ 1005 8, /* src_basereg */ \ 1006 24 /* mask_basereg */ 1007 1008 /******************************************************************************/ 1009 1010 .macro pixman_composite_src_0565_0565_process_pixblock_head 1011 .endm 
.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the solid 8-bit source value across all lanes of q0/q1 */
.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the solid 16-bit source value across all lanes of q0/q1 */
.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the solid 32-bit source value across all lanes of q0/q1 */
.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

/*
 * x8r8g8b8 -> a8r8g8b8: force the alpha byte to 0xFF by OR-ing in the
 * 0xFF000000 constant that the init macro below prepares in q2.
 */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr     q0, q0, q2
    vorr     q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr     q0, q0, q2
    vorr     q1, q1, q2
    cache_preload 8, 8
.endm

/* q2 = 0xFF000000 in every 32-bit lane */
.macro pixman_composite_src_x888_8888_init
    vmov.u8  q2, #0xFF
    vshl.u32 q2, q2, #24
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8,  d24, d0
    vmull.u8    q9,  d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8,  q8,  #8
    vrsra.u16   q9,  q9,  #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    vrshrn.u16  d28, q8,  #8
    vrshrn.u16  d29, q9,  #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
.endm

/* NEON work interleaved with ARM ('PF ...') advanced-prefetch bookkeeping */
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
                                    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q8,  #8
                                    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q9,  #8
                                    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q10, #8
                                    PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q11, #8
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8,  d24, d0
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q9,  d24, d1
                                    PF subge PF_X, PF_X, ORIG_W
/* second half of pixman_composite_src_n_8_8888_process_pixblock_tail_head */
    vmull.u8    q10, d24, d2
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d3
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q8,  q8,  #8
    vrsra.u16   q9,  q9,  #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

/*
 * Load the solid source color from the stack and split it into the
 * per-channel registers d0-d3 (d3 is loaded first and duplicated last
 * so that its alpha lane survives the vdup sequence).
 */
.macro pixman_composite_src_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

/*
 * Note: no basereg arguments are passed here, so the defaults declared
 * by generate_composite_function apply.  The call therefore must end
 * after the tail_head argument; a stray ", \" continuation here would
 * splice the following blank line into the invocation and leave a
 * dangling empty argument instead of the defaults.
 */
generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

/* a8 solid-source IN a8 mask: multiply mask (d24-d27) by source alpha d16 */
.macro pixman_composite_src_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d16
    vmull.u8    q1, d25, d16
    vmull.u8    q2, d26, d16
    vmull.u8    q3, d27, d16
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail
    vrshrn.u16  d28, q0, #8
    vrshrn.u16  d29, q1, #8
    vrshrn.u16  d30, q2, #8
    vrshrn.u16  d31, q3, #8
.endm

/* NEON work interleaved with ARM ('PF ...') advanced-prefetch bookkeeping */
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
    fetch_mask_pixblock
                                    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q0, #8
                                    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q1, #8
                                    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q2, #8
                                    PF subne PF_CTL, PF_CTL, #1
/* second half of pixman_composite_src_n_8_8_process_pixblock_tail_head */
    vrshrn.u16  d31, q3, #8
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d24, d16
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q1, d25, d16
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q2, d26, d16
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q3, d27, d16
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

/* Load the solid source and replicate its alpha byte into all lanes of d16 */
.macro pixman_composite_src_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d16[0]}, [DUMMY]
    vdup.8      d16, d16[3]
.endm

.macro pixman_composite_src_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8_init, \
    pixman_composite_src_n_8_8_cleanup, \
    pixman_composite_src_n_8_8_process_pixblock_head, \
    pixman_composite_src_n_8_8_process_pixblock_tail, \
    pixman_composite_src_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q6,  d24, d8
    vmull.u8    q7,  d24, d9
    vmull.u8    q8,  d24, d10
    vmull.u8    q9,  d24, d11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0,  q6, q10
    vraddhn.u16 d1,  q7, q11
    vraddhn.u16 d2,  q8, q12
    vraddhn.u16 d3,  q9, q13
    vmvn.8      d25, d3     /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8,  d25, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm

/* Pipelined tail+head with interleaved dst/mask prefetch ('PF ...') */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
    vrshr.u16   q6, q10, #8
                                    PF add PF_X, PF_X, #8
    vrshr.u16   q7, q11, #8
                                    PF tst PF_CTL, #0x0F
    vraddhn.u16 d28, q14, q8
                                    PF addne PF_X, PF_X, #8
    vraddhn.u16 d29, q15, q9
                                    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d30, q6, q10
                                    PF cmp PF_X, ORIG_W
    vraddhn.u16 d31, q7, q11
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q14, q0, q14
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vmvn.8      d25, d3
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

/* Solid source split into d8-d11; d8-d15 are callee-saved, hence the vpush */
.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER of solid a8 source (alpha replicated in d8) with a8 mask in d24-d27 */
.macro pixman_composite_over_n_8_8_process_pixblock_head
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d8
    vmull.u8    q6,  d26, d8
    vmull.u8    q7,  d27, d8
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0,  q0, q10
    vraddhn.u16 d1,  q1, q11
    vraddhn.u16 d2,  q6, q12
    vraddhn.u16 d3,  q7, q13
    vmvn.8      q12, q0
    vmvn.8      q13, q1
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8  {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8  {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

/* Load the solid source and replicate its alpha byte into all lanes of d8 */
.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q7,  d27, d11
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q10
    vraddhn.u16 d2,  q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vrshr.u16   q10, q7,  #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3,  q7,  q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12
    vmvn.8      d26, d26
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Solid source split into d8-d11; d8-d15 are callee-saved, hence the vpush */
.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q10
    vraddhn.u16 d2,  q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
     * and put data into d16 - blue, d17 - green, d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2,  q2, #5
    vsri.u8     d18, d18, #5
    vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q6,  d16, d24
    vmull.u8    q7,  d17, d25
    vmull.u8    q11, d18, d26
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q14, q7,  #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8,  q0, q8
    vqadd.u8    d18, d2, d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm

/* Pipelined tail+head; note d22 substitutes for d18 in the tail part here */
.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d22, q15, q11
    /* process_pixblock_head */
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q6, d26, d10
    vqadd.u8    q8, q0, q8
    vmull.u8    q0, d24, d8
    vqadd.u8    d22, d2, d22
    vmull.u8    q1, d25, d9
    /*
     * convert the result in d16, d17, d22 to r5g6b5 and store
     * it into {d28, d29}
     */
    vshll.u8    q14, d22, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vmull.u8    q9, d11, d25
    vsri.u16    q14, q10, #5
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vsri.u16    q14, q15, #11
    cache_preload 8, 8
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar
     * 8-bit format and put data into d16 - blue, d17 - green,
     * d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d17, d17, #6
    vsri.u8     d18, d18, #5
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q7, d17, d25
    vmull.u8    q6, d16, d24
    vmull.u8    q11, d18, d26
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

/* Solid source split into d8-d11; d8-d15 are callee-saved, hence the vpush */
.macro pixman_composite_over_n_8888_0565_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head

/******************************************************************************/

/* IN: multiply destination by the solid alpha replicated in d3 */
.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* and destination data in {d4, d5, d6, d7} */
    vmull.u8    q8,  d4, d3
    vmull.u8    q9,  d5, d3
    vmull.u8    q10, d6, d3
    vmull.u8    q11, d7, d3
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q8, q14
    vraddhn.u16 d29, q9, q15
    vraddhn.u16 d30, q10, q12
    vraddhn.u16 d31, q11, q13
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    vld1.8  {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    vst1.8  {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Load the solid source and replicate its alpha byte into all lanes of d3 */
.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8  {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8  {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/* Load the solid source and replicate its alpha byte into all lanes of d11 */
.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8  {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8  {d4, d5, d6, d7}, [DST_R, :128]!
1954 fetch_mask_pixblock 1955 fetch_src_pixblock 1956 cache_preload 32, 32 1957 pixman_composite_add_8_8_8_process_pixblock_head 1958 .endm 1959 1960 .macro pixman_composite_add_8_8_8_init 1961 .endm 1962 1963 .macro pixman_composite_add_8_8_8_cleanup 1964 .endm 1965 1966 generate_composite_function \ 1967 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ 1968 FLAG_DST_READWRITE, \ 1969 32, /* number of pixels, processed in a single block */ \ 1970 5, /* prefetch distance */ \ 1971 pixman_composite_add_8_8_8_init, \ 1972 pixman_composite_add_8_8_8_cleanup, \ 1973 pixman_composite_add_8_8_8_process_pixblock_head, \ 1974 pixman_composite_add_8_8_8_process_pixblock_tail, \ 1975 pixman_composite_add_8_8_8_process_pixblock_tail_head 1976 1977 /******************************************************************************/ 1978 1979 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head 1980 /* expecting source data in {d0, d1, d2, d3} */ 1981 /* destination data in {d4, d5, d6, d7} */ 1982 /* mask in {d24, d25, d26, d27} */ 1983 vmull.u8 q8, d27, d0 1984 vmull.u8 q9, d27, d1 1985 vmull.u8 q10, d27, d2 1986 vmull.u8 q11, d27, d3 1987 /* 1 cycle bubble */ 1988 vrsra.u16 q8, q8, #8 1989 vrsra.u16 q9, q9, #8 1990 vrsra.u16 q10, q10, #8 1991 vrsra.u16 q11, q11, #8 1992 .endm 1993 1994 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 1995 /* 2 cycle bubble */ 1996 vrshrn.u16 d28, q8, #8 1997 vrshrn.u16 d29, q9, #8 1998 vrshrn.u16 d30, q10, #8 1999 vrshrn.u16 d31, q11, #8 2000 vqadd.u8 q14, q2, q14 2001 /* 1 cycle bubble */ 2002 vqadd.u8 q15, q3, q15 2003 .endm 2004 2005 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2006 fetch_src_pixblock 2007 vrshrn.u16 d28, q8, #8 2008 fetch_mask_pixblock 2009 vrshrn.u16 d29, q9, #8 2010 vmull.u8 q8, d27, d0 2011 vrshrn.u16 d30, q10, #8 2012 vmull.u8 q9, d27, d1 2013 vrshrn.u16 d31, q11, #8 2014 vmull.u8 q10, d27, d2 2015 vqadd.u8 q14, q2, q14 2016 vmull.u8 q11, d27, d3 2017 vqadd.u8 q15, q3, 
q15 2018 vrsra.u16 q8, q8, #8 2019 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2020 vrsra.u16 q9, q9, #8 2021 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2022 vrsra.u16 q10, q10, #8 2023 2024 cache_preload 8, 8 2025 2026 vrsra.u16 q11, q11, #8 2027 .endm 2028 2029 generate_composite_function \ 2030 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 2031 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2032 8, /* number of pixels, processed in a single block */ \ 2033 10, /* prefetch distance */ \ 2034 default_init, \ 2035 default_cleanup, \ 2036 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2037 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2038 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2039 2040 generate_composite_function_single_scanline \ 2041 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 2042 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2043 8, /* number of pixels, processed in a single block */ \ 2044 default_init, \ 2045 default_cleanup, \ 2046 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2047 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2048 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2049 2050 /******************************************************************************/ 2051 2052 generate_composite_function \ 2053 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ 2054 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2055 8, /* number of pixels, processed in a single block */ \ 2056 5, /* prefetch distance */ \ 2057 default_init, \ 2058 default_cleanup, \ 2059 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2060 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2061 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2062 28, /* dst_w_basereg */ \ 2063 4, /* dst_r_basereg */ \ 2064 0, /* src_basereg */ \ 2065 27 /* mask_basereg */ 2066 2067 
/******************************************************************************/

.macro pixman_composite_add_n_8_8888_init
    /* load the solid source color and splat its channels to d0-d3
     * (d0=b, d1=g, d2=r, d3=a) */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_add_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8888_init, \
    pixman_composite_add_n_8_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_add_8888_n_8888_init
    /* the solid mask lives 8 bytes past ARGS_STACK_OFFSET in the stacked
     * argument area; only its alpha byte is used, replicated into d27 */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d27[0]}, [DUMMY]
    vdup.8      d27, d27[3]
.endm

.macro pixman_composite_add_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8888_n_8888_init, \
    pixman_composite_add_8888_n_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in' */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3      /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    /* finish the rounded /255 of dest * (255 - srca); result goes to
     * {d28, d29, d30, d31} */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* NOTE(review): no comma after the tail_head argument below; gas also
 * separates macro arguments on whitespace so this still parses, but it is
 * inconsistent with the other invocations -- confirm it is intentional. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    /* OVER reuses the out_reverse head: src IN mask, dest * inv-alpha */
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    /* OVER = masked source + dest * (255 - srca), saturating */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_8888_n_8888_init
    /* NOTE(review): #48 is presumably ARGS_STACK_OFFSET + 8, i.e. the
     * solid-mask argument slot (address taken before the vpush changes
     * sp) -- confirm against ARGS_STACK_OFFSET in the header. */
    add         DUMMY, sp, #48
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* NOTE(review): same missing-comma pattern after the tail_head argument
 * in the next two invocations (see note above); gas accepts it. */
generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    /* plain 24bpp copy: store previous block, fetch the next one
     * (24bpp stores cannot use an alignment hint) */
    vst3.8      {d0, d1, d2}, [DST_W]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    vswp        d0, d2      /* swap the red and blue channels */
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    vst4.8      {d0, d1, d2, d3}, [DST_W]!
    fetch_src_pixblock
    vswp        d0, d2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_0888_8888_rev_init
    veor        d3, d3, d3  /* constant zero for the written alpha lane */
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    /* widen two channels to 16 bit, positioned for r5g6b5 packing */
    vshll.u8    q8, d1, #8
    vshll.u8    q9, d2, #8
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    /* pack into r5g6b5 by shifting channel fields in with vsri */
    vshll.u8    q14, d0, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
    vshll.u8    q14, d0, #8
    fetch_src_pixblock
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
    vshll.u8    q9, d2, #8
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    /* premultiply: r, g, b (d0-d2) scaled by alpha (d3) */
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    /* rounded /255; vswp moves alpha into the output register set
     * (d28-d31), with the channel order reversed relative to rpixbuf */
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d30, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    fetch_src_pixblock
    vraddhn.u16 d30, q11, q8
    PF add PF_X, PF_X, #8
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #8
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    PF cmp PF_X, ORIG_W
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
    /* premultiply: identical to the pixbuf variant above */
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
    /* rounded /255; channels land in d28-d30 in the opposite order to
     * the pixbuf variant (no r/b swap) */
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d28, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d30, q13, q10
.endm

.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    fetch_src_pixblock
    vraddhn.u16 d28, q11, q8
    PF add PF_X, PF_X, #8
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #8
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d30, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    PF cmp PF_X, ORIG_W
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    vmvn.8      d7, d15
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vmull.u8    q8, d7, d4
    vmull.u8    q9, d7, d5
    vmull.u8    q13, d7, d6
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm

.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q13, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q13
    vqadd.u8    q0, q0, q14
    vqadd.u8    q1, q1, q15
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
    fetch_mask_pixblock
    pixman_composite_over_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_over_0565_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_0565_n_0565_init
    /* solid mask at ARGS_STACK_OFFSET + 8; only its alpha byte is used,
     * replicated into d15 where the shared pixblock macros expect it */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_0565_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_0565_n_0565_init, \
    pixman_composite_over_0565_n_0565_cleanup, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_add_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm

.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
    vqadd.u8    q0, q0, q2
    vqadd.u8    q1, q1, q3
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
    fetch_mask_pixblock
    pixman_composite_add_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_add_0565_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_add_0565_8_0565_process_pixblock_head, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q5, d6, d5, d4
    /* destination pixel data is in {d4, d5, d6, xx} */
    vmvn.8      d24, d15    /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
.endm

.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vraddhn.u16 d0, q14, q8
    vraddhn.u16 d1, q15, q9
    vraddhn.u16 d2, q12, q10
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_0565_process_pixblock_tail
    vld1.16     {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_0565_process_pixblock_head
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    15, /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
    /* src is in d0 */
    /* destination pixel data is in {d4, d5, d6, d7} */
    vmvn.8      d1, d0      /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d1, d4
    vmull.u8    q9, d1, d5
    vmull.u8    q10, d1, d6
    vmull.u8    q11, d1, d7
.endm

.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    /* 32bpp result is in {d28, d29, d30, d31} */
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

/* Nearest-neighbour scaled scanline variants, reusing the pixblock
 * processing macros defined earlier in this file. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/ 2832 2833 /* Supplementary macro for setting function attributes */ 2834 .macro pixman_asm_function fname 2835 .func fname 2836 .global fname 2837 #ifdef __ELF__ 2838 .hidden fname 2839 .type fname, %function 2840 #endif 2841 fname: 2842 .endm 2843 2844 /* 2845 * Bilinear scaling support code which tries to provide pixel fetching, color 2846 * format conversion, and interpolation as separate macros which can be used 2847 * as the basic building blocks for constructing bilinear scanline functions. 2848 */ 2849 2850 .macro bilinear_load_8888 reg1, reg2, tmp 2851 mov TMP1, X, asr #16 2852 add X, X, UX 2853 add TMP1, TOP, TMP1, asl #2 2854 vld1.32 {reg1}, [TMP1], STRIDE 2855 vld1.32 {reg2}, [TMP1] 2856 .endm 2857 2858 .macro bilinear_load_0565 reg1, reg2, tmp 2859 mov TMP1, X, asr #16 2860 add X, X, UX 2861 add TMP1, TOP, TMP1, asl #1 2862 vld1.32 {reg2[0]}, [TMP1], STRIDE 2863 vld1.32 {reg2[1]}, [TMP1] 2864 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp 2865 .endm 2866 2867 .macro bilinear_load_and_vertical_interpolate_two_8888 \ 2868 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 2869 2870 bilinear_load_8888 reg1, reg2, tmp1 2871 vmull.u8 acc1, reg1, d28 2872 vmlal.u8 acc1, reg2, d29 2873 bilinear_load_8888 reg3, reg4, tmp2 2874 vmull.u8 acc2, reg3, d28 2875 vmlal.u8 acc2, reg4, d29 2876 .endm 2877 2878 .macro bilinear_load_and_vertical_interpolate_four_8888 \ 2879 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 2880 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2881 2882 bilinear_load_and_vertical_interpolate_two_8888 \ 2883 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi 2884 bilinear_load_and_vertical_interpolate_two_8888 \ 2885 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2886 .endm 2887 2888 .macro bilinear_load_and_vertical_interpolate_two_0565 \ 2889 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi 2890 2891 mov TMP1, X, 
asr #16 2892 add X, X, UX 2893 add TMP1, TOP, TMP1, asl #1 2894 mov TMP2, X, asr #16 2895 add X, X, UX 2896 add TMP2, TOP, TMP2, asl #1 2897 vld1.32 {acc2lo[0]}, [TMP1], STRIDE 2898 vld1.32 {acc2hi[0]}, [TMP2], STRIDE 2899 vld1.32 {acc2lo[1]}, [TMP1] 2900 vld1.32 {acc2hi[1]}, [TMP2] 2901 convert_0565_to_x888 acc2, reg3, reg2, reg1 2902 vzip.u8 reg1, reg3 2903 vzip.u8 reg2, reg4 2904 vzip.u8 reg3, reg4 2905 vzip.u8 reg1, reg2 2906 vmull.u8 acc1, reg1, d28 2907 vmlal.u8 acc1, reg2, d29 2908 vmull.u8 acc2, reg3, d28 2909 vmlal.u8 acc2, reg4, d29 2910 .endm 2911 2912 .macro bilinear_load_and_vertical_interpolate_four_0565 \ 2913 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 2914 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2915 2916 mov TMP1, X, asr #16 2917 add X, X, UX 2918 add TMP1, TOP, TMP1, asl #1 2919 mov TMP2, X, asr #16 2920 add X, X, UX 2921 add TMP2, TOP, TMP2, asl #1 2922 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE 2923 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE 2924 vld1.32 {xacc2lo[1]}, [TMP1] 2925 vld1.32 {xacc2hi[1]}, [TMP2] 2926 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 2927 mov TMP1, X, asr #16 2928 add X, X, UX 2929 add TMP1, TOP, TMP1, asl #1 2930 mov TMP2, X, asr #16 2931 add X, X, UX 2932 add TMP2, TOP, TMP2, asl #1 2933 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE 2934 vzip.u8 xreg1, xreg3 2935 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE 2936 vzip.u8 xreg2, xreg4 2937 vld1.32 {yacc2lo[1]}, [TMP1] 2938 vzip.u8 xreg3, xreg4 2939 vld1.32 {yacc2hi[1]}, [TMP2] 2940 vzip.u8 xreg1, xreg2 2941 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 2942 vmull.u8 xacc1, xreg1, d28 2943 vzip.u8 yreg1, yreg3 2944 vmlal.u8 xacc1, xreg2, d29 2945 vzip.u8 yreg2, yreg4 2946 vmull.u8 xacc2, xreg3, d28 2947 vzip.u8 yreg3, yreg4 2948 vmlal.u8 xacc2, xreg4, d29 2949 vzip.u8 yreg1, yreg2 2950 vmull.u8 yacc1, yreg1, d28 2951 vmlal.u8 yacc1, yreg2, d29 2952 vmull.u8 yacc2, yreg3, d28 2953 vmlal.u8 yacc2, yreg4, d29 2954 .endm 2955 2956 .macro bilinear_store_8888 
numpix, tmp1, tmp2 2957 .if numpix == 4 2958 vst1.32 {d0, d1}, [OUT, :128]! 2959 .elseif numpix == 2 2960 vst1.32 {d0}, [OUT, :64]! 2961 .elseif numpix == 1 2962 vst1.32 {d0[0]}, [OUT, :32]! 2963 .else 2964 .error bilinear_store_8888 numpix is unsupported 2965 .endif 2966 .endm 2967 2968 .macro bilinear_store_0565 numpix, tmp1, tmp2 2969 vuzp.u8 d0, d1 2970 vuzp.u8 d2, d3 2971 vuzp.u8 d1, d3 2972 vuzp.u8 d0, d2 2973 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 2974 .if numpix == 4 2975 vst1.16 {d2}, [OUT, :64]! 2976 .elseif numpix == 2 2977 vst1.32 {d2[0]}, [OUT, :32]! 2978 .elseif numpix == 1 2979 vst1.16 {d2[0]}, [OUT, :16]! 2980 .else 2981 .error bilinear_store_0565 numpix is unsupported 2982 .endif 2983 .endm 2984 2985 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt 2986 bilinear_load_&src_fmt d0, d1, d2 2987 vmull.u8 q1, d0, d28 2988 vmlal.u8 q1, d1, d29 2989 /* 5 cycles bubble */ 2990 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 2991 vmlsl.u16 q0, d2, d30 2992 vmlal.u16 q0, d3, d30 2993 /* 5 cycles bubble */ 2994 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 2995 /* 3 cycles bubble */ 2996 vmovn.u16 d0, q0 2997 /* 1 cycle bubble */ 2998 bilinear_store_&dst_fmt 1, q2, q3 2999 .endm 3000 3001 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt 3002 bilinear_load_and_vertical_interpolate_two_&src_fmt \ 3003 q1, q11, d0, d1, d20, d21, d22, d23 3004 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3005 vmlsl.u16 q0, d2, d30 3006 vmlal.u16 q0, d3, d30 3007 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 3008 vmlsl.u16 q10, d22, d31 3009 vmlal.u16 q10, d23, d31 3010 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3011 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 3012 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3013 vadd.u16 q12, q12, q13 3014 vmovn.u16 d0, q0 3015 bilinear_store_&dst_fmt 2, q2, q3 3016 .endm 3017 3018 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt 3019 
bilinear_load_and_vertical_interpolate_four_&src_fmt \ 3020 q1, q11, d0, d1, d20, d21, d22, d23 \ 3021 q3, q9, d4, d5, d16, d17, d18, d19 3022 pld [TMP1, PF_OFFS] 3023 sub TMP1, TMP1, STRIDE 3024 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3025 vmlsl.u16 q0, d2, d30 3026 vmlal.u16 q0, d3, d30 3027 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 3028 vmlsl.u16 q10, d22, d31 3029 vmlal.u16 q10, d23, d31 3030 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3031 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS 3032 vmlsl.u16 q2, d6, d30 3033 vmlal.u16 q2, d7, d30 3034 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS 3035 pld [TMP2, PF_OFFS] 3036 vmlsl.u16 q8, d18, d31 3037 vmlal.u16 q8, d19, d31 3038 vadd.u16 q12, q12, q13 3039 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3040 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 3041 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3042 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) 3043 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3044 vmovn.u16 d0, q0 3045 vmovn.u16 d1, q2 3046 vadd.u16 q12, q12, q13 3047 bilinear_store_&dst_fmt 4, q2, q3 3048 .endm 3049 3050 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3051 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3052 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head 3053 .else 3054 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3055 .endif 3056 .endm 3057 3058 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3059 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3060 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail 3061 .endif 3062 .endm 3063 3064 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3065 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3066 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head 3067 .else 3068 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3069 .endif 3070 .endm 3071 3072 .macro 
bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3073 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3074 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head 3075 .else 3076 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3077 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3078 .endif 3079 .endm 3080 3081 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3083 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail 3084 .else 3085 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3086 .endif 3087 .endm 3088 3089 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3090 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3091 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head 3092 .else 3093 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3094 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3095 .endif 3096 .endm 3097 3098 .set BILINEAR_FLAG_UNROLL_4, 0 3099 .set BILINEAR_FLAG_UNROLL_8, 1 3100 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 3101 3102 /* 3103 * Main template macro for generating NEON optimized bilinear scanline 3104 * functions. 
3105 * 3106 * Bilinear scanline scaler macro template uses the following arguments: 3107 * fname - name of the function to generate 3108 * src_fmt - source color format (8888 or 0565) 3109 * dst_fmt - destination color format (8888 or 0565) 3110 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes 3111 * prefetch_distance - prefetch in the source image by that many 3112 * pixels ahead 3113 */ 3114 3115 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ 3116 src_bpp_shift, dst_bpp_shift, \ 3117 prefetch_distance, flags 3118 3119 pixman_asm_function fname 3120 OUT .req r0 3121 TOP .req r1 3122 BOTTOM .req r2 3123 WT .req r3 3124 WB .req r4 3125 X .req r5 3126 UX .req r6 3127 WIDTH .req ip 3128 TMP1 .req r3 3129 TMP2 .req r4 3130 PF_OFFS .req r7 3131 TMP3 .req r8 3132 TMP4 .req r9 3133 STRIDE .req r2 3134 3135 mov ip, sp 3136 push {r4, r5, r6, r7, r8, r9} 3137 mov PF_OFFS, #prefetch_distance 3138 ldmia ip, {WB, X, UX, WIDTH} 3139 mul PF_OFFS, PF_OFFS, UX 3140 3141 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3142 vpush {d8-d15} 3143 .endif 3144 3145 sub STRIDE, BOTTOM, TOP 3146 .unreq BOTTOM 3147 3148 cmp WIDTH, #0 3149 ble 3f 3150 3151 vdup.u16 q12, X 3152 vdup.u16 q13, UX 3153 vdup.u8 d28, WT 3154 vdup.u8 d29, WB 3155 vadd.u16 d25, d25, d26 3156 3157 /* ensure good destination alignment */ 3158 cmp WIDTH, #1 3159 blt 0f 3160 tst OUT, #(1 << dst_bpp_shift) 3161 beq 0f 3162 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3163 vadd.u16 q12, q12, q13 3164 bilinear_interpolate_last_pixel src_fmt, dst_fmt 3165 sub WIDTH, WIDTH, #1 3166 0: 3167 vadd.u16 q13, q13, q13 3168 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3169 vadd.u16 q12, q12, q13 3170 3171 cmp WIDTH, #2 3172 blt 0f 3173 tst OUT, #(1 << (dst_bpp_shift + 1)) 3174 beq 0f 3175 bilinear_interpolate_two_pixels src_fmt, dst_fmt 3176 sub WIDTH, WIDTH, #2 3177 0: 3178 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 3179 /*********** 8 pixels per iteration 
*****************/ 3180 cmp WIDTH, #4 3181 blt 0f 3182 tst OUT, #(1 << (dst_bpp_shift + 2)) 3183 beq 0f 3184 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3185 sub WIDTH, WIDTH, #4 3186 0: 3187 subs WIDTH, WIDTH, #8 3188 blt 1f 3189 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 3190 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3191 subs WIDTH, WIDTH, #8 3192 blt 5f 3193 0: 3194 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3195 subs WIDTH, WIDTH, #8 3196 bge 0b 3197 5: 3198 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 3199 1: 3200 tst WIDTH, #4 3201 beq 2f 3202 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3203 2: 3204 .else 3205 /*********** 4 pixels per iteration *****************/ 3206 subs WIDTH, WIDTH, #4 3207 blt 1f 3208 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 3209 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3210 subs WIDTH, WIDTH, #4 3211 blt 5f 3212 0: 3213 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3214 subs WIDTH, WIDTH, #4 3215 bge 0b 3216 5: 3217 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3218 1: 3219 /****************************************************/ 3220 .endif 3221 /* handle the remaining trailing pixels */ 3222 tst WIDTH, #2 3223 beq 2f 3224 bilinear_interpolate_two_pixels src_fmt, dst_fmt 3225 2: 3226 tst WIDTH, #1 3227 beq 3f 3228 bilinear_interpolate_last_pixel src_fmt, dst_fmt 3229 3: 3230 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3231 vpop {d8-d15} 3232 .endif 3233 pop {r4, r5, r6, r7, r8, r9} 3234 bx lr 3235 3236 .unreq OUT 3237 .unreq TOP 3238 .unreq WT 3239 .unreq WB 3240 .unreq X 3241 .unreq UX 3242 .unreq WIDTH 3243 .unreq TMP1 3244 .unreq TMP2 3245 .unreq PF_OFFS 3246 .unreq TMP3 3247 .unreq TMP4 3248 .unreq STRIDE 3249 .endfunc 3250 3251 .endm 3252 3253 /*****************************************************************************/ 3254 3255 .set have_bilinear_interpolate_four_pixels_8888_8888, 1 3256 3257 .macro 
bilinear_interpolate_four_pixels_8888_8888_head 3258 mov TMP1, X, asr #16 3259 add X, X, UX 3260 add TMP1, TOP, TMP1, asl #2 3261 mov TMP2, X, asr #16 3262 add X, X, UX 3263 add TMP2, TOP, TMP2, asl #2 3264 3265 vld1.32 {d22}, [TMP1], STRIDE 3266 vld1.32 {d23}, [TMP1] 3267 mov TMP3, X, asr #16 3268 add X, X, UX 3269 add TMP3, TOP, TMP3, asl #2 3270 vmull.u8 q8, d22, d28 3271 vmlal.u8 q8, d23, d29 3272 3273 vld1.32 {d22}, [TMP2], STRIDE 3274 vld1.32 {d23}, [TMP2] 3275 mov TMP4, X, asr #16 3276 add X, X, UX 3277 add TMP4, TOP, TMP4, asl #2 3278 vmull.u8 q9, d22, d28 3279 vmlal.u8 q9, d23, d29 3280 3281 vld1.32 {d22}, [TMP3], STRIDE 3282 vld1.32 {d23}, [TMP3] 3283 vmull.u8 q10, d22, d28 3284 vmlal.u8 q10, d23, d29 3285 3286 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3287 vmlsl.u16 q0, d16, d30 3288 vmlal.u16 q0, d17, d30 3289 3290 pld [TMP4, PF_OFFS] 3291 vld1.32 {d16}, [TMP4], STRIDE 3292 vld1.32 {d17}, [TMP4] 3293 pld [TMP4, PF_OFFS] 3294 vmull.u8 q11, d16, d28 3295 vmlal.u8 q11, d17, d29 3296 3297 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3298 vmlsl.u16 q1, d18, d31 3299 .endm 3300 3301 .macro bilinear_interpolate_four_pixels_8888_8888_tail 3302 vmlal.u16 q1, d19, d31 3303 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3304 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3305 vmlsl.u16 q2, d20, d30 3306 vmlal.u16 q2, d21, d30 3307 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3308 vmlsl.u16 q3, d22, d31 3309 vmlal.u16 q3, d23, d31 3310 vadd.u16 q12, q12, q13 3311 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3312 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3313 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3314 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3315 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3316 vmovn.u16 d6, q0 3317 vmovn.u16 d7, q2 3318 vadd.u16 q12, q12, q13 3319 vst1.32 {d6, d7}, [OUT, :128]! 
3320 .endm 3321 3322 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head 3323 mov TMP1, X, asr #16 3324 add X, X, UX 3325 add TMP1, TOP, TMP1, asl #2 3326 mov TMP2, X, asr #16 3327 add X, X, UX 3328 add TMP2, TOP, TMP2, asl #2 3329 vmlal.u16 q1, d19, d31 3330 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3331 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3332 vmlsl.u16 q2, d20, d30 3333 vmlal.u16 q2, d21, d30 3334 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3335 vld1.32 {d20}, [TMP1], STRIDE 3336 vmlsl.u16 q3, d22, d31 3337 vmlal.u16 q3, d23, d31 3338 vld1.32 {d21}, [TMP1] 3339 vmull.u8 q8, d20, d28 3340 vmlal.u8 q8, d21, d29 3341 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3342 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3343 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3344 vld1.32 {d22}, [TMP2], STRIDE 3345 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3346 vadd.u16 q12, q12, q13 3347 vld1.32 {d23}, [TMP2] 3348 vmull.u8 q9, d22, d28 3349 mov TMP3, X, asr #16 3350 add X, X, UX 3351 add TMP3, TOP, TMP3, asl #2 3352 mov TMP4, X, asr #16 3353 add X, X, UX 3354 add TMP4, TOP, TMP4, asl #2 3355 vmlal.u8 q9, d23, d29 3356 vld1.32 {d22}, [TMP3], STRIDE 3357 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3358 vld1.32 {d23}, [TMP3] 3359 vmull.u8 q10, d22, d28 3360 vmlal.u8 q10, d23, d29 3361 vmovn.u16 d6, q0 3362 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3363 vmovn.u16 d7, q2 3364 vmlsl.u16 q0, d16, d30 3365 vmlal.u16 q0, d17, d30 3366 pld [TMP4, PF_OFFS] 3367 vld1.32 {d16}, [TMP4], STRIDE 3368 vadd.u16 q12, q12, q13 3369 vld1.32 {d17}, [TMP4] 3370 pld [TMP4, PF_OFFS] 3371 vmull.u8 q11, d16, d28 3372 vmlal.u8 q11, d17, d29 3373 vst1.32 {d6, d7}, [OUT, :128]! 
3374 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3375 vmlsl.u16 q1, d18, d31 3376 .endm 3377 3378 /*****************************************************************************/ 3379 3380 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1 3381 3382 .macro bilinear_interpolate_eight_pixels_8888_0565_head 3383 mov TMP1, X, asr #16 3384 add X, X, UX 3385 add TMP1, TOP, TMP1, asl #2 3386 mov TMP2, X, asr #16 3387 add X, X, UX 3388 add TMP2, TOP, TMP2, asl #2 3389 vld1.32 {d20}, [TMP1], STRIDE 3390 vld1.32 {d21}, [TMP1] 3391 vmull.u8 q8, d20, d28 3392 vmlal.u8 q8, d21, d29 3393 vld1.32 {d22}, [TMP2], STRIDE 3394 vld1.32 {d23}, [TMP2] 3395 vmull.u8 q9, d22, d28 3396 mov TMP3, X, asr #16 3397 add X, X, UX 3398 add TMP3, TOP, TMP3, asl #2 3399 mov TMP4, X, asr #16 3400 add X, X, UX 3401 add TMP4, TOP, TMP4, asl #2 3402 vmlal.u8 q9, d23, d29 3403 vld1.32 {d22}, [TMP3], STRIDE 3404 vld1.32 {d23}, [TMP3] 3405 vmull.u8 q10, d22, d28 3406 vmlal.u8 q10, d23, d29 3407 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3408 vmlsl.u16 q0, d16, d30 3409 vmlal.u16 q0, d17, d30 3410 pld [TMP4, PF_OFFS] 3411 vld1.32 {d16}, [TMP4], STRIDE 3412 vld1.32 {d17}, [TMP4] 3413 pld [TMP4, PF_OFFS] 3414 vmull.u8 q11, d16, d28 3415 vmlal.u8 q11, d17, d29 3416 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3417 vmlsl.u16 q1, d18, d31 3418 3419 mov TMP1, X, asr #16 3420 add X, X, UX 3421 add TMP1, TOP, TMP1, asl #2 3422 mov TMP2, X, asr #16 3423 add X, X, UX 3424 add TMP2, TOP, TMP2, asl #2 3425 vmlal.u16 q1, d19, d31 3426 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3427 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3428 vmlsl.u16 q2, d20, d30 3429 vmlal.u16 q2, d21, d30 3430 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3431 vld1.32 {d20}, [TMP1], STRIDE 3432 vmlsl.u16 q3, d22, d31 3433 vmlal.u16 q3, d23, d31 3434 vld1.32 {d21}, [TMP1] 3435 vmull.u8 q8, d20, d28 3436 vmlal.u8 q8, d21, d29 3437 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3438 vshrn.u32 d1, q1, #(2 * 
BILINEAR_INTERPOLATION_BITS) 3439 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3440 vld1.32 {d22}, [TMP2], STRIDE 3441 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3442 vadd.u16 q12, q12, q13 3443 vld1.32 {d23}, [TMP2] 3444 vmull.u8 q9, d22, d28 3445 mov TMP3, X, asr #16 3446 add X, X, UX 3447 add TMP3, TOP, TMP3, asl #2 3448 mov TMP4, X, asr #16 3449 add X, X, UX 3450 add TMP4, TOP, TMP4, asl #2 3451 vmlal.u8 q9, d23, d29 3452 vld1.32 {d22}, [TMP3], STRIDE 3453 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3454 vld1.32 {d23}, [TMP3] 3455 vmull.u8 q10, d22, d28 3456 vmlal.u8 q10, d23, d29 3457 vmovn.u16 d8, q0 3458 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3459 vmovn.u16 d9, q2 3460 vmlsl.u16 q0, d16, d30 3461 vmlal.u16 q0, d17, d30 3462 pld [TMP4, PF_OFFS] 3463 vld1.32 {d16}, [TMP4], STRIDE 3464 vadd.u16 q12, q12, q13 3465 vld1.32 {d17}, [TMP4] 3466 pld [TMP4, PF_OFFS] 3467 vmull.u8 q11, d16, d28 3468 vmlal.u8 q11, d17, d29 3469 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3470 vmlsl.u16 q1, d18, d31 3471 .endm 3472 3473 .macro bilinear_interpolate_eight_pixels_8888_0565_tail 3474 vmlal.u16 q1, d19, d31 3475 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3476 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3477 vmlsl.u16 q2, d20, d30 3478 vmlal.u16 q2, d21, d30 3479 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3480 vmlsl.u16 q3, d22, d31 3481 vmlal.u16 q3, d23, d31 3482 vadd.u16 q12, q12, q13 3483 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3484 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3485 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3486 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3487 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3488 vmovn.u16 d10, q0 3489 vmovn.u16 d11, q2 3490 vadd.u16 q12, q12, q13 3491 3492 vuzp.u8 d8, d9 3493 vuzp.u8 d10, d11 3494 vuzp.u8 d9, d11 3495 vuzp.u8 d8, d10 3496 vshll.u8 q6, d9, #8 3497 vshll.u8 q5, d10, #8 3498 vshll.u8 q7, d8, #8 3499 
vsri.u16 q5, q6, #5 3500 vsri.u16 q5, q7, #11 3501 vst1.32 {d10, d11}, [OUT, :128]! 3502 .endm 3503 3504 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head 3505 mov TMP1, X, asr #16 3506 add X, X, UX 3507 add TMP1, TOP, TMP1, asl #2 3508 mov TMP2, X, asr #16 3509 add X, X, UX 3510 add TMP2, TOP, TMP2, asl #2 3511 vmlal.u16 q1, d19, d31 3512 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3513 vuzp.u8 d8, d9 3514 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3515 vmlsl.u16 q2, d20, d30 3516 vmlal.u16 q2, d21, d30 3517 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3518 vld1.32 {d20}, [TMP1], STRIDE 3519 vmlsl.u16 q3, d22, d31 3520 vmlal.u16 q3, d23, d31 3521 vld1.32 {d21}, [TMP1] 3522 vmull.u8 q8, d20, d28 3523 vmlal.u8 q8, d21, d29 3524 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3525 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3526 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3527 vld1.32 {d22}, [TMP2], STRIDE 3528 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3529 vadd.u16 q12, q12, q13 3530 vld1.32 {d23}, [TMP2] 3531 vmull.u8 q9, d22, d28 3532 mov TMP3, X, asr #16 3533 add X, X, UX 3534 add TMP3, TOP, TMP3, asl #2 3535 mov TMP4, X, asr #16 3536 add X, X, UX 3537 add TMP4, TOP, TMP4, asl #2 3538 vmlal.u8 q9, d23, d29 3539 vld1.32 {d22}, [TMP3], STRIDE 3540 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3541 vld1.32 {d23}, [TMP3] 3542 vmull.u8 q10, d22, d28 3543 vmlal.u8 q10, d23, d29 3544 vmovn.u16 d10, q0 3545 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3546 vmovn.u16 d11, q2 3547 vmlsl.u16 q0, d16, d30 3548 vmlal.u16 q0, d17, d30 3549 pld [TMP4, PF_OFFS] 3550 vld1.32 {d16}, [TMP4], STRIDE 3551 vadd.u16 q12, q12, q13 3552 vld1.32 {d17}, [TMP4] 3553 pld [TMP4, PF_OFFS] 3554 vmull.u8 q11, d16, d28 3555 vmlal.u8 q11, d17, d29 3556 vuzp.u8 d10, d11 3557 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3558 vmlsl.u16 q1, d18, d31 3559 3560 mov TMP1, X, asr #16 3561 add X, X, UX 3562 add TMP1, TOP, TMP1, 
asl #2 3563 mov TMP2, X, asr #16 3564 add X, X, UX 3565 add TMP2, TOP, TMP2, asl #2 3566 vmlal.u16 q1, d19, d31 3567 vuzp.u8 d9, d11 3568 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3569 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3570 vuzp.u8 d8, d10 3571 vmlsl.u16 q2, d20, d30 3572 vmlal.u16 q2, d21, d30 3573 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3574 vld1.32 {d20}, [TMP1], STRIDE 3575 vmlsl.u16 q3, d22, d31 3576 vmlal.u16 q3, d23, d31 3577 vld1.32 {d21}, [TMP1] 3578 vmull.u8 q8, d20, d28 3579 vmlal.u8 q8, d21, d29 3580 vshll.u8 q6, d9, #8 3581 vshll.u8 q5, d10, #8 3582 vshll.u8 q7, d8, #8 3583 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3584 vsri.u16 q5, q6, #5 3585 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3586 vsri.u16 q5, q7, #11 3587 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3588 vld1.32 {d22}, [TMP2], STRIDE 3589 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3590 vadd.u16 q12, q12, q13 3591 vld1.32 {d23}, [TMP2] 3592 vmull.u8 q9, d22, d28 3593 mov TMP3, X, asr #16 3594 add X, X, UX 3595 add TMP3, TOP, TMP3, asl #2 3596 mov TMP4, X, asr #16 3597 add X, X, UX 3598 add TMP4, TOP, TMP4, asl #2 3599 vmlal.u8 q9, d23, d29 3600 vld1.32 {d22}, [TMP3], STRIDE 3601 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3602 vld1.32 {d23}, [TMP3] 3603 vmull.u8 q10, d22, d28 3604 vmlal.u8 q10, d23, d29 3605 vmovn.u16 d8, q0 3606 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3607 vmovn.u16 d9, q2 3608 vmlsl.u16 q0, d16, d30 3609 vmlal.u16 q0, d17, d30 3610 pld [TMP4, PF_OFFS] 3611 vld1.32 {d16}, [TMP4], STRIDE 3612 vadd.u16 q12, q12, q13 3613 vld1.32 {d17}, [TMP4] 3614 pld [TMP4, PF_OFFS] 3615 vmull.u8 q11, d16, d28 3616 vmlal.u8 q11, d17, d29 3617 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3618 vst1.32 {d10, d11}, [OUT, :128]! 
3619 vmlsl.u16 q1, d18, d31 3620 .endm 3621 /*****************************************************************************/ 3622 3623 generate_bilinear_scanline_func \ 3624 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ 3625 2, 2, 28, BILINEAR_FLAG_UNROLL_4 3626 3627 generate_bilinear_scanline_func \ 3628 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ 3629 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS 3630 3631 generate_bilinear_scanline_func \ 3632 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ 3633 1, 2, 28, BILINEAR_FLAG_UNROLL_4 3634 3635 generate_bilinear_scanline_func \ 3636 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \ 3637 1, 1, 28, BILINEAR_FLAG_UNROLL_4 3638