/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Function bracketing helpers.
 *   ENTRY(f) -- switch to .text, apply ".align 4", export f as a global
 *               function symbol and place its label.
 *   END(f)   -- record the size of f (from its label to the current location).
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* X-macro list of the supported blend operations.  Each entry is
 * X(slot, NAME): "slot" is the numeric slot the dispatcher is indexed by and
 * NAME selects the matching blend_kernel_NAME / params_NAME / blend_line_NAME
 * identifiers.  Slots must be listed in ascending order because the dispatch
 * table is emitted by walking this list and zero-filling the gaps.
 */
#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(14, MULTIPLY) \
    X(21, DIFFERENCE) \
    X(34, ADD) \
    X(35, SUBTRACT)

/* For every blend operation supported, define a macro with just the arithmetic
 * component.  The rest can be handled later on.
 *
 * At entry v0-v3 contain the RGBA data from the destination buffer, and v8-v11
 * contain the data from the source buffer.  Both have already been split out
 * into one colour component per register (if necessary).  v3 and v11 contain
 * the alpha components.  The result is left in v0-v3.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *    zipped=0 -- The macro does not require the RGBA components to be
 *                separated.
 *    lddst=0  -- The macro does not require data from the destination buffer.
 *    ldsrc=0  -- The macro does not require data from the source buffer.
 *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *                inserted without any surrounding load/store or loop code.
*/

/* Shared tail for kernels whose 16-bit products are held in v0-v3 (low eight
 * lanes) and v12-v15 (high eight lanes).  Every product x is replaced by
 *      (x + ((x + 128) >> 8) + 128) >> 8
 * which is the usual rounding substitute for x / 255, and the results are
 * narrowed back to bytes in v0-v3.  v4-v7 are used as scratch.
 */
.macro blend_scale255_v0_v3
        rshrn       v4.8b,  v0.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v1.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v2.8h,  #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v3.8h,  #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v0.8h,  v0.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v1.8h,  v1.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v2.8h,  v2.8h,  v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v3.8h,  v3.8h,  v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v0.8b,  v0.8h,  #8
        rshrn2      v0.16b, v12.8h, #8
        rshrn       v1.8b,  v1.8h,  #8
        rshrn2      v1.16b, v13.8h, #8
        rshrn       v2.8b,  v2.8h,  #8
        rshrn2      v2.16b, v14.8h, #8
        rshrn       v3.8b,  v3.8h,  #8
        rshrn2      v3.16b, v15.8h, #8
.endm

/* Shared body of the two ATOP kernels.  For each colour channel it forms
 *      dst * v11 + src * v3
 * as a saturating 16-bit sum, then scales it back to a byte with the same
 * "add the rounded high byte, then rounding-shift by 8" step, using saturating
 * instructions throughout.  The callers decide what v3 and v11 hold.  Only
 * v0-v2 receive results; v3 is not written here.  Clobbers v4-v6, v8-v10 and
 * v12-v14.
 */
.macro blend_atop_mix
        umull2      v12.8h, v11.16b, v0.16b
        umull       v0.8h,  v11.8b,  v0.8b
        umull2      v13.8h, v11.16b, v1.16b
        umull       v1.8h,  v11.8b,  v1.8b
        umull2      v14.8h, v11.16b, v2.16b
        umull       v2.8h,  v11.8b,  v2.8b

        umull2      v4.8h,  v3.16b, v8.16b
        umull       v8.8h,  v3.8b,  v8.8b
        umull2      v5.8h,  v3.16b, v9.16b
        umull       v9.8h,  v3.8b,  v9.8b
        umull2      v6.8h,  v3.16b, v10.16b
        umull       v10.8h, v3.8b,  v10.8b

        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v14.8h, v14.8h, v6.8h
        uqadd       v2.8h,  v2.8h,  v10.8h

        urshr       v8.8h,  v0.8h,  #8
        urshr       v4.8h,  v12.8h, #8
        urshr       v9.8h,  v1.8h,  #8
        urshr       v5.8h,  v13.8h, #8
        urshr       v10.8h, v2.8h,  #8
        urshr       v6.8h,  v14.8h, #8

        uqadd       v0.8h,  v0.8h,  v8.8h
        uqadd       v12.8h, v12.8h, v4.8h
        uqadd       v1.8h,  v1.8h,  v9.8h
        uqadd       v13.8h, v13.8h, v5.8h
        uqadd       v2.8h,  v2.8h,  v10.8h
        uqadd       v14.8h, v14.8h, v6.8h

        uqrshrn     v0.8b,  v0.8h,  #8
        uqrshrn2    v0.16b, v12.8h, #8
        uqrshrn     v1.8b,  v1.8h,  #8
        uqrshrn2    v1.16b, v13.8h, #8
        uqrshrn     v2.8b,  v2.8h,  #8
        uqrshrn2    v2.16b, v14.8h, #8
.endm

/* CLEAR: the output is all zeroes, so neither buffer needs to be loaded. */
#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi        v0.16b, #0
        movi        v1.16b, #0
        movi        v2.16b, #0
        movi        v3.16b, #0
.endm

/* SRC: the output is a straight copy of the source registers. */
#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov         v0.16b, v8.16b
        mov         v1.16b, v9.16b
        mov         v2.16b, v10.16b
        mov         v3.16b, v11.16b
.endm

/* DST: the destination is left untouched, so no code is emitted at all. */
#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm

/* SRC_OVER: out = src + dst * ~src.a / 255, with a saturating final add. */
#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn         v7.16b, v11.16b

        umull2      v12.8h, v7.16b, v0.16b
        umull       v0.8h,  v7.8b,  v0.8b
        umull2      v13.8h, v7.16b, v1.16b
        umull       v1.8h,  v7.8b,  v1.8b
        umull2      v14.8h, v7.16b, v2.16b
        umull       v2.8h,  v7.8b,  v2.8b
        umull2      v15.8h, v7.16b, v3.16b
        umull       v3.8h,  v7.8b,  v3.8b

        blend_scale255_v0_v3

        uqadd       v0.16b, v0.16b, v8.16b
        uqadd       v1.16b, v1.16b, v9.16b
        uqadd       v2.16b, v2.16b, v10.16b
        uqadd       v3.16b, v3.16b, v11.16b
.endm

/* DST_OVER: out = dst + src * ~dst.a / 255, with a saturating final add.
 * The scaled source is built in place in v8-v11 (high halves in v12-v15), so
 * the scaling sequence is spelled out here rather than shared.
 */
#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn         v7.16b, v3.16b

        umull2      v12.8h, v7.16b, v8.16b
        umull       v8.8h,  v7.8b,  v8.8b
        umull2      v13.8h, v7.16b, v9.16b
        umull       v9.8h,  v7.8b,  v9.8b
        umull2      v14.8h, v7.16b, v10.16b
        umull       v10.8h, v7.8b,  v10.8b
        umull2      v15.8h, v7.16b, v11.16b
        umull       v11.8h, v7.8b,  v11.8b

        rshrn       v4.8b,  v8.8h,  #8
        rshrn2      v4.16b, v12.8h, #8
        rshrn       v5.8b,  v9.8h,  #8
        rshrn2      v5.16b, v13.8h, #8
        rshrn       v6.8b,  v10.8h, #8
        rshrn2      v6.16b, v14.8h, #8
        rshrn       v7.8b,  v11.8h, #8
        rshrn2      v7.16b, v15.8h, #8

        uaddw       v8.8h,  v8.8h,  v4.8b
        uaddw2      v12.8h, v12.8h, v4.16b
        uaddw       v9.8h,  v9.8h,  v5.8b
        uaddw2      v13.8h, v13.8h, v5.16b
        uaddw       v10.8h, v10.8h, v6.8b
        uaddw2      v14.8h, v14.8h, v6.16b
        uaddw       v11.8h, v11.8h, v7.8b
        uaddw2      v15.8h, v15.8h, v7.16b

        rshrn       v8.8b,  v8.8h,  #8
        rshrn2      v8.16b, v12.8h, #8
        rshrn       v9.8b,  v9.8h,  #8
        rshrn2      v9.16b, v13.8h, #8
        rshrn       v10.8b, v10.8h, #8
        rshrn2      v10.16b, v14.8h, #8
        rshrn       v11.8b, v11.8h, #8
        rshrn2      v11.16b, v15.8h, #8

        uqadd       v0.16b, v0.16b, v8.16b
        uqadd       v1.16b, v1.16b, v9.16b
        uqadd       v2.16b, v2.16b, v10.16b
        uqadd       v3.16b, v3.16b, v11.16b
.endm

/* SRC_IN: out = src * dst.a / 255.  v3 is both the multiplier and the last
 * destination written, so its low-half product must stay last.
 */
#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2      v12.8h, v3.16b, v8.16b
        umull       v0.8h,  v3.8b,  v8.8b
        umull2      v13.8h, v3.16b, v9.16b
        umull       v1.8h,  v3.8b,  v9.8b
        umull2      v14.8h, v3.16b, v10.16b
        umull       v2.8h,  v3.8b,  v10.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        blend_scale255_v0_v3
.endm

/* DST_IN: out = dst * src.a / 255. */
#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2      v12.8h, v0.16b, v11.16b
        umull       v0.8h,  v0.8b,  v11.8b
        umull2      v13.8h, v1.16b, v11.16b
        umull       v1.8h,  v1.8b,  v11.8b
        umull2      v14.8h, v2.16b, v11.16b
        umull       v2.8h,  v2.8b,  v11.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        blend_scale255_v0_v3
.endm

/* SRC_OUT: SRC_IN with the destination alpha inverted first. */
#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn         v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm


/* DST_OUT: DST_IN with the source alpha inverted first. */
#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn         v11.16b, v11.16b
        blend_kernel_DST_IN
.endm

/* SRC_ATOP: colour = dst * ~src.a + src * dst.a (scaled back to bytes); the
 * alpha register v3 is not written, so the destination alpha is kept.
 */
#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn         v11.16b, v11.16b

        blend_atop_mix
.endm

/* DST_ATOP: colour = dst * src.a + src * ~dst.a (scaled back to bytes); the
 * output alpha is then replaced with the source alpha.
 */
#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn         v3.16b, v3.16b

        blend_atop_mix

        mov         v3.16b, v11.16b
.endm

/* MULTIPLY: out = dst * src / 255 for every byte, so the components need not
 * be separated.
 */
#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2      v12.8h, v0.16b, v8.16b
        umull       v0.8h,  v0.8b,  v8.8b
        umull2      v13.8h, v1.16b, v9.16b
        umull       v1.8h,  v1.8b,  v9.8b
        umull2      v14.8h, v2.16b, v10.16b
        umull       v2.8h,  v2.8b,  v10.8b
        umull2      v15.8h, v3.16b, v11.16b
        umull       v3.8h,  v3.8b,  v11.8b

        blend_scale255_v0_v3
.endm

/* ADD: out = dst + src per byte, saturating at 255. */
#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd       v0.16b, v0.16b, v8.16b
        uqadd       v1.16b, v1.16b, v9.16b
        uqadd       v2.16b, v2.16b, v10.16b
        uqadd       v3.16b, v3.16b, v11.16b
.endm

/* SUBTRACT: out = dst - src per byte, saturating at 0. */
#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub       v0.16b, v0.16b, v8.16b
        uqsub       v1.16b, v1.16b, v9.16b
        uqsub       v2.16b, v2.16b, v10.16b
        uqsub       v3.16b, v3.16b, v11.16b
.endm

/* DIFFERENCE: out = |dst - src| per byte. */
#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd        v0.16b, v0.16b, v8.16b
        uabd        v1.16b, v1.16b, v9.16b
        uabd        v2.16b, v2.16b, v10.16b
        uabd        v3.16b, v3.16b, v11.16b
.endm

/* XOR: out = dst ^ src, a plain bitwise exclusive-or of the raw bytes. */
#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor         v0.16b, v0.16b, v8.16b
        eor         v1.16b, v1.16b, v9.16b
        eor         v2.16b, v2.16b, v10.16b
        eor         v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.
* Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 *
 * Register use on the wrapped (nowrap=0) path:
 *    x0      -- destination pointer; read when lddst=1, always written.
 *    x1      -- source pointer; read when ldsrc=1.
 *    x2      -- number of bytes to process.
 *    x3      -- scratch (overwritten while saving registers).
 *    v0-v7   -- clobbered.
 *    v8-v15  -- low 64 bits saved on entry and restored before returning.
 * Both paths finish by returning 0 in x0.
 */
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        /* Spill the low halves of v8-v15 into a 64-byte stack frame. */
        sub         x3, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp]
        st1         {v12.1d, v13.1d, v14.1d, v15.1d}, [x3]

        /* Bias the count so the flags tell whether a full 64 bytes remain. */
        subs        x2, x2, #64
        b           2f

        .align 4
1:      /* Main loop: one 64-byte chunk per iteration. */
  .if \lddst
    .if \zipped
        ld4         {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
    .else
        ld1         {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
    .endif
  .endif
  .if \ldsrc
    .if \zipped
        ld4         {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
    .else
        ld1         {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64
    .endif
  .endif
  .if \pld
#if 0 /* TODO: test this on real hardware */
    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
  .endif

        \kernel

        /* The stores below leave the flags from this subs intact for bge. */
        subs        x2, x2, #64
  .if \zipped
        st4         {v0.16b - v3.16b}, [x0], #64
  .else
        st1         {v0.16b - v3.16b}, [x0], #64
  .endif

2:      bge         1b
        adds        x2, x2, #64         /* undo the bias: x2 = leftover bytes */
        beq         2f                  /* nothing left: go restore and return */

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the operations
         * don't require data to interact with its neighbours.
         */
        movi        v0.16b, #0
        movi        v1.16b, #0
        movi        v2.16b, #0
        movi        v3.16b, #0

        movi        v8.16b, #0
        movi        v9.16b, #0
        movi        v10.16b, #0
        movi        v11.16b, #0

        /* Each set bit of x2 selects one chunk size, largest first. */
        tbz         x2, #5, 1f
  .if \lddst ; ld1      {v2.16b, v3.16b}, [x0], #32     ; .endif
  .if \ldsrc ; ld1      {v10.16b, v11.16b}, [x1], #32   ; .endif
1:      tbz         x2, #4, 1f
  .if \lddst ; ld1      {v1.16b}, [x0], #16             ; .endif
  .if \ldsrc ; ld1      {v9.16b}, [x1], #16             ; .endif
1:      tbz         x2, #3, 1f
  .if \lddst ; ld1      {v0.d}[1], [x0], #8             ; .endif
  .if \ldsrc ; ld1      {v8.d}[1], [x1], #8             ; .endif
1:      tbz         x2, #2, 1f
  .if \lddst ; ld1      {v0.s}[1], [x0], #4             ; .endif
  .if \ldsrc ; ld1      {v8.s}[1], [x1], #4             ; .endif
1:      tbz         x2, #1, 1f
  .if \lddst ; ld1      {v0.h}[1], [x0], #2             ; .endif
  .if \ldsrc ; ld1      {v8.h}[1], [x1], #2             ; .endif
1:      tbz         x2, #0, 1f
  .if \lddst ; ld1      {v0.b}[1], [x0], #1             ; .endif
  .if \ldsrc ; ld1      {v8.b}[1], [x1], #1             ; .endif
1:
        /* The destination loads advanced x0; rewind it for the stores. */
  .if \lddst ; sub      x0, x0, x2                      ; .endif

  .if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point.
         */
        uzp1        v4.16b, v0.16b, v1.16b
        uzp2        v5.16b, v0.16b, v1.16b
        uzp1        v6.16b, v2.16b, v3.16b
        uzp2        v7.16b, v2.16b, v3.16b
        uzp1        v0.16b, v4.16b, v6.16b
        uzp2        v2.16b, v4.16b, v6.16b
        uzp1        v1.16b, v5.16b, v7.16b
        uzp2        v3.16b, v5.16b, v7.16b

        uzp1        v4.16b, v8.16b, v9.16b
        uzp2        v5.16b, v8.16b, v9.16b
        uzp1        v6.16b, v10.16b, v11.16b
        uzp2        v7.16b, v10.16b, v11.16b
        uzp1        v8.16b, v4.16b, v6.16b
        uzp2        v10.16b, v4.16b, v6.16b
        uzp1        v9.16b, v5.16b, v7.16b
        uzp2        v11.16b, v5.16b, v7.16b

        \kernel

        /* Re-interleave the four component registers into linear order. */
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b
  .else
        \kernel
  .endif

        /* Write the tail back from the same register positions it came from. */
        tbz         x2, #5, 1f
        st1         {v2.16b, v3.16b}, [x0], #32
1:      tbz         x2, #4, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #3, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #2, 1f
        st1         {v0.s}[1], [x0], #4
1:      tbz         x2, #1, 1f
        st1         {v0.h}[1], [x0], #2
1:      tbz         x2, #0, 2f
        st1         {v0.b}[1], [x0], #1

        /* Reload the saved halves of v8-v15 and release the stack frame. */
2:      ld1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp], #32
        ld1         {v12.1d, v13.1d, v14.1d, v15.1d}, [sp], #32
.endif
        mov         x0, #0
        ret
.endm


/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
*/
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
        BLEND_LIST(BLEND_X)
#undef BLEND_X

/* tablesize ends up as (highest slot in BLEND_LIST) + 1.  That is the number
 * of halfword ENTRIES in the dispatch table below, not its size in bytes.
 */
#define BLEND_X(d, n) .set tablesize, d+1 ;
        BLEND_LIST(BLEND_X)
#undef BLEND_X

/*  int rsdIntrinsicBlend_K(
 *          uchar4 *out,        // x0
 *          uchar4 const *in,   // x1
 *          int slot,           // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 *
 * Looks up the blend_line_XX routine for "slot" and branches to it with
 *      x0 = out + xstart * 4
 *      x1 = in  + xstart * 4
 *      x2 = (xend - xstart) * 4        (byte count)
 * The selected routine returns 0.  If slot is outside the table, or names a
 * slot with no implementation (a zero table entry), -1 is returned instead and
 * no pixel data is read or written.
 *
 * Only the low 32 bits of slot, xstart and xend are used.
 */
ENTRY(rsdIntrinsicBlend_K)
        adr         x5, 2f                      /* x5 = base of offset table */
        /* tablesize is already an entry count, so compare against it as is.
         * Halving it would reject the upper half of the table (slots 18-35,
         * which include DIFFERENCE, ADD and SUBTRACT).  The unsigned
         * comparison also rejects negative slot values.
         */
        cmp         w2, tablesize
        bhs         1f
        ldrsh       x6, [x5, w2, uxtw #1]       /* signed offset from table base */
        add         x0, x0, w3, uxtw #2
        add         x1, x1, w3, uxtw #2
        sub         w2, w4, w3
        ubfiz       x2, x2, #2, #32 /* TODO: fix */
        cbz         x6, 1f                      /* zero entry: slot unsupported */
        add         x6, x5, x6
        br          x6
1:      mov         x0, #-1
        ret

        /* Offset table: one halfword per slot, holding the distance from the
         * table base (label 2) to blend_line_XX.  Slots missing from
         * BLEND_LIST are filled with zero by the .rept.
         */
2:
.set off,0
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
        BLEND_LIST(BLEND_X)
#undef BLEND_X
3:

END(rsdIntrinsicBlend_K)