/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Contact: Alexander Bokovoy <alexander.bokovoy (at) nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
/* Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of the NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * File conventions: GNU as syntax, ARM (A32) instruction set with NEON.
 * All functions follow the AAPCS: arguments arrive in r0-r3 (plus stack),
 * d8-d15 are callee-saved (preserved via vpush/vpop where used), and
 * ip (r12) is freely clobbered as a scratch register.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv7a
.arm


/* When non-zero, byte-wise stores are used on paths where a word store
 * could be unaligned, so the code is safe even if the OS traps unaligned
 * memory accesses. */
#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes:
 * declares \fname as a global (but ELF-hidden) function symbol and
 * emits its label. */
.macro asm_function fname
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
.endm

/* Transpose a block of 4x4 16-bit coefficients held in four 64-bit
 * registers (in place), using the standard vtrn.16/vtrn.32 ladder. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
 * function from jidctfst.c
 *
 * TODO: slightly better instruction scheduling is needed.
 */

/* Lane aliases for the four multiplier constants loaded into d0 below. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    /* NOTE(review): each value appears to be FIX(x) in Q8/Q9 with the
     * integer part (1.0 resp. 2.0) subtracted out, pre-scaled so that
     * vqdmulh plus an explicit vadd of the operand reconstructs the full
     * product — confirm against jidctfst.c before changing. */
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

/* 1-D IDCT helper macro.
 *
 * \x0..\x7   : the eight rows (or columns) of coefficients, updated in place
 *              with the 1-D IDCT results.
 * \t10..\t14 : scratch registers; clobbered.
 * d0 must already hold the four XFIX_* multipliers (see consts above).
 */
.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
                   t10, t11, t12, t13, t14

    /* Initial butterflies; vswp restores the pre-butterfly operand order
     * expected by the rest of the computation. */
    vsub.s16 \t10, \x0, \x4
    vadd.s16 \x4, \x0, \x4
    vswp.s16 \t10, \x0
    vsub.s16 \t11, \x2, \x6
    vadd.s16 \x6, \x2, \x6
    vswp.s16 \t11, \x2
    vsub.s16 \t10, \x3, \x5
    vadd.s16 \x5, \x3, \x5
    vswp.s16 \t10, \x3
    vsub.s16 \t11, \x1, \x7
    vadd.s16 \x7, \x1, \x7
    vswp.s16 \t11, \x1

    /* Fixed-point multiplies: vqdmulh gives (a*const*2)>>16; the integer
     * part of each constant is reconstructed by the following vadd. */
    vqdmulh.s16 \t13, \x2, d0[1]
    vadd.s16 \t12, \x3, \x3
    vadd.s16 \x2, \x2, \t13
    vqdmulh.s16 \t13, \x3, d0[3]
    vsub.s16 \t10, \x1, \x3
    vadd.s16 \t12, \t12, \t13
    vqdmulh.s16 \t13, \t10, d0[2]
    vsub.s16 \t11, \x7, \x5
    vadd.s16 \t10, \t10, \t13
    vqdmulh.s16 \t13, \t11, d0[1]
    vadd.s16 \t11, \t11, \t13

    /* Even-part recombination interleaved with the odd-part chain. */
    vqdmulh.s16 \t13, \x1, d0[0]
    vsub.s16 \x2, \x6, \x2
    vsub.s16 \t14, \x0, \x2
    vadd.s16 \x2, \x0, \x2
    vadd.s16 \x0, \x4, \x6
    vsub.s16 \x4, \x4, \x6
    vadd.s16 \x1, \x1, \t13
    vadd.s16 \t13, \x7, \x5
    vsub.s16 \t12, \t13, \t12
    vsub.s16 \t12, \t12, \t10
    vadd.s16 \t11, \t12, \t11
    vsub.s16 \t10, \x1, \t10
    vadd.s16 \t10, \t10, \t11

    /* Final butterflies producing the eight outputs in \x0..\x7. */
    vsub.s16 \x7, \x0, \t13
    vadd.s16 \x0, \x0, \t13
    vadd.s16 \x6, \t14, \t12
    vsub.s16 \x1, \t14, \t12
    vsub.s16 \x5, \x2, \t11
    vadd.s16 \x2, \x2, \t11
    vsub.s16 \x3, \x4, \t10
    vadd.s16 \x4, \x4, \t10
.endm

/*
 * jsimd_idct_ifast_neon(dct_table, coef_block, output_buf, output_col)
 *
 * In (AAPCS):
 *   r0 = DCT_TABLE  : 64 16-bit dequantization multipliers
 *   r1 = COEF_BLOCK : 64 16-bit DCT coefficients (8x8, row-major)
 *   r2 = OUTPUT_BUF : array of 8 output row pointers (4 bytes each)
 *   r3 = OUTPUT_COL : byte offset added to every row pointer
 * Out:   none (8x8 block of unsigned 8-bit samples written to output rows)
 * Clobb: r0-r3, ip, d0-d7, d16-d31, flags; d8-d15 preserved via vpush/vpop.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP, jsimd_idct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | d12    | d13
     *   5  | d14    | d15
     *   6  | d16    | d17
     *   7  | d18    | d19
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
    /* Dequantize: multiply each coefficient row by its quantization-table
     * row.  Loads of the table are interleaved with the multiplies; note
     * that q10/q11 are deliberately reloaded for the last two rows. */
    vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16 q2, q2, q10
    vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16 q3, q3, q11
    vmul.s16 q4, q4, q12
    vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16 q5, q5, q13
    vmul.s16 q6, q6, q14
    vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16 q7, q7, q15
    vmul.s16 q8, q8, q10
    vmul.s16 q9, q9, q11

    /* Pass 1: 1-D IDCT over the columns (results stay in registers). */
    idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose the 8x8 block: 4x4 vtrn ladders plus vswp of the
     * off-diagonal d-register halves. */
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.32 q2, q4
    vtrn.32 q3, q5

    vtrn.16 q6, q7
    vtrn.16 q8, q9
    vtrn.32 q6, q8
    vtrn.32 q7, q9

    vswp d12, d5
    vswp d14, d7
    vswp d16, d9
    vswp d18, d11

    /* Pass 2: 1-D IDCT over the rows. */
    idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose back to row-major order (same scheme as above). */

    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.32 q2, q4
    vtrn.32 q3, q5

    vtrn.16 q6, q7
    vtrn.16 q8, q9
    vtrn.32 q6, q8
    vtrn.32 q7, q9

    vswp d12, d5
    vswp d14, d7
    vswp d16, d9
    vswp d18, d11

    /* Descale and range limit: add the sample bias (128) pre-shifted by 5,
     * then narrow with a saturating unsigned shift-right by 5 so each
     * result lands in [0, 255]. */
    vmov.s16 q15, #(0x80 << 5)
    vqadd.s16 q2, q2, q15
    vqadd.s16 q3, q3, q15
    vqadd.s16 q4, q4, q15
    vqadd.s16 q5, q5, q15
    vqadd.s16 q6, q6, q15
    vqadd.s16 q7, q7, q15
    vqadd.s16 q8, q8, q15
    vqadd.s16 q9, q9, q15
    vqshrun.s16 d4, q2, #5
    vqshrun.s16 d6, q3, #5
    vqshrun.s16 d8, q4, #5
    vqshrun.s16 d10, q5, #5
    vqshrun.s16 d12, q6, #5
    vqshrun.s16 d14, q7, #5
    vqshrun.s16 d16, q8, #5
    vqshrun.s16 d18, q9, #5

    /* Store results to the output buffer: one 8-byte row per pointer
     * fetched from OUTPUT_BUF, each offset by OUTPUT_COL. */
    .irp x, d4, d6, d8, d10, d12, d14, d16, d18
    ldr TMP, [OUTPUT_BUF], #4
    add TMP, TMP, OUTPUT_COL
    vst1.8 {\x}, [TMP]!
    .endr

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

/* Fixed-point scaling used by the FIX_* constants below. */
#define CONST_BITS 13

#define FIX_0_211164243 (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065        /* d0[0] */
    .short -FIX_0_765366865       /* d0[1] */
    .short -FIX_0_211164243       /* d0[2] */
    .short FIX_1_451774981        /* d0[3] */
    .short -FIX_2_172734803       /* d1[0] */
    .short FIX_1_061594337        /* d1[1] */
    .short -FIX_0_509795579       /* d1[2] */
    .short -FIX_0_601344887       /* d1[3] */
    .short FIX_0_899976223        /* d2[0] */
    .short FIX_2_562915447        /* d2[1] */
    .short 1 << (CONST_BITS+1)    /* d2[2] */
    .short 0                      /* d2[3] */
/* 1-D 4-point IDCT helper (one pass over four of the eight input rows).
 *
 * \x4..\x16 : the seven non-zero input coefficient rows (row 4 of the
 *             8x8 block is never used by the reduced-size IDCT).
 * \shift    : descale amount; when > 16 a vrshr+vmovn pair is used because
 *             vrshrn's immediate shift only reaches 16.
 * \y26..\y29: the four 16-bit output rows.
 * Clobbers q10, q12, q13, q14, q15.  d0-d2 must hold the constants loaded
 * from jsimd_idct_4x4_neon_consts.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    /* Outputs 0 and 3: even part +/- first odd accumulator. */
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    /* Shift is out of range for vrshrn; round-shift in 32 bits first,
     * then narrow by truncation. */
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y29, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y29, q14, #\shift
.endif

    /* Outputs 1 and 2: mirrored even part +/- second odd accumulator. */
    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q15, q15, #\shift
    vmovn.s32 \y27, q10
    vmovn.s32 \y28, q15
.else
    vrshrn.s32 \y27, q10, #\shift
    vrshrn.s32 \y28, q15, #\shift
.endif

.endm

/*
 * jsimd_idct_4x4_neon(dct_table, coef_block, output_buf, output_col)
 *
 * In (AAPCS):
 *   r0 = DCT_TABLE  : 64 16-bit dequantization multipliers
 *   r1 = COEF_BLOCK : 64 16-bit DCT coefficients (8x8, row-major)
 *   r2 = OUTPUT_BUF : array of 4 output row pointers (4 bytes each)
 *   r3 = OUTPUT_COL : byte offset added to every row pointer
 * Out:   none (4x4 block of unsigned 8-bit samples written to output rows)
 * Clobb: r0-r3, ip, d0-d7, d16-d31, flags; d8-d15 preserved via vpush/vpop.
 * Note:  TMP1-TMP4 reuse the argument registers once the arguments are
 *        no longer needed.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -      (row 4 is skipped — unused by the 4x4 IDCT)
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16 {d16, d17}, [COEF_BLOCK]!
    /* Dequantize (row 4 of the table is skipped to match the loads above). */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15


    /* Pass 1: column IDCT, done in two 4-column halves, each transposed
     * in place afterwards. */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2: row IDCT over the four needed rows; shift 19 descales by
     * CONST_BITS+PASS1_BITS+3 in one step. */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit: re-center around 128, then saturate to [0, 255]. */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    /* Byte-wise stores: safe regardless of output-row alignment. */
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
    vst1.8 {d27[1]}, [TMP3]!
    vst1.8 {d26[2]}, [TMP1]!
    vst1.8 {d27[2]}, [TMP3]!
    vst1.8 {d26[3]}, [TMP1]!
    vst1.8 {d27[3]}, [TMP3]!

    vst1.8 {d26[4]}, [TMP2]!
    vst1.8 {d27[4]}, [TMP4]!
    vst1.8 {d26[5]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP4]!
    vst1.8 {d26[6]}, [TMP2]!
    vst1.8 {d27[6]}, [TMP4]!
    vst1.8 {d26[7]}, [TMP2]!
    vst1.8 {d27[7]}, [TMP4]!
#endif

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095  /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785  /* d0[3] */

/* 1-D 2-point IDCT helper (second pass).
 *
 * \x4..\x16 : the five coefficient rows used by the 2x2 IDCT
 *             (rows 2, 4 and 6 of the 8x8 block are never used).
 * \shift    : descale amount; > 16 takes the vrshr+vmovn path (vrshrn's
 *             immediate only reaches 16).
 * \y26,\y27 : the two 16-bit output rows.
 * Clobbers q10, q13, q14.  d0 must hold the constants above.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16 q14, \x4, #15
    vmull.s16 q13, \x6, d0[3]
    vmlal.s16 q13, \x10, d0[2]
    vmlal.s16 q13, \x12, d0[1]
    vmlal.s16 q13, \x16, d0[0]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y27, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y27, q14, #\shift
.endif

.endm

/*
 * jsimd_idct_2x2_neon(dct_table, coef_block, output_buf, output_col)
 *
 * In (AAPCS):
 *   r0 = DCT_TABLE  : 64 16-bit dequantization multipliers
 *   r1 = COEF_BLOCK : 64 16-bit DCT coefficients (8x8, row-major)
 *   r2 = OUTPUT_BUF : array of 2 output row pointers (4 bytes each)
 *   r3 = OUTPUT_COL : byte offset added to every row pointer
 * Out:   none (2x2 block of unsigned 8-bit samples written to output rows)
 * Clobb: r0-r3, ip, d0-d7, d16-d31, flags; d8-d15 preserved via vpush/vpop.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    vld1.16 {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -      (even rows 2/4/6 are skipped — unused here)
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */

    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d10, d11}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d16, d17}, [COEF_BLOCK]!
    /* Dequantize (the table pointer skips the same unused rows). */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16 q2, q2, q9
    vmul.s16 q3, q3, q10
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d24, d25}, [DCT_TABLE]!
    vmul.s16 q5, q5, q12
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27}, [DCT_TABLE]!
    vmul.s16 q6, q6, q13
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d30, d31}, [DCT_TABLE]!
    vmul.s16 q8, q8, q15

    /* Pass 1: column IDCT, both 4-column halves processed explicitly
     * (same math as the idct_helper macro, unrolled for both halves). */
    vmull.s16 q13, d6, d0[3]
    vmlal.s16 q13, d10, d0[2]
    vmlal.s16 q13, d12, d0[1]
    vmlal.s16 q13, d16, d0[0]
    vmull.s16 q12, d7, d0[3]
    vmlal.s16 q12, d11, d0[2]
    vmlal.s16 q12, d13, d0[1]
    vmlal.s16 q12, d17, d0[0]
    vshll.s16 q14, d4, #15
    vshll.s16 q15, d5, #15
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13
    vrshrn.s32 d4, q10, #13
    vrshrn.s32 d6, q14, #13
    vadd.s32 q10, q15, q12
    vsub.s32 q14, q15, q12
    vrshrn.s32 d5, q10, #13
    vrshrn.s32 d7, q14, #13
    /* Partial transpose so pass 2 sees the columns it needs. */
    vtrn.16 q2, q3
    vtrn.32 q3, q5

    /* Pass 2: row IDCT; shift 20 performs the final descale. */
    idct_helper d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: re-center around 128, saturate to [0, 255]. */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer.  Lane selection ([0]/[4] for
     * row 0, [1]/[5] for row 1) follows the pass-1 vtrn layout. */
    ldmia OUTPUT_BUF, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[4]}, [TMP1]!
    vst1.8 {d26[1]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP2]!

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_rgba8888_convert_neon
 * jsimd_ycc_rgb565_convert_neon
 * Colorspace conversion YCbCr -> RGB
 */


/* Load \size pixels of Y/U/V data into d0/d4/d5.  Sizes 4/2/1 fill
 * successive lanes so that partial macroblocks accumulate into the same
 * registers (4 fills lanes 0-3, then 2 fills 4-5, then 1 fills lane 6). */
.macro do_load size
.if \size == 8
    vld1.8 {d4}, [U]!
    vld1.8 {d5}, [V]!
    vld1.8 {d0}, [Y]!
    /* Prefetch the next cache lines of each plane. */
    pld [Y, #64]
    pld [U, #64]
    pld [V, #64]
.elseif \size == 4
    vld1.8 {d4[0]}, [U]!
    vld1.8 {d4[1]}, [U]!
    vld1.8 {d4[2]}, [U]!
    vld1.8 {d4[3]}, [U]!
    vld1.8 {d5[0]}, [V]!
    vld1.8 {d5[1]}, [V]!
    vld1.8 {d5[2]}, [V]!
    vld1.8 {d5[3]}, [V]!
    vld1.8 {d0[0]}, [Y]!
    vld1.8 {d0[1]}, [Y]!
    vld1.8 {d0[2]}, [Y]!
    vld1.8 {d0[3]}, [Y]!
.elseif \size == 2
    vld1.8 {d4[4]}, [U]!
    vld1.8 {d4[5]}, [U]!
    vld1.8 {d5[4]}, [V]!
    vld1.8 {d5[5]}, [V]!
    vld1.8 {d0[4]}, [Y]!
    vld1.8 {d0[5]}, [Y]!
.elseif \size == 1
    vld1.8 {d4[6]}, [U]!
    vld1.8 {d5[6]}, [V]!
    vld1.8 {d0[6]}, [Y]!
.else
    .error unsupported macroblock size
.endif
.endm




/* Store \size converted pixels from d10 (R), d11 (G), d12 (B) and
 * d13 (A, pre-set to 0xFF) to [RGB], in the layout selected by \bpp
 * (16 = RGB565, 24 = packed RGB, 32 = RGBA). */
.macro do_store bpp, size
.if \bpp == 16
    /* if 16 bits, pack into RGB565 format: d27 = RRRRRGGG (high byte),
     * d26 = GGGBBBBB (low byte); vst2.8 interleaves them little-endian. */
    vmov d27, d10          /* insert red channel */
    vsri.u8 d27, d11, #5   /* shift and insert the green channel */
    vsli.u8 d26, d11, #3
    vsri.u8 d26, d12, #3   /* shift and insert the blue channel */

.if \size == 8
    vst2.8 {d26, d27}, [RGB]!
.elseif \size == 4
    vst2.8 {d26[0], d27[0]}, [RGB]!
    vst2.8 {d26[1], d27[1]}, [RGB]!
    vst2.8 {d26[2], d27[2]}, [RGB]!
    vst2.8 {d26[3], d27[3]}, [RGB]!
.elseif \size == 2
    vst2.8 {d26[4], d27[4]}, [RGB]!
    vst2.8 {d26[5], d27[5]}, [RGB]!
.elseif \size == 1
    vst2.8 {d26[6], d27[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 24
.if \size == 8
    vst3.8 {d10, d11, d12}, [RGB]!
.elseif \size == 4
    vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
    vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
    vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
    vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
    vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
    vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
    vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
    vst4.8 {d10, d11, d12, d13}, [RGB]!
.elseif \size == 4
    vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
    vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
    vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.else
    .error unsupported bpp
.endif
.endm

/* Generate one YCbCr->RGB conversion function for color format \colorid
 * with \bpp bits per pixel.  \r_offs/\g_offs/\b_offs pick the destination
 * d1x register (d10/d11/d12) for each channel via token pasting. */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/* Convert the up-to-8 pixels in d0 (Y), d4 (U), d5 (V) into d10-d12.
 * d1 holds the four conversion coefficients, d2/d3 hold -128 (so the
 * vaddw.u8 below yields the chroma values re-centered around zero). */
.macro do_yuv_to_rgb
    vaddw.u8 q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5     /* q4 = v - 128 */
    vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    /* d1\*_offs expands to d10/d11/d12 per the channel offsets. */
    vqmovun.s16 d1\g_offs, q10
    vqmovun.s16 d1\r_offs, q12
    vqmovun.s16 d1\b_offs, q14
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0                 /* d0: padding so d1-d3 get the data */
    .short 22971, -11277, -23401, 29033  /* d1: conversion coefficients */
    .short -128, -128, -128, -128     /* d2: chroma bias */
    .short -128, -128, -128, -128     /* d3: chroma bias */

/*
 * jsimd_ycc_<colorid>_convert_neon(output_width, input_buf, input_row,
 *                                  output_buf, num_rows)
 *
 * In (AAPCS):
 *   r0        = OUTPUT_WIDTH : pixels per row
 *   r1        = INPUT_BUF    : array of 3 plane pointers (Y, U, V), each a
 *                              pointer to an array of row pointers
 *   r2        = INPUT_ROW    : first row index within each plane
 *   r3        = OUTPUT_BUF   : array of output row pointers
 *   [sp, #32] = NUM_ROWS     : row count (5th arg, read after the push)
 * Clobb: NEON d0-d7, d16-d31; d8-d15 and r4-r10 are saved and restored.
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    INPUT_ROW .req r2
    OUTPUT_BUF .req r3
    NUM_ROWS .req r4

    INPUT_BUF0 .req r5
    INPUT_BUF1 .req r6
    INPUT_BUF2 .req INPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]  /* 5th argument sits above the 8 pushes */
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #4]
    ldr INPUT_BUF2, [INPUT_BUF, #8]
    .unreq INPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF (d13 = constant alpha) */
    vmov.u8 q5, #255
    vmov.u8 q6, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels: full 8-pixel macroblocks first... */
    subs N, N, #8
    blt 2f
1:
    do_load 8
    do_yuv_to_rgb
    do_store \bpp, 8
    subs N, N, #8
    bge 1b
    tst N, #7
    beq 8f
2:
    /* ...then the 0-7 remaining pixels, assembled from 4/2/1-pixel loads
     * into one partial macroblock, converted once, and stored piecewise. */
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_yuv_to_rgb

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon rgba8888, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 1, 2


.purgem do_load
.purgem do_store

/*****************************************************************************/