1 /* 2 * ARMv7 NEON optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 5 * All rights reserved. 6 * Author: Siarhei Siamashka <siarhei.siamashka (at) nokia.com> 7 * 8 * This software is provided 'as-is', without any express or implied 9 * warranty. In no event will the authors be held liable for any damages 10 * arising from the use of this software. 11 * 12 * Permission is granted to anyone to use this software for any purpose, 13 * including commercial applications, and to alter it and redistribute it 14 * freely, subject to the following restrictions: 15 * 16 * 1. The origin of this software must not be misrepresented; you must not 17 * claim that you wrote the original software. If you use this software 18 * in a product, an acknowledgment in the product documentation would be 19 * appreciated but is not required. 20 * 2. Altered source versions must be plainly marked as such, and must not be 21 * misrepresented as being the original software. 22 * 3. This notice may not be removed or altered from any source distribution. 
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
/* NOTE(review): .object_arch pins the recorded object-file architecture at
 * ARMv4 even though ARMv7-A/NEON instructions are emitted below — presumably
 * so the object links into images built for older cores, with the NEON code
 * paths selected at run time.  Confirm against the build system. */
.object_arch armv4
.arm


/* When non-zero, jsimd_idct_4x4_neon (below) stores its output one byte at a
 * time instead of using 32-bit stores to possibly-unaligned output rows. */
#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes.
 * Emits a global entry point named \fname: on Apple (Mach-O) targets the
 * symbol gets the required leading underscore; on ELF targets it is marked
 * hidden and given type %function. */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers,
 * using the usual two-level VTRN.16 / VTRN.32 interleave sequence. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm


/* Bias added when converting centered (signed) samples back to the unsigned
 * JSAMPLE range in the store epilogues below. */
#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

/* Fixed-point multiplier constants.  NOTE(review): the values match
 * FIX(x) = x * 2^13 rounded (e.g. 0.298631336 * 8192 ~= 2446), i.e.
 * CONST_BITS = 13 as in jidctint.c — confirm against the C reference. */
#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

/* Precomputed sums/differences of the constants above, so that each product
 * in the butterflies needs only a single 16-bit multiplier lane. */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 *
 * This macro is documentation for the NEON code below: it is not invoked
 * anywhere in this file.  MULTIPLY/DCTELEM/INT32 are not defined here —
 * presumably they come from the jpeg headers (MULTIPLY = 16x16->32 multiply);
 * confirm against jidctint.c before relying on it.
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0 = q4; \
    tmp1 = q5; \
    tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3 = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

/* Scalar-lane aliases for the multipliers; d0-d2 are loaded from
 * jsimd_idct_islow_neon_consts (see the 'load constants' vld1 below). */
#define XFIX_0_899976223 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_2_562915447 d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
#define XFIX_1_175875602 d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants: d2 is clobbered during processing and is
     * refetched from here ('reload constants' vld1 of [ip] below). */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

/* Argument registers (AAPCS r0-r3); the TMP aliases reuse the same
 * registers once the corresponding arguments have been consumed. */
    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

/* One d-register per half-row of the 8x8 block: ROWnL = columns 0-3 of
 * row n, ROWnR = columns 4-7 (see the allocation table below). */
    ROW0L .req d16
    ROW0R .req d17
    ROW1L .req d18
    ROW1R .req d19
    ROW2L .req d20
    ROW2R .req d21
    ROW3L .req d22
    ROW3R .req d23
    ROW4L .req d24
    ROW4R .req d25
    ROW5L .req d26
    ROW5R .req d27
    ROW6L .req d28
    ROW6R .req d29
    ROW7L .req d30
    ROW7R .req d31

/* Load and dequantize coefficients into NEON registers
 * with the following allocation:
 *       0 1 2 3 | 4 5 6 7
 *      ---------+--------
 *   0  | d16    | d17     ( q8  )
 *   1  | d18    | d19     ( q9  )
 *   2  | d20    | d21     ( q10 )
 *   3  | d22    | d23     ( q11 )
 *   4  | d24    | d25     ( q12 )
 *   5  | d26    | d27     ( q13 )
 *   6  | d28    | d29     ( q14 )
 *   7  | d30    | d31     ( q15 )
 *
 * Dequantization is a lane-wise vmul of each coefficient row with the
 * matching row of the quantization table (DCT_TABLE).
 */
    adr ip, jsimd_idct_islow_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add ip, ip, #16
    vmul.s16 q15, q15, q3
    vpush {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half.
     *
     * Interleaved with the NEON arithmetic, the ARM core scans the right
     * 4x8 half of the (already consumed) coefficient block for zeros:
     * each LDRD fetches the four 16-bit coefficients of columns 4-7 of
     * one row (COEF_BLOCK has been advanced by 96 bytes above, hence the
     * -96 rebase), and they are ORed together into r0.
     *
     * Note: the explicit second destination register (r4, r5) is required —
     * the old single-register LDRD form is deprecated and rejected by some
     * assemblers (e.g. clang's integrated assembler); the encoding and
     * behavior are identical.
     */
    vadd.s16 d4, ROW7L, ROW3L
    vadd.s16 d5, ROW5L, ROW1L
    vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d5, XFIX_1_175875602
    vmull.s16 q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push {r4, r5}
    vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW4L
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr r0, r4, r5
    vmov q4, q6
    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    orr r0, r0, r4
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    orr r0, r0, r5
    vadd.s32 q1, q3, q2
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov q5, q7
    vadd.s32 q1, q1, q6
    orr r0, r0, r4
    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
    orr r0, r0, r5
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1L, q1, #11
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr r0, r0, r4
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    orr r0, r0, r5
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16 q6, ROW6L, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    orr r0, r0, r4
    vrshrn.s32 ROW6L, q1, #11
    orr r0, r0, r5
    vadd.s32 q1, q3, q5
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW4L
    orr r0, r0, r4
    vrshrn.s32 ROW2L, q1, #11
    orr r0, r0, r5
    vrshrn.s32 ROW5L, q3, #11
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr r0, r0, r4
    vadd.s32 q2, q5, q6
    /* orrs sets the flags on the OR of rows 1-7 (right half); the beq
     * below consumes them.  Row 0's right half is accumulated separately
     * into r0 afterwards and tested later at 'cmp r0, #0' (label 3). */
    orrs r0, r0, r5
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    ldrd r4, r5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    orr r0, r4, r5
    vsub.s32 q3, q1, q4
    pop {r4, r5}
    vrshrn.s32 ROW7L, q2, #11
    vrshrn.s32 ROW3L, q5, #11
    vrshrn.s32 ROW0L, q6, #11
    vrshrn.s32 ROW4L, q3, #11

    beq 3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half (interleaved with the transpose of
     * the already-finished left half). */
    vld1.s16 {d2}, [ip, :64] /* reload constants */
    vadd.s16 d10, ROW7R, ROW3R
    vadd.s16 d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d8, XFIX_1_175875602
    vtrn.16 ROW2L, ROW3L
    vmull.s16 q7, d10, XFIX_1_175875602
    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16 ROW0L, ROW1L
    vsubl.s16 q3, ROW0R, ROW4R
    vmull.s16 q2, ROW2R, XFIX_0_541196100
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16 ROW4L, ROW5L
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32 ROW1L, ROW3L
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
    vtrn.32 ROW4L, ROW6L
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vtrn.32 ROW0L, ROW2L
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1R, q1, #11
    vtrn.32 ROW5L, ROW7L
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vrshrn.s32 ROW6R, q1, #11
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0R, ROW4R
    vrshrn.s32 ROW2R, q1, #11
    vrshrn.s32 ROW5R, q3, #11
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vrshrn.s32 ROW7R, q2, #11
    vrshrn.s32 ROW3R, q5, #11
    vrshrn.s32 ROW0R, q6, #11
    vrshrn.s32 ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16 ROW6R, ROW7R
    vtrn.16 ROW2R, ROW3R
    vtrn.16 ROW0R, ROW1R
    vtrn.16 ROW4R, ROW5R
    vtrn.32 ROW1R, ROW3R
    vtrn.32 ROW4R, ROW6R
    vtrn.32 ROW0R, ROW2R
    vtrn.32 ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16 {d2}, [ip, :64] /* reload constants */
    vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW1L, XFIX_1_175875602
    /* (continuation of pass 2, left 4x8 half: after the pass-1 transposes,
     * logical half-rows live in swapped physical registers; the inline
     * 'ROWxL <-> ROWyR' comments record which alias actually holds what.) */
    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov q4, q6
    vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    /* pass 2 narrows with a plain vshrn #16 (high halfword); the final
     * descale/rounding happens in the vqrshrn #2 at label 2 below */
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16 {d2}, [ip, :64] /* reload constants */
    vmull.s16 q6, ROW5R, XFIX_1_175875602
    vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16 q7, ROW7R, XFIX_1_175875602
    vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop {d8-d15} /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16 q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8 q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vtrn.16 q10, q11
    /* Store results to the output buffer: one 8-byte row per JSAMPROW
     * fetched from OUTPUT_BUF, offset by OUTPUT_COL. */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vadd.u8 q10, q10, q0
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vtrn.8 d22, d23
    vst1.8 {d20}, [TMP1]
    vadd.u8 q11, q11, q0
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vtrn.16 ROW2L, ROW3L
    vtrn.16 ROW0L, ROW1L
    vtrn.16 ROW4L, ROW5L
    vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32 ROW1L, ROW3L
    vtrn.32 ROW4L, ROW6L
    vtrn.32 ROW0L, ROW2L
    vtrn.32 ROW5L, ROW7L

    /* r0 still holds the OR of row 0's right-half coefficients, computed
     * during the pass-1 zero scan above. */
    cmp r0, #0
    beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half: broadcast its four
     * (shifted) coefficients so every row of the transposed right half is
     * that same column value, then run the normal second pass. */
    vdup.s16 ROW1R, ROW0R[1]
    vdup.s16 ROW2R, ROW0R[2]
    vdup.s16 ROW3R, ROW0R[3]
    vdup.s16 ROW4R, ROW0R[0]
    vdup.s16 ROW5R, ROW0R[1]
    vdup.s16 ROW6R, ROW0R[2]
    vdup.s16 ROW7R, ROW0R[3]
    vdup.s16 ROW0R, ROW0R[0]
    b 1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half.
     * Same butterfly structure as the normal pass 2, minus every term that
     * would multiply a known-zero row. */
    vld1.s16 {d2}, [ip, :64] /* reload constants */
    vmull.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vshll.s16 q3, ROW0L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW0L, #13
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16 {d2}, [ip, :64] /* reload constants */
    vmull.s16 q6, ROW5L, XFIX_1_175875602
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW7L, XFIX_1_175875602
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW6L, XFIX_0_541196100
    vshll.s16 q3, ROW4L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
    vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW4L, #13
    vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16
    b 2b /* Go to epilogue */

.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4

.unreq ROW0L
.unreq ROW0R
.unreq ROW1L
.unreq ROW1R
.unreq ROW2L
.unreq ROW2R
.unreq ROW3L
.unreq ROW3R
.unreq ROW4L
.unreq ROW4R
.unreq ROW5L
.unreq ROW5R
.unreq ROW6L
.unreq ROW6R
.unreq ROW7L
.unreq ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in ARM NEON case some extra additions are required because VQDMULH
 * instruction can't handle the constants larger than 1.
 * So the expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
 */

/* Scalar-lane aliases for the VQDMULH multipliers in d0. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

/* Constants are stored as (round(x * 256) - bias) * 128, i.e. the fractional
 * part of each multiplier scaled for VQDMULH; the integer part (the bias,
 * 1 or 2) is re-added with explicit VADDs in the code below. */
.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

/* Argument registers (AAPCS r0-r3); TMP aliases reuse them after the
 * arguments have been consumed. */
    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

/* Load and dequantize coefficients into NEON registers
 * with the following allocation:
 *       0 1 2 3 | 4 5 6 7
 *      ---------+--------
 *   0  | d16    | d17     ( q8  )
 *   1  | d18    | d19     ( q9  )
 *   2  | d20    | d21     ( q10 )
 *   3  | d22    | d23     ( q11 )
 *   4  | d24    | d25     ( q12 )
 *   5  | d26    | d27     ( q13 )
 *   6  | d28    | d29     ( q14 )
 *   7  | d30    | d31     ( q15 )
 */
    adr ip, jsimd_idct_ifast_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0}, [ip, :64] /* load constants */
    vmul.s16 q15, q15, q3
    vpush {d8-d13} /* save NEON registers */
    /* 1-D IDCT, pass 1 (AAN butterflies; columns are processed eight at a
     * time, one column per 16-bit lane). */
    vsub.s16 q2, q10, q14
    vadd.s16 q14, q10, q14
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vadd.s16 q10, q10, q2
    /* Transpose (interleaved with the last pass-1 additions; the vswp's
     * exchange the off-diagonal d-register halves of the q registers). */
    vtrn.16 q8, q9
    vsub.s16 q11, q12, q1
    vtrn.16 q14, q15
    vadd.s16 q12, q12, q1
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q8, q10
    vtrn.32 q13, q15
    vswp d28, d21
    vswp d26, d19
    /* 1-D IDCT, pass 2 (same butterfly sequence as pass 1, now over rows) */
    vsub.s16 q2, q10, q14
    vswp d30, d23
    vadd.s16 q14, q10, q14
    vswp d24, d17
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vpop {d8-d13} /* restore NEON registers */
    vadd.s16 q10, q10, q2
    vsub.s16 q11, q12, q1
    vadd.s16 q12, q12, q1
    /* Descale to 8-bit and range limit (saturating narrow, then re-center
     * by 0x80 = CENTERJSAMPLE). */
    vmov.u8 q0, #0x80
    vqshrn.s16 d16, q8, #5
    vqshrn.s16 d17, q9, #5
    vqshrn.s16 d18, q10, #5
    vqshrn.s16 d19, q11, #5
    vqshrn.s16 d20, q12, #5
    vqshrn.s16 d21, q13, #5
    vqshrn.s16 d22, q14, #5
    vqshrn.s16 d23, q15, #5
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vadd.u8 q10, q10, q0
    vadd.u8 q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    /* Store results to the output buffer: one 8-byte row per JSAMPROW
     * fetched from OUTPUT_BUF, offset by OUTPUT_COL. */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vst1.8 {d20}, [TMP1]
    vtrn.8 d22, d23
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

.unreq DCT_TABLE
.unreq COEF_BLOCK
.unreq OUTPUT_BUF
.unreq OUTPUT_COL
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 * requires much less arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit exact compatibility with jpeg-6b.
 *
 * TODO: a bit better instructions scheduling can be achieved by expanding
 * idct_helper/transpose_4x4 macros and reordering instructions,
 * but readability will suffer somewhat.
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

/* Multiplier table, loaded into d0-d2 by the function below (d2[2] holds
 * 1 << (CONST_BITS+1), used as the DC scale in idct_helper). */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

/* One 1-D 4-point iDCT over four 64-bit vectors: inputs are rows/cols
 * 0,1,2,3,5,6,7 of the block (\x4..\x16); results are descaled by \shift
 * and narrowed into \y26..\y29.  For shift > 16 the round+narrow is split
 * into vrshr + vmovn because vrshrn only encodes shifts up to 16. */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y29, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y29, q14, #\shift
.endif

    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q15, q15, #\shift
    vmovn.s32 \y27, q10
    vmovn.s32 \y28, q15
.else
    vrshrn.s32 \y27, q10, #\shift
    vrshrn.s32 \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -        (row 4 is unused by the 4x4 reduction)
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16 /* skip row 4 of the quant table too */
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15

    /* Pass 1 */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2 */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    /* Byte-granular stores: safe for any output alignment. */
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
1080 vst1.8 {d27[1]}, [TMP3]! 1081 vst1.8 {d26[2]}, [TMP1]! 1082 vst1.8 {d27[2]}, [TMP3]! 1083 vst1.8 {d26[3]}, [TMP1]! 1084 vst1.8 {d27[3]}, [TMP3]! 1085 1086 vst1.8 {d26[4]}, [TMP2]! 1087 vst1.8 {d27[4]}, [TMP4]! 1088 vst1.8 {d26[5]}, [TMP2]! 1089 vst1.8 {d27[5]}, [TMP4]! 1090 vst1.8 {d26[6]}, [TMP2]! 1091 vst1.8 {d27[6]}, [TMP4]! 1092 vst1.8 {d26[7]}, [TMP2]! 1093 vst1.8 {d27[7]}, [TMP4]! 1094 #endif 1095 1096 vpop {d8-d15} 1097 bx lr 1098 1099 .unreq DCT_TABLE 1100 .unreq COEF_BLOCK 1101 .unreq OUTPUT_BUF 1102 .unreq OUTPUT_COL 1103 .unreq TMP1 1104 .unreq TMP2 1105 .unreq TMP3 1106 .unreq TMP4 1107 1108 .purgem idct_helper 1109 1110 1111 /*****************************************************************************/ 1112 1113 /* 1114 * jsimd_idct_2x2_neon 1115 * 1116 * This function contains inverse-DCT code for getting reduced-size 1117 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations 1118 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' 1119 * function from jpeg-6b (jidctred.c). 1120 * 1121 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which 1122 * requires much less arithmetic operations and hence should be faster. 1123 * The primary purpose of this particular NEON optimized function is 1124 * bit exact compatibility with jpeg-6b. 
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822           /* d0[0] */
    .short FIX_0_850430095            /* d0[1] */
    .short -FIX_1_272758580           /* d0[2] */
    .short FIX_3_624509785            /* d0[3] */

/*
 * One 2-point IDCT pass.  \x4 is the DC row register; \x6, \x10, \x12 and
 * \x16 are the four odd AC rows (1, 3, 5, 7) that contribute to the 2x2
 * output.  \shift and \y26/\y27 behave as in the 4x4 helper above; the
 * shift > 16 path exists because VRSHRN's immediate is limited to 16 when
 * narrowing 32-bit lanes.  Clobbers q10, q13 and q14.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y27, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y27, q14, #\shift
.endif

.endm

/*
 * In:  r0 = dct_table, r1 = coef_block, r2 = output_buf (2 row pointers),
 *      r3 = output_col.  Writes a 2x2 pixel block.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d4    |  d5
     *   1  |  d6    |  d7
     *   2  |  -     |  -
     *   3  |  d10   |  d11
     *   4  |  -     |  -
     *   5  |  d12   |  d13
     *   6  |  -     |  -
     *   7  |  d16   |  d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 2 (unused) */
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 (unused) */
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 6 (unused) */
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize (table rows 2, 4, 6 are skipped to match the loads) */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    /* Reference (non-pipelined) form, kept for documentation: the
     * hand-interleaved sequence below computes both column halves at once.
     */
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: add back the 0x80 sample bias, saturate to u8.
     * Note the register aliasing trick: q13 = d26:d27.  The first VQMOVUN
     * overwrites d26 (the low half of q13), but the second one still reads
     * the original 16-bit data from the untouched d27 half into d27[4..7].
     */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer: two pixels per output row */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */


/*
 * Load \size Y/Cb/Cr samples into d0 (Y), d4 (Cb/U) and d5 (Cr/V).
 * size == 8 loads a full aligned group; 4/2/1 fill successive lanes so
 * that the 4 + 2 + 1 tail cases together cover any remainder of 1-7.
 */
.macro do_load size
.if \size == 8
    vld1.8          {d4}, [U, :64]!
    vld1.8          {d5}, [V, :64]!
    vld1.8          {d0}, [Y, :64]!
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
.elseif \size == 4
    vld1.8          {d4[0]}, [U]!
    vld1.8          {d4[1]}, [U]!
    vld1.8          {d4[2]}, [U]!
    vld1.8          {d4[3]}, [U]!
    vld1.8          {d5[0]}, [V]!
    vld1.8          {d5[1]}, [V]!
    vld1.8          {d5[2]}, [V]!
    vld1.8          {d5[3]}, [V]!
    vld1.8          {d0[0]}, [Y]!
    vld1.8          {d0[1]}, [Y]!
    vld1.8          {d0[2]}, [Y]!
    vld1.8          {d0[3]}, [Y]!
.elseif \size == 2
    vld1.8          {d4[4]}, [U]!
    vld1.8          {d4[5]}, [U]!
    vld1.8          {d5[4]}, [V]!
    vld1.8          {d5[5]}, [V]!
    vld1.8          {d0[4]}, [Y]!
    vld1.8          {d0[5]}, [Y]!
.elseif \size == 1
    vld1.8          {d4[6]}, [U]!
    vld1.8          {d5[6]}, [V]!
    vld1.8          {d0[6]}, [Y]!
.else
    .error unsupported macroblock size
.endif
.endm

/*
 * Store \size converted pixels from d10-d12 (24 bpp) or d10-d13 (32 bpp)
 * to [RGB], interleaving the color planes with VST3/VST4.
 */
.macro do_store bpp, size
.if \bpp == 24
.if \size == 8
    vst3.8          {d10, d11, d12}, [RGB]!
.elseif \size == 4
    vst3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vst3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vst3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vst3.8          {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
    vst3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vst3.8          {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
    vst3.8          {d10[6], d11[6], d12[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
    vst4.8          {d10, d11, d12, d13}, [RGB]!
.elseif \size == 4
    vst4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vst4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vst4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vst4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
    vst4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vst4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
    vst4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.else
    .error unsupported bpp
.endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen the chroma samples, subtract the 128 bias (q1 holds -128)
 * and start the long fixed-point multiplies into q10-q15.
 */
.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4        /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5        /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1]    /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]    /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]    /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]    /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]    /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]    /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]    /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]    /* multiply by 29033 */
.endm

/* Stage 2: narrow the accumulators with rounding, add the Y samples (d0)
 * and saturate each plane into its output d-register (d10-d13 selected by
 * the \r_offs/\g_offs/\b_offs digits of the color format).
 */
.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

/* Software-pipelined steady-state iteration: stage 2 of group N, the store
 * of group N, and stage 1 (plus loads/prefetches) of group N+1, all
 * interleaved to hide the load and multiply latencies.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vld1.8          {d0}, [Y, :64]!
    vqmovun.s16     d1\r_offs, q12
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
    vqmovun.s16     d1\b_offs, q14
    vaddw.u8        q3, q1, d4        /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5        /* q4 = v - 128 */
    do_store        \bpp, 8
    vmull.s16       q10, d6, d1[1]    /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]    /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]    /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]    /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]    /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]    /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]    /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]    /* multiply by 29033 */
.endm

/* Non-pipelined conversion of one (possibly partial) group */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0                       /* d0: padding only */
    .short 22971, -11277, -23401, 29033     /* d1: conversion multipliers */
    .short -128, -128, -128, -128           /* d2 \ q1: -128 chroma bias  */
    .short -128, -128, -128, -128           /* d3 /                       */

/*
 * jsimd_ycc_<colorid>_convert_neon
 *
 * In:  r0   = output_width (pixels per row)
 *      r1   = input_buf  (pointers to the Y, Cb and Cr plane row arrays)
 *      r2   = input_row  (starting row index)
 *      r3   = output_buf (array of output row pointers)
 *      [sp] = num_rows
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]      /* 5th arg, above the push */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF: in the 32-bpp formats the
     * plane that the conversion never writes (the X/alpha byte) then
     * stores as 255.
     */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels, 8 at a time through the pipelined path */
    subs            N, N, #8
    blt             3f                            /* fewer than 8 pixels total */
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
3:  /* handle the 1-7 remaining pixels via the 4/2/1 lane loads.
     * Note: numeric local labels are deliberately reused; each branch
     * binds to the nearest following definition.
     */
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

/*
 * Store \size converted samples: Y plane from d20, Cb from d21, Cr from
 * d22.  The 4/2/1 lane variants combine to cover any 1-7 pixel remainder.
 */
.macro do_store size
.if \size == 8
    vst1.8          {d20}, [Y]!
    vst1.8          {d21}, [U]!
    vst1.8          {d22}, [V]!
.elseif \size == 4
    vst1.8          {d20[0]}, [Y]!
    vst1.8          {d20[1]}, [Y]!
    vst1.8          {d20[2]}, [Y]!
    vst1.8          {d20[3]}, [Y]!
    vst1.8          {d21[0]}, [U]!
    vst1.8          {d21[1]}, [U]!
    vst1.8          {d21[2]}, [U]!
    vst1.8          {d21[3]}, [U]!
    vst1.8          {d22[0]}, [V]!
    vst1.8          {d22[1]}, [V]!
    vst1.8          {d22[2]}, [V]!
    vst1.8          {d22[3]}, [V]!
.elseif \size == 2
    vst1.8          {d20[4]}, [Y]!
    vst1.8          {d20[5]}, [Y]!
    vst1.8          {d21[4]}, [U]!
    vst1.8          {d21[5]}, [U]!
    vst1.8          {d22[4]}, [V]!
    vst1.8          {d22[5]}, [V]!
.elseif \size == 1
    vst1.8          {d20[6]}, [Y]!
    vst1.8          {d21[6]}, [U]!
    vst1.8          {d22[6]}, [V]!
.else
    .error unsupported macroblock size
.endif
.endm

/*
 * Load \size RGB(X) pixels into d10-d12 (24 bpp) or d10-d13 (32 bpp),
 * de-interleaving the color planes with VLD3/VLD4.
 */
.macro do_load bpp, size
.if \bpp == 24
.if \size == 8
    vld3.8          {d10, d11, d12}, [RGB]!
    pld             [RGB, #128]
.elseif \size == 4
    vld3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vld3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vld3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vld3.8          {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
    vld3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vld3.8          {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
    vld3.8          {d10[6], d11[6], d12[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
    vld4.8          {d10, d11, d12, d13}, [RGB]!
    pld             [RGB, #128]
.elseif \size == 4
    vld4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vld4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vld4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vld4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
    vld4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vld4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
    vld4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
    .error unsupported macroblock size
.endif
.else
    .error unsupported bpp
.endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen the selected R/G/B byte planes to 16 bits and start the
 * fixed-point accumulations.  q7/q8 accumulate Y; q9/q13 accumulate Cb and
 * q14/q15 accumulate Cr, each pre-seeded from q1 via VREV64.32 (rounding
 * constant + bias, see the consts table below).
 */
.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs     /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs     /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs     /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

/* Stage 2: scale the accumulators down by 2^16 (rounding for Y, truncating
 * for the pre-biased chroma) and narrow to bytes in d20-d22.
 */
.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10          /* d20 = y */
    vmovn.u16       d21, q11          /* d21 = u */
    vmovn.u16       d22, q12          /* d22 = v */
.endm

/* Non-pipelined conversion of one (possibly partial) group */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* Software-pipelined steady-state iteration: stage 2 + store of group N
 * interleaved with the load + stage 1 of group N+1 to hide latencies.
 */
.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10          /* d20 = y */
    vmovl.u8        q2, d1\r_offs     /* r = { d4, d5 } */
    vmovn.u16       d21, q11          /* d21 = u */
    vmovl.u8        q3, d1\g_offs     /* g = { d6, d7 } */
    vmovn.u16       d22, q12          /* d22 = v */
    vmovl.u8        q4, d1\b_offs     /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    /* Fixed-point conversion coefficients -- presumably the usual
     * RGB->YCbCr weights scaled by 2^16 (d0/d1), with bias/rounding terms
     * in d2/d3 (q1); TODO(review): confirm against jccolor.c.
     */
    .short 19595, 38470, 7471, 11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128, 32767, 128
    .short 32767, 128, 32767, 128

/*
 * jsimd_<colorid>_ycc_convert_neon
 *
 * In:  r0   = output_width (pixels per row)
 *      r1   = input_buf  (array of input row pointers)
 *      r2   = output_buf (pointers to the Y, Cb and Cr plane row arrays)
 *      r3   = output_row (starting row index)
 *      [sp] = num_rows
 */
asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]      /* 5th arg, above the push */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels, 8 at a time through the pipelined path */
    subs            N, N, #8
    blt             3f                            /* fewer than 8 pixels total */
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:  /* handle the 1-7 remaining pixels via the 4/2/1 lane loads.
     * Note: numeric local labels are deliberately reused; each branch
     * binds to the nearest following definition.
     */
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be
combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */

/*
 * jsimd_convsamp_neon
 *
 * Converts an 8x8 block of unsigned samples to signed 16-bit values by
 * subtracting CENTERJSAMPLE (128) and stores them to the workspace.
 *
 * In:  r0 = sample_data (array of 8 input row pointers)
 *      r1 = start_col   (byte offset into each row)
 *      r2 = workspace   (64 DCTELEMs, 16-byte aligned)
 */
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128          /* CENTERJSAMPLE, subtracted below */

    /* Rows 0-3: load, widen-subtract 128, store (loads and stores are
     * interleaved with the pointer arithmetic to hide latencies)
     */
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    /* Rows 4-7 */
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

/* Lane aliases for the scaled cosine constants loaded into d0 below */
#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)                 /* XFIX_0_382683433 */
    .short (139 * 128)                /* XFIX_0_541196100 */
    .short (181 * 128)                /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)    /* XFIX_1_306562965 */

/*
 * In:  r0 = data (64 DCTELEMs, 16-byte aligned, transformed in place).
 * The 2-D FDCT is done as two identical 1-D passes (rows, then -- after
 * the in-register transpose -- columns), driven by the TMP counter.
 */
asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d16   |  d17    | q8
     *   1  |  d18   |  d19    | q9
     *   2  |  d20   |  d21    | q10
     *   3  |  d22   |  d23    | q11
     *   4  |  d24   |  d25    | q12
     *   5  |  d26   |  d27    | q13
     *   6  |  d28   |  d29    | q14
     *   7  |  d30   |  d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)       /* rewind to block start */

    mov             TMP, #2                       /* two 1-D passes */
1:
    /* Transpose the 8x8 block held in q8-q15 */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT (butterflies + VQDMULH by the scaled cosine constants) */
    vadd.s16        q2, q11, q12
    vswp            d28, d21                      /* last transpose swap, hoisted */
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses 2 stage pipelining in order to improve instructions
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 *
 * Per 16-coefficient group:  |coef| + correction, multiply by the 16-bit
 * reciprocal, take the high halfword, apply the per-coefficient right
 * shift, then restore the original sign with the XOR/SUB pair.  The
 * divisors table supplies four 64-element sub-tables: reciprocals (+0),
 * corrections (+128 bytes) and shifts (+384 bytes).
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    /* Prologue: stage 1 for the first group of 16 coefficients */
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10     /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16     /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12          /* VSHL shifts left; negate for right */
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15       /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12     /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}          /* r5 pushed to keep sp 8-byte aligned */
    mov             LOOP_COUNT, #3
1:
    /* Pipelined steady state: finish (sign-restore + store) group N while
     * starting group N+1
     */
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2    /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10     /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16     /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15       /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12     /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

    /* Epilogue: drain the pipeline for the final group */
    veor.u16        q14, q14, q2      /* restore sign */
    veor.u16        q15, q15, q3
    vsub.u16        q14, q14, q2
    vsub.u16        q15, q15, q3
    vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr                /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, which can be potentially solved to get up to tens
 *       of percents performance improvement on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0.
The previous 16 source pixels are in q1. The 2178 * shifted-by-one source pixels are constructed in q2 by using q0 and q1. 2179 * Register d28 is used for multiplication by 3. Register q15 is used 2180 * for adding +1 bias. 2181 */ 2182 .macro upsample16 OUTPTR, INPTR 2183 vld1.8 {q0}, [\INPTR]! 2184 vmovl.u8 q8, d0 2185 vext.8 q2, q1, q0, #15 2186 vmovl.u8 q9, d1 2187 vaddw.u8 q10, q15, d4 2188 vaddw.u8 q11, q15, d5 2189 vmlal.u8 q8, d4, d28 2190 vmlal.u8 q9, d5, d28 2191 vmlal.u8 q10, d0, d28 2192 vmlal.u8 q11, d1, d28 2193 vmov q1, q0 /* backup source pixels to q1 */ 2194 vrshrn.u16 d6, q8, #2 2195 vrshrn.u16 d7, q9, #2 2196 vshrn.u16 d8, q10, #2 2197 vshrn.u16 d9, q11, #2 2198 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! 2199 .endm 2200 2201 /* 2202 * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16' 2203 * macro, the roles of q0 and q1 registers are reversed for even and odd 2204 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed. 2205 * Also this unrolling allows to reorder loads and stores to compensate 2206 * multiplication latency and reduce stalls. 2207 */ 2208 .macro upsample32 OUTPTR, INPTR 2209 /* even 16 pixels group */ 2210 vld1.8 {q0}, [\INPTR]! 2211 vmovl.u8 q8, d0 2212 vext.8 q2, q1, q0, #15 2213 vmovl.u8 q9, d1 2214 vaddw.u8 q10, q15, d4 2215 vaddw.u8 q11, q15, d5 2216 vmlal.u8 q8, d4, d28 2217 vmlal.u8 q9, d5, d28 2218 vmlal.u8 q10, d0, d28 2219 vmlal.u8 q11, d1, d28 2220 /* odd 16 pixels group */ 2221 vld1.8 {q1}, [\INPTR]! 2222 vrshrn.u16 d6, q8, #2 2223 vrshrn.u16 d7, q9, #2 2224 vshrn.u16 d8, q10, #2 2225 vshrn.u16 d9, q11, #2 2226 vmovl.u8 q8, d2 2227 vext.8 q2, q0, q1, #15 2228 vmovl.u8 q9, d3 2229 vaddw.u8 q10, q15, d4 2230 vaddw.u8 q11, q15, d5 2231 vmlal.u8 q8, d4, d28 2232 vmlal.u8 q9, d5, d28 2233 vmlal.u8 q10, d2, d28 2234 vmlal.u8 q11, d3, d28 2235 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! 
2236 vrshrn.u16 d6, q8, #2 2237 vrshrn.u16 d7, q9, #2 2238 vshrn.u16 d8, q10, #2 2239 vshrn.u16 d9, q11, #2 2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! 2241 .endm 2242 2243 /* 2244 * Upsample a row of WIDTH pixels from INPTR to OUTPTR. 2245 */ 2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 2247 /* special case for the first and last pixels */ 2248 sub \WIDTH, \WIDTH, #1 2249 add \OUTPTR, \OUTPTR, #1 2250 ldrb \TMP1, [\INPTR, \WIDTH] 2251 strb \TMP1, [\OUTPTR, \WIDTH, asl #1] 2252 ldrb \TMP1, [\INPTR], #1 2253 strb \TMP1, [\OUTPTR, #-1] 2254 vmov.8 d3[7], \TMP1 2255 2256 subs \WIDTH, \WIDTH, #32 2257 blt 5f 2258 0: /* process 32 pixels per iteration */ 2259 upsample32 \OUTPTR, \INPTR 2260 subs \WIDTH, \WIDTH, #32 2261 bge 0b 2262 5: 2263 adds \WIDTH, \WIDTH, #16 2264 blt 1f 2265 0: /* process 16 pixels if needed */ 2266 upsample16 \OUTPTR, \INPTR 2267 subs \WIDTH, \WIDTH, #16 2268 1: 2269 adds \WIDTH, \WIDTH, #16 2270 beq 9f 2271 2272 /* load the remaining 1-15 pixels */ 2273 add \INPTR, \INPTR, \WIDTH 2274 tst \WIDTH, #1 2275 beq 2f 2276 sub \INPTR, \INPTR, #1 2277 vld1.8 {d0[0]}, [\INPTR] 2278 2: 2279 tst \WIDTH, #2 2280 beq 2f 2281 vext.8 d0, d0, d0, #6 2282 sub \INPTR, \INPTR, #1 2283 vld1.8 {d0[1]}, [\INPTR] 2284 sub \INPTR, \INPTR, #1 2285 vld1.8 {d0[0]}, [\INPTR] 2286 2: 2287 tst \WIDTH, #4 2288 beq 2f 2289 vrev64.32 d0, d0 2290 sub \INPTR, \INPTR, #1 2291 vld1.8 {d0[3]}, [\INPTR] 2292 sub \INPTR, \INPTR, #1 2293 vld1.8 {d0[2]}, [\INPTR] 2294 sub \INPTR, \INPTR, #1 2295 vld1.8 {d0[1]}, [\INPTR] 2296 sub \INPTR, \INPTR, #1 2297 vld1.8 {d0[0]}, [\INPTR] 2298 2: 2299 tst \WIDTH, #8 2300 beq 2f 2301 vmov d1, d0 2302 sub \INPTR, \INPTR, #8 2303 vld1.8 {d0}, [\INPTR] 2304 2: /* upsample the remaining pixels */ 2305 vmovl.u8 q8, d0 2306 vext.8 q2, q1, q0, #15 2307 vmovl.u8 q9, d1 2308 vaddw.u8 q10, q15, d4 2309 vaddw.u8 q11, q15, d5 2310 vmlal.u8 q8, d4, d28 2311 vmlal.u8 q9, d5, d28 2312 vmlal.u8 q10, d0, d28 2313 vmlal.u8 q11, d1, d28 2314 vrshrn.u16 d10, q8, #2 
2315 vrshrn.u16 d12, q9, #2 2316 vshrn.u16 d11, q10, #2 2317 vshrn.u16 d13, q11, #2 2318 vzip.8 d10, d11 2319 vzip.8 d12, d13 2320 /* store the remaining pixels */ 2321 tst \WIDTH, #8 2322 beq 2f 2323 vst1.8 {d10, d11}, [\OUTPTR]! 2324 vmov q5, q6 2325 2: 2326 tst \WIDTH, #4 2327 beq 2f 2328 vst1.8 {d10}, [\OUTPTR]! 2329 vmov d10, d11 2330 2: 2331 tst \WIDTH, #2 2332 beq 2f 2333 vst1.8 {d10[0]}, [\OUTPTR]! 2334 vst1.8 {d10[1]}, [\OUTPTR]! 2335 vst1.8 {d10[2]}, [\OUTPTR]! 2336 vst1.8 {d10[3]}, [\OUTPTR]! 2337 vext.8 d10, d10, d10, #4 2338 2: 2339 tst \WIDTH, #1 2340 beq 2f 2341 vst1.8 {d10[0]}, [\OUTPTR]! 2342 vst1.8 {d10[1]}, [\OUTPTR]! 2343 2: 2344 9: 2345 .endm 2346 2347 asm_function jsimd_h2v1_fancy_upsample_neon 2348 2349 MAX_V_SAMP_FACTOR .req r0 2350 DOWNSAMPLED_WIDTH .req r1 2351 INPUT_DATA .req r2 2352 OUTPUT_DATA_PTR .req r3 2353 OUTPUT_DATA .req OUTPUT_DATA_PTR 2354 2355 OUTPTR .req r4 2356 INPTR .req r5 2357 WIDTH .req ip 2358 TMP .req lr 2359 2360 push {r4, r5, r6, lr} 2361 vpush {d8-d15} 2362 2363 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] 2364 cmp MAX_V_SAMP_FACTOR, #0 2365 ble 99f 2366 2367 /* initialize constants */ 2368 vmov.u8 d28, #3 2369 vmov.u16 q15, #1 2370 11: 2371 ldr INPTR, [INPUT_DATA], #4 2372 ldr OUTPTR, [OUTPUT_DATA], #4 2373 mov WIDTH, DOWNSAMPLED_WIDTH 2374 upsample_row OUTPTR, INPTR, WIDTH, TMP 2375 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 2376 bgt 11b 2377 2378 99: 2379 vpop {d8-d15} 2380 pop {r4, r5, r6, pc} 2381 2382 .unreq MAX_V_SAMP_FACTOR 2383 .unreq DOWNSAMPLED_WIDTH 2384 .unreq INPUT_DATA 2385 .unreq OUTPUT_DATA_PTR 2386 .unreq OUTPUT_DATA 2387 2388 .unreq OUTPTR 2389 .unreq INPTR 2390 .unreq WIDTH 2391 .unreq TMP 2392 2393 2394 .purgem upsample16 2395 .purgem upsample32 2396 .purgem upsample_row 2397