/* ARM NEON optimized IDCT and color-conversion routines for libjpeg-turbo */
      1 /*
      2  * ARM NEON optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
      5  * All rights reserved.
      6  * Contact: Alexander Bokovoy <alexander.bokovoy (at) nokia.com>
      7  *
      8  * This software is provided 'as-is', without any express or implied
      9  * warranty.  In no event will the authors be held liable for any damages
     10  * arising from the use of this software.
     11  *
     12  * Permission is granted to anyone to use this software for any purpose,
     13  * including commercial applications, and to alter it and redistribute it
     14  * freely, subject to the following restrictions:
     15  *
     16  * 1. The origin of this software must not be misrepresented; you must not
     17  *    claim that you wrote the original software. If you use this software
     18  *    in a product, an acknowledgment in the product documentation would be
     19  *    appreciated but is not required.
     20  * 2. Altered source versions must be plainly marked as such, and must not be
     21  *    misrepresented as being the original software.
     22  * 3. This notice may not be removed or altered from any source distribution.
     23  */
     24 /* Copyright (c) 2011,  NVIDIA CORPORATION. All rights reserved.
     25  *
     26  * Redistribution and use in source and binary forms, with or without
     27  * modification, are permitted provided that the following conditions
     28  * are met:
     29  *
     30  *  * Redistributions of source code must retain the above copyright
     31  *    notice, this list of conditions and the following disclaimer.
     32  *  * Redistributions in binary form must reproduce the above copyright
     33  *    notice, this list of conditions and the following disclaimer in the
     34  *    documentation and/or other materials provided with the distribution.
     35  *  * Neither the name of the NVIDIA CORPORATION nor the names of its
     36  *    contributors may be used to endorse or promote products derived
     37  *    from this software without specific prior written permission.
     38  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     39  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     41  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
     42  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     43  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     44  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     45  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     46  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     47  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
     48  * THE POSSIBILITY OF SUCH DAMAGE.
     49  */
     50 
     51 
     52 
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon                   /* permit NEON instructions */
.arch armv7a                /* target architecture */
.object_arch armv7a
.arm                        /* ARM (not Thumb) encodings */


/* When nonzero, pixel output is stored with byte-lane vst1.8 instead of
 * 32-bit word stores (see jsimd_idct_4x4_neon), avoiding unaligned word
 * accesses on kernels that trap them. */
#define RESPECT_STRICT_ALIGNMENT 1
     65 
     66 /*****************************************************************************/
     67 
/* Supplementary macro for setting function attributes: declares \fname as a
 * global function-type symbol (hidden from the ELF dynamic export table) and
 * emits its label, keeping the per-function boilerplate in one place. */
.macro asm_function fname
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname                  /* not exported from a shared object */
    .type \fname, %function
#endif
\fname:
.endm
     78 
/* Transpose a block of 4x4 16-bit coefficients held in four 64-bit
 * d-registers (one row per register): two vtrn.16 passes swap adjacent
 * 16-bit lanes, two vtrn.32 passes swap 32-bit pairs. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
     86 
     87 /*****************************************************************************/
     88 
     89 /*
     90  * jsimd_idct_ifast_neon
     91  *
     92  * This function contains a fast, not so accurate integer implementation of
     93  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
     94  * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
     95  * function from jidctfst.c
     96  *
     97  * TODO: a bit better instructions scheduling is needed.
     98  */
     99 
/* Named d0 lanes for the ifast multipliers.  They are applied with
 * vqdmulh.s16 (a Q15-style (a*b*2)>>16 multiply), so each .short below
 * stores only the FRACTIONAL part of the multiplier in Q15 form
 * (e.g. (277*128 - 256*128) = 0.082... * 2^15); the integer part is
 * reconstructed with explicit vadd instructions inside idct_helper. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
    111 
    112 /* 1-D IDCT helper macro */
    113 
    114 .macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \
    115                     t10, t11, t12, t13, t14
    116 
    117     vsub.s16        \t10, \x0, \x4
    118     vadd.s16        \x4,  \x0, \x4
    119     vswp.s16        \t10, \x0
    120     vsub.s16        \t11, \x2, \x6
    121     vadd.s16        \x6,  \x2, \x6
    122     vswp.s16        \t11, \x2
    123     vsub.s16        \t10, \x3, \x5
    124     vadd.s16        \x5,  \x3, \x5
    125     vswp.s16        \t10, \x3
    126     vsub.s16        \t11, \x1, \x7
    127     vadd.s16        \x7,  \x1, \x7
    128     vswp.s16        \t11, \x1
    129 
    130     vqdmulh.s16     \t13, \x2,  d0[1]
    131     vadd.s16        \t12, \x3,  \x3
    132     vadd.s16        \x2,  \x2,  \t13
    133     vqdmulh.s16     \t13, \x3,  d0[3]
    134     vsub.s16        \t10,  \x1, \x3
    135     vadd.s16        \t12, \t12, \t13
    136     vqdmulh.s16     \t13, \t10, d0[2]
    137     vsub.s16        \t11, \x7,  \x5
    138     vadd.s16        \t10, \t10, \t13
    139     vqdmulh.s16     \t13, \t11, d0[1]
    140     vadd.s16        \t11, \t11, \t13
    141 
    142     vqdmulh.s16     \t13, \x1,  d0[0]
    143     vsub.s16        \x2,  \x6,  \x2
    144     vsub.s16        \t14, \x0,  \x2
    145     vadd.s16        \x2,  \x0,  \x2
    146     vadd.s16        \x0,  \x4,  \x6
    147     vsub.s16        \x4,  \x4,  \x6
    148     vadd.s16        \x1,  \x1,  \t13
    149     vadd.s16        \t13, \x7,  \x5
    150     vsub.s16        \t12, \t13, \t12
    151     vsub.s16        \t12, \t12, \t10
    152     vadd.s16        \t11, \t12, \t11
    153     vsub.s16        \t10, \x1,  \t10
    154     vadd.s16        \t10, \t10, \t11
    155 
    156     vsub.s16        \x7,  \x0,  \t13
    157     vadd.s16        \x0,  \x0,  \t13
    158     vadd.s16        \x6,  \t14, \t12
    159     vsub.s16        \x1,  \t14, \t12
    160     vsub.s16        \x5,  \x2,  \t11
    161     vadd.s16        \x2,  \x2,  \t11
    162     vsub.s16        \x3,  \x4,  \t10
    163     vadd.s16        \x4,  \x4,  \t10
    164 .endm
    165 
/* jsimd_idct_ifast_neon(dct_table, coef_block, output_buf, output_col)
 *
 * AAPCS argument registers:
 *   r0 = dct_table   pointer to the dequantization multiplier table
 *   r1 = coef_block  pointer to the 8x8 block of 16-bit coefficients
 *   r2 = output_buf  array of output scanline pointers (one per row)
 *   r3 = output_col  byte offset added to each scanline pointer
 * Uses no stack beyond the d8-d15 spill; ip is the only extra core
 * register touched.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP             .req ip

    /* d8-d15 are callee-saved under the AAPCS VFP register rules */
    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_idct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | d12     | d13
     *   5 | d14     | d15
     *   6 | d16     | d17
     *   7 | d18     | d19
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!
    /* Dequantize: multiply each coefficient row by the corresponding
     * table row.  Loads are interleaved with the multiplies to hide
     * load latency; q10/q11 are reloaded once the first two rows have
     * been consumed. */
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q2, q2, q10
    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16        q3, q3, q11
    vmul.s16        q4, q4, q12
    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16        q5, q5, q13
    vmul.s16        q6, q6, q14
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q7, q7, q15
    vmul.s16        q8, q8, q10
    vmul.s16        q9, q9, q11

    /* Pass 1 : process columns from input, store into work array.*/
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose the 8x8 block: vtrn transposes the two diagonal 4x4
     * quadrants in place... */
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.32 q2, q4
    vtrn.32 q3, q5

    vtrn.16 q6, q7
    vtrn.16 q8, q9
    vtrn.32 q6, q8
    vtrn.32 q7, q9

    /* ...and vswp exchanges the two off-diagonal 4x4 quadrants. */
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Pass 2 : process the rows (same helper, same transpose) */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose */

    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.32 q2, q4
    vtrn.32 q3, q5

    vtrn.16 q6, q7
    vtrn.16 q8, q9
    vtrn.32 q6, q8
    vtrn.32 q7, q9

    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Descale and range limit: bias by 128 << 5 so that the saturating
     * narrowing shift right by 5 yields unsigned 8-bit samples centered
     * on 128. */
    vmov.s16        q15, #(0x80 << 5)
    vqadd.s16       q2, q2, q15
    vqadd.s16       q3, q3, q15
    vqadd.s16       q4, q4, q15
    vqadd.s16       q5, q5, q15
    vqadd.s16       q6, q6, q15
    vqadd.s16       q7, q7, q15
    vqadd.s16       q8, q8, q15
    vqadd.s16       q9, q9, q15
    vqshrun.s16     d4, q2, #5
    vqshrun.s16     d6, q3, #5
    vqshrun.s16     d8, q4, #5
    vqshrun.s16     d10, q5, #5
    vqshrun.s16     d12, q6, #5
    vqshrun.s16     d14, q7, #5
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d18, q9, #5

    /* Store results to the output buffer: one 8-pixel row per scanline
     * pointer fetched from OUTPUT_BUF, each offset by OUTPUT_COL. */
    .irp            x, d4, d6, d8, d10, d12, d14, d16, d18
    ldr             TMP, [OUTPUT_BUF], #4
    add             TMP, TMP, OUTPUT_COL
    vst1.8          {\x}, [TMP]!
    .endr

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP
.endfunc
    282 
    283 .purgem idct_helper
    284 
    285 /*****************************************************************************/
    286 
    287 /*
    288  * jsimd_idct_4x4_neon
    289  *
    290  * This function contains inverse-DCT code for getting reduced-size
    291  * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
    292  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
    293  * function from jpeg-6b (jidctred.c).
    294  *
    295  * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
    296  *       requires much less arithmetic operations and hence should be faster.
    297  *       The primary purpose of this particular NEON optimized function is
    298  *       bit exact compatibility with jpeg-6b.
    299  *
    300  * TODO: a bit better instructions scheduling can be achieved by expanding
    301  *       idct_helper/transpose_4x4 macros and reordering instructions,
    302  *       but readability will suffer somewhat.
    303  */
    304 
#define CONST_BITS  13

/* 13-bit fixed-point multipliers: FIX(x) = round(x * 2^13) */
#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

/* Constant table for jsimd_idct_4x4_neon, loaded into d0-d2 and
 * addressed by lane in its idct_helper macro. */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* d0[0] */
    .short     -FIX_0_765366865    /* d0[1] */
    .short     -FIX_0_211164243    /* d0[2] */
    .short     FIX_1_451774981     /* d0[3] */
    .short     -FIX_2_172734803    /* d1[0] */
    .short     FIX_1_061594337     /* d1[1] */
    .short     -FIX_0_509795579    /* d1[2] */
    .short     -FIX_0_601344887    /* d1[3] */
    .short     FIX_0_899976223     /* d2[0] */
    .short     FIX_2_562915447     /* d2[1] */
    .short     1 << (CONST_BITS+1) /* d2[2] */
    .short     0                   /* d2[3] */
    336 
/* 4-point 1-D IDCT helper for the reduced 4x4 transform.
 *
 * \x4..\x16 are d-registers holding the seven used input rows (row 4 of
 * the 8x8 block is never loaded by jsimd_idct_4x4_neon).  Products are
 * accumulated at 32-bit precision (vmull/vmlal) against the lane
 * constants preloaded in d0-d2, then rounded, shifted down by \shift
 * and narrowed back to 16 bits into the four outputs \y26..\y29.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4,  d2[2]
    vmlal.s16       q14, \x8,  d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6,  d2[1]

    vmull.s16       q15, \x4,  d2[2]
    vmlsl.s16       q15, \x8,  d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6,  d1[1]

    /* Even part +/- odd part gives output rows 0/3... */
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    /* vrshrn's immediate shift is limited to 1..16, so larger descale
     * amounts need a separate rounding shift followed by a narrow. */
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q14,  q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    /* ...and rows 1/2. */
    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10,  q10, #\shift
    vrshr.s32       q15,  q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm
    383 
/* jsimd_idct_4x4_neon(dct_table, coef_block, output_buf, output_col)
 *
 * AAPCS argument registers:
 *   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    /* TMP1-TMP3 alias the argument registers; they are only written
     * (by the ldmia below) after DCT_TABLE and COEF_BLOCK have been
     * fully consumed and OUTPUT_BUF has been read. */
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* d8-d15 are callee-saved under the AAPCS */
    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 - unused by the 4-point IDCT */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17}, [COEF_BLOCK]!
    /* dequantize (the table row for the skipped row 4 is skipped too) */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15


    /* Pass 1: columns, in two 4-column halves; results overwrite the
     * inputs and are transposed for the row pass. */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: rows (shift 19 > 16 selects the vrshr+vmovn path in the
     * helper); output lands in d26-d29 (q13/q14). */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: re-center on 128 and saturate to unsigned 8-bit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer: four scanline pointers,
     * each offset by OUTPUT_COL */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    /* Byte-lane stores: safe on any alignment */
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc
    497 
    498 .purgem idct_helper
    499 
    500 /*****************************************************************************/
    501 
    502 /*
    503  * jsimd_idct_2x2_neon
    504  *
    505  * This function contains inverse-DCT code for getting reduced-size
    506  * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
    507  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
    508  * function from jpeg-6b (jidctred.c).
    509  *
    510  * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
    511  *       requires much less arithmetic operations and hence should be faster.
    512  *       The primary purpose of this particular NEON optimized function is
    513  *       bit exact compatibility with jpeg-6b.
    514  */
    515 
/* Constant table for jsimd_idct_2x2_neon, loaded into d0 and addressed
 * by lane (same FIX() 13-bit fixed-point encoding as above). */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short     -FIX_0_720959822    /* d0[0] */
    .short     FIX_0_850430095     /* d0[1] */
    .short     -FIX_1_272758580    /* d0[2] */
    .short     FIX_3_624509785     /* d0[3] */
    522 
/* 2-point 1-D IDCT helper for the reduced 2x2 transform.
 *
 * \x4 is the even-part input, pre-shifted left by 15 to match the
 * Q-format of the 32-bit products; \x6..\x16 are the odd-part inputs
 * multiplied by the d0 lane constants.  The sum/difference pair is
 * rounded, shifted down by \shift and narrowed into \y26/\y27. */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16  q14,  \x4,  #15
    vmull.s16  q13,  \x6,  d0[3]
    vmlal.s16  q13,  \x10, d0[2]
    vmlal.s16  q13,  \x12, d0[1]
    vmlal.s16  q13,  \x16, d0[0]

    vadd.s32   q10,  q14,  q13
    vsub.s32   q14,  q14,  q13

.if \shift > 16
    /* vrshrn's immediate shift is limited to 1..16 */
    vrshr.s32  q10,  q10,  #\shift
    vrshr.s32  q14,  q14,  #\shift
    vmovn.s32  \y26, q10
    vmovn.s32  \y27, q14
.else
    vrshrn.s32 \y26, q10,  #\shift
    vrshrn.s32 \y27, q14,  #\shift
.endif

.endm
    544 
/* jsimd_idct_2x2_neon(dct_table, coef_block, output_buf, output_col)
 *
 * AAPCS argument registers:
 *   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col.
 * Only rows 0, 1, 3, 5, 7 of the coefficient block are read; the
 * even rows 2, 4, 6 do not contribute to the 2-point transform.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    /* TMP1 aliases r0; it is only written after DCT_TABLE is consumed */
    TMP1            .req r0
    TMP2            .req ip

    /* d8-d15 are callee-saved under the AAPCS */
    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */

    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */
    vld1.16         {d10, d11}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
    vld1.16         {d12, d13}, [COEF_BLOCK]!
    add             COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */
    vld1.16         {d16, d17}, [COEF_BLOCK]!
    /* Dequantize (skipping the table rows for the unused coefficient rows) */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE]!
    vmul.s16        q8, q8, q15

    /* Pass 1 (columns): odd-part products accumulate in q13 (left four
     * columns) and q12 (right four); the even part is row 0 shifted up
     * to the same Q-format.  Results land in q2/q3 and are then
     * rearranged by the vtrn pair for the row pass. */
    vmull.s16       q13, d6,  d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7,  d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4,  #15
    vshll.s16       q15, d5,  #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4,  q10, #13
    vrshrn.s32      d6,  q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5,  q10, #13
    vrshrn.s32      d7,  q14, #13
    vtrn.16         q2,  q3
    vtrn.32         q3,  q5

    /* Pass 2 (rows); shift 20 > 16 selects the vrshr+vmovn path */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: re-center on 128 and saturate to unsigned 8-bit.
     * NOTE(review): the first vqmovun overwrites d26, the low half of
     * q13, before the second vqmovun reads q13 again - so only lanes
     * 4-7 of the second result (which come from the still-unmodified
     * d27 half) are meaningful, and only lanes 4-5 are stored below. */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer: 2 pixels to each of the two
     * scanline pointers, offset by OUTPUT_COL */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc
    645 
    646 .purgem idct_helper
    647 
    648 /*****************************************************************************/
    649 
    650 /*
    651  * jsimd_ycc_rgba8888_convert_neon
    652  * jsimd_ycc_rgb565_convert_neon
    653  * Colorspace conversion YCbCr -> RGB
    654  */
    655 
    656 
/* Load \size pixels of Y, Cb (U) and Cr (V) into d0, d4 and d5,
 * advancing the Y/U/V pointers.  The 8-pixel case also prefetches
 * ahead.  The partial cases fill successive lanes (4 -> lanes 0-3,
 * 2 -> lanes 4-5, 1 -> lane 6), so they can be chained to consume up
 * to 7 trailing pixels of a scanline. */
.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U]!
        vld1.8  {d5}, [V]!
        vld1.8  {d0}, [Y]!
        pld     [Y, #64]
        pld     [U, #64]
        pld     [V, #64]
    .elseif \size == 4
        vld1.8  {d4[0]}, [U]!
        vld1.8  {d4[1]}, [U]!
        vld1.8  {d4[2]}, [U]!
        vld1.8  {d4[3]}, [U]!
        vld1.8  {d5[0]}, [V]!
        vld1.8  {d5[1]}, [V]!
        vld1.8  {d5[2]}, [V]!
        vld1.8  {d5[3]}, [V]!
        vld1.8  {d0[0]}, [Y]!
        vld1.8  {d0[1]}, [Y]!
        vld1.8  {d0[2]}, [Y]!
        vld1.8  {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8  {d4[4]}, [U]!
        vld1.8  {d4[5]}, [U]!
        vld1.8  {d5[4]}, [V]!
        vld1.8  {d5[5]}, [V]!
        vld1.8  {d0[4]}, [Y]!
        vld1.8  {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm
    693 
    694 
    695 
    696 
    697 
/* Store \size pixels to [RGB] (post-incremented) from the channel
 * planes d10 (red), d11 (green), d12 (blue) and, for 32 bpp, d13
 * (alpha).  For 16 bpp the pixels are first packed into RGB565:
 *   d27 = R[7:3] | G[7:5]   (high byte)
 *   d26 = G[4:2] | B[7:3]   (low byte)
 * and vst2.8 interleaves d26/d27 into little-endian 16-bit pixels. */
.macro do_store bpp, size
    .if \bpp == 16
            /* if 16 bits, pack into RGB565 format */
            vmov      d27, d10          /* insert red channel */
            vsri.u8   d27, d11, #5      /* shift and insert the green channel */
            vsli.u8   d26, d11, #3      /* low green bits into d26's high bits */
            vsri.u8   d26, d12, #3     /* shift and insert the blue channel */

        .if \size == 8
            vst2.8  {d26, d27}, [RGB]!
        .elseif \size == 4
            vst2.8  {d26[0], d27[0]}, [RGB]!
            vst2.8  {d26[1], d27[1]}, [RGB]!
            vst2.8  {d26[2], d27[2]}, [RGB]!
            vst2.8  {d26[3], d27[3]}, [RGB]!
        .elseif \size == 2
            vst2.8  {d26[4], d27[4]}, [RGB]!
            vst2.8  {d26[5], d27[5]}, [RGB]!
        .elseif \size == 1
            vst2.8  {d26[6], d27[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 24
        /* 24 bpp: interleave R, G, B bytes directly */
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        /* 32 bpp: interleave R, G, B, A bytes */
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
    757 
/* Emit one complete YCbCr -> RGB conversion function for a given output
 * pixel format.
 *
 * colorid - name fragment used in the generated symbol names
 *           (jsimd_ycc_<colorid>_convert_neon and its constant table)
 * bpp     - output bits per pixel; forwarded verbatim to do_store
 * r_offs, g_offs, b_offs - digit (0..2) selecting which of d10/d11/d12
 *           receives the R, G and B channel respectively (token-pasted
 *           onto "d1" below); d13 is preset to 0xFF for the alpha lane.
 *
 * Relies on the file-level do_load/do_store macros (defined above this
 * macro) for per-format memory access.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/* Convert 8 pixels from YCbCr to RGB using BT.601 fixed-point arithmetic:
 *   R = Y + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 * Inputs (set up by do_load -- TODO confirm, do_load is defined elsewhere
 * in this file): d0 = 8 x Y, d4 = 8 x Cb, d5 = 8 x Cr, all u8.
 * Constants: d1 = coefficients, q1 (d2/d3) = eight -128 bias values.
 * Outputs: saturated u8 channels in d1\r_offs, d1\g_offs, d1\b_offs.
 * Clobbers q3, q4, q10-q15.
 */
.macro do_yuv_to_rgb
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 (Cb widened to s16) */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 (Cr widened to s16) */
    vmull.s16       q10, d6, d1[1] /* G: u * -11277  (-0.34414 * 2^15) */
    vmlal.s16       q10, d8, d1[2] /* G: + v * -23401 (-0.71414 * 2^15) */
    vmull.s16       q11, d7, d1[1] /* G, high 4 pixels: u * -11277 */
    vmlal.s16       q11, d9, d1[2] /* G, high 4 pixels: + v * -23401 */
    vmull.s16       q12, d8, d1[0] /* R: v * 22971  (1.40200 * 2^14) */
    vmull.s16       q13, d9, d1[0] /* R, high 4 pixels: v * 22971 */
    vmull.s16       q14, d6, d1[3] /* B: u * 29033  (1.77200 * 2^14) */
    vmull.s16       q15, d7, d1[3] /* B, high 4 pixels: u * 29033 */
    vrshrn.s32      d20, q10, #15  /* round/narrow G back to s16 (Q15) */
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14  /* round/narrow R back to s16 (Q14) */
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14  /* round/narrow B back to s16 (Q14) */
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0   /* G += Y (widening add of u8 luma) */
    vaddw.u8        q12, q12, d0   /* R += Y */
    vaddw.u8        q14, q14, d0   /* B += Y */
    vqmovun.s16     d1\g_offs, q10 /* clamp to [0,255], store G channel */
    vqmovun.s16     d1\r_offs, q12 /* clamp to [0,255], store R channel */
    vqmovun.s16     d1\b_offs, q14 /* clamp to [0,255], store B channel */
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function,
 * placed close enough for adr's +/-1KB PC-relative range.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short          0,      0,     0,      0       /* -> d0: padding only */
    .short          22971, -11277, -23401, 29033   /* -> d1: BT.601 coeffs */
    .short          -128,  -128,   -128,   -128    /* -> d2: chroma bias */
    .short          -128,  -128,   -128,   -128    /* -> d3: chroma bias */

/* Generated function; intended C-level signature (libjpeg types --
 * NOTE(review): inferred from register usage, confirm against the header):
 *   void jsimd_ycc_<colorid>_convert_neon(JDIMENSION output_width,
 *                                         JSAMPIMAGE input_buf,
 *                                         JDIMENSION input_row,
 *                                         JSAMPARRAY output_buf,
 *                                         int num_rows);
 * input_buf is an array of three plane-pointer arrays (Y, Cb, Cr).
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0        /* pixels per scanline */
    INPUT_BUF       .req r1        /* JSAMPIMAGE: 3 component planes */
    INPUT_ROW       .req r2        /* first row index to convert */
    OUTPUT_BUF      .req r3        /* array of output row pointers */
    NUM_ROWS        .req r4        /* 5th argument, passed on the stack */

    INPUT_BUF0      .req r5        /* Y plane (row-pointer array) */
    INPUT_BUF1      .req r6        /* Cb plane */
    INPUT_BUF2      .req INPUT_BUF /* Cr plane; reuses r1 (see .unreq) */

    RGB             .req r7        /* current output write pointer */
    Y               .req r8        /* current Y row pointer */
    U               .req r9        /* current Cb row pointer */
    V               .req r10       /* current Cr row pointer */
    N               .req ip        /* pixels remaining in this row */

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]   /* just above the 8 pushed regs */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF   /* r1 now only serves as INPUT_BUF2 */

    /* Save NEON callee-saved registers (AAPCS: d8-d15) */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF (constant alpha channel) */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]   /* Y = input_buf[0][row] */
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]   /* U = input_buf[1][row] */
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]   /* V = input_buf[2][row] */
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4                /* RGB = *output_buf++ */

    /* Inner loop over pixels, 8 at a time */
    subs            N, N, #8
    blt             2f          /* fewer than 8 pixels: straight to tail */
1:
    do_load         8
    do_yuv_to_rgb
    do_store        \bpp, 8
    subs            N, N, #8
    bge             1b
    /* Here N is in [-8,-1]; N & 7 == remaining pixel count (0..7) */
    tst             N, #7
    beq             8f
2:  /* Tail: load the 1-7 leftover pixels by binary decomposition (4+2+1) */
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:  /* Convert the gathered tail pixels in one shot, then store piecewise */
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:  /* Row done; loop while rows remain */
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

/* do_yuv_to_rgb is per-instantiation (it bakes in the channel offsets),
 * so drop it before the next expansion of the enclosing macro. */
.purgem do_yuv_to_rgb

.endm
    904 
/* Instantiate one converter per supported output format.
 * r/g/b offsets select which of d10/d11/d12 carries each channel
 * before do_store interleaves them (d13 = alpha, fixed at 0xFF). */
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon rgba8888, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 1, 2


/* All instantiations emitted; release the shared helper macros. */
.purgem do_load
.purgem do_store
    912 
    913 /*****************************************************************************/
    914