/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-altivec.c */
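/* NOTE (added): jccolor-altivec.c presumably includes this file multiple
 * times, redefining RGB_PIXELSIZE and the RGB(G)_INDEX* permutation
 * constants each time, so that this one body of code yields a specialized
 * converter for each supported input pixel format (RGB, BGR, RGBX, etc.).
 */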


void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
                                    JSAMPIMAGE output_buf,
                                    JDIMENSION output_row, int num_rows)
{
  JSAMPROW inptr, outptr0, outptr1, outptr2;
  int pitch = img_width * RGB_PIXELSIZE, num_cols;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
  __vector unsigned char rgb3 = {0};
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
  __vector unsigned char rgb4 = {0};
#endif
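  /* NOTE (added): rgb1..rgb4 are zero-initialized because the fast-path
   * loads below are conditional; on a short final chunk some of these
   * vectors are never loaded but are still consumed by vec_perm().
   */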
  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;

  /* Constants */
  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
  __vector int pd_onehalf = { __4X(ONE_HALF) },
    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
#else
    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
#endif
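  /* NOTE (added): the F_* values are presumably the standard libjpeg
   * fixed-point coefficients, FIX(x) = x * 2^SCALEBITS with SCALEBITS == 16
   * (e.g., F_0_299 == 19595 ~= 0.29900 * 65536), and __4X()/__4X2() simply
   * replicate their arguments to fill a 128-bit vector.
   */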

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    for (num_cols = pitch; num_cols > 0;
         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
         outptr0 += 16, outptr1 += 16, outptr2 += 16) {

#if __BIG_ENDIAN__
      /* Load 16 pixels == 48 or 64 bytes */
      offset = (size_t)inptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overread.  Since there is no way to
           * read a partial AltiVec register, an overread would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the
           * number of bytes per row is small enough.  Since we can't
           * determine whether we're on the last image row, we have to assume
           * that every row is the last.
           */
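          /* NOTE (added): the condition above is exact: if bytes is a
           * multiple of 16, the guarded loads below stop on the data's edge,
           * and if bytes >= (RGB_PIXELSIZE + 1) * 16, every 16-byte block
           * they touch lies entirely within this chunk, so only the
           * remaining case can read past the end of the buffer.
           */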
          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
          rgb0 = vec_ld(0, tmpbuf);
          rgb1 = vec_ld(16, tmpbuf);
          rgb2 = vec_ld(32, tmpbuf);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_ld(48, tmpbuf);
#endif
        } else {
          /* Fast path */
          rgb0 = vec_ld(0, inptr);
          if (bytes > 16)
            rgb1 = vec_ld(16, inptr);
          if (bytes > 32)
            rgb2 = vec_ld(32, inptr);
          if (bytes > 48)
            rgb3 = vec_ld(48, inptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            rgb4 = vec_ld(64, inptr);
#endif
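          /* NOTE (added): classic AltiVec realignment idiom.  vec_ld()
           * ignores the low 4 address bits, so the loads above fetch the
           * aligned blocks surrounding the data; vec_lvsl() returns the
           * permute control {offset, offset+1, ..., offset+15}, so each
           * vec_perm() below selects 16 consecutive bytes from a pair of
           * adjacent blocks, shifting the data back into place.
           */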
          unaligned_shift_index = vec_lvsl(0, inptr);
          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
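        /* NOTE (added): no realignment pass is needed here because VEC_LD
         * presumably expands to vec_vsx_ld() on little-endian targets (and
         * this branch is only reached with an aligned pointer on big-endian
         * ones); vec_vsx_ld() tolerates unaligned addresses.
         */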
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
          rgb0 = VEC_LD(0, tmpbuf);
          rgb1 = VEC_LD(16, tmpbuf);
          rgb2 = VEC_LD(32, tmpbuf);
#if RGB_PIXELSIZE == 4
          rgb3 = VEC_LD(48, tmpbuf);
#endif
        } else {
          /* Fast path */
          rgb0 = VEC_LD(0, inptr);
          if (num_cols > 16)
            rgb1 = VEC_LD(16, inptr);
          if (num_cols > 32)
            rgb2 = VEC_LD(32, inptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            rgb3 = VEC_LD(48, inptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif

#if RGB_PIXELSIZE == 3
      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
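      /* NOTE (added): working from the layout above, RGBG_INDEX0 would be
       * the byte selector {0,1,3,4,6,7,9,10, 2,1,5,4,8,7,11,10} -- the first
       * eight entries gather the R/G pairs of pixels 0-3, the last eight
       * gather the B/G pairs (note how each G byte is selected twice).
       */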
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif

      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
       * ...
       *
       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
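      /* NOTE (added): VEC_UNPACKHU/VEC_UNPACKLU are presumably defined as
       * vec_mergeh()/vec_mergel() against pb_zero, interleaving a zero byte
       * with each sample byte so that every 8-bit sample is zero-extended to
       * a 16-bit lane.
       */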
      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);

      /* (Original)
       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       *
       * (This implementation)
       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       */
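      /* NOTE (added): the G coefficient for Y is split because
       * FIX(0.58700) == 38470 overflows a signed 16-bit vector element
       * (max 32767), whereas FIX(0.33700) and FIX(0.25000) both fit.
       * vec_msums() multiplies adjacent (R,G) or (B,G) pairs and sums them
       * into a 32-bit accumulator, which is why G contributes once through
       * each of the two pair vectors.
       */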

      /* Calculate Y values */

      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word
       * from each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
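      /* NOTE (added): on big endian the high 16 bits of each 32-bit lane are
       * bytes {0,1}, {4,5}, {8,9}, {12,13}, so shift_pack_index gathers
       * bytes {0,1,4,5,...,28,29} from the concatenation of two result
       * vectors; the little-endian selector {2,3,6,7,...} picks the same
       * high halfwords in that byte order.
       */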
      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
                    shift_pack_index);
      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
                    shift_pack_index);
      y = vec_pack(yl, yh);
      vec_st(y, 0, outptr0);

      /* Calculate Cb values */
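      /* NOTE (added): the 0.5 * B term uses the unsigned vec_msum() because
       * F_0_500 (presumably FIX(0.50000) == 32768) does not fit in a signed
       * 16-bit element; this is safe because bg* holds zero-extended samples
       * in the range 0..255.  pd_onehalfm1_cj folds the CENTERJSAMPLE offset
       * and the rounding constant (ONE_HALF - 1, matching the scalar code's
       * rounding for Cb/Cr) into the initial accumulator.
       */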
      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
                                   (__vector unsigned int)cb0);
      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
                                   (__vector unsigned int)cb1);
      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
                                   (__vector unsigned int)cb2);
      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
                                   (__vector unsigned int)cb3);
      cbl = vec_perm((__vector unsigned short)cb0,
                     (__vector unsigned short)cb1, shift_pack_index);
      cbh = vec_perm((__vector unsigned short)cb2,
                     (__vector unsigned short)cb3, shift_pack_index);
      cb = vec_pack(cbl, cbh);
      vec_st(cb, 0, outptr1);

      /* Calculate Cr values */
      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
                                   (__vector unsigned int)cr0);
      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
                                   (__vector unsigned int)cr1);
      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
                                   (__vector unsigned int)cr2);
      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
                                   (__vector unsigned int)cr3);
      crl = vec_perm((__vector unsigned short)cr0,
                     (__vector unsigned short)cr1, shift_pack_index);
      crh = vec_perm((__vector unsigned short)cr2,
                     (__vector unsigned short)cr3, shift_pack_index);
      cr = vec_pack(crl, crh);
      vec_st(cr, 0, outptr2);
    }
  }
}