Home | History | Annotate | Download | only in simd
      1 /*
      2  * AltiVec optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
      5  *
      6  * This software is provided 'as-is', without any express or implied
      7  * warranty.  In no event will the authors be held liable for any damages
      8  * arising from the use of this software.
      9  *
     10  * Permission is granted to anyone to use this software for any purpose,
     11  * including commercial applications, and to alter it and redistribute it
     12  * freely, subject to the following restrictions:
     13  *
     14  * 1. The origin of this software must not be misrepresented; you must not
     15  *    claim that you wrote the original software. If you use this software
     16  *    in a product, an acknowledgment in the product documentation would be
     17  *    appreciated but is not required.
     18  * 2. Altered source versions must be plainly marked as such, and must not be
     19  *    misrepresented as being the original software.
     20  * 3. This notice may not be removed or altered from any source distribution.
     21  */
     22 
     23 /* CHROMA UPSAMPLING */
     24 
     25 #include "jsimd_altivec.h"
     26 
     27 
     28 void
     29 jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
     30                                    JDIMENSION downsampled_width,
     31                                    JSAMPARRAY input_data,
     32                                    JSAMPARRAY *output_data_ptr)
     33 {
     34   JSAMPARRAY output_data = *output_data_ptr;
     35   JSAMPROW inptr, outptr;
     36   int inrow, incol;
     37 
     38   __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
     39     out;
     40   __vector short this0e, this0o, this0l, this0h, last0l, last0h,
     41     next0l, next0h, outle, outhe, outlo, outho;
     42 
     43   /* Constants */
     44   __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
     45     last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
     46     last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
     47     next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
     48     next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
     49 #if __BIG_ENDIAN__
     50     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
     51 #else
     52     merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
     53 #endif
     54   __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
     55 
     56   for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
     57     inptr = input_data[inrow];
     58     outptr = output_data[inrow];
     59 
     60     if (downsampled_width & 15)
     61       inptr[downsampled_width] = inptr[downsampled_width - 1];
     62 
     63     this0 = vec_ld(0, inptr);
     64     p_last0 = vec_perm(this0, this0, last_index_col0);
     65     last0 = this0;
     66 
     67     for (incol = downsampled_width; incol > 0;
     68          incol -= 16, inptr += 16, outptr += 32) {
     69 
     70       if (downsampled_width - incol > 0) {
     71         p_last0 = vec_perm(last0, this0, last_index);
     72         last0 = this0;
     73       }
     74 
     75       if (incol <= 16)
     76         p_next0 = vec_perm(this0, this0, next_index_lastcol);
     77       else {
     78         next0 = vec_ld(16, inptr);
     79         p_next0 = vec_perm(this0, next0, next_index);
     80       }
     81 
     82       this0e = (__vector short)vec_mule(this0, pb_three);
     83       this0o = (__vector short)vec_mulo(this0, pb_three);
     84       this0l = vec_mergeh(this0e, this0o);
     85       this0h = vec_mergel(this0e, this0o);
     86 
     87       last0l = (__vector short)VEC_UNPACKHU(p_last0);
     88       last0h = (__vector short)VEC_UNPACKLU(p_last0);
     89       last0l = vec_add(last0l, pw_one);
     90 
     91       next0l = (__vector short)VEC_UNPACKHU(p_next0);
     92       next0h = (__vector short)VEC_UNPACKLU(p_next0);
     93       next0l = vec_add(next0l, pw_two);
     94 
     95       outle = vec_add(this0l, last0l);
     96       outlo = vec_add(this0l, next0l);
     97       outle = vec_sr(outle, (__vector unsigned short)pw_two);
     98       outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
     99 
    100       out = vec_perm((__vector unsigned char)outle,
    101                      (__vector unsigned char)outlo, merge_pack_index);
    102       vec_st(out, 0, outptr);
    103 
    104       if (incol > 8) {
    105         last0h = vec_add(last0h, pw_one);
    106         next0h = vec_add(next0h, pw_two);
    107 
    108         outhe = vec_add(this0h, last0h);
    109         outho = vec_add(this0h, next0h);
    110         outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
    111         outho = vec_sr(outho, (__vector unsigned short)pw_two);
    112 
    113         out = vec_perm((__vector unsigned char)outhe,
    114                        (__vector unsigned char)outho, merge_pack_index);
    115         vec_st(out, 16, outptr);
    116       }
    117 
    118       this0 = next0;
    119     }
    120   }
    121 }
    122 
    123 
    124 void
    125 jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
    126                                    JDIMENSION downsampled_width,
    127                                    JSAMPARRAY input_data,
    128                                    JSAMPARRAY *output_data_ptr)
    129 {
    130   JSAMPARRAY output_data = *output_data_ptr;
    131   JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
    132   int inrow, outrow, incol;
    133 
    134   __vector unsigned char this_1, this0, this1, out;
    135   __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    136     lastcolsum_1h, lastcolsum1h,
    137     p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    138     thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    139     nextcolsum_1l = {0}, nextcolsum_1h = {0},
    140     nextcolsum1l = {0}, nextcolsum1h = {0},
    141     p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    142     tmpl, tmph, outle, outhe, outlo, outho;
    143 
    144   /* Constants */
    145   __vector unsigned char pb_zero = { __16X(0) },
    146     last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
    147     last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
    148     next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
    149     next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
    150 #if __BIG_ENDIAN__
    151     merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
    152 #else
    153     merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
    154 #endif
    155   __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    156     pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
    157   __vector unsigned short pw_four = { __8X(4) };
    158 
    159   for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    160 
    161     inptr_1 = input_data[inrow - 1];
    162     inptr0 = input_data[inrow];
    163     inptr1 = input_data[inrow + 1];
    164     outptr0 = output_data[outrow++];
    165     outptr1 = output_data[outrow++];
    166 
    167     if (downsampled_width & 15) {
    168       inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
    169       inptr0[downsampled_width] = inptr0[downsampled_width - 1];
    170       inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    171     }
    172 
    173     this0 = vec_ld(0, inptr0);
    174     this0l = (__vector short)VEC_UNPACKHU(this0);
    175     this0h = (__vector short)VEC_UNPACKLU(this0);
    176     this0l = vec_mladd(this0l, pw_three, pw_zero);
    177     this0h = vec_mladd(this0h, pw_three, pw_zero);
    178 
    179     this_1 = vec_ld(0, inptr_1);
    180     this_1l = (__vector short)VEC_UNPACKHU(this_1);
    181     this_1h = (__vector short)VEC_UNPACKLU(this_1);
    182     thiscolsum_1l = vec_add(this0l, this_1l);
    183     thiscolsum_1h = vec_add(this0h, this_1h);
    184     lastcolsum_1h = thiscolsum_1h;
    185     p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    186     p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
    187 
    188     this1 = vec_ld(0, inptr1);
    189     this1l = (__vector short)VEC_UNPACKHU(this1);
    190     this1h = (__vector short)VEC_UNPACKLU(this1);
    191     thiscolsum1l = vec_add(this0l, this1l);
    192     thiscolsum1h = vec_add(this0h, this1h);
    193     lastcolsum1h = thiscolsum1h;
    194     p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    195     p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
    196 
    197     for (incol = downsampled_width; incol > 0;
    198          incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
    199          outptr0 += 32, outptr1 += 32) {
    200 
    201       if (downsampled_width - incol > 0) {
    202         p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
    203         p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
    204         p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
    205         p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
    206         lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
    207       }
    208 
    209       if (incol <= 16) {
    210         p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
    211         p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
    212                                    next_index_lastcol);
    213         p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
    214         p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
    215                                   next_index_lastcol);
    216       } else {
    217         this0 = vec_ld(16, inptr0);
    218         this0l = (__vector short)VEC_UNPACKHU(this0);
    219         this0h = (__vector short)VEC_UNPACKLU(this0);
    220         this0l = vec_mladd(this0l, pw_three, pw_zero);
    221         this0h = vec_mladd(this0h, pw_three, pw_zero);
    222 
    223         this_1 = vec_ld(16, inptr_1);
    224         this_1l = (__vector short)VEC_UNPACKHU(this_1);
    225         this_1h = (__vector short)VEC_UNPACKLU(this_1);
    226         nextcolsum_1l = vec_add(this0l, this_1l);
    227         nextcolsum_1h = vec_add(this0h, this_1h);
    228         p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
    229         p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
    230 
    231         this1 = vec_ld(16, inptr1);
    232         this1l = (__vector short)VEC_UNPACKHU(this1);
    233         this1h = (__vector short)VEC_UNPACKLU(this1);
    234         nextcolsum1l = vec_add(this0l, this1l);
    235         nextcolsum1h = vec_add(this0h, this1h);
    236         p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
    237         p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
    238       }
    239 
    240       /* Process the upper row */
    241 
    242       tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
    243       outle = vec_add(tmpl, p_lastcolsum_1l);
    244       outle = vec_add(outle, pw_eight);
    245       outle = vec_sr(outle, pw_four);
    246 
    247       outlo = vec_add(tmpl, p_nextcolsum_1l);
    248       outlo = vec_add(outlo, pw_seven);
    249       outlo = vec_sr(outlo, pw_four);
    250 
    251       out = vec_perm((__vector unsigned char)outle,
    252                      (__vector unsigned char)outlo, merge_pack_index);
    253       vec_st(out, 0, outptr0);
    254 
    255       if (incol > 8) {
    256         tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
    257         outhe = vec_add(tmph, p_lastcolsum_1h);
    258         outhe = vec_add(outhe, pw_eight);
    259         outhe = vec_sr(outhe, pw_four);
    260 
    261         outho = vec_add(tmph, p_nextcolsum_1h);
    262         outho = vec_add(outho, pw_seven);
    263         outho = vec_sr(outho, pw_four);
    264 
    265         out = vec_perm((__vector unsigned char)outhe,
    266                        (__vector unsigned char)outho, merge_pack_index);
    267         vec_st(out, 16, outptr0);
    268       }
    269 
    270       /* Process the lower row */
    271 
    272       tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
    273       outle = vec_add(tmpl, p_lastcolsum1l);
    274       outle = vec_add(outle, pw_eight);
    275       outle = vec_sr(outle, pw_four);
    276 
    277       outlo = vec_add(tmpl, p_nextcolsum1l);
    278       outlo = vec_add(outlo, pw_seven);
    279       outlo = vec_sr(outlo, pw_four);
    280 
    281       out = vec_perm((__vector unsigned char)outle,
    282                      (__vector unsigned char)outlo, merge_pack_index);
    283       vec_st(out, 0, outptr1);
    284 
    285       if (incol > 8) {
    286         tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
    287         outhe = vec_add(tmph, p_lastcolsum1h);
    288         outhe = vec_add(outhe, pw_eight);
    289         outhe = vec_sr(outhe, pw_four);
    290 
    291         outho = vec_add(tmph, p_nextcolsum1h);
    292         outho = vec_add(outho, pw_seven);
    293         outho = vec_sr(outho, pw_four);
    294 
    295         out = vec_perm((__vector unsigned char)outhe,
    296                        (__vector unsigned char)outho, merge_pack_index);
    297         vec_st(out, 16, outptr1);
    298       }
    299 
    300       thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
    301       thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    302     }
    303   }
    304 }
    305 
    306 
    307 /* These are rarely used (mainly just for decompressing YCCK images) */
    308 
    309 void
    310 jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
    311                              JDIMENSION output_width,
    312                              JSAMPARRAY input_data,
    313                              JSAMPARRAY *output_data_ptr)
    314 {
    315   JSAMPARRAY output_data = *output_data_ptr;
    316   JSAMPROW inptr, outptr;
    317   int inrow, incol;
    318 
    319   __vector unsigned char in, inl, inh;
    320 
    321   for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    322     inptr = input_data[inrow];
    323     outptr = output_data[inrow];
    324 
    325     for (incol = (output_width + 31) & (~31); incol > 0;
    326          incol -= 64, inptr += 32, outptr += 64) {
    327 
    328       in = vec_ld(0, inptr);
    329       inl = vec_mergeh(in, in);
    330       inh = vec_mergel(in, in);
    331 
    332       vec_st(inl, 0, outptr);
    333       vec_st(inh, 16, outptr);
    334 
    335       if (incol > 32) {
    336         in = vec_ld(16, inptr);
    337         inl = vec_mergeh(in, in);
    338         inh = vec_mergel(in, in);
    339 
    340         vec_st(inl, 32, outptr);
    341         vec_st(inh, 48, outptr);
    342       }
    343     }
    344   }
    345 }
    346 
    347 
    348 void
    349 jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
    350                              JDIMENSION output_width,
    351                              JSAMPARRAY input_data,
    352                              JSAMPARRAY *output_data_ptr)
    353 {
    354   JSAMPARRAY output_data = *output_data_ptr;
    355   JSAMPROW inptr, outptr0, outptr1;
    356   int inrow, outrow, incol;
    357 
    358   __vector unsigned char in, inl, inh;
    359 
    360   for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    361 
    362     inptr = input_data[inrow];
    363     outptr0 = output_data[outrow++];
    364     outptr1 = output_data[outrow++];
    365 
    366     for (incol = (output_width + 31) & (~31); incol > 0;
    367          incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
    368 
    369       in = vec_ld(0, inptr);
    370       inl = vec_mergeh(in, in);
    371       inh = vec_mergel(in, in);
    372 
    373       vec_st(inl, 0, outptr0);
    374       vec_st(inl, 0, outptr1);
    375 
    376       vec_st(inh, 16, outptr0);
    377       vec_st(inh, 16, outptr1);
    378 
    379       if (incol > 32) {
    380         in = vec_ld(16, inptr);
    381         inl = vec_mergeh(in, in);
    382         inh = vec_mergel(in, in);
    383 
    384         vec_st(inl, 32, outptr0);
    385         vec_st(inl, 32, outptr1);
    386 
    387         vec_st(inh, 48, outptr0);
    388         vec_st(inh, 48, outptr1);
    389       }
    390     }
    391   }
    392 }
    393