Home | History | Annotate | Download | only in simd
      1 /*
      2  * AltiVec optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
      5  *
      6  * This software is provided 'as-is', without any express or implied
      7  * warranty.  In no event will the authors be held liable for any damages
      8  * arising from the use of this software.
      9  *
     10  * Permission is granted to anyone to use this software for any purpose,
     11  * including commercial applications, and to alter it and redistribute it
     12  * freely, subject to the following restrictions:
     13  *
     14  * 1. The origin of this software must not be misrepresented; you must not
     15  *    claim that you wrote the original software. If you use this software
     16  *    in a product, an acknowledgment in the product documentation would be
     17  *    appreciated but is not required.
     18  * 2. Altered source versions must be plainly marked as such, and must not be
     19  *    misrepresented as being the original software.
     20  * 3. This notice may not be removed or altered from any source distribution.
     21  */
     22 
     23 /* FAST INTEGER FORWARD DCT
     24  *
     25  * This is similar to the SSE2 implementation, except that we left-shift the
     26  * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
     27  * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
     28  *   the elements in arg3 + the most significant 17 bits of
     29  *     (the elements in arg1 * the elements in arg2).
     30  */
     31 
     32 #include "jsimd_altivec.h"
     33 
     34 
     35 #define F_0_382 98   /* FIX(0.382683433) */
     36 #define F_0_541 139  /* FIX(0.541196100) */
     37 #define F_0_707 181  /* FIX(0.707106781) */
     38 #define F_1_306 334  /* FIX(1.306562965) */
     39 
     40 #define CONST_BITS 8
     41 #define PRE_MULTIPLY_SCALE_BITS 2
     42 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
     43 
     44 
     45 #define DO_FDCT()  \
     46 {  \
     47   /* Even part */  \
     48   \
     49   tmp10 = vec_add(tmp0, tmp3);  \
     50   tmp13 = vec_sub(tmp0, tmp3);  \
     51   tmp11 = vec_add(tmp1, tmp2);  \
     52   tmp12 = vec_sub(tmp1, tmp2);  \
     53   \
     54   out0  = vec_add(tmp10, tmp11);  \
     55   out4  = vec_sub(tmp10, tmp11);  \
     56   \
     57   z1 = vec_add(tmp12, tmp13);  \
     58   z1 = vec_sl(z1, pre_multiply_scale_bits);  \
     59   z1 = vec_madds(z1, pw_0707, pw_zero);  \
     60   \
     61   out2 = vec_add(tmp13, z1);  \
     62   out6 = vec_sub(tmp13, z1);  \
     63   \
     64   /* Odd part */  \
     65   \
     66   tmp10 = vec_add(tmp4, tmp5);  \
     67   tmp11 = vec_add(tmp5, tmp6);  \
     68   tmp12 = vec_add(tmp6, tmp7);  \
     69   \
     70   tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
     71   tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
     72   z5 = vec_sub(tmp10, tmp12);  \
     73   z5 = vec_madds(z5, pw_0382, pw_zero);  \
     74   \
     75   z2 = vec_madds(tmp10, pw_0541, z5);  \
     76   z4 = vec_madds(tmp12, pw_1306, z5);  \
     77   \
     78   tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
     79   z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
     80   \
     81   z11 = vec_add(tmp7, z3);  \
     82   z13 = vec_sub(tmp7, z3);  \
     83   \
     84   out5 = vec_add(z13, z2);  \
     85   out3 = vec_sub(z13, z2);  \
     86   out1 = vec_add(z11, z4);  \
     87   out7 = vec_sub(z11, z4);  \
     88 }
     89 
     90 
     91 void
     92 jsimd_fdct_ifast_altivec (DCTELEM *data)
     93 {
     94   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
     95     col0, col1, col2, col3, col4, col5, col6, col7,
     96     tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
     97     z1, z2, z3, z4, z5, z11, z13,
     98     out0, out1, out2, out3, out4, out5, out6, out7;
     99 
    100   /* Constants */
    101   __vector short pw_zero = { __8X(0) },
    102     pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    103     pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    104     pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    105     pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
    106   __vector unsigned short
    107     pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
    108 
    109   /* Pass 1: process rows */
    110 
    111   row0 = vec_ld(0, data);
    112   row1 = vec_ld(16, data);
    113   row2 = vec_ld(32, data);
    114   row3 = vec_ld(48, data);
    115   row4 = vec_ld(64, data);
    116   row5 = vec_ld(80, data);
    117   row6 = vec_ld(96, data);
    118   row7 = vec_ld(112, data);
    119 
    120   TRANSPOSE(row, col);
    121 
    122   tmp0 = vec_add(col0, col7);
    123   tmp7 = vec_sub(col0, col7);
    124   tmp1 = vec_add(col1, col6);
    125   tmp6 = vec_sub(col1, col6);
    126   tmp2 = vec_add(col2, col5);
    127   tmp5 = vec_sub(col2, col5);
    128   tmp3 = vec_add(col3, col4);
    129   tmp4 = vec_sub(col3, col4);
    130 
    131   DO_FDCT();
    132 
    133   /* Pass 2: process columns */
    134 
    135   TRANSPOSE(out, row);
    136 
    137   tmp0 = vec_add(row0, row7);
    138   tmp7 = vec_sub(row0, row7);
    139   tmp1 = vec_add(row1, row6);
    140   tmp6 = vec_sub(row1, row6);
    141   tmp2 = vec_add(row2, row5);
    142   tmp5 = vec_sub(row2, row5);
    143   tmp3 = vec_add(row3, row4);
    144   tmp4 = vec_sub(row3, row4);
    145 
    146   DO_FDCT();
    147 
    148   vec_st(out0, 0, data);
    149   vec_st(out1, 16, data);
    150   vec_st(out2, 32, data);
    151   vec_st(out3, 48, data);
    152   vec_st(out4, 64, data);
    153   vec_st(out5, 80, data);
    154   vec_st(out6, 96, data);
    155   vec_st(out7, 112, data);
    156 }
    157