1 /* 2 * AltiVec optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 5 * 6 * This software is provided 'as-is', without any express or implied 7 * warranty. In no event will the authors be held liable for any damages 8 * arising from the use of this software. 9 * 10 * Permission is granted to anyone to use this software for any purpose, 11 * including commercial applications, and to alter it and redistribute it 12 * freely, subject to the following restrictions: 13 * 14 * 1. The origin of this software must not be misrepresented; you must not 15 * claim that you wrote the original software. If you use this software 16 * in a product, an acknowledgment in the product documentation would be 17 * appreciated but is not required. 18 * 2. Altered source versions must be plainly marked as such, and must not be 19 * misrepresented as being the original software. 20 * 3. This notice may not be removed or altered from any source distribution. 21 */ 22 23 /* FAST INTEGER INVERSE DCT 24 * 25 * This is similar to the SSE2 implementation, except that we left-shift the 26 * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because 27 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: 28 * the elements in arg3 + the most significant 17 bits of 29 * (the elements in arg1 * the elements in arg2). 30 */ 31 32 #include "jsimd_altivec.h" 33 34 35 #define F_1_082 277 /* FIX(1.082392200) */ 36 #define F_1_414 362 /* FIX(1.414213562) */ 37 #define F_1_847 473 /* FIX(1.847759065) */ 38 #define F_2_613 669 /* FIX(2.613125930) */ 39 #define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ 40 41 #define CONST_BITS 8 42 #define PASS1_BITS 2 43 #define PRE_MULTIPLY_SCALE_BITS 2 44 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) 45 46 47 #define DO_IDCT(in) \ 48 { \ 49 /* Even part */ \ 50 \ 51 tmp10 = vec_add(in##0, in##4); \ 52 tmp11 = vec_sub(in##0, in##4); \ 53 tmp13 = vec_add(in##2, in##6); \ 54 \ 55 tmp12 = vec_sub(in##2, in##6); \ 56 tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ 57 tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ 58 tmp12 = vec_sub(tmp12, tmp13); \ 59 \ 60 tmp0 = vec_add(tmp10, tmp13); \ 61 tmp3 = vec_sub(tmp10, tmp13); \ 62 tmp1 = vec_add(tmp11, tmp12); \ 63 tmp2 = vec_sub(tmp11, tmp12); \ 64 \ 65 /* Odd part */ \ 66 \ 67 z13 = vec_add(in##5, in##3); \ 68 z10 = vec_sub(in##5, in##3); \ 69 z10s = vec_sl(z10, pre_multiply_scale_bits); \ 70 z11 = vec_add(in##1, in##7); \ 71 z12s = vec_sub(in##1, in##7); \ 72 z12s = vec_sl(z12s, pre_multiply_scale_bits); \ 73 \ 74 tmp11 = vec_sub(z11, z13); \ 75 tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ 76 tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ 77 \ 78 tmp7 = vec_add(z11, z13); \ 79 \ 80 /* To avoid overflow... \ 81 * \ 82 * (Original) \ 83 * tmp12 = -2.613125930 * z10 + z5; \ 84 * \ 85 * (This implementation) \ 86 * tmp12 = (-1.613125930 - 1) * z10 + z5; \ 87 * = -1.613125930 * z10 - z10 + z5; \ 88 */ \ 89 \ 90 z5 = vec_add(z10s, z12s); \ 91 z5 = vec_madds(z5, pw_F1847, pw_zero); \ 92 \ 93 tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ 94 tmp10 = vec_sub(tmp10, z5); \ 95 tmp12 = vec_madds(z10s, pw_MF1613, z5); \ 96 tmp12 = vec_sub(tmp12, z10); \ 97 \ 98 tmp6 = vec_sub(tmp12, tmp7); \ 99 tmp5 = vec_sub(tmp11, tmp6); \ 100 tmp4 = vec_add(tmp10, tmp5); \ 101 \ 102 out0 = vec_add(tmp0, tmp7); \ 103 out1 = vec_add(tmp1, tmp6); \ 104 out2 = vec_add(tmp2, tmp5); \ 105 out3 = vec_sub(tmp3, tmp4); \ 106 out4 = vec_add(tmp3, tmp4); \ 107 out5 = vec_sub(tmp2, tmp5); \ 108 out6 = vec_sub(tmp1, tmp6); \ 109 out7 = vec_sub(tmp0, tmp7); \ 110 } 111 112 113 void 114 jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block, 115 JSAMPARRAY output_buf, JDIMENSION output_col) 116 { 117 short *dct_table = (short *)dct_table_; 118 int *outptr; 119 120 __vector short row0, row1, row2, row3, row4, row5, row6, row7, 121 col0, col1, col2, col3, col4, col5, col6, col7, 122 quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, 123 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, 124 z5, z10, z10s, z11, z12s, z13, 125 out0, out1, out2, out3, out4, out5, out6, out7; 126 __vector signed char outb; 127 128 /* Constants */ 129 __vector short pw_zero = { __8X(0) }, 130 pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, 131 pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, 132 pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, 133 pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; 134 __vector unsigned short 135 pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, 136 pass1_bits3 = { __8X(PASS1_BITS + 3) }; 137 __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; 138 139 /* Pass 1: process columns */ 140 141 col0 = vec_ld(0, coef_block); 142 col1 = vec_ld(16, coef_block); 143 col2 = vec_ld(32, coef_block); 144 col3 = vec_ld(48, coef_block); 145 col4 = vec_ld(64, coef_block); 146 col5 = vec_ld(80, coef_block); 147 col6 = vec_ld(96, coef_block); 148 col7 = vec_ld(112, coef_block); 149 150 tmp1 = vec_or(col1, col2); 151 tmp2 = vec_or(col3, col4); 152 tmp1 = vec_or(tmp1, tmp2); 153 tmp3 = vec_or(col5, col6); 154 tmp3 = vec_or(tmp3, col7); 155 tmp1 = vec_or(tmp1, tmp3); 156 157 quant0 = vec_ld(0, dct_table); 158 col0 = vec_mladd(col0, quant0, pw_zero); 159 160 if (vec_all_eq(tmp1, pw_zero)) { 161 /* AC terms all zero */ 162 163 row0 = vec_splat(col0, 0); 164 row1 = vec_splat(col0, 1); 165 row2 = vec_splat(col0, 2); 166 row3 = vec_splat(col0, 3); 167 row4 = vec_splat(col0, 4); 168 row5 = vec_splat(col0, 5); 169 row6 = vec_splat(col0, 6); 170 row7 = vec_splat(col0, 7); 171 172 } else { 173 174 quant1 = vec_ld(16, dct_table); 175 quant2 = vec_ld(32, dct_table); 176 quant3 = vec_ld(48, dct_table); 177 quant4 = vec_ld(64, dct_table); 178 quant5 = vec_ld(80, dct_table); 179 quant6 = vec_ld(96, dct_table); 180 quant7 = vec_ld(112, dct_table); 181 182 col1 = vec_mladd(col1, quant1, pw_zero); 183 col2 = vec_mladd(col2, quant2, pw_zero); 184 col3 = vec_mladd(col3, quant3, pw_zero); 185 col4 = vec_mladd(col4, quant4, pw_zero); 186 col5 = vec_mladd(col5, quant5, pw_zero); 187 col6 = vec_mladd(col6, quant6, pw_zero); 188 col7 = vec_mladd(col7, quant7, pw_zero); 189 190 DO_IDCT(col); 191 192 TRANSPOSE(out, row); 193 } 194 195 /* Pass 2: process rows */ 196 197 DO_IDCT(row); 198 199 out0 = vec_sra(out0, pass1_bits3); 200 out1 = vec_sra(out1, pass1_bits3); 201 out2 = vec_sra(out2, pass1_bits3); 202 out3 = vec_sra(out3, pass1_bits3); 203 out4 = vec_sra(out4, pass1_bits3); 204 out5 = vec_sra(out5, pass1_bits3); 205 out6 = vec_sra(out6, pass1_bits3); 206 out7 = vec_sra(out7, pass1_bits3); 207 208 TRANSPOSE(out, col); 209 210 outb = vec_packs(col0, col0); 211 outb = vec_add(outb, pb_centerjsamp); 212 outptr = (int *)(output_buf[0] + output_col); 213 vec_ste((__vector int)outb, 0, outptr); 214 vec_ste((__vector int)outb, 4, outptr); 215 216 outb = vec_packs(col1, col1); 217 outb = vec_add(outb, pb_centerjsamp); 218 outptr = (int *)(output_buf[1] + output_col); 219 vec_ste((__vector int)outb, 0, outptr); 220 vec_ste((__vector int)outb, 4, outptr); 221 222 outb = vec_packs(col2, col2); 223 outb = vec_add(outb, pb_centerjsamp); 224 outptr = (int *)(output_buf[2] + output_col); 225 vec_ste((__vector int)outb, 0, outptr); 226 vec_ste((__vector int)outb, 4, outptr); 227 228 outb = vec_packs(col3, col3); 229 outb = vec_add(outb, pb_centerjsamp); 230 outptr = (int *)(output_buf[3] + output_col); 231 vec_ste((__vector int)outb, 0, outptr); 232 vec_ste((__vector int)outb, 4, outptr); 233 234 outb = vec_packs(col4, col4); 235 outb = vec_add(outb, pb_centerjsamp); 236 outptr = (int *)(output_buf[4] + output_col); 237 vec_ste((__vector int)outb, 0, outptr); 238 vec_ste((__vector int)outb, 4, outptr); 239 240 outb = vec_packs(col5, col5); 241 outb = vec_add(outb, pb_centerjsamp); 242 outptr = (int *)(output_buf[5] + output_col); 243 vec_ste((__vector int)outb, 0, outptr); 244 vec_ste((__vector int)outb, 4, outptr); 245 246 outb = vec_packs(col6, col6); 247 outb = vec_add(outb, pb_centerjsamp); 248 outptr = (int *)(output_buf[6] + output_col); 249 vec_ste((__vector int)outb, 0, outptr); 250 vec_ste((__vector int)outb, 4, outptr); 251 252 outb = vec_packs(col7, col7); 253 outb = vec_add(outb, pb_centerjsamp); 254 outptr = (int *)(output_buf[7] + output_col); 255 vec_ste((__vector int)outb, 0, outptr); 256 vec_ste((__vector int)outb, 4, outptr); 257 } 258