/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_1_082  277              /* FIX(1.082392200) */
#define F_1_414  362              /* FIX(1.414213562) */
#define F_1_847  473              /* FIX(1.847759065) */
#define F_2_613  669              /* FIX(2.613125930) */
#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS  8
#define PASS1_BITS  2
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
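
/* Example of the scaling at work: to multiply by FIX(1.414213562) = 362,
 * the operand is pre-shifted left by PRE_MULTIPLY_SCALE_BITS = 2 and the
 * constant by CONST_SHIFT = 16 - 2 - 8 - 1 = 5, so vec_madds() computes
 *
 *   ((x << 2) * (362 << 5)) >> 15  =  (x * 362) >> 8  ~=  x * 1.414
 *
 * (vec_madds() keeps the most significant 17 bits of the 32-bit product,
 * i.e. it shifts the product right by 15, one bit less than a high-half
 * multiply; hence the -1 in CONST_SHIFT.)
 */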

#define DO_IDCT(in) { \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, pw_zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}


void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
    tmp10, tmp11, tmp12, tmp13, z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);
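
  /* The 2-D IDCT output is scaled up by a factor of 8 (3 bits) relative
   * to the desired samples, on top of the PASS1_BITS of fractional
   * precision carried through pass 1, so descale by PASS1_BITS + 3
   * before packing the results to bytes.
   */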
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}