/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"


/*
 * "Fancy" (triangle-filter) 2:1 horizontal upsampling.
 *
 * For each input sample s[i], two output samples are produced:
 *   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2   (even column: weight toward
 *                                               the previous neighbor)
 *   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2   (odd column: weight toward
 *                                               the next neighbor)
 * The first and last columns duplicate their single neighbor (handled by
 * last_index_col0 / next_index_lastcol permute patterns).
 *
 * max_v_samp_factor  = number of rows to process
 * downsampled_width  = width of each input row, in samples
 * input_data         = array of input row pointers
 * output_data_ptr    = pointer to array of output row pointers (each output
 *                      row receives 2x the input samples)
 *
 * NOTE(review): vec_ld() ignores the low 4 address bits, so this code
 * presumably relies on libjpeg-turbo row buffers being 16-byte-aligned and
 * padded to a 16-byte multiple -- confirm against the buffer allocator.
 */
void
jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  /* NOTE(review): pb_zero looks unused here, but it is presumably
   * referenced by the VEC_UNPACKHU/VEC_UNPACKLU macros in
   * jsimd_altivec.h -- confirm before removing. */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    /* Shift the vector right by one byte, duplicating byte 0 into byte 0
     * and 1 (left-edge column: "previous" sample == current sample). */
    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
    /* Select the last byte of vector A followed by bytes 0-14 of vector B
     * (i.e. each lane's previous sample across a vector boundary). */
    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
    /* Select bytes 1-15 of vector A followed by byte 0 of vector B
     * (i.e. each lane's next sample across a vector boundary). */
    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
    /* Same, but duplicate the final sample (right-edge column). */
    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
#if __BIG_ENDIAN__
    /* Interleave the low bytes of the even-result and odd-result short
     * vectors, packing 16-bit intermediates back down to 8-bit samples.
     * The byte holding the low half of each short differs by endianness. */
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* If the row width is not a vector multiple, replicate the last sample
     * one position past the end so that reading "next" at the final column
     * of the last full vector picks up a valid value. */
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    this0 = vec_ld(0, inptr);
    /* Left edge: the "previous" sample of column 0 is column 0 itself. */
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first iteration: previous samples straddle the boundary
         * between the prior vector and the current one. */
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Final iteration: duplicate the last sample as its own "next". */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* 3 * this, widened to 16 bits: vec_mule/vec_mulo give the products
       * of even/odd byte lanes; merge recombines them in lane order. */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      /* Zero-extend previous samples to 16 bits; +1 rounding bias for the
       * even output columns. */
      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      /* Zero-extend next samples; +2 rounding bias for the odd columns. */
      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      /* (3*this + last + 1) >> 2 and (3*this + next + 2) >> 2 for the
       * first 8 input samples. */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      /* Interleave even/odd results and pack to bytes -> 16 output samples. */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      /* Second half only needed when more than 8 input samples remain. */
      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}


/*
 * "Fancy" (triangle-filter) 2:1 horizontal + 2:1 vertical upsampling.
 *
 * Each input row produces two output rows.  First a vertical pass forms,
 * per column, colsum = 3 * this_row + near_row (near_row is the row above
 * for the upper output row, the row below for the lower one), widened to
 * 16 bits.  Then a horizontal pass produces:
 *   out[2i]   = (3 * colsum[i] + colsum[i-1] + 8) >> 4
 *   out[2i+1] = (3 * colsum[i] + colsum[i+1] + 7) >> 4
 * Edge columns duplicate their single neighbor, as in the h2v1 case.
 *
 * NOTE(review): inptr_1 = input_data[inrow - 1] reads one row above the
 * first processed row, and inptr1 one row below the last; presumably the
 * caller provides the usual libjpeg context rows -- confirm against the
 * upsampler setup code.
 */
void
jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  /* _1 suffix = row above (inrow - 1); 1 suffix = row below (inrow + 1). */
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = {0}, nextcolsum_1h = {0},
    nextcolsum1l = {0}, nextcolsum1h = {0},
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  /* NOTE(review): pb_zero is presumably referenced by the
   * VEC_UNPACKHU/VEC_UNPACKLU macros in jsimd_altivec.h. */
  __vector unsigned char pb_zero = { __16X(0) },
    /* Column sums are 16-bit here, so the permute patterns move byte
     * PAIRS: shift right one short, duplicating short 0 (left edge). */
    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
    /* Last short of vector A followed by shorts 0-6 of vector B. */
    last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
    /* Shorts 1-7 of vector A followed by short 0 of vector B. */
    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
    /* Same, duplicating the final short (right edge). */
    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
#if __BIG_ENDIAN__
    /* Pack the low byte of each 16-bit even/odd result, interleaved. */
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* Replicate the last sample past the row end so edge vectors read a
     * valid "next" value (see h2v1 version). */
    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Vertical pass for the first vector of the row: 3 * this row ... */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    /* ... + row above = column sums for the upper output row. */
    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    /* Left-edge "previous" column sums. */
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    /* ... + row below = column sums for the lower output row. */
    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first iteration: "previous" column sums straddle the
         * boundary with the prior iteration's high halves. */
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Final iteration: duplicate the last column as its own "next". */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Vertical pass for the NEXT vector of samples, so the horizontal
         * pass can read the "next" column across the vector boundary. */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

      /* Even columns: (3*colsum + prev_colsum + 8) >> 4 */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      /* Odd columns: (3*colsum + next_colsum + 7) >> 4 */
      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* Carry the pre-computed next-vector column sums into the next
       * iteration as the current ones. */
      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}


/* These are rarely used (mainly just for decompressing YCCK images) */

/*
 * Plain 2:1 horizontal upsampling: each input sample is simply duplicated
 * (vec_mergeh/vec_mergel interleave a vector with itself, doubling every
 * byte).  Processes 32 input samples -> 64 output samples per iteration;
 * the loop bound is rounded up, relying on padded row buffers.
 */
void
jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
                             JDIMENSION output_width,
                             JSAMPARRAY input_data,
                             JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* Round output_width up to a multiple of 32; NOTE(review): this writes
     * up to the padded width, presumably safe because libjpeg-turbo rows
     * are padded -- confirm against the buffer allocator. */
    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);   /* duplicate samples 0-7  */
      inh = vec_mergel(in, in);   /* duplicate samples 8-15 */

      vec_st(inl, 0, outptr);
      vec_st(inh, 16, outptr);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr);
        vec_st(inh, 48, outptr);
      }
    }
  }
}


/*
 * Plain 2:1 horizontal + 2:1 vertical upsampling: duplicate each sample
 * horizontally (as in h2v1 above) and store the same result into two
 * consecutive output rows.
 */
void
jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
                             JDIMENSION output_width,
                             JSAMPARRAY input_data,
                             JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char in, inl, inh;

  /* One input row fills two output rows. */
  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      /* Identical data goes to both output rows (vertical duplication). */
      vec_st(inl, 0, outptr0);
      vec_st(inl, 0, outptr1);

      vec_st(inh, 16, outptr0);
      vec_st(inh, 16, outptr1);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr0);
        vec_st(inl, 32, outptr1);

        vec_st(inh, 48, outptr0);
        vec_st(inh, 48, outptr1);
      }
    }
  }
}