1 /* 2 * AltiVec optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 5 * 6 * This software is provided 'as-is', without any express or implied 7 * warranty. In no event will the authors be held liable for any damages 8 * arising from the use of this software. 9 * 10 * Permission is granted to anyone to use this software for any purpose, 11 * including commercial applications, and to alter it and redistribute it 12 * freely, subject to the following restrictions: 13 * 14 * 1. The origin of this software must not be misrepresented; you must not 15 * claim that you wrote the original software. If you use this software 16 * in a product, an acknowledgment in the product documentation would be 17 * appreciated but is not required. 18 * 2. Altered source versions must be plainly marked as such, and must not be 19 * misrepresented as being the original software. 20 * 3. This notice may not be removed or altered from any source distribution. 21 */ 22 23 /* CHROMA DOWNSAMPLING */ 24 25 #include "jsimd_altivec.h" 26 #include "jcsample.h" 27 28 29 void 30 jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, 31 JDIMENSION v_samp_factor, 32 JDIMENSION width_blocks, 33 JSAMPARRAY input_data, JSAMPARRAY output_data) 34 { 35 int outrow, outcol; 36 JDIMENSION output_cols = width_blocks * DCTSIZE; 37 JSAMPROW inptr, outptr; 38 39 __vector unsigned char this0, next0, out; 40 __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; 41 42 /* Constants */ 43 __vector unsigned short pw_bias = { __4X2(0, 1) }, 44 pw_one = { __8X(1) }; 45 __vector unsigned char even_odd_index = 46 {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, 47 pb_zero = { __16X(0) }; 48 49 expand_right_edge(input_data, max_v_samp_factor, image_width, 50 output_cols * 2); 51 52 for (outrow = 0; outrow < v_samp_factor; outrow++) { 53 outptr = output_data[outrow]; 54 inptr = input_data[outrow]; 55 56 for (outcol = output_cols; outcol > 0; 57 outcol -= 16, inptr += 32, outptr += 16) { 58 59 this0 = vec_ld(0, inptr); 60 this0 = vec_perm(this0, this0, even_odd_index); 61 this0e = (__vector unsigned short)VEC_UNPACKHU(this0); 62 this0o = (__vector unsigned short)VEC_UNPACKLU(this0); 63 outl = vec_add(this0e, this0o); 64 outl = vec_add(outl, pw_bias); 65 outl = vec_sr(outl, pw_one); 66 67 if (outcol > 8) { 68 next0 = vec_ld(16, inptr); 69 next0 = vec_perm(next0, next0, even_odd_index); 70 next0e = (__vector unsigned short)VEC_UNPACKHU(next0); 71 next0o = (__vector unsigned short)VEC_UNPACKLU(next0); 72 outh = vec_add(next0e, next0o); 73 outh = vec_add(outh, pw_bias); 74 outh = vec_sr(outh, pw_one); 75 } else 76 outh = vec_splat_u16(0); 77 78 out = vec_pack(outl, outh); 79 vec_st(out, 0, outptr); 80 } 81 } 82 } 83 84 85 void 86 jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, 87 JDIMENSION v_samp_factor, 88 JDIMENSION width_blocks, 89 JSAMPARRAY input_data, JSAMPARRAY output_data) 90 { 91 int inrow, outrow, outcol; 92 JDIMENSION output_cols = width_blocks * DCTSIZE; 93 JSAMPROW inptr0, inptr1, outptr; 94 95 __vector unsigned char this0, next0, this1, next1, out; 96 __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, 97 next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; 98 99 /* Constants */ 100 __vector unsigned short pw_bias = { __4X2(1, 2) }, 101 pw_two = { __8X(2) }; 102 __vector unsigned char even_odd_index = 103 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, 104 pb_zero = { __16X(0) }; 105 106 expand_right_edge(input_data, max_v_samp_factor, image_width, 107 output_cols * 2); 108 109 for (inrow = 0, outrow = 0; outrow < v_samp_factor; 110 inrow += 2, outrow++) { 111 112 inptr0 = input_data[inrow]; 113 inptr1 = input_data[inrow + 1]; 114 outptr = output_data[outrow]; 115 116 for (outcol = output_cols; outcol > 0; 117 outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { 118 119 this0 = vec_ld(0, inptr0); 120 this0 = vec_perm(this0, this0, even_odd_index); 121 this0e = (__vector unsigned short)VEC_UNPACKHU(this0); 122 this0o = (__vector unsigned short)VEC_UNPACKLU(this0); 123 out0l = vec_add(this0e, this0o); 124 125 this1 = vec_ld(0, inptr1); 126 this1 = vec_perm(this1, this1, even_odd_index); 127 this1e = (__vector unsigned short)VEC_UNPACKHU(this1); 128 this1o = (__vector unsigned short)VEC_UNPACKLU(this1); 129 out1l = vec_add(this1e, this1o); 130 131 outl = vec_add(out0l, out1l); 132 outl = vec_add(outl, pw_bias); 133 outl = vec_sr(outl, pw_two); 134 135 if (outcol > 8) { 136 next0 = vec_ld(16, inptr0); 137 next0 = vec_perm(next0, next0, even_odd_index); 138 next0e = (__vector unsigned short)VEC_UNPACKHU(next0); 139 next0o = (__vector unsigned short)VEC_UNPACKLU(next0); 140 out0h = vec_add(next0e, next0o); 141 142 next1 = vec_ld(16, inptr1); 143 next1 = vec_perm(next1, next1, even_odd_index); 144 next1e = (__vector unsigned short)VEC_UNPACKHU(next1); 145 next1o = (__vector unsigned short)VEC_UNPACKLU(next1); 146 out1h = vec_add(next1e, next1o); 147 148 outh = vec_add(out0h, out1h); 149 outh = vec_add(outh, pw_bias); 150 outh = vec_sr(outh, pw_two); 151 } else 152 outh = vec_splat_u16(0); 153 154 out = vec_pack(outl, outh); 155 vec_st(out, 0, outptr); 156 } 157 } 158 } 159