// qcms
// Copyright (C) 2009 Mozilla Foundation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 22 #include <emmintrin.h> 23 24 #include "qcmsint.h" 25 26 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ 27 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) 28 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) 29 static const ALIGN float floatScaleX4[4] = 30 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; 31 static const ALIGN float clampMaxValueX4[4] = 32 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; 33 34 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, 35 unsigned char *src, 36 unsigned char *dest, 37 size_t length, 38 qcms_format_type output_format) 39 { 40 unsigned int i; 41 float (*mat)[4] = transform->matrix; 42 char input_back[32]; 43 /* Ensure we have a buffer that's 16 byte aligned regardless of the original 44 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) 45 * because they don't work on stack variables. gcc 4.4 does do the right thing 46 * on x86 but that's too new for us right now. 
For more info: gcc bug #16660 */ 47 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); 48 /* share input and output locations to save having to keep the 49 * locations in separate registers */ 50 uint32_t const * output = (uint32_t*)input; 51 52 /* deref *transform now to avoid it in loop */ 53 const float *igtbl_r = transform->input_gamma_table_r; 54 const float *igtbl_g = transform->input_gamma_table_g; 55 const float *igtbl_b = transform->input_gamma_table_b; 56 57 /* deref *transform now to avoid it in loop */ 58 const uint8_t *otdata_r = &transform->output_table_r->data[0]; 59 const uint8_t *otdata_g = &transform->output_table_g->data[0]; 60 const uint8_t *otdata_b = &transform->output_table_b->data[0]; 61 62 /* input matrix values never change */ 63 const __m128 mat0 = _mm_load_ps(mat[0]); 64 const __m128 mat1 = _mm_load_ps(mat[1]); 65 const __m128 mat2 = _mm_load_ps(mat[2]); 66 67 /* these values don't change, either */ 68 const __m128 max = _mm_load_ps(clampMaxValueX4); 69 const __m128 min = _mm_setzero_ps(); 70 const __m128 scale = _mm_load_ps(floatScaleX4); 71 72 /* working variables */ 73 __m128 vec_r, vec_g, vec_b, result; 74 const int r_out = output_format.r; 75 const int b_out = output_format.b; 76 77 /* CYA */ 78 if (!length) 79 return; 80 81 /* one pixel is handled outside of the loop */ 82 length--; 83 84 /* setup for transforming 1st pixel */ 85 vec_r = _mm_load_ss(&igtbl_r[src[0]]); 86 vec_g = _mm_load_ss(&igtbl_g[src[1]]); 87 vec_b = _mm_load_ss(&igtbl_b[src[2]]); 88 src += 3; 89 90 /* transform all but final pixel */ 91 92 for (i=0; i<length; i++) 93 { 94 /* position values from gamma tables */ 95 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); 96 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); 97 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); 98 99 /* gamma * matrix */ 100 vec_r = _mm_mul_ps(vec_r, mat0); 101 vec_g = _mm_mul_ps(vec_g, mat1); 102 vec_b = _mm_mul_ps(vec_b, mat2); 103 104 /* crunch, crunch, crunch */ 105 vec_r = _mm_add_ps(vec_r, 
_mm_add_ps(vec_g, vec_b)); 106 vec_r = _mm_max_ps(min, vec_r); 107 vec_r = _mm_min_ps(max, vec_r); 108 result = _mm_mul_ps(vec_r, scale); 109 110 /* store calc'd output tables indices */ 111 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); 112 113 /* load for next loop while store completes */ 114 vec_r = _mm_load_ss(&igtbl_r[src[0]]); 115 vec_g = _mm_load_ss(&igtbl_g[src[1]]); 116 vec_b = _mm_load_ss(&igtbl_b[src[2]]); 117 src += 3; 118 119 /* use calc'd indices to output RGB values */ 120 dest[r_out] = otdata_r[output[0]]; 121 dest[1] = otdata_g[output[1]]; 122 dest[b_out] = otdata_b[output[2]]; 123 dest += 3; 124 } 125 126 /* handle final (maybe only) pixel */ 127 128 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); 129 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); 130 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); 131 132 vec_r = _mm_mul_ps(vec_r, mat0); 133 vec_g = _mm_mul_ps(vec_g, mat1); 134 vec_b = _mm_mul_ps(vec_b, mat2); 135 136 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); 137 vec_r = _mm_max_ps(min, vec_r); 138 vec_r = _mm_min_ps(max, vec_r); 139 result = _mm_mul_ps(vec_r, scale); 140 141 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); 142 143 dest[r_out] = otdata_r[output[0]]; 144 dest[1] = otdata_g[output[1]]; 145 dest[b_out] = otdata_b[output[2]]; 146 } 147 148 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, 149 unsigned char *src, 150 unsigned char *dest, 151 size_t length, 152 qcms_format_type output_format) 153 { 154 unsigned int i; 155 float (*mat)[4] = transform->matrix; 156 char input_back[32]; 157 /* Ensure we have a buffer that's 16 byte aligned regardless of the original 158 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) 159 * because they don't work on stack variables. gcc 4.4 does do the right thing 160 * on x86 but that's too new for us right now. 
For more info: gcc bug #16660 */ 161 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); 162 /* share input and output locations to save having to keep the 163 * locations in separate registers */ 164 uint32_t const * output = (uint32_t*)input; 165 166 /* deref *transform now to avoid it in loop */ 167 const float *igtbl_r = transform->input_gamma_table_r; 168 const float *igtbl_g = transform->input_gamma_table_g; 169 const float *igtbl_b = transform->input_gamma_table_b; 170 171 /* deref *transform now to avoid it in loop */ 172 const uint8_t *otdata_r = &transform->output_table_r->data[0]; 173 const uint8_t *otdata_g = &transform->output_table_g->data[0]; 174 const uint8_t *otdata_b = &transform->output_table_b->data[0]; 175 176 /* input matrix values never change */ 177 const __m128 mat0 = _mm_load_ps(mat[0]); 178 const __m128 mat1 = _mm_load_ps(mat[1]); 179 const __m128 mat2 = _mm_load_ps(mat[2]); 180 181 /* these values don't change, either */ 182 const __m128 max = _mm_load_ps(clampMaxValueX4); 183 const __m128 min = _mm_setzero_ps(); 184 const __m128 scale = _mm_load_ps(floatScaleX4); 185 186 /* working variables */ 187 __m128 vec_r, vec_g, vec_b, result; 188 const int r_out = output_format.r; 189 const int b_out = output_format.b; 190 unsigned char alpha; 191 192 /* CYA */ 193 if (!length) 194 return; 195 196 /* one pixel is handled outside of the loop */ 197 length--; 198 199 /* setup for transforming 1st pixel */ 200 vec_r = _mm_load_ss(&igtbl_r[src[0]]); 201 vec_g = _mm_load_ss(&igtbl_g[src[1]]); 202 vec_b = _mm_load_ss(&igtbl_b[src[2]]); 203 alpha = src[3]; 204 src += 4; 205 206 /* transform all but final pixel */ 207 208 for (i=0; i<length; i++) 209 { 210 /* position values from gamma tables */ 211 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); 212 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); 213 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); 214 215 /* gamma * matrix */ 216 vec_r = _mm_mul_ps(vec_r, mat0); 217 vec_g = _mm_mul_ps(vec_g, mat1); 218 
vec_b = _mm_mul_ps(vec_b, mat2); 219 220 /* store alpha for this pixel; load alpha for next */ 221 dest[3] = alpha; 222 alpha = src[3]; 223 224 /* crunch, crunch, crunch */ 225 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); 226 vec_r = _mm_max_ps(min, vec_r); 227 vec_r = _mm_min_ps(max, vec_r); 228 result = _mm_mul_ps(vec_r, scale); 229 230 /* store calc'd output tables indices */ 231 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); 232 233 /* load gamma values for next loop while store completes */ 234 vec_r = _mm_load_ss(&igtbl_r[src[0]]); 235 vec_g = _mm_load_ss(&igtbl_g[src[1]]); 236 vec_b = _mm_load_ss(&igtbl_b[src[2]]); 237 src += 4; 238 239 /* use calc'd indices to output RGB values */ 240 dest[r_out] = otdata_r[output[0]]; 241 dest[1] = otdata_g[output[1]]; 242 dest[b_out] = otdata_b[output[2]]; 243 dest += 4; 244 } 245 246 /* handle final (maybe only) pixel */ 247 248 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); 249 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); 250 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); 251 252 vec_r = _mm_mul_ps(vec_r, mat0); 253 vec_g = _mm_mul_ps(vec_g, mat1); 254 vec_b = _mm_mul_ps(vec_b, mat2); 255 256 dest[3] = alpha; 257 258 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); 259 vec_r = _mm_max_ps(min, vec_r); 260 vec_r = _mm_min_ps(max, vec_r); 261 result = _mm_mul_ps(vec_r, scale); 262 263 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); 264 265 dest[r_out] = otdata_r[output[0]]; 266 dest[1] = otdata_g[output[1]]; 267 dest[b_out] = otdata_b[output[2]]; 268 } 269