Home | History | Annotate | Download | only in src
      1 /* vim: set ts=8 sw=8 noexpandtab: */
      2 //  qcms
      3 //  Copyright (C) 2009 Mozilla Corporation
      4 //  Copyright (C) 1998-2007 Marti Maria
      5 //
      6 // Permission is hereby granted, free of charge, to any person obtaining
      7 // a copy of this software and associated documentation files (the "Software"),
      8 // to deal in the Software without restriction, including without limitation
      9 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10 // and/or sell copies of the Software, and to permit persons to whom the Software
     11 // is furnished to do so, subject to the following conditions:
     12 //
     13 // The above copyright notice and this permission notice shall be included in
     14 // all copies or substantial portions of the Software.
     15 //
     16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
     18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
     20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     23 
     24 #include <stdlib.h>
     25 #include <math.h>
     26 #include <assert.h>
     27 #include <string.h> //memcpy
     28 #include "qcmsint.h"
     29 #include "chain.h"
     30 #include "matrix.h"
     31 #include "transform_util.h"
     32 
     33 /* for MSVC, GCC, Intel, and Sun compilers */
     34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
     35 #define X86
     36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
     37 
     38 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
     39 // This is just an approximation, I am not handling all the non-linear
     40 // aspects of the RGB to XYZ process, and assumming that the gamma correction
     41 // has transitive property in the tranformation chain.
     42 //
     43 // the alghoritm:
     44 //
     45 //            - First I build the absolute conversion matrix using
     46 //              primaries in XYZ. This matrix is next inverted
     47 //            - Then I eval the source white point across this matrix
     48 //              obtaining the coeficients of the transformation
     49 //            - Then, I apply these coeficients to the original matrix
     50 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
     51 {
     52 	struct matrix primaries;
     53 	struct matrix primaries_invert;
     54 	struct matrix result;
     55 	struct vector white_point;
     56 	struct vector coefs;
     57 
     58 	double xn, yn;
     59 	double xr, yr;
     60 	double xg, yg;
     61 	double xb, yb;
     62 
     63 	xn = white.x;
     64 	yn = white.y;
     65 
     66 	if (yn == 0.0)
     67 		return matrix_invalid();
     68 
     69 	xr = primrs.red.x;
     70 	yr = primrs.red.y;
     71 	xg = primrs.green.x;
     72 	yg = primrs.green.y;
     73 	xb = primrs.blue.x;
     74 	yb = primrs.blue.y;
     75 
     76 	primaries.m[0][0] = xr;
     77 	primaries.m[0][1] = xg;
     78 	primaries.m[0][2] = xb;
     79 
     80 	primaries.m[1][0] = yr;
     81 	primaries.m[1][1] = yg;
     82 	primaries.m[1][2] = yb;
     83 
     84 	primaries.m[2][0] = 1 - xr - yr;
     85 	primaries.m[2][1] = 1 - xg - yg;
     86 	primaries.m[2][2] = 1 - xb - yb;
     87 	primaries.invalid = false;
     88 
     89 	white_point.v[0] = xn/yn;
     90 	white_point.v[1] = 1.;
     91 	white_point.v[2] = (1.0-xn-yn)/yn;
     92 
     93 	primaries_invert = matrix_invert(primaries);
     94 
     95 	coefs = matrix_eval(primaries_invert, white_point);
     96 
     97 	result.m[0][0] = coefs.v[0]*xr;
     98 	result.m[0][1] = coefs.v[1]*xg;
     99 	result.m[0][2] = coefs.v[2]*xb;
    100 
    101 	result.m[1][0] = coefs.v[0]*yr;
    102 	result.m[1][1] = coefs.v[1]*yg;
    103 	result.m[1][2] = coefs.v[2]*yb;
    104 
    105 	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
    106 	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
    107 	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
    108 	result.invalid = primaries_invert.invalid;
    109 
    110 	return result;
    111 }
    112 
    113 struct CIE_XYZ {
    114 	double X;
    115 	double Y;
    116 	double Z;
    117 };
    118 
    119 /* CIE Illuminant D50 */
    120 static const struct CIE_XYZ D50_XYZ = {
    121 	0.9642,
    122 	1.0000,
    123 	0.8249
    124 };
    125 
    126 /* from lcms: xyY2XYZ()
    127  * corresponds to argyll: icmYxy2XYZ() */
    128 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
    129 {
    130 	struct CIE_XYZ dest;
    131 	dest.X = (source.x / source.y) * source.Y;
    132 	dest.Y = source.Y;
    133 	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
    134 	return dest;
    135 }
    136 
    137 /* from lcms: ComputeChromaticAdaption */
    138 // Compute chromatic adaption matrix using chad as cone matrix
    139 static struct matrix
    140 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
    141                            struct CIE_XYZ dest_white_point,
    142                            struct matrix chad)
    143 {
    144 	struct matrix chad_inv;
    145 	struct vector cone_source_XYZ, cone_source_rgb;
    146 	struct vector cone_dest_XYZ, cone_dest_rgb;
    147 	struct matrix cone, tmp;
    148 
    149 	tmp = chad;
    150 	chad_inv = matrix_invert(tmp);
    151 
    152 	cone_source_XYZ.v[0] = source_white_point.X;
    153 	cone_source_XYZ.v[1] = source_white_point.Y;
    154 	cone_source_XYZ.v[2] = source_white_point.Z;
    155 
    156 	cone_dest_XYZ.v[0] = dest_white_point.X;
    157 	cone_dest_XYZ.v[1] = dest_white_point.Y;
    158 	cone_dest_XYZ.v[2] = dest_white_point.Z;
    159 
    160 	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
    161 	cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
    162 
    163 	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
    164 	cone.m[0][1] = 0;
    165 	cone.m[0][2] = 0;
    166 	cone.m[1][0] = 0;
    167 	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
    168 	cone.m[1][2] = 0;
    169 	cone.m[2][0] = 0;
    170 	cone.m[2][1] = 0;
    171 	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
    172 	cone.invalid = false;
    173 
    174 	// Normalize
    175 	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
    176 }
    177 
    178 /* from lcms: cmsAdaptionMatrix */
    179 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
    180 // Bradford is assumed
    181 static struct matrix
    182 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
    183 {
    184 #if defined (_MSC_VER)
    185 #pragma warning(push)
    186 /* Disable double to float truncation warning 4305 */
    187 #pragma warning(disable:4305)
    188 #endif
    189 	struct matrix lam_rigg = {{ // Bradford matrix
    190 	                         {  0.8951,  0.2664, -0.1614 },
    191 	                         { -0.7502,  1.7135,  0.0367 },
    192 	                         {  0.0389, -0.0685,  1.0296 }
    193 	                         }};
    194 #if defined (_MSC_VER)
    195 /* Restore warnings */
    196 #pragma warning(pop)
    197 #endif
    198 	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
    199 }
    200 
    201 /* from lcms: cmsAdaptMatrixToD50 */
    202 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
    203 {
    204 	struct CIE_XYZ Dn;
    205 	struct matrix Bradford;
    206 
    207 	if (source_white_pt.y == 0.0)
    208 		return matrix_invalid();
    209 
    210 	Dn = xyY2XYZ(source_white_pt);
    211 
    212 	Bradford = adaption_matrix(Dn, D50_XYZ);
    213 	return matrix_multiply(Bradford, r);
    214 }
    215 
    216 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
    217 {
    218 	struct matrix colorants;
    219 	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
    220 	colorants = adapt_matrix_to_D50(colorants, white_point);
    221 
    222 	if (colorants.invalid)
    223 		return false;
    224 
    225 	/* note: there's a transpose type of operation going on here */
    226 	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
    227 	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
    228 	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
    229 
    230 	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
    231 	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
    232 	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
    233 
    234 	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
    235 	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
    236 	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
    237 
    238 	return true;
    239 }
    240 
    241 #if 0
    242 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    243 {
    244 	const int r_out = output_format.r;
    245 	const int b_out = output_format.b;
    246 
    247 	int i;
    248 	float (*mat)[4] = transform->matrix;
    249 	for (i=0; i<length; i++) {
    250 		unsigned char device_r = *src++;
    251 		unsigned char device_g = *src++;
    252 		unsigned char device_b = *src++;
    253 
    254 		float linear_r = transform->input_gamma_table_r[device_r];
    255 		float linear_g = transform->input_gamma_table_g[device_g];
    256 		float linear_b = transform->input_gamma_table_b[device_b];
    257 
    258 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    259 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    260 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    261 
    262 		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
    263 		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
    264 		float out_device_b = pow(out_linear_b, transform->out_gamma_b);
    265 
    266 		dest[r_out] = clamp_u8(out_device_r*255);
    267 		dest[1]     = clamp_u8(out_device_g*255);
    268 		dest[b_out] = clamp_u8(out_device_b*255);
    269 		dest += 3;
    270 	}
    271 }
    272 #endif
    273 
    274 static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    275 {
    276 	const int r_out = output_format.r;
    277 	const int b_out = output_format.b;
    278 
    279 	unsigned int i;
    280 	for (i = 0; i < length; i++) {
    281 		float out_device_r, out_device_g, out_device_b;
    282 		unsigned char device = *src++;
    283 
    284 		float linear = transform->input_gamma_table_gray[device];
    285 
    286 		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    287 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    288 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    289 
    290 		dest[r_out] = clamp_u8(out_device_r*255);
    291 		dest[1]     = clamp_u8(out_device_g*255);
    292 		dest[b_out] = clamp_u8(out_device_b*255);
    293 		dest += 3;
    294 	}
    295 }
    296 
    297 /* Alpha is not corrected.
    298    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
    299    RGB Is?" Tech Memo 17 (December 14, 1998).
    300 	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
    301 */
    302 
    303 static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    304 {
    305 	const int r_out = output_format.r;
    306 	const int b_out = output_format.b;
    307 
    308 	unsigned int i;
    309 	for (i = 0; i < length; i++) {
    310 		float out_device_r, out_device_g, out_device_b;
    311 		unsigned char device = *src++;
    312 		unsigned char alpha = *src++;
    313 
    314 		float linear = transform->input_gamma_table_gray[device];
    315 
    316 		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    317 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    318 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    319 
    320 		dest[r_out] = clamp_u8(out_device_r*255);
    321 		dest[1]     = clamp_u8(out_device_g*255);
    322 		dest[b_out] = clamp_u8(out_device_b*255);
    323 		dest[3]     = alpha;
    324 		dest += 4;
    325 	}
    326 }
    327 
    328 
    329 static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    330 {
    331 	const int r_out = output_format.r;
    332 	const int b_out = output_format.b;
    333 
    334 	unsigned int i;
    335 	for (i = 0; i < length; i++) {
    336 		unsigned char device = *src++;
    337 		uint16_t gray;
    338 
    339 		float linear = transform->input_gamma_table_gray[device];
    340 
    341 		/* we could round here... */
    342 		gray = linear * PRECACHE_OUTPUT_MAX;
    343 
    344 		dest[r_out] = transform->output_table_r->data[gray];
    345 		dest[1]     = transform->output_table_g->data[gray];
    346 		dest[b_out] = transform->output_table_b->data[gray];
    347 		dest += 3;
    348 	}
    349 }
    350 
    351 
    352 static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    353 {
    354 	const int r_out = output_format.r;
    355 	const int b_out = output_format.b;
    356 
    357 	unsigned int i;
    358 	for (i = 0; i < length; i++) {
    359 		unsigned char device = *src++;
    360 		unsigned char alpha = *src++;
    361 		uint16_t gray;
    362 
    363 		float linear = transform->input_gamma_table_gray[device];
    364 
    365 		/* we could round here... */
    366 		gray = linear * PRECACHE_OUTPUT_MAX;
    367 
    368 		dest[r_out] = transform->output_table_r->data[gray];
    369 		dest[1]     = transform->output_table_g->data[gray];
    370 		dest[b_out] = transform->output_table_b->data[gray];
    371 		dest[3]     = alpha;
    372 		dest += 4;
    373 	}
    374 }
    375 
    376 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    377 {
    378 	const int r_out = output_format.r;
    379 	const int b_out = output_format.b;
    380 
    381 	unsigned int i;
    382 	float (*mat)[4] = transform->matrix;
    383 	for (i = 0; i < length; i++) {
    384 		unsigned char device_r = *src++;
    385 		unsigned char device_g = *src++;
    386 		unsigned char device_b = *src++;
    387 		uint16_t r, g, b;
    388 
    389 		float linear_r = transform->input_gamma_table_r[device_r];
    390 		float linear_g = transform->input_gamma_table_g[device_g];
    391 		float linear_b = transform->input_gamma_table_b[device_b];
    392 
    393 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    394 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    395 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    396 
    397 		out_linear_r = clamp_float(out_linear_r);
    398 		out_linear_g = clamp_float(out_linear_g);
    399 		out_linear_b = clamp_float(out_linear_b);
    400 
    401 		/* we could round here... */
    402 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
    403 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
    404 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
    405 
    406 		dest[r_out] = transform->output_table_r->data[r];
    407 		dest[1]     = transform->output_table_g->data[g];
    408 		dest[b_out] = transform->output_table_b->data[b];
    409 		dest += 3;
    410 	}
    411 }
    412 
    413 static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    414 {
    415 	const int r_out = output_format.r;
    416 	const int b_out = output_format.b;
    417 
    418 	unsigned int i;
    419 	float (*mat)[4] = transform->matrix;
    420 	for (i = 0; i < length; i++) {
    421 		unsigned char device_r = *src++;
    422 		unsigned char device_g = *src++;
    423 		unsigned char device_b = *src++;
    424 		unsigned char alpha = *src++;
    425 		uint16_t r, g, b;
    426 
    427 		float linear_r = transform->input_gamma_table_r[device_r];
    428 		float linear_g = transform->input_gamma_table_g[device_g];
    429 		float linear_b = transform->input_gamma_table_b[device_b];
    430 
    431 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    432 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    433 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    434 
    435 		out_linear_r = clamp_float(out_linear_r);
    436 		out_linear_g = clamp_float(out_linear_g);
    437 		out_linear_b = clamp_float(out_linear_b);
    438 
    439 		/* we could round here... */
    440 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
    441 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
    442 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
    443 
    444 		dest[r_out] = transform->output_table_r->data[r];
    445 		dest[1]     = transform->output_table_g->data[g];
    446 		dest[b_out] = transform->output_table_b->data[b];
    447 		dest[3]     = alpha;
    448 		dest += 4;
    449 	}
    450 }
    451 
    452 // Not used
    453 /*
    454 static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    455 {
    456 	const int r_out = output_format.r;
    457 	const int b_out = output_format.b;
    458 
    459 	unsigned int i;
    460 	int xy_len = 1;
    461 	int x_len = transform->grid_size;
    462 	int len = x_len * x_len;
    463 	float* r_table = transform->r_clut;
    464 	float* g_table = transform->g_clut;
    465 	float* b_table = transform->b_clut;
    466 
    467 	for (i = 0; i < length; i++) {
    468 		unsigned char in_r = *src++;
    469 		unsigned char in_g = *src++;
    470 		unsigned char in_b = *src++;
    471 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    472 
    473 		int x = floor(linear_r * (transform->grid_size-1));
    474 		int y = floor(linear_g * (transform->grid_size-1));
    475 		int z = floor(linear_b * (transform->grid_size-1));
    476 		int x_n = ceil(linear_r * (transform->grid_size-1));
    477 		int y_n = ceil(linear_g * (transform->grid_size-1));
    478 		int z_n = ceil(linear_b * (transform->grid_size-1));
    479 		float x_d = linear_r * (transform->grid_size-1) - x;
    480 		float y_d = linear_g * (transform->grid_size-1) - y;
    481 		float z_d = linear_b * (transform->grid_size-1) - z;
    482 
    483 		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
    484 		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
    485 		float r_y1 = lerp(r_x1, r_x2, y_d);
    486 		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
    487 		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
    488 		float r_y2 = lerp(r_x3, r_x4, y_d);
    489 		float clut_r = lerp(r_y1, r_y2, z_d);
    490 
    491 		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
    492 		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
    493 		float g_y1 = lerp(g_x1, g_x2, y_d);
    494 		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
    495 		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
    496 		float g_y2 = lerp(g_x3, g_x4, y_d);
    497 		float clut_g = lerp(g_y1, g_y2, z_d);
    498 
    499 		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
    500 		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
    501 		float b_y1 = lerp(b_x1, b_x2, y_d);
    502 		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
    503 		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
    504 		float b_y2 = lerp(b_x3, b_x4, y_d);
    505 		float clut_b = lerp(b_y1, b_y2, z_d);
    506 
    507 		dest[r_out] = clamp_u8(clut_r*255.0f);
    508 		dest[1]     = clamp_u8(clut_g*255.0f);
    509 		dest[b_out] = clamp_u8(clut_b*255.0f);
    510 		dest += 3;
    511 	}
    512 }
    513 */
    514 
    515 // Using lcms' tetra interpolation algorithm.
    516 static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    517 {
    518 	const int r_out = output_format.r;
    519 	const int b_out = output_format.b;
    520 
    521 	unsigned int i;
    522 	int xy_len = 1;
    523 	int x_len = transform->grid_size;
    524 	int len = x_len * x_len;
    525 	float* r_table = transform->r_clut;
    526 	float* g_table = transform->g_clut;
    527 	float* b_table = transform->b_clut;
    528 	float c0_r, c1_r, c2_r, c3_r;
    529 	float c0_g, c1_g, c2_g, c3_g;
    530 	float c0_b, c1_b, c2_b, c3_b;
    531 	float clut_r, clut_g, clut_b;
    532 	for (i = 0; i < length; i++) {
    533 		unsigned char in_r = *src++;
    534 		unsigned char in_g = *src++;
    535 		unsigned char in_b = *src++;
    536 		unsigned char in_a = *src++;
    537 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    538 
    539 		int x = floor(linear_r * (transform->grid_size-1));
    540 		int y = floor(linear_g * (transform->grid_size-1));
    541 		int z = floor(linear_b * (transform->grid_size-1));
    542 		int x_n = ceil(linear_r * (transform->grid_size-1));
    543 		int y_n = ceil(linear_g * (transform->grid_size-1));
    544 		int z_n = ceil(linear_b * (transform->grid_size-1));
    545 		float rx = linear_r * (transform->grid_size-1) - x;
    546 		float ry = linear_g * (transform->grid_size-1) - y;
    547 		float rz = linear_b * (transform->grid_size-1) - z;
    548 
    549 		c0_r = CLU(r_table, x, y, z);
    550 		c0_g = CLU(g_table, x, y, z);
    551 		c0_b = CLU(b_table, x, y, z);
    552 
    553 		if( rx >= ry ) {
    554 			if (ry >= rz) { //rx >= ry && ry >= rz
    555 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
    556 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
    557 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    558 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
    559 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
    560 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    561 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
    562 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
    563 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    564 			} else {
    565 				if (rx >= rz) { //rx >= rz && rz >= ry
    566 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
    567 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    568 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
    569 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
    570 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    571 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
    572 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
    573 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    574 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
    575 				} else { //rz > rx && rx >= ry
    576 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
    577 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    578 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    579 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
    580 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    581 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    582 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
    583 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    584 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    585 				}
    586 			}
    587 		} else {
    588 			if (rx >= rz) { //ry > rx && rx >= rz
    589 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
    590 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
    591 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    592 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
    593 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
    594 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    595 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
    596 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
    597 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    598 			} else {
    599 				if (ry >= rz) { //ry >= rz && rz > rx
    600 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    601 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
    602 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
    603 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    604 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
    605 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
    606 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    607 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
    608 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
    609 				} else { //rz > ry && ry > rx
    610 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    611 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
    612 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    613 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    614 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
    615 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    616 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    617 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
    618 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    619 				}
    620 			}
    621 		}
    622 
    623 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
    624 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
    625 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
    626 
    627 		dest[r_out] = clamp_u8(clut_r*255.0f);
    628 		dest[1]     = clamp_u8(clut_g*255.0f);
    629 		dest[b_out] = clamp_u8(clut_b*255.0f);
    630 		dest[3]     = in_a;
    631 		dest += 4;
    632 	}
    633 }
    634 
    635 // Using lcms' tetra interpolation code.
    636 static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    637 {
    638 	const int r_out = output_format.r;
    639 	const int b_out = output_format.b;
    640 
    641 	unsigned int i;
    642 	int xy_len = 1;
    643 	int x_len = transform->grid_size;
    644 	int len = x_len * x_len;
    645 	float* r_table = transform->r_clut;
    646 	float* g_table = transform->g_clut;
    647 	float* b_table = transform->b_clut;
    648 	float c0_r, c1_r, c2_r, c3_r;
    649 	float c0_g, c1_g, c2_g, c3_g;
    650 	float c0_b, c1_b, c2_b, c3_b;
    651 	float clut_r, clut_g, clut_b;
    652 	for (i = 0; i < length; i++) {
    653 		unsigned char in_r = *src++;
    654 		unsigned char in_g = *src++;
    655 		unsigned char in_b = *src++;
    656 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    657 
    658 		int x = floor(linear_r * (transform->grid_size-1));
    659 		int y = floor(linear_g * (transform->grid_size-1));
    660 		int z = floor(linear_b * (transform->grid_size-1));
    661 		int x_n = ceil(linear_r * (transform->grid_size-1));
    662 		int y_n = ceil(linear_g * (transform->grid_size-1));
    663 		int z_n = ceil(linear_b * (transform->grid_size-1));
    664 		float rx = linear_r * (transform->grid_size-1) - x;
    665 		float ry = linear_g * (transform->grid_size-1) - y;
    666 		float rz = linear_b * (transform->grid_size-1) - z;
    667 
    668 		c0_r = CLU(r_table, x, y, z);
    669 		c0_g = CLU(g_table, x, y, z);
    670 		c0_b = CLU(b_table, x, y, z);
    671 
    672 		if( rx >= ry ) {
    673 			if (ry >= rz) { //rx >= ry && ry >= rz
    674 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
    675 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
    676 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    677 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
    678 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
    679 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    680 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
    681 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
    682 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    683 			} else {
    684 				if (rx >= rz) { //rx >= rz && rz >= ry
    685 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
    686 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    687 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
    688 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
    689 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    690 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
    691 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
    692 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    693 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
    694 				} else { //rz > rx && rx >= ry
    695 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
    696 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    697 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    698 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
    699 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    700 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    701 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
    702 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    703 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    704 				}
    705 			}
    706 		} else {
    707 			if (rx >= rz) { //ry > rx && rx >= rz
    708 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
    709 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
    710 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    711 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
    712 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
    713 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    714 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
    715 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
    716 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    717 			} else {
    718 				if (ry >= rz) { //ry >= rz && rz > rx
    719 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    720 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
    721 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
    722 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    723 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
    724 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
    725 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    726 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
    727 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
    728 				} else { //rz > ry && ry > rx
    729 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    730 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
    731 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    732 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    733 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
    734 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    735 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    736 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
    737 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    738 				}
    739 			}
    740 		}
    741 
    742 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
    743 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
    744 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
    745 
    746 		dest[r_out] = clamp_u8(clut_r*255.0f);
    747 		dest[1]     = clamp_u8(clut_g*255.0f);
    748 		dest[b_out] = clamp_u8(clut_b*255.0f);
    749 		dest += 3;
    750 	}
    751 }
    752 
    753 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    754 {
    755 	const int r_out = output_format.r;
    756 	const int b_out = output_format.b;
    757 
    758 	unsigned int i;
    759 	float (*mat)[4] = transform->matrix;
    760 	for (i = 0; i < length; i++) {
    761 		unsigned char device_r = *src++;
    762 		unsigned char device_g = *src++;
    763 		unsigned char device_b = *src++;
    764 		float out_device_r, out_device_g, out_device_b;
    765 
    766 		float linear_r = transform->input_gamma_table_r[device_r];
    767 		float linear_g = transform->input_gamma_table_g[device_g];
    768 		float linear_b = transform->input_gamma_table_b[device_b];
    769 
    770 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    771 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    772 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    773 
    774 		out_linear_r = clamp_float(out_linear_r);
    775 		out_linear_g = clamp_float(out_linear_g);
    776 		out_linear_b = clamp_float(out_linear_b);
    777 
    778 		out_device_r = lut_interp_linear(out_linear_r,
    779 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    780 		out_device_g = lut_interp_linear(out_linear_g,
    781 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    782 		out_device_b = lut_interp_linear(out_linear_b,
    783 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    784 
    785 		dest[r_out] = clamp_u8(out_device_r*255);
    786 		dest[1]     = clamp_u8(out_device_g*255);
    787 		dest[b_out] = clamp_u8(out_device_b*255);
    788 		dest += 3;
    789 	}
    790 }
    791 
    792 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    793 {
    794 	const int r_out = output_format.r;
    795 	const int b_out = output_format.b;
    796 
    797 	unsigned int i;
    798 	float (*mat)[4] = transform->matrix;
    799 	for (i = 0; i < length; i++) {
    800 		unsigned char device_r = *src++;
    801 		unsigned char device_g = *src++;
    802 		unsigned char device_b = *src++;
    803 		unsigned char alpha = *src++;
    804 		float out_device_r, out_device_g, out_device_b;
    805 
    806 		float linear_r = transform->input_gamma_table_r[device_r];
    807 		float linear_g = transform->input_gamma_table_g[device_g];
    808 		float linear_b = transform->input_gamma_table_b[device_b];
    809 
    810 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    811 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    812 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    813 
    814 		out_linear_r = clamp_float(out_linear_r);
    815 		out_linear_g = clamp_float(out_linear_g);
    816 		out_linear_b = clamp_float(out_linear_b);
    817 
    818 		out_device_r = lut_interp_linear(out_linear_r,
    819 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    820 		out_device_g = lut_interp_linear(out_linear_g,
    821 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    822 		out_device_b = lut_interp_linear(out_linear_b,
    823 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    824 
    825 		dest[r_out] = clamp_u8(out_device_r*255);
    826 		dest[1]     = clamp_u8(out_device_g*255);
    827 		dest[b_out] = clamp_u8(out_device_b*255);
    828 		dest[3]     = alpha;
    829 		dest += 4;
    830 	}
    831 }
    832 
    833 #if 0
    834 static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    835 {
    836 	const int r_out = output_format.r;
    837 	const int b_out = output_format.b;
    838 
    839 	int i;
    840 	float (*mat)[4] = transform->matrix;
    841 	for (i = 0; i < length; i++) {
    842 		unsigned char device_r = *src++;
    843 		unsigned char device_g = *src++;
    844 		unsigned char device_b = *src++;
    845 
    846 		float linear_r = transform->input_gamma_table_r[device_r];
    847 		float linear_g = transform->input_gamma_table_g[device_g];
    848 		float linear_b = transform->input_gamma_table_b[device_b];
    849 
    850 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    851 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    852 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    853 
    854 		dest[r_out] = clamp_u8(out_linear_r*255);
    855 		dest[1]     = clamp_u8(out_linear_g*255);
    856 		dest[b_out] = clamp_u8(out_linear_b*255);
    857 		dest += 3;
    858 	}
    859 }
    860 #endif
    861 
    862 /*
    863  * If users create and destroy objects on different threads, even if the same
    864  * objects aren't used on different threads at the same time, we can still run
    865  * in to trouble with refcounts if they aren't atomic.
    866  *
    867  * This can lead to us prematurely deleting the precache if threads get unlucky
    868  * and write the wrong value to the ref count.
    869  */
    870 static struct precache_output *precache_reference(struct precache_output *p)
    871 {
    872 	qcms_atomic_increment(p->ref_count);
    873 	return p;
    874 }
    875 
    876 static struct precache_output *precache_create()
    877 {
    878 	struct precache_output *p = malloc(sizeof(struct precache_output));
    879 	if (p)
    880 		p->ref_count = 1;
    881 	return p;
    882 }
    883 
    884 void precache_release(struct precache_output *p)
    885 {
    886 	if (qcms_atomic_decrement(p->ref_count) == 0) {
    887 		free(p);
    888 	}
    889 }
    890 
    891 #ifdef HAVE_POSIX_MEMALIGN
    892 static qcms_transform *transform_alloc(void)
    893 {
    894 	qcms_transform *t;
    895 	if (!posix_memalign(&t, 16, sizeof(*t))) {
    896 		return t;
    897 	} else {
    898 		return NULL;
    899 	}
    900 }
    901 static void transform_free(qcms_transform *t)
    902 {
    903 	free(t);
    904 }
    905 #else
    906 static qcms_transform *transform_alloc(void)
    907 {
    908 	/* transform needs to be aligned on a 16byte boundrary */
    909 	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
    910 	/* make room for a pointer to the block returned by calloc */
    911 	void *transform_start = original_block + sizeof(void*);
    912 	/* align transform_start */
    913 	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
    914 
    915 	/* store a pointer to the block returned by calloc so that we can free it later */
    916 	void **(original_block_ptr) = (void**)transform_aligned;
    917 	if (!original_block)
    918 		return NULL;
    919 	original_block_ptr--;
    920 	*original_block_ptr = original_block;
    921 
    922 	return transform_aligned;
    923 }
    924 static void transform_free(qcms_transform *t)
    925 {
    926 	/* get at the pointer to the unaligned block returned by calloc */
    927 	void **p = (void**)t;
    928 	p--;
    929 	free(*p);
    930 }
    931 #endif
    932 
    933 void qcms_transform_release(qcms_transform *t)
    934 {
    935 	/* ensure we only free the gamma tables once even if there are
    936 	 * multiple references to the same data */
    937 
    938 	if (t->output_table_r)
    939 		precache_release(t->output_table_r);
    940 	if (t->output_table_g)
    941 		precache_release(t->output_table_g);
    942 	if (t->output_table_b)
    943 		precache_release(t->output_table_b);
    944 
    945 	free(t->input_gamma_table_r);
    946 	if (t->input_gamma_table_g != t->input_gamma_table_r)
    947 		free(t->input_gamma_table_g);
    948 	if (t->input_gamma_table_g != t->input_gamma_table_r &&
    949 	    t->input_gamma_table_g != t->input_gamma_table_b)
    950 		free(t->input_gamma_table_b);
    951 
    952 	free(t->input_gamma_table_gray);
    953 
    954 	free(t->output_gamma_lut_r);
    955 	free(t->output_gamma_lut_g);
    956 	free(t->output_gamma_lut_b);
    957 
    958 	transform_free(t);
    959 }
    960 
    961 #ifdef X86
    962 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
    963 // mozilla/jpeg)
    964  // -------------------------------------------------------------------------
    965 #if defined(_M_IX86) && defined(_MSC_VER)
    966 #define HAS_CPUID
    967 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
    968    register - I'm not sure if that ever happens on windows, but cpuid isn't
    969    on the critical path so we just preserve the register to be safe and to be
    970    consistent with the non-windows version. */
    971 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    972        uint32_t a_, b_, c_, d_;
    973        __asm {
    974               xchg   ebx, esi
    975               mov    eax, fxn
    976               cpuid
    977               mov    a_, eax
    978               mov    b_, ebx
    979               mov    c_, ecx
    980               mov    d_, edx
    981               xchg   ebx, esi
    982        }
    983        *a = a_;
    984        *b = b_;
    985        *c = c_;
    986        *d = d_;
    987 }
    988 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
    989 #define HAS_CPUID
    990 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
    991    some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
    992 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    993 
    994 	uint32_t a_, b_, c_, d_;
    995        __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
    996                              : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
    997 	   *a = a_;
    998 	   *b = b_;
    999 	   *c = c_;
   1000 	   *d = d_;
   1001 }
   1002 #endif
   1003 
   1004 // -------------------------Runtime SSEx Detection-----------------------------
   1005 
   1006 /* MMX is always supported per
   1007  *  Gecko v1.9.1 minimum CPU requirements */
   1008 #define SSE1_EDX_MASK (1UL << 25)
   1009 #define SSE2_EDX_MASK (1UL << 26)
   1010 #define SSE3_ECX_MASK (1UL <<  0)
   1011 
   1012 static int sse_version_available(void)
   1013 {
   1014 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
   1015 	/* we know at build time that 64-bit CPUs always have SSE2
   1016 	 * this tells the compiler that non-SSE2 branches will never be
   1017 	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
   1018 	return 2;
   1019 #elif defined(HAS_CPUID)
   1020 	static int sse_version = -1;
   1021 	uint32_t a, b, c, d;
   1022 	uint32_t function = 0x00000001;
   1023 
   1024 	if (sse_version == -1) {
   1025 		sse_version = 0;
   1026 		cpuid(function, &a, &b, &c, &d);
   1027 		if (c & SSE3_ECX_MASK)
   1028 			sse_version = 3;
   1029 		else if (d & SSE2_EDX_MASK)
   1030 			sse_version = 2;
   1031 		else if (d & SSE1_EDX_MASK)
   1032 			sse_version = 1;
   1033 	}
   1034 
   1035 	return sse_version;
   1036 #else
   1037 	return 0;
   1038 #endif
   1039 }
   1040 #endif
   1041 
   1042 static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
   1043 						{-0.7502f, 1.7135f, 0.0367f},
   1044 						{ 0.0389f,-0.0685f, 1.0296f}},
   1045 						false};
   1046 
   1047 static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
   1048 						    { 0.4323053f, 0.5183603f, 0.0492912f},
   1049 						    {-0.0085287f, 0.0400428f, 0.9684867f}},
   1050 						    false};
   1051 
   1052 // See ICCv4 E.3
   1053 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
   1054 	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
   1055 		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
   1056 	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
   1057 		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
   1058 	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
   1059 		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
   1060 	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
   1061 	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
   1062 }
   1063 
   1064 void qcms_profile_precache_output_transform(qcms_profile *profile)
   1065 {
   1066 	/* we only support precaching on rgb profiles */
   1067 	if (profile->color_space != RGB_SIGNATURE)
   1068 		return;
   1069 
   1070 	if (qcms_supports_iccv4) {
   1071 		/* don't precache since we will use the B2A LUT */
   1072 		if (profile->B2A0)
   1073 			return;
   1074 
   1075 		/* don't precache since we will use the mBA LUT */
   1076 		if (profile->mBA)
   1077 			return;
   1078 	}
   1079 
   1080 	/* don't precache if we do not have the TRC curves */
   1081 	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
   1082 		return;
   1083 
   1084 	if (!profile->output_table_r) {
   1085 		profile->output_table_r = precache_create();
   1086 		if (profile->output_table_r &&
   1087 				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
   1088 			precache_release(profile->output_table_r);
   1089 			profile->output_table_r = NULL;
   1090 		}
   1091 	}
   1092 	if (!profile->output_table_g) {
   1093 		profile->output_table_g = precache_create();
   1094 		if (profile->output_table_g &&
   1095 				!compute_precache(profile->greenTRC, profile->output_table_g->data)) {
   1096 			precache_release(profile->output_table_g);
   1097 			profile->output_table_g = NULL;
   1098 		}
   1099 	}
   1100 	if (!profile->output_table_b) {
   1101 		profile->output_table_b = precache_create();
   1102 		if (profile->output_table_b &&
   1103 				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
   1104 			precache_release(profile->output_table_b);
   1105 			profile->output_table_b = NULL;
   1106 		}
   1107 	}
   1108 }
   1109 
   1110 /* Replace the current transformation with a LUT transformation using a given number of sample points */
   1111 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
   1112                                                  int samples, qcms_data_type in_type)
   1113 {
   1114 	/* The range between which 2 consecutive sample points can be used to interpolate */
   1115 	uint16_t x,y,z;
   1116 	uint32_t l;
   1117 	uint32_t lutSize = 3 * samples * samples * samples;
   1118 	float* src = NULL;
   1119 	float* dest = NULL;
   1120 	float* lut = NULL;
   1121 
   1122 	src = malloc(lutSize*sizeof(float));
   1123 	dest = malloc(lutSize*sizeof(float));
   1124 
   1125 	if (src && dest) {
   1126 		/* Prepare a list of points we want to sample */
   1127 		l = 0;
   1128 		for (x = 0; x < samples; x++) {
   1129 			for (y = 0; y < samples; y++) {
   1130 				for (z = 0; z < samples; z++) {
   1131 					src[l++] = x / (float)(samples-1);
   1132 					src[l++] = y / (float)(samples-1);
   1133 					src[l++] = z / (float)(samples-1);
   1134 				}
   1135 			}
   1136 		}
   1137 
   1138 		lut = qcms_chain_transform(in, out, src, dest, lutSize);
   1139 		if (lut) {
   1140 			transform->r_clut = &lut[0];
   1141 			transform->g_clut = &lut[1];
   1142 			transform->b_clut = &lut[2];
   1143 			transform->grid_size = samples;
   1144 			if (in_type == QCMS_DATA_RGBA_8) {
   1145 				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
   1146 			} else {
   1147 				transform->transform_fn = qcms_transform_data_tetra_clut;
   1148 			}
   1149 		}
   1150 	}
   1151 
   1152 
   1153 	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
   1154 	if (src && lut != src) {
   1155 		free(src);
   1156 	} else if (dest && lut != src) {
   1157 		free(dest);
   1158 	}
   1159 
   1160 	if (lut == NULL) {
   1161 		return NULL;
   1162 	}
   1163 	return transform;
   1164 }
   1165 
   1166 #define NO_MEM_TRANSFORM NULL
   1167 
   1168 qcms_transform* qcms_transform_create(
   1169 		qcms_profile *in, qcms_data_type in_type,
   1170 		qcms_profile *out, qcms_data_type out_type,
   1171 		qcms_intent intent)
   1172 {
   1173 	bool precache = false;
   1174 
   1175         qcms_transform *transform = transform_alloc();
   1176         if (!transform) {
   1177 		return NULL;
   1178 	}
   1179 	if (out_type != QCMS_DATA_RGB_8 &&
   1180                 out_type != QCMS_DATA_RGBA_8) {
   1181             assert(0 && "output type");
   1182 	    transform_free(transform);
   1183             return NULL;
   1184         }
   1185 
   1186 	if (out->output_table_r &&
   1187 			out->output_table_g &&
   1188 			out->output_table_b) {
   1189 		precache = true;
   1190 	}
   1191 
   1192 	if (qcms_supports_iccv4 && (in->A2B0 || out->B2A0 || in->mAB || out->mAB)) {
   1193 		// Precache the transformation to a CLUT 33x33x33 in size.
   1194 		// 33 is used by many profiles and works well in pratice.
   1195 		// This evenly divides 256 into blocks of 8x8x8.
   1196 		// TODO For transforming small data sets of about 200x200 or less
   1197 		// precaching should be avoided.
   1198 		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
   1199 		if (!result) {
   1200             		assert(0 && "precacheLUT failed");
   1201 			transform_free(transform);
   1202 			return NULL;
   1203 		}
   1204 		return result;
   1205 	}
   1206 
   1207 	if (precache) {
   1208 		transform->output_table_r = precache_reference(out->output_table_r);
   1209 		transform->output_table_g = precache_reference(out->output_table_g);
   1210 		transform->output_table_b = precache_reference(out->output_table_b);
   1211 	} else {
   1212 		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
   1213 			qcms_transform_release(transform);
   1214 			return NO_MEM_TRANSFORM;
   1215 		}
   1216 		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
   1217 		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
   1218 		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
   1219 		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
   1220 			qcms_transform_release(transform);
   1221 			return NO_MEM_TRANSFORM;
   1222 		}
   1223 	}
   1224 
   1225         if (in->color_space == RGB_SIGNATURE) {
   1226 		struct matrix in_matrix, out_matrix, result;
   1227 
   1228 		if (in_type != QCMS_DATA_RGB_8 &&
   1229                     in_type != QCMS_DATA_RGBA_8){
   1230                 	assert(0 && "input type");
   1231 			transform_free(transform);
   1232                 	return NULL;
   1233             	}
   1234 		if (precache) {
   1235 #if defined(SSE2_ENABLE) && defined(X86)
   1236 		    if (sse_version_available() >= 2) {
   1237 			    if (in_type == QCMS_DATA_RGB_8)
   1238 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
   1239 			    else
   1240 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
   1241 
   1242 #if defined(SSE2_ENABLE) && !(defined(_MSC_VER) && defined(_M_AMD64))
   1243                     /* Microsoft Compiler for x64 doesn't support MMX.
   1244                      * SSE code uses MMX so that we disable on x64 */
   1245 		    } else
   1246 		    if (sse_version_available() >= 1) {
   1247 			    if (in_type == QCMS_DATA_RGB_8)
   1248 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
   1249 			    else
   1250 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
   1251 #endif
   1252 		    } else
   1253 #endif
   1254 			{
   1255 				if (in_type == QCMS_DATA_RGB_8)
   1256 					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
   1257 				else
   1258 					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
   1259 			}
   1260 		} else {
   1261 			if (in_type == QCMS_DATA_RGB_8)
   1262 				transform->transform_fn = qcms_transform_data_rgb_out_lut;
   1263 			else
   1264 				transform->transform_fn = qcms_transform_data_rgba_out_lut;
   1265 		}
   1266 
   1267 		//XXX: avoid duplicating tables if we can
   1268 		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
   1269 		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
   1270 		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
   1271 		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
   1272 			qcms_transform_release(transform);
   1273 			return NO_MEM_TRANSFORM;
   1274 		}
   1275 
   1276 
   1277 		/* build combined colorant matrix */
   1278 		in_matrix = build_colorant_matrix(in);
   1279 		out_matrix = build_colorant_matrix(out);
   1280 		out_matrix = matrix_invert(out_matrix);
   1281 		if (out_matrix.invalid) {
   1282 			qcms_transform_release(transform);
   1283 			return NULL;
   1284 		}
   1285 		result = matrix_multiply(out_matrix, in_matrix);
   1286 
   1287 		/* store the results in column major mode
   1288 		 * this makes doing the multiplication with sse easier */
   1289 		transform->matrix[0][0] = result.m[0][0];
   1290 		transform->matrix[1][0] = result.m[0][1];
   1291 		transform->matrix[2][0] = result.m[0][2];
   1292 		transform->matrix[0][1] = result.m[1][0];
   1293 		transform->matrix[1][1] = result.m[1][1];
   1294 		transform->matrix[2][1] = result.m[1][2];
   1295 		transform->matrix[0][2] = result.m[2][0];
   1296 		transform->matrix[1][2] = result.m[2][1];
   1297 		transform->matrix[2][2] = result.m[2][2];
   1298 
   1299 	} else if (in->color_space == GRAY_SIGNATURE) {
   1300 		if (in_type != QCMS_DATA_GRAY_8 &&
   1301 				in_type != QCMS_DATA_GRAYA_8){
   1302 			assert(0 && "input type");
   1303 			transform_free(transform);
   1304 			return NULL;
   1305 		}
   1306 
   1307 		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
   1308 		if (!transform->input_gamma_table_gray) {
   1309 			qcms_transform_release(transform);
   1310 			return NO_MEM_TRANSFORM;
   1311 		}
   1312 
   1313 		if (precache) {
   1314 			if (in_type == QCMS_DATA_GRAY_8) {
   1315 				transform->transform_fn = qcms_transform_data_gray_out_precache;
   1316 			} else {
   1317 				transform->transform_fn = qcms_transform_data_graya_out_precache;
   1318 			}
   1319 		} else {
   1320 			if (in_type == QCMS_DATA_GRAY_8) {
   1321 				transform->transform_fn = qcms_transform_data_gray_out_lut;
   1322 			} else {
   1323 				transform->transform_fn = qcms_transform_data_graya_out_lut;
   1324 			}
   1325 		}
   1326 	} else {
   1327 		assert(0 && "unexpected colorspace");
   1328 		transform_free(transform);
   1329 		return NULL;
   1330 	}
   1331 	return transform;
   1332 }
   1333 
   1334 /* __force_align_arg_pointer__ is an x86-only attribute, and gcc/clang warns on unused
   1335  * attributes. Don't use this on ARM or AMD64. __has_attribute can detect the presence
   1336  * of the attribute but is currently only supported by clang */
   1337 #if defined(__has_attribute)
   1338 #define HAS_FORCE_ALIGN_ARG_POINTER __has_attribute(__force_align_arg_pointer__)
   1339 #elif defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) && !defined(__arm__) && !defined(__mips__)
   1340 #define HAS_FORCE_ALIGN_ARG_POINTER 1
   1341 #else
   1342 #define HAS_FORCE_ALIGN_ARG_POINTER 0
   1343 #endif
   1344 
   1345 #if HAS_FORCE_ALIGN_ARG_POINTER
   1346 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
   1347 __attribute__((__force_align_arg_pointer__))
   1348 #endif
   1349 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
   1350 {
   1351 	static const struct _qcms_format_type output_rgbx = { 0, 2 };
   1352 
   1353 	transform->transform_fn(transform, src, dest, length, output_rgbx);
   1354 }
   1355 
   1356 void qcms_transform_data_type(qcms_transform *transform, void *src, void *dest, size_t length, qcms_output_type type)
   1357 {
   1358 	static const struct _qcms_format_type output_rgbx = { 0, 2 };
   1359 	static const struct _qcms_format_type output_bgrx = { 2, 0 };
   1360 
   1361 	transform->transform_fn(transform, src, dest, length, type == QCMS_OUTPUT_BGRX ? output_bgrx : output_rgbx);
   1362 }
   1363 
   1364 qcms_bool qcms_supports_iccv4;
   1365 void qcms_enable_iccv4()
   1366 {
   1367 	qcms_supports_iccv4 = true;
   1368 }
   1369