Home | History | Annotate | Download | only in src
      1 /* vim: set ts=8 sw=8 noexpandtab: */
      2 //  qcms
      3 //  Copyright (C) 2009 Mozilla Corporation
      4 //  Copyright (C) 1998-2007 Marti Maria
      5 //
      6 // Permission is hereby granted, free of charge, to any person obtaining
      7 // a copy of this software and associated documentation files (the "Software"),
      8 // to deal in the Software without restriction, including without limitation
      9 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10 // and/or sell copies of the Software, and to permit persons to whom the Software
     11 // is furnished to do so, subject to the following conditions:
     12 //
     13 // The above copyright notice and this permission notice shall be included in
     14 // all copies or substantial portions of the Software.
     15 //
     16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
     18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
     20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     23 
     24 #include <stdlib.h>
     25 #include <math.h>
     26 #include <assert.h>
     27 #include <string.h> //memcpy
     28 #include "qcmsint.h"
     29 #include "chain.h"
     30 #include "matrix.h"
     31 #include "transform_util.h"
     32 
     33 /* for MSVC, GCC, Intel, and Sun compilers */
     34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
     35 #define X86
     36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
     37 
     38 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
     39 // This is just an approximation, I am not handling all the non-linear
     40 // aspects of the RGB to XYZ process, and assumming that the gamma correction
     41 // has transitive property in the tranformation chain.
     42 //
     43 // the alghoritm:
     44 //
     45 //            - First I build the absolute conversion matrix using
     46 //              primaries in XYZ. This matrix is next inverted
     47 //            - Then I eval the source white point across this matrix
     48 //              obtaining the coeficients of the transformation
     49 //            - Then, I apply these coeficients to the original matrix
     50 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
     51 {
     52 	struct matrix primaries;
     53 	struct matrix primaries_invert;
     54 	struct matrix result;
     55 	struct vector white_point;
     56 	struct vector coefs;
     57 
     58 	double xn, yn;
     59 	double xr, yr;
     60 	double xg, yg;
     61 	double xb, yb;
     62 
     63 	xn = white.x;
     64 	yn = white.y;
     65 
     66 	if (yn == 0.0)
     67 		return matrix_invalid();
     68 
     69 	xr = primrs.red.x;
     70 	yr = primrs.red.y;
     71 	xg = primrs.green.x;
     72 	yg = primrs.green.y;
     73 	xb = primrs.blue.x;
     74 	yb = primrs.blue.y;
     75 
     76 	primaries.m[0][0] = xr;
     77 	primaries.m[0][1] = xg;
     78 	primaries.m[0][2] = xb;
     79 
     80 	primaries.m[1][0] = yr;
     81 	primaries.m[1][1] = yg;
     82 	primaries.m[1][2] = yb;
     83 
     84 	primaries.m[2][0] = 1 - xr - yr;
     85 	primaries.m[2][1] = 1 - xg - yg;
     86 	primaries.m[2][2] = 1 - xb - yb;
     87 	primaries.invalid = false;
     88 
     89 	white_point.v[0] = xn/yn;
     90 	white_point.v[1] = 1.;
     91 	white_point.v[2] = (1.0-xn-yn)/yn;
     92 
     93 	primaries_invert = matrix_invert(primaries);
     94 
     95 	coefs = matrix_eval(primaries_invert, white_point);
     96 
     97 	result.m[0][0] = coefs.v[0]*xr;
     98 	result.m[0][1] = coefs.v[1]*xg;
     99 	result.m[0][2] = coefs.v[2]*xb;
    100 
    101 	result.m[1][0] = coefs.v[0]*yr;
    102 	result.m[1][1] = coefs.v[1]*yg;
    103 	result.m[1][2] = coefs.v[2]*yb;
    104 
    105 	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
    106 	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
    107 	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
    108 	result.invalid = primaries_invert.invalid;
    109 
    110 	return result;
    111 }
    112 
    113 struct CIE_XYZ {
    114 	double X;
    115 	double Y;
    116 	double Z;
    117 };
    118 
    119 /* CIE Illuminant D50 */
    120 static const struct CIE_XYZ D50_XYZ = {
    121 	0.9642,
    122 	1.0000,
    123 	0.8249
    124 };
    125 
    126 /* from lcms: xyY2XYZ()
    127  * corresponds to argyll: icmYxy2XYZ() */
    128 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
    129 {
    130 	struct CIE_XYZ dest;
    131 	dest.X = (source.x / source.y) * source.Y;
    132 	dest.Y = source.Y;
    133 	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
    134 	return dest;
    135 }
    136 
    137 /* from lcms: ComputeChromaticAdaption */
    138 // Compute chromatic adaption matrix using chad as cone matrix
    139 static struct matrix
    140 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
    141                            struct CIE_XYZ dest_white_point,
    142                            struct matrix chad)
    143 {
    144 	struct matrix chad_inv;
    145 	struct vector cone_source_XYZ, cone_source_rgb;
    146 	struct vector cone_dest_XYZ, cone_dest_rgb;
    147 	struct matrix cone, tmp;
    148 
    149 	tmp = chad;
    150 	chad_inv = matrix_invert(tmp);
    151 
    152 	cone_source_XYZ.v[0] = source_white_point.X;
    153 	cone_source_XYZ.v[1] = source_white_point.Y;
    154 	cone_source_XYZ.v[2] = source_white_point.Z;
    155 
    156 	cone_dest_XYZ.v[0] = dest_white_point.X;
    157 	cone_dest_XYZ.v[1] = dest_white_point.Y;
    158 	cone_dest_XYZ.v[2] = dest_white_point.Z;
    159 
    160 	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
    161 	cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
    162 
    163 	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
    164 	cone.m[0][1] = 0;
    165 	cone.m[0][2] = 0;
    166 	cone.m[1][0] = 0;
    167 	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
    168 	cone.m[1][2] = 0;
    169 	cone.m[2][0] = 0;
    170 	cone.m[2][1] = 0;
    171 	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
    172 	cone.invalid = false;
    173 
    174 	// Normalize
    175 	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
    176 }
    177 
    178 /* from lcms: cmsAdaptionMatrix */
    179 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
    180 // Bradford is assumed
    181 static struct matrix
    182 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
    183 {
    184 #if defined (_MSC_VER)
    185 #pragma warning(push)
    186 /* Disable double to float truncation warning 4305 */
    187 #pragma warning(disable:4305)
    188 #endif
    189 	struct matrix lam_rigg = {{ // Bradford matrix
    190 	                         {  0.8951,  0.2664, -0.1614 },
    191 	                         { -0.7502,  1.7135,  0.0367 },
    192 	                         {  0.0389, -0.0685,  1.0296 }
    193 	                         }};
    194 #if defined (_MSC_VER)
    195 /* Restore warnings */
    196 #pragma warning(pop)
    197 #endif
    198 	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
    199 }
    200 
    201 /* from lcms: cmsAdaptMatrixToD50 */
    202 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
    203 {
    204 	struct CIE_XYZ Dn;
    205 	struct matrix Bradford;
    206 
    207 	if (source_white_pt.y == 0.0)
    208 		return matrix_invalid();
    209 
    210 	Dn = xyY2XYZ(source_white_pt);
    211 
    212 	Bradford = adaption_matrix(Dn, D50_XYZ);
    213 	return matrix_multiply(Bradford, r);
    214 }
    215 
    216 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
    217 {
    218 	struct matrix colorants;
    219 	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
    220 	colorants = adapt_matrix_to_D50(colorants, white_point);
    221 
    222 	if (colorants.invalid)
    223 		return false;
    224 
    225 	/* note: there's a transpose type of operation going on here */
    226 	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
    227 	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
    228 	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
    229 
    230 	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
    231 	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
    232 	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
    233 
    234 	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
    235 	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
    236 	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
    237 
    238 	return true;
    239 }
    240 
    241 #if 0
    242 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    243 {
    244 	const int r_out = output_format.r;
    245 	const int b_out = output_format.b;
    246 
    247 	int i;
    248 	float (*mat)[4] = transform->matrix;
    249 	for (i=0; i<length; i++) {
    250 		unsigned char device_r = *src++;
    251 		unsigned char device_g = *src++;
    252 		unsigned char device_b = *src++;
    253 
    254 		float linear_r = transform->input_gamma_table_r[device_r];
    255 		float linear_g = transform->input_gamma_table_g[device_g];
    256 		float linear_b = transform->input_gamma_table_b[device_b];
    257 
    258 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    259 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    260 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    261 
    262 		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
    263 		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
    264 		float out_device_b = pow(out_linear_b, transform->out_gamma_b);
    265 
    266 		dest[r_out] = clamp_u8(out_device_r*255);
    267 		dest[1]     = clamp_u8(out_device_g*255);
    268 		dest[b_out] = clamp_u8(out_device_b*255);
    269 		dest += 3;
    270 	}
    271 }
    272 #endif
    273 
    274 static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    275 {
    276 	const int r_out = output_format.r;
    277 	const int b_out = output_format.b;
    278 
    279 	unsigned int i;
    280 	for (i = 0; i < length; i++) {
    281 		float out_device_r, out_device_g, out_device_b;
    282 		unsigned char device = *src++;
    283 
    284 		float linear = transform->input_gamma_table_gray[device];
    285 
    286 		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    287 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    288 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    289 
    290 		dest[r_out] = clamp_u8(out_device_r*255);
    291 		dest[1]     = clamp_u8(out_device_g*255);
    292 		dest[b_out] = clamp_u8(out_device_b*255);
    293 		dest += 3;
    294 	}
    295 }
    296 
    297 /* Alpha is not corrected.
    298    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
    299    RGB Is?" Tech Memo 17 (December 14, 1998).
    300 	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
    301 */
    302 
    303 static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    304 {
    305 	const int r_out = output_format.r;
    306 	const int b_out = output_format.b;
    307 
    308 	unsigned int i;
    309 	for (i = 0; i < length; i++) {
    310 		float out_device_r, out_device_g, out_device_b;
    311 		unsigned char device = *src++;
    312 		unsigned char alpha = *src++;
    313 
    314 		float linear = transform->input_gamma_table_gray[device];
    315 
    316 		out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    317 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    318 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    319 
    320 		dest[r_out] = clamp_u8(out_device_r*255);
    321 		dest[1]     = clamp_u8(out_device_g*255);
    322 		dest[b_out] = clamp_u8(out_device_b*255);
    323 		dest[3]     = alpha;
    324 		dest += 4;
    325 	}
    326 }
    327 
    328 
    329 static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    330 {
    331 	const int r_out = output_format.r;
    332 	const int b_out = output_format.b;
    333 
    334 	unsigned int i;
    335 	for (i = 0; i < length; i++) {
    336 		unsigned char device = *src++;
    337 		uint16_t gray;
    338 
    339 		float linear = transform->input_gamma_table_gray[device];
    340 
    341 		/* we could round here... */
    342 		gray = linear * PRECACHE_OUTPUT_MAX;
    343 
    344 		dest[r_out] = transform->output_table_r->data[gray];
    345 		dest[1]     = transform->output_table_g->data[gray];
    346 		dest[b_out] = transform->output_table_b->data[gray];
    347 		dest += 3;
    348 	}
    349 }
    350 
    351 
    352 static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    353 {
    354 	const int r_out = output_format.r;
    355 	const int b_out = output_format.b;
    356 
    357 	unsigned int i;
    358 	for (i = 0; i < length; i++) {
    359 		unsigned char device = *src++;
    360 		unsigned char alpha = *src++;
    361 		uint16_t gray;
    362 
    363 		float linear = transform->input_gamma_table_gray[device];
    364 
    365 		/* we could round here... */
    366 		gray = linear * PRECACHE_OUTPUT_MAX;
    367 
    368 		dest[r_out] = transform->output_table_r->data[gray];
    369 		dest[1]     = transform->output_table_g->data[gray];
    370 		dest[b_out] = transform->output_table_b->data[gray];
    371 		dest[3]     = alpha;
    372 		dest += 4;
    373 	}
    374 }
    375 
    376 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    377 {
    378 	const int r_out = output_format.r;
    379 	const int b_out = output_format.b;
    380 
    381 	unsigned int i;
    382 	float (*mat)[4] = transform->matrix;
    383 	for (i = 0; i < length; i++) {
    384 		unsigned char device_r = *src++;
    385 		unsigned char device_g = *src++;
    386 		unsigned char device_b = *src++;
    387 		uint16_t r, g, b;
    388 
    389 		float linear_r = transform->input_gamma_table_r[device_r];
    390 		float linear_g = transform->input_gamma_table_g[device_g];
    391 		float linear_b = transform->input_gamma_table_b[device_b];
    392 
    393 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    394 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    395 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    396 
    397 		out_linear_r = clamp_float(out_linear_r);
    398 		out_linear_g = clamp_float(out_linear_g);
    399 		out_linear_b = clamp_float(out_linear_b);
    400 
    401 		/* we could round here... */
    402 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
    403 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
    404 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
    405 
    406 		dest[r_out] = transform->output_table_r->data[r];
    407 		dest[1]     = transform->output_table_g->data[g];
    408 		dest[b_out] = transform->output_table_b->data[b];
    409 		dest += 3;
    410 	}
    411 }
    412 
    413 static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    414 {
    415 	const int r_out = output_format.r;
    416 	const int b_out = output_format.b;
    417 
    418 	unsigned int i;
    419 	float (*mat)[4] = transform->matrix;
    420 	for (i = 0; i < length; i++) {
    421 		unsigned char device_r = *src++;
    422 		unsigned char device_g = *src++;
    423 		unsigned char device_b = *src++;
    424 		unsigned char alpha = *src++;
    425 		uint16_t r, g, b;
    426 
    427 		float linear_r = transform->input_gamma_table_r[device_r];
    428 		float linear_g = transform->input_gamma_table_g[device_g];
    429 		float linear_b = transform->input_gamma_table_b[device_b];
    430 
    431 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    432 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    433 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    434 
    435 		out_linear_r = clamp_float(out_linear_r);
    436 		out_linear_g = clamp_float(out_linear_g);
    437 		out_linear_b = clamp_float(out_linear_b);
    438 
    439 		/* we could round here... */
    440 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
    441 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
    442 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
    443 
    444 		dest[r_out] = transform->output_table_r->data[r];
    445 		dest[1]     = transform->output_table_g->data[g];
    446 		dest[b_out] = transform->output_table_b->data[b];
    447 		dest[3]     = alpha;
    448 		dest += 4;
    449 	}
    450 }
    451 
    452 // Not used
    453 /*
    454 static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    455 {
    456 	const int r_out = output_format.r;
    457 	const int b_out = output_format.b;
    458 
    459 	unsigned int i;
    460 	int xy_len = 1;
    461 	int x_len = transform->grid_size;
    462 	int len = x_len * x_len;
    463 	float* r_table = transform->r_clut;
    464 	float* g_table = transform->g_clut;
    465 	float* b_table = transform->b_clut;
    466 
    467 	for (i = 0; i < length; i++) {
    468 		unsigned char in_r = *src++;
    469 		unsigned char in_g = *src++;
    470 		unsigned char in_b = *src++;
    471 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    472 
    473 		int x = floor(linear_r * (transform->grid_size-1));
    474 		int y = floor(linear_g * (transform->grid_size-1));
    475 		int z = floor(linear_b * (transform->grid_size-1));
    476 		int x_n = ceil(linear_r * (transform->grid_size-1));
    477 		int y_n = ceil(linear_g * (transform->grid_size-1));
    478 		int z_n = ceil(linear_b * (transform->grid_size-1));
    479 		float x_d = linear_r * (transform->grid_size-1) - x;
    480 		float y_d = linear_g * (transform->grid_size-1) - y;
    481 		float z_d = linear_b * (transform->grid_size-1) - z;
    482 
    483 		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
    484 		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
    485 		float r_y1 = lerp(r_x1, r_x2, y_d);
    486 		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
    487 		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
    488 		float r_y2 = lerp(r_x3, r_x4, y_d);
    489 		float clut_r = lerp(r_y1, r_y2, z_d);
    490 
    491 		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
    492 		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
    493 		float g_y1 = lerp(g_x1, g_x2, y_d);
    494 		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
    495 		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
    496 		float g_y2 = lerp(g_x3, g_x4, y_d);
    497 		float clut_g = lerp(g_y1, g_y2, z_d);
    498 
    499 		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
    500 		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
    501 		float b_y1 = lerp(b_x1, b_x2, y_d);
    502 		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
    503 		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
    504 		float b_y2 = lerp(b_x3, b_x4, y_d);
    505 		float clut_b = lerp(b_y1, b_y2, z_d);
    506 
    507 		dest[r_out] = clamp_u8(clut_r*255.0f);
    508 		dest[1]     = clamp_u8(clut_g*255.0f);
    509 		dest[b_out] = clamp_u8(clut_b*255.0f);
    510 		dest += 3;
    511 	}
    512 }
    513 */
    514 
    515 // Using lcms' tetra interpolation algorithm.
    516 static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    517 {
    518 	const int r_out = output_format.r;
    519 	const int b_out = output_format.b;
    520 
    521 	unsigned int i;
    522 	int xy_len = 1;
    523 	int x_len = transform->grid_size;
    524 	int len = x_len * x_len;
    525 	float* r_table = transform->r_clut;
    526 	float* g_table = transform->g_clut;
    527 	float* b_table = transform->b_clut;
    528 	float c0_r, c1_r, c2_r, c3_r;
    529 	float c0_g, c1_g, c2_g, c3_g;
    530 	float c0_b, c1_b, c2_b, c3_b;
    531 	float clut_r, clut_g, clut_b;
    532 	for (i = 0; i < length; i++) {
    533 		unsigned char in_r = *src++;
    534 		unsigned char in_g = *src++;
    535 		unsigned char in_b = *src++;
    536 		unsigned char in_a = *src++;
    537 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    538 
    539 		int x = floor(linear_r * (transform->grid_size-1));
    540 		int y = floor(linear_g * (transform->grid_size-1));
    541 		int z = floor(linear_b * (transform->grid_size-1));
    542 		int x_n = ceil(linear_r * (transform->grid_size-1));
    543 		int y_n = ceil(linear_g * (transform->grid_size-1));
    544 		int z_n = ceil(linear_b * (transform->grid_size-1));
    545 		float rx = linear_r * (transform->grid_size-1) - x;
    546 		float ry = linear_g * (transform->grid_size-1) - y;
    547 		float rz = linear_b * (transform->grid_size-1) - z;
    548 
    549 		c0_r = CLU(r_table, x, y, z);
    550 		c0_g = CLU(g_table, x, y, z);
    551 		c0_b = CLU(b_table, x, y, z);
    552 
    553 		if( rx >= ry ) {
    554 			if (ry >= rz) { //rx >= ry && ry >= rz
    555 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
    556 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
    557 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    558 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
    559 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
    560 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    561 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
    562 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
    563 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    564 			} else {
    565 				if (rx >= rz) { //rx >= rz && rz >= ry
    566 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
    567 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    568 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
    569 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
    570 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    571 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
    572 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
    573 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    574 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
    575 				} else { //rz > rx && rx >= ry
    576 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
    577 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    578 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    579 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
    580 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    581 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    582 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
    583 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    584 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    585 				}
    586 			}
    587 		} else {
    588 			if (rx >= rz) { //ry > rx && rx >= rz
    589 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
    590 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
    591 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    592 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
    593 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
    594 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    595 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
    596 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
    597 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    598 			} else {
    599 				if (ry >= rz) { //ry >= rz && rz > rx
    600 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    601 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
    602 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
    603 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    604 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
    605 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
    606 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    607 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
    608 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
    609 				} else { //rz > ry && ry > rx
    610 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    611 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
    612 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    613 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    614 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
    615 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    616 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    617 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
    618 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    619 				}
    620 			}
    621 		}
    622 
    623 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
    624 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
    625 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
    626 
    627 		dest[r_out] = clamp_u8(clut_r*255.0f);
    628 		dest[1]     = clamp_u8(clut_g*255.0f);
    629 		dest[b_out] = clamp_u8(clut_b*255.0f);
    630 		dest[3]     = in_a;
    631 		dest += 4;
    632 	}
    633 }
    634 
    635 // Using lcms' tetra interpolation code.
    636 static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    637 {
    638 	const int r_out = output_format.r;
    639 	const int b_out = output_format.b;
    640 
    641 	unsigned int i;
    642 	int xy_len = 1;
    643 	int x_len = transform->grid_size;
    644 	int len = x_len * x_len;
    645 	float* r_table = transform->r_clut;
    646 	float* g_table = transform->g_clut;
    647 	float* b_table = transform->b_clut;
    648 	float c0_r, c1_r, c2_r, c3_r;
    649 	float c0_g, c1_g, c2_g, c3_g;
    650 	float c0_b, c1_b, c2_b, c3_b;
    651 	float clut_r, clut_g, clut_b;
    652 	for (i = 0; i < length; i++) {
    653 		unsigned char in_r = *src++;
    654 		unsigned char in_g = *src++;
    655 		unsigned char in_b = *src++;
    656 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
    657 
    658 		int x = floor(linear_r * (transform->grid_size-1));
    659 		int y = floor(linear_g * (transform->grid_size-1));
    660 		int z = floor(linear_b * (transform->grid_size-1));
    661 		int x_n = ceil(linear_r * (transform->grid_size-1));
    662 		int y_n = ceil(linear_g * (transform->grid_size-1));
    663 		int z_n = ceil(linear_b * (transform->grid_size-1));
    664 		float rx = linear_r * (transform->grid_size-1) - x;
    665 		float ry = linear_g * (transform->grid_size-1) - y;
    666 		float rz = linear_b * (transform->grid_size-1) - z;
    667 
    668 		c0_r = CLU(r_table, x, y, z);
    669 		c0_g = CLU(g_table, x, y, z);
    670 		c0_b = CLU(b_table, x, y, z);
    671 
    672 		if( rx >= ry ) {
    673 			if (ry >= rz) { //rx >= ry && ry >= rz
    674 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
    675 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
    676 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    677 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
    678 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
    679 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    680 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
    681 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
    682 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    683 			} else {
    684 				if (rx >= rz) { //rx >= rz && rz >= ry
    685 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
    686 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    687 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
    688 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
    689 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    690 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
    691 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
    692 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    693 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
    694 				} else { //rz > rx && rx >= ry
    695 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
    696 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
    697 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    698 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
    699 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
    700 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    701 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
    702 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
    703 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    704 				}
    705 			}
    706 		} else {
    707 			if (rx >= rz) { //ry > rx && rx >= rz
    708 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
    709 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
    710 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
    711 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
    712 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
    713 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
    714 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
    715 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
    716 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
    717 			} else {
    718 				if (ry >= rz) { //ry >= rz && rz > rx
    719 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    720 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
    721 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
    722 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    723 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
    724 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
    725 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    726 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
    727 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
    728 				} else { //rz > ry && ry > rx
    729 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
    730 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
    731 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
    732 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
    733 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
    734 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
    735 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
    736 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
    737 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
    738 				}
    739 			}
    740 		}
    741 
    742 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
    743 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
    744 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
    745 
    746 		dest[r_out] = clamp_u8(clut_r*255.0f);
    747 		dest[1]     = clamp_u8(clut_g*255.0f);
    748 		dest[b_out] = clamp_u8(clut_b*255.0f);
    749 		dest += 3;
    750 	}
    751 }
    752 
    753 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    754 {
    755 	const int r_out = output_format.r;
    756 	const int b_out = output_format.b;
    757 
    758 	unsigned int i;
    759 	float (*mat)[4] = transform->matrix;
    760 	for (i = 0; i < length; i++) {
    761 		unsigned char device_r = *src++;
    762 		unsigned char device_g = *src++;
    763 		unsigned char device_b = *src++;
    764 		float out_device_r, out_device_g, out_device_b;
    765 
    766 		float linear_r = transform->input_gamma_table_r[device_r];
    767 		float linear_g = transform->input_gamma_table_g[device_g];
    768 		float linear_b = transform->input_gamma_table_b[device_b];
    769 
    770 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    771 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    772 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    773 
    774 		out_linear_r = clamp_float(out_linear_r);
    775 		out_linear_g = clamp_float(out_linear_g);
    776 		out_linear_b = clamp_float(out_linear_b);
    777 
    778 		out_device_r = lut_interp_linear(out_linear_r,
    779 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    780 		out_device_g = lut_interp_linear(out_linear_g,
    781 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    782 		out_device_b = lut_interp_linear(out_linear_b,
    783 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    784 
    785 		dest[r_out] = clamp_u8(out_device_r*255);
    786 		dest[1]     = clamp_u8(out_device_g*255);
    787 		dest[b_out] = clamp_u8(out_device_b*255);
    788 		dest += 3;
    789 	}
    790 }
    791 
    792 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    793 {
    794 	const int r_out = output_format.r;
    795 	const int b_out = output_format.b;
    796 
    797 	unsigned int i;
    798 	float (*mat)[4] = transform->matrix;
    799 	for (i = 0; i < length; i++) {
    800 		unsigned char device_r = *src++;
    801 		unsigned char device_g = *src++;
    802 		unsigned char device_b = *src++;
    803 		unsigned char alpha = *src++;
    804 		float out_device_r, out_device_g, out_device_b;
    805 
    806 		float linear_r = transform->input_gamma_table_r[device_r];
    807 		float linear_g = transform->input_gamma_table_g[device_g];
    808 		float linear_b = transform->input_gamma_table_b[device_b];
    809 
    810 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    811 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    812 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    813 
    814 		out_linear_r = clamp_float(out_linear_r);
    815 		out_linear_g = clamp_float(out_linear_g);
    816 		out_linear_b = clamp_float(out_linear_b);
    817 
    818 		out_device_r = lut_interp_linear(out_linear_r,
    819 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
    820 		out_device_g = lut_interp_linear(out_linear_g,
    821 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
    822 		out_device_b = lut_interp_linear(out_linear_b,
    823 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
    824 
    825 		dest[r_out] = clamp_u8(out_device_r*255);
    826 		dest[1]     = clamp_u8(out_device_g*255);
    827 		dest[b_out] = clamp_u8(out_device_b*255);
    828 		dest[3]     = alpha;
    829 		dest += 4;
    830 	}
    831 }
    832 
    833 #if 0
    834 static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length, qcms_format_type output_format)
    835 {
    836 	const int r_out = output_format.r;
    837 	const int b_out = output_format.b;
    838 
    839 	int i;
    840 	float (*mat)[4] = transform->matrix;
    841 	for (i = 0; i < length; i++) {
    842 		unsigned char device_r = *src++;
    843 		unsigned char device_g = *src++;
    844 		unsigned char device_b = *src++;
    845 
    846 		float linear_r = transform->input_gamma_table_r[device_r];
    847 		float linear_g = transform->input_gamma_table_g[device_g];
    848 		float linear_b = transform->input_gamma_table_b[device_b];
    849 
    850 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
    851 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
    852 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
    853 
    854 		dest[r_out] = clamp_u8(out_linear_r*255);
    855 		dest[1]     = clamp_u8(out_linear_g*255);
    856 		dest[b_out] = clamp_u8(out_linear_b*255);
    857 		dest += 3;
    858 	}
    859 }
    860 #endif
    861 
    862 static struct precache_output *precache_reference(struct precache_output *p)
    863 {
    864 	p->ref_count++;
    865 	return p;
    866 }
    867 
    868 static struct precache_output *precache_create()
    869 {
    870 	struct precache_output *p = malloc(sizeof(struct precache_output));
    871 	if (p)
    872 		p->ref_count = 1;
    873 	return p;
    874 }
    875 
    876 void precache_release(struct precache_output *p)
    877 {
    878 	if (--p->ref_count == 0) {
    879 		free(p);
    880 	}
    881 }
    882 
    883 #ifdef HAVE_POSIX_MEMALIGN
    884 static qcms_transform *transform_alloc(void)
    885 {
    886 	qcms_transform *t;
    887 	if (!posix_memalign(&t, 16, sizeof(*t))) {
    888 		return t;
    889 	} else {
    890 		return NULL;
    891 	}
    892 }
    893 static void transform_free(qcms_transform *t)
    894 {
    895 	free(t);
    896 }
    897 #else
    898 static qcms_transform *transform_alloc(void)
    899 {
    900 	/* transform needs to be aligned on a 16byte boundrary */
    901 	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
    902 	/* make room for a pointer to the block returned by calloc */
    903 	void *transform_start = original_block + sizeof(void*);
    904 	/* align transform_start */
    905 	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
    906 
    907 	/* store a pointer to the block returned by calloc so that we can free it later */
    908 	void **(original_block_ptr) = (void**)transform_aligned;
    909 	if (!original_block)
    910 		return NULL;
    911 	original_block_ptr--;
    912 	*original_block_ptr = original_block;
    913 
    914 	return transform_aligned;
    915 }
    916 static void transform_free(qcms_transform *t)
    917 {
    918 	/* get at the pointer to the unaligned block returned by calloc */
    919 	void **p = (void**)t;
    920 	p--;
    921 	free(*p);
    922 }
    923 #endif
    924 
    925 void qcms_transform_release(qcms_transform *t)
    926 {
    927 	/* ensure we only free the gamma tables once even if there are
    928 	 * multiple references to the same data */
    929 
    930 	if (t->output_table_r)
    931 		precache_release(t->output_table_r);
    932 	if (t->output_table_g)
    933 		precache_release(t->output_table_g);
    934 	if (t->output_table_b)
    935 		precache_release(t->output_table_b);
    936 
    937 	free(t->input_gamma_table_r);
    938 	if (t->input_gamma_table_g != t->input_gamma_table_r)
    939 		free(t->input_gamma_table_g);
    940 	if (t->input_gamma_table_g != t->input_gamma_table_r &&
    941 	    t->input_gamma_table_g != t->input_gamma_table_b)
    942 		free(t->input_gamma_table_b);
    943 
    944 	free(t->input_gamma_table_gray);
    945 
    946 	free(t->output_gamma_lut_r);
    947 	free(t->output_gamma_lut_g);
    948 	free(t->output_gamma_lut_b);
    949 
    950 	transform_free(t);
    951 }
    952 
    953 #ifdef X86
    954 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
    955 // mozilla/jpeg)
    956  // -------------------------------------------------------------------------
    957 #if defined(_M_IX86) && defined(_MSC_VER)
    958 #define HAS_CPUID
    959 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
    960    register - I'm not sure if that ever happens on windows, but cpuid isn't
    961    on the critical path so we just preserve the register to be safe and to be
    962    consistent with the non-windows version. */
    963 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    964        uint32_t a_, b_, c_, d_;
    965        __asm {
    966               xchg   ebx, esi
    967               mov    eax, fxn
    968               cpuid
    969               mov    a_, eax
    970               mov    b_, ebx
    971               mov    c_, ecx
    972               mov    d_, edx
    973               xchg   ebx, esi
    974        }
    975        *a = a_;
    976        *b = b_;
    977        *c = c_;
    978        *d = d_;
    979 }
    980 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
    981 #define HAS_CPUID
    982 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
    983    some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
    984 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    985 
    986 	uint32_t a_, b_, c_, d_;
    987        __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;"
    988                              : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
    989 	   *a = a_;
    990 	   *b = b_;
    991 	   *c = c_;
    992 	   *d = d_;
    993 }
    994 #endif
    995 
    996 // -------------------------Runtime SSEx Detection-----------------------------
    997 
    998 /* MMX is always supported per
    999  *  Gecko v1.9.1 minimum CPU requirements */
   1000 #define SSE1_EDX_MASK (1UL << 25)
   1001 #define SSE2_EDX_MASK (1UL << 26)
   1002 #define SSE3_ECX_MASK (1UL <<  0)
   1003 
   1004 static int sse_version_available(void)
   1005 {
   1006 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
   1007 	/* we know at build time that 64-bit CPUs always have SSE2
   1008 	 * this tells the compiler that non-SSE2 branches will never be
   1009 	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
   1010 	return 2;
   1011 #elif defined(HAS_CPUID)
   1012 	static int sse_version = -1;
   1013 	uint32_t a, b, c, d;
   1014 	uint32_t function = 0x00000001;
   1015 
   1016 	if (sse_version == -1) {
   1017 		sse_version = 0;
   1018 		cpuid(function, &a, &b, &c, &d);
   1019 		if (c & SSE3_ECX_MASK)
   1020 			sse_version = 3;
   1021 		else if (d & SSE2_EDX_MASK)
   1022 			sse_version = 2;
   1023 		else if (d & SSE1_EDX_MASK)
   1024 			sse_version = 1;
   1025 	}
   1026 
   1027 	return sse_version;
   1028 #else
   1029 	return 0;
   1030 #endif
   1031 }
   1032 #endif
   1033 
   1034 static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
   1035 						{-0.7502f, 1.7135f, 0.0367f},
   1036 						{ 0.0389f,-0.0685f, 1.0296f}},
   1037 						false};
   1038 
   1039 static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
   1040 						    { 0.4323053f, 0.5183603f, 0.0492912f},
   1041 						    {-0.0085287f, 0.0400428f, 0.9684867f}},
   1042 						    false};
   1043 
   1044 // See ICCv4 E.3
   1045 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
   1046 	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
   1047 		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
   1048 	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
   1049 		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
   1050 	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
   1051 		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
   1052 	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
   1053 	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
   1054 }
   1055 
   1056 void qcms_profile_precache_output_transform(qcms_profile *profile)
   1057 {
   1058 	/* we only support precaching on rgb profiles */
   1059 	if (profile->color_space != RGB_SIGNATURE)
   1060 		return;
   1061 
   1062 	if (qcms_supports_iccv4) {
   1063 		/* don't precache since we will use the B2A LUT */
   1064 		if (profile->B2A0)
   1065 			return;
   1066 
   1067 		/* don't precache since we will use the mBA LUT */
   1068 		if (profile->mBA)
   1069 			return;
   1070 	}
   1071 
   1072 	/* don't precache if we do not have the TRC curves */
   1073 	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
   1074 		return;
   1075 
   1076 	if (!profile->output_table_r) {
   1077 		profile->output_table_r = precache_create();
   1078 		if (profile->output_table_r &&
   1079 				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
   1080 			precache_release(profile->output_table_r);
   1081 			profile->output_table_r = NULL;
   1082 		}
   1083 	}
   1084 	if (!profile->output_table_g) {
   1085 		profile->output_table_g = precache_create();
   1086 		if (profile->output_table_g &&
   1087 				!compute_precache(profile->greenTRC, profile->output_table_g->data)) {
   1088 			precache_release(profile->output_table_g);
   1089 			profile->output_table_g = NULL;
   1090 		}
   1091 	}
   1092 	if (!profile->output_table_b) {
   1093 		profile->output_table_b = precache_create();
   1094 		if (profile->output_table_b &&
   1095 				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
   1096 			precache_release(profile->output_table_b);
   1097 			profile->output_table_b = NULL;
   1098 		}
   1099 	}
   1100 }
   1101 
   1102 /* Replace the current transformation with a LUT transformation using a given number of sample points */
   1103 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out,
   1104                                                  int samples, qcms_data_type in_type)
   1105 {
   1106 	/* The range between which 2 consecutive sample points can be used to interpolate */
   1107 	uint16_t x,y,z;
   1108 	uint32_t l;
   1109 	uint32_t lutSize = 3 * samples * samples * samples;
   1110 	float* src = NULL;
   1111 	float* dest = NULL;
   1112 	float* lut = NULL;
   1113 
   1114 	src = malloc(lutSize*sizeof(float));
   1115 	dest = malloc(lutSize*sizeof(float));
   1116 
   1117 	if (src && dest) {
   1118 		/* Prepare a list of points we want to sample */
   1119 		l = 0;
   1120 		for (x = 0; x < samples; x++) {
   1121 			for (y = 0; y < samples; y++) {
   1122 				for (z = 0; z < samples; z++) {
   1123 					src[l++] = x / (float)(samples-1);
   1124 					src[l++] = y / (float)(samples-1);
   1125 					src[l++] = z / (float)(samples-1);
   1126 				}
   1127 			}
   1128 		}
   1129 
   1130 		lut = qcms_chain_transform(in, out, src, dest, lutSize);
   1131 		if (lut) {
   1132 			transform->r_clut = &lut[0];
   1133 			transform->g_clut = &lut[1];
   1134 			transform->b_clut = &lut[2];
   1135 			transform->grid_size = samples;
   1136 			if (in_type == QCMS_DATA_RGBA_8) {
   1137 				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
   1138 			} else {
   1139 				transform->transform_fn = qcms_transform_data_tetra_clut;
   1140 			}
   1141 		}
   1142 	}
   1143 
   1144 
   1145 	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
   1146 	if (src && lut != src) {
   1147 		free(src);
   1148 	} else if (dest && lut != src) {
   1149 		free(dest);
   1150 	}
   1151 
   1152 	if (lut == NULL) {
   1153 		return NULL;
   1154 	}
   1155 	return transform;
   1156 }
   1157 
   1158 #define NO_MEM_TRANSFORM NULL
   1159 
   1160 qcms_transform* qcms_transform_create(
   1161 		qcms_profile *in, qcms_data_type in_type,
   1162 		qcms_profile *out, qcms_data_type out_type,
   1163 		qcms_intent intent)
   1164 {
   1165 	bool precache = false;
   1166 
   1167         qcms_transform *transform = transform_alloc();
   1168         if (!transform) {
   1169 		return NULL;
   1170 	}
   1171 	if (out_type != QCMS_DATA_RGB_8 &&
   1172                 out_type != QCMS_DATA_RGBA_8) {
   1173             assert(0 && "output type");
   1174 	    transform_free(transform);
   1175             return NULL;
   1176         }
   1177 
   1178 	if (out->output_table_r &&
   1179 			out->output_table_g &&
   1180 			out->output_table_b) {
   1181 		precache = true;
   1182 	}
   1183 
   1184 	if (qcms_supports_iccv4 && (in->A2B0 || out->B2A0 || in->mAB || out->mAB)) {
   1185 		// Precache the transformation to a CLUT 33x33x33 in size.
   1186 		// 33 is used by many profiles and works well in pratice.
   1187 		// This evenly divides 256 into blocks of 8x8x8.
   1188 		// TODO For transforming small data sets of about 200x200 or less
   1189 		// precaching should be avoided.
   1190 		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
   1191 		if (!result) {
   1192             		assert(0 && "precacheLUT failed");
   1193 			transform_free(transform);
   1194 			return NULL;
   1195 		}
   1196 		return result;
   1197 	}
   1198 
   1199 	if (precache) {
   1200 		transform->output_table_r = precache_reference(out->output_table_r);
   1201 		transform->output_table_g = precache_reference(out->output_table_g);
   1202 		transform->output_table_b = precache_reference(out->output_table_b);
   1203 	} else {
   1204 		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
   1205 			qcms_transform_release(transform);
   1206 			return NO_MEM_TRANSFORM;
   1207 		}
   1208 		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
   1209 		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
   1210 		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
   1211 		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
   1212 			qcms_transform_release(transform);
   1213 			return NO_MEM_TRANSFORM;
   1214 		}
   1215 	}
   1216 
   1217         if (in->color_space == RGB_SIGNATURE) {
   1218 		struct matrix in_matrix, out_matrix, result;
   1219 
   1220 		if (in_type != QCMS_DATA_RGB_8 &&
   1221                     in_type != QCMS_DATA_RGBA_8){
   1222                 	assert(0 && "input type");
   1223 			transform_free(transform);
   1224                 	return NULL;
   1225             	}
   1226 		if (precache) {
   1227 #if defined(SSE2_ENABLE) && defined(X86)
   1228 		    if (sse_version_available() >= 2) {
   1229 			    if (in_type == QCMS_DATA_RGB_8)
   1230 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
   1231 			    else
   1232 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
   1233 
   1234 #if defined(SSE2_ENABLE) && !(defined(_MSC_VER) && defined(_M_AMD64))
   1235                     /* Microsoft Compiler for x64 doesn't support MMX.
   1236                      * SSE code uses MMX so that we disable on x64 */
   1237 		    } else
   1238 		    if (sse_version_available() >= 1) {
   1239 			    if (in_type == QCMS_DATA_RGB_8)
   1240 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
   1241 			    else
   1242 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
   1243 #endif
   1244 		    } else
   1245 #endif
   1246 			{
   1247 				if (in_type == QCMS_DATA_RGB_8)
   1248 					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
   1249 				else
   1250 					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
   1251 			}
   1252 		} else {
   1253 			if (in_type == QCMS_DATA_RGB_8)
   1254 				transform->transform_fn = qcms_transform_data_rgb_out_lut;
   1255 			else
   1256 				transform->transform_fn = qcms_transform_data_rgba_out_lut;
   1257 		}
   1258 
   1259 		//XXX: avoid duplicating tables if we can
   1260 		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
   1261 		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
   1262 		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
   1263 		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
   1264 			qcms_transform_release(transform);
   1265 			return NO_MEM_TRANSFORM;
   1266 		}
   1267 
   1268 
   1269 		/* build combined colorant matrix */
   1270 		in_matrix = build_colorant_matrix(in);
   1271 		out_matrix = build_colorant_matrix(out);
   1272 		out_matrix = matrix_invert(out_matrix);
   1273 		if (out_matrix.invalid) {
   1274 			qcms_transform_release(transform);
   1275 			return NULL;
   1276 		}
   1277 		result = matrix_multiply(out_matrix, in_matrix);
   1278 
   1279 		/* store the results in column major mode
   1280 		 * this makes doing the multiplication with sse easier */
   1281 		transform->matrix[0][0] = result.m[0][0];
   1282 		transform->matrix[1][0] = result.m[0][1];
   1283 		transform->matrix[2][0] = result.m[0][2];
   1284 		transform->matrix[0][1] = result.m[1][0];
   1285 		transform->matrix[1][1] = result.m[1][1];
   1286 		transform->matrix[2][1] = result.m[1][2];
   1287 		transform->matrix[0][2] = result.m[2][0];
   1288 		transform->matrix[1][2] = result.m[2][1];
   1289 		transform->matrix[2][2] = result.m[2][2];
   1290 
   1291 	} else if (in->color_space == GRAY_SIGNATURE) {
   1292 		if (in_type != QCMS_DATA_GRAY_8 &&
   1293 				in_type != QCMS_DATA_GRAYA_8){
   1294 			assert(0 && "input type");
   1295 			transform_free(transform);
   1296 			return NULL;
   1297 		}
   1298 
   1299 		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
   1300 		if (!transform->input_gamma_table_gray) {
   1301 			qcms_transform_release(transform);
   1302 			return NO_MEM_TRANSFORM;
   1303 		}
   1304 
   1305 		if (precache) {
   1306 			if (in_type == QCMS_DATA_GRAY_8) {
   1307 				transform->transform_fn = qcms_transform_data_gray_out_precache;
   1308 			} else {
   1309 				transform->transform_fn = qcms_transform_data_graya_out_precache;
   1310 			}
   1311 		} else {
   1312 			if (in_type == QCMS_DATA_GRAY_8) {
   1313 				transform->transform_fn = qcms_transform_data_gray_out_lut;
   1314 			} else {
   1315 				transform->transform_fn = qcms_transform_data_graya_out_lut;
   1316 			}
   1317 		}
   1318 	} else {
   1319 		assert(0 && "unexpected colorspace");
   1320 		transform_free(transform);
   1321 		return NULL;
   1322 	}
   1323 	return transform;
   1324 }
   1325 
   1326 /* __force_align_arg_pointer__ is an x86-only attribute, and gcc/clang warns on unused
   1327  * attributes. Don't use this on ARM or AMD64. __has_attribute can detect the presence
   1328  * of the attribute but is currently only supported by clang */
   1329 #if defined(__has_attribute)
   1330 #define HAS_FORCE_ALIGN_ARG_POINTER __has_attribute(__force_align_arg_pointer__)
   1331 #elif defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) && !defined(__arm__) && !defined(__mips__)
   1332 #define HAS_FORCE_ALIGN_ARG_POINTER 1
   1333 #else
   1334 #define HAS_FORCE_ALIGN_ARG_POINTER 0
   1335 #endif
   1336 
   1337 #if HAS_FORCE_ALIGN_ARG_POINTER
   1338 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
   1339 __attribute__((__force_align_arg_pointer__))
   1340 #endif
   1341 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
   1342 {
   1343 	static const struct _qcms_format_type output_rgbx = { 0, 2 };
   1344 
   1345 	transform->transform_fn(transform, src, dest, length, output_rgbx);
   1346 }
   1347 
   1348 void qcms_transform_data_type(qcms_transform *transform, void *src, void *dest, size_t length, qcms_output_type type)
   1349 {
   1350 	static const struct _qcms_format_type output_rgbx = { 0, 2 };
   1351 	static const struct _qcms_format_type output_bgrx = { 2, 0 };
   1352 
   1353 	transform->transform_fn(transform, src, dest, length, type == QCMS_OUTPUT_BGRX ? output_bgrx : output_rgbx);
   1354 }
   1355 
   1356 qcms_bool qcms_supports_iccv4;
   1357 void qcms_enable_iccv4()
   1358 {
   1359 	qcms_supports_iccv4 = true;
   1360 }
   1361