1 /************************************************************************** 2 * 3 * Copyright 2013 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29 /** 30 * @file 31 * Format conversion code for srgb formats. 32 * 33 * Functions for converting from srgb to linear and vice versa. 34 * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt: 35 * 36 * srgb->linear: 37 * cl = cs / 12.92, cs <= 0.04045 38 * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045 39 * 40 * linear->srgb: 41 * if (isnan(cl)) { 42 * Map IEEE-754 Not-a-number to zero. 43 * cs = 0.0; 44 * } else if (cl > 1.0) { 45 * cs = 1.0; 46 * } else if (cl < 0.0) { 47 * cs = 0.0; 48 * } else if (cl < 0.0031308) { 49 * cs = 12.92 * cl; 50 * } else { 51 * cs = 1.055 * pow(cl, 0.41666) - 0.055; 52 * } 53 * 54 * This does not need to be accurate, however at least for d3d10 55 * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx): 56 * 1) For srgb->linear, it is required that the error on the srgb side is 57 * not larger than 0.5f, which I interpret that if you map the value back 58 * to srgb from linear using the ideal conversion, it would not be off by 59 * more than 0.5f (that is, it would map to the same 8-bit integer value 60 * as it was before conversion to linear). 61 * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large 62 * error is allowed. 63 * 3) Additionally, all srgb values converted to linear and back must result 64 * in the same value as they were originally. 65 * 66 * @author Roland Scheidegger <sroland (at) vmware.com> 67 */ 68 69 70 #include "util/u_debug.h" 71 72 #include "lp_bld_type.h" 73 #include "lp_bld_const.h" 74 #include "lp_bld_arit.h" 75 #include "lp_bld_bitarit.h" 76 #include "lp_bld_logic.h" 77 #include "lp_bld_format.h" 78 79 80 81 /** 82 * Convert srgb int values to linear float values. 83 * Several possibilities how to do this, e.g. 84 * - table 85 * - doing the pow() with int-to-float and float-to-int tricks 86 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) 87 * - just using standard polynomial approximation 88 * (3rd order polynomial is required for crappy but just sufficient accuracy) 89 * 90 * @param src integer (vector) value(s) to convert 91 * (chan_bits bit values unpacked to 32 bit already). 92 */ 93 LLVMValueRef 94 lp_build_srgb_to_linear(struct gallivm_state *gallivm, 95 struct lp_type src_type, 96 unsigned chan_bits, 97 LLVMValueRef src) 98 { 99 struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32); 100 struct lp_build_context f32_bld; 101 LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh; 102 double coeffs[4] = {0.0023f, 103 0.0030f / 255.0f, 104 0.6935f / (255.0f * 255.0f), 105 0.3012f / (255.0f * 255.0f * 255.0f) 106 }; 107 108 assert(src_type.width == 32); 109 /* Technically this would work with more bits too but would be inaccurate. */ 110 assert(chan_bits <= 8); 111 112 lp_build_context_init(&f32_bld, gallivm, f32_type); 113 114 /* 115 * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) 116 * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023) 117 * (found with octave polyfit and some magic as I couldn't get the error 118 * function right). Using the above mentioned error function, the values stay 119 * within +-0.35, except for the lowest values - hence tweaking linear segment 120 * to cover the first 16 instead of the first 11 values (the error stays 121 * just about acceptable there too). 122 * Hence: lin = src > 15 ? poly : src / 12.6 123 * This function really only makes sense for vectors, should use LUT otherwise. 124 * All in all (including float conversion) 11 instructions (with sse4.1), 125 * 6 constants (polynomial could be done with 1 instruction less at the cost 126 * of slightly worse dependency chain, fma should also help). 127 */ 128 /* doing the 1/255 mul as part of the approximation */ 129 srcf = lp_build_int_to_float(&f32_bld, src); 130 if (chan_bits != 8) { 131 /* could adjust all the constants instead */ 132 LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type, 133 255.0f / ((1 << chan_bits) - 1)); 134 srcf = lp_build_mul(&f32_bld, srcf, rescale_const); 135 } 136 lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f)); 137 part_lin = lp_build_mul(&f32_bld, srcf, lin_const); 138 139 part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4); 140 141 lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f); 142 is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh); 143 return lp_build_select(&f32_bld, is_linear, part_lin, part_pow); 144 } 145 146 147 /** 148 * Convert linear float values to srgb int values. 149 * Several possibilities how to do this, e.g. 150 * - use table (based on exponent/highest order mantissa bits) and do 151 * linear interpolation (https://gist.github.com/rygorous/2203834) 152 * - Chebyshev polynomial 153 * - Approximation using reciprocals 154 * - using int-to-float and float-to-int tricks for pow() 155 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) 156 * 157 * @param src float (vector) value(s) to convert. 158 */ 159 static LLVMValueRef 160 lp_build_linear_to_srgb(struct gallivm_state *gallivm, 161 struct lp_type src_type, 162 unsigned chan_bits, 163 LLVMValueRef src) 164 { 165 LLVMBuilderRef builder = gallivm->builder; 166 struct lp_build_context f32_bld; 167 LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final; 168 169 lp_build_context_init(&f32_bld, gallivm, src_type); 170 171 src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one); 172 173 if (0) { 174 /* 175 * using int-to-float and float-to-int trick for pow(). 176 * This is much more accurate than necessary thanks to the correction, 177 * but it most certainly makes no sense without rsqrt available. 178 * Bonus points if you understand how this works... 179 * All in all (including min/max clamp, conversion) 19 instructions. 180 */ 181 182 float exp_f = 2.0f / 3.0f; 183 /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */ 184 float exp2f_c = 1.30438178253e+19f; 185 float coeff_f = 0.62996f; 186 LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2; 187 struct lp_type int_type = lp_int_type(src_type); 188 189 /* 190 * First calculate approx x^8/12 191 */ 192 exponent = lp_build_const_vec(gallivm, src_type, exp_f); 193 coeff = lp_build_const_vec(gallivm, src_type, 194 exp2f_c * powf(coeff_f, 1.0f / exp_f)); 195 196 /* premultiply src */ 197 tmp = lp_build_mul(&f32_bld, coeff, src); 198 /* "log2" */ 199 tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), ""); 200 tmp = lp_build_int_to_float(&f32_bld, tmp); 201 /* multiply for pow */ 202 tmp = lp_build_mul(&f32_bld, tmp, exponent); 203 /* "exp2" */ 204 pow_approx = lp_build_itrunc(&f32_bld, tmp); 205 pow_approx = LLVMBuildBitCast(builder, pow_approx, 206 lp_build_vec_type(gallivm, src_type), ""); 207 208 /* 209 * Since that pow was inaccurate (like 3 bits, though each sqrt step would 210 * give another bit), compensate the error (which is why we chose another 211 * exponent in the first place). 212 */ 213 /* x * x^(8/12) = x^(20/12) */ 214 pow_1 = lp_build_mul(&f32_bld, pow_approx, src); 215 216 /* x * x * x^(-4/12) = x^(20/12) */ 217 /* Should avoid using rsqrt if it's not available, but 218 * using x * x^(4/12) * x^(4/12) instead will change error weight */ 219 tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx); 220 x2 = lp_build_mul(&f32_bld, src, src); 221 pow_2 = lp_build_mul(&f32_bld, x2, tmp); 222 223 /* average the values so the errors cancel out, compensate bias, 224 * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul 225 * for conversion to int in here */ 226 tmp = lp_build_add(&f32_bld, pow_1, pow_2); 227 coeff = lp_build_const_vec(gallivm, src_type, 228 1.0f / (3.0f * coeff_f) * 0.999852f * 229 powf(1.055f * 255.0f, 4.0f)); 230 pow_final = lp_build_mul(&f32_bld, tmp, coeff); 231 232 /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */ 233 if (lp_build_fast_rsqrt_available(src_type)) { 234 pow_final = lp_build_fast_rsqrt(&f32_bld, 235 lp_build_fast_rsqrt(&f32_bld, pow_final)); 236 } 237 else { 238 pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final)); 239 } 240 pow_final = lp_build_add(&f32_bld, pow_final, 241 lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f)); 242 } 243 244 else { 245 /* 246 * using "rational polynomial" approximation here. 247 * Essentially y = a*x^0.375 + b*x^0.5 + c, with also 248 * factoring in the 255.0 mul and the scaling mul. 249 * (a is closer to actual value so has higher weight than b.) 250 * Note: the constants are magic values. They were found empirically, 251 * possibly could be improved but good enough (be VERY careful with 252 * error metric if you'd want to tweak them, they also MUST fit with 253 * the crappy polynomial above for srgb->linear since it is required 254 * that each srgb value maps back to the same value). 255 * This function has an error of max +-0.17. Not sure this is actually 256 * enough, we require +-0.6 but that may include the +-0.5 from integer 257 * conversion. Seems to pass all relevant tests though... 258 * For the approximated srgb->linear values the error is naturally larger 259 * (+-0.42) but still accurate enough (required +-0.5 essentially). 260 * All in all (including min/max clamp, conversion) 15 instructions. 261 * FMA would help (minus 2 instructions). 262 */ 263 264 LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2; 265 266 if (lp_build_fast_rsqrt_available(src_type)) { 267 tmp = lp_build_fast_rsqrt(&f32_bld, src); 268 x05 = lp_build_mul(&f32_bld, src, tmp); 269 } 270 else { 271 /* 272 * I don't really expect this to be practical without rsqrt 273 * but there's no reason for triple punishment so at least 274 * save the otherwise resulting division and unnecessary mul... 275 */ 276 x05 = lp_build_sqrt(&f32_bld, src); 277 } 278 279 tmp = lp_build_mul(&f32_bld, x05, src); 280 if (lp_build_fast_rsqrt_available(src_type)) { 281 x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp)); 282 } 283 else { 284 x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp)); 285 } 286 287 a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f); 288 b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f); 289 c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f); 290 291 tmp = lp_build_mul(&f32_bld, a_const, x0375); 292 tmp2 = lp_build_mad(&f32_bld, b_const, x05, c_const); 293 pow_final = lp_build_add(&f32_bld, tmp, tmp2); 294 } 295 296 /* linear part is easy */ 297 lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); 298 lin = lp_build_mul(&f32_bld, src, lin_const); 299 300 lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); 301 is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh); 302 tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); 303 304 if (chan_bits != 8) { 305 /* could adjust all the constants instead */ 306 LLVMValueRef rescale_const = lp_build_const_vec(gallivm, src_type, 307 ((1 << chan_bits) - 1) / 255.0f); 308 tmp = lp_build_mul(&f32_bld, tmp, rescale_const); 309 } 310 311 f32_bld.type.sign = 0; 312 return lp_build_iround(&f32_bld, tmp); 313 } 314 315 316 /** 317 * Convert linear float soa values to packed srgb AoS values. 318 * This only handles packed formats which are 4x8bit in size 319 * (rgba and rgbx plus swizzles), and 16bit 565-style formats 320 * with no alpha. (In the latter case the return values won't be 321 * fully packed, it will look like r5g6b5x16r5g6b5x16...) 322 * 323 * @param src float SoA (vector) values to convert. 324 */ 325 LLVMValueRef 326 lp_build_float_to_srgb_packed(struct gallivm_state *gallivm, 327 const struct util_format_description *dst_fmt, 328 struct lp_type src_type, 329 LLVMValueRef *src) 330 { 331 LLVMBuilderRef builder = gallivm->builder; 332 unsigned chan; 333 struct lp_build_context f32_bld; 334 struct lp_type int32_type = lp_int_type(src_type); 335 LLVMValueRef tmpsrgb[4], alpha, dst; 336 337 lp_build_context_init(&f32_bld, gallivm, src_type); 338 339 /* rgb is subject to linear->srgb conversion, alpha is not */ 340 for (chan = 0; chan < 3; chan++) { 341 unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size; 342 tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, chan_bits, src[chan]); 343 } 344 /* 345 * can't use lp_build_conv since we want to keep values as 32bit 346 * here so we can interleave with rgb to go from SoA->AoS. 347 */ 348 alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]); 349 alpha = lp_build_mul(&f32_bld, alpha, 350 lp_build_const_vec(gallivm, src_type, 255.0f)); 351 tmpsrgb[3] = lp_build_iround(&f32_bld, alpha); 352 353 dst = lp_build_zero(gallivm, int32_type); 354 for (chan = 0; chan < dst_fmt->nr_channels; chan++) { 355 if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) { 356 unsigned ls; 357 LLVMValueRef shifted, shift_val; 358 ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift; 359 shift_val = lp_build_const_int_vec(gallivm, int32_type, ls); 360 shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, ""); 361 dst = LLVMBuildOr(builder, dst, shifted, ""); 362 } 363 } 364 return dst; 365 } 366