Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2013 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 
     29 /**
     30  * @file
     31  * Format conversion code for srgb formats.
     32  *
     33  * Functions for converting from srgb to linear and vice versa.
     34  * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
     35  *
     36  * srgb->linear:
     37  * cl = cs / 12.92,                 cs <= 0.04045
     38  * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
     39  *
     40  * linear->srgb:
     41  * if (isnan(cl)) {
     42  *    Map IEEE-754 Not-a-number to zero.
     43  *    cs = 0.0;
     44  * } else if (cl > 1.0) {
     45  *    cs = 1.0;
     46  * } else if (cl < 0.0) {
     47  *    cs = 0.0;
     48  * } else if (cl < 0.0031308) {
     49  *    cs = 12.92 * cl;
     50  * } else {
     51  *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
     52  * }
     53  *
     54  * This does not need to be accurate, however at least for d3d10
     55  * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
     56  * 1) For srgb->linear, it is required that the error on the srgb side is
     57  *    not larger than 0.5f, which I interpret that if you map the value back
     58  *    to srgb from linear using the ideal conversion, it would not be off by
     59  *    more than 0.5f (that is, it would map to the same 8-bit integer value
     60  *    as it was before conversion to linear).
     61  * 2) linear->srgb is permitted an error of 0.6f, which luckily is quite a
     62  *    generous tolerance.
     63  * 3) Additionally, all srgb values converted to linear and back must result
     64  *    in the same value as they were originally.
     65  *
     66  * @author Roland Scheidegger <sroland (at) vmware.com>
     67  */
     68 
     69 
     70 #include "util/u_debug.h"
     71 
     72 #include "lp_bld_type.h"
     73 #include "lp_bld_const.h"
     74 #include "lp_bld_arit.h"
     75 #include "lp_bld_bitarit.h"
     76 #include "lp_bld_logic.h"
     77 #include "lp_bld_format.h"
     78 
     79 
     80 
     81 /**
     82  * Convert srgb int values to linear float values.
     83  * Several possibilities how to do this, e.g.
     84  * - table
     85  * - doing the pow() with int-to-float and float-to-int tricks
     86  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
     87  * - just using standard polynomial approximation
     88  *   (3rd order polynomial is required for crappy but just sufficient accuracy)
     89  *
     90  * @param src   integer (vector) value(s) to convert
     91  *              (chan_bits bit values unpacked to 32 bit already).
     92  */
     93 LLVMValueRef
     94 lp_build_srgb_to_linear(struct gallivm_state *gallivm,
     95                         struct lp_type src_type,
     96                         unsigned chan_bits,
     97                         LLVMValueRef src)
     98 {
     99    struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
    100    struct lp_build_context f32_bld;
    101    LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
    102    double coeffs[4] = {0.0023f,
    103                        0.0030f / 255.0f,
    104                        0.6935f / (255.0f * 255.0f),
    105                        0.3012f / (255.0f * 255.0f * 255.0f)
    106    };
    107 
    108    assert(src_type.width == 32);
    109    /* Technically this would work with more bits too but would be inaccurate. */
    110    assert(chan_bits <= 8);
    111 
    112    lp_build_context_init(&f32_bld, gallivm, f32_type);
    113 
    114    /*
    115     * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
    116     * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
    117     * (found with octave polyfit and some magic as I couldn't get the error
    118     * function right). Using the above mentioned error function, the values stay
    119     * within +-0.35, except for the lowest values - hence tweaking linear segment
    120     * to cover the first 16 instead of the first 11 values (the error stays
    121     * just about acceptable there too).
    122     * Hence: lin = src > 15 ? poly : src / 12.6
    123     * This function really only makes sense for vectors, should use LUT otherwise.
    124     * All in all (including float conversion) 11 instructions (with sse4.1),
    125     * 6 constants (polynomial could be done with 1 instruction less at the cost
    126     * of slightly worse dependency chain, fma should also help).
    127     */
    128    /* doing the 1/255 mul as part of the approximation */
    129    srcf = lp_build_int_to_float(&f32_bld, src);
    130    if (chan_bits != 8) {
    131       /* could adjust all the constants instead */
    132       LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type,
    133                                                       255.0f / ((1 << chan_bits) - 1));
    134       srcf = lp_build_mul(&f32_bld, srcf, rescale_const);
    135    }
    136    lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
    137    part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
    138 
    139    part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
    140 
    141    lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
    142    is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
    143    return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
    144 }
    145 
    146 
    147 /**
    148  * Convert linear float values to srgb int values.
    149  * Several possibilities how to do this, e.g.
    150  * - use table (based on exponent/highest order mantissa bits) and do
    151  *   linear interpolation (https://gist.github.com/rygorous/2203834)
    152  * - Chebyshev polynomial
    153  * - Approximation using reciprocals
    154  * - using int-to-float and float-to-int tricks for pow()
    155  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
    156  *
    157  * @param src   float (vector) value(s) to convert.
    158  */
    159 static LLVMValueRef
    160 lp_build_linear_to_srgb(struct gallivm_state *gallivm,
    161                         struct lp_type src_type,
    162                         unsigned chan_bits,
    163                         LLVMValueRef src)
    164 {
    165    LLVMBuilderRef builder = gallivm->builder;
    166    struct lp_build_context f32_bld;
    167    LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
    168 
    169    lp_build_context_init(&f32_bld, gallivm, src_type);
    170 
    171    src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
    172 
    173    if (0) {
    174       /*
    175        * using int-to-float and float-to-int trick for pow().
    176        * This is much more accurate than necessary thanks to the correction,
    177        * but it most certainly makes no sense without rsqrt available.
    178        * Bonus points if you understand how this works...
    179        * All in all (including min/max clamp, conversion) 19 instructions.
    180        */
    181 
    182       float exp_f = 2.0f / 3.0f;
    183       /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
    184       float exp2f_c = 1.30438178253e+19f;
    185       float coeff_f = 0.62996f;
    186       LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
    187       struct lp_type int_type = lp_int_type(src_type);
    188 
    189       /*
    190        * First calculate approx x^8/12
    191        */
    192       exponent = lp_build_const_vec(gallivm, src_type, exp_f);
    193       coeff = lp_build_const_vec(gallivm, src_type,
    194                                  exp2f_c * powf(coeff_f, 1.0f / exp_f));
    195 
    196       /* premultiply src */
    197       tmp = lp_build_mul(&f32_bld, coeff, src);
    198       /* "log2" */
    199       tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
    200       tmp = lp_build_int_to_float(&f32_bld, tmp);
    201       /* multiply for pow */
    202       tmp = lp_build_mul(&f32_bld, tmp, exponent);
    203       /* "exp2" */
    204       pow_approx = lp_build_itrunc(&f32_bld, tmp);
    205       pow_approx = LLVMBuildBitCast(builder, pow_approx,
    206                                     lp_build_vec_type(gallivm, src_type), "");
    207 
    208       /*
    209        * Since that pow was inaccurate (like 3 bits, though each sqrt step would
    210        * give another bit), compensate the error (which is why we chose another
    211        * exponent in the first place).
    212        */
    213       /* x * x^(8/12) = x^(20/12) */
    214       pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
    215 
    216       /* x * x * x^(-4/12) = x^(20/12) */
    217       /* Should avoid using rsqrt if it's not available, but
    218        * using x * x^(4/12) * x^(4/12) instead will change error weight */
    219       tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
    220       x2 = lp_build_mul(&f32_bld, src, src);
    221       pow_2 = lp_build_mul(&f32_bld, x2, tmp);
    222 
    223       /* average the values so the errors cancel out, compensate bias,
    224        * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
    225        * for conversion to int in here */
    226       tmp = lp_build_add(&f32_bld, pow_1, pow_2);
    227       coeff = lp_build_const_vec(gallivm, src_type,
    228                                  1.0f / (3.0f * coeff_f) * 0.999852f *
    229                                  powf(1.055f * 255.0f, 4.0f));
    230       pow_final = lp_build_mul(&f32_bld, tmp, coeff);
    231 
    232       /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
    233       if (lp_build_fast_rsqrt_available(src_type)) {
    234          pow_final = lp_build_fast_rsqrt(&f32_bld,
    235                         lp_build_fast_rsqrt(&f32_bld, pow_final));
    236       }
    237       else {
    238          pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
    239       }
    240       pow_final = lp_build_add(&f32_bld, pow_final,
    241                                lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
    242    }
    243 
    244    else {
    245       /*
    246        * using "rational polynomial" approximation here.
    247        * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
    248        * factoring in the 255.0 mul and the scaling mul.
    249        * (a is closer to actual value so has higher weight than b.)
    250        * Note: the constants are magic values. They were found empirically,
    251        * possibly could be improved but good enough (be VERY careful with
    252        * error metric if you'd want to tweak them, they also MUST fit with
    253        * the crappy polynomial above for srgb->linear since it is required
    254        * that each srgb value maps back to the same value).
    255        * This function has an error of max +-0.17. Not sure this is actually
    256        * enough, we require +-0.6 but that may include the +-0.5 from integer
    257        * conversion. Seems to pass all relevant tests though...
    258        * For the approximated srgb->linear values the error is naturally larger
    259        * (+-0.42) but still accurate enough (required +-0.5 essentially).
    260        * All in all (including min/max clamp, conversion) 15 instructions.
    261        * FMA would help (minus 2 instructions).
    262        */
    263 
    264       LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
    265 
    266       if (lp_build_fast_rsqrt_available(src_type)) {
    267          tmp = lp_build_fast_rsqrt(&f32_bld, src);
    268          x05 = lp_build_mul(&f32_bld, src, tmp);
    269       }
    270       else {
    271          /*
    272           * I don't really expect this to be practical without rsqrt
    273           * but there's no reason for triple punishment so at least
    274           * save the otherwise resulting division and unnecessary mul...
    275           */
    276          x05 = lp_build_sqrt(&f32_bld, src);
    277       }
    278 
    279       tmp = lp_build_mul(&f32_bld, x05, src);
    280       if (lp_build_fast_rsqrt_available(src_type)) {
    281          x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
    282       }
    283       else {
    284          x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
    285       }
    286 
    287       a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
    288       b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
    289       c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
    290 
    291       tmp = lp_build_mul(&f32_bld, a_const, x0375);
    292       tmp2 = lp_build_mad(&f32_bld, b_const, x05, c_const);
    293       pow_final = lp_build_add(&f32_bld, tmp, tmp2);
    294    }
    295 
    296    /* linear part is easy */
    297    lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
    298    lin = lp_build_mul(&f32_bld, src, lin_const);
    299 
    300    lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
    301    is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
    302    tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
    303 
    304    if (chan_bits != 8) {
    305       /* could adjust all the constants instead */
    306       LLVMValueRef rescale_const = lp_build_const_vec(gallivm, src_type,
    307                                                       ((1 << chan_bits) - 1) / 255.0f);
    308       tmp = lp_build_mul(&f32_bld, tmp, rescale_const);
    309    }
    310 
    311    f32_bld.type.sign = 0;
    312    return lp_build_iround(&f32_bld, tmp);
    313 }
    314 
    315 
    316 /**
    317  * Convert linear float soa values to packed srgb AoS values.
    318  * This only handles packed formats which are 4x8bit in size
    319  * (rgba and rgbx plus swizzles), and 16bit 565-style formats
    320  * with no alpha. (In the latter case the return values won't be
    321  * fully packed, it will look like r5g6b5x16r5g6b5x16...)
    322  *
    323  * @param src   float SoA (vector) values to convert.
    324  */
    325 LLVMValueRef
    326 lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
    327                               const struct util_format_description *dst_fmt,
    328                               struct lp_type src_type,
    329                               LLVMValueRef *src)
    330 {
    331    LLVMBuilderRef builder = gallivm->builder;
    332    unsigned chan;
    333    struct lp_build_context f32_bld;
    334    struct lp_type int32_type = lp_int_type(src_type);
    335    LLVMValueRef tmpsrgb[4], alpha, dst;
    336 
    337    lp_build_context_init(&f32_bld, gallivm, src_type);
    338 
    339    /* rgb is subject to linear->srgb conversion, alpha is not */
    340    for (chan = 0; chan < 3; chan++) {
    341       unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size;
    342       tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, chan_bits, src[chan]);
    343    }
    344    /*
    345     * can't use lp_build_conv since we want to keep values as 32bit
    346     * here so we can interleave with rgb to go from SoA->AoS.
    347     */
    348    alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);
    349    alpha = lp_build_mul(&f32_bld, alpha,
    350                         lp_build_const_vec(gallivm, src_type, 255.0f));
    351    tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);
    352 
    353    dst = lp_build_zero(gallivm, int32_type);
    354    for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
    355       if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) {
    356          unsigned ls;
    357          LLVMValueRef shifted, shift_val;
    358          ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
    359          shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
    360          shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
    361          dst = LLVMBuildOr(builder, dst, shifted, "");
    362       }
    363    }
    364    return dst;
    365 }
    366