Home | History | Annotate | Download | only in glsl
      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  */
     23 
     24 /**
     25  * \file lower_instructions.cpp
     26  *
     27  * Many GPUs lack native instructions for certain expression operations, and
     28  * must replace them with some other expression tree.  This pass lowers some
     29  * of the most common cases, allowing the lowering code to be implemented once
     30  * rather than in each driver backend.
     31  *
     32  * Currently supported transformations:
     33  * - SUB_TO_ADD_NEG
     34  * - DIV_TO_MUL_RCP
     35  * - INT_DIV_TO_MUL_RCP
     36  * - EXP_TO_EXP2
     37  * - POW_TO_EXP2
     38  * - LOG_TO_LOG2
     39  * - MOD_TO_FLOOR
     40  * - LDEXP_TO_ARITH
     41  * - DFREXP_DLDEXP_TO_ARITH
     42  * - CARRY_TO_ARITH
     43  * - BORROW_TO_ARITH
     44  * - SAT_TO_CLAMP
     45  * - DOPS_TO_DFRAC
     46  *
     47  * SUB_TO_ADD_NEG:
     48  * ---------------
     49  * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
     50  *
     51  * This simplifies expression reassociation, and for many backends
     52  * there is no subtract operation separate from adding the negation.
     53  * For backends with native subtract operations, they will probably
     54  * want to recognize add(op0, neg(op1)) or the other way around to
     55  * produce a subtract anyway.
     56  *
     57  * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
     58  * ---------------------------------------------------------
     59  * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
     60  *
     61  * Many GPUs don't have a divide instruction (945 and 965 included),
     62  * but they do have an RCP instruction to compute an approximate
     63  * reciprocal.  By breaking the operation down, constant reciprocals
     64  * can get constant folded.
     65  *
     66  * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
     67  * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
     68  * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
     69  * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
     70  * point so that RCP is possible.
     71  *
     72  * EXP_TO_EXP2 and LOG_TO_LOG2:
     73  * ----------------------------
     74  * Many GPUs don't have a base e log or exponent instruction, but they
     75  * do have base 2 versions, so this pass converts exp and log to exp2
     76  * and log2 operations.
     77  *
     78  * POW_TO_EXP2:
     79  * -----------
     80  * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
     81  * x**y to 2**(y * log2(x)).
     82  *
     83  * MOD_TO_FLOOR:
     84  * -------------
     85  * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
     86  *
     87  * Many GPUs don't have a MOD instruction (945 and 965 included), and
     88  * if we have to break it down like this anyway, it gives an
     89  * opportunity to do things like constant fold the (1.0 / op1) easily.
     90  *
     91  * Note: this used to be implemented as op1 * fract(op0 / op1), but that
     92  * implementation had significant precision errors.
     93  *
     94  * LDEXP_TO_ARITH:
     95  * -------------
     96  * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
     97  *
     98  * DFREXP_DLDEXP_TO_ARITH:
     99  * ---------------
    100  * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
    101  * arithmetic and bit ops for double arguments.
    102  *
    103  * CARRY_TO_ARITH:
    104  * ---------------
    105  * Converts ir_carry into (x + y) < x.
    106  *
    107  * BORROW_TO_ARITH:
    108  * ----------------
    109  * Converts ir_borrow into (x < y).
    110  *
    111  * SAT_TO_CLAMP:
    112  * -------------
    113  * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
    114  *
    115  * DOPS_TO_DFRAC:
    116  * --------------
    117  * Converts double trunc, ceil, floor, round to fract
    118  */
    119 
    120 #include "c99_math.h"
    121 #include "program/prog_instruction.h" /* for swizzle */
    122 #include "compiler/glsl_types.h"
    123 #include "ir.h"
    124 #include "ir_builder.h"
    125 #include "ir_optimization.h"
    126 
    127 using namespace ir_builder;
    128 
    129 namespace {
    130 
    131 class lower_instructions_visitor : public ir_hierarchical_visitor {
    132 public:
    133    lower_instructions_visitor(unsigned lower)
    134       : progress(false), lower(lower) { }
    135 
    136    ir_visitor_status visit_leave(ir_expression *);
    137 
    138    bool progress;
    139 
    140 private:
    141    unsigned lower; /** Bitfield of which operations to lower */
    142 
    143    void sub_to_add_neg(ir_expression *);
    144    void div_to_mul_rcp(ir_expression *);
    145    void int_div_to_mul_rcp(ir_expression *);
    146    void mod_to_floor(ir_expression *);
    147    void exp_to_exp2(ir_expression *);
    148    void pow_to_exp2(ir_expression *);
    149    void log_to_log2(ir_expression *);
    150    void ldexp_to_arith(ir_expression *);
    151    void dldexp_to_arith(ir_expression *);
    152    void dfrexp_sig_to_arith(ir_expression *);
    153    void dfrexp_exp_to_arith(ir_expression *);
    154    void carry_to_arith(ir_expression *);
    155    void borrow_to_arith(ir_expression *);
    156    void sat_to_clamp(ir_expression *);
    157    void double_dot_to_fma(ir_expression *);
    158    void double_lrp(ir_expression *);
    159    void dceil_to_dfrac(ir_expression *);
    160    void dfloor_to_dfrac(ir_expression *);
    161    void dround_even_to_dfrac(ir_expression *);
    162    void dtrunc_to_dfrac(ir_expression *);
    163    void dsign_to_csel(ir_expression *);
    164    void bit_count_to_math(ir_expression *);
    165    void extract_to_shifts(ir_expression *);
    166    void insert_to_shifts(ir_expression *);
    167    void reverse_to_shifts(ir_expression *ir);
    168    void find_lsb_to_float_cast(ir_expression *ir);
    169    void find_msb_to_float_cast(ir_expression *ir);
    170    void imul_high_to_mul(ir_expression *ir);
    171    void sqrt_to_abs_sqrt(ir_expression *ir);
    172 
    173    ir_expression *_carry(operand a, operand b);
    174 };
    175 
    176 } /* anonymous namespace */
    177 
    /**
     * Determine if a particular type of lowering should occur.
     *
     * Evaluates to a non-zero value when any bit of \c x is set in the visitor's
     * \c lower bitfield.  It may only be used inside member functions, because it
     * reads \c this->lower.
     *
     * The argument is parenthesized so that a compound expression such as
     * lowering(A | B) tests both flags; without the parentheses it would expand
     * to (this->lower & A | B), which is non-zero whenever B is.
     */
    #define lowering(x) (this->lower & (x))
    182 
    183 bool
    184 lower_instructions(exec_list *instructions, unsigned what_to_lower)
    185 {
    186    lower_instructions_visitor v(what_to_lower);
    187 
    188    visit_list_elements(&v, instructions);
    189    return v.progress;
    190 }
    191 
    192 void
    193 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
    194 {
    195    ir->operation = ir_binop_add;
    196    ir->init_num_operands();
    197    ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
    198 					   ir->operands[1], NULL);
    199    this->progress = true;
    200 }
    201 
    202 void
    203 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
    204 {
    205    assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
    206 
    207    /* New expression for the 1.0 / op1 */
    208    ir_rvalue *expr;
    209    expr = new(ir) ir_expression(ir_unop_rcp,
    210 				ir->operands[1]->type,
    211 				ir->operands[1]);
    212 
    213    /* op0 / op1 -> op0 * (1.0 / op1) */
    214    ir->operation = ir_binop_mul;
    215    ir->init_num_operands();
    216    ir->operands[1] = expr;
    217 
    218    this->progress = true;
    219 }
    220 
    221 void
    222 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
    223 {
    224    assert(ir->operands[1]->type->is_integer());
    225 
    226    /* Be careful with integer division -- we need to do it as a
    227     * float and re-truncate, since rcp(n > 1) of an integer would
    228     * just be 0.
    229     */
    230    ir_rvalue *op0, *op1;
    231    const struct glsl_type *vec_type;
    232 
    233    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    234 				      ir->operands[1]->type->vector_elements,
    235 				      ir->operands[1]->type->matrix_columns);
    236 
    237    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
    238       op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
    239    else
    240       op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
    241 
    242    op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
    243 
    244    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    245 				      ir->operands[0]->type->vector_elements,
    246 				      ir->operands[0]->type->matrix_columns);
    247 
    248    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
    249       op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
    250    else
    251       op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
    252 
    253    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    254 				      ir->type->vector_elements,
    255 				      ir->type->matrix_columns);
    256 
    257    op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
    258 
    259    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
    260       ir->operation = ir_unop_f2i;
    261       ir->operands[0] = op0;
    262    } else {
    263       ir->operation = ir_unop_i2u;
    264       ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
    265    }
    266    ir->init_num_operands();
    267    ir->operands[1] = NULL;
    268 
    269    this->progress = true;
    270 }
    271 
    272 void
    273 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
    274 {
    275    ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
    276 
    277    ir->operation = ir_unop_exp2;
    278    ir->init_num_operands();
    279    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
    280 					   ir->operands[0], log2_e);
    281    this->progress = true;
    282 }
    283 
    284 void
    285 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
    286 {
    287    ir_expression *const log2_x =
    288       new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
    289 			    ir->operands[0]);
    290 
    291    ir->operation = ir_unop_exp2;
    292    ir->init_num_operands();
    293    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
    294 					   ir->operands[1], log2_x);
    295    ir->operands[1] = NULL;
    296    this->progress = true;
    297 }
    298 
    299 void
    300 lower_instructions_visitor::log_to_log2(ir_expression *ir)
    301 {
    302    ir->operation = ir_binop_mul;
    303    ir->init_num_operands();
    304    ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
    305 					   ir->operands[0], NULL);
    306    ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
    307    this->progress = true;
    308 }
    309 
    310 void
    311 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
    312 {
    313    ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
    314                                          ir_var_temporary);
    315    ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
    316                                          ir_var_temporary);
    317    this->base_ir->insert_before(x);
    318    this->base_ir->insert_before(y);
    319 
    320    ir_assignment *const assign_x =
    321       new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
    322                             ir->operands[0]);
    323    ir_assignment *const assign_y =
    324       new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
    325                             ir->operands[1]);
    326 
    327    this->base_ir->insert_before(assign_x);
    328    this->base_ir->insert_before(assign_y);
    329 
    330    ir_expression *const div_expr =
    331       new(ir) ir_expression(ir_binop_div, x->type,
    332                             new(ir) ir_dereference_variable(x),
    333                             new(ir) ir_dereference_variable(y));
    334 
    335    /* Don't generate new IR that would need to be lowered in an additional
    336     * pass.
    337     */
    338    if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
    339        (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
    340       div_to_mul_rcp(div_expr);
    341 
    342    ir_expression *const floor_expr =
    343       new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
    344 
    345    if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
    346       dfloor_to_dfrac(floor_expr);
    347 
    348    ir_expression *const mul_expr =
    349       new(ir) ir_expression(ir_binop_mul,
    350                             new(ir) ir_dereference_variable(y),
    351                             floor_expr);
    352 
    353    ir->operation = ir_binop_sub;
    354    ir->init_num_operands();
    355    ir->operands[0] = new(ir) ir_dereference_variable(x);
    356    ir->operands[1] = mul_expr;
    357    this->progress = true;
    358 }
    359 
    360 void
    361 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
    362 {
    363    /* Translates
    364     *    ir_binop_ldexp x exp
    365     * into
    366     *
    367     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    368     *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
    369     *
    370     *    if (extracted_biased_exp >= 255)
    371     *       return x; // +/-inf, NaN
    372     *
    373     *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
    374     *
    375     *    if (min(resulting_biased_exp, extracted_biased_exp) < 1)
    376     *       resulting_biased_exp = 0;
    377     *    if (resulting_biased_exp >= 255 ||
    378     *        min(resulting_biased_exp, extracted_biased_exp) < 1) {
    379     *       sign_mantissa &= sign_mask;
    380     *    }
    381     *
    382     *    return bitcast_u2f(sign_mantissa |
    383     *                       lshift(i2u(resulting_biased_exp), exp_shift));
    384     *
    385     * which we can't actually implement as such, since the GLSL IR doesn't
    386     * have vectorized if-statements. We actually implement it without branches
    387     * using conditional-select:
    388     *
    389     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    390     *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
    391     *
    392     *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
    393     *
    394     *    flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0);
    395     *    resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp)
    396     *    zero_mantissa = logic_or(flush_to_zero,
    397     *                             gequal(resulting_biased_exp, 255));
    398     *    sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa);
    399     *
    400     *    result = sign_mantissa |
    401     *             lshift(i2u(resulting_biased_exp), exp_shift));
    402     *
    403     *    return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result));
    404     *
    405     * The definition of ldexp in the GLSL spec says:
    406     *
    407     *    "If this product is too large to be represented in the
    408     *     floating-point type, the result is undefined."
    409     *
    410     * However, the definition of ldexp in the GLSL ES spec does not contain
    411     * this sentence, so we do need to handle overflow correctly.
    412     *
    413     * There is additional language limiting the defined range of exp, but this
    414     * is merely to allow implementations that store 2^exp in a temporary
    415     * variable.
    416     */
    417 
    418    const unsigned vec_elem = ir->type->vector_elements;
    419 
    420    /* Types */
    421    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
    422    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
    423    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    424 
    425    /* Temporary variables */
    426    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
    427    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
    428    ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary);
    429 
    430    ir_variable *extracted_biased_exp =
    431       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
    432    ir_variable *resulting_biased_exp =
    433       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
    434 
    435    ir_variable *sign_mantissa =
    436       new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary);
    437 
    438    ir_variable *flush_to_zero =
    439       new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary);
    440    ir_variable *zero_mantissa =
    441       new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary);
    442 
    443    ir_instruction &i = *base_ir;
    444 
    445    /* Copy <x> and <exp> arguments. */
    446    i.insert_before(x);
    447    i.insert_before(assign(x, ir->operands[0]));
    448    i.insert_before(exp);
    449    i.insert_before(assign(exp, ir->operands[1]));
    450 
    451    /* Extract the biased exponent from <x>. */
    452    i.insert_before(extracted_biased_exp);
    453    i.insert_before(assign(extracted_biased_exp,
    454                           rshift(bitcast_f2i(abs(x)),
    455                                  new(ir) ir_constant(23, vec_elem))));
    456 
    457    /* The definition of ldexp in the GLSL 4.60 spec says:
    458     *
    459     *    "If exp is greater than +128 (single-precision) or +1024
    460     *     (double-precision), the value returned is undefined. If exp is less
    461     *     than -126 (single-precision) or -1022 (double-precision), the value
    462     *     returned may be flushed to zero."
    463     *
    464     * So we do not have to guard against the possibility of addition overflow,
    465     * which could happen when exp is close to INT_MAX. Addition underflow
    466     * cannot happen (the worst case is 0 + (-INT_MAX)).
    467     */
    468    i.insert_before(resulting_biased_exp);
    469    i.insert_before(assign(resulting_biased_exp,
    470                           min2(add(extracted_biased_exp, exp),
    471                                new(ir) ir_constant(255, vec_elem))));
    472 
    473    i.insert_before(sign_mantissa);
    474    i.insert_before(assign(sign_mantissa,
    475                           bit_and(bitcast_f2u(x),
    476                                   new(ir) ir_constant(0x807fffffu, vec_elem))));
    477 
    478    /* We flush to zero if the original or resulting biased exponent is 0,
    479     * indicating a +/-0.0 or subnormal input or output.
    480     *
    481     * The mantissa is set to 0 if the resulting biased exponent is 255, since
    482     * an overflow should produce a +/-inf result.
    483     *
    484     * Note that NaN inputs are handled separately.
    485     */
    486    i.insert_before(flush_to_zero);
    487    i.insert_before(assign(flush_to_zero,
    488                           lequal(min2(resulting_biased_exp,
    489                                       extracted_biased_exp),
    490                                  ir_constant::zero(ir, ivec))));
    491    i.insert_before(assign(resulting_biased_exp,
    492                           csel(flush_to_zero,
    493                                ir_constant::zero(ir, ivec),
    494                                resulting_biased_exp)));
    495 
    496    i.insert_before(zero_mantissa);
    497    i.insert_before(assign(zero_mantissa,
    498                           logic_or(flush_to_zero,
    499                                    equal(resulting_biased_exp,
    500                                          new(ir) ir_constant(255, vec_elem)))));
    501    i.insert_before(assign(sign_mantissa,
    502                           csel(zero_mantissa,
    503                                bit_and(sign_mantissa,
    504                                        new(ir) ir_constant(0x80000000u, vec_elem)),
    505                                sign_mantissa)));
    506 
    507    /* Don't generate new IR that would need to be lowered in an additional
    508     * pass.
    509     */
    510    i.insert_before(result);
    511    if (!lowering(INSERT_TO_SHIFTS)) {
    512       i.insert_before(assign(result,
    513                              bitfield_insert(sign_mantissa,
    514                                              i2u(resulting_biased_exp),
    515                                              new(ir) ir_constant(23u, vec_elem),
    516                                              new(ir) ir_constant(8u, vec_elem))));
    517    } else {
    518       i.insert_before(assign(result,
    519                              bit_or(sign_mantissa,
    520                                     lshift(i2u(resulting_biased_exp),
    521                                            new(ir) ir_constant(23, vec_elem)))));
    522    }
    523 
    524    ir->operation = ir_triop_csel;
    525    ir->init_num_operands();
    526    ir->operands[0] = gequal(extracted_biased_exp,
    527                             new(ir) ir_constant(255, vec_elem));
    528    ir->operands[1] = new(ir) ir_dereference_variable(x);
    529    ir->operands[2] = bitcast_u2f(result);
    530 
    531    this->progress = true;
    532 }
    533 
    534 void
    535 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
    536 {
    537    /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
    538     * from the significand.
    539     */
    540 
    541    const unsigned vec_elem = ir->type->vector_elements;
    542 
    543    /* Types */
    544    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
    545    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    546 
    547    /* Constants */
    548    ir_constant *zeroi = ir_constant::zero(ir, ivec);
    549 
    550    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
    551 
    552    ir_constant *exp_shift = new(ir) ir_constant(20u);
    553    ir_constant *exp_width = new(ir) ir_constant(11u);
    554    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
    555 
    556    /* Temporary variables */
    557    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
    558    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
    559 
    560    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
    561                                                   ir_var_temporary);
    562 
    563    ir_variable *extracted_biased_exp =
    564       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
    565    ir_variable *resulting_biased_exp =
    566       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
    567 
    568    ir_variable *is_not_zero_or_underflow =
    569       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
    570 
    571    ir_instruction &i = *base_ir;
    572 
    573    /* Copy <x> and <exp> arguments. */
    574    i.insert_before(x);
    575    i.insert_before(assign(x, ir->operands[0]));
    576    i.insert_before(exp);
    577    i.insert_before(assign(exp, ir->operands[1]));
    578 
    579    ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
    580    if (lowering(DFREXP_DLDEXP_TO_ARITH))
    581       dfrexp_exp_to_arith(frexp_exp);
    582 
    583    /* Extract the biased exponent from <x>. */
    584    i.insert_before(extracted_biased_exp);
    585    i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
    586 
    587    i.insert_before(resulting_biased_exp);
    588    i.insert_before(assign(resulting_biased_exp,
    589                           add(extracted_biased_exp, exp)));
    590 
    591    /* Test if result is 0.0, subnormal, or underflow by checking if the
    592     * resulting biased exponent would be less than 0x1. If so, the result is
    593     * 0.0 with the sign of x. (Actually, invert the conditions so that
    594     * immediate values are the second arguments, which is better for i965)
    595     * TODO: Implement in a vector fashion.
    596     */
    597    i.insert_before(zero_sign_x);
    598    for (unsigned elem = 0; elem < vec_elem; elem++) {
    599       ir_variable *unpacked =
    600          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
    601       i.insert_before(unpacked);
    602       i.insert_before(
    603             assign(unpacked,
    604                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
    605       i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
    606                              WRITEMASK_Y));
    607       i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
    608       i.insert_before(assign(zero_sign_x,
    609                              expr(ir_unop_pack_double_2x32, unpacked),
    610                              1 << elem));
    611    }
    612    i.insert_before(is_not_zero_or_underflow);
    613    i.insert_before(assign(is_not_zero_or_underflow,
    614                           gequal(resulting_biased_exp,
    615                                   new(ir) ir_constant(0x1, vec_elem))));
    616    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
    617                                   x, zero_sign_x)));
    618    i.insert_before(assign(resulting_biased_exp,
    619                           csel(is_not_zero_or_underflow,
    620                                resulting_biased_exp, zeroi)));
    621 
    622    /* We could test for overflows by checking if the resulting biased exponent
    623     * would be greater than 0xFE. Turns out we don't need to because the GLSL
    624     * spec says:
    625     *
    626     *    "If this product is too large to be represented in the
    627     *     floating-point type, the result is undefined."
    628     */
    629 
    630    ir_rvalue *results[4] = {NULL};
    631    for (unsigned elem = 0; elem < vec_elem; elem++) {
    632       ir_variable *unpacked =
    633          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
    634       i.insert_before(unpacked);
    635       i.insert_before(
    636             assign(unpacked,
    637                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
    638 
    639       ir_expression *bfi = bitfield_insert(
    640             swizzle_y(unpacked),
    641             i2u(swizzle(resulting_biased_exp, elem, 1)),
    642             exp_shift->clone(ir, NULL),
    643             exp_width->clone(ir, NULL));
    644 
    645       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
    646 
    647       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
    648    }
    649 
    650    ir->operation = ir_quadop_vector;
    651    ir->init_num_operands();
    652    ir->operands[0] = results[0];
    653    ir->operands[1] = results[1];
    654    ir->operands[2] = results[2];
    655    ir->operands[3] = results[3];
    656 
    657    /* Don't generate new IR that would need to be lowered in an additional
    658     * pass.
    659     */
    660 
    661    this->progress = true;
    662 }
    663 
    664 void
    665 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
    666 {
    667    const unsigned vec_elem = ir->type->vector_elements;
    668    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    669 
    670    /* Double-precision floating-point values are stored as
    671     *   1 sign bit;
    672     *   11 exponent bits;
    673     *   52 mantissa bits.
    674     *
    675     * We're just extracting the significand here, so we only need to modify
    676     * the upper 32-bit uint. Unfortunately we must extract each double
    677     * independently as there is no vector version of unpackDouble.
    678     */
    679 
    680    ir_instruction &i = *base_ir;
    681 
    682    ir_variable *is_not_zero =
    683       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
    684    ir_rvalue *results[4] = {NULL};
    685 
    686    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
    687    i.insert_before(is_not_zero);
    688    i.insert_before(
    689          assign(is_not_zero,
    690                 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
    691 
    692    /* TODO: Remake this as more vector-friendly when int64 support is
    693     * available.
    694     */
    695    for (unsigned elem = 0; elem < vec_elem; elem++) {
    696       ir_constant *zero = new(ir) ir_constant(0u, 1);
    697       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
    698 
    699       /* Exponent of double floating-point values in the range [0.5, 1.0). */
    700       ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
    701 
    702       ir_variable *bits =
    703          new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
    704       ir_variable *unpacked =
    705          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
    706 
    707       ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
    708 
    709       i.insert_before(bits);
    710       i.insert_before(unpacked);
    711       i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
    712 
    713       /* Manipulate the high uint to remove the exponent and replace it with
    714        * either the default exponent or zero.
    715        */
    716       i.insert_before(assign(bits, swizzle_y(unpacked)));
    717       i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
    718       i.insert_before(assign(bits, bit_or(bits,
    719                                           csel(swizzle(is_not_zero, elem, 1),
    720                                                exponent_value,
    721                                                zero))));
    722       i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
    723       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
    724    }
    725 
    726    /* Put the dvec back together */
    727    ir->operation = ir_quadop_vector;
    728    ir->init_num_operands();
    729    ir->operands[0] = results[0];
    730    ir->operands[1] = results[1];
    731    ir->operands[2] = results[2];
    732    ir->operands[3] = results[3];
    733 
    734    this->progress = true;
    735 }
    736 
    737 void
    738 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
    739 {
    740    const unsigned vec_elem = ir->type->vector_elements;
    741    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    742    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
    743 
    744    /* Double-precision floating-point values are stored as
    745     *   1 sign bit;
    746     *   11 exponent bits;
    747     *   52 mantissa bits.
    748     *
    749     * We're just extracting the exponent here, so we only care about the upper
    750     * 32-bit uint.
    751     */
    752 
    753    ir_instruction &i = *base_ir;
    754 
    755    ir_variable *is_not_zero =
    756       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
    757    ir_variable *high_words =
    758       new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
    759    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
    760    ir_constant *izero = new(ir) ir_constant(0, vec_elem);
    761 
    762    ir_rvalue *absval = abs(ir->operands[0]);
    763 
    764    i.insert_before(is_not_zero);
    765    i.insert_before(high_words);
    766    i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
    767 
    768    /* Extract all of the upper uints. */
    769    for (unsigned elem = 0; elem < vec_elem; elem++) {
    770       ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
    771 
    772       i.insert_before(assign(high_words,
    773                              swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
    774                              1 << elem));
    775 
    776    }
    777    ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
    778    ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
    779 
    780    /* For non-zero inputs, shift the exponent down and apply bias. */
    781    ir->operation = ir_triop_csel;
    782    ir->init_num_operands();
    783    ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
    784    ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
    785    ir->operands[2] = izero;
    786 
    787    this->progress = true;
    788 }
    789 
    790 void
    791 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
    792 {
    793    /* Translates
    794     *   ir_binop_carry x y
    795     * into
    796     *   sum = ir_binop_add x y
    797     *   bcarry = ir_binop_less sum x
    798     *   carry = ir_unop_b2i bcarry
    799     */
    800 
    801    ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
    802    ir->operation = ir_unop_i2u;
    803    ir->init_num_operands();
    804    ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
    805    ir->operands[1] = NULL;
    806 
    807    this->progress = true;
    808 }
    809 
    810 void
    811 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
    812 {
    813    /* Translates
    814     *   ir_binop_borrow x y
    815     * into
    816     *   bcarry = ir_binop_less x y
    817     *   carry = ir_unop_b2i bcarry
    818     */
    819 
    820    ir->operation = ir_unop_i2u;
    821    ir->init_num_operands();
    822    ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
    823    ir->operands[1] = NULL;
    824 
    825    this->progress = true;
    826 }
    827 
    828 void
    829 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
    830 {
    831    /* Translates
    832     *   ir_unop_saturate x
    833     * into
    834     *   ir_binop_min (ir_binop_max(x, 0.0), 1.0)
    835     */
    836 
    837    ir->operation = ir_binop_min;
    838    ir->init_num_operands();
    839    ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
    840                                            ir->operands[0],
    841                                            new(ir) ir_constant(0.0f));
    842    ir->operands[1] = new(ir) ir_constant(1.0f);
    843 
    844    this->progress = true;
    845 }
    846 
    847 void
    848 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
    849 {
    850    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
    851 					   ir_var_temporary);
    852    this->base_ir->insert_before(temp);
    853 
    854    int nc = ir->operands[0]->type->components();
    855    for (int i = nc - 1; i >= 1; i--) {
    856       ir_assignment *assig;
    857       if (i == (nc - 1)) {
    858          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
    859                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
    860       } else {
    861          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
    862                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
    863                                   temp));
    864       }
    865       this->base_ir->insert_before(assig);
    866    }
    867 
    868    ir->operation = ir_triop_fma;
    869    ir->init_num_operands();
    870    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
    871    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
    872    ir->operands[2] = new(ir) ir_dereference_variable(temp);
    873 
    874    this->progress = true;
    875 
    876 }
    877 
    878 void
    879 lower_instructions_visitor::double_lrp(ir_expression *ir)
    880 {
    881    int swizval;
    882    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
    883    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
    884 
    885    switch (op2->type->vector_elements) {
    886    case 1:
    887       swizval = SWIZZLE_XXXX;
    888       break;
    889    default:
    890       assert(op0->type->vector_elements == op2->type->vector_elements);
    891       swizval = SWIZZLE_XYZW;
    892       break;
    893    }
    894 
    895    ir->operation = ir_triop_fma;
    896    ir->init_num_operands();
    897    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
    898    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
    899 
    900    this->progress = true;
    901 }
    902 
    903 void
    904 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
    905 {
    906    /*
    907     * frtemp = frac(x);
    908     * temp = sub(x, frtemp);
    909     * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
    910     */
    911    ir_instruction &i = *base_ir;
    912    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
    913    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
    914    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
    915                                              ir_var_temporary);
    916 
    917    i.insert_before(frtemp);
    918    i.insert_before(assign(frtemp, fract(ir->operands[0])));
    919 
    920    ir->operation = ir_binop_add;
    921    ir->init_num_operands();
    922    ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
    923    ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
    924 
    925    this->progress = true;
    926 }
    927 
    928 void
    929 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
    930 {
    931    /*
    932     * frtemp = frac(x);
    933     * result = sub(x, frtemp);
    934     */
    935    ir->operation = ir_binop_sub;
    936    ir->init_num_operands();
    937    ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
    938 
    939    this->progress = true;
    940 }
    941 void
    942 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
    943 {
    944    /*
    945     * insane but works
    946     * temp = x + 0.5;
    947     * frtemp = frac(temp);
    948     * t2 = sub(temp, frtemp);
    949     * if (frac(x) == 0.5)
    950     *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
    951     *  else
    952     *     result = t2;
    953 
    954     */
    955    ir_instruction &i = *base_ir;
    956    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
    957                                              ir_var_temporary);
    958    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
    959                                            ir_var_temporary);
    960    ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
    961                                            ir_var_temporary);
    962    ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
    963    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
    964    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
    965 
    966    i.insert_before(temp);
    967    i.insert_before(assign(temp, add(ir->operands[0], p5)));
    968 
    969    i.insert_before(frtemp);
    970    i.insert_before(assign(frtemp, fract(temp)));
    971 
    972    i.insert_before(t2);
    973    i.insert_before(assign(t2, sub(temp, frtemp)));
    974 
    975    ir->operation = ir_triop_csel;
    976    ir->init_num_operands();
    977    ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
    978                            p5->clone(ir, NULL));
    979    ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
    980                                 zero),
    981                           t2,
    982                           sub(t2, one));
    983    ir->operands[2] = new(ir) ir_dereference_variable(t2);
    984 
    985    this->progress = true;
    986 }
    987 
    988 void
    989 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
    990 {
    991    /*
    992     * frtemp = frac(x);
    993     * temp = sub(x, frtemp);
    994     * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
    995     */
    996    ir_rvalue *arg = ir->operands[0];
    997    ir_instruction &i = *base_ir;
    998 
    999    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
   1000    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
   1001    ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
   1002                                              ir_var_temporary);
   1003    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
   1004                                            ir_var_temporary);
   1005 
   1006    i.insert_before(frtemp);
   1007    i.insert_before(assign(frtemp, fract(arg)));
   1008    i.insert_before(temp);
   1009    i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
   1010 
   1011    ir->operation = ir_triop_csel;
   1012    ir->init_num_operands();
   1013    ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
   1014    ir->operands[1] = new (ir) ir_dereference_variable(temp);
   1015    ir->operands[2] = add(temp,
   1016                          csel(equal(frtemp, zero->clone(ir, NULL)),
   1017                               zero->clone(ir, NULL),
   1018                               one));
   1019 
   1020    this->progress = true;
   1021 }
   1022 
   1023 void
   1024 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
   1025 {
   1026    /*
   1027     * temp = x > 0.0 ? 1.0 : 0.0;
   1028     * result = x < 0.0 ? -1.0 : temp;
   1029     */
   1030    ir_rvalue *arg = ir->operands[0];
   1031    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
   1032    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
   1033    ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
   1034 
   1035    ir->operation = ir_triop_csel;
   1036    ir->init_num_operands();
   1037    ir->operands[0] = less(arg->clone(ir, NULL),
   1038                           zero->clone(ir, NULL));
   1039    ir->operands[1] = neg_one;
   1040    ir->operands[2] = csel(greater(arg, zero),
   1041                           one,
   1042                           zero->clone(ir, NULL));
   1043 
   1044    this->progress = true;
   1045 }
   1046 
   1047 void
   1048 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
   1049 {
   1050    /* For more details, see:
   1051     *
   1052     * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
   1053     */
   1054    const unsigned elements = ir->operands[0]->type->vector_elements;
   1055    ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
   1056                                            ir_var_temporary);
   1057    ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
   1058    ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
   1059    ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
   1060    ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
   1061    ir_constant *c1 = new(ir) ir_constant(1u);
   1062    ir_constant *c2 = new(ir) ir_constant(2u);
   1063    ir_constant *c4 = new(ir) ir_constant(4u);
   1064    ir_constant *c24 = new(ir) ir_constant(24u);
   1065 
   1066    base_ir->insert_before(temp);
   1067 
   1068    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1069       base_ir->insert_before(assign(temp, ir->operands[0]));
   1070    } else {
   1071       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1072       base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
   1073    }
   1074 
   1075    /* temp = temp - ((temp >> 1) & 0x55555555u); */
   1076    base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
   1077                                                          c55555555))));
   1078 
   1079    /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
   1080    base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
   1081                                            bit_and(rshift(temp, c2),
   1082                                                    c33333333->clone(ir, NULL)))));
   1083 
   1084    /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
   1085    ir->operation = ir_unop_u2i;
   1086    ir->init_num_operands();
   1087    ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
   1088                                 c01010101),
   1089                             c24);
   1090 
   1091    this->progress = true;
   1092 }
   1093 
   1094 void
   1095 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
   1096 {
   1097    ir_variable *bits =
   1098       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
   1099 
   1100    base_ir->insert_before(bits);
   1101    base_ir->insert_before(assign(bits, ir->operands[2]));
   1102 
   1103    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1104       ir_constant *c1 =
   1105          new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1106       ir_constant *c32 =
   1107          new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
   1108       ir_constant *cFFFFFFFF =
   1109          new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
   1110 
   1111       /* At least some hardware treats (x << y) as (x << (y%32)).  This means
   1112        * we'd get a mask of 0 when bits is 32.  Special case it.
   1113        *
   1114        * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
   1115        */
   1116       ir_expression *mask = csel(equal(bits, c32),
   1117                                  cFFFFFFFF,
   1118                                  sub(lshift(c1, bits), c1->clone(ir, NULL)));
   1119 
   1120       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1121        *
   1122        *    If bits is zero, the result will be zero.
   1123        *
   1124        * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
   1125        * select as in the signed integer case.
   1126        *
   1127        * (value >> offset) & mask;
   1128        */
   1129       ir->operation = ir_binop_bit_and;
   1130       ir->init_num_operands();
   1131       ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
   1132       ir->operands[1] = mask;
   1133       ir->operands[2] = NULL;
   1134    } else {
   1135       ir_constant *c0 =
   1136          new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
   1137       ir_constant *c32 =
   1138          new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
   1139       ir_variable *temp =
   1140          new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
   1141 
   1142       /* temp = 32 - bits; */
   1143       base_ir->insert_before(temp);
   1144       base_ir->insert_before(assign(temp, sub(c32, bits)));
   1145 
   1146       /* expr = value << (temp - offset)) >> temp; */
   1147       ir_expression *expr =
   1148          rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
   1149 
   1150       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1151        *
   1152        *    If bits is zero, the result will be zero.
   1153        *
   1154        * Due to the (x << (y%32)) behavior mentioned before, the (value <<
   1155        * (32-0)) doesn't "erase" all of the data as we would like, so finish
   1156        * up with:
   1157        *
   1158        * (bits == 0) ? 0 : e;
   1159        */
   1160       ir->operation = ir_triop_csel;
   1161       ir->init_num_operands();
   1162       ir->operands[0] = equal(c0, bits);
   1163       ir->operands[1] = c0->clone(ir, NULL);
   1164       ir->operands[2] = expr;
   1165    }
   1166 
   1167    this->progress = true;
   1168 }
   1169 
   1170 void
   1171 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
   1172 {
   1173    ir_constant *c1;
   1174    ir_constant *c32;
   1175    ir_constant *cFFFFFFFF;
   1176    ir_variable *offset =
   1177       new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
   1178    ir_variable *bits =
   1179       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
   1180    ir_variable *mask =
   1181       new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
   1182 
   1183    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
   1184       c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
   1185       c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
   1186       cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
   1187    } else {
   1188       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1189 
   1190       c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1191       c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
   1192       cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
   1193    }
   1194 
   1195    base_ir->insert_before(offset);
   1196    base_ir->insert_before(assign(offset, ir->operands[2]));
   1197 
   1198    base_ir->insert_before(bits);
   1199    base_ir->insert_before(assign(bits, ir->operands[3]));
   1200 
   1201    /* At least some hardware treats (x << y) as (x << (y%32)).  This means
   1202     * we'd get a mask of 0 when bits is 32.  Special case it.
   1203     *
   1204     * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
   1205     *
   1206     * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1207     *
   1208     *    The result will be undefined if offset or bits is negative, or if the
   1209     *    sum of offset and bits is greater than the number of bits used to
   1210     *    store the operand.
   1211     *
   1212     * Since it's undefined, there are a couple other ways this could be
   1213     * implemented.  The other way that was considered was to put the csel
   1214     * around the whole thing:
   1215     *
   1216     *    final_result = bits == 32 ? insert : ... ;
   1217     */
   1218    base_ir->insert_before(mask);
   1219 
   1220    base_ir->insert_before(assign(mask, csel(equal(bits, c32),
   1221                                             cFFFFFFFF,
   1222                                             lshift(sub(lshift(c1, bits),
   1223                                                        c1->clone(ir, NULL)),
   1224                                                    offset))));
   1225 
   1226    /* (base & ~mask) | ((insert << offset) & mask) */
   1227    ir->operation = ir_binop_bit_or;
   1228    ir->init_num_operands();
   1229    ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
   1230    ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
   1231    ir->operands[2] = NULL;
   1232    ir->operands[3] = NULL;
   1233 
   1234    this->progress = true;
   1235 }
   1236 
   1237 void
   1238 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
   1239 {
   1240    /* For more details, see:
   1241     *
   1242     * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
   1243     */
   1244    ir_constant *c1 =
   1245       new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1246    ir_constant *c2 =
   1247       new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
   1248    ir_constant *c4 =
   1249       new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
   1250    ir_constant *c8 =
   1251       new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
   1252    ir_constant *c16 =
   1253       new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
   1254    ir_constant *c33333333 =
   1255       new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
   1256    ir_constant *c55555555 =
   1257       new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
   1258    ir_constant *c0F0F0F0F =
   1259       new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
   1260    ir_constant *c00FF00FF =
   1261       new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
   1262    ir_variable *temp =
   1263       new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
   1264                           "temp", ir_var_temporary);
   1265    ir_instruction &i = *base_ir;
   1266 
   1267    i.insert_before(temp);
   1268 
   1269    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1270       i.insert_before(assign(temp, ir->operands[0]));
   1271    } else {
   1272       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1273       i.insert_before(assign(temp, i2u(ir->operands[0])));
   1274    }
   1275 
   1276    /* Swap odd and even bits.
   1277     *
   1278     * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
   1279     */
   1280    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
   1281                                        lshift(bit_and(temp, c55555555->clone(ir, NULL)),
   1282                                               c1->clone(ir, NULL)))));
   1283    /* Swap consecutive pairs.
   1284     *
   1285     * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
   1286     */
   1287    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
   1288                                        lshift(bit_and(temp, c33333333->clone(ir, NULL)),
   1289                                               c2->clone(ir, NULL)))));
   1290 
   1291    /* Swap nibbles.
   1292     *
   1293     * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
   1294     */
   1295    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
   1296                                        lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
   1297                                               c4->clone(ir, NULL)))));
   1298 
   1299    /* The last step is, basically, bswap.  Swap the bytes, then swap the
   1300     * words.  When this code is run through GCC on x86, it does generate a
   1301     * bswap instruction.
   1302     *
   1303     * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
   1304     * temp = ( temp >> 16              ) | ( temp                << 16);
   1305     */
   1306    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
   1307                                        lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
   1308                                               c8->clone(ir, NULL)))));
   1309 
   1310    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1311       ir->operation = ir_binop_bit_or;
   1312       ir->init_num_operands();
   1313       ir->operands[0] = rshift(temp, c16);
   1314       ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
   1315    } else {
   1316       ir->operation = ir_unop_u2i;
   1317       ir->init_num_operands();
   1318       ir->operands[0] = bit_or(rshift(temp, c16),
   1319                                lshift(temp, c16->clone(ir, NULL)));
   1320    }
   1321 
   1322    this->progress = true;
   1323 }
   1324 
   1325 void
   1326 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
   1327 {
   1328    /* For more details, see:
   1329     *
   1330     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
   1331     */
   1332    const unsigned elements = ir->operands[0]->type->vector_elements;
   1333    ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
   1334    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
   1335    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
   1336    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
   1337    ir_variable *temp =
   1338       new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
   1339    ir_variable *lsb_only =
   1340       new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
   1341    ir_variable *as_float =
   1342       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
   1343    ir_variable *lsb =
   1344       new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
   1345 
   1346    ir_instruction &i = *base_ir;
   1347 
   1348    i.insert_before(temp);
   1349 
   1350    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
   1351       i.insert_before(assign(temp, ir->operands[0]));
   1352    } else {
   1353       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1354       i.insert_before(assign(temp, u2i(ir->operands[0])));
   1355    }
   1356 
   1357    /* The int-to-float conversion is lossless because (value & -value) is
   1358     * either a power of two or zero.  We don't use the result in the zero
   1359     * case.  The uint() cast is necessary so that 0x80000000 does not
   1360     * generate a negative value.
   1361     *
   1362     * uint lsb_only = uint(value & -value);
   1363     * float as_float = float(lsb_only);
   1364     */
   1365    i.insert_before(lsb_only);
   1366    i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
   1367 
   1368    i.insert_before(as_float);
   1369    i.insert_before(assign(as_float, u2f(lsb_only)));
   1370 
   1371    /* This is basically an open-coded frexp.  Implementations that have a
   1372     * native frexp instruction would be better served by that.  This is
   1373     * optimized versus a full-featured open-coded implementation in two ways:
   1374     *
   1375     * - We don't care about a correct result from subnormal numbers (including
   1376     *   0.0), so the raw exponent can always be safely unbiased.
   1377     *
   1378     * - The value cannot be negative, so it does not need to be masked off to
   1379     *   extract the exponent.
   1380     *
   1381     * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
   1382     */
   1383    i.insert_before(lsb);
   1384    i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
   1385 
   1386    /* Use lsb_only in the comparison instead of temp so that the & (far above)
   1387     * can possibly generate the result without an explicit comparison.
   1388     *
   1389     * (lsb_only == 0) ? -1 : lsb;
   1390     *
   1391     * Since our input values are all integers, the unbiased exponent must not
   1392     * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
   1393     * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
   1394     * better is likely GPU dependent.  Either way, the difference should be
   1395     * small.
   1396     */
   1397    ir->operation = ir_triop_csel;
   1398    ir->init_num_operands();
   1399    ir->operands[0] = equal(lsb_only, c0);
   1400    ir->operands[1] = cminus1;
   1401    ir->operands[2] = new(ir) ir_dereference_variable(lsb);
   1402 
   1403    this->progress = true;
   1404 }
   1405 
   1406 void
   1407 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
   1408 {
   1409    /* For more details, see:
   1410     *
   1411     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
   1412     */
   1413    const unsigned elements = ir->operands[0]->type->vector_elements;
   1414    ir_constant *c0 = new(ir) ir_constant(int(0), elements);
   1415    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
   1416    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
   1417    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
   1418    ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
   1419    ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
   1420    ir_variable *temp =
   1421       new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
   1422    ir_variable *as_float =
   1423       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
   1424    ir_variable *msb =
   1425       new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
   1426 
   1427    ir_instruction &i = *base_ir;
   1428 
   1429    i.insert_before(temp);
   1430 
   1431    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1432       i.insert_before(assign(temp, ir->operands[0]));
   1433    } else {
   1434       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1435 
   1436       /* findMSB(uint(abs(some_int))) almost always does the right thing.
   1437        * There are two problem values:
   1438        *
   1439        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
   1440        *   31.  However, findMSB(int(0x80000000)) == 30.
   1441        *
   1442        * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
   1443        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1444        *
   1445        *    For a value of zero or negative one, -1 will be returned.
   1446        *
   1447        * For all negative number cases, including 0x80000000 and 0xffffffff,
   1448        * the correct value is obtained from findMSB if instead of negating the
   1449        * (already negative) value the logical-not is used.  A conditonal
   1450        * logical-not can be achieved in two instructions.
   1451        */
   1452       ir_variable *as_int =
   1453          new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
   1454       ir_constant *c31 = new(ir) ir_constant(int(31), elements);
   1455 
   1456       i.insert_before(as_int);
   1457       i.insert_before(assign(as_int, ir->operands[0]));
   1458       i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
   1459                                             as_int,
   1460                                             rshift(as_int, c31)))));
   1461    }
   1462 
   1463    /* The int-to-float conversion is lossless because bits are conditionally
   1464     * masked off the bottom of temp to ensure the value has at most 24 bits of
   1465     * data or is zero.  We don't use the result in the zero case.  The uint()
   1466     * cast is necessary so that 0x80000000 does not generate a negative value.
   1467     *
   1468     * float as_float = float(temp > 255 ? temp & ~255 : temp);
   1469     */
   1470    i.insert_before(as_float);
   1471    i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
   1472                                              bit_and(temp, cFFFFFF00),
   1473                                              temp))));
   1474 
   1475    /* This is basically an open-coded frexp.  Implementations that have a
   1476     * native frexp instruction would be better served by that.  This is
   1477     * optimized versus a full-featured open-coded implementation in two ways:
   1478     *
   1479     * - We don't care about a correct result from subnormal numbers (including
   1480     *   0.0), so the raw exponent can always be safely unbiased.
   1481     *
   1482     * - The value cannot be negative, so it does not need to be masked off to
   1483     *   extract the exponent.
   1484     *
   1485     * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
   1486     */
   1487    i.insert_before(msb);
   1488    i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
   1489 
   1490    /* Use msb in the comparison instead of temp so that the subtract can
   1491     * possibly generate the result without an explicit comparison.
   1492     *
   1493     * (msb < 0) ? -1 : msb;
   1494     *
   1495     * Since our input values are all integers, the unbiased exponent must not
   1496     * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
   1497     */
   1498    ir->operation = ir_triop_csel;
   1499    ir->init_num_operands();
   1500    ir->operands[0] = less(msb, c0);
   1501    ir->operands[1] = cminus1;
   1502    ir->operands[2] = new(ir) ir_dereference_variable(msb);
   1503 
   1504    this->progress = true;
   1505 }
   1506 
   1507 ir_expression *
   1508 lower_instructions_visitor::_carry(operand a, operand b)
   1509 {
   1510    if (lowering(CARRY_TO_ARITH))
   1511       return i2u(b2i(less(add(a, b),
   1512                           a.val->clone(ralloc_parent(a.val), NULL))));
   1513    else
   1514       return carry(a, b);
   1515 }
   1516 
   1517 void
   1518 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
   1519 {
   1520    /*   ABCD
   1521     * * EFGH
   1522     * ======
   1523     * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
   1524     *
   1525     * In GLSL, (a * b) becomes
   1526     *
   1527     * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
   1528     * uint m2 = (a & 0x0000ffffu) * (b >> 16);
   1529     * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
   1530     * uint m4 = (a >> 16)         * (b >> 16);
   1531     *
   1532     * uint c1;
   1533     * uint c2;
   1534     * uint lo_result;
   1535     * uint hi_result;
   1536     *
   1537     * lo_result = uaddCarry(m1, m2 << 16, c1);
   1538     * hi_result = m4 + c1;
   1539     * lo_result = uaddCarry(lo_result, m3 << 16, c2);
   1540     * hi_result = hi_result + c2;
   1541     * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
   1542     */
   1543    const unsigned elements = ir->operands[0]->type->vector_elements;
   1544    ir_variable *src1 =
   1545       new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary);
   1546    ir_variable *src1h =
   1547       new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary);
   1548    ir_variable *src1l =
   1549       new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary);
   1550    ir_variable *src2 =
   1551       new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary);
   1552    ir_variable *src2h =
   1553       new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary);
   1554    ir_variable *src2l =
   1555       new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary);
   1556    ir_variable *t1 =
   1557       new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary);
   1558    ir_variable *t2 =
   1559       new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary);
   1560    ir_variable *lo =
   1561       new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary);
   1562    ir_variable *hi =
   1563       new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary);
   1564    ir_variable *different_signs = NULL;
   1565    ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
   1566    ir_constant *c16 = new(ir) ir_constant(16u, elements);
   1567 
   1568    ir_instruction &i = *base_ir;
   1569 
   1570    i.insert_before(src1);
   1571    i.insert_before(src2);
   1572    i.insert_before(src1h);
   1573    i.insert_before(src2h);
   1574    i.insert_before(src1l);
   1575    i.insert_before(src2l);
   1576 
   1577    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1578       i.insert_before(assign(src1, ir->operands[0]));
   1579       i.insert_before(assign(src2, ir->operands[1]));
   1580    } else {
   1581       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1582 
   1583       ir_variable *itmp1 =
   1584          new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary);
   1585       ir_variable *itmp2 =
   1586          new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary);
   1587       ir_constant *c0 = new(ir) ir_constant(int(0), elements);
   1588 
   1589       i.insert_before(itmp1);
   1590       i.insert_before(itmp2);
   1591       i.insert_before(assign(itmp1, ir->operands[0]));
   1592       i.insert_before(assign(itmp2, ir->operands[1]));
   1593 
   1594       different_signs =
   1595          new(ir) ir_variable(glsl_type::bvec(elements), "different_signs",
   1596                              ir_var_temporary);
   1597 
   1598       i.insert_before(different_signs);
   1599       i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
   1600                                                    less(itmp1, c0),
   1601                                                    less(itmp2, c0->clone(ir, NULL)))));
   1602 
   1603       i.insert_before(assign(src1, i2u(abs(itmp1))));
   1604       i.insert_before(assign(src2, i2u(abs(itmp2))));
   1605    }
   1606 
   1607    i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
   1608    i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
   1609    i.insert_before(assign(src1h, rshift(src1, c16)));
   1610    i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
   1611 
   1612    i.insert_before(lo);
   1613    i.insert_before(hi);
   1614    i.insert_before(t1);
   1615    i.insert_before(t2);
   1616 
   1617    i.insert_before(assign(lo, mul(src1l, src2l)));
   1618    i.insert_before(assign(t1, mul(src1l, src2h)));
   1619    i.insert_before(assign(t2, mul(src1h, src2l)));
   1620    i.insert_before(assign(hi, mul(src1h, src2h)));
   1621 
   1622    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
   1623    i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
   1624 
   1625    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
   1626    i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
   1627 
   1628    if (different_signs == NULL) {
   1629       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1630 
   1631       ir->operation = ir_binop_add;
   1632       ir->init_num_operands();
   1633       ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
   1634       ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
   1635    } else {
   1636       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1637 
   1638       i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
   1639                                      rshift(t2, c16->clone(ir, NULL)))));
   1640 
   1641       /* For channels where different_signs is set we have to perform a 64-bit
   1642        * negation.  This is *not* the same as just negating the high 32-bits.
   1643        * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
   1644        * -1, not -0!  Recall -x == ~x + 1.
   1645        */
   1646       ir_variable *neg_hi =
   1647          new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary);
   1648       ir_constant *c1 = new(ir) ir_constant(1u, elements);
   1649 
   1650       i.insert_before(neg_hi);
   1651       i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
   1652                                          u2i(_carry(bit_not(lo), c1)))));
   1653 
   1654       ir->operation = ir_triop_csel;
   1655       ir->init_num_operands();
   1656       ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
   1657       ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
   1658       ir->operands[2] = u2i(hi);
   1659    }
   1660 }
   1661 
   1662 void
   1663 lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
   1664 {
   1665    ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]);
   1666    this->progress = true;
   1667 }
   1668 
   1669 ir_visitor_status
   1670 lower_instructions_visitor::visit_leave(ir_expression *ir)
   1671 {
   1672    switch (ir->operation) {
   1673    case ir_binop_dot:
   1674       if (ir->operands[0]->type->is_double())
   1675          double_dot_to_fma(ir);
   1676       break;
   1677    case ir_triop_lrp:
   1678       if (ir->operands[0]->type->is_double())
   1679          double_lrp(ir);
   1680       break;
   1681    case ir_binop_sub:
   1682       if (lowering(SUB_TO_ADD_NEG))
   1683 	 sub_to_add_neg(ir);
   1684       break;
   1685 
   1686    case ir_binop_div:
   1687       if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
   1688 	 int_div_to_mul_rcp(ir);
   1689       else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
   1690                (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
   1691 	 div_to_mul_rcp(ir);
   1692       break;
   1693 
   1694    case ir_unop_exp:
   1695       if (lowering(EXP_TO_EXP2))
   1696 	 exp_to_exp2(ir);
   1697       break;
   1698 
   1699    case ir_unop_log:
   1700       if (lowering(LOG_TO_LOG2))
   1701 	 log_to_log2(ir);
   1702       break;
   1703 
   1704    case ir_binop_mod:
   1705       if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
   1706 	 mod_to_floor(ir);
   1707       break;
   1708 
   1709    case ir_binop_pow:
   1710       if (lowering(POW_TO_EXP2))
   1711 	 pow_to_exp2(ir);
   1712       break;
   1713 
   1714    case ir_binop_ldexp:
   1715       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
   1716          ldexp_to_arith(ir);
   1717       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
   1718          dldexp_to_arith(ir);
   1719       break;
   1720 
   1721    case ir_unop_frexp_exp:
   1722       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
   1723          dfrexp_exp_to_arith(ir);
   1724       break;
   1725 
   1726    case ir_unop_frexp_sig:
   1727       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
   1728          dfrexp_sig_to_arith(ir);
   1729       break;
   1730 
   1731    case ir_binop_carry:
   1732       if (lowering(CARRY_TO_ARITH))
   1733          carry_to_arith(ir);
   1734       break;
   1735 
   1736    case ir_binop_borrow:
   1737       if (lowering(BORROW_TO_ARITH))
   1738          borrow_to_arith(ir);
   1739       break;
   1740 
   1741    case ir_unop_saturate:
   1742       if (lowering(SAT_TO_CLAMP))
   1743          sat_to_clamp(ir);
   1744       break;
   1745 
   1746    case ir_unop_trunc:
   1747       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1748          dtrunc_to_dfrac(ir);
   1749       break;
   1750 
   1751    case ir_unop_ceil:
   1752       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1753          dceil_to_dfrac(ir);
   1754       break;
   1755 
   1756    case ir_unop_floor:
   1757       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1758          dfloor_to_dfrac(ir);
   1759       break;
   1760 
   1761    case ir_unop_round_even:
   1762       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1763          dround_even_to_dfrac(ir);
   1764       break;
   1765 
   1766    case ir_unop_sign:
   1767       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1768          dsign_to_csel(ir);
   1769       break;
   1770 
   1771    case ir_unop_bit_count:
   1772       if (lowering(BIT_COUNT_TO_MATH))
   1773          bit_count_to_math(ir);
   1774       break;
   1775 
   1776    case ir_triop_bitfield_extract:
   1777       if (lowering(EXTRACT_TO_SHIFTS))
   1778          extract_to_shifts(ir);
   1779       break;
   1780 
   1781    case ir_quadop_bitfield_insert:
   1782       if (lowering(INSERT_TO_SHIFTS))
   1783          insert_to_shifts(ir);
   1784       break;
   1785 
   1786    case ir_unop_bitfield_reverse:
   1787       if (lowering(REVERSE_TO_SHIFTS))
   1788          reverse_to_shifts(ir);
   1789       break;
   1790 
   1791    case ir_unop_find_lsb:
   1792       if (lowering(FIND_LSB_TO_FLOAT_CAST))
   1793          find_lsb_to_float_cast(ir);
   1794       break;
   1795 
   1796    case ir_unop_find_msb:
   1797       if (lowering(FIND_MSB_TO_FLOAT_CAST))
   1798          find_msb_to_float_cast(ir);
   1799       break;
   1800 
   1801    case ir_binop_imul_high:
   1802       if (lowering(IMUL_HIGH_TO_MUL))
   1803          imul_high_to_mul(ir);
   1804       break;
   1805 
   1806    case ir_unop_rsq:
   1807    case ir_unop_sqrt:
   1808       if (lowering(SQRT_TO_ABS_SQRT))
   1809          sqrt_to_abs_sqrt(ir);
   1810       break;
   1811 
   1812    default:
   1813       return visit_continue;
   1814    }
   1815 
   1816    return visit_continue;
   1817 }
   1818