Home | History | Annotate | Download | only in glsl
      1 /*
 * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  */
     23 
     24 /**
     25  * \file lower_instructions.cpp
     26  *
     27  * Many GPUs lack native instructions for certain expression operations, and
     28  * must replace them with some other expression tree.  This pass lowers some
     29  * of the most common cases, allowing the lowering code to be implemented once
     30  * rather than in each driver backend.
     31  *
     32  * Currently supported transformations:
     33  * - SUB_TO_ADD_NEG
     34  * - DIV_TO_MUL_RCP
     35  * - INT_DIV_TO_MUL_RCP
     36  * - EXP_TO_EXP2
     37  * - POW_TO_EXP2
     38  * - LOG_TO_LOG2
     39  * - MOD_TO_FLOOR
     40  * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
     42  * - CARRY_TO_ARITH
     43  * - BORROW_TO_ARITH
     44  * - SAT_TO_CLAMP
     45  * - DOPS_TO_DFRAC
     46  *
     47  * SUB_TO_ADD_NEG:
     48  * ---------------
     49  * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
     50  *
     51  * This simplifies expression reassociation, and for many backends
     52  * there is no subtract operation separate from adding the negation.
     53  * For backends with native subtract operations, they will probably
     54  * want to recognize add(op0, neg(op1)) or the other way around to
     55  * produce a subtract anyway.
     56  *
     57  * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
     58  * ---------------------------------------------------------
     59  * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
     60  *
     61  * Many GPUs don't have a divide instruction (945 and 965 included),
     62  * but they do have an RCP instruction to compute an approximate
     63  * reciprocal.  By breaking the operation down, constant reciprocals
     64  * can get constant folded.
     65  *
     66  * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
     67  * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
     68  * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
     69  * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
     70  * point so that RCP is possible.
     71  *
     72  * EXP_TO_EXP2 and LOG_TO_LOG2:
     73  * ----------------------------
     74  * Many GPUs don't have a base e log or exponent instruction, but they
     75  * do have base 2 versions, so this pass converts exp and log to exp2
     76  * and log2 operations.
     77  *
     78  * POW_TO_EXP2:
     79  * -----------
     80  * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
     81  * x**y to 2**(y * log2(x)).
     82  *
     83  * MOD_TO_FLOOR:
     84  * -------------
     85  * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
     86  *
     87  * Many GPUs don't have a MOD instruction (945 and 965 included), and
     88  * if we have to break it down like this anyway, it gives an
     89  * opportunity to do things like constant fold the (1.0 / op1) easily.
     90  *
 * Note: before we used to implement this as op1 * fract(op0 / op1) but this
     92  * implementation had significant precision errors.
     93  *
     94  * LDEXP_TO_ARITH:
     95  * -------------
     96  * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
     97  *
     98  * DFREXP_DLDEXP_TO_ARITH:
     99  * ---------------
    100  * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
    101  * arithmetic and bit ops for double arguments.
    102  *
    103  * CARRY_TO_ARITH:
    104  * ---------------
 * Converts ir_binop_carry into (x + y) < x.
    106  *
    107  * BORROW_TO_ARITH:
    108  * ----------------
 * Converts ir_binop_borrow into (x < y).
    110  *
    111  * SAT_TO_CLAMP:
    112  * -------------
    113  * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
    114  *
    115  * DOPS_TO_DFRAC:
    116  * --------------
    117  * Converts double trunc, ceil, floor, round to fract
    118  */
    119 
    120 #include "c99_math.h"
    121 #include "program/prog_instruction.h" /* for swizzle */
    122 #include "compiler/glsl_types.h"
    123 #include "ir.h"
    124 #include "ir_builder.h"
    125 #include "ir_optimization.h"
    126 
    127 using namespace ir_builder;
    128 
namespace {

/**
 * Hierarchical IR visitor that performs the instruction lowerings selected
 * by a caller-supplied bitfield.
 *
 * The private helpers defined in this file each rewrite one ir_expression in
 * place (changing its operation and operands) and set \c progress to true.
 */
class lower_instructions_visitor : public ir_hierarchical_visitor {
public:
   /**
    * \param lower  Bitfield of which operations to lower; it is tested
    *               through the lowering() macro.
    */
   lower_instructions_visitor(unsigned lower)
      : progress(false), lower(lower) { }

   /* NOTE(review): the definition is outside this excerpt; presumably it
    * dispatches to the helpers below based on the expression's operation
    * and the lowering mask -- confirm against the definition.
    */
   ir_visitor_status visit_leave(ir_expression *);

   /** Starts out false; set to true by every helper that rewrites IR. */
   bool progress;

private:
   unsigned lower; /**< Bitfield of which operations to lower */

   /* Arithmetic and transcendental lowerings. */
   void sub_to_add_neg(ir_expression *);
   void div_to_mul_rcp(ir_expression *);
   void int_div_to_mul_rcp(ir_expression *);
   void mod_to_floor(ir_expression *);
   void exp_to_exp2(ir_expression *);
   void pow_to_exp2(ir_expression *);
   void log_to_log2(ir_expression *);

   /* ldexp / frexp lowerings (float and double variants). */
   void ldexp_to_arith(ir_expression *);
   void dldexp_to_arith(ir_expression *);
   void dfrexp_sig_to_arith(ir_expression *);
   void dfrexp_exp_to_arith(ir_expression *);

   /* Carry, borrow and saturate lowerings. */
   void carry_to_arith(ir_expression *);
   void borrow_to_arith(ir_expression *);
   void sat_to_clamp(ir_expression *);

   /* Helpers whose definitions are not part of this excerpt. */
   void double_dot_to_fma(ir_expression *);
   void double_lrp(ir_expression *);
   void dceil_to_dfrac(ir_expression *);
   void dfloor_to_dfrac(ir_expression *);
   void dround_even_to_dfrac(ir_expression *);
   void dtrunc_to_dfrac(ir_expression *);
   void dsign_to_csel(ir_expression *);
   void bit_count_to_math(ir_expression *);
   void extract_to_shifts(ir_expression *);
   void insert_to_shifts(ir_expression *);
   void reverse_to_shifts(ir_expression *ir);
   void find_lsb_to_float_cast(ir_expression *ir);
   void find_msb_to_float_cast(ir_expression *ir);
   void imul_high_to_mul(ir_expression *ir);

   ir_expression *_carry(operand a, operand b);
};

} /* anonymous namespace */
    176 
    177 /**
    178  * Determine if a particular type of lowering should occur
    179  */
    180 #define lowering(x) (this->lower & x)
    181 
    182 bool
    183 lower_instructions(exec_list *instructions, unsigned what_to_lower)
    184 {
    185    lower_instructions_visitor v(what_to_lower);
    186 
    187    visit_list_elements(&v, instructions);
    188    return v.progress;
    189 }
    190 
    191 void
    192 lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
    193 {
    194    ir->operation = ir_binop_add;
    195    ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
    196 					   ir->operands[1], NULL);
    197    this->progress = true;
    198 }
    199 
    200 void
    201 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
    202 {
    203    assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
    204 
    205    /* New expression for the 1.0 / op1 */
    206    ir_rvalue *expr;
    207    expr = new(ir) ir_expression(ir_unop_rcp,
    208 				ir->operands[1]->type,
    209 				ir->operands[1]);
    210 
    211    /* op0 / op1 -> op0 * (1.0 / op1) */
    212    ir->operation = ir_binop_mul;
    213    ir->operands[1] = expr;
    214 
    215    this->progress = true;
    216 }
    217 
    218 void
    219 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
    220 {
    221    assert(ir->operands[1]->type->is_integer());
    222 
    223    /* Be careful with integer division -- we need to do it as a
    224     * float and re-truncate, since rcp(n > 1) of an integer would
    225     * just be 0.
    226     */
    227    ir_rvalue *op0, *op1;
    228    const struct glsl_type *vec_type;
    229 
    230    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    231 				      ir->operands[1]->type->vector_elements,
    232 				      ir->operands[1]->type->matrix_columns);
    233 
    234    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
    235       op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
    236    else
    237       op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
    238 
    239    op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
    240 
    241    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    242 				      ir->operands[0]->type->vector_elements,
    243 				      ir->operands[0]->type->matrix_columns);
    244 
    245    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
    246       op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
    247    else
    248       op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
    249 
    250    vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    251 				      ir->type->vector_elements,
    252 				      ir->type->matrix_columns);
    253 
    254    op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
    255 
    256    if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
    257       ir->operation = ir_unop_f2i;
    258       ir->operands[0] = op0;
    259    } else {
    260       ir->operation = ir_unop_i2u;
    261       ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
    262    }
    263    ir->operands[1] = NULL;
    264 
    265    this->progress = true;
    266 }
    267 
    268 void
    269 lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
    270 {
    271    ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
    272 
    273    ir->operation = ir_unop_exp2;
    274    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
    275 					   ir->operands[0], log2_e);
    276    this->progress = true;
    277 }
    278 
    279 void
    280 lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
    281 {
    282    ir_expression *const log2_x =
    283       new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
    284 			    ir->operands[0]);
    285 
    286    ir->operation = ir_unop_exp2;
    287    ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
    288 					   ir->operands[1], log2_x);
    289    ir->operands[1] = NULL;
    290    this->progress = true;
    291 }
    292 
    293 void
    294 lower_instructions_visitor::log_to_log2(ir_expression *ir)
    295 {
    296    ir->operation = ir_binop_mul;
    297    ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
    298 					   ir->operands[0], NULL);
    299    ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
    300    this->progress = true;
    301 }
    302 
    303 void
    304 lower_instructions_visitor::mod_to_floor(ir_expression *ir)
    305 {
    306    ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
    307                                          ir_var_temporary);
    308    ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
    309                                          ir_var_temporary);
    310    this->base_ir->insert_before(x);
    311    this->base_ir->insert_before(y);
    312 
    313    ir_assignment *const assign_x =
    314       new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
    315                             ir->operands[0], NULL);
    316    ir_assignment *const assign_y =
    317       new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
    318                             ir->operands[1], NULL);
    319 
    320    this->base_ir->insert_before(assign_x);
    321    this->base_ir->insert_before(assign_y);
    322 
    323    ir_expression *const div_expr =
    324       new(ir) ir_expression(ir_binop_div, x->type,
    325                             new(ir) ir_dereference_variable(x),
    326                             new(ir) ir_dereference_variable(y));
    327 
    328    /* Don't generate new IR that would need to be lowered in an additional
    329     * pass.
    330     */
    331    if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
    332        (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
    333       div_to_mul_rcp(div_expr);
    334 
    335    ir_expression *const floor_expr =
    336       new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
    337 
    338    if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
    339       dfloor_to_dfrac(floor_expr);
    340 
    341    ir_expression *const mul_expr =
    342       new(ir) ir_expression(ir_binop_mul,
    343                             new(ir) ir_dereference_variable(y),
    344                             floor_expr);
    345 
    346    ir->operation = ir_binop_sub;
    347    ir->operands[0] = new(ir) ir_dereference_variable(x);
    348    ir->operands[1] = mul_expr;
    349    this->progress = true;
    350 }
    351 
    352 void
    353 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
    354 {
    355    /* Translates
    356     *    ir_binop_ldexp x exp
    357     * into
    358     *
    359     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    360     *    resulting_biased_exp = extracted_biased_exp + exp;
    361     *
    362     *    if (resulting_biased_exp < 1 || x == 0.0f) {
    363     *       return copysign(0.0, x);
    364     *    }
    365     *
    366     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    367     *                       lshift(i2u(resulting_biased_exp), exp_shift));
    368     *
    369     * which we can't actually implement as such, since the GLSL IR doesn't
    370     * have vectorized if-statements. We actually implement it without branches
    371     * using conditional-select:
    372     *
    373     *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    374     *    resulting_biased_exp = extracted_biased_exp + exp;
    375     *
    376     *    is_not_zero_or_underflow = logic_and(nequal(x, 0.0f),
    377     *                                         gequal(resulting_biased_exp, 1);
    378     *    x = csel(is_not_zero_or_underflow, x, copysign(0.0f, x));
    379     *    resulting_biased_exp = csel(is_not_zero_or_underflow,
    380     *                                resulting_biased_exp, 0);
    381     *
    382     *    return bitcast_u2f((bitcast_f2u(x) & sign_mantissa_mask) |
    383     *                       lshift(i2u(resulting_biased_exp), exp_shift));
    384     */
    385 
    386    const unsigned vec_elem = ir->type->vector_elements;
    387 
    388    /* Types */
    389    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
    390    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    391 
    392    /* Constants */
    393    ir_constant *zeroi = ir_constant::zero(ir, ivec);
    394 
    395    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
    396 
    397    ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
    398 
    399    /* Temporary variables */
    400    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
    401    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
    402 
    403    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
    404                                                   ir_var_temporary);
    405 
    406    ir_variable *extracted_biased_exp =
    407       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
    408    ir_variable *resulting_biased_exp =
    409       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
    410 
    411    ir_variable *is_not_zero_or_underflow =
    412       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
    413 
    414    ir_instruction &i = *base_ir;
    415 
    416    /* Copy <x> and <exp> arguments. */
    417    i.insert_before(x);
    418    i.insert_before(assign(x, ir->operands[0]));
    419    i.insert_before(exp);
    420    i.insert_before(assign(exp, ir->operands[1]));
    421 
    422    /* Extract the biased exponent from <x>. */
    423    i.insert_before(extracted_biased_exp);
    424    i.insert_before(assign(extracted_biased_exp,
    425                           rshift(bitcast_f2i(abs(x)), exp_shift)));
    426 
    427    i.insert_before(resulting_biased_exp);
    428    i.insert_before(assign(resulting_biased_exp,
    429                           add(extracted_biased_exp, exp)));
    430 
    431    /* Test if result is 0.0, subnormal, or underflow by checking if the
    432     * resulting biased exponent would be less than 0x1. If so, the result is
    433     * 0.0 with the sign of x. (Actually, invert the conditions so that
    434     * immediate values are the second arguments, which is better for i965)
    435     */
    436    i.insert_before(zero_sign_x);
    437    i.insert_before(assign(zero_sign_x,
    438                           bitcast_u2f(bit_and(bitcast_f2u(x), sign_mask))));
    439 
    440    i.insert_before(is_not_zero_or_underflow);
    441    i.insert_before(assign(is_not_zero_or_underflow,
    442                           logic_and(nequal(x, new(ir) ir_constant(0.0f, vec_elem)),
    443                                     gequal(resulting_biased_exp,
    444                                            new(ir) ir_constant(0x1, vec_elem)))));
    445    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
    446                                   x, zero_sign_x)));
    447    i.insert_before(assign(resulting_biased_exp,
    448                           csel(is_not_zero_or_underflow,
    449                                resulting_biased_exp, zeroi)));
    450 
    451    /* We could test for overflows by checking if the resulting biased exponent
    452     * would be greater than 0xFE. Turns out we don't need to because the GLSL
    453     * spec says:
    454     *
    455     *    "If this product is too large to be represented in the
    456     *     floating-point type, the result is undefined."
    457     */
    458 
    459    ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
    460 
    461    /* Don't generate new IR that would need to be lowered in an additional
    462     * pass.
    463     */
    464    if (!lowering(INSERT_TO_SHIFTS)) {
    465       ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
    466       ir->operation = ir_unop_bitcast_i2f;
    467       ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
    468                                         exp_shift_clone, exp_width);
    469       ir->operands[1] = NULL;
    470    } else {
    471       ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x807fffffu, vec_elem);
    472       ir->operation = ir_unop_bitcast_u2f;
    473       ir->operands[0] = bit_or(bit_and(bitcast_f2u(x), sign_mantissa_mask),
    474                                lshift(i2u(resulting_biased_exp), exp_shift_clone));
    475    }
    476 
    477    this->progress = true;
    478 }
    479 
    480 void
    481 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
    482 {
    483    /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
    484     * from the significand.
    485     */
    486 
    487    const unsigned vec_elem = ir->type->vector_elements;
    488 
    489    /* Types */
    490    const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
    491    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    492 
    493    /* Constants */
    494    ir_constant *zeroi = ir_constant::zero(ir, ivec);
    495 
    496    ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
    497 
    498    ir_constant *exp_shift = new(ir) ir_constant(20u);
    499    ir_constant *exp_width = new(ir) ir_constant(11u);
    500    ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
    501 
    502    /* Temporary variables */
    503    ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
    504    ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
    505 
    506    ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
    507                                                   ir_var_temporary);
    508 
    509    ir_variable *extracted_biased_exp =
    510       new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
    511    ir_variable *resulting_biased_exp =
    512       new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
    513 
    514    ir_variable *is_not_zero_or_underflow =
    515       new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
    516 
    517    ir_instruction &i = *base_ir;
    518 
    519    /* Copy <x> and <exp> arguments. */
    520    i.insert_before(x);
    521    i.insert_before(assign(x, ir->operands[0]));
    522    i.insert_before(exp);
    523    i.insert_before(assign(exp, ir->operands[1]));
    524 
    525    ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
    526    if (lowering(DFREXP_DLDEXP_TO_ARITH))
    527       dfrexp_exp_to_arith(frexp_exp);
    528 
    529    /* Extract the biased exponent from <x>. */
    530    i.insert_before(extracted_biased_exp);
    531    i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
    532 
    533    i.insert_before(resulting_biased_exp);
    534    i.insert_before(assign(resulting_biased_exp,
    535                           add(extracted_biased_exp, exp)));
    536 
    537    /* Test if result is 0.0, subnormal, or underflow by checking if the
    538     * resulting biased exponent would be less than 0x1. If so, the result is
    539     * 0.0 with the sign of x. (Actually, invert the conditions so that
    540     * immediate values are the second arguments, which is better for i965)
    541     * TODO: Implement in a vector fashion.
    542     */
    543    i.insert_before(zero_sign_x);
    544    for (unsigned elem = 0; elem < vec_elem; elem++) {
    545       ir_variable *unpacked =
    546          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
    547       i.insert_before(unpacked);
    548       i.insert_before(
    549             assign(unpacked,
    550                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
    551       i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
    552                              WRITEMASK_Y));
    553       i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
    554       i.insert_before(assign(zero_sign_x,
    555                              expr(ir_unop_pack_double_2x32, unpacked),
    556                              1 << elem));
    557    }
    558    i.insert_before(is_not_zero_or_underflow);
    559    i.insert_before(assign(is_not_zero_or_underflow,
    560                           gequal(resulting_biased_exp,
    561                                   new(ir) ir_constant(0x1, vec_elem))));
    562    i.insert_before(assign(x, csel(is_not_zero_or_underflow,
    563                                   x, zero_sign_x)));
    564    i.insert_before(assign(resulting_biased_exp,
    565                           csel(is_not_zero_or_underflow,
    566                                resulting_biased_exp, zeroi)));
    567 
    568    /* We could test for overflows by checking if the resulting biased exponent
    569     * would be greater than 0xFE. Turns out we don't need to because the GLSL
    570     * spec says:
    571     *
    572     *    "If this product is too large to be represented in the
    573     *     floating-point type, the result is undefined."
    574     */
    575 
    576    ir_rvalue *results[4] = {NULL};
    577    for (unsigned elem = 0; elem < vec_elem; elem++) {
    578       ir_variable *unpacked =
    579          new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
    580       i.insert_before(unpacked);
    581       i.insert_before(
    582             assign(unpacked,
    583                    expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
    584 
    585       ir_expression *bfi = bitfield_insert(
    586             swizzle_y(unpacked),
    587             i2u(swizzle(resulting_biased_exp, elem, 1)),
    588             exp_shift->clone(ir, NULL),
    589             exp_width->clone(ir, NULL));
    590 
    591       i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
    592 
    593       results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
    594    }
    595 
    596    ir->operation = ir_quadop_vector;
    597    ir->operands[0] = results[0];
    598    ir->operands[1] = results[1];
    599    ir->operands[2] = results[2];
    600    ir->operands[3] = results[3];
    601 
    602    /* Don't generate new IR that would need to be lowered in an additional
    603     * pass.
    604     */
    605 
    606    this->progress = true;
    607 }
    608 
/**
 * Lower a double-precision ir_unop_frexp_sig to bit operations.
 *
 * For each component the double is unpacked into two 32-bit words, the
 * exponent bits of the high word are replaced, and the words are packed
 * again.  The supporting instructions are emitted before the current base
 * instruction and \p ir is rewritten in place into an ir_quadop_vector of
 * the re-packed components.
 */
void
lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
{
   const unsigned vec_elem = ir->type->vector_elements;
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Double-precision floating-point values are stored as
    *   1 sign bit;
    *   11 exponent bits;
    *   52 mantissa bits.
    *
    * We're just extracting the significand here, so we only need to modify
    * the upper 32-bit uint. Unfortunately we must extract each double
    * independently as there is no vector version of unpackDouble.
    */

   ir_instruction &i = *base_ir;

   ir_variable *is_not_zero =
      new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
   /* Entries beyond vec_elem stay NULL. */
   ir_rvalue *results[4] = {NULL};

   /* is_not_zero = abs(x) != 0.0, computed on a clone of the operand. */
   ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
   i.insert_before(is_not_zero);
   i.insert_before(
         assign(is_not_zero,
                nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));

   /* TODO: Remake this as more vector-friendly when int64 support is
    * available.
    */
   for (unsigned elem = 0; elem < vec_elem; elem++) {
      ir_constant *zero = new(ir) ir_constant(0u, 1);
      /* Keeps the sign bit and the mantissa bits of the high word while
       * clearing the exponent bits.
       */
      ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);

      /* Exponent of double floating-point values in the range [0.5, 1.0). */
      ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);

      ir_variable *bits =
         new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
      ir_variable *unpacked =
         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);

      /* Each component works on its own clone of the operand. */
      ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);

      i.insert_before(bits);
      i.insert_before(unpacked);
      i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));

      /* Manipulate the high uint to remove the exponent and replace it with
       * either the default exponent or zero.
       */
      i.insert_before(assign(bits, swizzle_y(unpacked)));
      i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
      i.insert_before(assign(bits, bit_or(bits,
                                          csel(swizzle(is_not_zero, elem, 1),
                                               exponent_value,
                                               zero))));
      /* Only the high word (.y) is written back; the low word is unchanged. */
      i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
      results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
   }

   /* Put the dvec back together */
   ir->operation = ir_quadop_vector;
   ir->operands[0] = results[0];
   ir->operands[1] = results[1];
   ir->operands[2] = results[2];
   ir->operands[3] = results[3];

   this->progress = true;
}
    680 
    681 void
    682 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
    683 {
    684    const unsigned vec_elem = ir->type->vector_elements;
    685    const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
    686    const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
    687 
    688    /* Double-precision floating-point values are stored as
    689     *   1 sign bit;
    690     *   11 exponent bits;
    691     *   52 mantissa bits.
    692     *
    693     * We're just extracting the exponent here, so we only care about the upper
    694     * 32-bit uint.
    695     */
    696 
    697    ir_instruction &i = *base_ir;
    698 
    699    ir_variable *is_not_zero =
    700       new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
    701    ir_variable *high_words =
    702       new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
    703    ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
    704    ir_constant *izero = new(ir) ir_constant(0, vec_elem);
    705 
    706    ir_rvalue *absval = abs(ir->operands[0]);
    707 
    708    i.insert_before(is_not_zero);
    709    i.insert_before(high_words);
    710    i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
    711 
    712    /* Extract all of the upper uints. */
    713    for (unsigned elem = 0; elem < vec_elem; elem++) {
    714       ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
    715 
    716       i.insert_before(assign(high_words,
    717                              swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
    718                              1 << elem));
    719 
    720    }
    721    ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
    722    ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
    723 
    724    /* For non-zero inputs, shift the exponent down and apply bias. */
    725    ir->operation = ir_triop_csel;
    726    ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
    727    ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
    728    ir->operands[2] = izero;
    729 
    730    this->progress = true;
    731 }
    732 
    733 void
    734 lower_instructions_visitor::carry_to_arith(ir_expression *ir)
    735 {
    736    /* Translates
    737     *   ir_binop_carry x y
    738     * into
    739     *   sum = ir_binop_add x y
    740     *   bcarry = ir_binop_less sum x
    741     *   carry = ir_unop_b2i bcarry
    742     */
    743 
    744    ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
    745    ir->operation = ir_unop_i2u;
    746    ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
    747    ir->operands[1] = NULL;
    748 
    749    this->progress = true;
    750 }
    751 
    752 void
    753 lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
    754 {
    755    /* Translates
    756     *   ir_binop_borrow x y
    757     * into
    758     *   bcarry = ir_binop_less x y
    759     *   carry = ir_unop_b2i bcarry
    760     */
    761 
    762    ir->operation = ir_unop_i2u;
    763    ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
    764    ir->operands[1] = NULL;
    765 
    766    this->progress = true;
    767 }
    768 
    769 void
    770 lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
    771 {
    772    /* Translates
    773     *   ir_unop_saturate x
    774     * into
    775     *   ir_binop_min (ir_binop_max(x, 0.0), 1.0)
    776     */
    777 
    778    ir->operation = ir_binop_min;
    779    ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
    780                                            ir->operands[0],
    781                                            new(ir) ir_constant(0.0f));
    782    ir->operands[1] = new(ir) ir_constant(1.0f);
    783 
    784    this->progress = true;
    785 }
    786 
    787 void
    788 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
    789 {
    790    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
    791 					   ir_var_temporary);
    792    this->base_ir->insert_before(temp);
    793 
    794    int nc = ir->operands[0]->type->components();
    795    for (int i = nc - 1; i >= 1; i--) {
    796       ir_assignment *assig;
    797       if (i == (nc - 1)) {
    798          assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
    799                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
    800       } else {
    801          assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
    802                                   swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
    803                                   temp));
    804       }
    805       this->base_ir->insert_before(assig);
    806    }
    807 
    808    ir->operation = ir_triop_fma;
    809    ir->operands[0] = swizzle(ir->operands[0], 0, 1);
    810    ir->operands[1] = swizzle(ir->operands[1], 0, 1);
    811    ir->operands[2] = new(ir) ir_dereference_variable(temp);
    812 
    813    this->progress = true;
    814 
    815 }
    816 
    817 void
    818 lower_instructions_visitor::double_lrp(ir_expression *ir)
    819 {
    820    int swizval;
    821    ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
    822    ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
    823 
    824    switch (op2->type->vector_elements) {
    825    case 1:
    826       swizval = SWIZZLE_XXXX;
    827       break;
    828    default:
    829       assert(op0->type->vector_elements == op2->type->vector_elements);
    830       swizval = SWIZZLE_XYZW;
    831       break;
    832    }
    833 
    834    ir->operation = ir_triop_fma;
    835    ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
    836    ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
    837 
    838    this->progress = true;
    839 }
    840 
    841 void
    842 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
    843 {
    844    /*
    845     * frtemp = frac(x);
    846     * temp = sub(x, frtemp);
    847     * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
    848     */
    849    ir_instruction &i = *base_ir;
    850    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
    851    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
    852    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
    853                                              ir_var_temporary);
    854 
    855    i.insert_before(frtemp);
    856    i.insert_before(assign(frtemp, fract(ir->operands[0])));
    857 
    858    ir->operation = ir_binop_add;
    859    ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
    860    ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
    861 
    862    this->progress = true;
    863 }
    864 
    865 void
    866 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
    867 {
    868    /*
    869     * frtemp = frac(x);
    870     * result = sub(x, frtemp);
    871     */
    872    ir->operation = ir_binop_sub;
    873    ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
    874 
    875    this->progress = true;
    876 }
    877 void
    878 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
    879 {
    880    /*
    881     * insane but works
    882     * temp = x + 0.5;
    883     * frtemp = frac(temp);
    884     * t2 = sub(temp, frtemp);
    885     * if (frac(x) == 0.5)
    886     *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
    887     *  else
    888     *     result = t2;
    889 
    890     */
    891    ir_instruction &i = *base_ir;
    892    ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
    893                                              ir_var_temporary);
    894    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
    895                                            ir_var_temporary);
    896    ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
    897                                            ir_var_temporary);
    898    ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
    899    ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
    900    ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
    901 
    902    i.insert_before(temp);
    903    i.insert_before(assign(temp, add(ir->operands[0], p5)));
    904 
    905    i.insert_before(frtemp);
    906    i.insert_before(assign(frtemp, fract(temp)));
    907 
    908    i.insert_before(t2);
    909    i.insert_before(assign(t2, sub(temp, frtemp)));
    910 
    911    ir->operation = ir_triop_csel;
    912    ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
    913                            p5->clone(ir, NULL));
    914    ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
    915                                 zero),
    916                           t2,
    917                           sub(t2, one));
    918    ir->operands[2] = new(ir) ir_dereference_variable(t2);
    919 
    920    this->progress = true;
    921 }
    922 
    923 void
    924 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
    925 {
    926    /*
    927     * frtemp = frac(x);
    928     * temp = sub(x, frtemp);
    929     * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
    930     */
    931    ir_rvalue *arg = ir->operands[0];
    932    ir_instruction &i = *base_ir;
    933 
    934    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
    935    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
    936    ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
    937                                              ir_var_temporary);
    938    ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
    939                                            ir_var_temporary);
    940 
    941    i.insert_before(frtemp);
    942    i.insert_before(assign(frtemp, fract(arg)));
    943    i.insert_before(temp);
    944    i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
    945 
    946    ir->operation = ir_triop_csel;
    947    ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
    948    ir->operands[1] = new (ir) ir_dereference_variable(temp);
    949    ir->operands[2] = add(temp,
    950                          csel(equal(frtemp, zero->clone(ir, NULL)),
    951                               zero->clone(ir, NULL),
    952                               one));
    953 
    954    this->progress = true;
    955 }
    956 
    957 void
    958 lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
    959 {
    960    /*
    961     * temp = x > 0.0 ? 1.0 : 0.0;
    962     * result = x < 0.0 ? -1.0 : temp;
    963     */
    964    ir_rvalue *arg = ir->operands[0];
    965    ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
    966    ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
    967    ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
    968 
    969    ir->operation = ir_triop_csel;
    970    ir->operands[0] = less(arg->clone(ir, NULL),
    971                           zero->clone(ir, NULL));
    972    ir->operands[1] = neg_one;
    973    ir->operands[2] = csel(greater(arg, zero),
    974                           one,
    975                           zero->clone(ir, NULL));
    976 
    977    this->progress = true;
    978 }
    979 
    980 void
    981 lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
    982 {
    983    /* For more details, see:
    984     *
    985     * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
    986     */
    987    const unsigned elements = ir->operands[0]->type->vector_elements;
    988    ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
    989                                            ir_var_temporary);
    990    ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
    991    ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
    992    ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
    993    ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
    994    ir_constant *c1 = new(ir) ir_constant(1u);
    995    ir_constant *c2 = new(ir) ir_constant(2u);
    996    ir_constant *c4 = new(ir) ir_constant(4u);
    997    ir_constant *c24 = new(ir) ir_constant(24u);
    998 
    999    base_ir->insert_before(temp);
   1000 
   1001    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1002       base_ir->insert_before(assign(temp, ir->operands[0]));
   1003    } else {
   1004       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1005       base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
   1006    }
   1007 
   1008    /* temp = temp - ((temp >> 1) & 0x55555555u); */
   1009    base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
   1010                                                          c55555555))));
   1011 
   1012    /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
   1013    base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
   1014                                            bit_and(rshift(temp, c2),
   1015                                                    c33333333->clone(ir, NULL)))));
   1016 
   1017    /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
   1018    ir->operation = ir_unop_u2i;
   1019    ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
   1020                                 c01010101),
   1021                             c24);
   1022 
   1023    this->progress = true;
   1024 }
   1025 
   1026 void
   1027 lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
   1028 {
   1029    ir_variable *bits =
   1030       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
   1031 
   1032    base_ir->insert_before(bits);
   1033    base_ir->insert_before(assign(bits, ir->operands[2]));
   1034 
   1035    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1036       ir_constant *c1 =
   1037          new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1038       ir_constant *c32 =
   1039          new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
   1040       ir_constant *cFFFFFFFF =
   1041          new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
   1042 
   1043       /* At least some hardware treats (x << y) as (x << (y%32)).  This means
   1044        * we'd get a mask of 0 when bits is 32.  Special case it.
   1045        *
   1046        * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
   1047        */
   1048       ir_expression *mask = csel(equal(bits, c32),
   1049                                  cFFFFFFFF,
   1050                                  sub(lshift(c1, bits), c1->clone(ir, NULL)));
   1051 
   1052       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1053        *
   1054        *    If bits is zero, the result will be zero.
   1055        *
   1056        * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
   1057        * select as in the signed integer case.
   1058        *
   1059        * (value >> offset) & mask;
   1060        */
   1061       ir->operation = ir_binop_bit_and;
   1062       ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
   1063       ir->operands[1] = mask;
   1064       ir->operands[2] = NULL;
   1065    } else {
   1066       ir_constant *c0 =
   1067          new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
   1068       ir_constant *c32 =
   1069          new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
   1070       ir_variable *temp =
   1071          new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
   1072 
   1073       /* temp = 32 - bits; */
   1074       base_ir->insert_before(temp);
   1075       base_ir->insert_before(assign(temp, sub(c32, bits)));
   1076 
   1077       /* expr = value << (temp - offset)) >> temp; */
   1078       ir_expression *expr =
   1079          rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
   1080 
   1081       /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1082        *
   1083        *    If bits is zero, the result will be zero.
   1084        *
   1085        * Due to the (x << (y%32)) behavior mentioned before, the (value <<
   1086        * (32-0)) doesn't "erase" all of the data as we would like, so finish
   1087        * up with:
   1088        *
   1089        * (bits == 0) ? 0 : e;
   1090        */
   1091       ir->operation = ir_triop_csel;
   1092       ir->operands[0] = equal(c0, bits);
   1093       ir->operands[1] = c0->clone(ir, NULL);
   1094       ir->operands[2] = expr;
   1095    }
   1096 
   1097    this->progress = true;
   1098 }
   1099 
   1100 void
   1101 lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
   1102 {
   1103    ir_constant *c1;
   1104    ir_constant *c32;
   1105    ir_constant *cFFFFFFFF;
   1106    ir_variable *offset =
   1107       new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
   1108    ir_variable *bits =
   1109       new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
   1110    ir_variable *mask =
   1111       new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
   1112 
   1113    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
   1114       c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
   1115       c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
   1116       cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
   1117    } else {
   1118       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1119 
   1120       c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1121       c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
   1122       cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
   1123    }
   1124 
   1125    base_ir->insert_before(offset);
   1126    base_ir->insert_before(assign(offset, ir->operands[2]));
   1127 
   1128    base_ir->insert_before(bits);
   1129    base_ir->insert_before(assign(bits, ir->operands[3]));
   1130 
   1131    /* At least some hardware treats (x << y) as (x << (y%32)).  This means
   1132     * we'd get a mask of 0 when bits is 32.  Special case it.
   1133     *
   1134     * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
   1135     *
   1136     * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1137     *
   1138     *    The result will be undefined if offset or bits is negative, or if the
   1139     *    sum of offset and bits is greater than the number of bits used to
   1140     *    store the operand.
   1141     *
   1142     * Since it's undefined, there are a couple other ways this could be
   1143     * implemented.  The other way that was considered was to put the csel
   1144     * around the whole thing:
   1145     *
   1146     *    final_result = bits == 32 ? insert : ... ;
   1147     */
   1148    base_ir->insert_before(mask);
   1149 
   1150    base_ir->insert_before(assign(mask, csel(equal(bits, c32),
   1151                                             cFFFFFFFF,
   1152                                             lshift(sub(lshift(c1, bits),
   1153                                                        c1->clone(ir, NULL)),
   1154                                                    offset))));
   1155 
   1156    /* (base & ~mask) | ((insert << offset) & mask) */
   1157    ir->operation = ir_binop_bit_or;
   1158    ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
   1159    ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
   1160    ir->operands[2] = NULL;
   1161    ir->operands[3] = NULL;
   1162 
   1163    this->progress = true;
   1164 }
   1165 
   1166 void
   1167 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
   1168 {
   1169    /* For more details, see:
   1170     *
   1171     * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
   1172     */
   1173    ir_constant *c1 =
   1174       new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
   1175    ir_constant *c2 =
   1176       new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
   1177    ir_constant *c4 =
   1178       new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
   1179    ir_constant *c8 =
   1180       new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
   1181    ir_constant *c16 =
   1182       new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
   1183    ir_constant *c33333333 =
   1184       new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
   1185    ir_constant *c55555555 =
   1186       new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
   1187    ir_constant *c0F0F0F0F =
   1188       new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
   1189    ir_constant *c00FF00FF =
   1190       new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
   1191    ir_variable *temp =
   1192       new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
   1193                           "temp", ir_var_temporary);
   1194    ir_instruction &i = *base_ir;
   1195 
   1196    i.insert_before(temp);
   1197 
   1198    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1199       i.insert_before(assign(temp, ir->operands[0]));
   1200    } else {
   1201       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1202       i.insert_before(assign(temp, i2u(ir->operands[0])));
   1203    }
   1204 
   1205    /* Swap odd and even bits.
   1206     *
   1207     * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
   1208     */
   1209    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
   1210                                        lshift(bit_and(temp, c55555555->clone(ir, NULL)),
   1211                                               c1->clone(ir, NULL)))));
   1212    /* Swap consecutive pairs.
   1213     *
   1214     * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
   1215     */
   1216    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
   1217                                        lshift(bit_and(temp, c33333333->clone(ir, NULL)),
   1218                                               c2->clone(ir, NULL)))));
   1219 
   1220    /* Swap nibbles.
   1221     *
   1222     * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
   1223     */
   1224    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
   1225                                        lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
   1226                                               c4->clone(ir, NULL)))));
   1227 
   1228    /* The last step is, basically, bswap.  Swap the bytes, then swap the
   1229     * words.  When this code is run through GCC on x86, it does generate a
   1230     * bswap instruction.
   1231     *
   1232     * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
   1233     * temp = ( temp >> 16              ) | ( temp                << 16);
   1234     */
   1235    i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
   1236                                        lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
   1237                                               c8->clone(ir, NULL)))));
   1238 
   1239    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1240       ir->operation = ir_binop_bit_or;
   1241       ir->operands[0] = rshift(temp, c16);
   1242       ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
   1243    } else {
   1244       ir->operation = ir_unop_u2i;
   1245       ir->operands[0] = bit_or(rshift(temp, c16),
   1246                                lshift(temp, c16->clone(ir, NULL)));
   1247    }
   1248 
   1249    this->progress = true;
   1250 }
   1251 
   1252 void
   1253 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
   1254 {
   1255    /* For more details, see:
   1256     *
   1257     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
   1258     */
   1259    const unsigned elements = ir->operands[0]->type->vector_elements;
   1260    ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
   1261    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
   1262    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
   1263    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
   1264    ir_variable *temp =
   1265       new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
   1266    ir_variable *lsb_only =
   1267       new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
   1268    ir_variable *as_float =
   1269       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
   1270    ir_variable *lsb =
   1271       new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
   1272 
   1273    ir_instruction &i = *base_ir;
   1274 
   1275    i.insert_before(temp);
   1276 
   1277    if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
   1278       i.insert_before(assign(temp, ir->operands[0]));
   1279    } else {
   1280       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1281       i.insert_before(assign(temp, u2i(ir->operands[0])));
   1282    }
   1283 
   1284    /* The int-to-float conversion is lossless because (value & -value) is
   1285     * either a power of two or zero.  We don't use the result in the zero
   1286     * case.  The uint() cast is necessary so that 0x80000000 does not
   1287     * generate a negative value.
   1288     *
   1289     * uint lsb_only = uint(value & -value);
   1290     * float as_float = float(lsb_only);
   1291     */
   1292    i.insert_before(lsb_only);
   1293    i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
   1294 
   1295    i.insert_before(as_float);
   1296    i.insert_before(assign(as_float, u2f(lsb_only)));
   1297 
   1298    /* This is basically an open-coded frexp.  Implementations that have a
   1299     * native frexp instruction would be better served by that.  This is
   1300     * optimized versus a full-featured open-coded implementation in two ways:
   1301     *
   1302     * - We don't care about a correct result from subnormal numbers (including
   1303     *   0.0), so the raw exponent can always be safely unbiased.
   1304     *
   1305     * - The value cannot be negative, so it does not need to be masked off to
   1306     *   extract the exponent.
   1307     *
   1308     * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
   1309     */
   1310    i.insert_before(lsb);
   1311    i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
   1312 
   1313    /* Use lsb_only in the comparison instead of temp so that the & (far above)
   1314     * can possibly generate the result without an explicit comparison.
   1315     *
   1316     * (lsb_only == 0) ? -1 : lsb;
   1317     *
   1318     * Since our input values are all integers, the unbiased exponent must not
   1319     * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
   1320     * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
   1321     * better is likely GPU dependent.  Either way, the difference should be
   1322     * small.
   1323     */
   1324    ir->operation = ir_triop_csel;
   1325    ir->operands[0] = equal(lsb_only, c0);
   1326    ir->operands[1] = cminus1;
   1327    ir->operands[2] = new(ir) ir_dereference_variable(lsb);
   1328 
   1329    this->progress = true;
   1330 }
   1331 
   1332 void
   1333 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
   1334 {
   1335    /* For more details, see:
   1336     *
   1337     * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
   1338     */
   1339    const unsigned elements = ir->operands[0]->type->vector_elements;
   1340    ir_constant *c0 = new(ir) ir_constant(int(0), elements);
   1341    ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
   1342    ir_constant *c23 = new(ir) ir_constant(int(23), elements);
   1343    ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
   1344    ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
   1345    ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
   1346    ir_variable *temp =
   1347       new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
   1348    ir_variable *as_float =
   1349       new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
   1350    ir_variable *msb =
   1351       new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
   1352 
   1353    ir_instruction &i = *base_ir;
   1354 
   1355    i.insert_before(temp);
   1356 
   1357    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1358       i.insert_before(assign(temp, ir->operands[0]));
   1359    } else {
   1360       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1361 
   1362       /* findMSB(uint(abs(some_int))) almost always does the right thing.
   1363        * There are two problem values:
   1364        *
   1365        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
   1366        *   31.  However, findMSB(int(0x80000000)) == 30.
   1367        *
   1368        * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
   1369        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
   1370        *
   1371        *    For a value of zero or negative one, -1 will be returned.
   1372        *
   1373        * For all negative number cases, including 0x80000000 and 0xffffffff,
   1374        * the correct value is obtained from findMSB if instead of negating the
   1375        * (already negative) value the logical-not is used.  A conditonal
   1376        * logical-not can be achieved in two instructions.
   1377        */
   1378       ir_variable *as_int =
   1379          new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
   1380       ir_constant *c31 = new(ir) ir_constant(int(31), elements);
   1381 
   1382       i.insert_before(as_int);
   1383       i.insert_before(assign(as_int, ir->operands[0]));
   1384       i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
   1385                                             as_int,
   1386                                             rshift(as_int, c31)))));
   1387    }
   1388 
   1389    /* The int-to-float conversion is lossless because bits are conditionally
   1390     * masked off the bottom of temp to ensure the value has at most 24 bits of
   1391     * data or is zero.  We don't use the result in the zero case.  The uint()
   1392     * cast is necessary so that 0x80000000 does not generate a negative value.
   1393     *
   1394     * float as_float = float(temp > 255 ? temp & ~255 : temp);
   1395     */
   1396    i.insert_before(as_float);
   1397    i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
   1398                                              bit_and(temp, cFFFFFF00),
   1399                                              temp))));
   1400 
   1401    /* This is basically an open-coded frexp.  Implementations that have a
   1402     * native frexp instruction would be better served by that.  This is
   1403     * optimized versus a full-featured open-coded implementation in two ways:
   1404     *
   1405     * - We don't care about a correct result from subnormal numbers (including
   1406     *   0.0), so the raw exponent can always be safely unbiased.
   1407     *
   1408     * - The value cannot be negative, so it does not need to be masked off to
   1409     *   extract the exponent.
   1410     *
   1411     * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
   1412     */
   1413    i.insert_before(msb);
   1414    i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
   1415 
   1416    /* Use msb in the comparison instead of temp so that the subtract can
   1417     * possibly generate the result without an explicit comparison.
   1418     *
   1419     * (msb < 0) ? -1 : msb;
   1420     *
   1421     * Since our input values are all integers, the unbiased exponent must not
   1422     * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
   1423     */
   1424    ir->operation = ir_triop_csel;
   1425    ir->operands[0] = less(msb, c0);
   1426    ir->operands[1] = cminus1;
   1427    ir->operands[2] = new(ir) ir_dereference_variable(msb);
   1428 
   1429    this->progress = true;
   1430 }
   1431 
   1432 ir_expression *
   1433 lower_instructions_visitor::_carry(operand a, operand b)
   1434 {
   1435    if (lowering(CARRY_TO_ARITH))
   1436       return i2u(b2i(less(add(a, b),
   1437                           a.val->clone(ralloc_parent(a.val), NULL))));
   1438    else
   1439       return carry(a, b);
   1440 }
   1441 
   1442 void
   1443 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
   1444 {
   1445    /*   ABCD
   1446     * * EFGH
   1447     * ======
   1448     * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
   1449     *
   1450     * In GLSL, (a * b) becomes
   1451     *
   1452     * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
   1453     * uint m2 = (a & 0x0000ffffu) * (b >> 16);
   1454     * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
   1455     * uint m4 = (a >> 16)         * (b >> 16);
   1456     *
   1457     * uint c1;
   1458     * uint c2;
   1459     * uint lo_result;
   1460     * uint hi_result;
   1461     *
   1462     * lo_result = uaddCarry(m1, m2 << 16, c1);
   1463     * hi_result = m4 + c1;
   1464     * lo_result = uaddCarry(lo_result, m3 << 16, c2);
   1465     * hi_result = hi_result + c2;
   1466     * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
   1467     */
   1468    const unsigned elements = ir->operands[0]->type->vector_elements;
   1469    ir_variable *src1 =
   1470       new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary);
   1471    ir_variable *src1h =
   1472       new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary);
   1473    ir_variable *src1l =
   1474       new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary);
   1475    ir_variable *src2 =
   1476       new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary);
   1477    ir_variable *src2h =
   1478       new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary);
   1479    ir_variable *src2l =
   1480       new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary);
   1481    ir_variable *t1 =
   1482       new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary);
   1483    ir_variable *t2 =
   1484       new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary);
   1485    ir_variable *lo =
   1486       new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary);
   1487    ir_variable *hi =
   1488       new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary);
   1489    ir_variable *different_signs = NULL;
   1490    ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
   1491    ir_constant *c16 = new(ir) ir_constant(16u, elements);
   1492 
   1493    ir_instruction &i = *base_ir;
   1494 
   1495    i.insert_before(src1);
   1496    i.insert_before(src2);
   1497    i.insert_before(src1h);
   1498    i.insert_before(src2h);
   1499    i.insert_before(src1l);
   1500    i.insert_before(src2l);
   1501 
   1502    if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
   1503       i.insert_before(assign(src1, ir->operands[0]));
   1504       i.insert_before(assign(src2, ir->operands[1]));
   1505    } else {
   1506       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1507 
   1508       ir_variable *itmp1 =
   1509          new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary);
   1510       ir_variable *itmp2 =
   1511          new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary);
   1512       ir_constant *c0 = new(ir) ir_constant(int(0), elements);
   1513 
   1514       i.insert_before(itmp1);
   1515       i.insert_before(itmp2);
   1516       i.insert_before(assign(itmp1, ir->operands[0]));
   1517       i.insert_before(assign(itmp2, ir->operands[1]));
   1518 
   1519       different_signs =
   1520          new(ir) ir_variable(glsl_type::bvec(elements), "different_signs",
   1521                              ir_var_temporary);
   1522 
   1523       i.insert_before(different_signs);
   1524       i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
   1525                                                    less(itmp1, c0),
   1526                                                    less(itmp2, c0->clone(ir, NULL)))));
   1527 
   1528       i.insert_before(assign(src1, i2u(abs(itmp1))));
   1529       i.insert_before(assign(src2, i2u(abs(itmp2))));
   1530    }
   1531 
   1532    i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
   1533    i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
   1534    i.insert_before(assign(src1h, rshift(src1, c16)));
   1535    i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
   1536 
   1537    i.insert_before(lo);
   1538    i.insert_before(hi);
   1539    i.insert_before(t1);
   1540    i.insert_before(t2);
   1541 
   1542    i.insert_before(assign(lo, mul(src1l, src2l)));
   1543    i.insert_before(assign(t1, mul(src1l, src2h)));
   1544    i.insert_before(assign(t2, mul(src1h, src2l)));
   1545    i.insert_before(assign(hi, mul(src1h, src2h)));
   1546 
   1547    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
   1548    i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
   1549 
   1550    i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
   1551    i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
   1552 
   1553    if (different_signs == NULL) {
   1554       assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
   1555 
   1556       ir->operation = ir_binop_add;
   1557       ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
   1558       ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
   1559    } else {
   1560       assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
   1561 
   1562       i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
   1563                                      rshift(t2, c16->clone(ir, NULL)))));
   1564 
   1565       /* For channels where different_signs is set we have to perform a 64-bit
   1566        * negation.  This is *not* the same as just negating the high 32-bits.
   1567        * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
   1568        * -1, not -0!  Recall -x == ~x + 1.
   1569        */
   1570       ir_variable *neg_hi =
   1571          new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary);
   1572       ir_constant *c1 = new(ir) ir_constant(1u, elements);
   1573 
   1574       i.insert_before(neg_hi);
   1575       i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
   1576                                          u2i(_carry(bit_not(lo), c1)))));
   1577 
   1578       ir->operation = ir_triop_csel;
   1579       ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
   1580       ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
   1581       ir->operands[2] = u2i(hi);
   1582    }
   1583 }
   1584 
   1585 ir_visitor_status
   1586 lower_instructions_visitor::visit_leave(ir_expression *ir)
   1587 {
   1588    switch (ir->operation) {
   1589    case ir_binop_dot:
   1590       if (ir->operands[0]->type->is_double())
   1591          double_dot_to_fma(ir);
   1592       break;
   1593    case ir_triop_lrp:
   1594       if (ir->operands[0]->type->is_double())
   1595          double_lrp(ir);
   1596       break;
   1597    case ir_binop_sub:
   1598       if (lowering(SUB_TO_ADD_NEG))
   1599 	 sub_to_add_neg(ir);
   1600       break;
   1601 
   1602    case ir_binop_div:
   1603       if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
   1604 	 int_div_to_mul_rcp(ir);
   1605       else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
   1606                (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
   1607 	 div_to_mul_rcp(ir);
   1608       break;
   1609 
   1610    case ir_unop_exp:
   1611       if (lowering(EXP_TO_EXP2))
   1612 	 exp_to_exp2(ir);
   1613       break;
   1614 
   1615    case ir_unop_log:
   1616       if (lowering(LOG_TO_LOG2))
   1617 	 log_to_log2(ir);
   1618       break;
   1619 
   1620    case ir_binop_mod:
   1621       if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
   1622 	 mod_to_floor(ir);
   1623       break;
   1624 
   1625    case ir_binop_pow:
   1626       if (lowering(POW_TO_EXP2))
   1627 	 pow_to_exp2(ir);
   1628       break;
   1629 
   1630    case ir_binop_ldexp:
   1631       if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
   1632          ldexp_to_arith(ir);
   1633       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
   1634          dldexp_to_arith(ir);
   1635       break;
   1636 
   1637    case ir_unop_frexp_exp:
   1638       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
   1639          dfrexp_exp_to_arith(ir);
   1640       break;
   1641 
   1642    case ir_unop_frexp_sig:
   1643       if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
   1644          dfrexp_sig_to_arith(ir);
   1645       break;
   1646 
   1647    case ir_binop_carry:
   1648       if (lowering(CARRY_TO_ARITH))
   1649          carry_to_arith(ir);
   1650       break;
   1651 
   1652    case ir_binop_borrow:
   1653       if (lowering(BORROW_TO_ARITH))
   1654          borrow_to_arith(ir);
   1655       break;
   1656 
   1657    case ir_unop_saturate:
   1658       if (lowering(SAT_TO_CLAMP))
   1659          sat_to_clamp(ir);
   1660       break;
   1661 
   1662    case ir_unop_trunc:
   1663       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1664          dtrunc_to_dfrac(ir);
   1665       break;
   1666 
   1667    case ir_unop_ceil:
   1668       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1669          dceil_to_dfrac(ir);
   1670       break;
   1671 
   1672    case ir_unop_floor:
   1673       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1674          dfloor_to_dfrac(ir);
   1675       break;
   1676 
   1677    case ir_unop_round_even:
   1678       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1679          dround_even_to_dfrac(ir);
   1680       break;
   1681 
   1682    case ir_unop_sign:
   1683       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
   1684          dsign_to_csel(ir);
   1685       break;
   1686 
   1687    case ir_unop_bit_count:
   1688       if (lowering(BIT_COUNT_TO_MATH))
   1689          bit_count_to_math(ir);
   1690       break;
   1691 
   1692    case ir_triop_bitfield_extract:
   1693       if (lowering(EXTRACT_TO_SHIFTS))
   1694          extract_to_shifts(ir);
   1695       break;
   1696 
   1697    case ir_quadop_bitfield_insert:
   1698       if (lowering(INSERT_TO_SHIFTS))
   1699          insert_to_shifts(ir);
   1700       break;
   1701 
   1702    case ir_unop_bitfield_reverse:
   1703       if (lowering(REVERSE_TO_SHIFTS))
   1704          reverse_to_shifts(ir);
   1705       break;
   1706 
   1707    case ir_unop_find_lsb:
   1708       if (lowering(FIND_LSB_TO_FLOAT_CAST))
   1709          find_lsb_to_float_cast(ir);
   1710       break;
   1711 
   1712    case ir_unop_find_msb:
   1713       if (lowering(FIND_MSB_TO_FLOAT_CAST))
   1714          find_msb_to_float_cast(ir);
   1715       break;
   1716 
   1717    case ir_binop_imul_high:
   1718       if (lowering(IMUL_HIGH_TO_MUL))
   1719          imul_high_to_mul(ir);
   1720       break;
   1721 
   1722    default:
   1723       return visit_continue;
   1724    }
   1725 
   1726    return visit_continue;
   1727 }
   1728