Home | History | Annotate | Download | only in glsl
      1 /*
      2  * Copyright  2012 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  */
     23 
     24 #include "ir.h"
     25 #include "ir_builder.h"
     26 #include "ir_optimization.h"
     27 #include "ir_rvalue_visitor.h"
     28 
     29 namespace {
     30 
     31 using namespace ir_builder;
     32 
     33 /**
     34  * A visitor that lowers built-in floating-point pack/unpack expressions
     35  * such as packSnorm2x16.
     36  */
     37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
     38 public:
     39    /**
     40     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
     41     */
     42    explicit lower_packing_builtins_visitor(int op_mask)
     43       : op_mask(op_mask),
     44         progress(false)
     45    {
     46       factory.instructions = &factory_instructions;
     47    }
     48 
     49    virtual ~lower_packing_builtins_visitor()
     50    {
     51       assert(factory_instructions.is_empty());
     52    }
     53 
     54    bool get_progress() { return progress; }
     55 
     56    void handle_rvalue(ir_rvalue **rvalue)
     57    {
     58       if (!*rvalue)
     59 	 return;
     60 
     61       ir_expression *expr = (*rvalue)->as_expression();
     62       if (!expr)
     63 	 return;
     64 
     65       enum lower_packing_builtins_op lowering_op =
     66          choose_lowering_op(expr->operation);
     67 
     68       if (lowering_op == LOWER_PACK_UNPACK_NONE)
     69          return;
     70 
     71       setup_factory(ralloc_parent(expr));
     72 
     73       ir_rvalue *op0 = expr->operands[0];
     74       ralloc_steal(factory.mem_ctx, op0);
     75 
     76       switch (lowering_op) {
     77       case LOWER_PACK_SNORM_2x16:
     78          *rvalue = lower_pack_snorm_2x16(op0);
     79          break;
     80       case LOWER_PACK_SNORM_4x8:
     81          *rvalue = lower_pack_snorm_4x8(op0);
     82          break;
     83       case LOWER_PACK_UNORM_2x16:
     84          *rvalue = lower_pack_unorm_2x16(op0);
     85          break;
     86       case LOWER_PACK_UNORM_4x8:
     87          *rvalue = lower_pack_unorm_4x8(op0);
     88          break;
     89       case LOWER_PACK_HALF_2x16:
     90          *rvalue = lower_pack_half_2x16(op0);
     91          break;
     92       case LOWER_UNPACK_SNORM_2x16:
     93          *rvalue = lower_unpack_snorm_2x16(op0);
     94          break;
     95       case LOWER_UNPACK_SNORM_4x8:
     96          *rvalue = lower_unpack_snorm_4x8(op0);
     97          break;
     98       case LOWER_UNPACK_UNORM_2x16:
     99          *rvalue = lower_unpack_unorm_2x16(op0);
    100          break;
    101       case LOWER_UNPACK_UNORM_4x8:
    102          *rvalue = lower_unpack_unorm_4x8(op0);
    103          break;
    104       case LOWER_UNPACK_HALF_2x16:
    105          *rvalue = lower_unpack_half_2x16(op0);
    106          break;
    107       case LOWER_PACK_UNPACK_NONE:
    108       case LOWER_PACK_USE_BFI:
    109       case LOWER_PACK_USE_BFE:
    110          assert(!"not reached");
    111          break;
    112       }
    113 
    114       teardown_factory();
    115       progress = true;
    116    }
    117 
    118 private:
    119    const int op_mask;
    120    bool progress;
    121    ir_factory factory;
    122    exec_list factory_instructions;
    123 
    124    /**
    125     * Determine the needed lowering operation by filtering \a expr_op
    126     * through \ref op_mask.
    127     */
    128    enum lower_packing_builtins_op
    129    choose_lowering_op(ir_expression_operation expr_op)
    130    {
    131       /* C++ regards int and enum as fundamentally different types.
    132        * So, we can't simply return from each case; we must cast the return
    133        * value.
    134        */
    135       int result;
    136 
    137       switch (expr_op) {
    138       case ir_unop_pack_snorm_2x16:
    139          result = op_mask & LOWER_PACK_SNORM_2x16;
    140          break;
    141       case ir_unop_pack_snorm_4x8:
    142          result = op_mask & LOWER_PACK_SNORM_4x8;
    143          break;
    144       case ir_unop_pack_unorm_2x16:
    145          result = op_mask & LOWER_PACK_UNORM_2x16;
    146          break;
    147       case ir_unop_pack_unorm_4x8:
    148          result = op_mask & LOWER_PACK_UNORM_4x8;
    149          break;
    150       case ir_unop_pack_half_2x16:
    151          result = op_mask & LOWER_PACK_HALF_2x16;
    152          break;
    153       case ir_unop_unpack_snorm_2x16:
    154          result = op_mask & LOWER_UNPACK_SNORM_2x16;
    155          break;
    156       case ir_unop_unpack_snorm_4x8:
    157          result = op_mask & LOWER_UNPACK_SNORM_4x8;
    158          break;
    159       case ir_unop_unpack_unorm_2x16:
    160          result = op_mask & LOWER_UNPACK_UNORM_2x16;
    161          break;
    162       case ir_unop_unpack_unorm_4x8:
    163          result = op_mask & LOWER_UNPACK_UNORM_4x8;
    164          break;
    165       case ir_unop_unpack_half_2x16:
    166          result = op_mask & LOWER_UNPACK_HALF_2x16;
    167          break;
    168       default:
    169          result = LOWER_PACK_UNPACK_NONE;
    170          break;
    171       }
    172 
    173       return static_cast<enum lower_packing_builtins_op>(result);
    174    }
    175 
    176    void
    177    setup_factory(void *mem_ctx)
    178    {
    179       assert(factory.mem_ctx == NULL);
    180       assert(factory.instructions->is_empty());
    181 
    182       factory.mem_ctx = mem_ctx;
    183    }
    184 
    185    void
    186    teardown_factory()
    187    {
    188       base_ir->insert_before(factory.instructions);
    189       assert(factory.instructions->is_empty());
    190       factory.mem_ctx = NULL;
    191    }
    192 
    193    template <typename T>
    194    ir_constant*
    195    constant(T x)
    196    {
    197       return factory.constant(x);
    198    }
    199 
    200    /**
    201     * \brief Pack two uint16's into a single uint32.
    202     *
    203     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
    204     * where the least significant bits specify the first element of the pair.
    205     * Return the uint32.
    206     */
    207    ir_rvalue*
    208    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
    209    {
    210       assert(uvec2_rval->type == glsl_type::uvec2_type);
    211 
    212       /* uvec2 u = UVEC2_RVAL; */
    213       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
    214                                          "tmp_pack_uvec2_to_uint");
    215       factory.emit(assign(u, uvec2_rval));
    216 
    217       if (op_mask & LOWER_PACK_USE_BFI) {
    218          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
    219                                 swizzle_y(u),
    220                                 constant(16u),
    221                                 constant(16u));
    222       }
    223 
    224       /* return (u.y << 16) | (u.x & 0xffff); */
    225       return bit_or(lshift(swizzle_y(u), constant(16u)),
    226                     bit_and(swizzle_x(u), constant(0xffffu)));
    227    }
    228 
    229    /**
    230     * \brief Pack four uint8's into a single uint32.
    231     *
    232     * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
    233     * uint32 where the least significant bits specify the first element of the
    234     * 4-tuple. Return the uint32.
    235     */
    236    ir_rvalue*
    237    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
    238    {
    239       assert(uvec4_rval->type == glsl_type::uvec4_type);
    240 
    241       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
    242                                          "tmp_pack_uvec4_to_uint");
    243 
    244       if (op_mask & LOWER_PACK_USE_BFI) {
    245          /* uvec4 u = UVEC4_RVAL; */
    246          factory.emit(assign(u, uvec4_rval));
    247 
    248          return bitfield_insert(bitfield_insert(
    249                                    bitfield_insert(
    250                                       bit_and(swizzle_x(u), constant(0xffu)),
    251                                       swizzle_y(u), constant(8u), constant(8u)),
    252                                    swizzle_z(u), constant(16u), constant(8u)),
    253                                 swizzle_w(u), constant(24u), constant(8u));
    254       }
    255 
    256       /* uvec4 u = UVEC4_RVAL & 0xff */
    257       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
    258 
    259       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
    260       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
    261                            lshift(swizzle_z(u), constant(16u))),
    262                     bit_or(lshift(swizzle_y(u), constant(8u)),
    263                            swizzle_x(u)));
    264    }
    265 
    266    /**
    267     * \brief Unpack a uint32 into two uint16's.
    268     *
    269     * Interpret the given uint32 as a uint16 pair where the uint32's least
    270     * significant bits specify the pair's first element. Return the uint16
    271     * pair as a uvec2.
    272     */
    273    ir_rvalue*
    274    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
    275    {
    276       assert(uint_rval->type == glsl_type::uint_type);
    277 
    278       /* uint u = UINT_RVAL; */
    279       ir_variable *u = factory.make_temp(glsl_type::uint_type,
    280                                           "tmp_unpack_uint_to_uvec2_u");
    281       factory.emit(assign(u, uint_rval));
    282 
    283       /* uvec2 u2; */
    284       ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
    285                                            "tmp_unpack_uint_to_uvec2_u2");
    286 
    287       /* u2.x = u & 0xffffu; */
    288       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
    289 
    290       /* u2.y = u >> 16u; */
    291       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
    292 
    293       return deref(u2).val;
    294    }
    295 
    296    /**
    297     * \brief Unpack a uint32 into two int16's.
    298     *
    299     * Specifically each 16-bit value is sign-extended to the full width of an
    300     * int32 on return.
    301     */
    302    ir_rvalue *
    303    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
    304    {
    305       assert(uint_rval->type == glsl_type::uint_type);
    306 
    307       if (!(op_mask & LOWER_PACK_USE_BFE)) {
    308          return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
    309                               constant(16u)),
    310                        constant(16u));
    311       }
    312 
    313       ir_variable *i = factory.make_temp(glsl_type::int_type,
    314                                          "tmp_unpack_uint_to_ivec2_i");
    315       factory.emit(assign(i, u2i(uint_rval)));
    316 
    317       /* ivec2 i2; */
    318       ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
    319                                           "tmp_unpack_uint_to_ivec2_i2");
    320 
    321       factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
    322                           WRITEMASK_X));
    323       factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
    324                           WRITEMASK_Y));
    325 
    326       return deref(i2).val;
    327    }
    328 
    329    /**
    330     * \brief Unpack a uint32 into four uint8's.
    331     *
    332     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
    333     * significant bits specify the 4-tuple's first element. Return the uint8
    334     * 4-tuple as a uvec4.
    335     */
    336    ir_rvalue*
    337    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
    338    {
    339       assert(uint_rval->type == glsl_type::uint_type);
    340 
    341       /* uint u = UINT_RVAL; */
    342       ir_variable *u = factory.make_temp(glsl_type::uint_type,
    343                                           "tmp_unpack_uint_to_uvec4_u");
    344       factory.emit(assign(u, uint_rval));
    345 
    346       /* uvec4 u4; */
    347       ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
    348                                            "tmp_unpack_uint_to_uvec4_u4");
    349 
    350       /* u4.x = u & 0xffu; */
    351       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
    352 
    353       if (op_mask & LOWER_PACK_USE_BFE) {
    354          /* u4.y = bitfield_extract(u, 8, 8); */
    355          factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
    356                              WRITEMASK_Y));
    357 
    358          /* u4.z = bitfield_extract(u, 16, 8); */
    359          factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
    360                              WRITEMASK_Z));
    361       } else {
    362          /* u4.y = (u >> 8u) & 0xffu; */
    363          factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
    364                                          constant(0xffu)), WRITEMASK_Y));
    365 
    366          /* u4.z = (u >> 16u) & 0xffu; */
    367          factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
    368                                          constant(0xffu)), WRITEMASK_Z));
    369       }
    370 
    371       /* u4.w = (u >> 24u) */
    372       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
    373 
    374       return deref(u4).val;
    375    }
    376 
    377    /**
    378     * \brief Unpack a uint32 into four int8's.
    379     *
    380     * Specifically each 8-bit value is sign-extended to the full width of an
    381     * int32 on return.
    382     */
    383    ir_rvalue *
    384    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
    385    {
    386       assert(uint_rval->type == glsl_type::uint_type);
    387 
    388       if (!(op_mask & LOWER_PACK_USE_BFE)) {
    389          return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
    390                               constant(24u)),
    391                        constant(24u));
    392       }
    393 
    394       ir_variable *i = factory.make_temp(glsl_type::int_type,
    395                                          "tmp_unpack_uint_to_ivec4_i");
    396       factory.emit(assign(i, u2i(uint_rval)));
    397 
    398       /* ivec4 i4; */
    399       ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
    400                                           "tmp_unpack_uint_to_ivec4_i4");
    401 
    402       factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
    403                           WRITEMASK_X));
    404       factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
    405                           WRITEMASK_Y));
    406       factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
    407                           WRITEMASK_Z));
    408       factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
    409                           WRITEMASK_W));
    410 
    411       return deref(i4).val;
    412    }
    413 
    414    /**
    415     * \brief Lower a packSnorm2x16 expression.
    416     *
    417     * \param vec2_rval is packSnorm2x16's input
    418     * \return packSnorm2x16's output as a uint rvalue
    419     */
    420    ir_rvalue*
    421    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
    422    {
    423       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
    424        *
    425        *    highp uint packSnorm2x16(vec2 v)
    426        *    --------------------------------
    427        *    First, converts each component of the normalized floating-point value
    428        *    v into 16-bit integer values. Then, the results are packed into the
    429        *    returned 32-bit unsigned integer.
    430        *
    431        *    The conversion for component c of v to fixed point is done as
    432        *    follows:
    433        *
    434        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
    435        *
    436        *    The first component of the vector will be written to the least
    437        *    significant bits of the output; the last component will be written to
    438        *    the most significant bits.
    439        *
    440        * This function generates IR that approximates the following pseudo-GLSL:
    441        *
    442        *     return pack_uvec2_to_uint(
    443        *         uvec2(ivec2(
    444        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
    445        *
    446        * It is necessary to first convert the vec2 to ivec2 rather than directly
    447        * converting vec2 to uvec2 because the latter conversion is undefined.
    448        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
    449        * convert a negative floating point value to an uint".
    450        */
    451       assert(vec2_rval->type == glsl_type::vec2_type);
    452 
    453       ir_rvalue *result = pack_uvec2_to_uint(
    454             i2u(f2i(round_even(mul(clamp(vec2_rval,
    455                                          constant(-1.0f),
    456                                          constant(1.0f)),
    457                                    constant(32767.0f))))));
    458 
    459       assert(result->type == glsl_type::uint_type);
    460       return result;
    461    }
    462 
    463    /**
    464     * \brief Lower a packSnorm4x8 expression.
    465     *
    466     * \param vec4_rval is packSnorm4x8's input
    467     * \return packSnorm4x8's output as a uint rvalue
    468     */
    469    ir_rvalue*
    470    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
    471    {
    472       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
    473        *
    474        *    highp uint packSnorm4x8(vec4 v)
    475        *    -------------------------------
    476        *    First, converts each component of the normalized floating-point value
    477        *    v into 8-bit integer values. Then, the results are packed into the
    478        *    returned 32-bit unsigned integer.
    479        *
    480        *    The conversion for component c of v to fixed point is done as
    481        *    follows:
    482        *
    483        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
    484        *
    485        *    The first component of the vector will be written to the least
    486        *    significant bits of the output; the last component will be written to
    487        *    the most significant bits.
    488        *
    489        * This function generates IR that approximates the following pseudo-GLSL:
    490        *
    491        *     return pack_uvec4_to_uint(
    492        *         uvec4(ivec4(
    493        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
    494        *
    495        * It is necessary to first convert the vec4 to ivec4 rather than directly
    496        * converting vec4 to uvec4 because the latter conversion is undefined.
    497        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
    498        * convert a negative floating point value to an uint".
    499        */
    500       assert(vec4_rval->type == glsl_type::vec4_type);
    501 
    502       ir_rvalue *result = pack_uvec4_to_uint(
    503             i2u(f2i(round_even(mul(clamp(vec4_rval,
    504                                          constant(-1.0f),
    505                                          constant(1.0f)),
    506                                    constant(127.0f))))));
    507 
    508       assert(result->type == glsl_type::uint_type);
    509       return result;
    510    }
    511 
    512    /**
    513     * \brief Lower an unpackSnorm2x16 expression.
    514     *
    515     * \param uint_rval is unpackSnorm2x16's input
    516     * \return unpackSnorm2x16's output as a vec2 rvalue
    517     */
    518    ir_rvalue*
    519    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
    520    {
    521       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
    522        *
    523        *    highp vec2 unpackSnorm2x16 (highp uint p)
    524        *    -----------------------------------------
    525        *    First, unpacks a single 32-bit unsigned integer p into a pair of
    526        *    16-bit unsigned integers. Then, each component is converted to
    527        *    a normalized floating-point value to generate the returned
    528        *    two-component vector.
    529        *
    530        *    The conversion for unpacked fixed-point value f to floating point is
    531        *    done as follows:
    532        *
    533        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
    534        *
    535        *    The first component of the returned vector will be extracted from the
    536        *    least significant bits of the input; the last component will be
    537        *    extracted from the most significant bits.
    538        *
    539        * This function generates IR that approximates the following pseudo-GLSL:
    540        *
    541        *    return clamp(
    542        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
    543        *       -1.0f, 1.0f);
    544        *
    545        * The above IR may appear unnecessarily complex, but the intermediate
    546        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
    547        * negative floats.
    548        *
    549        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
    550        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
    551        * place that int16 into an int32, which results in the *positive* integer
    552        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
    553        * unimportant bit 16. We must now extend the int16's sign bit into bits
    554        * 17-32, which is accomplished by left-shifting then right-shifting.
    555        */
    556 
    557       assert(uint_rval->type == glsl_type::uint_type);
    558 
    559       ir_rvalue *result =
    560         clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
    561                   constant(32767.0f)),
    562               constant(-1.0f),
    563               constant(1.0f));
    564 
    565       assert(result->type == glsl_type::vec2_type);
    566       return result;
    567    }
    568 
    569    /**
    570     * \brief Lower an unpackSnorm4x8 expression.
    571     *
    572     * \param uint_rval is unpackSnorm4x8's input
    573     * \return unpackSnorm4x8's output as a vec4 rvalue
    574     */
    575    ir_rvalue*
    576    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
    577    {
    578       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
    579        *
    580        *    highp vec4 unpackSnorm4x8 (highp uint p)
    581        *    ----------------------------------------
    582        *    First, unpacks a single 32-bit unsigned integer p into four
    583        *    8-bit unsigned integers. Then, each component is converted to
    584        *    a normalized floating-point value to generate the returned
    585        *    four-component vector.
    586        *
    587        *    The conversion for unpacked fixed-point value f to floating point is
    588        *    done as follows:
    589        *
    590        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
    591        *
    592        *    The first component of the returned vector will be extracted from the
    593        *    least significant bits of the input; the last component will be
    594        *    extracted from the most significant bits.
    595        *
    596        * This function generates IR that approximates the following pseudo-GLSL:
    597        *
    598        *    return clamp(
    599        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
    600        *       -1.0f, 1.0f);
    601        *
    602        * The above IR may appear unnecessarily complex, but the intermediate
    603        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
    604        * negative floats.
    605        *
    606        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
    607        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
    608        * place that int8 into an int32, which results in the *positive* integer
    609        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
    610        * unimportant bit 8. We must now extend the int8's sign bit into bits
    611        * 9-32, which is accomplished by left-shifting then right-shifting.
    612        */
    613 
    614       assert(uint_rval->type == glsl_type::uint_type);
    615 
    616       ir_rvalue *result =
    617         clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
    618                   constant(127.0f)),
    619               constant(-1.0f),
    620               constant(1.0f));
    621 
    622       assert(result->type == glsl_type::vec4_type);
    623       return result;
    624    }
    625 
    626    /**
    627     * \brief Lower a packUnorm2x16 expression.
    628     *
    629     * \param vec2_rval is packUnorm2x16's input
    630     * \return packUnorm2x16's output as a uint rvalue
    631     */
    632    ir_rvalue*
    633    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
    634    {
    635       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
    636        *
    637        *    highp uint packUnorm2x16 (vec2 v)
    638        *    ---------------------------------
    639        *    First, converts each component of the normalized floating-point value
    640        *    v into 16-bit integer values. Then, the results are packed into the
    641        *    returned 32-bit unsigned integer.
    642        *
    643        *    The conversion for component c of v to fixed point is done as
    644        *    follows:
    645        *
    646        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
    647        *
    648        *    The first component of the vector will be written to the least
    649        *    significant bits of the output; the last component will be written to
    650        *    the most significant bits.
    651        *
    652        * This function generates IR that approximates the following pseudo-GLSL:
    653        *
    654        *     return pack_uvec2_to_uint(uvec2(
    655        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
    656        *
    657        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
    658        * has been clamped to a non-negative range.
    659        */
    660 
    661       assert(vec2_rval->type == glsl_type::vec2_type);
    662 
    663       ir_rvalue *result = pack_uvec2_to_uint(
    664          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
    665 
    666       assert(result->type == glsl_type::uint_type);
    667       return result;
    668    }
    669 
    670    /**
    671     * \brief Lower a packUnorm4x8 expression.
    672     *
    673     * \param vec4_rval is packUnorm4x8's input
    674     * \return packUnorm4x8's output as a uint rvalue
    675     */
    676    ir_rvalue*
    677    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
    678    {
    679       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
    680        *
    681        *    highp uint packUnorm4x8 (vec4 v)
    682        *    --------------------------------
    683        *    First, converts each component of the normalized floating-point value
    684        *    v into 8-bit integer values. Then, the results are packed into the
    685        *    returned 32-bit unsigned integer.
    686        *
    687        *    The conversion for component c of v to fixed point is done as
    688        *    follows:
    689        *
    690        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
    691        *
    692        *    The first component of the vector will be written to the least
    693        *    significant bits of the output; the last component will be written to
    694        *    the most significant bits.
    695        *
    696        * This function generates IR that approximates the following pseudo-GLSL:
    697        *
    698        *     return pack_uvec4_to_uint(uvec4(
    699        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
    700        *
    701        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
    702        * has been clamped to a non-negative range.
    703        */
    704 
    705       assert(vec4_rval->type == glsl_type::vec4_type);
    706 
    707       ir_rvalue *result = pack_uvec4_to_uint(
    708          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
    709 
    710       assert(result->type == glsl_type::uint_type);
    711       return result;
    712    }
    713 
    714    /**
    715     * \brief Build the lowered form of an unpackUnorm2x16 expression.
    716     *
    717     * \param uint_rval the uint operand of unpackUnorm2x16
    718     * \return a vec2 rvalue holding unpackUnorm2x16's result
    719     */
    720    ir_rvalue*
    721    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
    722    {
    723       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    724        *
    725        *    highp vec2 unpackUnorm2x16 (highp uint p)
    726        *    -----------------------------------------
    727        *    First, unpacks a single 32-bit unsigned integer p into a pair of
    728        *    16-bit unsigned integers. Then, each component is converted to
    729        *    a normalized floating-point value to generate the returned
    730        *    two-component vector.
    731        *
    732        *    The conversion for unpacked fixed-point value f to floating point is
    733        *    done as follows:
    734        *
    735        *       unpackUnorm2x16: f / 65535.0
    736        *
    737        *    The first component of the returned vector will be extracted from the
    738        *    least significant bits of the input; the last component will be
    739        *    extracted from the most significant bits.
    740        *
    741        * The IR built below approximates this pseudo-GLSL:
    742        *
    743        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
    744        */
    745 
    746       assert(uint_rval->type == glsl_type::uint_type);
    747 
    748       ir_rvalue *halves = u2f(unpack_uint_to_uvec2(uint_rval));
    749       ir_rvalue *normalized = div(halves, constant(65535.0f));
    750 
    751       assert(normalized->type == glsl_type::vec2_type);
    752       return normalized;
    753    }
    754 
    755    /**
    756     * \brief Build the lowered form of an unpackUnorm4x8 expression.
    757     *
    758     * \param uint_rval the uint operand of unpackUnorm4x8
    759     * \return a vec4 rvalue holding unpackUnorm4x8's result
    760     */
    761    ir_rvalue*
    762    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
    763    {
    764       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
    765        *
    766        *    highp vec4 unpackUnorm4x8 (highp uint p)
    767        *    ----------------------------------------
    768        *    First, unpacks a single 32-bit unsigned integer p into four
    769        *    8-bit unsigned integers. Then, each component is converted to
    770        *    a normalized floating-point value to generate the returned
    771        *    four-component vector.
    772        *
    773        *    The conversion for unpacked fixed-point value f to floating point is
    774        *    done as follows:
    775        *
    776        *       unpackUnorm4x8: f / 255.0
    777        *
    778        *    The first component of the returned vector will be extracted from the
    779        *    least significant bits of the input; the last component will be
    780        *    extracted from the most significant bits.
    781        *
    782        * The IR built below approximates this pseudo-GLSL:
    783        *
    784        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
    785        */
    786 
    787       assert(uint_rval->type == glsl_type::uint_type);
    788 
    789       ir_rvalue *bytes = u2f(unpack_uint_to_uvec4(uint_rval));
    790       ir_rvalue *normalized = div(bytes, constant(255.0f));
    791 
    792       assert(normalized->type == glsl_type::vec4_type);
    793       return normalized;
    794    }
    795 
    796    /**
    797     * \brief Lower the component-wise calculation of packHalf2x16.
    798     *
    799     * \param f_rval is one component of packHalf2x16's input
    800     * \param e_rval is the unshifted exponent bits of f_rval
    801     * \param m_rval is the unshifted mantissa bits of f_rval
    802     * \note The sign of f_rval is ignored; see "Calculation" below.
    803     * \return a uint rvalue that encodes a float16 in its lower 16 bits
    804     */
    805    ir_rvalue*
    806    pack_half_1x16_nosign(ir_rvalue *f_rval,
    807                          ir_rvalue *e_rval,
    808                          ir_rvalue *m_rval)
    809    {
    810       assert(e_rval->type == glsl_type::uint_type);
    811       assert(m_rval->type == glsl_type::uint_type);
    812 
    813       /* uint u16;  (written by exactly one branch of the if-tree below) */
    814       ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
    815                                            "tmp_pack_half_1x16_u16");
    816 
    817       /* float f = F_RVAL; */
    818       ir_variable *f = factory.make_temp(glsl_type::float_type,
    819                                           "tmp_pack_half_1x16_f");
    820       factory.emit(assign(f, f_rval));
    821 
    822       /* uint e = E_RVAL; */
    823       ir_variable *e = factory.make_temp(glsl_type::uint_type,
    824                                           "tmp_pack_half_1x16_e");
    825       factory.emit(assign(e, e_rval));
    826 
    827       /* uint m = M_RVAL; */
    828       ir_variable *m = factory.make_temp(glsl_type::uint_type,
    829                                           "tmp_pack_half_1x16_m");
    830       factory.emit(assign(m, m_rval));
    831 
    832       /* Preliminaries
    833        * -------------
    834        *
    835        * For a float16, the bit layout is:
    836        *
    837        *   sign:     15
    838        *   exponent: 10:14
    839        *   mantissa: 0:9
    840        *
    841        * Let f16 be a float16 value. The sign, exponent, and mantissa
    842        * determine its value thus:
    843        *
    844        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
    845        *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
    846        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
    847        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
    848        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
    849        *
    850        * where 0 <= m16 < 2^10.
    851        *
    852        * For a float32, the bit layout is:
    853        *
    854        *   sign:     31
    855        *   exponent: 23:30
    856        *   mantissa: 0:22
    857        *
    858        * Let f32 be a float32 value. The sign, exponent, and mantissa
    859        * determine its value thus:
    860        *
    861        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
    862        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
    863        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
    864        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
    865        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
    866        *
    867        * where 0 <= m32 < 2^23.
    868        *
    869        * The minimum and maximum normal float16 values are
    870        *
    871        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
    872        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
    873        *
    874        * The step at max_norm16 is
    875        *
    876        *   max_step16 = 2^5                                     (22)
    877        *
    878        * Observe that the float16 boundary values in equations 20-21 lie in the
    879        * range of normal float32 values.
    880        *
    881        *
    882        * Rounding Behavior
    883        * -----------------
    884        * Not all float32 values can be exactly represented as a float16. We
    885        * round all such intermediate float32 values to the nearest float16; if
    886        * the float32 is exactly between two float16 values, we round to the one
    887        * with an even mantissa. This rounding behavior has several benefits:
    888        *
    889        *   - It has no sign bias.
    890        *
    891        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
    892        *     GPU ISA.
    893        *
    894        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
    895        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
    896        *     result in the same value as if the expression were executed on the
    897        *     GPU.
    898        *
    899        * Calculation
    900        * -----------
    901        * Our task is to compute s16, e16, m16 given f32.  Since this function
    902        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
    903        * cases to consider.
    904        */
    905 
    906       factory.emit(
    907 
    908          /* Case 1) f32 is NaN
    909           *
    910           *   The resultant f16 will also be NaN.
    911           */
    912 
    913          /* if (e32 == 255 && m32 != 0) { */
    914          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
    915                            logic_not(equal(m, constant(0u)))),
    916 
    917             assign(u16, constant(0x7fffu)),
    918 
    919          /* Case 2) f32 lies in the range [0, min_norm16).
    920           *
    921           *   The resultant float16 will be either zero, subnormal, or normal.
    922           *
    923           *   Solving
    924           *
    925           *     f32 = min_norm16       (30)
    926           *
    927           *   gives
    928           *
    929           *     e32 = 113 and m32 = 0  (31)
    930           *
    931           *   Therefore this case occurs if and only if
    932           *
    933           *     e32 < 113              (32)
    934           */
    935 
    936          /* } else if (e32 < 113) { */
    937          if_tree(less(e, constant(113u << 23u)),
    938 
    939             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
    940             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
    941                                            constant((float) (1 << 24)))))),
    942 
    943          /* Case 3) f32 lies in the range
    944           *         [min_norm16, max_norm16 + max_step16).
    945           *
    946           *   The resultant float16 will be either normal or infinite.
    947           *
    948           *   Solving
    949           *
    950           *     f32 = max_norm16 + max_step16           (40)
    951           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
    952           *         = 2^16                              (42)
    953           *   gives
    954           *
    955           *     e32 = 143 and m32 = 0                   (43)
    956           *
    957           *   We already solved the boundary condition f32 = min_norm16 above
    958           *   in equation 31. Therefore this case occurs if and only if
    959           *
    960           *     113 <= e32 and e32 < 143
    961           */
    962 
    963          /* } else if (e32 < 143) { */
    964          if_tree(less(e, constant(143u << 23u)),
    965 
    966             /* The addition below handles the case where the mantissa rounds
    967              * up to 1024 and bumps the exponent.
    968              *
    969              * u16 = ((e - (112u << 23u)) >> 13u)
    970              *     + uint(round_to_even(float(m) / float(1u << 13u)));
    971              */
    972             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
    973                                    constant(13u)),
    974                             f2u(round_even(
    975                                   div(u2f(m), constant((float) (1 << 13))))))),
    976 
    977          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
    978           *
    979           *   The resultant float16 will be infinite.
    980           *
    981           *   The cases above caught all float32 values in the range
    982           *   [0, max_norm16 + max_step16), so this is the fall-through case.
    983           */
    984 
    985          /* } else { */
    986 
    987             assign(u16, constant(31u << 10u))))));
    988 
    989          /* } */
    990 
    991        return deref(u16).val;
    992    }
    993 
    994    /**
    995     * \brief Lower a packHalf2x16 expression.
    996     *
    997     * \param vec2_rval is packHalf2x16's input
    998     * \return packHalf2x16's output as a uint rvalue
    999     */
   1000    ir_rvalue*
   1001    lower_pack_half_2x16(ir_rvalue *vec2_rval)
   1002    {
   1003       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
   1004        *
   1005        *    highp uint packHalf2x16 (mediump vec2 v)
   1006        *    ----------------------------------------
   1007        *    Returns an unsigned integer obtained by converting the components of
   1008        *    a two-component floating-point vector to the 16-bit floating-point
   1009        *    representation found in the OpenGL ES Specification, and then packing
   1010        *    these two 16-bit integers into a 32-bit unsigned integer.
   1011        *
   1012        *    The first vector component specifies the 16 least-significant bits
   1013        *    of the result; the second component specifies the 16 most-significant
   1014        *    bits.
   1015        */
   1016 
   1017       assert(vec2_rval->type == glsl_type::vec2_type);
   1018 
   1019       /* vec2 f = VEC2_RVAL; */
   1020       ir_variable *f = factory.make_temp(glsl_type::vec2_type,
   1021                                          "tmp_pack_half_2x16_f");
   1022       factory.emit(assign(f, vec2_rval));
   1023 
   1024       /* uvec2 f32 = bitcast_f2u(f); */
   1025       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
   1026                                             "tmp_pack_half_2x16_f32");
   1027       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
   1028 
   1029       /* uvec2 f16;  (each component holds one float16 in its low 16 bits) */
   1030       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
   1031                                         "tmp_pack_half_2x16_f16");
   1032 
   1033       /* Get f32's unshifted exponent bits.
   1034        *
   1035        *   uvec2 e = f32 & 0x7f800000u;
   1036        */
   1037       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
   1038                                           "tmp_pack_half_2x16_e");
   1039       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
   1040 
   1041       /* Get f32's unshifted mantissa bits.
   1042        *
   1043        *   uvec2 m = f32 & 0x007fffffu;
   1044        */
   1045       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
   1046                                           "tmp_pack_half_2x16_m");
   1047       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
   1048 
   1049       /* Set f16's exponent and mantissa bits.
   1050        *
   1051        *   f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
   1052        *   f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
   1053        */
   1054       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
   1055                                                      swizzle_x(e),
   1056                                                      swizzle_x(m)),
   1057                            WRITEMASK_X));
   1058       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
   1059                                                      swizzle_y(e),
   1060                                                      swizzle_y(m)),
   1061                            WRITEMASK_Y));
   1062 
   1063       /* Set f16's sign bits (float32 bit 31 moves down to float16 bit 15).
   1064        *
   1065        *   f16 |= (f32 & (1u << 31u)) >> 16u;
   1066        */
   1067       factory.emit(
   1068          assign(f16, bit_or(f16,
   1069                             rshift(bit_and(f32, constant(1u << 31u)),
   1070                                    constant(16u)))));
   1071 
   1072 
   1073       /* return (f16.y << 16u) | f16.x; */
   1074       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
   1075                                         constant(16u)),
   1076                                  swizzle_x(f16));
   1077 
   1078       assert(result->type == glsl_type::uint_type);
   1079       return result;
   1080    }
   1081 
   1082    /**
   1083     * \brief Lower the component-wise calculation of unpackHalf2x16.
   1084     *
   1085     * Given a uint that encodes a float16 in its lower 16 bits, this function
   1086     * returns a uint that encodes a float32 with the same value. The sign bit
   1087     * of the float16 is ignored.
   1088     *
   1089     * \param e_rval is the unshifted exponent bits of a float16
   1090     * \param m_rval is the unshifted mantissa bits of a float16
   1091     * \return a uint rvalue that encodes a float32
   1092     */
   1093    ir_rvalue*
   1094    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
   1095    {
   1096       assert(e_rval->type == glsl_type::uint_type);
   1097       assert(m_rval->type == glsl_type::uint_type);
   1098 
   1099       /* uint u32;  (written by exactly one branch of the if-tree below) */
   1100       ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
   1101                                            "tmp_unpack_half_1x16_u32");
   1102 
   1103       /* uint e = E_RVAL; */
   1104       ir_variable *e = factory.make_temp(glsl_type::uint_type,
   1105                                           "tmp_unpack_half_1x16_e");
   1106       factory.emit(assign(e, e_rval));
   1107 
   1108       /* uint m = M_RVAL; */
   1109       ir_variable *m = factory.make_temp(glsl_type::uint_type,
   1110                                           "tmp_unpack_half_1x16_m");
   1111       factory.emit(assign(m, m_rval));
   1112 
   1113       /* Preliminaries
   1114        * -------------
   1115        *
   1116        * For a float16, the bit layout is:
   1117        *
   1118        *   sign:     15
   1119        *   exponent: 10:14
   1120        *   mantissa: 0:9
   1121        *
   1122        * Let f16 be a float16 value. The sign, exponent, and mantissa
   1123        * determine its value thus:
   1124        *
   1125        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
   1126        *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
   1127        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
   1128        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
   1129        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
   1130        *
   1131        * where 0 <= m16 < 2^10.
   1132        *
   1133        * For a float32, the bit layout is:
   1134        *
   1135        *   sign:     31
   1136        *   exponent: 23:30
   1137        *   mantissa: 0:22
   1138        *
   1139        * Let f32 be a float32 value. The sign, exponent, and mantissa
   1140        * determine its value thus:
   1141        *
   1142        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
   1143        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
   1144        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
   1145        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
   1146        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
   1147        *
   1148        * where 0 <= m32 < 2^23.
   1149        *
   1150        * Calculation
   1151        * -----------
   1152        * Our task is to compute s32, e32, m32 given f16.  Since this function
   1153        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
   1154        * cases to consider.
   1155        */
   1156 
   1157       factory.emit(
   1158 
   1159          /* Case 1) f16 is zero or subnormal.
   1160           *
   1161           *   The simplest method of calculating f32 in this case is
   1162           *
   1163           *     f32 = f16                       (20)
   1164           *         = 2^(-14) * (m16 / 2^10)    (21)
   1165           *         = m16 * 2^(-24)             (22)
   1166           */
   1167 
   1168          /* if (e16 == 0) { */
   1169          if_tree(equal(e, constant(0u)),
   1170 
   1171             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
   1172             assign(u32, expr(ir_unop_bitcast_f2u,
   1173                                 div(u2f(m), constant((float)(1 << 24))))),
   1174 
   1175          /* Case 2) f16 is normal.
   1176           *
   1177           *   The equation
   1178           *
   1179           *     f32 = f16                              (30)
   1180           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
   1181           *       2^(e16 - 15) * (1 + m16 / 2^10)
   1182           *
   1183           *   can be decomposed into two
   1184           *
   1185           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
   1186           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
   1187           *
   1188           *   which solve to
   1189           *
   1190           *     e32 = e16 + 112                        (34)
   1191           *     m32 = m16 * 2^13                       (35)
   1192           */
   1193 
   1194          /* } else if (e16 < 31) { */
   1195          if_tree(less(e, constant(31u << 10u)),
   1196 
   1197               /* u32 = ((e + (112 << 10)) | m) << 13;
   1198                */
   1199               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
   1200                                  constant(13u))),
   1201 
   1202 
   1203          /* Case 3) f16 is infinite:  } else if (m16 == 0) { */
   1204          if_tree(equal(m, constant(0u)),
   1205 
   1206                  assign(u32, constant(255u << 23u)),
   1207 
   1208          /* Case 4) f16 is NaN. */
   1209          /* } else { */
   1210 
   1211             assign(u32, constant(0x7fffffffu))))));
   1212 
   1213          /* } */
   1214 
   1215       return deref(u32).val;
   1216    }
   1217 
   1218    /**
   1219     * \brief Lower an unpackHalf2x16 expression.
   1220     *
   1221     * \param uint_rval is unpackHalf2x16's input
   1222     * \return unpackHalf2x16's output as a vec2 rvalue
   1223     */
   1224    ir_rvalue*
   1225    lower_unpack_half_2x16(ir_rvalue *uint_rval)
   1226    {
   1227       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
   1228        *
   1229        *    mediump vec2 unpackHalf2x16 (highp uint v)
   1230        *    ------------------------------------------
   1231        *    Returns a two-component floating-point vector with components
   1232        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
   1233        *    values, interpreting those values as 16-bit floating-point numbers
   1234        *    according to the OpenGL ES Specification, and converting them to
   1235        *    32-bit floating-point values.
   1236        *
   1237        *    The first component of the vector is obtained from the
   1238        *    16 least-significant bits of v; the second component is obtained
   1239        *    from the 16 most-significant bits of v.
   1240        */
   1241       assert(uint_rval->type == glsl_type::uint_type);
   1242 
   1243       /* uint u = RVALUE;
   1244        * uvec2 f16 = uvec2(u & 0xffffu, u >> 16u);
   1245        */
   1246       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
   1247                                             "tmp_unpack_half_2x16_f16");
   1248       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
   1249 
   1250       /* uvec2 f32;  (bit patterns of the two resulting float32 values) */
   1251       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
   1252                                             "tmp_unpack_half_2x16_f32");
   1253 
   1254       /* Get f16's unshifted exponent bits.
   1255        *
   1256        *    uvec2 e = f16 & 0x7c00u;
   1257        */
   1258       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
   1259                                           "tmp_unpack_half_2x16_e");
   1260       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
   1261 
   1262       /* Get f16's unshifted mantissa bits.
   1263        *
   1264        *    uvec2 m = f16 & 0x03ffu;
   1265        */
   1266       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
   1267                                           "tmp_unpack_half_2x16_m");
   1268       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
   1269 
   1270       /* Set f32's exponent and mantissa bits.
   1271        *
   1272        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
   1273        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
   1274        */
   1275       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
   1276                                                        swizzle_x(m)),
   1277                            WRITEMASK_X));
   1278       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
   1279                                                        swizzle_y(m)),
   1280                            WRITEMASK_Y));
   1281 
   1282       /* Set f32's sign bit (float16 bit 15 moves up to float32 bit 31).
   1283        *
   1284        *    f32 |= (f16 & 0x8000u) << 16u;
   1285        */
   1286       factory.emit(assign(f32, bit_or(f32,
   1287                                        lshift(bit_and(f16,
   1288                                                       constant(0x8000u)),
   1289                                               constant(16u)))));
   1290 
   1291       /* return bitcast_u2f(f32); */
   1292       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
   1293       assert(result->type == glsl_type::vec2_type);
   1294       return result;
   1295    }
   1296 };
   1297 
   1298 } // namespace anonymous
   1299 
   1300 /**
   1301  * \brief Run the packing-builtin lowering visitor over \p instructions.
   1302  *
   1303  * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
   1304  * \return the progress flag reported by the visitor after the walk
   1305  */
   1306 bool
   1307 lower_packing_builtins(exec_list *instructions, int op_mask)
   1308 {
   1309    lower_packing_builtins_visitor lowering(op_mask);
   1310    visit_list_elements(&lowering, instructions, true);
   1311    return lowering.get_progress();
   1312 }
   1312