Home | History | Annotate | Download | only in nir
      1 #! /usr/bin/env python
      2 #
      3 # Copyright (C) 2014 Connor Abbott
      4 #
      5 # Permission is hereby granted, free of charge, to any person obtaining a
      6 # copy of this software and associated documentation files (the "Software"),
      7 # to deal in the Software without restriction, including without limitation
      8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9 # and/or sell copies of the Software, and to permit persons to whom the
     10 # Software is furnished to do so, subject to the following conditions:
     11 #
     12 # The above copyright notice and this permission notice (including the next
     13 # paragraph) shall be included in all copies or substantial portions of the
     14 # Software.
     15 #
     16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     22 # IN THE SOFTWARE.
     23 #
     24 # Authors:
     25 #    Connor Abbott (cwabbott0 (at] gmail.com)
     26 
     27 
     28 # Class that represents all the information we have about the opcode
     29 # NOTE: this must be kept in sync with nir_op_info
     30 
     31 class Opcode(object):
     32    """Class that represents all the information we have about the opcode
     33    NOTE: this must be kept in sync with nir_op_info
     34    """
     35    def __init__(self, name, output_size, output_type, input_sizes,
     36                 input_types, algebraic_properties, const_expr):
     37       """Parameters:
     38 
     39       - name is the name of the opcode (prepend nir_op_ for the enum name)
     40       - all types are strings that get nir_type_ prepended to them
     41       - input_types is a list of types
     42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
     43         prepended before each entry
     44       - const_expr is an expression or series of statements that computes the
     45         constant value of the opcode given the constant values of its inputs.
     46 
     47       Constant expressions are formed from the variables src0, src1, ...,
     48       src(N-1), where N is the number of arguments.  The output of the
     49       expression should be stored in the dst variable.  Per-component input
     50       and output variables will be scalars and non-per-component input and
     51       output variables will be a struct with fields named x, y, z, and w
     52       all of the correct type.  Input and output variables can be assumed
     53       to already be of the correct type and need no conversion.  In
     54       particular, the conversion from the C bool type to/from  NIR_TRUE and
     55       NIR_FALSE happens automatically.
     56 
     57       For per-component instructions, the entire expression will be
     58       executed once for each component.  For non-per-component
     59       instructions, the expression is expected to store the correct values
     60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
     61       constant expression, an assignment to dst will happen automatically
     62       and the result will be equivalent to "dst = <expression>" for
     63       per-component instructions and "dst.x = dst.y = ... = <expression>"
     64       for non-per-component instructions.
     65       """
     66       assert isinstance(name, str)
     67       assert isinstance(output_size, int)
     68       assert isinstance(output_type, str)
     69       assert isinstance(input_sizes, list)
     70       assert isinstance(input_sizes[0], int)
     71       assert isinstance(input_types, list)
     72       assert isinstance(input_types[0], str)
     73       assert isinstance(algebraic_properties, str)
     74       assert isinstance(const_expr, str)
     75       assert len(input_sizes) == len(input_types)
     76       assert 0 <= output_size <= 4
     77       for size in input_sizes:
     78          assert 0 <= size <= 4
     79          if output_size != 0:
     80             assert size != 0
     81       self.name = name
     82       self.num_inputs = len(input_sizes)
     83       self.output_size = output_size
     84       self.output_type = output_type
     85       self.input_sizes = input_sizes
     86       self.input_types = input_types
     87       self.algebraic_properties = algebraic_properties
     88       self.const_expr = const_expr
     89 
     90 # helper variables for strings
     91 tfloat = "float"
     92 tint = "int"
     93 tbool = "bool32"
     94 tuint = "uint"
     95 tfloat32 = "float32"
     96 tint32 = "int32"
     97 tuint32 = "uint32"
     98 tuint64 = "uint64"
     99 tfloat64 = "float64"
    100 
    101 commutative = "commutative "
    102 associative = "associative "
    103 
    104 # global dictionary of opcodes
    105 opcodes = {}
    106 
    107 def opcode(name, output_size, output_type, input_sizes, input_types,
    108            algebraic_properties, const_expr):
    109    assert name not in opcodes
    110    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
    111                           input_types, algebraic_properties, const_expr)
    112 
    113 def unop_convert(name, out_type, in_type, const_expr):
    114    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
    115 
    116 def unop(name, ty, const_expr):
    117    opcode(name, 0, ty, [0], [ty], "", const_expr)
    118 
    119 def unop_horiz(name, output_size, output_type, input_size, input_type,
    120                const_expr):
    121    opcode(name, output_size, output_type, [input_size], [input_type], "",
    122           const_expr)
    123 
    124 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
    125                 reduce_expr, final_expr):
    126    def prereduce(src):
    127       return "(" + prereduce_expr.format(src=src) + ")"
    128    def final(src):
    129       return final_expr.format(src="(" + src + ")")
    130    def reduce_(src0, src1):
    131       return reduce_expr.format(src0=src0, src1=src1)
    132    src0 = prereduce("src0.x")
    133    src1 = prereduce("src0.y")
    134    src2 = prereduce("src0.z")
    135    src3 = prereduce("src0.w")
    136    unop_horiz(name + "2", output_size, output_type, 2, input_type,
    137               final(reduce_(src0, src1)))
    138    unop_horiz(name + "3", output_size, output_type, 3, input_type,
    139               final(reduce_(reduce_(src0, src1), src2)))
    140    unop_horiz(name + "4", output_size, output_type, 4, input_type,
    141               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
    142 
    143 
    144 # These two move instructions differ in what modifiers they support and what
    145 # the negate modifier means. Otherwise, they are identical.
    146 unop("fmov", tfloat, "src0")
    147 unop("imov", tint, "src0")
    148 
    149 unop("ineg", tint, "-src0")
    150 unop("fneg", tfloat, "-src0")
    151 unop("inot", tint, "~src0") # invert every bit of the integer
    152 unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
    153                       "((src0 == 0.0f) ? 1.0f : 0.0f)"))
    154 unop("fsign", tfloat, ("bit_size == 64 ? " +
    155                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
    156                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
    157 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
    158 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
    159 unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
    160 unop("fsat", tfloat, ("bit_size == 64 ? " +
    161                       "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
    162                       "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
    163 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
    164 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
    165 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
    166 unop("fexp2", tfloat, "exp2f(src0)")
    167 unop("flog2", tfloat, "log2f(src0)")
# Numeric conversions between fixed-size types.  The constant expression is
# just the source value.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean and double-to-boolean conversion: true when non-zero
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# Conversions between double and single precision
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
    187 
# Unary floating-point rounding operations.  Each picks the double or the
# float variant of the C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of the
# source; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.  All variants fold to zero for constant inputs.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
    215 
    216 
    217 # Floating point pack and unpack operations.
    218 
    219 def pack_2x16(fmt):
    220    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
    221 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
    222 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
    223 """.replace("fmt", fmt))
    224 
    225 def pack_4x8(fmt):
    226    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
    227 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
    228 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
    229 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
    230 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
    231 """.replace("fmt", fmt))
    232 
    233 def unpack_2x16(fmt):
    234    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
    235 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
    236 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
    237 """.replace("fmt", fmt))
    238 
    239 def unpack_4x8(fmt):
    240    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
    241 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
    242 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
    243 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
    244 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
    245 """.replace("fmt", fmt))
    246 
    247 
# Register the pack opcodes: snorm and unorm in both 2x16 and 4x8 layouts,
# half in the 2x16 layout only.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
# Register the matching unpack opcodes.
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
    258 
# Packs the low 16 bits of src0.x together with src0.y shifted into the
# upper 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# ORs the four components together after shifting them left by 0, 8, 16 and
# 24 bits.  The components are not masked before being combined.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Builds a 64-bit value with src0.x in the low 32 bits and src0.y in the
# high 32 bits.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Splits a 64-bit value: dst.x receives src0.x itself and dst.y receives
# src0.x shifted right by 32, each stored into a uint32 component.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.  Each produces one of the two
# halves on its own: _x uses the low 16 bits, _y the high 16 bits.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Same idea for a 64-bit source: _x yields the source value as a uint32 and
# _y yields the source shifted right by 32.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
    286 
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits: bit N of the source becomes bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts how many of the 32 bits are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
    303 
    304 unop_convert("ufind_msb", tint32, tuint32, """
    305 dst = -1;
    306 for (int bit = 31; bit > 0; bit--) {
    307    if ((src0 >> bit) & 1) {
    308       dst = bit;
    309       break;
    310    }
    311 }
    312 """)
    313 
# Signed variant of find-MSB: scanning down from bit 31, returns the index of
# the first 1 bit for non-negative sources and of the first 0 bit for
# negative sources, or -1 if no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
    337 
    338 
    339 for i in xrange(1, 5):
    340    for j in xrange(1, 5):
    341       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
    342 
    343 def binop_convert(name, out_type, in_type, alg_props, const_expr):
    344    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
    345 
    346 def binop(name, ty, alg_props, const_expr):
    347    binop_convert(name, ty, ty, alg_props, const_expr)
    348 
    349 def binop_compare(name, ty, alg_props, const_expr):
    350    binop_convert(name, tbool, ty, alg_props, const_expr)
    351 
    352 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
    353                 src2_type, const_expr):
    354    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
    355           "", const_expr)
    356 
    357 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
    358                  reduce_expr, final_expr):
    359    def final(src):
    360       return final_expr.format(src= "(" + src + ")")
    361    def reduce_(src0, src1):
    362       return reduce_expr.format(src0=src0, src1=src1)
    363    def prereduce(src0, src1):
    364       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    365    src0 = prereduce("src0.x", "src1.x")
    366    src1 = prereduce("src0.y", "src1.y")
    367    src2 = prereduce("src0.z", "src1.z")
    368    src3 = prereduce("src0.w", "src1.w")
    369    opcode(name + "2", output_size, output_type,
    370           [2, 2], [src_type, src_type], commutative,
    371           final(reduce_(src0, src1)))
    372    opcode(name + "3", output_size, output_type,
    373           [3, 3], [src_type, src_type], commutative,
    374           final(reduce_(reduce_(src0, src1), src2)))
    375    opcode(name + "4", output_size, output_type,
    376           [4, 4], [src_type, src_type], commutative,
    377           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
    378 
# Addition and subtraction.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Division.  Unlike the mod/rem opcodes below, these constant expressions do
# not special-case a zero divisor.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  (The declared result type is uint, not bool.)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  (The declared result type is uint, not
# bool.)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned modulus; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html
#
# Both yield 0 for a zero divisor.  imod adds src1 to the C remainder when
# the remainder is non-zero and the operands' signs differ.

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# Floating-point counterparts: fmod rounds the quotient down (floorf) while
# frem rounds it toward zero (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
    424 
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each registers 2-, 3- and 4-component variants.  The "all equal" forms AND
# the per-component equalities together; the "any not equal" forms OR the
# per-component inequalities together.

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
    468 
    469 
# Shifts.  ishr shifts a signed value and ushr an unsigned value.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products in 2-, 3- and 4-component variants: component-wise products
# summed together.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot, but registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Dot product of a 3-component src0 with the first three components of a
# 4-component src1, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  The float forms carry no algebraic properties; the
# integer forms are marked commutative and associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
    514 
# The following opcodes treat each 32-bit operand as four 8-bit channels and
# loop over the channels at bit offsets 0, 8, 16 and 24.

# Saturated vector add for 4 8bit ints.  Each channel sum is capped at 0xff.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.  A channel stays 0 unless the
# src0 channel is larger than the src1 channel.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.  The division is an integer division.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
    559 
    560 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
    561 
# Packs two separate float32 sources as halves into one uint32: src0.x goes
# into the low 16 bits and src1.x into the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Builds a 64-bit value with src0 in the low 32 bits and src1 in the high
# 32 bits.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# The constant expression yields 0 for arguments outside the accepted range
# and otherwise a run of `bits` one-bits shifted left by `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# Scales src0 by two to the power src1 using the C ldexp/ldexpf functions.
# Any result for which isnormal() is false is replaced by a zero carrying
# the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects the byte, counted from the least significant
# end; the narrowing cast picks the unsigned or signed interpretation.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: as above, for 16-bit words.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
    600 
    601 
    602 def triop(name, ty, const_expr):
    603    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
    604 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    605    opcode(name, output_size, tuint,
    606    [src1_size, src2_size, src3_size],
    607    [tuint, tuint, tuint], "", const_expr)
    608 
# Multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with src2 as the weight of src1.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
    623 
# SM5 bfi assembly
#
# src0 is a mask, src1 the value to insert and src2 the base.  A zero mask
# returns the base unchanged.  Otherwise the insert value is shifted left by
# the number of trailing zero bits in the mask and merged into the base under
# the mask.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# Extract `bits` bits of src0 starting at `offset`.  A zero width or a
# negative argument gives 0.  When offset + bits reaches 32 or more, the
# result is simply the base shifted right by offset.  ubfe works on an
# unsigned base and ibfe on a signed one.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
    668 
# GLSL bitfieldExtract()
#
# Unsigned form: shifts the base right by `offset` and keeps the low `bits`
# bits.  A zero width, a negative argument, or offset + bits > 32 gives 0.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
    682 opcode("ibitfield_extract", 0, tint32,
    683        [0, 0, 0], [tint32, tint32, tint32], "", """
    684 int base = src0;
    685 int offset = src1, bits = src2;
    686 if (bits == 0) {
    687    dst = 0;
    688 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
    689    dst = 0;
    690 } else {
    691    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
    692 }
    693 """)
    694 
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
    702 
    703 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
    704                  src4_size, const_expr):
    705    opcode(name, output_size, tuint,
    706           [src1_size, src2_size, src3_size, src4_size],
    707           [tuint, tuint, tuint, tuint],
    708           "", const_expr)
    709 
    710 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
    711        [tuint32, tuint32, tint32, tint32], "", """
    712 unsigned base = src0, insert = src1;
    713 int offset = src2, bits = src3;
    714 if (bits == 0) {
    715    dst = 0;
    716 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
    717    dst = 0;
    718 } else {
    719    unsigned mask = ((1ull << bits) - 1) << offset;
    720    dst = (base & ~mask) | ((insert << bits) & mask);
    721 }
    722 """)
    723 
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
    730 
    731 
    732