#! /usr/bin/env python
#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0 (at) gmail.com)


# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments. The output of the
        expression should be stored in the dst variable. Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type. Input and output variables can be assumed
        to already be of the correct type and need no conversion. In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component. For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Validate every entry, not just the first one.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(ty, str) for ty in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # An opcode with a sized output may not take unsized (0) inputs.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tuint64 = "uint64"
tfloat64 = "float64"

commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Create an Opcode and register it in the global opcodes dictionary.

    The name must not have been registered before.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, algebraic_properties, const_expr)

def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-input opcode (sizes 0) whose output type differs
    from its input type."""
    opcode(name, 0, out_type, [0], [in_type], "", const_expr)

def unop(name, ty, const_expr):
    """Register a one-input opcode (sizes 0) whose input and output share
    the type ty."""
    opcode(name, 0, ty, [0], [ty], "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-input opcode with explicit output and input sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type], "",
           const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register name2, name3 and name4: reductions over a 2-, 3- and
    4-component input.

    prereduce_expr is formatted with {src} for each component, reduce_expr
    combines two partial results ({src0}, {src1}) and final_expr is applied
    to the fully reduced value ({src}).
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))


# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# double-to-float conversion
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components packed into the low
    and high 16 bits of one uint32."""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components packed into the four
    bytes of one uint32, x in the lowest byte."""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: the inverse of pack_<fmt>_2x16."""
    # The second component lives in the high 16 bits (see pack_2x16), so it
    # has to be shifted *down* before being truncated to uint16_t.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: the inverse of pack_<fmt>_4x8."""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# The scan has to include bit 0, otherwise an input of 1 would yield -1
# ("no bit set") instead of 0.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
       (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")


# range() rather than xrange() so the script runs on Python 2 and Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")

def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-input opcode (sizes 0) whose inputs share in_type and
    whose output is out_type."""
    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
    """Register a two-input opcode whose inputs and output all have type ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-input comparison on type ty that produces a tbool."""
    binop_convert(name, tbool, ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-input opcode with explicit sizes and types and no
    algebraic properties."""
    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
           "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register name2, name3 and name4: commutative reductions over pairs of
    2-, 3- and 4-component inputs.

    prereduce_expr is formatted with the matching components of both inputs
    ({src0}, {src1}), reduce_expr combines two partial results and
    final_expr is applied to the fully reduced value ({src}).
    """
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    def prereduce(src0, src1):
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# Double-precision pow() for 64-bit operands and powf() otherwise, matching
# the bit_size selection used by the other floating-point opcodes.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
    """Register a three-input opcode (sizes 0) whose inputs and output all
    have type ty."""
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)

def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-input tuint opcode with explicit output and input
    sizes."""
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size],
           [tuint, tuint, tuint], "", const_expr)

triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; shifting back by
# (32 - bits) then lands the field in the low bits with its sign extended,
# the same scheme ibfe uses.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-input tuint opcode with explicit output and input
    sizes."""
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size, src4_size],
           [tuint, tuint, tuint, tuint],
           "", const_expr)

# GLSL bitfieldInsert(): inserting zero bits leaves base untouched, and the
# low `bits` bits of insert are placed at bit position `offset`, so insert
# has to be shifted by offset (the same amount as the mask).
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")