/*
 * Copyright 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file lower_instructions.cpp
 *
 * Many GPUs lack native instructions for certain expression operations, and
 * must replace them with some other expression tree. This pass lowers some
 * of the most common cases, allowing the lowering code to be implemented once
 * rather than in each driver backend.
 *
 * Currently supported transformations:
 * - SUB_TO_ADD_NEG
 * - FDIV_TO_MUL_RCP and DDIV_TO_MUL_RCP (DIV_TO_MUL_RCP sets both)
 * - INT_DIV_TO_MUL_RCP
 * - EXP_TO_EXP2
 * - POW_TO_EXP2
 * - LOG_TO_LOG2
 * - MOD_TO_FLOOR
 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
 * - CARRY_TO_ARITH
 * - BORROW_TO_ARITH
 * - SAT_TO_CLAMP
 * - DOPS_TO_DFRAC
 *
 * SUB_TO_ADD_NEG:
 * ---------------
 * Breaks an ir_binop_sub expression down to add(op0, neg(op1)).
 *
 * This simplifies expression reassociation, and for many backends
 * there is no subtract operation separate from adding the negation.
 * Backends with native subtract operations will probably want to
 * recognize add(op0, neg(op1)) or the other way around to produce a
 * subtract anyway.
 *
 * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
 * ---------------------------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
 * but they do have an RCP instruction to compute an approximate
 * reciprocal. By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
 * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
 * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
 * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
 * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from
 * floating point so that RCP is possible.
 *
 * EXP_TO_EXP2 and LOG_TO_LOG2:
 * ----------------------------
 * Many GPUs don't have a base e log or exponent instruction, but they
 * do have base 2 versions, so this pass converts exp and log to exp2
 * and log2 operations.
 *
 * POW_TO_EXP2:
 * ------------
 * Many older GPUs don't have an x**y instruction. For these GPUs, convert
 * x**y to 2**(y * log2(x)).
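 *
 * As a GLSL-level illustration only (a sketch of the net effect; the pass
 * rewrites IR expression trees directly, not shader source), the scalar
 * rewrites above amount to:
 *
 *   a - b      =>  a + (-b)                    (SUB_TO_ADD_NEG)
 *   a / b      =>  a * (1.0 / b)               (FDIV_TO_MUL_RCP)
 *   exp(x)     =>  exp2(x * M_LOG2E)           (EXP_TO_EXP2)
 *   log(x)     =>  log2(x) * (1.0 / M_LOG2E)   (LOG_TO_LOG2)
 *   pow(x, y)  =>  exp2(y * log2(x))           (POW_TO_EXP2)
 *
 * where M_LOG2E is the C math constant log2(e) used by the implementations
 * below.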
 *
 * MOD_TO_FLOOR:
 * -------------
 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1)).
 *
 * Many GPUs don't have a MOD instruction (945 and 965 included), and
 * if we have to break it down like this anyway, it gives an
 * opportunity to do things like constant fold the (1.0 / op1) easily.
 *
 * Note: this was previously implemented as op1 * fract(op0 / op1), but that
 * implementation had significant precision errors.
 *
 * LDEXP_TO_ARITH:
 * ---------------
 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
 *
 * DFREXP_DLDEXP_TO_ARITH:
 * -----------------------
 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
 * arithmetic and bit ops for double arguments.
 *
 * CARRY_TO_ARITH:
 * ---------------
 * Converts ir_binop_carry into (x + y) < x.
 *
 * BORROW_TO_ARITH:
 * ----------------
 * Converts ir_binop_borrow into (x < y).
 *
 * SAT_TO_CLAMP:
 * -------------
 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0).
 *
 * DOPS_TO_DFRAC:
 * --------------
 * Converts double trunc, ceil, floor, and round_even into sequences built
 * on fract, and lowers double sign to a csel sequence.
 */

#include "c99_math.h"
#include "program/prog_instruction.h" /* for swizzle */
#include "compiler/glsl_types.h"
#include "ir.h"
#include "ir_builder.h"
#include "ir_optimization.h"

using namespace ir_builder;

namespace {

class lower_instructions_visitor : public ir_hierarchical_visitor {
public:
   lower_instructions_visitor(unsigned lower)
      : progress(false), lower(lower) { }

   ir_visitor_status visit_leave(ir_expression *);

   bool progress;

private:
   unsigned lower; /**< Bitfield of which operations to lower */

   void sub_to_add_neg(ir_expression *);
   void div_to_mul_rcp(ir_expression *);
   void int_div_to_mul_rcp(ir_expression *);
   void mod_to_floor(ir_expression *);
   void exp_to_exp2(ir_expression *);
   void pow_to_exp2(ir_expression *);
   void log_to_log2(ir_expression *);
   void ldexp_to_arith(ir_expression *);
   void dldexp_to_arith(ir_expression *);
   void dfrexp_sig_to_arith(ir_expression *);
   void dfrexp_exp_to_arith(ir_expression *);
   void carry_to_arith(ir_expression *);
   void borrow_to_arith(ir_expression *);
   void sat_to_clamp(ir_expression *);
   void double_dot_to_fma(ir_expression *);
   void double_lrp(ir_expression *);
   void dceil_to_dfrac(ir_expression *);
   void dfloor_to_dfrac(ir_expression *);
   void dround_even_to_dfrac(ir_expression *);
   void dtrunc_to_dfrac(ir_expression *);
   void dsign_to_csel(ir_expression *);
   void bit_count_to_math(ir_expression *);
   void extract_to_shifts(ir_expression *);
   void insert_to_shifts(ir_expression *);
   void reverse_to_shifts(ir_expression *ir);
   void find_lsb_to_float_cast(ir_expression *ir);
   void find_msb_to_float_cast(ir_expression *ir);
   void imul_high_to_mul(ir_expression *ir);
   void sqrt_to_abs_sqrt(ir_expression *ir);

   ir_expression *_carry(operand a, operand b);
};

} /* anonymous namespace */

/**
 * Determine if a particular type of lowering should occur
 */
#define lowering(x) (this->lower & x)

bool
lower_instructions(exec_list *instructions, unsigned what_to_lower)
{
   lower_instructions_visitor v(what_to_lower);

   visit_list_elements(&v, instructions);
   return v.progress;
}

void
lower_instructions_visitor::sub_to_add_neg(ir_expression *ir) 194 { 195 ir->operation = ir_binop_add; 196 ir->init_num_operands(); 197 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type, 198 ir->operands[1], NULL); 199 this->progress = true; 200 } 201 202 void 203 lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir) 204 { 205 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double()); 206 207 /* New expression for the 1.0 / op1 */ 208 ir_rvalue *expr; 209 expr = new(ir) ir_expression(ir_unop_rcp, 210 ir->operands[1]->type, 211 ir->operands[1]); 212 213 /* op0 / op1 -> op0 * (1.0 / op1) */ 214 ir->operation = ir_binop_mul; 215 ir->init_num_operands(); 216 ir->operands[1] = expr; 217 218 this->progress = true; 219 } 220 221 void 222 lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir) 223 { 224 assert(ir->operands[1]->type->is_integer()); 225 226 /* Be careful with integer division -- we need to do it as a 227 * float and re-truncate, since rcp(n > 1) of an integer would 228 * just be 0. 229 */ 230 ir_rvalue *op0, *op1; 231 const struct glsl_type *vec_type; 232 233 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 234 ir->operands[1]->type->vector_elements, 235 ir->operands[1]->type->matrix_columns); 236 237 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) 238 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL); 239 else 240 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL); 241 242 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL); 243 244 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 245 ir->operands[0]->type->vector_elements, 246 ir->operands[0]->type->matrix_columns); 247 248 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) 249 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL); 250 else 251 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL); 252 253 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 254 ir->type->vector_elements, 255 ir->type->matrix_columns); 256 257 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1); 258 259 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) { 260 ir->operation = ir_unop_f2i; 261 ir->operands[0] = op0; 262 } else { 263 ir->operation = ir_unop_i2u; 264 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0); 265 } 266 ir->init_num_operands(); 267 ir->operands[1] = NULL; 268 269 this->progress = true; 270 } 271 272 void 273 lower_instructions_visitor::exp_to_exp2(ir_expression *ir) 274 { 275 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E)); 276 277 ir->operation = ir_unop_exp2; 278 ir->init_num_operands(); 279 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type, 280 ir->operands[0], log2_e); 281 this->progress = true; 282 } 283 284 void 285 lower_instructions_visitor::pow_to_exp2(ir_expression *ir) 286 { 287 ir_expression *const log2_x = 288 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 289 ir->operands[0]); 290 291 ir->operation = ir_unop_exp2; 292 ir->init_num_operands(); 293 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type, 294 ir->operands[1], log2_x); 295 ir->operands[1] = NULL; 296 this->progress = true; 297 } 298 299 void 300 lower_instructions_visitor::log_to_log2(ir_expression *ir) 301 { 302 ir->operation = ir_binop_mul; 303 ir->init_num_operands(); 304 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 305 ir->operands[0], NULL); 306 
ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E)); 307 this->progress = true; 308 } 309 310 void 311 lower_instructions_visitor::mod_to_floor(ir_expression *ir) 312 { 313 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x", 314 ir_var_temporary); 315 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y", 316 ir_var_temporary); 317 this->base_ir->insert_before(x); 318 this->base_ir->insert_before(y); 319 320 ir_assignment *const assign_x = 321 new(ir) ir_assignment(new(ir) ir_dereference_variable(x), 322 ir->operands[0]); 323 ir_assignment *const assign_y = 324 new(ir) ir_assignment(new(ir) ir_dereference_variable(y), 325 ir->operands[1]); 326 327 this->base_ir->insert_before(assign_x); 328 this->base_ir->insert_before(assign_y); 329 330 ir_expression *const div_expr = 331 new(ir) ir_expression(ir_binop_div, x->type, 332 new(ir) ir_dereference_variable(x), 333 new(ir) ir_dereference_variable(y)); 334 335 /* Don't generate new IR that would need to be lowered in an additional 336 * pass. 337 */ 338 if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) || 339 (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double())) 340 div_to_mul_rcp(div_expr); 341 342 ir_expression *const floor_expr = 343 new(ir) ir_expression(ir_unop_floor, x->type, div_expr); 344 345 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 346 dfloor_to_dfrac(floor_expr); 347 348 ir_expression *const mul_expr = 349 new(ir) ir_expression(ir_binop_mul, 350 new(ir) ir_dereference_variable(y), 351 floor_expr); 352 353 ir->operation = ir_binop_sub; 354 ir->init_num_operands(); 355 ir->operands[0] = new(ir) ir_dereference_variable(x); 356 ir->operands[1] = mul_expr; 357 this->progress = true; 358 } 359 360 void 361 lower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 362 { 363 /* Translates 364 * ir_binop_ldexp x exp 365 * into 366 * 367 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 368 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 369 * 370 * if (extracted_biased_exp >= 255) 371 * return x; // +/-inf, NaN 372 * 373 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 374 * 375 * if (min(resulting_biased_exp, extracted_biased_exp) < 1) 376 * resulting_biased_exp = 0; 377 * if (resulting_biased_exp >= 255 || 378 * min(resulting_biased_exp, extracted_biased_exp) < 1) { 379 * sign_mantissa &= sign_mask; 380 * } 381 * 382 * return bitcast_u2f(sign_mantissa | 383 * lshift(i2u(resulting_biased_exp), exp_shift)); 384 * 385 * which we can't actually implement as such, since the GLSL IR doesn't 386 * have vectorized if-statements. 
We actually implement it without branches 387 * using conditional-select: 388 * 389 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 390 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 391 * 392 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 393 * 394 * flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0); 395 * resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp) 396 * zero_mantissa = logic_or(flush_to_zero, 397 * gequal(resulting_biased_exp, 255)); 398 * sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa); 399 * 400 * result = sign_mantissa | 401 * lshift(i2u(resulting_biased_exp), exp_shift)); 402 * 403 * return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result)); 404 * 405 * The definition of ldexp in the GLSL spec says: 406 * 407 * "If this product is too large to be represented in the 408 * floating-point type, the result is undefined." 409 * 410 * However, the definition of ldexp in the GLSL ES spec does not contain 411 * this sentence, so we do need to handle overflow correctly. 412 * 413 * There is additional language limiting the defined range of exp, but this 414 * is merely to allow implementations that store 2^exp in a temporary 415 * variable. 416 */ 417 418 const unsigned vec_elem = ir->type->vector_elements; 419 420 /* Types */ 421 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 422 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 423 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 424 425 /* Temporary variables */ 426 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 427 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 428 ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary); 429 430 ir_variable *extracted_biased_exp = 431 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 432 ir_variable *resulting_biased_exp = 433 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 434 435 ir_variable *sign_mantissa = 436 new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary); 437 438 ir_variable *flush_to_zero = 439 new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary); 440 ir_variable *zero_mantissa = 441 new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary); 442 443 ir_instruction &i = *base_ir; 444 445 /* Copy <x> and <exp> arguments. */ 446 i.insert_before(x); 447 i.insert_before(assign(x, ir->operands[0])); 448 i.insert_before(exp); 449 i.insert_before(assign(exp, ir->operands[1])); 450 451 /* Extract the biased exponent from <x>. */ 452 i.insert_before(extracted_biased_exp); 453 i.insert_before(assign(extracted_biased_exp, 454 rshift(bitcast_f2i(abs(x)), 455 new(ir) ir_constant(23, vec_elem)))); 456 457 /* The definition of ldexp in the GLSL 4.60 spec says: 458 * 459 * "If exp is greater than +128 (single-precision) or +1024 460 * (double-precision), the value returned is undefined. If exp is less 461 * than -126 (single-precision) or -1022 (double-precision), the value 462 * returned may be flushed to zero." 463 * 464 * So we do not have to guard against the possibility of addition overflow, 465 * which could happen when exp is close to INT_MAX. Addition underflow 466 * cannot happen (the worst case is 0 + (-INT_MAX)). 
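 *
 * As a concrete example of the overflow path: for ldexp(1.0, 128) the code
 * below computes extracted_biased_exp = 127, so resulting_biased_exp =
 * min(127 + 128, 255) = 255. zero_mantissa is then true, the mantissa bits
 * are cleared, and the packed result is 0x7f800000, i.e. +INF.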
467 */ 468 i.insert_before(resulting_biased_exp); 469 i.insert_before(assign(resulting_biased_exp, 470 min2(add(extracted_biased_exp, exp), 471 new(ir) ir_constant(255, vec_elem)))); 472 473 i.insert_before(sign_mantissa); 474 i.insert_before(assign(sign_mantissa, 475 bit_and(bitcast_f2u(x), 476 new(ir) ir_constant(0x807fffffu, vec_elem)))); 477 478 /* We flush to zero if the original or resulting biased exponent is 0, 479 * indicating a +/-0.0 or subnormal input or output. 480 * 481 * The mantissa is set to 0 if the resulting biased exponent is 255, since 482 * an overflow should produce a +/-inf result. 483 * 484 * Note that NaN inputs are handled separately. 485 */ 486 i.insert_before(flush_to_zero); 487 i.insert_before(assign(flush_to_zero, 488 lequal(min2(resulting_biased_exp, 489 extracted_biased_exp), 490 ir_constant::zero(ir, ivec)))); 491 i.insert_before(assign(resulting_biased_exp, 492 csel(flush_to_zero, 493 ir_constant::zero(ir, ivec), 494 resulting_biased_exp))); 495 496 i.insert_before(zero_mantissa); 497 i.insert_before(assign(zero_mantissa, 498 logic_or(flush_to_zero, 499 equal(resulting_biased_exp, 500 new(ir) ir_constant(255, vec_elem))))); 501 i.insert_before(assign(sign_mantissa, 502 csel(zero_mantissa, 503 bit_and(sign_mantissa, 504 new(ir) ir_constant(0x80000000u, vec_elem)), 505 sign_mantissa))); 506 507 /* Don't generate new IR that would need to be lowered in an additional 508 * pass. 509 */ 510 i.insert_before(result); 511 if (!lowering(INSERT_TO_SHIFTS)) { 512 i.insert_before(assign(result, 513 bitfield_insert(sign_mantissa, 514 i2u(resulting_biased_exp), 515 new(ir) ir_constant(23u, vec_elem), 516 new(ir) ir_constant(8u, vec_elem)))); 517 } else { 518 i.insert_before(assign(result, 519 bit_or(sign_mantissa, 520 lshift(i2u(resulting_biased_exp), 521 new(ir) ir_constant(23, vec_elem))))); 522 } 523 524 ir->operation = ir_triop_csel; 525 ir->init_num_operands(); 526 ir->operands[0] = gequal(extracted_biased_exp, 527 new(ir) ir_constant(255, vec_elem)); 528 ir->operands[1] = new(ir) ir_dereference_variable(x); 529 ir->operands[2] = bitcast_u2f(result); 530 531 this->progress = true; 532 } 533 534 void 535 lower_instructions_visitor::dldexp_to_arith(ir_expression *ir) 536 { 537 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent 538 * from the significand. 
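 *
 * Roughly, each component ends up going through the following GLSL-level
 * sequence (an illustrative sketch of the per-component loop below, not the
 * literal IR that is built):
 *
 *    uvec2 parts = unpackDouble2x32(x[i]);
 *    parts.y = bitfieldInsert(parts.y, uint(resulting_biased_exp[i]), 20, 11);
 *    result[i] = packDouble2x32(parts);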
539 */ 540 541 const unsigned vec_elem = ir->type->vector_elements; 542 543 /* Types */ 544 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 545 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 546 547 /* Constants */ 548 ir_constant *zeroi = ir_constant::zero(ir, ivec); 549 550 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u); 551 552 ir_constant *exp_shift = new(ir) ir_constant(20u); 553 ir_constant *exp_width = new(ir) ir_constant(11u); 554 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem); 555 556 /* Temporary variables */ 557 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 558 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 559 560 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x", 561 ir_var_temporary); 562 563 ir_variable *extracted_biased_exp = 564 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 565 ir_variable *resulting_biased_exp = 566 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 567 568 ir_variable *is_not_zero_or_underflow = 569 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary); 570 571 ir_instruction &i = *base_ir; 572 573 /* Copy <x> and <exp> arguments. */ 574 i.insert_before(x); 575 i.insert_before(assign(x, ir->operands[0])); 576 i.insert_before(exp); 577 i.insert_before(assign(exp, ir->operands[1])); 578 579 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x); 580 if (lowering(DFREXP_DLDEXP_TO_ARITH)) 581 dfrexp_exp_to_arith(frexp_exp); 582 583 /* Extract the biased exponent from <x>. */ 584 i.insert_before(extracted_biased_exp); 585 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias))); 586 587 i.insert_before(resulting_biased_exp); 588 i.insert_before(assign(resulting_biased_exp, 589 add(extracted_biased_exp, exp))); 590 591 /* Test if result is 0.0, subnormal, or underflow by checking if the 592 * resulting biased exponent would be less than 0x1. If so, the result is 593 * 0.0 with the sign of x. (Actually, invert the conditions so that 594 * immediate values are the second arguments, which is better for i965) 595 * TODO: Implement in a vector fashion. 596 */ 597 i.insert_before(zero_sign_x); 598 for (unsigned elem = 0; elem < vec_elem; elem++) { 599 ir_variable *unpacked = 600 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 601 i.insert_before(unpacked); 602 i.insert_before( 603 assign(unpacked, 604 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 605 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)), 606 WRITEMASK_Y)); 607 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X)); 608 i.insert_before(assign(zero_sign_x, 609 expr(ir_unop_pack_double_2x32, unpacked), 610 1 << elem)); 611 } 612 i.insert_before(is_not_zero_or_underflow); 613 i.insert_before(assign(is_not_zero_or_underflow, 614 gequal(resulting_biased_exp, 615 new(ir) ir_constant(0x1, vec_elem)))); 616 i.insert_before(assign(x, csel(is_not_zero_or_underflow, 617 x, zero_sign_x))); 618 i.insert_before(assign(resulting_biased_exp, 619 csel(is_not_zero_or_underflow, 620 resulting_biased_exp, zeroi))); 621 622 /* We could test for overflows by checking if the resulting biased exponent 623 * would be greater than 0xFE. 
Turns out we don't need to because the GLSL 624 * spec says: 625 * 626 * "If this product is too large to be represented in the 627 * floating-point type, the result is undefined." 628 */ 629 630 ir_rvalue *results[4] = {NULL}; 631 for (unsigned elem = 0; elem < vec_elem; elem++) { 632 ir_variable *unpacked = 633 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 634 i.insert_before(unpacked); 635 i.insert_before( 636 assign(unpacked, 637 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 638 639 ir_expression *bfi = bitfield_insert( 640 swizzle_y(unpacked), 641 i2u(swizzle(resulting_biased_exp, elem, 1)), 642 exp_shift->clone(ir, NULL), 643 exp_width->clone(ir, NULL)); 644 645 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y)); 646 647 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 648 } 649 650 ir->operation = ir_quadop_vector; 651 ir->init_num_operands(); 652 ir->operands[0] = results[0]; 653 ir->operands[1] = results[1]; 654 ir->operands[2] = results[2]; 655 ir->operands[3] = results[3]; 656 657 /* Don't generate new IR that would need to be lowered in an additional 658 * pass. 659 */ 660 661 this->progress = true; 662 } 663 664 void 665 lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir) 666 { 667 const unsigned vec_elem = ir->type->vector_elements; 668 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 669 670 /* Double-precision floating-point values are stored as 671 * 1 sign bit; 672 * 11 exponent bits; 673 * 52 mantissa bits. 674 * 675 * We're just extracting the significand here, so we only need to modify 676 * the upper 32-bit uint. Unfortunately we must extract each double 677 * independently as there is no vector version of unpackDouble. 678 */ 679 680 ir_instruction &i = *base_ir; 681 682 ir_variable *is_not_zero = 683 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 684 ir_rvalue *results[4] = {NULL}; 685 686 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 687 i.insert_before(is_not_zero); 688 i.insert_before( 689 assign(is_not_zero, 690 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero))); 691 692 /* TODO: Remake this as more vector-friendly when int64 support is 693 * available. 694 */ 695 for (unsigned elem = 0; elem < vec_elem; elem++) { 696 ir_constant *zero = new(ir) ir_constant(0u, 1); 697 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1); 698 699 /* Exponent of double floating-point values in the range [0.5, 1.0). */ 700 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1); 701 702 ir_variable *bits = 703 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary); 704 ir_variable *unpacked = 705 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 706 707 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1); 708 709 i.insert_before(bits); 710 i.insert_before(unpacked); 711 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x))); 712 713 /* Manipulate the high uint to remove the exponent and replace it with 714 * either the default exponent or zero. 
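 *
 * That is, roughly (GLSL-level sketch of the builder calls below):
 *
 *    uint hi = unpackDouble2x32(x[i]).y;
 *    hi = (hi & 0x800fffffu)                    // keep sign and mantissa
 *       | (is_not_zero[i] ? 0x3fe00000u : 0u);  // biased exponent 0x3fe -> [0.5, 1)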
715 */ 716 i.insert_before(assign(bits, swizzle_y(unpacked))); 717 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask))); 718 i.insert_before(assign(bits, bit_or(bits, 719 csel(swizzle(is_not_zero, elem, 1), 720 exponent_value, 721 zero)))); 722 i.insert_before(assign(unpacked, bits, WRITEMASK_Y)); 723 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 724 } 725 726 /* Put the dvec back together */ 727 ir->operation = ir_quadop_vector; 728 ir->init_num_operands(); 729 ir->operands[0] = results[0]; 730 ir->operands[1] = results[1]; 731 ir->operands[2] = results[2]; 732 ir->operands[3] = results[3]; 733 734 this->progress = true; 735 } 736 737 void 738 lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir) 739 { 740 const unsigned vec_elem = ir->type->vector_elements; 741 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 742 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 743 744 /* Double-precision floating-point values are stored as 745 * 1 sign bit; 746 * 11 exponent bits; 747 * 52 mantissa bits. 748 * 749 * We're just extracting the exponent here, so we only care about the upper 750 * 32-bit uint. 751 */ 752 753 ir_instruction &i = *base_ir; 754 755 ir_variable *is_not_zero = 756 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 757 ir_variable *high_words = 758 new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 759 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 760 ir_constant *izero = new(ir) ir_constant(0, vec_elem); 761 762 ir_rvalue *absval = abs(ir->operands[0]); 763 764 i.insert_before(is_not_zero); 765 i.insert_before(high_words); 766 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 767 768 /* Extract all of the upper uints. */ 769 for (unsigned elem = 0; elem < vec_elem; elem++) { 770 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 771 772 i.insert_before(assign(high_words, 773 swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 774 1 << elem)); 775 776 } 777 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 778 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 779 780 /* For non-zero inputs, shift the exponent down and apply bias. 
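 *
 * That is, for each component (sketch):
 *
 *    exp[i] = is_not_zero[i] ? int(high_words[i] >> 20) - 1022 : 0;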
*/ 781 ir->operation = ir_triop_csel; 782 ir->init_num_operands(); 783 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 784 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 785 ir->operands[2] = izero; 786 787 this->progress = true; 788 } 789 790 void 791 lower_instructions_visitor::carry_to_arith(ir_expression *ir) 792 { 793 /* Translates 794 * ir_binop_carry x y 795 * into 796 * sum = ir_binop_add x y 797 * bcarry = ir_binop_less sum x 798 * carry = ir_unop_b2i bcarry 799 */ 800 801 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 802 ir->operation = ir_unop_i2u; 803 ir->init_num_operands(); 804 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 805 ir->operands[1] = NULL; 806 807 this->progress = true; 808 } 809 810 void 811 lower_instructions_visitor::borrow_to_arith(ir_expression *ir) 812 { 813 /* Translates 814 * ir_binop_borrow x y 815 * into 816 * bcarry = ir_binop_less x y 817 * carry = ir_unop_b2i bcarry 818 */ 819 820 ir->operation = ir_unop_i2u; 821 ir->init_num_operands(); 822 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 823 ir->operands[1] = NULL; 824 825 this->progress = true; 826 } 827 828 void 829 lower_instructions_visitor::sat_to_clamp(ir_expression *ir) 830 { 831 /* Translates 832 * ir_unop_saturate x 833 * into 834 * ir_binop_min (ir_binop_max(x, 0.0), 1.0) 835 */ 836 837 ir->operation = ir_binop_min; 838 ir->init_num_operands(); 839 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type, 840 ir->operands[0], 841 new(ir) ir_constant(0.0f)); 842 ir->operands[1] = new(ir) ir_constant(1.0f); 843 844 this->progress = true; 845 } 846 847 void 848 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 849 { 850 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 851 ir_var_temporary); 852 this->base_ir->insert_before(temp); 853 854 int nc = ir->operands[0]->type->components(); 855 for (int i = nc - 1; i >= 1; i--) { 856 ir_assignment *assig; 857 if (i == (nc - 1)) { 858 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 859 swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 860 } else { 861 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 862 swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 863 temp)); 864 } 865 this->base_ir->insert_before(assig); 866 } 867 868 ir->operation = ir_triop_fma; 869 ir->init_num_operands(); 870 ir->operands[0] = swizzle(ir->operands[0], 0, 1); 871 ir->operands[1] = swizzle(ir->operands[1], 0, 1); 872 ir->operands[2] = new(ir) ir_dereference_variable(temp); 873 874 this->progress = true; 875 876 } 877 878 void 879 lower_instructions_visitor::double_lrp(ir_expression *ir) 880 { 881 int swizval; 882 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 883 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 884 885 switch (op2->type->vector_elements) { 886 case 1: 887 swizval = SWIZZLE_XXXX; 888 break; 889 default: 890 assert(op0->type->vector_elements == op2->type->vector_elements); 891 swizval = SWIZZLE_XYZW; 892 break; 893 } 894 895 ir->operation = ir_triop_fma; 896 ir->init_num_operands(); 897 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 898 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 899 900 this->progress = true; 901 } 902 903 void 904 lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 905 { 906 /* 907 * frtemp = frac(x); 908 * temp = sub(x, frtemp); 909 * 
result = temp + ((frtemp != 0.0) ? 1.0 : 0.0); 910 */ 911 ir_instruction &i = *base_ir; 912 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 913 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 914 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 915 ir_var_temporary); 916 917 i.insert_before(frtemp); 918 i.insert_before(assign(frtemp, fract(ir->operands[0]))); 919 920 ir->operation = ir_binop_add; 921 ir->init_num_operands(); 922 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 923 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 924 925 this->progress = true; 926 } 927 928 void 929 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 930 { 931 /* 932 * frtemp = frac(x); 933 * result = sub(x, frtemp); 934 */ 935 ir->operation = ir_binop_sub; 936 ir->init_num_operands(); 937 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 938 939 this->progress = true; 940 } 941 void 942 lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 943 { 944 /* 945 * insane but works 946 * temp = x + 0.5; 947 * frtemp = frac(temp); 948 * t2 = sub(temp, frtemp); 949 * if (frac(x) == 0.5) 950 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1; 951 * else 952 * result = t2; 953 954 */ 955 ir_instruction &i = *base_ir; 956 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 957 ir_var_temporary); 958 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 959 ir_var_temporary); 960 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 961 ir_var_temporary); 962 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 963 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 964 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 965 966 i.insert_before(temp); 967 i.insert_before(assign(temp, add(ir->operands[0], p5))); 968 969 i.insert_before(frtemp); 970 i.insert_before(assign(frtemp, fract(temp))); 971 972 i.insert_before(t2); 973 i.insert_before(assign(t2, sub(temp, frtemp))); 974 975 ir->operation = ir_triop_csel; 976 ir->init_num_operands(); 977 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 978 p5->clone(ir, NULL)); 979 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 980 zero), 981 t2, 982 sub(t2, one)); 983 ir->operands[2] = new(ir) ir_dereference_variable(t2); 984 985 this->progress = true; 986 } 987 988 void 989 lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 990 { 991 /* 992 * frtemp = frac(x); 993 * temp = sub(x, frtemp); 994 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 
0 : 1; 995 */ 996 ir_rvalue *arg = ir->operands[0]; 997 ir_instruction &i = *base_ir; 998 999 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1000 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1001 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 1002 ir_var_temporary); 1003 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 1004 ir_var_temporary); 1005 1006 i.insert_before(frtemp); 1007 i.insert_before(assign(frtemp, fract(arg))); 1008 i.insert_before(temp); 1009 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 1010 1011 ir->operation = ir_triop_csel; 1012 ir->init_num_operands(); 1013 ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 1014 ir->operands[1] = new (ir) ir_dereference_variable(temp); 1015 ir->operands[2] = add(temp, 1016 csel(equal(frtemp, zero->clone(ir, NULL)), 1017 zero->clone(ir, NULL), 1018 one)); 1019 1020 this->progress = true; 1021 } 1022 1023 void 1024 lower_instructions_visitor::dsign_to_csel(ir_expression *ir) 1025 { 1026 /* 1027 * temp = x > 0.0 ? 1.0 : 0.0; 1028 * result = x < 0.0 ? -1.0 : temp; 1029 */ 1030 ir_rvalue *arg = ir->operands[0]; 1031 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1032 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1033 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 1034 1035 ir->operation = ir_triop_csel; 1036 ir->init_num_operands(); 1037 ir->operands[0] = less(arg->clone(ir, NULL), 1038 zero->clone(ir, NULL)); 1039 ir->operands[1] = neg_one; 1040 ir->operands[2] = csel(greater(arg, zero), 1041 one, 1042 zero->clone(ir, NULL)); 1043 1044 this->progress = true; 1045 } 1046 1047 void 1048 lower_instructions_visitor::bit_count_to_math(ir_expression *ir) 1049 { 1050 /* For more details, see: 1051 * 1052 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel 1053 */ 1054 const unsigned elements = ir->operands[0]->type->vector_elements; 1055 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 1056 ir_var_temporary); 1057 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 1058 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 1059 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 1060 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 1061 ir_constant *c1 = new(ir) ir_constant(1u); 1062 ir_constant *c2 = new(ir) ir_constant(2u); 1063 ir_constant *c4 = new(ir) ir_constant(4u); 1064 ir_constant *c24 = new(ir) ir_constant(24u); 1065 1066 base_ir->insert_before(temp); 1067 1068 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1069 base_ir->insert_before(assign(temp, ir->operands[0])); 1070 } else { 1071 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1072 base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 1073 } 1074 1075 /* temp = temp - ((temp >> 1) & 0x55555555u); */ 1076 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 1077 c55555555)))); 1078 1079 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 1080 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 1081 bit_and(rshift(temp, c2), 1082 c33333333->clone(ir, NULL))))); 1083 1084 /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */ 1085 ir->operation = ir_unop_u2i; 1086 ir->init_num_operands(); 1087 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 1088 c01010101), 1089 c24); 1090 1091 this->progress = 
true; 1092 } 1093 1094 void 1095 lower_instructions_visitor::extract_to_shifts(ir_expression *ir) 1096 { 1097 ir_variable *bits = 1098 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1099 1100 base_ir->insert_before(bits); 1101 base_ir->insert_before(assign(bits, ir->operands[2])); 1102 1103 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1104 ir_constant *c1 = 1105 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1106 ir_constant *c32 = 1107 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1108 ir_constant *cFFFFFFFF = 1109 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1110 1111 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1112 * we'd get a mask of 0 when bits is 32. Special case it. 1113 * 1114 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 1115 */ 1116 ir_expression *mask = csel(equal(bits, c32), 1117 cFFFFFFFF, 1118 sub(lshift(c1, bits), c1->clone(ir, NULL))); 1119 1120 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1121 * 1122 * If bits is zero, the result will be zero. 1123 * 1124 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 1125 * select as in the signed integer case. 1126 * 1127 * (value >> offset) & mask; 1128 */ 1129 ir->operation = ir_binop_bit_and; 1130 ir->init_num_operands(); 1131 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 1132 ir->operands[1] = mask; 1133 ir->operands[2] = NULL; 1134 } else { 1135 ir_constant *c0 = 1136 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 1137 ir_constant *c32 = 1138 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1139 ir_variable *temp = 1140 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 1141 1142 /* temp = 32 - bits; */ 1143 base_ir->insert_before(temp); 1144 base_ir->insert_before(assign(temp, sub(c32, bits))); 1145 1146 /* expr = value << (temp - offset)) >> temp; */ 1147 ir_expression *expr = 1148 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 1149 1150 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1151 * 1152 * If bits is zero, the result will be zero. 1153 * 1154 * Due to the (x << (y%32)) behavior mentioned before, the (value << 1155 * (32-0)) doesn't "erase" all of the data as we would like, so finish 1156 * up with: 1157 * 1158 * (bits == 0) ? 
0 : e; 1159 */ 1160 ir->operation = ir_triop_csel; 1161 ir->init_num_operands(); 1162 ir->operands[0] = equal(c0, bits); 1163 ir->operands[1] = c0->clone(ir, NULL); 1164 ir->operands[2] = expr; 1165 } 1166 1167 this->progress = true; 1168 } 1169 1170 void 1171 lower_instructions_visitor::insert_to_shifts(ir_expression *ir) 1172 { 1173 ir_constant *c1; 1174 ir_constant *c32; 1175 ir_constant *cFFFFFFFF; 1176 ir_variable *offset = 1177 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 1178 ir_variable *bits = 1179 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1180 ir_variable *mask = 1181 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 1182 1183 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1184 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 1185 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1186 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 1187 } else { 1188 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1189 1190 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1191 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1192 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1193 } 1194 1195 base_ir->insert_before(offset); 1196 base_ir->insert_before(assign(offset, ir->operands[2])); 1197 1198 base_ir->insert_before(bits); 1199 base_ir->insert_before(assign(bits, ir->operands[3])); 1200 1201 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1202 * we'd get a mask of 0 when bits is 32. Special case it. 1203 * 1204 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset; 1205 * 1206 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1207 * 1208 * The result will be undefined if offset or bits is negative, or if the 1209 * sum of offset and bits is greater than the number of bits used to 1210 * store the operand. 1211 * 1212 * Since it's undefined, there are a couple other ways this could be 1213 * implemented. The other way that was considered was to put the csel 1214 * around the whole thing: 1215 * 1216 * final_result = bits == 32 ? insert : ... 
; 1217 */ 1218 base_ir->insert_before(mask); 1219 1220 base_ir->insert_before(assign(mask, csel(equal(bits, c32), 1221 cFFFFFFFF, 1222 lshift(sub(lshift(c1, bits), 1223 c1->clone(ir, NULL)), 1224 offset)))); 1225 1226 /* (base & ~mask) | ((insert << offset) & mask) */ 1227 ir->operation = ir_binop_bit_or; 1228 ir->init_num_operands(); 1229 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 1230 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 1231 ir->operands[2] = NULL; 1232 ir->operands[3] = NULL; 1233 1234 this->progress = true; 1235 } 1236 1237 void 1238 lower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 1239 { 1240 /* For more details, see: 1241 * 1242 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 1243 */ 1244 ir_constant *c1 = 1245 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1246 ir_constant *c2 = 1247 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 1248 ir_constant *c4 = 1249 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 1250 ir_constant *c8 = 1251 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 1252 ir_constant *c16 = 1253 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 1254 ir_constant *c33333333 = 1255 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 1256 ir_constant *c55555555 = 1257 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements); 1258 ir_constant *c0F0F0F0F = 1259 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 1260 ir_constant *c00FF00FF = 1261 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 1262 ir_variable *temp = 1263 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 1264 "temp", ir_var_temporary); 1265 ir_instruction &i = *base_ir; 1266 1267 i.insert_before(temp); 1268 1269 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1270 i.insert_before(assign(temp, ir->operands[0])); 1271 } else { 1272 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1273 i.insert_before(assign(temp, i2u(ir->operands[0]))); 1274 } 1275 1276 /* Swap odd and even bits. 1277 * 1278 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 1279 */ 1280 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 1281 lshift(bit_and(temp, c55555555->clone(ir, NULL)), 1282 c1->clone(ir, NULL))))); 1283 /* Swap consecutive pairs. 1284 * 1285 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 1286 */ 1287 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 1288 lshift(bit_and(temp, c33333333->clone(ir, NULL)), 1289 c2->clone(ir, NULL))))); 1290 1291 /* Swap nibbles. 1292 * 1293 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 1294 */ 1295 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 1296 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 1297 c4->clone(ir, NULL))))); 1298 1299 /* The last step is, basically, bswap. Swap the bytes, then swap the 1300 * words. When this code is run through GCC on x86, it does generate a 1301 * bswap instruction. 
1302 * 1303 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 1304 * temp = ( temp >> 16 ) | ( temp << 16); 1305 */ 1306 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 1307 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 1308 c8->clone(ir, NULL))))); 1309 1310 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1311 ir->operation = ir_binop_bit_or; 1312 ir->init_num_operands(); 1313 ir->operands[0] = rshift(temp, c16); 1314 ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 1315 } else { 1316 ir->operation = ir_unop_u2i; 1317 ir->init_num_operands(); 1318 ir->operands[0] = bit_or(rshift(temp, c16), 1319 lshift(temp, c16->clone(ir, NULL))); 1320 } 1321 1322 this->progress = true; 1323 } 1324 1325 void 1326 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 1327 { 1328 /* For more details, see: 1329 * 1330 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1331 */ 1332 const unsigned elements = ir->operands[0]->type->vector_elements; 1333 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements); 1334 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1335 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1336 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1337 ir_variable *temp = 1338 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 1339 ir_variable *lsb_only = 1340 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 1341 ir_variable *as_float = 1342 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1343 ir_variable *lsb = 1344 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 1345 1346 ir_instruction &i = *base_ir; 1347 1348 i.insert_before(temp); 1349 1350 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1351 i.insert_before(assign(temp, ir->operands[0])); 1352 } else { 1353 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1354 i.insert_before(assign(temp, u2i(ir->operands[0]))); 1355 } 1356 1357 /* The int-to-float conversion is lossless because (value & -value) is 1358 * either a power of two or zero. We don't use the result in the zero 1359 * case. The uint() cast is necessary so that 0x80000000 does not 1360 * generate a negative value. 1361 * 1362 * uint lsb_only = uint(value & -value); 1363 * float as_float = float(lsb_only); 1364 */ 1365 i.insert_before(lsb_only); 1366 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 1367 1368 i.insert_before(as_float); 1369 i.insert_before(assign(as_float, u2f(lsb_only))); 1370 1371 /* This is basically an open-coded frexp. Implementations that have a 1372 * native frexp instruction would be better served by that. This is 1373 * optimized versus a full-featured open-coded implementation in two ways: 1374 * 1375 * - We don't care about a correct result from subnormal numbers (including 1376 * 0.0), so the raw exponent can always be safely unbiased. 1377 * 1378 * - The value cannot be negative, so it does not need to be masked off to 1379 * extract the exponent. 1380 * 1381 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1382 */ 1383 i.insert_before(lsb); 1384 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1385 1386 /* Use lsb_only in the comparison instead of temp so that the & (far above) 1387 * can possibly generate the result without an explicit comparison. 1388 * 1389 * (lsb_only == 0) ? 
-1 : lsb; 1390 * 1391 * Since our input values are all integers, the unbiased exponent must not 1392 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 1393 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 1394 * better is likely GPU dependent. Either way, the difference should be 1395 * small. 1396 */ 1397 ir->operation = ir_triop_csel; 1398 ir->init_num_operands(); 1399 ir->operands[0] = equal(lsb_only, c0); 1400 ir->operands[1] = cminus1; 1401 ir->operands[2] = new(ir) ir_dereference_variable(lsb); 1402 1403 this->progress = true; 1404 } 1405 1406 void 1407 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 1408 { 1409 /* For more details, see: 1410 * 1411 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1412 */ 1413 const unsigned elements = ir->operands[0]->type->vector_elements; 1414 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1415 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1416 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1417 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1418 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 1419 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 1420 ir_variable *temp = 1421 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 1422 ir_variable *as_float = 1423 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1424 ir_variable *msb = 1425 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 1426 1427 ir_instruction &i = *base_ir; 1428 1429 i.insert_before(temp); 1430 1431 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1432 i.insert_before(assign(temp, ir->operands[0])); 1433 } else { 1434 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1435 1436 /* findMSB(uint(abs(some_int))) almost always does the right thing. 1437 * There are two problem values: 1438 * 1439 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 1440 * 31. However, findMSB(int(0x80000000)) == 30. 1441 * 1442 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 1443 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1444 * 1445 * For a value of zero or negative one, -1 will be returned. 1446 * 1447 * For all negative number cases, including 0x80000000 and 0xffffffff, 1448 * the correct value is obtained from findMSB if instead of negating the 1449 * (already negative) value the logical-not is used. A conditonal 1450 * logical-not can be achieved in two instructions. 1451 */ 1452 ir_variable *as_int = 1453 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 1454 ir_constant *c31 = new(ir) ir_constant(int(31), elements); 1455 1456 i.insert_before(as_int); 1457 i.insert_before(assign(as_int, ir->operands[0])); 1458 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 1459 as_int, 1460 rshift(as_int, c31))))); 1461 } 1462 1463 /* The int-to-float conversion is lossless because bits are conditionally 1464 * masked off the bottom of temp to ensure the value has at most 24 bits of 1465 * data or is zero. We don't use the result in the zero case. The uint() 1466 * cast is necessary so that 0x80000000 does not generate a negative value. 1467 * 1468 * float as_float = float(temp > 255 ? 
temp & ~255 : temp); 1469 */ 1470 i.insert_before(as_float); 1471 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 1472 bit_and(temp, cFFFFFF00), 1473 temp)))); 1474 1475 /* This is basically an open-coded frexp. Implementations that have a 1476 * native frexp instruction would be better served by that. This is 1477 * optimized versus a full-featured open-coded implementation in two ways: 1478 * 1479 * - We don't care about a correct result from subnormal numbers (including 1480 * 0.0), so the raw exponent can always be safely unbiased. 1481 * 1482 * - The value cannot be negative, so it does not need to be masked off to 1483 * extract the exponent. 1484 * 1485 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1486 */ 1487 i.insert_before(msb); 1488 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1489 1490 /* Use msb in the comparison instead of temp so that the subtract can 1491 * possibly generate the result without an explicit comparison. 1492 * 1493 * (msb < 0) ? -1 : msb; 1494 * 1495 * Since our input values are all integers, the unbiased exponent must not 1496 * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 1497 */ 1498 ir->operation = ir_triop_csel; 1499 ir->init_num_operands(); 1500 ir->operands[0] = less(msb, c0); 1501 ir->operands[1] = cminus1; 1502 ir->operands[2] = new(ir) ir_dereference_variable(msb); 1503 1504 this->progress = true; 1505 } 1506 1507 ir_expression * 1508 lower_instructions_visitor::_carry(operand a, operand b) 1509 { 1510 if (lowering(CARRY_TO_ARITH)) 1511 return i2u(b2i(less(add(a, b), 1512 a.val->clone(ralloc_parent(a.val), NULL)))); 1513 else 1514 return carry(a, b); 1515 } 1516 1517 void 1518 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 1519 { 1520 /* ABCD 1521 * * EFGH 1522 * ====== 1523 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1524 * 1525 * In GLSL, (a * b) becomes 1526 * 1527 * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 1528 * uint m2 = (a & 0x0000ffffu) * (b >> 16); 1529 * uint m3 = (a >> 16) * (b & 0x0000ffffu); 1530 * uint m4 = (a >> 16) * (b >> 16); 1531 * 1532 * uint c1; 1533 * uint c2; 1534 * uint lo_result; 1535 * uint hi_result; 1536 * 1537 * lo_result = uaddCarry(m1, m2 << 16, c1); 1538 * hi_result = m4 + c1; 1539 * lo_result = uaddCarry(lo_result, m3 << 16, c2); 1540 * hi_result = hi_result + c2; 1541 * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 1542 */ 1543 const unsigned elements = ir->operands[0]->type->vector_elements; 1544 ir_variable *src1 = 1545 new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary); 1546 ir_variable *src1h = 1547 new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 1548 ir_variable *src1l = 1549 new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 1550 ir_variable *src2 = 1551 new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 1552 ir_variable *src2h = 1553 new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 1554 ir_variable *src2l = 1555 new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 1556 ir_variable *t1 = 1557 new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 1558 ir_variable *t2 = 1559 new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 1560 ir_variable *lo = 1561 new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 1562 ir_variable *hi = 1563 new(ir) ir_variable(glsl_type::uvec(elements), "hi", 
ir_var_temporary); 1564 ir_variable *different_signs = NULL; 1565 ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 1566 ir_constant *c16 = new(ir) ir_constant(16u, elements); 1567 1568 ir_instruction &i = *base_ir; 1569 1570 i.insert_before(src1); 1571 i.insert_before(src2); 1572 i.insert_before(src1h); 1573 i.insert_before(src2h); 1574 i.insert_before(src1l); 1575 i.insert_before(src2l); 1576 1577 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1578 i.insert_before(assign(src1, ir->operands[0])); 1579 i.insert_before(assign(src2, ir->operands[1])); 1580 } else { 1581 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1582 1583 ir_variable *itmp1 = 1584 new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 1585 ir_variable *itmp2 = 1586 new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 1587 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1588 1589 i.insert_before(itmp1); 1590 i.insert_before(itmp2); 1591 i.insert_before(assign(itmp1, ir->operands[0])); 1592 i.insert_before(assign(itmp2, ir->operands[1])); 1593 1594 different_signs = 1595 new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 1596 ir_var_temporary); 1597 1598 i.insert_before(different_signs); 1599 i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 1600 less(itmp1, c0), 1601 less(itmp2, c0->clone(ir, NULL))))); 1602 1603 i.insert_before(assign(src1, i2u(abs(itmp1)))); 1604 i.insert_before(assign(src2, i2u(abs(itmp2)))); 1605 } 1606 1607 i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 1608 i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 1609 i.insert_before(assign(src1h, rshift(src1, c16))); 1610 i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 1611 1612 i.insert_before(lo); 1613 i.insert_before(hi); 1614 i.insert_before(t1); 1615 i.insert_before(t2); 1616 1617 i.insert_before(assign(lo, mul(src1l, src2l))); 1618 i.insert_before(assign(t1, mul(src1l, src2h))); 1619 i.insert_before(assign(t2, mul(src1h, src2l))); 1620 i.insert_before(assign(hi, mul(src1h, src2h))); 1621 1622 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 1623 i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 1624 1625 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 1626 i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 1627 1628 if (different_signs == NULL) { 1629 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1630 1631 ir->operation = ir_binop_add; 1632 ir->init_num_operands(); 1633 ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 1634 ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 1635 } else { 1636 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1637 1638 i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 1639 rshift(t2, c16->clone(ir, NULL))))); 1640 1641 /* For channels where different_signs is set we have to perform a 64-bit 1642 * negation. This is *not* the same as just negating the high 32-bits. 1643 * Consider -3 * 2. The high 32-bits is 0, but the desired result is 1644 * -1, not -0! Recall -x == ~x + 1. 
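 *
 * Continuing that example: |-3| * |2| gives hi = 0 and lo = 6, and negating
 * the 64-bit value 0x0000000000000006 yields 0xFFFFFFFFFFFFFFFA, whose high
 * word is 0xFFFFFFFF (-1). The code below computes that high word as
 * ~hi + carry(~lo + 1) and then selects it, per channel, wherever
 * different_signs is set; lo itself never needs to be modified.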
1645 */ 1646 ir_variable *neg_hi = 1647 new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 1648 ir_constant *c1 = new(ir) ir_constant(1u, elements); 1649 1650 i.insert_before(neg_hi); 1651 i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 1652 u2i(_carry(bit_not(lo), c1))))); 1653 1654 ir->operation = ir_triop_csel; 1655 ir->init_num_operands(); 1656 ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 1657 ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 1658 ir->operands[2] = u2i(hi); 1659 } 1660 } 1661 1662 void 1663 lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 1664 { 1665 ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 1666 this->progress = true; 1667 } 1668 1669 ir_visitor_status 1670 lower_instructions_visitor::visit_leave(ir_expression *ir) 1671 { 1672 switch (ir->operation) { 1673 case ir_binop_dot: 1674 if (ir->operands[0]->type->is_double()) 1675 double_dot_to_fma(ir); 1676 break; 1677 case ir_triop_lrp: 1678 if (ir->operands[0]->type->is_double()) 1679 double_lrp(ir); 1680 break; 1681 case ir_binop_sub: 1682 if (lowering(SUB_TO_ADD_NEG)) 1683 sub_to_add_neg(ir); 1684 break; 1685 1686 case ir_binop_div: 1687 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP)) 1688 int_div_to_mul_rcp(ir); 1689 else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) || 1690 (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP))) 1691 div_to_mul_rcp(ir); 1692 break; 1693 1694 case ir_unop_exp: 1695 if (lowering(EXP_TO_EXP2)) 1696 exp_to_exp2(ir); 1697 break; 1698 1699 case ir_unop_log: 1700 if (lowering(LOG_TO_LOG2)) 1701 log_to_log2(ir); 1702 break; 1703 1704 case ir_binop_mod: 1705 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double())) 1706 mod_to_floor(ir); 1707 break; 1708 1709 case ir_binop_pow: 1710 if (lowering(POW_TO_EXP2)) 1711 pow_to_exp2(ir); 1712 break; 1713 1714 case ir_binop_ldexp: 1715 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 1716 ldexp_to_arith(ir); 1717 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 1718 dldexp_to_arith(ir); 1719 break; 1720 1721 case ir_unop_frexp_exp: 1722 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1723 dfrexp_exp_to_arith(ir); 1724 break; 1725 1726 case ir_unop_frexp_sig: 1727 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1728 dfrexp_sig_to_arith(ir); 1729 break; 1730 1731 case ir_binop_carry: 1732 if (lowering(CARRY_TO_ARITH)) 1733 carry_to_arith(ir); 1734 break; 1735 1736 case ir_binop_borrow: 1737 if (lowering(BORROW_TO_ARITH)) 1738 borrow_to_arith(ir); 1739 break; 1740 1741 case ir_unop_saturate: 1742 if (lowering(SAT_TO_CLAMP)) 1743 sat_to_clamp(ir); 1744 break; 1745 1746 case ir_unop_trunc: 1747 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1748 dtrunc_to_dfrac(ir); 1749 break; 1750 1751 case ir_unop_ceil: 1752 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1753 dceil_to_dfrac(ir); 1754 break; 1755 1756 case ir_unop_floor: 1757 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1758 dfloor_to_dfrac(ir); 1759 break; 1760 1761 case ir_unop_round_even: 1762 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1763 dround_even_to_dfrac(ir); 1764 break; 1765 1766 case ir_unop_sign: 1767 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1768 dsign_to_csel(ir); 1769 break; 1770 1771 case ir_unop_bit_count: 1772 if (lowering(BIT_COUNT_TO_MATH)) 1773 bit_count_to_math(ir); 1774 break; 
1775 1776 case ir_triop_bitfield_extract: 1777 if (lowering(EXTRACT_TO_SHIFTS)) 1778 extract_to_shifts(ir); 1779 break; 1780 1781 case ir_quadop_bitfield_insert: 1782 if (lowering(INSERT_TO_SHIFTS)) 1783 insert_to_shifts(ir); 1784 break; 1785 1786 case ir_unop_bitfield_reverse: 1787 if (lowering(REVERSE_TO_SHIFTS)) 1788 reverse_to_shifts(ir); 1789 break; 1790 1791 case ir_unop_find_lsb: 1792 if (lowering(FIND_LSB_TO_FLOAT_CAST)) 1793 find_lsb_to_float_cast(ir); 1794 break; 1795 1796 case ir_unop_find_msb: 1797 if (lowering(FIND_MSB_TO_FLOAT_CAST)) 1798 find_msb_to_float_cast(ir); 1799 break; 1800 1801 case ir_binop_imul_high: 1802 if (lowering(IMUL_HIGH_TO_MUL)) 1803 imul_high_to_mul(ir); 1804 break; 1805 1806 case ir_unop_rsq: 1807 case ir_unop_sqrt: 1808 if (lowering(SQRT_TO_ABS_SQRT)) 1809 sqrt_to_abs_sqrt(ir); 1810 break; 1811 1812 default: 1813 return visit_continue; 1814 } 1815 1816 return visit_continue; 1817 } 1818
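
/* Example of how a backend might invoke this pass (illustrative only: the
 * "shader->ir" expression list and the particular flag set are hypothetical,
 * and real callers pick whichever lowerings their hardware needs):
 *
 *    bool progress = lower_instructions(shader->ir,
 *                                       SUB_TO_ADD_NEG |
 *                                       FDIV_TO_MUL_RCP |
 *                                       EXP_TO_EXP2 |
 *                                       LOG_TO_LOG2 |
 *                                       SAT_TO_CLAMP);
 *
 * The return value reports whether anything was rewritten, so the call can
 * sit inside the usual "repeat until no pass makes progress" optimization
 * loop.
 */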