1 /* 2 * Copyright 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include "nir.h" 26 #include "nir_builder.h" 27 #include "c99_math.h" 28 29 /* 30 * Lowers some unsupported double operations, using only: 31 * 32 * - pack/unpackDouble2x32 33 * - conversion to/from single-precision 34 * - double add, mul, and fma 35 * - conditional select 36 * - 32-bit integer and floating point arithmetic 37 */ 38 39 /* Creates a double with the exponent bits set to a given integer value */ 40 static nir_ssa_def * 41 set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp) 42 { 43 /* Split into bits 0-31 and 32-63 */ 44 nir_ssa_def *lo = nir_unpack_double_2x32_split_x(b, src); 45 nir_ssa_def *hi = nir_unpack_double_2x32_split_y(b, src); 46 47 /* The exponent is bits 52-62, or 20-30 of the high word, so set the exponent 48 * to 1023 49 */ 50 nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x7ff00000), exp, hi); 51 /* recombine */ 52 return nir_pack_double_2x32_split(b, lo, new_hi); 53 } 54 55 static nir_ssa_def * 56 get_exponent(nir_builder *b, nir_ssa_def *src) 57 { 58 /* get bits 32-63 */ 59 nir_ssa_def *hi = nir_unpack_double_2x32_split_y(b, src); 60 61 /* extract bits 20-30 of the high word */ 62 return nir_ubitfield_extract(b, hi, nir_imm_int(b, 20), nir_imm_int(b, 11)); 63 } 64 65 /* Return infinity with the sign of the given source which is +/-0 */ 66 67 static nir_ssa_def * 68 get_signed_inf(nir_builder *b, nir_ssa_def *zero) 69 { 70 nir_ssa_def *zero_hi = nir_unpack_double_2x32_split_y(b, zero); 71 72 /* The bit pattern for infinity is 0x7ff0000000000000, where the sign bit 73 * is the highest bit. Only the sign bit can be non-zero in the passed in 74 * source. So we essentially need to OR the infinity and the zero, except 75 * the low 32 bits are always 0 so we can construct the correct high 32 76 * bits and then pack it together with zero low 32 bits. 77 */ 78 nir_ssa_def *inf_hi = nir_ior(b, nir_imm_int(b, 0x7ff00000), zero_hi); 79 return nir_pack_double_2x32_split(b, nir_imm_int(b, 0), inf_hi); 80 } 81 82 /* 83 * Generates the correctly-signed infinity if the source was zero, and flushes 84 * the result to 0 if the source was infinity or the calculated exponent was 85 * too small to be representable. 86 */ 87 88 static nir_ssa_def * 89 fix_inv_result(nir_builder *b, nir_ssa_def *res, nir_ssa_def *src, 90 nir_ssa_def *exp) 91 { 92 /* If the exponent is too small or the original input was infinity/NaN, 93 * force the result to 0 (flush denorms) to avoid the work of handling 94 * denorms properly. Note that this doesn't preserve positive/negative 95 * zeros, but GLSL doesn't require it. 96 */ 97 res = nir_bcsel(b, nir_ior(b, nir_ige(b, nir_imm_int(b, 0), exp), 98 nir_feq(b, nir_fabs(b, src), 99 nir_imm_double(b, INFINITY))), 100 nir_imm_double(b, 0.0f), res); 101 102 /* If the original input was 0, generate the correctly-signed infinity */ 103 res = nir_bcsel(b, nir_fne(b, src, nir_imm_double(b, 0.0f)), 104 res, get_signed_inf(b, src)); 105 106 return res; 107 108 } 109 110 static nir_ssa_def * 111 lower_rcp(nir_builder *b, nir_ssa_def *src) 112 { 113 /* normalize the input to avoid range issues */ 114 nir_ssa_def *src_norm = set_exponent(b, src, nir_imm_int(b, 1023)); 115 116 /* cast to float, do an rcp, and then cast back to get an approximate 117 * result 118 */ 119 nir_ssa_def *ra = nir_f2d(b, nir_frcp(b, nir_d2f(b, src_norm))); 120 121 /* Fixup the exponent of the result - note that we check if this is too 122 * small below. 123 */ 124 nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), 125 nir_isub(b, get_exponent(b, src), 126 nir_imm_int(b, 1023))); 127 128 ra = set_exponent(b, ra, new_exp); 129 130 /* Do a few Newton-Raphson steps to improve precision. 131 * 132 * Each step doubles the precision, and we started off with around 24 bits, 133 * so we only need to do 2 steps to get to full precision. The step is: 134 * 135 * x_new = x * (2 - x*src) 136 * 137 * But we can re-arrange this to improve precision by using another fused 138 * multiply-add: 139 * 140 * x_new = x + x * (1 - x*src) 141 * 142 * See https://en.wikipedia.org/wiki/Division_algorithm for more details. 143 */ 144 145 ra = nir_ffma(b, ra, nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra); 146 ra = nir_ffma(b, ra, nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra); 147 148 return fix_inv_result(b, ra, src, new_exp); 149 } 150 151 static nir_ssa_def * 152 lower_sqrt_rsq(nir_builder *b, nir_ssa_def *src, bool sqrt) 153 { 154 /* We want to compute: 155 * 156 * 1/sqrt(m * 2^e) 157 * 158 * When the exponent is even, this is equivalent to: 159 * 160 * 1/sqrt(m) * 2^(-e/2) 161 * 162 * and then the exponent is odd, this is equal to: 163 * 164 * 1/sqrt(m * 2) * 2^(-(e - 1)/2) 165 * 166 * where the m * 2 is absorbed into the exponent. So we want the exponent 167 * inside the square root to be 1 if e is odd and 0 if e is even, and we 168 * want to subtract off e/2 from the final exponent, rounded to negative 169 * infinity. We can do the former by first computing the unbiased exponent, 170 * and then AND'ing it with 1 to get 0 or 1, and we can do the latter by 171 * shifting right by 1. 172 */ 173 174 nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src), 175 nir_imm_int(b, 1023)); 176 nir_ssa_def *even = nir_iand(b, unbiased_exp, nir_imm_int(b, 1)); 177 nir_ssa_def *half = nir_ishr(b, unbiased_exp, nir_imm_int(b, 1)); 178 179 nir_ssa_def *src_norm = set_exponent(b, src, 180 nir_iadd(b, nir_imm_int(b, 1023), 181 even)); 182 183 nir_ssa_def *ra = nir_f2d(b, nir_frsq(b, nir_d2f(b, src_norm))); 184 nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), half); 185 ra = set_exponent(b, ra, new_exp); 186 187 /* 188 * The following implements an iterative algorithm that's very similar 189 * between sqrt and rsqrt. We start with an iteration of Goldschmit's 190 * algorithm, which looks like: 191 * 192 * a = the source 193 * y_0 = initial (single-precision) rsqrt estimate 194 * 195 * h_0 = .5 * y_0 196 * g_0 = a * y_0 197 * r_0 = .5 - h_0 * g_0 198 * g_1 = g_0 * r_0 + g_0 199 * h_1 = h_0 * r_0 + h_0 200 * 201 * Now g_1 ~= sqrt(a), and h_1 ~= 1/(2 * sqrt(a)). We could continue 202 * applying another round of Goldschmit, but since we would never refer 203 * back to a (the original source), we would add too much rounding error. 204 * So instead, we do one last round of Newton-Raphson, which has better 205 * rounding characteristics, to get the final rounding correct. This is 206 * split into two cases: 207 * 208 * 1. sqrt 209 * 210 * Normally, doing a round of Newton-Raphson for sqrt involves taking a 211 * reciprocal of the original estimate, which is slow since it isn't 212 * supported in HW. But we can take advantage of the fact that we already 213 * computed a good estimate of 1/(2 * g_1) by rearranging it like so: 214 * 215 * g_2 = .5 * (g_1 + a / g_1) 216 * = g_1 + .5 * (a / g_1 - g_1) 217 * = g_1 + (.5 / g_1) * (a - g_1^2) 218 * = g_1 + h_1 * (a - g_1^2) 219 * 220 * The second term represents the error, and by splitting it out we can get 221 * better precision by computing it as part of a fused multiply-add. Since 222 * both Newton-Raphson and Goldschmit approximately double the precision of 223 * the result, these two steps should be enough. 224 * 225 * 2. rsqrt 226 * 227 * First off, note that the first round of the Goldschmit algorithm is 228 * really just a Newton-Raphson step in disguise: 229 * 230 * h_1 = h_0 * (.5 - h_0 * g_0) + h_0 231 * = h_0 * (1.5 - h_0 * g_0) 232 * = h_0 * (1.5 - .5 * a * y_0^2) 233 * = (.5 * y_0) * (1.5 - .5 * a * y_0^2) 234 * 235 * which is the standard formula multiplied by .5. Unlike in the sqrt case, 236 * we don't need the inverse to do a Newton-Raphson step; we just need h_1, 237 * so we can skip the calculation of g_1. Instead, we simply do another 238 * Newton-Raphson step: 239 * 240 * y_1 = 2 * h_1 241 * r_1 = .5 - h_1 * y_1 * a 242 * y_2 = y_1 * r_1 + y_1 243 * 244 * Where the difference from Goldschmit is that we calculate y_1 * a 245 * instead of using g_1. Doing it this way should be as fast as computing 246 * y_1 up front instead of h_1, and it lets us share the code for the 247 * initial Goldschmit step with the sqrt case. 248 * 249 * Putting it together, the computations are: 250 * 251 * h_0 = .5 * y_0 252 * g_0 = a * y_0 253 * r_0 = .5 - h_0 * g_0 254 * h_1 = h_0 * r_0 + h_0 255 * if sqrt: 256 * g_1 = g_0 * r_0 + g_0 257 * r_1 = a - g_1 * g_1 258 * g_2 = h_1 * r_1 + g_1 259 * else: 260 * y_1 = 2 * h_1 261 * r_1 = .5 - y_1 * (h_1 * a) 262 * y_2 = y_1 * r_1 + y_1 263 * 264 * For more on the ideas behind this, see "Software Division and Square 265 * Root Using Goldschmit's Algorithms" by Markstein and the Wikipedia page 266 * on square roots 267 * (https://en.wikipedia.org/wiki/Methods_of_computing_square_roots). 268 */ 269 270 nir_ssa_def *one_half = nir_imm_double(b, 0.5); 271 nir_ssa_def *h_0 = nir_fmul(b, one_half, ra); 272 nir_ssa_def *g_0 = nir_fmul(b, src, ra); 273 nir_ssa_def *r_0 = nir_ffma(b, nir_fneg(b, h_0), g_0, one_half); 274 nir_ssa_def *h_1 = nir_ffma(b, h_0, r_0, h_0); 275 nir_ssa_def *res; 276 if (sqrt) { 277 nir_ssa_def *g_1 = nir_ffma(b, g_0, r_0, g_0); 278 nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, g_1), g_1, src); 279 res = nir_ffma(b, h_1, r_1, g_1); 280 } else { 281 nir_ssa_def *y_1 = nir_fmul(b, nir_imm_double(b, 2.0), h_1); 282 nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, y_1), nir_fmul(b, h_1, src), 283 one_half); 284 res = nir_ffma(b, y_1, r_1, y_1); 285 } 286 287 if (sqrt) { 288 /* Here, the special cases we need to handle are 289 * 0 -> 0 and 290 * +inf -> +inf 291 */ 292 res = nir_bcsel(b, nir_ior(b, nir_feq(b, src, nir_imm_double(b, 0.0)), 293 nir_feq(b, src, nir_imm_double(b, INFINITY))), 294 src, res); 295 } else { 296 res = fix_inv_result(b, res, src, new_exp); 297 } 298 299 return res; 300 } 301 302 static nir_ssa_def * 303 lower_trunc(nir_builder *b, nir_ssa_def *src) 304 { 305 nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src), 306 nir_imm_int(b, 1023)); 307 308 nir_ssa_def *frac_bits = nir_isub(b, nir_imm_int(b, 52), unbiased_exp); 309 310 /* 311 * Decide the operation to apply depending on the unbiased exponent: 312 * 313 * if (unbiased_exp < 0) 314 * return 0 315 * else if (unbiased_exp > 52) 316 * return src 317 * else 318 * return src & (~0 << frac_bits) 319 * 320 * Notice that the else branch is a 64-bit integer operation that we need 321 * to implement in terms of 32-bit integer arithmetics (at least until we 322 * support 64-bit integer arithmetics). 323 */ 324 325 /* Compute "~0 << frac_bits" in terms of hi/lo 32-bit integer math */ 326 nir_ssa_def *mask_lo = 327 nir_bcsel(b, 328 nir_ige(b, frac_bits, nir_imm_int(b, 32)), 329 nir_imm_int(b, 0), 330 nir_ishl(b, nir_imm_int(b, ~0), frac_bits)); 331 332 nir_ssa_def *mask_hi = 333 nir_bcsel(b, 334 nir_ilt(b, frac_bits, nir_imm_int(b, 33)), 335 nir_imm_int(b, ~0), 336 nir_ishl(b, 337 nir_imm_int(b, ~0), 338 nir_isub(b, frac_bits, nir_imm_int(b, 32)))); 339 340 nir_ssa_def *src_lo = nir_unpack_double_2x32_split_x(b, src); 341 nir_ssa_def *src_hi = nir_unpack_double_2x32_split_y(b, src); 342 343 return 344 nir_bcsel(b, 345 nir_ilt(b, unbiased_exp, nir_imm_int(b, 0)), 346 nir_imm_double(b, 0.0), 347 nir_bcsel(b, nir_ige(b, unbiased_exp, nir_imm_int(b, 53)), 348 src, 349 nir_pack_double_2x32_split(b, 350 nir_iand(b, mask_lo, src_lo), 351 nir_iand(b, mask_hi, src_hi)))); 352 } 353 354 static nir_ssa_def * 355 lower_floor(nir_builder *b, nir_ssa_def *src) 356 { 357 /* 358 * For x >= 0, floor(x) = trunc(x) 359 * For x < 0, 360 * - if x is integer, floor(x) = x 361 * - otherwise, floor(x) = trunc(x) - 1 362 */ 363 nir_ssa_def *tr = nir_ftrunc(b, src); 364 nir_ssa_def *positive = nir_fge(b, src, nir_imm_double(b, 0.0)); 365 return nir_bcsel(b, 366 nir_ior(b, positive, nir_feq(b, src, tr)), 367 tr, 368 nir_fsub(b, tr, nir_imm_double(b, 1.0))); 369 } 370 371 static nir_ssa_def * 372 lower_ceil(nir_builder *b, nir_ssa_def *src) 373 { 374 /* if x < 0, ceil(x) = trunc(x) 375 * else if (x - trunc(x) == 0), ceil(x) = x 376 * else, ceil(x) = trunc(x) + 1 377 */ 378 nir_ssa_def *tr = nir_ftrunc(b, src); 379 nir_ssa_def *negative = nir_flt(b, src, nir_imm_double(b, 0.0)); 380 return nir_bcsel(b, 381 nir_ior(b, negative, nir_feq(b, src, tr)), 382 tr, 383 nir_fadd(b, tr, nir_imm_double(b, 1.0))); 384 } 385 386 static nir_ssa_def * 387 lower_fract(nir_builder *b, nir_ssa_def *src) 388 { 389 return nir_fsub(b, src, nir_ffloor(b, src)); 390 } 391 392 static nir_ssa_def * 393 lower_round_even(nir_builder *b, nir_ssa_def *src) 394 { 395 /* If fract(src) == 0.5, then we will have to decide the rounding direction. 396 * We will do this by computing the mod(abs(src), 2) and testing if it 397 * is < 1 or not. 398 * 399 * We compute mod(abs(src), 2) as: 400 * abs(src) - 2.0 * floor(abs(src) / 2.0) 401 */ 402 nir_ssa_def *two = nir_imm_double(b, 2.0); 403 nir_ssa_def *abs_src = nir_fabs(b, src); 404 nir_ssa_def *mod = 405 nir_fsub(b, 406 abs_src, 407 nir_fmul(b, 408 two, 409 nir_ffloor(b, 410 nir_fmul(b, 411 abs_src, 412 nir_imm_double(b, 0.5))))); 413 414 /* 415 * If fract(src) != 0.5, then we round as floor(src + 0.5) 416 * 417 * If fract(src) == 0.5, then we have to check the modulo: 418 * 419 * if it is < 1 we need a trunc operation so we get: 420 * 0.5 -> 0, -0.5 -> -0 421 * 2.5 -> 2, -2.5 -> -2 422 * 423 * otherwise we need to check if src >= 0, in which case we need to round 424 * upwards, or not, in which case we need to round downwards so we get: 425 * 1.5 -> 2, -1.5 -> -2 426 * 3.5 -> 4, -3.5 -> -4 427 */ 428 nir_ssa_def *fract = nir_ffract(b, src); 429 return nir_bcsel(b, 430 nir_fne(b, fract, nir_imm_double(b, 0.5)), 431 nir_ffloor(b, nir_fadd(b, src, nir_imm_double(b, 0.5))), 432 nir_bcsel(b, 433 nir_flt(b, mod, nir_imm_double(b, 1.0)), 434 nir_ftrunc(b, src), 435 nir_bcsel(b, 436 nir_fge(b, src, nir_imm_double(b, 0.0)), 437 nir_fadd(b, src, nir_imm_double(b, 0.5)), 438 nir_fsub(b, src, nir_imm_double(b, 0.5))))); 439 } 440 441 static nir_ssa_def * 442 lower_mod(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1) 443 { 444 /* mod(x,y) = x - y * floor(x/y) 445 * 446 * If the division is lowered, it could add some rounding errors that make 447 * floor() to return the quotient minus one when x = N * y. If this is the 448 * case, we return zero because mod(x, y) output value is [0, y). 449 */ 450 nir_ssa_def *floor = nir_ffloor(b, nir_fdiv(b, src0, src1)); 451 nir_ssa_def *mod = nir_fsub(b, src0, nir_fmul(b, src1, floor)); 452 453 return nir_bcsel(b, 454 nir_fne(b, mod, src1), 455 mod, 456 nir_imm_double(b, 0.0)); 457 } 458 459 static void 460 lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options) 461 { 462 assert(instr->dest.dest.is_ssa); 463 if (instr->dest.dest.ssa.bit_size != 64) 464 return; 465 466 switch (instr->op) { 467 case nir_op_frcp: 468 if (!(options & nir_lower_drcp)) 469 return; 470 break; 471 472 case nir_op_fsqrt: 473 if (!(options & nir_lower_dsqrt)) 474 return; 475 break; 476 477 case nir_op_frsq: 478 if (!(options & nir_lower_drsq)) 479 return; 480 break; 481 482 case nir_op_ftrunc: 483 if (!(options & nir_lower_dtrunc)) 484 return; 485 break; 486 487 case nir_op_ffloor: 488 if (!(options & nir_lower_dfloor)) 489 return; 490 break; 491 492 case nir_op_fceil: 493 if (!(options & nir_lower_dceil)) 494 return; 495 break; 496 497 case nir_op_ffract: 498 if (!(options & nir_lower_dfract)) 499 return; 500 break; 501 502 case nir_op_fround_even: 503 if (!(options & nir_lower_dround_even)) 504 return; 505 break; 506 507 case nir_op_fmod: 508 if (!(options & nir_lower_dmod)) 509 return; 510 break; 511 512 default: 513 return; 514 } 515 516 nir_builder bld; 517 nir_builder_init(&bld, nir_cf_node_get_function(&instr->instr.block->cf_node)); 518 bld.cursor = nir_before_instr(&instr->instr); 519 520 nir_ssa_def *src = nir_fmov_alu(&bld, instr->src[0], 521 instr->dest.dest.ssa.num_components); 522 523 nir_ssa_def *result; 524 525 switch (instr->op) { 526 case nir_op_frcp: 527 result = lower_rcp(&bld, src); 528 break; 529 case nir_op_fsqrt: 530 result = lower_sqrt_rsq(&bld, src, true); 531 break; 532 case nir_op_frsq: 533 result = lower_sqrt_rsq(&bld, src, false); 534 break; 535 case nir_op_ftrunc: 536 result = lower_trunc(&bld, src); 537 break; 538 case nir_op_ffloor: 539 result = lower_floor(&bld, src); 540 break; 541 case nir_op_fceil: 542 result = lower_ceil(&bld, src); 543 break; 544 case nir_op_ffract: 545 result = lower_fract(&bld, src); 546 break; 547 case nir_op_fround_even: 548 result = lower_round_even(&bld, src); 549 break; 550 551 case nir_op_fmod: { 552 nir_ssa_def *src1 = nir_fmov_alu(&bld, instr->src[1], 553 instr->dest.dest.ssa.num_components); 554 result = lower_mod(&bld, src, src1); 555 } 556 break; 557 default: 558 unreachable("unhandled opcode"); 559 } 560 561 nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(result)); 562 nir_instr_remove(&instr->instr); 563 } 564 565 void 566 nir_lower_doubles(nir_shader *shader, nir_lower_doubles_options options) 567 { 568 nir_foreach_function(function, shader) { 569 if (!function->impl) 570 continue; 571 572 nir_foreach_block(block, function->impl) { 573 nir_foreach_instr_safe(instr, block) { 574 if (instr->type == nir_instr_type_alu) 575 lower_doubles_instr(nir_instr_as_alu(instr), options); 576 } 577 } 578 } 579 } 580