1 // Copyright 2015, VIXL authors 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are met: 6 // 7 // * Redistributions of source code must retain the above copyright notice, 8 // this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above copyright notice, 10 // this list of conditions and the following disclaimer in the documentation 11 // and/or other materials provided with the distribution. 12 // * Neither the name of ARM Limited nor the names of its contributors may be 13 // used to endorse or promote products derived from this software without 14 // specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND 17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 27 #include <cstdio> 28 29 #include "utils-vixl.h" 30 31 namespace vixl { 32 33 // The default NaN values (for FPCR.DN=1). 34 const double kFP64DefaultNaN = RawbitsToDouble(UINT64_C(0x7ff8000000000000)); 35 const float kFP32DefaultNaN = RawbitsToFloat(0x7fc00000); 36 const Float16 kFP16DefaultNaN = RawbitsToFloat16(0x7e00); 37 38 // Floating-point zero values. 39 const Float16 kFP16PositiveZero = RawbitsToFloat16(0x0); 40 const Float16 kFP16NegativeZero = RawbitsToFloat16(0x8000); 41 42 // Floating-point infinity values. 43 const Float16 kFP16PositiveInfinity = RawbitsToFloat16(0x7c00); 44 const Float16 kFP16NegativeInfinity = RawbitsToFloat16(0xfc00); 45 const float kFP32PositiveInfinity = RawbitsToFloat(0x7f800000); 46 const float kFP32NegativeInfinity = RawbitsToFloat(0xff800000); 47 const double kFP64PositiveInfinity = 48 RawbitsToDouble(UINT64_C(0x7ff0000000000000)); 49 const double kFP64NegativeInfinity = 50 RawbitsToDouble(UINT64_C(0xfff0000000000000)); 51 52 bool IsZero(Float16 value) { 53 uint16_t bits = Float16ToRawbits(value); 54 return (bits == Float16ToRawbits(kFP16PositiveZero) || 55 bits == Float16ToRawbits(kFP16NegativeZero)); 56 } 57 58 uint16_t Float16ToRawbits(Float16 value) { return value.rawbits_; } 59 60 uint32_t FloatToRawbits(float value) { 61 uint32_t bits = 0; 62 memcpy(&bits, &value, 4); 63 return bits; 64 } 65 66 67 uint64_t DoubleToRawbits(double value) { 68 uint64_t bits = 0; 69 memcpy(&bits, &value, 8); 70 return bits; 71 } 72 73 74 Float16 RawbitsToFloat16(uint16_t bits) { 75 Float16 f; 76 f.rawbits_ = bits; 77 return f; 78 } 79 80 81 float RawbitsToFloat(uint32_t bits) { 82 float value = 0.0; 83 memcpy(&value, &bits, 4); 84 return value; 85 } 86 87 88 double RawbitsToDouble(uint64_t bits) { 89 double value = 0.0; 90 memcpy(&value, &bits, 8); 91 return value; 92 } 93 94 95 uint32_t Float16Sign(internal::SimFloat16 val) { 96 uint16_t rawbits = Float16ToRawbits(val); 97 return ExtractUnsignedBitfield32(15, 15, rawbits); 98 } 99 100 101 uint32_t Float16Exp(internal::SimFloat16 val) { 102 uint16_t rawbits = Float16ToRawbits(val); 103 return ExtractUnsignedBitfield32(14, 10, rawbits); 104 } 105 106 uint32_t Float16Mantissa(internal::SimFloat16 val) { 107 uint16_t rawbits = Float16ToRawbits(val); 108 return ExtractUnsignedBitfield32(9, 0, rawbits); 109 } 110 111 112 uint32_t FloatSign(float val) { 113 uint32_t rawbits = FloatToRawbits(val); 114 return ExtractUnsignedBitfield32(31, 31, rawbits); 115 } 116 117 118 uint32_t FloatExp(float val) { 119 uint32_t rawbits = FloatToRawbits(val); 120 return ExtractUnsignedBitfield32(30, 23, rawbits); 121 } 122 123 124 uint32_t FloatMantissa(float val) { 125 uint32_t rawbits = FloatToRawbits(val); 126 return ExtractUnsignedBitfield32(22, 0, rawbits); 127 } 128 129 130 uint32_t DoubleSign(double val) { 131 uint64_t rawbits = DoubleToRawbits(val); 132 return static_cast<uint32_t>(ExtractUnsignedBitfield64(63, 63, rawbits)); 133 } 134 135 136 uint32_t DoubleExp(double val) { 137 uint64_t rawbits = DoubleToRawbits(val); 138 return static_cast<uint32_t>(ExtractUnsignedBitfield64(62, 52, rawbits)); 139 } 140 141 142 uint64_t DoubleMantissa(double val) { 143 uint64_t rawbits = DoubleToRawbits(val); 144 return ExtractUnsignedBitfield64(51, 0, rawbits); 145 } 146 147 148 internal::SimFloat16 Float16Pack(uint16_t sign, 149 uint16_t exp, 150 uint16_t mantissa) { 151 uint16_t bits = (sign << 15) | (exp << 10) | mantissa; 152 return RawbitsToFloat16(bits); 153 } 154 155 156 float FloatPack(uint32_t sign, uint32_t exp, uint32_t mantissa) { 157 uint32_t bits = (sign << 31) | (exp << 23) | mantissa; 158 return RawbitsToFloat(bits); 159 } 160 161 162 double DoublePack(uint64_t sign, uint64_t exp, uint64_t mantissa) { 163 uint64_t bits = (sign << 63) | (exp << 52) | mantissa; 164 return RawbitsToDouble(bits); 165 } 166 167 168 int Float16Classify(Float16 value) { 169 uint16_t bits = Float16ToRawbits(value); 170 uint16_t exponent_max = (1 << 5) - 1; 171 uint16_t exponent_mask = exponent_max << 10; 172 uint16_t mantissa_mask = (1 << 10) - 1; 173 174 uint16_t exponent = (bits & exponent_mask) >> 10; 175 uint16_t mantissa = bits & mantissa_mask; 176 if (exponent == 0) { 177 if (mantissa == 0) { 178 return FP_ZERO; 179 } 180 return FP_SUBNORMAL; 181 } else if (exponent == exponent_max) { 182 if (mantissa == 0) { 183 return FP_INFINITE; 184 } 185 return FP_NAN; 186 } 187 return FP_NORMAL; 188 } 189 190 191 unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) { 192 VIXL_ASSERT((reg_size % 8) == 0); 193 int count = 0; 194 for (unsigned i = 0; i < (reg_size / 16); i++) { 195 if ((imm & 0xffff) == 0) { 196 count++; 197 } 198 imm >>= 16; 199 } 200 return count; 201 } 202 203 204 int BitCount(uint64_t value) { return CountSetBits(value); } 205 206 // Float16 definitions. 207 208 Float16::Float16(double dvalue) { 209 rawbits_ = 210 Float16ToRawbits(FPToFloat16(dvalue, FPTieEven, kIgnoreDefaultNaN)); 211 } 212 213 namespace internal { 214 215 SimFloat16 SimFloat16::operator-() const { 216 return RawbitsToFloat16(rawbits_ ^ 0x8000); 217 } 218 219 // SimFloat16 definitions. 220 SimFloat16 SimFloat16::operator+(SimFloat16 rhs) const { 221 return static_cast<double>(*this) + static_cast<double>(rhs); 222 } 223 224 SimFloat16 SimFloat16::operator-(SimFloat16 rhs) const { 225 return static_cast<double>(*this) - static_cast<double>(rhs); 226 } 227 228 SimFloat16 SimFloat16::operator*(SimFloat16 rhs) const { 229 return static_cast<double>(*this) * static_cast<double>(rhs); 230 } 231 232 SimFloat16 SimFloat16::operator/(SimFloat16 rhs) const { 233 return static_cast<double>(*this) / static_cast<double>(rhs); 234 } 235 236 bool SimFloat16::operator<(SimFloat16 rhs) const { 237 return static_cast<double>(*this) < static_cast<double>(rhs); 238 } 239 240 bool SimFloat16::operator>(SimFloat16 rhs) const { 241 return static_cast<double>(*this) > static_cast<double>(rhs); 242 } 243 244 bool SimFloat16::operator==(SimFloat16 rhs) const { 245 if (IsNaN(*this) || IsNaN(rhs)) { 246 return false; 247 } else if (IsZero(rhs) && IsZero(*this)) { 248 // +0 and -0 should be treated as equal. 249 return true; 250 } 251 return this->rawbits_ == rhs.rawbits_; 252 } 253 254 bool SimFloat16::operator!=(SimFloat16 rhs) const { return !(*this == rhs); } 255 256 bool SimFloat16::operator==(double rhs) const { 257 return static_cast<double>(*this) == static_cast<double>(rhs); 258 } 259 260 SimFloat16::operator double() const { 261 return FPToDouble(*this, kIgnoreDefaultNaN); 262 } 263 264 Int64 BitCount(Uint32 value) { return CountSetBits(value.Get()); } 265 266 } // namespace internal 267 268 float FPToFloat(Float16 value, UseDefaultNaN DN, bool* exception) { 269 uint16_t bits = Float16ToRawbits(value); 270 uint32_t sign = bits >> 15; 271 uint32_t exponent = 272 ExtractUnsignedBitfield32(kFloat16MantissaBits + kFloat16ExponentBits - 1, 273 kFloat16MantissaBits, 274 bits); 275 uint32_t mantissa = 276 ExtractUnsignedBitfield32(kFloat16MantissaBits - 1, 0, bits); 277 278 switch (Float16Classify(value)) { 279 case FP_ZERO: 280 return (sign == 0) ? 0.0f : -0.0f; 281 282 case FP_INFINITE: 283 return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity; 284 285 case FP_SUBNORMAL: { 286 // Calculate shift required to put mantissa into the most-significant bits 287 // of the destination mantissa. 288 int shift = CountLeadingZeros(mantissa << (32 - 10)); 289 290 // Shift mantissa and discard implicit '1'. 291 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1; 292 mantissa &= (1 << kFloatMantissaBits) - 1; 293 294 // Adjust the exponent for the shift applied, and rebias. 295 exponent = exponent - shift + (-15 + 127); 296 break; 297 } 298 299 case FP_NAN: 300 if (IsSignallingNaN(value)) { 301 if (exception != NULL) { 302 *exception = true; 303 } 304 } 305 if (DN == kUseDefaultNaN) return kFP32DefaultNaN; 306 307 // Convert NaNs as the processor would: 308 // - The sign is propagated. 309 // - The payload (mantissa) is transferred entirely, except that the top 310 // bit is forced to '1', making the result a quiet NaN. The unused 311 // (low-order) payload bits are set to 0. 312 exponent = (1 << kFloatExponentBits) - 1; 313 314 // Increase bits in mantissa, making low-order bits 0. 315 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits); 316 mantissa |= 1 << 22; // Force a quiet NaN. 317 break; 318 319 case FP_NORMAL: 320 // Increase bits in mantissa, making low-order bits 0. 321 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits); 322 323 // Change exponent bias. 324 exponent += (-15 + 127); 325 break; 326 327 default: 328 VIXL_UNREACHABLE(); 329 } 330 return RawbitsToFloat((sign << 31) | (exponent << kFloatMantissaBits) | 331 mantissa); 332 } 333 334 335 float FPToFloat(double value, 336 FPRounding round_mode, 337 UseDefaultNaN DN, 338 bool* exception) { 339 // Only the FPTieEven rounding mode is implemented. 340 VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd)); 341 USE(round_mode); 342 343 switch (std::fpclassify(value)) { 344 case FP_NAN: { 345 if (IsSignallingNaN(value)) { 346 if (exception != NULL) { 347 *exception = true; 348 } 349 } 350 if (DN == kUseDefaultNaN) return kFP32DefaultNaN; 351 352 // Convert NaNs as the processor would: 353 // - The sign is propagated. 354 // - The payload (mantissa) is transferred as much as possible, except 355 // that the top bit is forced to '1', making the result a quiet NaN. 356 uint64_t raw = DoubleToRawbits(value); 357 358 uint32_t sign = raw >> 63; 359 uint32_t exponent = (1 << 8) - 1; 360 uint32_t payload = 361 static_cast<uint32_t>(ExtractUnsignedBitfield64(50, 52 - 23, raw)); 362 payload |= (1 << 22); // Force a quiet NaN. 363 364 return RawbitsToFloat((sign << 31) | (exponent << 23) | payload); 365 } 366 367 case FP_ZERO: 368 case FP_INFINITE: { 369 // In a C++ cast, any value representable in the target type will be 370 // unchanged. This is always the case for +/-0.0 and infinities. 371 return static_cast<float>(value); 372 } 373 374 case FP_NORMAL: 375 case FP_SUBNORMAL: { 376 // Convert double-to-float as the processor would, assuming that FPCR.FZ 377 // (flush-to-zero) is not set. 378 uint64_t raw = DoubleToRawbits(value); 379 // Extract the IEEE-754 double components. 380 uint32_t sign = raw >> 63; 381 // Extract the exponent and remove the IEEE-754 encoding bias. 382 int32_t exponent = 383 static_cast<int32_t>(ExtractUnsignedBitfield64(62, 52, raw)) - 1023; 384 // Extract the mantissa and add the implicit '1' bit. 385 uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw); 386 if (std::fpclassify(value) == FP_NORMAL) { 387 mantissa |= (UINT64_C(1) << 52); 388 } 389 return FPRoundToFloat(sign, exponent, mantissa, round_mode); 390 } 391 } 392 393 VIXL_UNREACHABLE(); 394 return value; 395 } 396 397 // TODO: We should consider implementing a full FPToDouble(Float16) 398 // conversion function (for performance reasons). 399 double FPToDouble(Float16 value, UseDefaultNaN DN, bool* exception) { 400 // We can rely on implicit float to double conversion here. 401 return FPToFloat(value, DN, exception); 402 } 403 404 405 double FPToDouble(float value, UseDefaultNaN DN, bool* exception) { 406 switch (std::fpclassify(value)) { 407 case FP_NAN: { 408 if (IsSignallingNaN(value)) { 409 if (exception != NULL) { 410 *exception = true; 411 } 412 } 413 if (DN == kUseDefaultNaN) return kFP64DefaultNaN; 414 415 // Convert NaNs as the processor would: 416 // - The sign is propagated. 417 // - The payload (mantissa) is transferred entirely, except that the top 418 // bit is forced to '1', making the result a quiet NaN. The unused 419 // (low-order) payload bits are set to 0. 420 uint32_t raw = FloatToRawbits(value); 421 422 uint64_t sign = raw >> 31; 423 uint64_t exponent = (1 << 11) - 1; 424 uint64_t payload = ExtractUnsignedBitfield64(21, 0, raw); 425 payload <<= (52 - 23); // The unused low-order bits should be 0. 426 payload |= (UINT64_C(1) << 51); // Force a quiet NaN. 427 428 return RawbitsToDouble((sign << 63) | (exponent << 52) | payload); 429 } 430 431 case FP_ZERO: 432 case FP_NORMAL: 433 case FP_SUBNORMAL: 434 case FP_INFINITE: { 435 // All other inputs are preserved in a standard cast, because every value 436 // representable using an IEEE-754 float is also representable using an 437 // IEEE-754 double. 438 return static_cast<double>(value); 439 } 440 } 441 442 VIXL_UNREACHABLE(); 443 return static_cast<double>(value); 444 } 445 446 447 Float16 FPToFloat16(float value, 448 FPRounding round_mode, 449 UseDefaultNaN DN, 450 bool* exception) { 451 // Only the FPTieEven rounding mode is implemented. 452 VIXL_ASSERT(round_mode == FPTieEven); 453 USE(round_mode); 454 455 uint32_t raw = FloatToRawbits(value); 456 int32_t sign = raw >> 31; 457 int32_t exponent = ExtractUnsignedBitfield32(30, 23, raw) - 127; 458 uint32_t mantissa = ExtractUnsignedBitfield32(22, 0, raw); 459 460 switch (std::fpclassify(value)) { 461 case FP_NAN: { 462 if (IsSignallingNaN(value)) { 463 if (exception != NULL) { 464 *exception = true; 465 } 466 } 467 if (DN == kUseDefaultNaN) return kFP16DefaultNaN; 468 469 // Convert NaNs as the processor would: 470 // - The sign is propagated. 471 // - The payload (mantissa) is transferred as much as possible, except 472 // that the top bit is forced to '1', making the result a quiet NaN. 473 uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity) 474 : Float16ToRawbits(kFP16NegativeInfinity); 475 result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits); 476 result |= (1 << 9); // Force a quiet NaN; 477 return RawbitsToFloat16(result); 478 } 479 480 case FP_ZERO: 481 return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero; 482 483 case FP_INFINITE: 484 return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity; 485 486 case FP_NORMAL: 487 case FP_SUBNORMAL: { 488 // Convert float-to-half as the processor would, assuming that FPCR.FZ 489 // (flush-to-zero) is not set. 490 491 // Add the implicit '1' bit to the mantissa. 492 mantissa += (1 << 23); 493 return FPRoundToFloat16(sign, exponent, mantissa, round_mode); 494 } 495 } 496 497 VIXL_UNREACHABLE(); 498 return kFP16PositiveZero; 499 } 500 501 502 Float16 FPToFloat16(double value, 503 FPRounding round_mode, 504 UseDefaultNaN DN, 505 bool* exception) { 506 // Only the FPTieEven rounding mode is implemented. 507 VIXL_ASSERT(round_mode == FPTieEven); 508 USE(round_mode); 509 510 uint64_t raw = DoubleToRawbits(value); 511 int32_t sign = raw >> 63; 512 int64_t exponent = ExtractUnsignedBitfield64(62, 52, raw) - 1023; 513 uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw); 514 515 switch (std::fpclassify(value)) { 516 case FP_NAN: { 517 if (IsSignallingNaN(value)) { 518 if (exception != NULL) { 519 *exception = true; 520 } 521 } 522 if (DN == kUseDefaultNaN) return kFP16DefaultNaN; 523 524 // Convert NaNs as the processor would: 525 // - The sign is propagated. 526 // - The payload (mantissa) is transferred as much as possible, except 527 // that the top bit is forced to '1', making the result a quiet NaN. 528 uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity) 529 : Float16ToRawbits(kFP16NegativeInfinity); 530 result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits); 531 result |= (1 << 9); // Force a quiet NaN; 532 return RawbitsToFloat16(result); 533 } 534 535 case FP_ZERO: 536 return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero; 537 538 case FP_INFINITE: 539 return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity; 540 case FP_NORMAL: 541 case FP_SUBNORMAL: { 542 // Convert double-to-half as the processor would, assuming that FPCR.FZ 543 // (flush-to-zero) is not set. 544 545 // Add the implicit '1' bit to the mantissa. 546 mantissa += (UINT64_C(1) << 52); 547 return FPRoundToFloat16(sign, exponent, mantissa, round_mode); 548 } 549 } 550 551 VIXL_UNREACHABLE(); 552 return kFP16PositiveZero; 553 } 554 555 } // namespace vixl 556