1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 18 19 #define _USE_MATH_DEFINES 20 #include <math.h> 21 #include <cmath> 22 #undef _USE_MATH_DEFINES 23 24 #include <string.h> 25 #include <algorithm> 26 #include <type_traits> 27 28 #include "philox_random.h" 29 #include "tensorflow/core/lib/bfloat16/bfloat16.h" 30 #include "unsupported/Eigen/CXX11/Tensor" 31 32 namespace tensorflow { 33 namespace random { 34 35 // Helper function to convert a 16-bit integer to a half between [0..1). 36 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x); 37 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1). 38 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x); 39 // Helper function to convert a 32-bit integer to a float between [0..1). 40 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x); 41 // Helper function to convert two 32-bit integers to a double between [0..1). 42 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1); 43 44 // Computes a + b. Requires that the result is representable in the destination 45 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b 46 // need *not* be representable in that type. (The condition on b excludes the 47 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot 48 // compute.) 49 template <typename Int> 50 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) { 51 // Implementation note: both b_div_2 and b - b_div_2 are positive and 52 // representatble as Int. 53 auto b_div_2 = b >> 1; 54 return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2); 55 } 56 57 // A class that generates uniform distribution random numbers from the 58 // underlying random integer generator. 59 // Arguments: 60 // Generator: a generator type that returns a number of uint32 upon each 61 // invocation. It needs to define kResultElementCount for the 62 // sample count for each invocation, and ResultType for the 63 // actual returned sample type. 64 // RealType: the data type of the real numbers that will be returned by the 65 // distribution. This could be either float or double for now. 66 // This class is meant to be implemented through specialization. The default 67 // is not defined by design. 68 template <class Generator, typename RealType> 69 class UniformDistribution; 70 71 template <class Generator> 72 class UniformDistribution<Generator, Eigen::half> { 73 public: 74 // The number of elements that will be returned. 75 static const int kResultElementCount = Generator::kResultElementCount; 76 // Cost of generation of a single element (in cycles). 77 static const int kElementCost = 3; 78 // Indicate that this distribution may take variable number of samples 79 // during the runtime. 80 static const bool kVariableSamplesPerOutput = false; 81 typedef Array<Eigen::half, kResultElementCount> ResultType; 82 typedef Eigen::half ResultElementType; 83 84 PHILOX_DEVICE_INLINE 85 ResultType operator()(Generator* gen) { 86 typename Generator::ResultType sample = (*gen)(); 87 ResultType result; 88 for (int i = 0; i < kResultElementCount; ++i) { 89 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits. 90 } 91 return result; 92 } 93 }; 94 95 template <class Generator> 96 class UniformDistribution<Generator, bfloat16> { 97 public: 98 // The number of elements that will be returned. 99 static const int kResultElementCount = Generator::kResultElementCount; 100 // Cost of generation of a single element (in cycles). 101 static const int kElementCost = 3; 102 // Indicate that this distribution may take variable number of samples 103 // during the runtime. 104 static const bool kVariableSamplesPerOutput = false; 105 typedef Array<bfloat16, kResultElementCount> ResultType; 106 typedef bfloat16 ResultElementType; 107 108 PHILOX_DEVICE_INLINE 109 ResultType operator()(Generator* gen) { 110 typename Generator::ResultType sample = (*gen)(); 111 ResultType result; 112 for (int i = 0; i < kResultElementCount; ++i) { 113 result[i] = Uint16ToGfloat16(sample[i]); 114 } 115 return result; 116 } 117 }; 118 119 template <class Generator> 120 class UniformDistribution<Generator, float> { 121 public: 122 // The number of elements that will be returned. 123 static const int kResultElementCount = Generator::kResultElementCount; 124 // Cost of generation of a single element (in cycles). 125 static const int kElementCost = 3; 126 // Indicate that this distribution may take variable number of samples 127 // during the runtime. 128 static const bool kVariableSamplesPerOutput = false; 129 typedef Array<float, kResultElementCount> ResultType; 130 typedef float ResultElementType; 131 132 PHILOX_DEVICE_INLINE 133 ResultType operator()(Generator* gen) { 134 typename Generator::ResultType sample = (*gen)(); 135 ResultType result; 136 for (int i = 0; i < kResultElementCount; ++i) { 137 result[i] = Uint32ToFloat(sample[i]); 138 } 139 return result; 140 } 141 }; 142 143 template <class Generator> 144 class UniformDistribution<Generator, double> { 145 public: 146 // The number of elements that will be returned. 147 static const int kResultElementCount = Generator::kResultElementCount / 2; 148 // Cost of generation of a single element (in cycles). 149 static const int kElementCost = 3; 150 // Indicate that this distribution may take variable number of samples 151 // during the runtime. 152 static const bool kVariableSamplesPerOutput = false; 153 typedef Array<double, kResultElementCount> ResultType; 154 typedef double ResultElementType; 155 156 PHILOX_DEVICE_INLINE 157 ResultType operator()(Generator* gen) { 158 typename Generator::ResultType sample = (*gen)(); 159 ResultType result; 160 for (int i = 0; i < kResultElementCount; ++i) { 161 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]); 162 } 163 return result; 164 } 165 }; 166 167 template <class Generator> 168 class UniformDistribution<Generator, int32> { 169 public: 170 // The number of elements that will be returned. 171 static const int kResultElementCount = Generator::kResultElementCount; 172 // Cost of generation of a single element (in cycles). 173 static const int kElementCost = 3; 174 // Indicate that this distribution may take variable number of samples 175 // during the runtime. 176 static const bool kVariableSamplesPerOutput = false; 177 typedef Array<int32, kResultElementCount> ResultType; 178 typedef int32 ResultElementType; 179 180 // Must have lo < hi 181 UniformDistribution(int32 lo, int32 hi) 182 : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {} 183 184 PHILOX_DEVICE_INLINE 185 ResultType operator()(Generator* gen) { 186 typename Generator::ResultType sample = (*gen)(); 187 ResultType result; 188 for (int i = 0; i < kResultElementCount; ++i) { 189 result[i] = SignedAdd(lo_, sample[i] % range_); 190 } 191 return result; 192 } 193 194 private: 195 // Note that lo_ is intentionally signed while range_ is intentionally 196 // unsigned. This is because hi - lo can overflow signed integers if 197 // lo < 0 < hi, but always fits in unsigned. 198 int32 lo_; 199 uint32 range_; 200 }; 201 202 template <class Generator> 203 class UniformDistribution<Generator, int64> { 204 public: 205 // The number of elements that will be returned. 206 static const int kResultElementCount = Generator::kResultElementCount / 2; 207 // Cost of generation of a single element (in cycles). 208 static const int kElementCost = 3; 209 // Indicate that this distribution may take variable number of samples 210 // during the runtime. 211 static const bool kVariableSamplesPerOutput = false; 212 typedef Array<int64, kResultElementCount> ResultType; 213 typedef int64 ResultElementType; 214 215 // Must have lo < hi 216 UniformDistribution(int64 lo, int64 hi) 217 : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {} 218 219 PHILOX_DEVICE_INLINE 220 ResultType operator()(Generator* gen) { 221 typename Generator::ResultType sample = (*gen)(); 222 ResultType result; 223 for (int i = 0; i < kResultElementCount; ++i) { 224 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32; 225 result[i] = SignedAdd(lo_, bits % range_); 226 } 227 return result; 228 } 229 230 private: 231 // Note that lo_ is intentionally signed while range_ is intentionally 232 // unsigned. This is because hi - lo can overflow signed integers if 233 // lo < 0 < hi, but always fits in unsigned. 234 int64 lo_; 235 uint64 range_; 236 }; 237 238 // A class that adapts the underlying native multiple samples to return a single 239 // sample at a time. 240 template <class Generator> 241 class SingleSampleAdapter { 242 public: 243 // The number of elements that will be returned. 244 static const int kResultElementCount = 1; 245 // The number of elements that will be returned by the underlying generator. 246 static const int kNativeElementCount = Generator::kResultElementCount; 247 typedef typename Generator::ResultElementType ResultType; 248 typedef typename Generator::ResultElementType ResultElementType; 249 250 PHILOX_DEVICE_INLINE 251 explicit SingleSampleAdapter(Generator* gen) 252 : generator_(gen), used_result_index_(Generator::kResultElementCount) {} 253 254 PHILOX_DEVICE_INLINE 255 ResultType operator()() { 256 if (used_result_index_ == Generator::kResultElementCount) { 257 unused_results_ = (*generator_)(); 258 used_result_index_ = 0; 259 } 260 261 return unused_results_[used_result_index_++]; 262 } 263 264 PHILOX_DEVICE_INLINE 265 void Skip(uint64 num_skips) { 266 if (!num_skips) { 267 return; 268 } 269 int num_unused_results = kNativeElementCount - used_result_index_; 270 if (num_skips <= num_unused_results) { 271 used_result_index_ += num_skips; 272 return; 273 } 274 num_skips -= num_unused_results; 275 used_result_index_ = kNativeElementCount; 276 SkipFromGenerator(num_skips / kNativeElementCount); 277 num_skips = num_skips % kNativeElementCount; 278 if (num_skips) { 279 unused_results_ = (*generator_)(); 280 used_result_index_ = num_skips; 281 } 282 } 283 284 private: 285 // This implementation iteratively skips over `num_skips` samples 286 // from `generator_`. There is an O(1) implementation for PhiloxRandom 287 // in random_distributions.cc. 288 PHILOX_DEVICE_INLINE 289 void SkipFromGenerator(uint64 num_skips) { 290 while (num_skips--) { 291 (*generator_)(); 292 } 293 } 294 295 Generator* generator_; 296 typename Generator::ResultType unused_results_; 297 int used_result_index_; 298 }; 299 300 // A class that generates unit normal distribution random numbers from the 301 // underlying random integer generator. 302 // Arguments: 303 // Generator: a generator type that returns a number of uint32 upon each 304 // each invocation. It needs to define kResultElementCount for the 305 // sample count for each invocation, and ResultType for actual 306 // returned sample type. 307 // RealType: the data type of the real numbers that will be returned by the 308 // distribution. This could be either float or double for now. 309 // This class is meant to be implemented through specialization. The default 310 // is not defined by design. 311 template <class Generator, typename RealType> 312 class NormalDistribution; 313 314 PHILOX_DEVICE_INLINE 315 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1); 316 317 PHILOX_DEVICE_INLINE 318 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1); 319 320 // Exactly like the float version, except that we convert to half afterwards; 321 // since we don't have half-precision sin/cos even on GPUs, there's nothing to 322 // gain from working in half internally. 323 template <class Generator> 324 class NormalDistribution<Generator, Eigen::half> { 325 public: 326 // The number of elements that will be returned. 327 static const int kResultElementCount = Generator::kResultElementCount; 328 // Cost of generation of a single element (in cycles). 329 static const int kElementCost = 70; 330 // Indicate that this distribution may take variable number of samples 331 // during the runtime. 332 static const bool kVariableSamplesPerOutput = false; 333 typedef Array<Eigen::half, kResultElementCount> ResultType; 334 typedef Eigen::half ResultElementType; 335 336 PHILOX_DEVICE_INLINE 337 ResultType operator()(Generator* gen) { 338 typename Generator::ResultType sample = (*gen)(); 339 ResultType result; 340 for (int i = 0; i < kResultElementCount; i += 2) { 341 float f[2]; 342 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]); 343 result[i] = Eigen::half(f[0]); 344 result[i + 1] = Eigen::half(f[1]); 345 } 346 return result; 347 } 348 }; 349 350 template <class Generator> 351 class NormalDistribution<Generator, bfloat16> { 352 public: 353 // The number of elements that will be returned. 354 static const int kResultElementCount = Generator::kResultElementCount; 355 // Cost of generation of a single element (in cycles). 356 static const int kElementCost = 70; 357 // Indicate that this distribution may take variable number of samples 358 // during the runtime. 359 static const bool kVariableSamplesPerOutput = false; 360 typedef Array<bfloat16, kResultElementCount> ResultType; 361 typedef bfloat16 ResultElementType; 362 363 PHILOX_DEVICE_INLINE 364 ResultType operator()(Generator* gen) { 365 typename Generator::ResultType sample = (*gen)(); 366 ResultType result; 367 static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number"); 368 for (int i = 0; i < kResultElementCount; i += 2) { 369 float f[2]; 370 // Box-Muller transform requires processing 2 elements at a time. 371 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]); 372 result[i] = bfloat16(f[0]); 373 result[i + 1] = bfloat16(f[1]); 374 } 375 return result; 376 } 377 }; 378 379 template <class Generator> 380 class NormalDistribution<Generator, float> { 381 public: 382 // The number of elements that will be returned. 383 static const int kResultElementCount = Generator::kResultElementCount; 384 // Cost of generation of a single element (in cycles). 385 static const int kElementCost = 70; 386 // Indicate that this distribution may take variable number of samples 387 // during the runtime. 388 static const bool kVariableSamplesPerOutput = false; 389 typedef Array<float, kResultElementCount> ResultType; 390 typedef float ResultElementType; 391 392 PHILOX_DEVICE_INLINE 393 ResultType operator()(Generator* gen) { 394 typename Generator::ResultType sample = (*gen)(); 395 ResultType result; 396 for (int i = 0; i < kResultElementCount; i += 2) { 397 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]); 398 } 399 return result; 400 } 401 }; 402 403 template <class Generator> 404 class NormalDistribution<Generator, double> { 405 public: 406 // The number of elements that will be returned. 407 static const int kResultElementCount = Generator::kResultElementCount / 2; 408 // Cost of generation of a single element (in cycles). 409 static const int kElementCost = 70; 410 // Indicate that this distribution may take variable number of samples 411 // during the runtime. 412 static const bool kVariableSamplesPerOutput = false; 413 typedef Array<double, kResultElementCount> ResultType; 414 typedef double ResultElementType; 415 416 PHILOX_DEVICE_INLINE 417 ResultType operator()(Generator* gen) { 418 typename Generator::ResultType sample = (*gen)(); 419 ResultType result; 420 for (int i = 0; i < kResultElementCount; i += 2) { 421 const int i2 = 2 * i; 422 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i], 423 &result[i + 1]); 424 } 425 return result; 426 } 427 }; 428 429 // A class that returns standard normal distribution between 430 // [-kTruncateValue, kTruncateValue]. 431 // Arguments: 432 // Generator: a generator type that returns a number of uint32 upon each 433 // each invocation. It needs to define kResultElementCount for the 434 // sample count for each invocation, and ResultType for actual 435 // returned sample type. 436 // RealType: the data type of the real numbers that will be returned by the 437 // distribution. This could be either float or double for now. 438 // This class is meant to be implemented through specialization. The default 439 // is not defined by design. 440 template <class SingleSampleGenerator, typename RealType> 441 class TruncatedNormalDistribution; 442 443 // Exactly like the float version, except that we convert to half afterwards; 444 // since we don't have half-precision sin/cos even on GPUs, there's nothing to 445 // gain from working in half internally. 446 template <class SingleSampleGenerator> 447 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> { 448 public: 449 // The number of elements that will be returned. 450 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount; 451 // Cost of generation of a single element (in cycles). 452 static const int kElementCost = 90; 453 // Indicate that this distribution may take variable number of samples 454 // during the runtime. 455 static const bool kVariableSamplesPerOutput = true; 456 // The threshold where the normal distribution is truncated. 457 const float kTruncateValue = 2.0f; 458 459 typedef Array<Eigen::half, kResultElementCount> ResultType; 460 typedef Eigen::half ResultElementType; 461 462 PHILOX_DEVICE_INLINE 463 ResultType operator()(SingleSampleGenerator* gen) { 464 ResultType results; 465 int index = 0; 466 while (true) { 467 // Repeatedly take samples from the normal distribution, until we have 468 // the desired number of elements that fall within the pre-defined cutoff 469 // threshold. 470 const uint32 x0 = (*gen)(); 471 const uint32 x1 = (*gen)(); 472 float f[2]; 473 BoxMullerFloat(x0, x1, &f[0], &f[1]); 474 475 for (int i = 0; i < 2; ++i) { 476 if (Eigen::numext::abs(f[i]) < kTruncateValue) { 477 results[index++] = Eigen::half(f[i]); 478 if (index >= kResultElementCount) { 479 return results; 480 } 481 } 482 } 483 } 484 } 485 }; 486 487 template <class SingleSampleGenerator> 488 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> { 489 public: 490 // The number of elements that will be returned. 491 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount; 492 // Cost of generation of a single element (in cycles). 493 static const int kElementCost = 90; 494 // Indicate that this distribution may take variable number of samples 495 // during the runtime. 496 static const bool kVariableSamplesPerOutput = true; 497 // The threshold where the normal distribution is truncated. 498 const float kTruncateValue = 2.0f; 499 500 typedef Array<bfloat16, kResultElementCount> ResultType; 501 typedef bfloat16 ResultElementType; 502 503 PHILOX_DEVICE_INLINE 504 ResultType operator()(SingleSampleGenerator* gen) { 505 ResultType results; 506 int index = 0; 507 while (true) { 508 // Repeatedly take samples from the normal distribution, until we have 509 // the desired number of elements that fall within the pre-defined cutoff 510 // threshold. 511 const uint32 x0 = (*gen)(); 512 const uint32 x1 = (*gen)(); 513 float f[2]; 514 BoxMullerFloat(x0, x1, &f[0], &f[1]); 515 516 for (int i = 0; i < 2; ++i) { 517 if (Eigen::numext::abs(f[i]) < kTruncateValue) { 518 results[index++] = bfloat16(f[i]); 519 if (index >= kResultElementCount) { 520 return results; 521 } 522 } 523 } 524 } 525 } 526 }; 527 528 // Partial specialization for float. 529 template <class SingleSampleGenerator> 530 class TruncatedNormalDistribution<SingleSampleGenerator, float> { 531 public: 532 // The number of elements that will be returned. 533 static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount; 534 // Cost of generation of a single element (in cycles). 535 static const int kElementCost = 90; 536 // Indicate that this distribution may take variable number of samples 537 // during the runtime. 538 static const bool kVariableSamplesPerOutput = true; 539 // The threshold where the normal distribution is truncated. 540 const float kTruncateValue = 2.0f; 541 542 typedef Array<float, kResultElementCount> ResultType; 543 typedef float ResultElementType; 544 545 PHILOX_DEVICE_INLINE 546 ResultType operator()(SingleSampleGenerator* gen) { 547 ResultType results; 548 int index = 0; 549 while (true) { 550 // Repeatedly take samples from the normal distribution, until we have 551 // the desired number of elements that fall within the pre-defined cutoff 552 // threshold. 553 const uint32 x0 = (*gen)(); 554 const uint32 x1 = (*gen)(); 555 float f[2]; 556 BoxMullerFloat(x0, x1, &f[0], &f[1]); 557 558 for (int i = 0; i < 2; ++i) { 559 if (Eigen::numext::abs(f[i]) < kTruncateValue) { 560 results[index++] = f[i]; 561 if (index >= kResultElementCount) { 562 return results; 563 } 564 } 565 } 566 } 567 } 568 }; 569 570 // Partial specialization for double. 571 template <class SingleSampleGenerator> 572 class TruncatedNormalDistribution<SingleSampleGenerator, double> { 573 public: 574 // The number of elements that will be returned. 575 static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) 576 ? SingleSampleGenerator::kNativeElementCount / 2 577 : 1; 578 // Cost of generation of a single element (in cycles). 579 static const int kElementCost = 90; 580 // Indicate that this distribution may take variable number of samples 581 // during the runtime. 582 static const bool kVariableSamplesPerOutput = true; 583 typedef Array<double, kResultElementCount> ResultType; 584 typedef double ResultElementType; 585 const double kTruncateValue = 2.0; 586 587 PHILOX_DEVICE_INLINE 588 ResultType operator()(SingleSampleGenerator* gen) { 589 ResultType results; 590 int index = 0; 591 while (1) { 592 const uint32 x0 = (*gen)(); 593 const uint32 x1 = (*gen)(); 594 const uint32 x2 = (*gen)(); 595 const uint32 x3 = (*gen)(); 596 double d[2]; 597 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]); 598 599 for (int i = 0; i < 2; ++i) { 600 if (Eigen::numext::abs(d[i]) < kTruncateValue) { 601 results[index++] = d[i]; 602 if (index >= kResultElementCount) { 603 return results; 604 } 605 } 606 } 607 } 608 } 609 }; 610 611 // Helper function to convert two 32-bit uniform integers to two floats 612 // under the unit normal distribution. 613 PHILOX_DEVICE_INLINE 614 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) { 615 // This function implements the Box-Muller transform: 616 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form 617 // Do not send a really small number to log(). 618 // We cannot mark "epsilon" as "static const" because NVCC would complain 619 const float epsilon = 1.0e-7f; 620 float u1 = Uint32ToFloat(x0); 621 if (u1 < epsilon) { 622 u1 = epsilon; 623 } 624 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1); 625 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1)); 626 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) 627 *f0 = Eigen::numext::sin(v1); 628 *f1 = Eigen::numext::cos(v1); 629 #else 630 sincosf(v1, f0, f1); 631 #endif 632 *f0 *= u2; 633 *f1 *= u2; 634 } 635 636 // Helper function to convert four 32-bit uniform integers to two doubles 637 // under the unit normal distribution. 638 PHILOX_DEVICE_INLINE 639 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) { 640 // This function implements the Box-Muller transform: 641 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form 642 // Do not send a really small number to log(). 643 // We cannot mark "epsilon" as "static const" because NVCC would complain 644 const double epsilon = 1.0e-7; 645 double u1 = Uint64ToDouble(x0, x1); 646 if (u1 < epsilon) { 647 u1 = epsilon; 648 } 649 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3); 650 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1)); 651 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) 652 *d0 = Eigen::numext::sin(v1); 653 *d1 = Eigen::numext::cos(v1); 654 #else 655 sincos(v1, d0, d1); 656 #endif 657 *d0 *= u2; 658 *d1 *= u2; 659 } 660 661 // Helper function to convert an 16-bit integer to a half between [0..1). 662 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) { 663 // IEEE754 halfs are formatted as follows (MSB first): 664 // sign(1) exponent(5) mantissa(10) 665 // Conceptually construct the following: 666 // sign == 0 667 // exponent == 15 -- an excess 15 representation of a zero exponent 668 // mantissa == 10 random bits 669 const uint16 man = x & 0x3ffu; // 10 bit mantissa 670 const uint16 exp = static_cast<uint16>(15); 671 const uint16 val = (exp << 10) | man; 672 673 Eigen::half result; 674 result.x = val; 675 return result - Eigen::half(1.0); 676 } 677 678 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1). 679 // This can create a uniform distribution of values between [0..1). 680 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) { 681 // bfloat are formatted as follows (MSB first): 682 // sign(1) exponent(8) mantissa(7) 683 // Conceptually construct the following: 684 // sign == 0 685 // exponent == 127 -- an excess 127 representation of a zero exponent 686 // mantissa == 7 random bits 687 const uint16 man = x & 0x7fu; // 7 bit mantissa 688 const uint16 exp = static_cast<uint16>(127); 689 const uint16 val = (exp << 7) | man; 690 691 bfloat16 result; 692 memcpy(&result, &val, sizeof(val)); 693 // The mantissa has an implicit leading 1, so the above code creates a value 694 // in [1, 2). The minus will not cause a rounding that makes the result 1. 695 // Instead it will just be close to 1. 696 return result - bfloat16(1.0); 697 } 698 699 // Helper function to convert an 32-bit integer to a float between [0..1). 700 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) { 701 // IEEE754 floats are formatted as follows (MSB first): 702 // sign(1) exponent(8) mantissa(23) 703 // Conceptually construct the following: 704 // sign == 0 705 // exponent == 127 -- an excess 127 representation of a zero exponent 706 // mantissa == 23 random bits 707 const uint32 man = x & 0x7fffffu; // 23 bit mantissa 708 const uint32 exp = static_cast<uint32>(127); 709 const uint32 val = (exp << 23) | man; 710 711 // Assumes that endian-ness is same for float and uint32. 712 float result; 713 memcpy(&result, &val, sizeof(val)); 714 return result - 1.0f; 715 } 716 717 // Helper function to convert two 32-bit integers to a double between [0..1). 718 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) { 719 // IEEE754 doubles are formatted as follows (MSB first): 720 // sign(1) exponent(11) mantissa(52) 721 // Conceptually construct the following: 722 // sign == 0 723 // exponent == 1023 -- an excess 1023 representation of a zero exponent 724 // mantissa == 52 random bits 725 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa 726 const uint32 mlo = x1; // lower 32 bits of mantissa 727 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa 728 const uint64 exp = static_cast<uint64>(1023); 729 const uint64 val = (exp << 52) | man; 730 // Assumes that endian-ness is same for double and uint64. 731 double result; 732 memcpy(&result, &val, sizeof(val)); 733 return result - 1.0; 734 } 735 736 } // namespace random 737 } // namespace tensorflow 738 739 #endif // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 740