1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 17 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 18 19 #define _USE_MATH_DEFINES 20 #include <math.h> 21 #include <cmath> 22 #undef _USE_MATH_DEFINES 23 24 #include <string.h> 25 #include <algorithm> 26 27 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 28 #include "tensorflow/core/lib/random/philox_random.h" 29 30 namespace tensorflow { 31 namespace random { 32 33 // Helper function to convert a 16-bit integer to a half between [0..1). 34 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x); 35 // Helper function to convert a 32-bit integer to a float between [0..1). 36 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x); 37 // Helper function to convert two 32-bit integers to a double between [0..1). 38 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1); 39 40 // A class that generates uniform distribution random numbers from the 41 // underlying random integer generator. 42 // Arguments: 43 // Generator: a generator type that returns a number of uint32 upon each 44 // invocation. It needs to define kResultElementCount for the 45 // sample count for each invocation, and ResultType for the 46 // actual returned sample type. 47 // RealType: the data type of the real numbers that will be returned by the 48 // distribution. This could be either float or double for now. 49 // This class is meant to be implemented through specialization. The default 50 // is not defined by design. 51 template <class Generator, typename RealType> 52 class UniformDistribution; 53 54 template <class Generator> 55 class UniformDistribution<Generator, Eigen::half> { 56 public: 57 // The number of elements that will be returned. 58 static const int kResultElementCount = Generator::kResultElementCount; 59 // Cost of generation of a single element (in cycles). 60 static const int kElementCost = 3; 61 // Indicate that this distribution may take variable number of samples 62 // during the runtime. 63 static const bool kVariableSamplesPerOutput = false; 64 typedef Array<Eigen::half, kResultElementCount> ResultType; 65 typedef Eigen::half ResultElementType; 66 67 PHILOX_DEVICE_INLINE 68 ResultType operator()(Generator* gen) { 69 typename Generator::ResultType sample = (*gen)(); 70 ResultType result; 71 for (int i = 0; i < kResultElementCount; ++i) { 72 result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits. 73 } 74 return result; 75 } 76 }; 77 78 template <class Generator> 79 class UniformDistribution<Generator, float> { 80 public: 81 // The number of elements that will be returned. 82 static const int kResultElementCount = Generator::kResultElementCount; 83 // Cost of generation of a single element (in cycles). 84 static const int kElementCost = 3; 85 // Indicate that this distribution may take variable number of samples 86 // during the runtime. 87 static const bool kVariableSamplesPerOutput = false; 88 typedef Array<float, kResultElementCount> ResultType; 89 typedef float ResultElementType; 90 91 PHILOX_DEVICE_INLINE 92 ResultType operator()(Generator* gen) { 93 typename Generator::ResultType sample = (*gen)(); 94 ResultType result; 95 for (int i = 0; i < kResultElementCount; ++i) { 96 result[i] = Uint32ToFloat(sample[i]); 97 } 98 return result; 99 } 100 }; 101 102 template <class Generator> 103 class UniformDistribution<Generator, double> { 104 public: 105 // The number of elements that will be returned. 106 static const int kResultElementCount = Generator::kResultElementCount / 2; 107 // Cost of generation of a single element (in cycles). 108 static const int kElementCost = 3; 109 // Indicate that this distribution may take variable number of samples 110 // during the runtime. 111 static const bool kVariableSamplesPerOutput = false; 112 typedef Array<double, kResultElementCount> ResultType; 113 typedef double ResultElementType; 114 115 PHILOX_DEVICE_INLINE 116 ResultType operator()(Generator* gen) { 117 typename Generator::ResultType sample = (*gen)(); 118 ResultType result; 119 for (int i = 0; i < kResultElementCount; ++i) { 120 result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]); 121 } 122 return result; 123 } 124 }; 125 126 template <class Generator> 127 class UniformDistribution<Generator, int32> { 128 public: 129 // The number of elements that will be returned. 130 static const int kResultElementCount = Generator::kResultElementCount; 131 // Cost of generation of a single element (in cycles). 132 static const int kElementCost = 3; 133 // Indicate that this distribution may take variable number of samples 134 // during the runtime. 135 static const bool kVariableSamplesPerOutput = false; 136 typedef Array<int32, kResultElementCount> ResultType; 137 typedef int32 ResultElementType; 138 139 // Must have lo < hi 140 UniformDistribution(int32 lo, int32 hi) : lo_(lo), range_(hi - lo) {} 141 142 PHILOX_DEVICE_INLINE 143 ResultType operator()(Generator* gen) { 144 typename Generator::ResultType sample = (*gen)(); 145 ResultType result; 146 for (int i = 0; i < kResultElementCount; ++i) { 147 result[i] = lo_ + static_cast<int32>(sample[i] % range_); 148 } 149 return result; 150 } 151 152 private: 153 // Note that lo_ is intentionally signed while range_ is intentionally 154 // unsigned. This is because hi - lo can overflow signed integers if 155 // lo < 0 < hi, but always fits in unsigned. 156 int32 lo_; 157 uint32 range_; 158 }; 159 160 template <class Generator> 161 class UniformDistribution<Generator, int64> { 162 public: 163 // The number of elements that will be returned. 164 static const int kResultElementCount = Generator::kResultElementCount / 2; 165 // Cost of generation of a single element (in cycles). 166 static const int kElementCost = 3; 167 // Indicate that this distribution may take variable number of samples 168 // during the runtime. 169 static const bool kVariableSamplesPerOutput = false; 170 typedef Array<int64, kResultElementCount> ResultType; 171 typedef int64 ResultElementType; 172 173 // Must have lo < hi 174 UniformDistribution(int64 lo, int64 hi) : lo_(lo), range_(hi - lo) {} 175 176 PHILOX_DEVICE_INLINE 177 ResultType operator()(Generator* gen) { 178 typename Generator::ResultType sample = (*gen)(); 179 ResultType result; 180 for (int i = 0; i < kResultElementCount; ++i) { 181 auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32; 182 result[i] = lo_ + static_cast<int64>(bits % range_); 183 } 184 return result; 185 } 186 187 private: 188 // Note that lo_ is intentionally signed while range_ is intentionally 189 // unsigned. This is because hi - lo can overflow signed integers if 190 // lo < 0 < hi, but always fits in unsigned. 191 int64 lo_; 192 uint64 range_; 193 }; 194 195 // A class that adapts the underlying native multiple samples to return a single 196 // sample at a time. 197 template <class Generator> 198 class SingleSampleAdapter { 199 public: 200 // The number of elements that will be returned. 201 static const int kResultElementCount = 1; 202 // The number of elements that will be returned by the underlying generator. 203 static const int kNativeElementCount = Generator::kResultElementCount; 204 typedef typename Generator::ResultElementType ResultType; 205 typedef typename Generator::ResultElementType ResultElementType; 206 207 PHILOX_DEVICE_INLINE 208 explicit SingleSampleAdapter(Generator* gen) 209 : generator_(gen), used_result_index_(Generator::kResultElementCount) {} 210 211 PHILOX_DEVICE_INLINE 212 ResultType operator()() { 213 if (used_result_index_ == Generator::kResultElementCount) { 214 unused_results_ = (*generator_)(); 215 used_result_index_ = 0; 216 } 217 218 return unused_results_[used_result_index_++]; 219 } 220 221 PHILOX_DEVICE_INLINE 222 void Skip(uint64 num_skips) { 223 if (!num_skips) { 224 return; 225 } 226 int num_unused_results = kNativeElementCount - used_result_index_; 227 if (num_skips <= num_unused_results) { 228 used_result_index_ += num_skips; 229 return; 230 } 231 num_skips -= num_unused_results; 232 used_result_index_ = kNativeElementCount; 233 SkipFromGenerator(num_skips / kNativeElementCount); 234 num_skips = num_skips % kNativeElementCount; 235 if (num_skips) { 236 unused_results_ = (*generator_)(); 237 used_result_index_ = num_skips; 238 } 239 } 240 241 private: 242 // This implementation iteratively skips over `num_skips` samples 243 // from `generator_`. There is an O(1) implementation for PhiloxRandom 244 // in random_distributions.cc. 245 PHILOX_DEVICE_INLINE 246 void SkipFromGenerator(uint64 num_skips) { 247 while (num_skips--) { 248 (*generator_)(); 249 } 250 } 251 252 Generator* generator_; 253 typename Generator::ResultType unused_results_; 254 int used_result_index_; 255 }; 256 257 // A class that generates unit normal distribution random numbers from the 258 // underlying random integer generator. 259 // Arguments: 260 // Generator: a generator type that returns a number of uint32 upon each 261 // each invocation. It needs to define kResultElementCount for the 262 // sample count for each invocation, and ResultType for actual 263 // returned sample type. 264 // RealType: the data type of the real numbers that will be returned by the 265 // distribution. This could be either float or double for now. 266 // This class is meant to be implemented through specialization. The default 267 // is not defined by design. 268 template <class Generator, typename RealType> 269 class NormalDistribution; 270 271 PHILOX_DEVICE_INLINE 272 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1); 273 274 PHILOX_DEVICE_INLINE 275 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, 276 double* d1); 277 278 // Exactly like the float version, except that we convert to half afterwards; 279 // since we don't have half-precision sin/cos even on GPUs, there's nothing to 280 // gain from working in half internally. 281 template <class Generator> 282 class NormalDistribution<Generator, Eigen::half> { 283 public: 284 // The number of elements that will be returned. 285 static const int kResultElementCount = Generator::kResultElementCount; 286 // Cost of generation of a single element (in cycles). 287 static const int kElementCost = 70; 288 // Indicate that this distribution may take variable number of samples 289 // during the runtime. 290 static const bool kVariableSamplesPerOutput = false; 291 typedef Array<Eigen::half, kResultElementCount> ResultType; 292 typedef Eigen::half ResultElementType; 293 294 PHILOX_DEVICE_INLINE 295 ResultType operator()(Generator* gen) { 296 typename Generator::ResultType sample = (*gen)(); 297 ResultType result; 298 for (int i = 0; i < kResultElementCount; i += 2) { 299 float f[2]; 300 BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]); 301 result[i] = Eigen::half(f[0]); 302 result[i + 1] = Eigen::half(f[1]); 303 } 304 return result; 305 } 306 }; 307 308 template <class Generator> 309 class NormalDistribution<Generator, float> { 310 public: 311 // The number of elements that will be returned. 312 static const int kResultElementCount = Generator::kResultElementCount; 313 // Cost of generation of a single element (in cycles). 314 static const int kElementCost = 70; 315 // Indicate that this distribution may take variable number of samples 316 // during the runtime. 317 static const bool kVariableSamplesPerOutput = false; 318 typedef Array<float, kResultElementCount> ResultType; 319 typedef float ResultElementType; 320 321 PHILOX_DEVICE_INLINE 322 ResultType operator()(Generator* gen) { 323 typename Generator::ResultType sample = (*gen)(); 324 ResultType result; 325 for (int i = 0; i < kResultElementCount; i += 2) { 326 BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]); 327 } 328 return result; 329 } 330 }; 331 332 template <class Generator> 333 class NormalDistribution<Generator, double> { 334 public: 335 // The number of elements that will be returned. 336 static const int kResultElementCount = Generator::kResultElementCount / 2; 337 // Cost of generation of a single element (in cycles). 338 static const int kElementCost = 70; 339 // Indicate that this distribution may take variable number of samples 340 // during the runtime. 341 static const bool kVariableSamplesPerOutput = false; 342 typedef Array<double, kResultElementCount> ResultType; 343 typedef double ResultElementType; 344 345 PHILOX_DEVICE_INLINE 346 ResultType operator()(Generator* gen) { 347 typename Generator::ResultType sample = (*gen)(); 348 ResultType result; 349 for (int i = 0; i < kResultElementCount; i += 2) { 350 const int i2 = 2 * i; 351 BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], 352 sample[i2 + 3], &result[i], &result[i + 1]); 353 } 354 return result; 355 } 356 }; 357 358 // A class that returns standard normal distribution between 359 // [-kTruncateValue, kTruncateValue]. 360 // Arguments: 361 // Generator: a generator type that returns a number of uint32 upon each 362 // each invocation. It needs to define kResultElementCount for the 363 // sample count for each invocation, and ResultType for actual 364 // returned sample type. 365 // RealType: the data type of the real numbers that will be returned by the 366 // distribution. This could be either float or double for now. 367 // This class is meant to be implemented through specialization. The default 368 // is not defined by design. 369 template <class SingleSampleGenerator, typename RealType> 370 class TruncatedNormalDistribution; 371 372 // Exactly like the float version, except that we convert to half afterwards; 373 // since we don't have half-precision sin/cos even on GPUs, there's nothing to 374 // gain from working in half internally. 375 template <class SingleSampleGenerator> 376 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> { 377 public: 378 // The number of elements that will be returned. 379 static const int kResultElementCount = 380 SingleSampleGenerator::kNativeElementCount; 381 // Cost of generation of a single element (in cycles). 382 static const int kElementCost = 90; 383 // Indicate that this distribution may take variable number of samples 384 // during the runtime. 385 static const bool kVariableSamplesPerOutput = true; 386 // The threshold where the normal distribution is truncated. 387 const float kTruncateValue = 2.0f; 388 389 typedef Array<Eigen::half, kResultElementCount> ResultType; 390 typedef Eigen::half ResultElementType; 391 392 PHILOX_DEVICE_INLINE 393 ResultType operator()(SingleSampleGenerator* gen) { 394 ResultType results; 395 int index = 0; 396 while (true) { 397 // Repeatedly take samples from the normal distribution, until we have 398 // the desired number of elements that fall within the pre-defined cutoff 399 // threshold. 400 const uint32 x0 = (*gen)(); 401 const uint32 x1 = (*gen)(); 402 float f[2]; 403 BoxMullerFloat(x0, x1, &f[0], &f[1]); 404 405 for (int i = 0; i < 2; ++i) { 406 if (Eigen::numext::abs(f[i]) < kTruncateValue) { 407 results[index++] = Eigen::half(f[i]); 408 if (index >= kResultElementCount) { 409 return results; 410 } 411 } 412 } 413 } 414 } 415 }; 416 417 // Partial specialization for float. 418 template <class SingleSampleGenerator> 419 class TruncatedNormalDistribution<SingleSampleGenerator, float> { 420 public: 421 // The number of elements that will be returned. 422 static const int kResultElementCount = 423 SingleSampleGenerator::kNativeElementCount; 424 // Cost of generation of a single element (in cycles). 425 static const int kElementCost = 90; 426 // Indicate that this distribution may take variable number of samples 427 // during the runtime. 428 static const bool kVariableSamplesPerOutput = true; 429 // The threshold where the normal distribution is truncated. 430 const float kTruncateValue = 2.0f; 431 432 typedef Array<float, kResultElementCount> ResultType; 433 typedef float ResultElementType; 434 435 PHILOX_DEVICE_INLINE 436 ResultType operator()(SingleSampleGenerator* gen) { 437 ResultType results; 438 int index = 0; 439 while (true) { 440 // Repeatedly take samples from the normal distribution, until we have 441 // the desired number of elements that fall within the pre-defined cutoff 442 // threshold. 443 const uint32 x0 = (*gen)(); 444 const uint32 x1 = (*gen)(); 445 float f[2]; 446 BoxMullerFloat(x0, x1, &f[0], &f[1]); 447 448 for (int i = 0; i < 2; ++i) { 449 if (Eigen::numext::abs(f[i]) < kTruncateValue) { 450 results[index++] = f[i]; 451 if (index >= kResultElementCount) { 452 return results; 453 } 454 } 455 } 456 } 457 } 458 }; 459 460 // Partial specialization for double. 461 template <class SingleSampleGenerator> 462 class TruncatedNormalDistribution<SingleSampleGenerator, double> { 463 public: 464 // The number of elements that will be returned. 465 static const int kResultElementCount = 466 (SingleSampleGenerator::kNativeElementCount > 1) 467 ? SingleSampleGenerator::kNativeElementCount / 2 468 : 1; 469 // Cost of generation of a single element (in cycles). 470 static const int kElementCost = 90; 471 // Indicate that this distribution may take variable number of samples 472 // during the runtime. 473 static const bool kVariableSamplesPerOutput = true; 474 typedef Array<double, kResultElementCount> ResultType; 475 typedef double ResultElementType; 476 const double kTruncateValue = 2.0; 477 478 PHILOX_DEVICE_INLINE 479 ResultType operator()(SingleSampleGenerator* gen) { 480 ResultType results; 481 int index = 0; 482 while (1) { 483 const uint32 x0 = (*gen)(); 484 const uint32 x1 = (*gen)(); 485 const uint32 x2 = (*gen)(); 486 const uint32 x3 = (*gen)(); 487 double d[2]; 488 BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]); 489 490 for (int i = 0; i < 2; ++i) { 491 if (Eigen::numext::abs(d[i]) < kTruncateValue) { 492 results[index++] = d[i]; 493 if (index >= kResultElementCount) { 494 return results; 495 } 496 } 497 } 498 } 499 } 500 }; 501 502 // Helper function to convert two 32-bit uniform integers to two floats 503 // under the unit normal distribution. 504 PHILOX_DEVICE_INLINE 505 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) { 506 // This function implements the Box-Muller transform: 507 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form 508 // Do not send a really small number to log(). 509 // We cannot mark "epsilon" as "static const" because NVCC would complain 510 const float epsilon = 1.0e-7f; 511 float u1 = Uint32ToFloat(x0); 512 if (u1 < epsilon) { 513 u1 = epsilon; 514 } 515 const float v1 = 2.0f * M_PI * Uint32ToFloat(x1); 516 const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1)); 517 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) 518 *f0 = Eigen::numext::sin(v1); 519 *f1 = Eigen::numext::cos(v1); 520 #else 521 sincosf(v1, f0, f1); 522 #endif 523 *f0 *= u2; 524 *f1 *= u2; 525 } 526 527 // Helper function to convert four 32-bit uniform integers to two doubles 528 // under the unit normal distribution. 529 PHILOX_DEVICE_INLINE 530 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, 531 double* d1) { 532 // This function implements the Box-Muller transform: 533 // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form 534 // Do not send a really small number to log(). 535 // We cannot mark "epsilon" as "static const" because NVCC would complain 536 const double epsilon = 1.0e-7; 537 double u1 = Uint64ToDouble(x0, x1); 538 if (u1 < epsilon) { 539 u1 = epsilon; 540 } 541 const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3); 542 const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1)); 543 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) 544 *d0 = Eigen::numext::sin(v1); 545 *d1 = Eigen::numext::cos(v1); 546 #else 547 sincos(v1, d0, d1); 548 #endif 549 *d0 *= u2; 550 *d1 *= u2; 551 } 552 553 // Helper function to convert an 16-bit integer to a half between [0..1). 554 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) { 555 // IEEE754 halfs are formatted as follows (MSB first): 556 // sign(1) exponent(5) mantissa(10) 557 // Conceptually construct the following: 558 // sign == 0 559 // exponent == 15 -- an excess 15 representation of a zero exponent 560 // mantissa == 10 random bits 561 const uint16 man = x & 0x3ffu; // 10 bit mantissa 562 const uint16 exp = static_cast<uint16>(15); 563 const uint16 val = (exp << 10) | man; 564 565 Eigen::half result; 566 result.x = val; 567 return result - Eigen::half(1.0); 568 } 569 570 // Helper function to convert an 32-bit integer to a float between [0..1). 571 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) { 572 // IEEE754 floats are formatted as follows (MSB first): 573 // sign(1) exponent(8) mantissa(23) 574 // Conceptually construct the following: 575 // sign == 0 576 // exponent == 127 -- an excess 127 representation of a zero exponent 577 // mantissa == 23 random bits 578 const uint32 man = x & 0x7fffffu; // 23 bit mantissa 579 const uint32 exp = static_cast<uint32>(127); 580 const uint32 val = (exp << 23) | man; 581 582 // Assumes that endian-ness is same for float and uint32. 583 float result; 584 memcpy(&result, &val, sizeof(val)); 585 return result - 1.0f; 586 } 587 588 // Helper function to convert two 32-bit integers to a double between [0..1). 589 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) { 590 // IEEE754 doubles are formatted as follows (MSB first): 591 // sign(1) exponent(11) mantissa(52) 592 // Conceptually construct the following: 593 // sign == 0 594 // exponent == 1023 -- an excess 1023 representation of a zero exponent 595 // mantissa == 52 random bits 596 const uint32 mhi = x0 & 0xfffffu; // upper 20 bits of mantissa 597 const uint32 mlo = x1; // lower 32 bits of mantissa 598 const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo; // mantissa 599 const uint64 exp = static_cast<uint64>(1023); 600 const uint64 val = (exp << 52) | man; 601 // Assumes that endian-ness is same for double and uint64. 602 double result; 603 memcpy(&result, &val, sizeof(val)); 604 return result - 1.0; 605 } 606 607 } // namespace random 608 } // namespace tensorflow 609 610 #endif // TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ 611