Home | History | Annotate | Download | only in bits
      1 // Optimizations for random number functions, x86 version -*- C++ -*-
      2 
      3 // Copyright (C) 2012-2013 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /** @file bits/opt_random.h
     26  *  This is an internal header file, included by other library headers.
     27  *  Do not attempt to use it directly. @headername{random}
     28  */
     29 
     30 #ifndef _BITS_OPT_RANDOM_H
     31 #define _BITS_OPT_RANDOM_H 1
     32 
     33 #include <x86intrin.h>
     34 
     35 
     36 #pragma GCC system_header
     37 
     38 
     39 namespace std _GLIBCXX_VISIBILITY(default)
     40 {
     41 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     42 
     43 #ifdef __SSE3__
     44   template<>
     45     template<typename _UniformRandomNumberGenerator>
     46       void
     47       normal_distribution<double>::
     48       __generate(typename normal_distribution<double>::result_type* __f,
     49 		 typename normal_distribution<double>::result_type* __t,
     50 		 _UniformRandomNumberGenerator& __urng,
     51 		 const param_type& __param)
     52       {
     53 	typedef uint64_t __uctype;
     54 
     55 	if (__f == __t)
     56 	  return;
     57 
     58 	if (_M_saved_available)
     59 	  {
     60 	    _M_saved_available = false;
     61 	    *__f++ = _M_saved * __param.stddev() + __param.mean();
     62 
     63 	    if (__f == __t)
     64 	      return;
     65 	  }
     66 
     67 	constexpr uint64_t __maskval = 0xfffffffffffffull;
     68 	static const __m128i __mask = _mm_set1_epi64x(__maskval);
     69 	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull);
     70 	static const __m128d __three = _mm_set1_pd(3.0);
     71 	const __m128d __av = _mm_set1_pd(__param.mean());
     72 
     73 	const __uctype __urngmin = __urng.min();
     74 	const __uctype __urngmax = __urng.max();
     75 	const __uctype __urngrange = __urngmax - __urngmin;
     76 	const __uctype __uerngrange = __urngrange + 1;
     77 
     78 	while (__f + 1 < __t)
     79 	  {
     80 	    double __le;
     81 	    __m128d __x;
     82 	    do
     83 	      {
     84                 union
     85                 {
     86                   __m128i __i;
     87                   __m128d __d;
     88 		} __v;
     89 
     90 		if (__urngrange > __maskval)
     91 		  {
     92 		    if (__detail::_Power_of_2(__uerngrange))
     93 		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(),
     94 							     __urng()),
     95 					      __mask);
     96 		    else
     97 		      {
     98 			const __uctype __uerange = __maskval + 1;
     99 			const __uctype __scaling = __urngrange / __uerange;
    100 			const __uctype __past = __uerange * __scaling;
    101 			uint64_t __v1;
    102 			do
    103 			  __v1 = __uctype(__urng()) - __urngmin;
    104 			while (__v1 >= __past);
    105 			__v1 /= __scaling;
    106 			uint64_t __v2;
    107 			do
    108 			  __v2 = __uctype(__urng()) - __urngmin;
    109 			while (__v2 >= __past);
    110 			__v2 /= __scaling;
    111 
    112 			__v.__i = _mm_set_epi64x(__v1, __v2);
    113 		      }
    114 		  }
    115 		else if (__urngrange == __maskval)
    116 		  __v.__i = _mm_set_epi64x(__urng(), __urng());
    117 		else if ((__urngrange + 2) * __urngrange >= __maskval
    118 			 && __detail::_Power_of_2(__uerngrange))
    119 		  {
    120 		    uint64_t __v1 = __urng() * __uerngrange + __urng();
    121 		    uint64_t __v2 = __urng() * __uerngrange + __urng();
    122 
    123 		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2),
    124 					    __mask);
    125 		  }
    126 		else
    127 		  {
    128 		    size_t __nrng = 2;
    129 		    __uctype __high = __maskval / __uerngrange / __uerngrange;
    130 		    while (__high > __uerngrange)
    131 		      {
    132 			++__nrng;
    133 			__high /= __uerngrange;
    134 		      }
    135 		    const __uctype __highrange = __high + 1;
    136 		    const __uctype __scaling = __urngrange / __highrange;
    137 		    const __uctype __past = __highrange * __scaling;
    138 		    __uctype __tmp;
    139 
    140 		    uint64_t __v1;
    141 		    do
    142 		      {
    143 			do
    144 			  __tmp = __uctype(__urng()) - __urngmin;
    145 			while (__tmp >= __past);
    146 			__v1 = __tmp / __scaling;
    147 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
    148 			  {
    149 			    __tmp = __v1;
    150 			    __v1 *= __uerngrange;
    151 			    __v1 += __uctype(__urng()) - __urngmin;
    152 			  }
    153 		      }
    154 		    while (__v1 > __maskval || __v1 < __tmp);
    155 
    156 		    uint64_t __v2;
    157 		    do
    158 		      {
    159 			do
    160 			  __tmp = __uctype(__urng()) - __urngmin;
    161 			while (__tmp >= __past);
    162 			__v2 = __tmp / __scaling;
    163 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
    164 			  {
    165 			    __tmp = __v2;
    166 			    __v2 *= __uerngrange;
    167 			    __v2 += __uctype(__urng()) - __urngmin;
    168 			  }
    169 		      }
    170 		    while (__v2 > __maskval || __v2 < __tmp);
    171 
    172 		    __v.__i = _mm_set_epi64x(__v1, __v2);
    173 		  }
    174 
    175 		__v.__i = _mm_or_si128(__v.__i, __two);
    176 		__x = _mm_sub_pd(__v.__d, __three);
    177 		__m128d __m = _mm_mul_pd(__x, __x);
    178 		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m));
    179               }
    180             while (__le == 0.0 || __le >= 1.0);
    181 
    182             double __mult = (std::sqrt(-2.0 * std::log(__le) / __le)
    183                              * __param.stddev());
    184 
    185             __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av);
    186 
    187             _mm_storeu_pd(__f, __x);
    188             __f += 2;
    189           }
    190 
    191         if (__f != __t)
    192           {
    193             result_type __x, __y, __r2;
    194 
    195             __detail::_Adaptor<_UniformRandomNumberGenerator, result_type>
    196               __aurng(__urng);
    197 
    198             do
    199               {
    200                 __x = result_type(2.0) * __aurng() - 1.0;
    201                 __y = result_type(2.0) * __aurng() - 1.0;
    202                 __r2 = __x * __x + __y * __y;
    203               }
    204             while (__r2 > 1.0 || __r2 == 0.0);
    205 
    206             const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2);
    207             _M_saved = __x * __mult;
    208             _M_saved_available = true;
    209             *__f = __y * __mult * __param.stddev() + __param.mean();
    210           }
    211       }
    212 #endif
    213 
    214 
    215 _GLIBCXX_END_NAMESPACE_VERSION
    216 } // namespace
    217 
    218 
    219 #endif // _BITS_OPT_RANDOM_H
    220