x86_64-linux/bits/opt_random.h

// Optimizations for random number functions, x86 version -*- C++ -*-

// Copyright (C) 2012-2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/opt_random.h
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{random}
 */

#ifndef _BITS_OPT_RANDOM_H
#define _BITS_OPT_RANDOM_H 1

#include <x86intrin.h>


#pragma GCC system_header


namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION

#ifdef __SSE3__
  template<>
    template<typename _UniformRandomNumberGenerator>
      void
      normal_distribution<double>::
      __generate(typename normal_distribution<double>::result_type* __f,
		 typename normal_distribution<double>::result_type* __t,
		 _UniformRandomNumberGenerator& __urng,
		 const param_type& __param)
      {
	typedef uint64_t __uctype;

	if (__f == __t)
	  return;

	if (_M_saved_available)
	  {
	    _M_saved_available = false;
	    *__f++ = _M_saved * __param.stddev() + __param.mean();

	    if (__f == __t)
	      return;
	  }

	constexpr uint64_t __maskval = 0xfffffffffffffull;
	static const __m128i __mask = _mm_set1_epi64x(__maskval);
	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull);
	static const __m128d __three = _mm_set1_pd(3.0);
	const __m128d __av = _mm_set1_pd(__param.mean());

	const __uctype __urngmin = __urng.min();
	const __uctype __urngmax = __urng.max();
	const __uctype __urngrange = __urngmax - __urngmin;
	const __uctype __uerngrange = __urngrange + 1;

	while (__f + 1 < __t)
	  {
	    double __le;
	    __m128d __x;
	    do
	      {
                union
                {
                  __m128i __i;
                  __m128d __d;
		} __v;

		if (__urngrange > __maskval)
		  {
		    if (__detail::_Power_of_2(__uerngrange))
		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(),
							     __urng()),
					      __mask);
		    else
		      {
			const __uctype __uerange = __maskval + 1;
			const __uctype __scaling = __urngrange / __uerange;
			const __uctype __past = __uerange * __scaling;
			uint64_t __v1;
			do
			  __v1 = __uctype(__urng()) - __urngmin;
			while (__v1 >= __past);
			__v1 /= __scaling;
			uint64_t __v2;
			do
			  __v2 = __uctype(__urng()) - __urngmin;
			while (__v2 >= __past);
			__v2 /= __scaling;

			__v.__i = _mm_set_epi64x(__v1, __v2);
		      }
		  }
		else if (__urngrange == __maskval)
		  __v.__i = _mm_set_epi64x(__urng(), __urng());
		else if ((__urngrange + 2) * __urngrange >= __maskval
			 && __detail::_Power_of_2(__uerngrange))
		  {
		    uint64_t __v1 = __urng() * __uerngrange + __urng();
		    uint64_t __v2 = __urng() * __uerngrange + __urng();

		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2),
					    __mask);
		  }
		else
		  {
		    size_t __nrng = 2;
		    __uctype __high = __maskval / __uerngrange / __uerngrange;
		    while (__high > __uerngrange)
		      {
			++__nrng;
			__high /= __uerngrange;
		      }
		    const __uctype __highrange = __high + 1;
		    const __uctype __scaling = __urngrange / __highrange;
		    const __uctype __past = __highrange * __scaling;
		    __uctype __tmp;

		    uint64_t __v1;
		    do
		      {
			do
			  __tmp = __uctype(__urng()) - __urngmin;
			while (__tmp >= __past);
			__v1 = __tmp / __scaling;
			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
			  {
			    __tmp = __v1;
			    __v1 *= __uerngrange;
			    __v1 += __uctype(__urng()) - __urngmin;
			  }
		      }
		    while (__v1 > __maskval || __v1 < __tmp);

		    uint64_t __v2;
		    do
		      {
			do
			  __tmp = __uctype(__urng()) - __urngmin;
			while (__tmp >= __past);
			__v2 = __tmp / __scaling;
			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
			  {
			    __tmp = __v2;
			    __v2 *= __uerngrange;
			    __v2 += __uctype(__urng()) - __urngmin;
			  }
		      }
		    while (__v2 > __maskval || __v2 < __tmp);

		    __v.__i = _mm_set_epi64x(__v1, __v2);
		  }

		__v.__i = _mm_or_si128(__v.__i, __two);
		__x = _mm_sub_pd(__v.__d, __three);
		__m128d __m = _mm_mul_pd(__x, __x);
		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m));
              }
            while (__le == 0.0 || __le >= 1.0);

            double __mult = (std::sqrt(-2.0 * std::log(__le) / __le)
                             * __param.stddev());

            __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av);

            _mm_storeu_pd(__f, __x);
            __f += 2;
          }

        if (__f != __t)
          {
            result_type __x, __y, __r2;

            __detail::_Adaptor<_UniformRandomNumberGenerator, result_type>
              __aurng(__urng);

            do
              {
                __x = result_type(2.0) * __aurng() - 1.0;
                __y = result_type(2.0) * __aurng() - 1.0;
                __r2 = __x * __x + __y * __y;
              }
            while (__r2 > 1.0 || __r2 == 0.0);

            const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2);
            _M_saved = __x * __mult;
            _M_saved_available = true;
            *__f = __y * __mult * __param.stddev() + __param.mean();
          }
      }
#endif


_GLIBCXX_END_NAMESPACE_VERSION
} // namespace


#endif // _BITS_OPT_RANDOM_H