Home | History | Annotate | Download | only in parallel
      1 // -*- C++ -*-
      2 
      3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the terms
      7 // of the GNU General Public License as published by the Free Software
      8 // Foundation; either version 3, or (at your option) any later
      9 // version.
     10 
     11 // This library is distributed in the hope that it will be useful, but
     12 // WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 // General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 /** @file parallel/random_shuffle.h
     26  *  @brief Parallel implementation of std::random_shuffle().
     27  *  This file is a GNU parallel extension to the Standard C++ Library.
     28  */
     29 
     30 // Written by Johannes Singler.
     31 
     32 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
     33 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
     34 
     35 #include <limits>
     36 #include <bits/stl_numeric.h>
     37 #include <parallel/parallel.h>
     38 #include <parallel/random_number.h>
     39 
     40 namespace __gnu_parallel
     41 {
     42 /** @brief Type to hold the index of a bin.
     43   *
     44   *  Since many variables of this type are allocated, it should be
     45   *  chosen as small as possible.
     46   */
     47 typedef unsigned short bin_index;
     48 
     49 /** @brief Data known to every thread participating in
     50     __gnu_parallel::parallel_random_shuffle(). */
     51 template<typename RandomAccessIterator>
     52   struct DRandomShufflingGlobalData
     53   {
     54     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     55     typedef typename traits_type::value_type value_type;
     56     typedef typename traits_type::difference_type difference_type;
     57 
     58     /** @brief Begin iterator of the source. */
     59     RandomAccessIterator& source;
     60 
     61     /** @brief Temporary arrays for each thread. */
     62     value_type** temporaries;
     63 
     64     /** @brief Two-dimensional array to hold the thread-bin distribution.
     65      *
     66      *  Dimensions (num_threads + 1) x (num_bins + 1). */
     67     difference_type** dist;
     68 
     69     /** @brief Start indexes of the threads' chunks. */
     70     difference_type* starts;
     71 
     72     /** @brief Number of the thread that will further process the
     73 	corresponding bin. */
     74     thread_index_t* bin_proc;
     75 
     76     /** @brief Number of bins to distribute to. */
     77     int num_bins;
     78 
     79     /** @brief Number of bits needed to address the bins. */
     80     int num_bits;
     81 
     82     /** @brief Constructor. */
     83     DRandomShufflingGlobalData(RandomAccessIterator& _source)
     84     : source(_source) { }
     85   };
     86 
     87 /** @brief Local data for a thread participating in
     88     __gnu_parallel::parallel_random_shuffle().
     89   */
     90 template<typename RandomAccessIterator, typename RandomNumberGenerator>
     91   struct DRSSorterPU
     92   {
     93     /** @brief Number of threads participating in total. */
     94     int num_threads;
     95 
     96     /** @brief Begin index for bins taken care of by this thread. */
     97     bin_index bins_begin;
     98 
     99     /** @brief End index for bins taken care of by this thread. */
    100     bin_index bins_end;
    101 
    102     /** @brief Random seed for this thread. */
    103     uint32 seed;
    104 
    105     /** @brief Pointer to global data. */
    106     DRandomShufflingGlobalData<RandomAccessIterator>* sd;
    107   };
    108 
    109 /** @brief Generate a random number in @c [0,2^logp).
    110   *  @param logp Logarithm (basis 2) of the upper range bound.
    111   *  @param rng Random number generator to use.
    112   */
    113 template<typename RandomNumberGenerator>
    114   inline int
    115   random_number_pow2(int logp, RandomNumberGenerator& rng)
    116   { return rng.genrand_bits(logp); }
    117 
    118 /** @brief Random shuffle code executed by each thread.
    119   *  @param pus Array of thread-local data records. */
    120 template<typename RandomAccessIterator, typename RandomNumberGenerator>
    121   void
    122   parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
    123                                  RandomNumberGenerator>* pus)
    124   {
    125     typedef std::iterator_traits<RandomAccessIterator> traits_type;
    126     typedef typename traits_type::value_type value_type;
    127     typedef typename traits_type::difference_type difference_type;
    128 
    129     thread_index_t iam = omp_get_thread_num();
    130     DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
    131     DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
    132 
    133     // Indexing: dist[bin][processor]
    134     difference_type length = sd->starts[iam + 1] - sd->starts[iam];
    135     bin_index* oracles = new bin_index[length];
    136     difference_type* dist = new difference_type[sd->num_bins + 1];
    137     bin_index* bin_proc = new bin_index[sd->num_bins];
    138     value_type** temporaries = new value_type*[d->num_threads];
    139 
    140     // Compute oracles and count appearances.
    141     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
    142       dist[b] = 0;
    143     int num_bits = sd->num_bits;
    144 
    145     random_number rng(d->seed);
    146 
    147     // First main loop.
    148     for (difference_type i = 0; i < length; ++i)
    149       {
    150         bin_index oracle = random_number_pow2(num_bits, rng);
    151         oracles[i] = oracle;
    152 
    153         // To allow prefix (partial) sum.
    154         ++(dist[oracle + 1]);
    155       }
    156 
    157     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
    158       sd->dist[b][iam + 1] = dist[b];
    159 
    160 #   pragma omp barrier
    161 
    162 #   pragma omp single
    163     {
    164       // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
    165       // total number of items in bin s
    166       for (bin_index s = 0; s < sd->num_bins; ++s)
    167         __gnu_sequential::partial_sum(sd->dist[s + 1],
    168                                       sd->dist[s + 1] + d->num_threads + 1,
    169                                       sd->dist[s + 1]);
    170     }
    171 
    172 #   pragma omp barrier
    173 
    174     sequence_index_t offset = 0, global_offset = 0;
    175     for (bin_index s = 0; s < d->bins_begin; ++s)
    176       global_offset += sd->dist[s + 1][d->num_threads];
    177 
    178 #   pragma omp barrier
    179 
    180     for (bin_index s = d->bins_begin; s < d->bins_end; ++s)
    181       {
    182 	for (int t = 0; t < d->num_threads + 1; ++t)
    183 	  sd->dist[s + 1][t] += offset;
    184 	offset = sd->dist[s + 1][d->num_threads];
    185       }
    186 
    187     sd->temporaries[iam] = static_cast<value_type*>(
    188       ::operator new(sizeof(value_type) * offset));
    189 
    190 #   pragma omp barrier
    191 
    192     // Draw local copies to avoid false sharing.
    193     for (bin_index b = 0; b < sd->num_bins + 1; ++b)
    194       dist[b] = sd->dist[b][iam];
    195     for (bin_index b = 0; b < sd->num_bins; ++b)
    196       bin_proc[b] = sd->bin_proc[b];
    197     for (thread_index_t t = 0; t < d->num_threads; ++t)
    198       temporaries[t] = sd->temporaries[t];
    199 
    200     RandomAccessIterator source = sd->source;
    201     difference_type start = sd->starts[iam];
    202 
    203     // Distribute according to oracles, second main loop.
    204     for (difference_type i = 0; i < length; ++i)
    205       {
    206         bin_index target_bin = oracles[i];
    207         thread_index_t target_p = bin_proc[target_bin];
    208 
    209         // Last column [d->num_threads] stays unchanged.
    210         ::new(&(temporaries[target_p][dist[target_bin + 1]++]))
    211 	    value_type(*(source + i + start));
    212       }
    213 
    214     delete[] oracles;
    215     delete[] dist;
    216     delete[] bin_proc;
    217     delete[] temporaries;
    218 
    219 #   pragma omp barrier
    220 
    221     // Shuffle bins internally.
    222     for (bin_index b = d->bins_begin; b < d->bins_end; ++b)
    223       {
    224         value_type* begin =
    225                     sd->temporaries[iam] +
    226                     ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
    227                   * end =
    228                     sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
    229         sequential_random_shuffle(begin, end, rng);
    230         std::copy(begin, end, sd->source + global_offset +
    231             ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
    232       }
    233 
    234     ::operator delete(sd->temporaries[iam]);
    235   }
    236 
    237 /** @brief Round up to the next greater power of 2.
    238   *  @param x Integer to round up */
    239 template<typename T>
    240   T
    241   round_up_to_pow2(T x)
    242   {
    243     if (x <= 1)
    244       return 1;
    245     else
    246       return (T)1 << (__log2(x - 1) + 1);
    247   }
    248 
    249 /** @brief Main parallel random shuffle step.
    250   *  @param begin Begin iterator of sequence.
    251   *  @param end End iterator of sequence.
    252   *  @param n Length of sequence.
    253   *  @param num_threads Number of threads to use.
    254   *  @param rng Random number generator to use.
    255   */
    256 template<typename RandomAccessIterator, typename RandomNumberGenerator>
    257   void
    258   parallel_random_shuffle_drs(RandomAccessIterator begin,
    259 			      RandomAccessIterator end,
    260 			      typename std::iterator_traits
    261 			      <RandomAccessIterator>::difference_type n,
    262 			      thread_index_t num_threads,
    263 			      RandomNumberGenerator& rng)
    264   {
    265     typedef std::iterator_traits<RandomAccessIterator> traits_type;
    266     typedef typename traits_type::value_type value_type;
    267     typedef typename traits_type::difference_type difference_type;
    268 
    269     _GLIBCXX_CALL(n)
    270 
    271     const _Settings& __s = _Settings::get();
    272 
    273     if (num_threads > n)
    274       num_threads = static_cast<thread_index_t>(n);
    275 
    276     bin_index num_bins, num_bins_cache;
    277 
    278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
    279     // Try the L1 cache first.
    280 
    281     // Must fit into L1.
    282     num_bins_cache = std::max<difference_type>(
    283         1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
    284     num_bins_cache = round_up_to_pow2(num_bins_cache);
    285 
    286     // No more buckets than TLB entries, power of 2
    287     // Power of 2 and at least one element per bin, at most the TLB size.
    288     num_bins = std::min<difference_type>(n, num_bins_cache);
    289 
    290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
    291     // 2 TLB entries needed per bin.
    292     num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins);
    293 #endif
    294     num_bins = round_up_to_pow2(num_bins);
    295 
    296     if (num_bins < num_bins_cache)
    297       {
    298 #endif
    299         // Now try the L2 cache
    300         // Must fit into L2
    301         num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
    302             1, n / (__s.L2_cache_size / sizeof(value_type))));
    303         num_bins_cache = round_up_to_pow2(num_bins_cache);
    304 
    305         // No more buckets than TLB entries, power of 2.
    306         num_bins = static_cast<bin_index>(
    307             std::min(n, static_cast<difference_type>(num_bins_cache)));
    308         // Power of 2 and at least one element per bin, at most the TLB size.
    309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
    310         // 2 TLB entries needed per bin.
    311         num_bins = std::min(
    312             static_cast<difference_type>(__s.TLB_size / 2), num_bins);
    313 #endif
    314           num_bins = round_up_to_pow2(num_bins);
    315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
    316       }
    317 #endif
    318 
    319     num_threads = std::min<bin_index>(num_threads, num_bins);
    320 
    321     if (num_threads <= 1)
    322       return sequential_random_shuffle(begin, end, rng);
    323 
    324     DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
    325     DRSSorterPU<RandomAccessIterator, random_number >* pus;
    326     difference_type* starts;
    327 
    328 #   pragma omp parallel num_threads(num_threads)
    329       {
    330         thread_index_t num_threads = omp_get_num_threads();
    331 #       pragma omp single
    332           {
    333             pus = new DRSSorterPU<RandomAccessIterator, random_number>
    334                 [num_threads];
    335 
    336             sd.temporaries = new value_type*[num_threads];
    337             sd.dist = new difference_type*[num_bins + 1];
    338             sd.bin_proc = new thread_index_t[num_bins];
    339             for (bin_index b = 0; b < num_bins + 1; ++b)
    340               sd.dist[b] = new difference_type[num_threads + 1];
    341             for (bin_index b = 0; b < (num_bins + 1); ++b)
    342               {
    343                 sd.dist[0][0] = 0;
    344                 sd.dist[b][0] = 0;
    345               }
    346             starts = sd.starts = new difference_type[num_threads + 1];
    347             int bin_cursor = 0;
    348             sd.num_bins = num_bins;
    349             sd.num_bits = __log2(num_bins);
    350 
    351             difference_type chunk_length = n / num_threads,
    352                             split = n % num_threads, start = 0;
    353             difference_type bin_chunk_length = num_bins / num_threads,
    354                             bin_split = num_bins % num_threads;
    355             for (thread_index_t i = 0; i < num_threads; ++i)
    356               {
    357                 starts[i] = start;
    358                 start += (i < split) ? (chunk_length + 1) : chunk_length;
    359                 int j = pus[i].bins_begin = bin_cursor;
    360 
    361                 // Range of bins for this processor.
    362                 bin_cursor += (i < bin_split) ?
    363                     (bin_chunk_length + 1) : bin_chunk_length;
    364                 pus[i].bins_end = bin_cursor;
    365                 for (; j < bin_cursor; ++j)
    366                   sd.bin_proc[j] = i;
    367                 pus[i].num_threads = num_threads;
    368                 pus[i].seed = rng(std::numeric_limits<uint32>::max());
    369                 pus[i].sd = &sd;
    370               }
    371             starts[num_threads] = start;
    372           } //single
    373         // Now shuffle in parallel.
    374         parallel_random_shuffle_drs_pu(pus);
    375       }  // parallel
    376 
    377     delete[] starts;
    378     delete[] sd.bin_proc;
    379     for (int s = 0; s < (num_bins + 1); ++s)
    380       delete[] sd.dist[s];
    381     delete[] sd.dist;
    382     delete[] sd.temporaries;
    383 
    384     delete[] pus;
    385   }
    386 
    387 /** @brief Sequential cache-efficient random shuffle.
    388  *  @param begin Begin iterator of sequence.
    389  *  @param end End iterator of sequence.
    390  *  @param rng Random number generator to use.
    391  */
    392 template<typename RandomAccessIterator, typename RandomNumberGenerator>
    393   void
    394   sequential_random_shuffle(RandomAccessIterator begin,
    395                             RandomAccessIterator end,
    396                             RandomNumberGenerator& rng)
    397   {
    398     typedef std::iterator_traits<RandomAccessIterator> traits_type;
    399     typedef typename traits_type::value_type value_type;
    400     typedef typename traits_type::difference_type difference_type;
    401 
    402     difference_type n = end - begin;
    403     const _Settings& __s = _Settings::get();
    404 
    405     bin_index num_bins, num_bins_cache;
    406 
    407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
    408     // Try the L1 cache first, must fit into L1.
    409     num_bins_cache =
    410         std::max<difference_type>
    411             (1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
    412     num_bins_cache = round_up_to_pow2(num_bins_cache);
    413 
    414     // No more buckets than TLB entries, power of 2
    415     // Power of 2 and at least one element per bin, at most the TLB size
    416     num_bins = std::min(n, (difference_type)num_bins_cache);
    417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
    418     // 2 TLB entries needed per bin
    419     num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins);
    420 #endif
    421     num_bins = round_up_to_pow2(num_bins);
    422 
    423     if (num_bins < num_bins_cache)
    424       {
    425 #endif
    426         // Now try the L2 cache, must fit into L2.
    427         num_bins_cache =
    428             static_cast<bin_index>(std::max<difference_type>(
    429                 1, n / (__s.L2_cache_size / sizeof(value_type))));
    430         num_bins_cache = round_up_to_pow2(num_bins_cache);
    431 
    432         // No more buckets than TLB entries, power of 2
    433         // Power of 2 and at least one element per bin, at most the TLB size.
    434         num_bins = static_cast<bin_index>
    435             (std::min(n, static_cast<difference_type>(num_bins_cache)));
    436 
    437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
    438         // 2 TLB entries needed per bin
    439         num_bins =
    440             std::min<difference_type>(__s.TLB_size / 2, num_bins);
    441 #endif
    442         num_bins = round_up_to_pow2(num_bins);
    443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
    444       }
    445 #endif
    446 
    447     int num_bits = __log2(num_bins);
    448 
    449     if (num_bins > 1)
    450       {
    451         value_type* target = static_cast<value_type*>(
    452           ::operator new(sizeof(value_type) * n));
    453         bin_index* oracles = new bin_index[n];
    454         difference_type* dist0 = new difference_type[num_bins + 1],
    455                        * dist1 = new difference_type[num_bins + 1];
    456 
    457         for (int b = 0; b < num_bins + 1; ++b)
    458           dist0[b] = 0;
    459 
    460         random_number bitrng(rng(0xFFFFFFFF));
    461 
    462         for (difference_type i = 0; i < n; ++i)
    463           {
    464             bin_index oracle = random_number_pow2(num_bits, bitrng);
    465             oracles[i] = oracle;
    466 
    467             // To allow prefix (partial) sum.
    468             ++(dist0[oracle + 1]);
    469           }
    470 
    471         // Sum up bins.
    472         __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
    473 
    474         for (int b = 0; b < num_bins + 1; ++b)
    475           dist1[b] = dist0[b];
    476 
    477         // Distribute according to oracles.
    478         for (difference_type i = 0; i < n; ++i)
    479           ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i));
    480 
    481         for (int b = 0; b < num_bins; ++b)
    482           {
    483             sequential_random_shuffle(target + dist1[b],
    484                                       target + dist1[b + 1],
    485                                       rng);
    486           }
    487 
    488         // Copy elements back.
    489         std::copy(target, target + n, begin);
    490 
    491         delete[] dist0;
    492         delete[] dist1;
    493         delete[] oracles;
    494         ::operator delete(target);
    495       }
    496     else
    497       __gnu_sequential::random_shuffle(begin, end, rng);
    498   }
    499 
    500 /** @brief Parallel random public call.
    501  *  @param begin Begin iterator of sequence.
    502  *  @param end End iterator of sequence.
    503  *  @param rng Random number generator to use.
    504  */
    505 template<typename RandomAccessIterator, typename RandomNumberGenerator>
    506   inline void
    507   parallel_random_shuffle(RandomAccessIterator begin,
    508                           RandomAccessIterator end,
    509                           RandomNumberGenerator rng = random_number())
    510   {
    511     typedef std::iterator_traits<RandomAccessIterator> traits_type;
    512     typedef typename traits_type::difference_type difference_type;
    513     difference_type n = end - begin;
    514     parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
    515   }
    516 
    517 }
    518 
    519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */
    520