1 // -*- C++ -*- 2 3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the terms 7 // of the GNU General Public License as published by the Free Software 8 // Foundation; either version 3, or (at your option) any later 9 // version. 10 11 // This library is distributed in the hope that it will be useful, but 12 // WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 // General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** @file parallel/random_shuffle.h 26 * @brief Parallel implementation of std::random_shuffle(). 27 * This file is a GNU parallel extension to the Standard C++ Library. 28 */ 29 30 // Written by Johannes Singler. 31 32 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 33 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1 34 35 #include <limits> 36 #include <bits/stl_numeric.h> 37 #include <parallel/parallel.h> 38 #include <parallel/random_number.h> 39 40 namespace __gnu_parallel 41 { 42 /** @brief Type to hold the index of a bin. 43 * 44 * Since many variables of this type are allocated, it should be 45 * chosen as small as possible. 46 */ 47 typedef unsigned short bin_index; 48 49 /** @brief Data known to every thread participating in 50 __gnu_parallel::parallel_random_shuffle(). */ 51 template<typename RandomAccessIterator> 52 struct DRandomShufflingGlobalData 53 { 54 typedef std::iterator_traits<RandomAccessIterator> traits_type; 55 typedef typename traits_type::value_type value_type; 56 typedef typename traits_type::difference_type difference_type; 57 58 /** @brief Begin iterator of the source. */ 59 RandomAccessIterator& source; 60 61 /** @brief Temporary arrays for each thread. */ 62 value_type** temporaries; 63 64 /** @brief Two-dimensional array to hold the thread-bin distribution. 65 * 66 * Dimensions (num_threads + 1) x (num_bins + 1). */ 67 difference_type** dist; 68 69 /** @brief Start indexes of the threads' chunks. */ 70 difference_type* starts; 71 72 /** @brief Number of the thread that will further process the 73 corresponding bin. */ 74 thread_index_t* bin_proc; 75 76 /** @brief Number of bins to distribute to. */ 77 int num_bins; 78 79 /** @brief Number of bits needed to address the bins. */ 80 int num_bits; 81 82 /** @brief Constructor. */ 83 DRandomShufflingGlobalData(RandomAccessIterator& _source) 84 : source(_source) { } 85 }; 86 87 /** @brief Local data for a thread participating in 88 __gnu_parallel::parallel_random_shuffle(). 89 */ 90 template<typename RandomAccessIterator, typename RandomNumberGenerator> 91 struct DRSSorterPU 92 { 93 /** @brief Number of threads participating in total. */ 94 int num_threads; 95 96 /** @brief Begin index for bins taken care of by this thread. */ 97 bin_index bins_begin; 98 99 /** @brief End index for bins taken care of by this thread. */ 100 bin_index bins_end; 101 102 /** @brief Random seed for this thread. */ 103 uint32 seed; 104 105 /** @brief Pointer to global data. */ 106 DRandomShufflingGlobalData<RandomAccessIterator>* sd; 107 }; 108 109 /** @brief Generate a random number in @c [0,2^logp). 110 * @param logp Logarithm (basis 2) of the upper range bound. 111 * @param rng Random number generator to use. 112 */ 113 template<typename RandomNumberGenerator> 114 inline int 115 random_number_pow2(int logp, RandomNumberGenerator& rng) 116 { return rng.genrand_bits(logp); } 117 118 /** @brief Random shuffle code executed by each thread. 119 * @param pus Array of thread-local data records. */ 120 template<typename RandomAccessIterator, typename RandomNumberGenerator> 121 void 122 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, 123 RandomNumberGenerator>* pus) 124 { 125 typedef std::iterator_traits<RandomAccessIterator> traits_type; 126 typedef typename traits_type::value_type value_type; 127 typedef typename traits_type::difference_type difference_type; 128 129 thread_index_t iam = omp_get_thread_num(); 130 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam]; 131 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd; 132 133 // Indexing: dist[bin][processor] 134 difference_type length = sd->starts[iam + 1] - sd->starts[iam]; 135 bin_index* oracles = new bin_index[length]; 136 difference_type* dist = new difference_type[sd->num_bins + 1]; 137 bin_index* bin_proc = new bin_index[sd->num_bins]; 138 value_type** temporaries = new value_type*[d->num_threads]; 139 140 // Compute oracles and count appearances. 141 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 142 dist[b] = 0; 143 int num_bits = sd->num_bits; 144 145 random_number rng(d->seed); 146 147 // First main loop. 148 for (difference_type i = 0; i < length; ++i) 149 { 150 bin_index oracle = random_number_pow2(num_bits, rng); 151 oracles[i] = oracle; 152 153 // To allow prefix (partial) sum. 154 ++(dist[oracle + 1]); 155 } 156 157 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 158 sd->dist[b][iam + 1] = dist[b]; 159 160 # pragma omp barrier 161 162 # pragma omp single 163 { 164 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the 165 // total number of items in bin s 166 for (bin_index s = 0; s < sd->num_bins; ++s) 167 __gnu_sequential::partial_sum(sd->dist[s + 1], 168 sd->dist[s + 1] + d->num_threads + 1, 169 sd->dist[s + 1]); 170 } 171 172 # pragma omp barrier 173 174 sequence_index_t offset = 0, global_offset = 0; 175 for (bin_index s = 0; s < d->bins_begin; ++s) 176 global_offset += sd->dist[s + 1][d->num_threads]; 177 178 # pragma omp barrier 179 180 for (bin_index s = d->bins_begin; s < d->bins_end; ++s) 181 { 182 for (int t = 0; t < d->num_threads + 1; ++t) 183 sd->dist[s + 1][t] += offset; 184 offset = sd->dist[s + 1][d->num_threads]; 185 } 186 187 sd->temporaries[iam] = static_cast<value_type*>( 188 ::operator new(sizeof(value_type) * offset)); 189 190 # pragma omp barrier 191 192 // Draw local copies to avoid false sharing. 193 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 194 dist[b] = sd->dist[b][iam]; 195 for (bin_index b = 0; b < sd->num_bins; ++b) 196 bin_proc[b] = sd->bin_proc[b]; 197 for (thread_index_t t = 0; t < d->num_threads; ++t) 198 temporaries[t] = sd->temporaries[t]; 199 200 RandomAccessIterator source = sd->source; 201 difference_type start = sd->starts[iam]; 202 203 // Distribute according to oracles, second main loop. 204 for (difference_type i = 0; i < length; ++i) 205 { 206 bin_index target_bin = oracles[i]; 207 thread_index_t target_p = bin_proc[target_bin]; 208 209 // Last column [d->num_threads] stays unchanged. 210 ::new(&(temporaries[target_p][dist[target_bin + 1]++])) 211 value_type(*(source + i + start)); 212 } 213 214 delete[] oracles; 215 delete[] dist; 216 delete[] bin_proc; 217 delete[] temporaries; 218 219 # pragma omp barrier 220 221 // Shuffle bins internally. 222 for (bin_index b = d->bins_begin; b < d->bins_end; ++b) 223 { 224 value_type* begin = 225 sd->temporaries[iam] + 226 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), 227 * end = 228 sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; 229 sequential_random_shuffle(begin, end, rng); 230 std::copy(begin, end, sd->source + global_offset + 231 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); 232 } 233 234 ::operator delete(sd->temporaries[iam]); 235 } 236 237 /** @brief Round up to the next greater power of 2. 238 * @param x Integer to round up */ 239 template<typename T> 240 T 241 round_up_to_pow2(T x) 242 { 243 if (x <= 1) 244 return 1; 245 else 246 return (T)1 << (__log2(x - 1) + 1); 247 } 248 249 /** @brief Main parallel random shuffle step. 250 * @param begin Begin iterator of sequence. 251 * @param end End iterator of sequence. 252 * @param n Length of sequence. 253 * @param num_threads Number of threads to use. 254 * @param rng Random number generator to use. 255 */ 256 template<typename RandomAccessIterator, typename RandomNumberGenerator> 257 void 258 parallel_random_shuffle_drs(RandomAccessIterator begin, 259 RandomAccessIterator end, 260 typename std::iterator_traits 261 <RandomAccessIterator>::difference_type n, 262 thread_index_t num_threads, 263 RandomNumberGenerator& rng) 264 { 265 typedef std::iterator_traits<RandomAccessIterator> traits_type; 266 typedef typename traits_type::value_type value_type; 267 typedef typename traits_type::difference_type difference_type; 268 269 _GLIBCXX_CALL(n) 270 271 const _Settings& __s = _Settings::get(); 272 273 if (num_threads > n) 274 num_threads = static_cast<thread_index_t>(n); 275 276 bin_index num_bins, num_bins_cache; 277 278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 279 // Try the L1 cache first. 280 281 // Must fit into L1. 282 num_bins_cache = std::max<difference_type>( 283 1, n / (__s.L1_cache_size_lb / sizeof(value_type))); 284 num_bins_cache = round_up_to_pow2(num_bins_cache); 285 286 // No more buckets than TLB entries, power of 2 287 // Power of 2 and at least one element per bin, at most the TLB size. 288 num_bins = std::min<difference_type>(n, num_bins_cache); 289 290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 291 // 2 TLB entries needed per bin. 292 num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins); 293 #endif 294 num_bins = round_up_to_pow2(num_bins); 295 296 if (num_bins < num_bins_cache) 297 { 298 #endif 299 // Now try the L2 cache 300 // Must fit into L2 301 num_bins_cache = static_cast<bin_index>(std::max<difference_type>( 302 1, n / (__s.L2_cache_size / sizeof(value_type)))); 303 num_bins_cache = round_up_to_pow2(num_bins_cache); 304 305 // No more buckets than TLB entries, power of 2. 306 num_bins = static_cast<bin_index>( 307 std::min(n, static_cast<difference_type>(num_bins_cache))); 308 // Power of 2 and at least one element per bin, at most the TLB size. 309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 310 // 2 TLB entries needed per bin. 311 num_bins = std::min( 312 static_cast<difference_type>(__s.TLB_size / 2), num_bins); 313 #endif 314 num_bins = round_up_to_pow2(num_bins); 315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 316 } 317 #endif 318 319 num_threads = std::min<bin_index>(num_threads, num_bins); 320 321 if (num_threads <= 1) 322 return sequential_random_shuffle(begin, end, rng); 323 324 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin); 325 DRSSorterPU<RandomAccessIterator, random_number >* pus; 326 difference_type* starts; 327 328 # pragma omp parallel num_threads(num_threads) 329 { 330 thread_index_t num_threads = omp_get_num_threads(); 331 # pragma omp single 332 { 333 pus = new DRSSorterPU<RandomAccessIterator, random_number> 334 [num_threads]; 335 336 sd.temporaries = new value_type*[num_threads]; 337 sd.dist = new difference_type*[num_bins + 1]; 338 sd.bin_proc = new thread_index_t[num_bins]; 339 for (bin_index b = 0; b < num_bins + 1; ++b) 340 sd.dist[b] = new difference_type[num_threads + 1]; 341 for (bin_index b = 0; b < (num_bins + 1); ++b) 342 { 343 sd.dist[0][0] = 0; 344 sd.dist[b][0] = 0; 345 } 346 starts = sd.starts = new difference_type[num_threads + 1]; 347 int bin_cursor = 0; 348 sd.num_bins = num_bins; 349 sd.num_bits = __log2(num_bins); 350 351 difference_type chunk_length = n / num_threads, 352 split = n % num_threads, start = 0; 353 difference_type bin_chunk_length = num_bins / num_threads, 354 bin_split = num_bins % num_threads; 355 for (thread_index_t i = 0; i < num_threads; ++i) 356 { 357 starts[i] = start; 358 start += (i < split) ? (chunk_length + 1) : chunk_length; 359 int j = pus[i].bins_begin = bin_cursor; 360 361 // Range of bins for this processor. 362 bin_cursor += (i < bin_split) ? 363 (bin_chunk_length + 1) : bin_chunk_length; 364 pus[i].bins_end = bin_cursor; 365 for (; j < bin_cursor; ++j) 366 sd.bin_proc[j] = i; 367 pus[i].num_threads = num_threads; 368 pus[i].seed = rng(std::numeric_limits<uint32>::max()); 369 pus[i].sd = &sd; 370 } 371 starts[num_threads] = start; 372 } //single 373 // Now shuffle in parallel. 374 parallel_random_shuffle_drs_pu(pus); 375 } // parallel 376 377 delete[] starts; 378 delete[] sd.bin_proc; 379 for (int s = 0; s < (num_bins + 1); ++s) 380 delete[] sd.dist[s]; 381 delete[] sd.dist; 382 delete[] sd.temporaries; 383 384 delete[] pus; 385 } 386 387 /** @brief Sequential cache-efficient random shuffle. 388 * @param begin Begin iterator of sequence. 389 * @param end End iterator of sequence. 390 * @param rng Random number generator to use. 391 */ 392 template<typename RandomAccessIterator, typename RandomNumberGenerator> 393 void 394 sequential_random_shuffle(RandomAccessIterator begin, 395 RandomAccessIterator end, 396 RandomNumberGenerator& rng) 397 { 398 typedef std::iterator_traits<RandomAccessIterator> traits_type; 399 typedef typename traits_type::value_type value_type; 400 typedef typename traits_type::difference_type difference_type; 401 402 difference_type n = end - begin; 403 const _Settings& __s = _Settings::get(); 404 405 bin_index num_bins, num_bins_cache; 406 407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 408 // Try the L1 cache first, must fit into L1. 409 num_bins_cache = 410 std::max<difference_type> 411 (1, n / (__s.L1_cache_size_lb / sizeof(value_type))); 412 num_bins_cache = round_up_to_pow2(num_bins_cache); 413 414 // No more buckets than TLB entries, power of 2 415 // Power of 2 and at least one element per bin, at most the TLB size 416 num_bins = std::min(n, (difference_type)num_bins_cache); 417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 418 // 2 TLB entries needed per bin 419 num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins); 420 #endif 421 num_bins = round_up_to_pow2(num_bins); 422 423 if (num_bins < num_bins_cache) 424 { 425 #endif 426 // Now try the L2 cache, must fit into L2. 427 num_bins_cache = 428 static_cast<bin_index>(std::max<difference_type>( 429 1, n / (__s.L2_cache_size / sizeof(value_type)))); 430 num_bins_cache = round_up_to_pow2(num_bins_cache); 431 432 // No more buckets than TLB entries, power of 2 433 // Power of 2 and at least one element per bin, at most the TLB size. 434 num_bins = static_cast<bin_index> 435 (std::min(n, static_cast<difference_type>(num_bins_cache))); 436 437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 438 // 2 TLB entries needed per bin 439 num_bins = 440 std::min<difference_type>(__s.TLB_size / 2, num_bins); 441 #endif 442 num_bins = round_up_to_pow2(num_bins); 443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 444 } 445 #endif 446 447 int num_bits = __log2(num_bins); 448 449 if (num_bins > 1) 450 { 451 value_type* target = static_cast<value_type*>( 452 ::operator new(sizeof(value_type) * n)); 453 bin_index* oracles = new bin_index[n]; 454 difference_type* dist0 = new difference_type[num_bins + 1], 455 * dist1 = new difference_type[num_bins + 1]; 456 457 for (int b = 0; b < num_bins + 1; ++b) 458 dist0[b] = 0; 459 460 random_number bitrng(rng(0xFFFFFFFF)); 461 462 for (difference_type i = 0; i < n; ++i) 463 { 464 bin_index oracle = random_number_pow2(num_bits, bitrng); 465 oracles[i] = oracle; 466 467 // To allow prefix (partial) sum. 468 ++(dist0[oracle + 1]); 469 } 470 471 // Sum up bins. 472 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0); 473 474 for (int b = 0; b < num_bins + 1; ++b) 475 dist1[b] = dist0[b]; 476 477 // Distribute according to oracles. 478 for (difference_type i = 0; i < n; ++i) 479 ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i)); 480 481 for (int b = 0; b < num_bins; ++b) 482 { 483 sequential_random_shuffle(target + dist1[b], 484 target + dist1[b + 1], 485 rng); 486 } 487 488 // Copy elements back. 489 std::copy(target, target + n, begin); 490 491 delete[] dist0; 492 delete[] dist1; 493 delete[] oracles; 494 ::operator delete(target); 495 } 496 else 497 __gnu_sequential::random_shuffle(begin, end, rng); 498 } 499 500 /** @brief Parallel random public call. 501 * @param begin Begin iterator of sequence. 502 * @param end End iterator of sequence. 503 * @param rng Random number generator to use. 504 */ 505 template<typename RandomAccessIterator, typename RandomNumberGenerator> 506 inline void 507 parallel_random_shuffle(RandomAccessIterator begin, 508 RandomAccessIterator end, 509 RandomNumberGenerator rng = random_number()) 510 { 511 typedef std::iterator_traits<RandomAccessIterator> traits_type; 512 typedef typename traits_type::difference_type difference_type; 513 difference_type n = end - begin; 514 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ; 515 } 516 517 } 518 519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */ 520