1 // -*- C++ -*- 2 3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the terms 7 // of the GNU General Public License as published by the Free Software 8 // Foundation; either version 3, or (at your option) any later 9 // version. 10 11 // This library is distributed in the hope that it will be useful, but 12 // WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 // General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** @file parallel/settings.h 26 * @brief Runtime settings and tuning parameters, heuristics to decide 27 * whether to use parallelized algorithms. 28 * This file is a GNU parallel extension to the Standard C++ Library. 29 * 30 * @section parallelization_decision 31 * The decision whether to run an algorithm in parallel. 32 * 33 * There are several ways the user can switch on and off the parallel 34 * execution of an algorithm, both at compile- and run-time. 35 * 36 * Only sequential execution can be forced at compile-time. This 37 * reduces code size and protects code parts that have 38 * non-thread-safe side effects. 39 * 40 * Ultimately, forcing parallel execution at compile-time makes 41 * sense. Often, the sequential algorithm implementation is used as 42 * a subroutine, so no reduction in code size can be achieved. Also, 43 * the machine the program is run on might have only one processor 44 * core, so to avoid overhead, the algorithm is executed 45 * sequentially. 46 * 47 * To force sequential execution of an algorithm ultimately at 48 * compile-time, the user must add the tag 49 * __gnu_parallel::sequential_tag() to the end of the parameter list, 50 * e. g. 51 * 52 * \code 53 * std::sort(v.begin(), v.end(), __gnu_parallel::sequential_tag()); 54 * \endcode 55 * 56 * This is compatible with all overloaded algorithm variants. No 57 * additional code will be instantiated, at all. The same holds for 58 * most algorithm calls with iterators not providing random access. 59 * 60 * If the algorithm call is not forced to be executed sequentially 61 * at compile-time, the decision is made at run-time. 62 * The global variable __gnu_parallel::_Settings::algorithm_strategy 63 * is checked. It is a tristate variable corresponding to: 64 * 65 * a. force_sequential, meaning the sequential algorithm is executed. 66 * b. force_parallel, meaning the parallel algorithm is executed. 67 * c. heuristic 68 * 69 * For heuristic, the parallel algorithm implementation is called 70 * only if the input size is sufficiently large. For most 71 * algorithms, the input size is the (combined) length of the input 72 * sequence(s). The threshold can be set by the user, individually 73 * for each algorithm. The according variables are called 74 * __gnu_parallel::_Settings::[algorithm]_minimal_n . 75 * 76 * For some of the algorithms, there are even more tuning options, 77 * e. g. the ability to choose from multiple algorithm variants. See 78 * below for details. 79 */ 80 81 // Written by Johannes Singler and Felix Putze. 82 83 #ifndef _GLIBCXX_PARALLEL_SETTINGS_H 84 #define _GLIBCXX_PARALLEL_SETTINGS_H 1 85 86 #include <parallel/types.h> 87 88 /** 89 * @brief Determine at compile(?)-time if the parallel variant of an 90 * algorithm should be called. 91 * @param c A condition that is convertible to bool that is overruled by 92 * __gnu_parallel::_Settings::algorithm_strategy. Usually a decision 93 * based on the input size. 94 */ 95 #define _GLIBCXX_PARALLEL_CONDITION(c) (__gnu_parallel::_Settings::get().algorithm_strategy != __gnu_parallel::force_sequential && ((__gnu_parallel::get_max_threads() > 1 && (c)) || __gnu_parallel::_Settings::get().algorithm_strategy == __gnu_parallel::force_parallel)) 96 97 /* 98 inline bool 99 parallel_condition(bool c) 100 { 101 bool ret = false; 102 const _Settings& s = _Settings::get(); 103 if (s.algorithm_strategy != force_seqential) 104 { 105 if (s.algorithm_strategy == force_parallel) 106 ret = true; 107 else 108 ret = get_max_threads() > 1 && c; 109 } 110 return ret; 111 } 112 */ 113 114 namespace __gnu_parallel 115 { 116 /// class _Settings 117 /// Run-time settings for the parallel mode, including all tunable parameters. 118 struct _Settings 119 { 120 _AlgorithmStrategy algorithm_strategy; 121 122 _SortAlgorithm sort_algorithm; 123 _PartialSumAlgorithm partial_sum_algorithm; 124 _MultiwayMergeAlgorithm multiway_merge_algorithm; 125 _FindAlgorithm find_algorithm; 126 127 _SplittingAlgorithm sort_splitting; 128 _SplittingAlgorithm merge_splitting; 129 _SplittingAlgorithm multiway_merge_splitting; 130 131 // Per-algorithm settings. 132 133 /// Minimal input size for accumulate. 134 sequence_index_t accumulate_minimal_n; 135 136 /// Minimal input size for adjacent_difference. 137 unsigned int adjacent_difference_minimal_n; 138 139 /// Minimal input size for count and count_if. 140 sequence_index_t count_minimal_n; 141 142 /// Minimal input size for fill. 143 sequence_index_t fill_minimal_n; 144 145 /// Block size increase factor for find. 146 double find_increasing_factor; 147 148 /// Initial block size for find. 149 sequence_index_t find_initial_block_size; 150 151 /// Maximal block size for find. 152 sequence_index_t find_maximum_block_size; 153 154 /// Start with looking for this many elements sequentially, for find. 155 sequence_index_t find_sequential_search_size; 156 157 /// Minimal input size for for_each. 158 sequence_index_t for_each_minimal_n; 159 160 /// Minimal input size for generate. 161 sequence_index_t generate_minimal_n; 162 163 /// Minimal input size for max_element. 164 sequence_index_t max_element_minimal_n; 165 166 /// Minimal input size for merge. 167 sequence_index_t merge_minimal_n; 168 169 /// Oversampling factor for merge. 170 unsigned int merge_oversampling; 171 172 /// Minimal input size for min_element. 173 sequence_index_t min_element_minimal_n; 174 175 /// Minimal input size for multiway_merge. 176 sequence_index_t multiway_merge_minimal_n; 177 178 /// Oversampling factor for multiway_merge. 179 int multiway_merge_minimal_k; 180 181 /// Oversampling factor for multiway_merge. 182 unsigned int multiway_merge_oversampling; 183 184 /// Minimal input size for nth_element. 185 sequence_index_t nth_element_minimal_n; 186 187 /// Chunk size for partition. 188 sequence_index_t partition_chunk_size; 189 190 /// Chunk size for partition, relative to input size. If > 0.0, 191 /// this value overrides partition_chunk_size. 192 double partition_chunk_share; 193 194 /// Minimal input size for partition. 195 sequence_index_t partition_minimal_n; 196 197 /// Minimal input size for partial_sort. 198 sequence_index_t partial_sort_minimal_n; 199 200 /// Ratio for partial_sum. Assume "sum and write result" to be 201 /// this factor slower than just "sum". 202 float partial_sum_dilation; 203 204 /// Minimal input size for partial_sum. 205 unsigned int partial_sum_minimal_n; 206 207 /// Minimal input size for random_shuffle. 208 unsigned int random_shuffle_minimal_n; 209 210 /// Minimal input size for replace and replace_if. 211 sequence_index_t replace_minimal_n; 212 213 /// Minimal input size for set_difference. 214 sequence_index_t set_difference_minimal_n; 215 216 /// Minimal input size for set_intersection. 217 sequence_index_t set_intersection_minimal_n; 218 219 /// Minimal input size for set_symmetric_difference. 220 sequence_index_t set_symmetric_difference_minimal_n; 221 222 /// Minimal input size for set_union. 223 sequence_index_t set_union_minimal_n; 224 225 /// Minimal input size for parallel sorting. 226 sequence_index_t sort_minimal_n; 227 228 /// Oversampling factor for parallel std::sort (MWMS). 229 unsigned int sort_mwms_oversampling; 230 231 /// Such many samples to take to find a good pivot (quicksort). 232 unsigned int sort_qs_num_samples_preset; 233 234 /// Maximal subsequence length to switch to unbalanced base case. 235 /// Applies to std::sort with dynamically load-balanced quicksort. 236 sequence_index_t sort_qsb_base_case_maximal_n; 237 238 /// Minimal input size for parallel std::transform. 239 sequence_index_t transform_minimal_n; 240 241 /// Minimal input size for unique_copy. 242 sequence_index_t unique_copy_minimal_n; 243 244 sequence_index_t workstealing_chunk_size; 245 246 // Hardware dependent tuning parameters. 247 248 /// Size of the L1 cache in bytes (underestimation). 249 unsigned long long L1_cache_size; 250 251 /// Size of the L2 cache in bytes (underestimation). 252 unsigned long long L2_cache_size; 253 254 /// Size of the Translation Lookaside Buffer (underestimation). 255 unsigned int TLB_size; 256 257 /// Overestimation of cache line size. Used to avoid false 258 /// sharing, i. e. elements of different threads are at least this 259 /// amount apart. 260 unsigned int cache_line_size; 261 262 // Statistics. 263 264 /// The number of stolen ranges in load-balanced quicksort. 265 sequence_index_t qsb_steals; 266 267 /// Get the global settings. 268 static const _Settings& 269 get() throw(); 270 271 /// Set the global settings. 272 static void 273 set(_Settings&) throw(); 274 275 explicit 276 _Settings() : algorithm_strategy(heuristic), sort_algorithm(MWMS), partial_sum_algorithm(LINEAR), multiway_merge_algorithm(LOSER_TREE), find_algorithm(CONSTANT_SIZE_BLOCKS), sort_splitting(EXACT), merge_splitting(EXACT), multiway_merge_splitting(EXACT), accumulate_minimal_n(1000), adjacent_difference_minimal_n(1000), count_minimal_n(1000), fill_minimal_n(1000), find_increasing_factor(2.0), find_initial_block_size(256), find_maximum_block_size(8192), find_sequential_search_size(256), for_each_minimal_n(1000), generate_minimal_n(1000), max_element_minimal_n(1000), merge_minimal_n(1000), merge_oversampling(10), min_element_minimal_n(1000), multiway_merge_minimal_n(1000), multiway_merge_minimal_k(2), multiway_merge_oversampling(10), nth_element_minimal_n(1000), partition_chunk_size(1000), partition_chunk_share(0.0), partition_minimal_n(1000), partial_sort_minimal_n(1000), partial_sum_dilation(1.0f), partial_sum_minimal_n(1000), random_shuffle_minimal_n(1000), replace_minimal_n(1000), set_difference_minimal_n(1000), set_intersection_minimal_n(1000), set_symmetric_difference_minimal_n(1000), set_union_minimal_n(1000), sort_minimal_n(1000), sort_mwms_oversampling(10), sort_qs_num_samples_preset(100), sort_qsb_base_case_maximal_n(100), transform_minimal_n(1000), unique_copy_minimal_n(10000), workstealing_chunk_size(100), L1_cache_size(16 << 10), L2_cache_size(256 << 10), TLB_size(128), cache_line_size(64), qsb_steals(0) 277 { } 278 }; 279 } 280 281 #endif /* _GLIBCXX_PARALLEL_SETTINGS_H */ 282