1 /* 2 * Generate/analyze pareto/zipf distributions to better understand 3 * what an access pattern would look like. 4 * 5 * For instance, the following would generate a zipf distribution 6 * with theta 1.2, using 100,000 values and split the reporting into 7 * 20 buckets: 8 * 9 * t/genzipf zipf 1.2 100000 20 10 * 11 * Only the distribution type (zipf or pareto) and spread input need 12 * to be given, if not given defaults are used. 13 * 14 */ 15 #include <stdio.h> 16 #include <stdlib.h> 17 #include <fcntl.h> 18 #include <string.h> 19 #include <unistd.h> 20 21 #include "../lib/zipf.h" 22 #include "../flist.h" 23 #include "../hash.h" 24 25 #define DEF_NR 1000000 26 #define DEF_NR_OUTPUT 23 27 28 struct node { 29 struct flist_head list; 30 unsigned long long val; 31 unsigned long hits; 32 }; 33 34 static struct flist_head *hash; 35 static unsigned long hash_bits = 24; 36 static unsigned long hash_size = 1 << 24; 37 38 enum { 39 TYPE_NONE = 0, 40 TYPE_ZIPF, 41 TYPE_PARETO, 42 }; 43 static const char *dist_types[] = { "None", "Zipf", "Pareto" }; 44 45 static int dist_type = TYPE_ZIPF; 46 static unsigned long gb_size = 500; 47 static unsigned long block_size = 4096; 48 static unsigned long output_nranges = DEF_NR_OUTPUT; 49 static double percentage; 50 static double dist_val; 51 static int output_csv = 0; 52 53 #define DEF_ZIPF_VAL 1.2 54 #define DEF_PARETO_VAL 0.3 55 56 static struct node *hash_lookup(unsigned long long val) 57 { 58 struct flist_head *l = &hash[hash_long(val, hash_bits)]; 59 struct flist_head *entry; 60 struct node *n; 61 62 flist_for_each(entry, l) { 63 n = flist_entry(entry, struct node, list); 64 if (n->val == val) 65 return n; 66 } 67 68 return NULL; 69 } 70 71 static struct node *hash_insert(struct node *n, unsigned long long val) 72 { 73 struct flist_head *l = &hash[hash_long(val, hash_bits)]; 74 75 n->val = val; 76 n->hits = 1; 77 flist_add_tail(&n->list, l); 78 return n; 79 } 80 81 static void usage(void) 82 { 83 printf("genzipf: test zipf/pareto values for fio input\n"); 84 printf("\t-h\tThis help screen\n"); 85 printf("\t-p\tGenerate size of data set that are hit by this percentage\n"); 86 printf("\t-t\tDistribution type (zipf or pareto)\n"); 87 printf("\t-i\tDistribution algorithm input (zipf theta or pareto power)\n"); 88 printf("\t-b\tBlock size of a given range (in bytes)\n"); 89 printf("\t-g\tSize of data set (in gigabytes)\n"); 90 printf("\t-o\tNumber of output columns\n"); 91 printf("\t-c\tOutput ranges in CSV format\n"); 92 } 93 94 static int parse_options(int argc, char *argv[]) 95 { 96 const char *optstring = "t:g:i:o:b:p:ch"; 97 int c, dist_val_set = 0; 98 99 while ((c = getopt(argc, argv, optstring)) != -1) { 100 switch (c) { 101 case 'h': 102 usage(); 103 return 1; 104 case 'p': 105 percentage = atof(optarg); 106 break; 107 case 'b': 108 block_size = strtoul(optarg, NULL, 10); 109 break; 110 case 't': 111 if (!strncmp(optarg, "zipf", 4)) 112 dist_type = TYPE_ZIPF; 113 else if (!strncmp(optarg, "pareto", 6)) 114 dist_type = TYPE_PARETO; 115 else { 116 printf("wrong dist type: %s\n", optarg); 117 return 1; 118 } 119 break; 120 case 'g': 121 gb_size = strtoul(optarg, NULL, 10); 122 break; 123 case 'i': 124 dist_val = atof(optarg); 125 dist_val_set = 1; 126 break; 127 case 'o': 128 output_nranges = strtoul(optarg, NULL, 10); 129 break; 130 case 'c': 131 output_csv = 1; 132 break; 133 default: 134 printf("bad option %c\n", c); 135 return 1; 136 } 137 } 138 139 if (dist_type == TYPE_PARETO) { 140 if ((dist_val >= 1.00 || dist_val < 0.00)) { 141 printf("pareto input must be > 0.00 and < 1.00\n"); 142 return 1; 143 } 144 if (!dist_val_set) 145 dist_val = DEF_PARETO_VAL; 146 } else if (dist_type == TYPE_ZIPF) { 147 if (dist_val == 1.0) { 148 printf("zipf input must be different than 1.0\n"); 149 return 1; 150 } 151 if (!dist_val_set) 152 dist_val = DEF_ZIPF_VAL; 153 } 154 155 return 0; 156 } 157 158 struct output_sum { 159 double output; 160 unsigned int nranges; 161 }; 162 163 static int node_cmp(const void *p1, const void *p2) 164 { 165 const struct node *n1 = p1; 166 const struct node *n2 = p2; 167 168 return n2->hits - n1->hits; 169 } 170 171 int main(int argc, char *argv[]) 172 { 173 unsigned long offset; 174 unsigned long i, j, k, nr_vals, cur_vals, interval, total_vals, nnodes; 175 unsigned long long nranges; 176 struct output_sum *output_sums; 177 struct node *nodes; 178 double perc, perc_i; 179 struct zipf_state zs; 180 181 if (parse_options(argc, argv)) 182 return 1; 183 184 if( !output_csv ) 185 printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size); 186 187 nranges = gb_size * 1024 * 1024 * 1024ULL; 188 nranges /= block_size; 189 190 if (dist_type == TYPE_ZIPF) 191 zipf_init(&zs, nranges, dist_val, 1); 192 else 193 pareto_init(&zs, nranges, dist_val, 1); 194 195 hash_bits = 0; 196 hash_size = nranges; 197 while ((hash_size >>= 1) != 0) 198 hash_bits++; 199 200 hash_size = 1 << hash_bits; 201 202 hash = malloc(hash_size * sizeof(struct flist_head)); 203 for (i = 0; i < hash_size; i++) 204 INIT_FLIST_HEAD(&hash[i]); 205 206 nodes = malloc(nranges * sizeof(struct node)); 207 208 for (nr_vals = i = j = 0; i < nranges; i++) { 209 struct node *n; 210 211 if (dist_type == TYPE_ZIPF) 212 offset = zipf_next(&zs); 213 else 214 offset = pareto_next(&zs); 215 216 n = hash_lookup(offset); 217 if (n) 218 n->hits++; 219 else { 220 hash_insert(&nodes[j], offset); 221 j++; 222 } 223 224 nr_vals++; 225 } 226 227 qsort(nodes, j, sizeof(struct node), node_cmp); 228 nnodes = j; 229 nr_vals = nnodes; 230 231 if (output_csv) { 232 printf("rank, count\n"); 233 for (k = 0; k < nnodes; k++) 234 printf("%lu, %lu\n", k, nodes[k].hits); 235 } else { 236 interval = (nr_vals + output_nranges - 1) / output_nranges; 237 238 output_sums = malloc(output_nranges * sizeof(struct output_sum)); 239 for (i = 0; i < output_nranges; i++) { 240 output_sums[i].output = 0.0; 241 output_sums[i].nranges = 1; 242 } 243 244 total_vals = i = j = cur_vals = 0; 245 246 for (k = 0; k < nnodes; k++) { 247 struct output_sum *os = &output_sums[j]; 248 struct node *node = &nodes[k]; 249 250 if (i >= interval) { 251 os->output = 252 (double)(cur_vals + 1) / (double)nranges; 253 os->output *= 100.0; 254 j++; 255 cur_vals = node->hits; 256 interval += 257 (nr_vals + output_nranges - 258 1) / output_nranges; 259 } else { 260 cur_vals += node->hits; 261 os->nranges += node->hits; 262 } 263 264 i++; 265 total_vals += node->hits; 266 267 if (percentage) { 268 unsigned long blocks = 269 percentage * nranges / 100; 270 271 if (total_vals >= blocks) { 272 double cs = 273 i * block_size / (1024 * 1024); 274 char p = 'M'; 275 276 if (cs > 1024.0) { 277 cs /= 1024.0; 278 p = 'G'; 279 } 280 if (cs > 1024.0) { 281 cs /= 1024.0; 282 p = 'T'; 283 } 284 285 printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p); 286 percentage = 0.0; 287 } 288 } 289 } 290 291 perc_i = 100.0 / (double)output_nranges; 292 perc = 0.0; 293 294 printf("\n Rows Hits No Hits Size\n"); 295 printf("--------------------------------------------------------\n"); 296 for (i = 0; i < j; i++) { 297 struct output_sum *os = &output_sums[i]; 298 double gb = (double)os->nranges * block_size / 1024.0; 299 char p = 'K'; 300 301 if (gb > 1024.0) { 302 p = 'M'; 303 gb /= 1024.0; 304 } 305 if (gb > 1024.0) { 306 p = 'G'; 307 gb /= 1024.0; 308 } 309 310 perc += perc_i; 311 printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", 312 i ? "|->" : "Top", perc, os->output, os->nranges, 313 gb, p); 314 } 315 316 free(output_sums); 317 } 318 319 free(hash); 320 free(nodes); 321 return 0; 322 } 323