1 #include <cstdlib> 2 #include <fstream> 3 #include <iostream> 4 #include <limits> 5 #include <string> 6 #include <utility> 7 #include <vector> 8 9 #include <marisa_alpha.h> 10 11 #include "./cmdopt.h" 12 13 namespace { 14 15 typedef std::pair<std::string, double> Key; 16 17 int param_num_tries = MARISA_ALPHA_DEFAULT_NUM_TRIES; 18 int param_trie = MARISA_ALPHA_DEFAULT_TRIE; 19 int param_tail = MARISA_ALPHA_DEFAULT_TAIL; 20 int param_order = MARISA_ALPHA_DEFAULT_ORDER; 21 const char *output_filename = NULL; 22 23 void print_help(const char *cmd) { 24 std::cerr << "Usage: " << cmd << " [OPTION]... [FILE]...\n\n" 25 "Options:\n" 26 " -n, --num-tries=[N] limits the number of tries to N" 27 " (default: 3)\n" 28 " -P, --patricia-trie build patricia tries (default)\n" 29 " -p, --prefix-trie build prefix tries\n" 30 " -T, --text-tail build a dictionary with text TAIL (default)\n" 31 " -b, --binary-tail build a dictionary with binary TAIL\n" 32 " -t, --without-tail build a dictionary without TAIL\n" 33 " -w, --weight-order arranges siblings in weight order (default)\n" 34 " -l, --label-order arranges siblings in label order\n" 35 " -o, --output=[FILE] write tries to FILE (default: stdout)\n" 36 " -h, --help print this help\n" 37 << std::endl; 38 } 39 40 void read_keys(std::istream *input, std::vector<Key> *keys) { 41 Key key; 42 std::string line; 43 while (std::getline(*input, line)) { 44 const std::string::size_type delim_pos = line.find_last_of('\t'); 45 if (delim_pos != line.npos) { 46 char *end_of_value; 47 key.second = std::strtod(&line[delim_pos + 1], &end_of_value); 48 if (*end_of_value == '\0') { 49 line.resize(delim_pos); 50 } else { 51 key.second = 1.0; 52 } 53 } else { 54 key.second = 1.0; 55 } 56 key.first = line; 57 keys->push_back(key); 58 } 59 } 60 61 int build(const char * const *args, std::size_t num_args) { 62 std::vector<Key> keys; 63 if (num_args == 0) { 64 read_keys(&std::cin, &keys); 65 } 66 67 for (std::size_t i = 0; i < num_args; ++i) { 68 std::ifstream input_file(args[i], std::ios::binary); 69 if (!input_file) { 70 std::cerr << "error: failed to open a keyset file: " 71 << args[i] << std::endl; 72 return 10; 73 } 74 read_keys(&input_file, &keys); 75 } 76 77 marisa_alpha::Trie trie; 78 try { 79 trie.build(keys, NULL, param_num_tries 80 | param_trie | param_tail | param_order); 81 } catch (const marisa_alpha::Exception &ex) { 82 std::cerr << ex.filename() << ':' << ex.line() << ": " << ex.what() 83 << ": failed to build a dictionary" << std::endl; 84 return 20; 85 } 86 87 std::cerr << "#keys: " << trie.num_keys() << std::endl; 88 std::cerr << "#tries: " << trie.num_tries() << std::endl; 89 std::cerr << "#nodes: " << trie.num_nodes() << std::endl; 90 std::cerr << "size: " << trie.total_size() << std::endl; 91 92 if (output_filename != NULL) { 93 try { 94 trie.save(output_filename); 95 } catch (const marisa_alpha::Exception &ex) { 96 std::cerr << ex.filename() << ':' << ex.line() << ": " << ex.what() 97 << ": failed to write a dictionary to file: " 98 << output_filename << std::endl; 99 return 30; 100 } 101 } else { 102 try { 103 trie.write(std::cout); 104 } catch (const marisa_alpha::Exception &ex) { 105 std::cerr << ex.filename() << ':' << ex.line() << ": " << ex.what() 106 << ": failed to write a dictionary to standard output" << std::endl; 107 return 31; 108 } 109 } 110 return 0; 111 } 112 113 } // namespace 114 115 int main(int argc, char *argv[]) { 116 std::ios::sync_with_stdio(false); 117 118 ::cmdopt_option long_options[] = { 119 { "max-num-tries", 1, NULL, 'n' }, 120 { "patricia-trie", 0, NULL, 'P' }, 121 { "prefix-trie", 0, NULL, 'p' }, 122 { "text-tail", 0, NULL, 'T' }, 123 { "binary-tail", 0, NULL, 'b' }, 124 { "without-tail", 0, NULL, 't' }, 125 { "weight-order", 0, NULL, 'w' }, 126 { "label-order", 0, NULL, 'l' }, 127 { "output", 1, NULL, 'o' }, 128 { "help", 0, NULL, 'h' }, 129 { NULL, 0, NULL, 0 } 130 }; 131 ::cmdopt_t cmdopt; 132 ::cmdopt_init(&cmdopt, argc, argv, "n:PpTbtwlo:h", long_options); 133 int label; 134 while ((label = ::cmdopt_get(&cmdopt)) != -1) { 135 switch (label) { 136 case 'n': { 137 char *end_of_value; 138 const long value = std::strtol(cmdopt.optarg, &end_of_value, 10); 139 if ((*end_of_value != '\0') || (value <= 0) || 140 (value > MARISA_ALPHA_MAX_NUM_TRIES)) { 141 std::cerr << "error: option `-n' with an invalid argument: " 142 << cmdopt.optarg << std::endl; 143 } 144 param_num_tries = (int)value; 145 break; 146 } 147 case 'P': { 148 param_trie = MARISA_ALPHA_PATRICIA_TRIE; 149 break; 150 } 151 case 'p': { 152 param_trie = MARISA_ALPHA_PREFIX_TRIE; 153 break; 154 } 155 case 'T': { 156 param_tail = MARISA_ALPHA_TEXT_TAIL; 157 break; 158 } 159 case 'b': { 160 param_tail = MARISA_ALPHA_BINARY_TAIL; 161 break; 162 } 163 case 't': { 164 param_tail = MARISA_ALPHA_WITHOUT_TAIL; 165 break; 166 } 167 case 'w': { 168 param_order = MARISA_ALPHA_WEIGHT_ORDER; 169 break; 170 } 171 case 'l': { 172 param_order = MARISA_ALPHA_LABEL_ORDER; 173 break; 174 } 175 case 'o': { 176 output_filename = cmdopt.optarg; 177 break; 178 } 179 case 'h': { 180 print_help(argv[0]); 181 return 0; 182 } 183 default: { 184 return 1; 185 } 186 } 187 } 188 return build(cmdopt.argv + cmdopt.optind, cmdopt.argc - cmdopt.optind); 189 } 190