1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "datetime/extractor.h" 18 19 #include "util/base/logging.h" 20 21 namespace libtextclassifier2 { 22 23 bool DatetimeExtractor::Extract(DateParseData* result, 24 CodepointSpan* result_span) const { 25 result->field_set_mask = 0; 26 *result_span = {kInvalidIndex, kInvalidIndex}; 27 28 if (rule_.regex->groups() == nullptr) { 29 return false; 30 } 31 32 for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) { 33 UnicodeText group_text; 34 const int group_type = rule_.regex->groups()->Get(group_id); 35 if (group_type == DatetimeGroupType_GROUP_UNUSED) { 36 continue; 37 } 38 if (!GroupTextFromMatch(group_id, &group_text)) { 39 TC_LOG(ERROR) << "Couldn't retrieve group."; 40 return false; 41 } 42 // The pattern can have a group defined in a part that was not matched, 43 // e.g. an optional part. In this case we'll get an empty content here. 44 if (group_text.empty()) { 45 continue; 46 } 47 switch (group_type) { 48 case DatetimeGroupType_GROUP_YEAR: { 49 if (!ParseYear(group_text, &(result->year))) { 50 TC_LOG(ERROR) << "Couldn't extract YEAR."; 51 return false; 52 } 53 result->field_set_mask |= DateParseData::YEAR_FIELD; 54 break; 55 } 56 case DatetimeGroupType_GROUP_MONTH: { 57 if (!ParseMonth(group_text, &(result->month))) { 58 TC_LOG(ERROR) << "Couldn't extract MONTH."; 59 return false; 60 } 61 result->field_set_mask |= DateParseData::MONTH_FIELD; 62 break; 63 } 64 case DatetimeGroupType_GROUP_DAY: { 65 if (!ParseDigits(group_text, &(result->day_of_month))) { 66 TC_LOG(ERROR) << "Couldn't extract DAY."; 67 return false; 68 } 69 result->field_set_mask |= DateParseData::DAY_FIELD; 70 break; 71 } 72 case DatetimeGroupType_GROUP_HOUR: { 73 if (!ParseDigits(group_text, &(result->hour))) { 74 TC_LOG(ERROR) << "Couldn't extract HOUR."; 75 return false; 76 } 77 result->field_set_mask |= DateParseData::HOUR_FIELD; 78 break; 79 } 80 case DatetimeGroupType_GROUP_MINUTE: { 81 if (!ParseDigits(group_text, &(result->minute))) { 82 TC_LOG(ERROR) << "Couldn't extract MINUTE."; 83 return false; 84 } 85 result->field_set_mask |= DateParseData::MINUTE_FIELD; 86 break; 87 } 88 case DatetimeGroupType_GROUP_SECOND: { 89 if (!ParseDigits(group_text, &(result->second))) { 90 TC_LOG(ERROR) << "Couldn't extract SECOND."; 91 return false; 92 } 93 result->field_set_mask |= DateParseData::SECOND_FIELD; 94 break; 95 } 96 case DatetimeGroupType_GROUP_AMPM: { 97 if (!ParseAMPM(group_text, &(result->ampm))) { 98 TC_LOG(ERROR) << "Couldn't extract AMPM."; 99 return false; 100 } 101 result->field_set_mask |= DateParseData::AMPM_FIELD; 102 break; 103 } 104 case DatetimeGroupType_GROUP_RELATIONDISTANCE: { 105 if (!ParseRelationDistance(group_text, &(result->relation_distance))) { 106 TC_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD."; 107 return false; 108 } 109 result->field_set_mask |= DateParseData::RELATION_DISTANCE_FIELD; 110 break; 111 } 112 case DatetimeGroupType_GROUP_RELATION: { 113 if (!ParseRelation(group_text, &(result->relation))) { 114 TC_LOG(ERROR) << "Couldn't extract RELATION_FIELD."; 115 return false; 116 } 117 result->field_set_mask |= DateParseData::RELATION_FIELD; 118 break; 119 } 120 case DatetimeGroupType_GROUP_RELATIONTYPE: { 121 if (!ParseRelationType(group_text, &(result->relation_type))) { 122 TC_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD."; 123 return false; 124 } 125 result->field_set_mask |= DateParseData::RELATION_TYPE_FIELD; 126 break; 127 } 128 case DatetimeGroupType_GROUP_DUMMY1: 129 case DatetimeGroupType_GROUP_DUMMY2: 130 break; 131 default: 132 TC_LOG(INFO) << "Unknown group type."; 133 continue; 134 } 135 if (!UpdateMatchSpan(group_id, result_span)) { 136 TC_LOG(ERROR) << "Couldn't update span."; 137 return false; 138 } 139 } 140 141 if (result_span->first == kInvalidIndex || 142 result_span->second == kInvalidIndex) { 143 *result_span = {kInvalidIndex, kInvalidIndex}; 144 } 145 146 return true; 147 } 148 149 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type, 150 int* rule_id) const { 151 auto type_it = type_and_locale_to_rule_.find(type); 152 if (type_it == type_and_locale_to_rule_.end()) { 153 return false; 154 } 155 156 auto locale_it = type_it->second.find(locale_id_); 157 if (locale_it == type_it->second.end()) { 158 return false; 159 } 160 *rule_id = locale_it->second; 161 return true; 162 } 163 164 bool DatetimeExtractor::ExtractType(const UnicodeText& input, 165 DatetimeExtractorType extractor_type, 166 UnicodeText* match_result) const { 167 int rule_id; 168 if (!RuleIdForType(extractor_type, &rule_id)) { 169 return false; 170 } 171 172 std::unique_ptr<UniLib::RegexMatcher> matcher = 173 rules_[rule_id]->Matcher(input); 174 if (!matcher) { 175 return false; 176 } 177 178 int status; 179 if (!matcher->Find(&status)) { 180 return false; 181 } 182 183 if (match_result != nullptr) { 184 *match_result = matcher->Group(&status); 185 if (status != UniLib::RegexMatcher::kNoError) { 186 return false; 187 } 188 } 189 return true; 190 } 191 192 bool DatetimeExtractor::GroupTextFromMatch(int group_id, 193 UnicodeText* result) const { 194 int status; 195 *result = matcher_.Group(group_id, &status); 196 if (status != UniLib::RegexMatcher::kNoError) { 197 return false; 198 } 199 return true; 200 } 201 202 bool DatetimeExtractor::UpdateMatchSpan(int group_id, 203 CodepointSpan* span) const { 204 int status; 205 const int match_start = matcher_.Start(group_id, &status); 206 if (status != UniLib::RegexMatcher::kNoError) { 207 return false; 208 } 209 const int match_end = matcher_.End(group_id, &status); 210 if (status != UniLib::RegexMatcher::kNoError) { 211 return false; 212 } 213 if (span->first == kInvalidIndex || span->first > match_start) { 214 span->first = match_start; 215 } 216 if (span->second == kInvalidIndex || span->second < match_end) { 217 span->second = match_end; 218 } 219 220 return true; 221 } 222 223 template <typename T> 224 bool DatetimeExtractor::MapInput( 225 const UnicodeText& input, 226 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping, 227 T* result) const { 228 for (const auto& type_value_pair : mapping) { 229 if (ExtractType(input, type_value_pair.first)) { 230 *result = type_value_pair.second; 231 return true; 232 } 233 } 234 return false; 235 } 236 237 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input, 238 int* parsed_number) const { 239 std::vector<std::pair<int, int>> found_numbers; 240 for (const auto& type_value_pair : 241 std::vector<std::pair<DatetimeExtractorType, int>>{ 242 {DatetimeExtractorType_ZERO, 0}, 243 {DatetimeExtractorType_ONE, 1}, 244 {DatetimeExtractorType_TWO, 2}, 245 {DatetimeExtractorType_THREE, 3}, 246 {DatetimeExtractorType_FOUR, 4}, 247 {DatetimeExtractorType_FIVE, 5}, 248 {DatetimeExtractorType_SIX, 6}, 249 {DatetimeExtractorType_SEVEN, 7}, 250 {DatetimeExtractorType_EIGHT, 8}, 251 {DatetimeExtractorType_NINE, 9}, 252 {DatetimeExtractorType_TEN, 10}, 253 {DatetimeExtractorType_ELEVEN, 11}, 254 {DatetimeExtractorType_TWELVE, 12}, 255 {DatetimeExtractorType_THIRTEEN, 13}, 256 {DatetimeExtractorType_FOURTEEN, 14}, 257 {DatetimeExtractorType_FIFTEEN, 15}, 258 {DatetimeExtractorType_SIXTEEN, 16}, 259 {DatetimeExtractorType_SEVENTEEN, 17}, 260 {DatetimeExtractorType_EIGHTEEN, 18}, 261 {DatetimeExtractorType_NINETEEN, 19}, 262 {DatetimeExtractorType_TWENTY, 20}, 263 {DatetimeExtractorType_THIRTY, 30}, 264 {DatetimeExtractorType_FORTY, 40}, 265 {DatetimeExtractorType_FIFTY, 50}, 266 {DatetimeExtractorType_SIXTY, 60}, 267 {DatetimeExtractorType_SEVENTY, 70}, 268 {DatetimeExtractorType_EIGHTY, 80}, 269 {DatetimeExtractorType_NINETY, 90}, 270 {DatetimeExtractorType_HUNDRED, 100}, 271 {DatetimeExtractorType_THOUSAND, 1000}, 272 }) { 273 int rule_id; 274 if (!RuleIdForType(type_value_pair.first, &rule_id)) { 275 return false; 276 } 277 278 std::unique_ptr<UniLib::RegexMatcher> matcher = 279 rules_[rule_id]->Matcher(input); 280 if (!matcher) { 281 return false; 282 } 283 284 int status; 285 while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) { 286 int span_start = matcher->Start(&status); 287 if (status != UniLib::RegexMatcher::kNoError) { 288 return false; 289 } 290 found_numbers.push_back({span_start, type_value_pair.second}); 291 } 292 } 293 294 std::sort(found_numbers.begin(), found_numbers.end(), 295 [](const std::pair<int, int>& a, const std::pair<int, int>& b) { 296 return a.first < b.first; 297 }); 298 299 int sum = 0; 300 int running_value = -1; 301 // Simple math to make sure we handle written numerical modifiers correctly 302 // so that :="fifty one thousand and one" maps to 51001 and not 50 1 1000 1. 303 for (const std::pair<int, int> position_number_pair : found_numbers) { 304 if (running_value >= 0) { 305 if (running_value > position_number_pair.second) { 306 sum += running_value; 307 running_value = position_number_pair.second; 308 } else { 309 running_value *= position_number_pair.second; 310 } 311 } else { 312 running_value = position_number_pair.second; 313 } 314 } 315 sum += running_value; 316 *parsed_number = sum; 317 return true; 318 } 319 320 bool DatetimeExtractor::ParseDigits(const UnicodeText& input, 321 int* parsed_digits) const { 322 UnicodeText digit; 323 if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) { 324 return false; 325 } 326 327 if (!unilib_.ParseInt32(digit, parsed_digits)) { 328 return false; 329 } 330 return true; 331 } 332 333 bool DatetimeExtractor::ParseYear(const UnicodeText& input, 334 int* parsed_year) const { 335 if (!ParseDigits(input, parsed_year)) { 336 return false; 337 } 338 339 if (*parsed_year < 100) { 340 if (*parsed_year < 50) { 341 *parsed_year += 2000; 342 } else { 343 *parsed_year += 1900; 344 } 345 } 346 347 return true; 348 } 349 350 bool DatetimeExtractor::ParseMonth(const UnicodeText& input, 351 int* parsed_month) const { 352 if (ParseDigits(input, parsed_month)) { 353 return true; 354 } 355 356 if (MapInput(input, 357 { 358 {DatetimeExtractorType_JANUARY, 1}, 359 {DatetimeExtractorType_FEBRUARY, 2}, 360 {DatetimeExtractorType_MARCH, 3}, 361 {DatetimeExtractorType_APRIL, 4}, 362 {DatetimeExtractorType_MAY, 5}, 363 {DatetimeExtractorType_JUNE, 6}, 364 {DatetimeExtractorType_JULY, 7}, 365 {DatetimeExtractorType_AUGUST, 8}, 366 {DatetimeExtractorType_SEPTEMBER, 9}, 367 {DatetimeExtractorType_OCTOBER, 10}, 368 {DatetimeExtractorType_NOVEMBER, 11}, 369 {DatetimeExtractorType_DECEMBER, 12}, 370 }, 371 parsed_month)) { 372 return true; 373 } 374 375 return false; 376 } 377 378 bool DatetimeExtractor::ParseAMPM(const UnicodeText& input, 379 int* parsed_ampm) const { 380 return MapInput(input, 381 { 382 {DatetimeExtractorType_AM, DateParseData::AMPM::AM}, 383 {DatetimeExtractorType_PM, DateParseData::AMPM::PM}, 384 }, 385 parsed_ampm); 386 } 387 388 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input, 389 int* parsed_distance) const { 390 if (ParseDigits(input, parsed_distance)) { 391 return true; 392 } 393 if (ParseWrittenNumber(input, parsed_distance)) { 394 return true; 395 } 396 return false; 397 } 398 399 bool DatetimeExtractor::ParseRelation( 400 const UnicodeText& input, DateParseData::Relation* parsed_relation) const { 401 return MapInput( 402 input, 403 { 404 {DatetimeExtractorType_NOW, DateParseData::Relation::NOW}, 405 {DatetimeExtractorType_YESTERDAY, DateParseData::Relation::YESTERDAY}, 406 {DatetimeExtractorType_TOMORROW, DateParseData::Relation::TOMORROW}, 407 {DatetimeExtractorType_NEXT, DateParseData::Relation::NEXT}, 408 {DatetimeExtractorType_NEXT_OR_SAME, 409 DateParseData::Relation::NEXT_OR_SAME}, 410 {DatetimeExtractorType_LAST, DateParseData::Relation::LAST}, 411 {DatetimeExtractorType_PAST, DateParseData::Relation::PAST}, 412 {DatetimeExtractorType_FUTURE, DateParseData::Relation::FUTURE}, 413 }, 414 parsed_relation); 415 } 416 417 bool DatetimeExtractor::ParseRelationType( 418 const UnicodeText& input, 419 DateParseData::RelationType* parsed_relation_type) const { 420 return MapInput( 421 input, 422 { 423 {DatetimeExtractorType_MONDAY, DateParseData::MONDAY}, 424 {DatetimeExtractorType_TUESDAY, DateParseData::TUESDAY}, 425 {DatetimeExtractorType_WEDNESDAY, DateParseData::WEDNESDAY}, 426 {DatetimeExtractorType_THURSDAY, DateParseData::THURSDAY}, 427 {DatetimeExtractorType_FRIDAY, DateParseData::FRIDAY}, 428 {DatetimeExtractorType_SATURDAY, DateParseData::SATURDAY}, 429 {DatetimeExtractorType_SUNDAY, DateParseData::SUNDAY}, 430 {DatetimeExtractorType_DAY, DateParseData::DAY}, 431 {DatetimeExtractorType_WEEK, DateParseData::WEEK}, 432 {DatetimeExtractorType_MONTH, DateParseData::MONTH}, 433 {DatetimeExtractorType_YEAR, DateParseData::YEAR}, 434 }, 435 parsed_relation_type); 436 } 437 438 bool DatetimeExtractor::ParseTimeUnit(const UnicodeText& input, 439 int* parsed_time_unit) const { 440 return MapInput(input, 441 { 442 {DatetimeExtractorType_DAYS, DateParseData::DAYS}, 443 {DatetimeExtractorType_WEEKS, DateParseData::WEEKS}, 444 {DatetimeExtractorType_MONTHS, DateParseData::MONTHS}, 445 {DatetimeExtractorType_HOURS, DateParseData::HOURS}, 446 {DatetimeExtractorType_MINUTES, DateParseData::MINUTES}, 447 {DatetimeExtractorType_SECONDS, DateParseData::SECONDS}, 448 {DatetimeExtractorType_YEARS, DateParseData::YEARS}, 449 }, 450 parsed_time_unit); 451 } 452 453 bool DatetimeExtractor::ParseWeekday(const UnicodeText& input, 454 int* parsed_weekday) const { 455 return MapInput( 456 input, 457 { 458 {DatetimeExtractorType_MONDAY, DateParseData::MONDAY}, 459 {DatetimeExtractorType_TUESDAY, DateParseData::TUESDAY}, 460 {DatetimeExtractorType_WEDNESDAY, DateParseData::WEDNESDAY}, 461 {DatetimeExtractorType_THURSDAY, DateParseData::THURSDAY}, 462 {DatetimeExtractorType_FRIDAY, DateParseData::FRIDAY}, 463 {DatetimeExtractorType_SATURDAY, DateParseData::SATURDAY}, 464 {DatetimeExtractorType_SUNDAY, DateParseData::SUNDAY}, 465 }, 466 parsed_weekday); 467 } 468 469 } // namespace libtextclassifier2 470