Home | History | Annotate | Download | only in datetime
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "datetime/extractor.h"
     18 
     19 #include "util/base/logging.h"
     20 
     21 namespace libtextclassifier2 {
     22 
     23 bool DatetimeExtractor::Extract(DateParseData* result,
     24                                 CodepointSpan* result_span) const {
     25   result->field_set_mask = 0;
     26   *result_span = {kInvalidIndex, kInvalidIndex};
     27 
     28   if (rule_.regex->groups() == nullptr) {
     29     return false;
     30   }
     31 
     32   for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
     33     UnicodeText group_text;
     34     const int group_type = rule_.regex->groups()->Get(group_id);
     35     if (group_type == DatetimeGroupType_GROUP_UNUSED) {
     36       continue;
     37     }
     38     if (!GroupTextFromMatch(group_id, &group_text)) {
     39       TC_LOG(ERROR) << "Couldn't retrieve group.";
     40       return false;
     41     }
     42     // The pattern can have a group defined in a part that was not matched,
     43     // e.g. an optional part. In this case we'll get an empty content here.
     44     if (group_text.empty()) {
     45       continue;
     46     }
     47     switch (group_type) {
     48       case DatetimeGroupType_GROUP_YEAR: {
     49         if (!ParseYear(group_text, &(result->year))) {
     50           TC_LOG(ERROR) << "Couldn't extract YEAR.";
     51           return false;
     52         }
     53         result->field_set_mask |= DateParseData::YEAR_FIELD;
     54         break;
     55       }
     56       case DatetimeGroupType_GROUP_MONTH: {
     57         if (!ParseMonth(group_text, &(result->month))) {
     58           TC_LOG(ERROR) << "Couldn't extract MONTH.";
     59           return false;
     60         }
     61         result->field_set_mask |= DateParseData::MONTH_FIELD;
     62         break;
     63       }
     64       case DatetimeGroupType_GROUP_DAY: {
     65         if (!ParseDigits(group_text, &(result->day_of_month))) {
     66           TC_LOG(ERROR) << "Couldn't extract DAY.";
     67           return false;
     68         }
     69         result->field_set_mask |= DateParseData::DAY_FIELD;
     70         break;
     71       }
     72       case DatetimeGroupType_GROUP_HOUR: {
     73         if (!ParseDigits(group_text, &(result->hour))) {
     74           TC_LOG(ERROR) << "Couldn't extract HOUR.";
     75           return false;
     76         }
     77         result->field_set_mask |= DateParseData::HOUR_FIELD;
     78         break;
     79       }
     80       case DatetimeGroupType_GROUP_MINUTE: {
     81         if (!ParseDigits(group_text, &(result->minute))) {
     82           TC_LOG(ERROR) << "Couldn't extract MINUTE.";
     83           return false;
     84         }
     85         result->field_set_mask |= DateParseData::MINUTE_FIELD;
     86         break;
     87       }
     88       case DatetimeGroupType_GROUP_SECOND: {
     89         if (!ParseDigits(group_text, &(result->second))) {
     90           TC_LOG(ERROR) << "Couldn't extract SECOND.";
     91           return false;
     92         }
     93         result->field_set_mask |= DateParseData::SECOND_FIELD;
     94         break;
     95       }
     96       case DatetimeGroupType_GROUP_AMPM: {
     97         if (!ParseAMPM(group_text, &(result->ampm))) {
     98           TC_LOG(ERROR) << "Couldn't extract AMPM.";
     99           return false;
    100         }
    101         result->field_set_mask |= DateParseData::AMPM_FIELD;
    102         break;
    103       }
    104       case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
    105         if (!ParseRelationDistance(group_text, &(result->relation_distance))) {
    106           TC_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
    107           return false;
    108         }
    109         result->field_set_mask |= DateParseData::RELATION_DISTANCE_FIELD;
    110         break;
    111       }
    112       case DatetimeGroupType_GROUP_RELATION: {
    113         if (!ParseRelation(group_text, &(result->relation))) {
    114           TC_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
    115           return false;
    116         }
    117         result->field_set_mask |= DateParseData::RELATION_FIELD;
    118         break;
    119       }
    120       case DatetimeGroupType_GROUP_RELATIONTYPE: {
    121         if (!ParseRelationType(group_text, &(result->relation_type))) {
    122           TC_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
    123           return false;
    124         }
    125         result->field_set_mask |= DateParseData::RELATION_TYPE_FIELD;
    126         break;
    127       }
    128       case DatetimeGroupType_GROUP_DUMMY1:
    129       case DatetimeGroupType_GROUP_DUMMY2:
    130         break;
    131       default:
    132         TC_LOG(INFO) << "Unknown group type.";
    133         continue;
    134     }
    135     if (!UpdateMatchSpan(group_id, result_span)) {
    136       TC_LOG(ERROR) << "Couldn't update span.";
    137       return false;
    138     }
    139   }
    140 
    141   if (result_span->first == kInvalidIndex ||
    142       result_span->second == kInvalidIndex) {
    143     *result_span = {kInvalidIndex, kInvalidIndex};
    144   }
    145 
    146   return true;
    147 }
    148 
    149 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
    150                                       int* rule_id) const {
    151   auto type_it = type_and_locale_to_rule_.find(type);
    152   if (type_it == type_and_locale_to_rule_.end()) {
    153     return false;
    154   }
    155 
    156   auto locale_it = type_it->second.find(locale_id_);
    157   if (locale_it == type_it->second.end()) {
    158     return false;
    159   }
    160   *rule_id = locale_it->second;
    161   return true;
    162 }
    163 
    164 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
    165                                     DatetimeExtractorType extractor_type,
    166                                     UnicodeText* match_result) const {
    167   int rule_id;
    168   if (!RuleIdForType(extractor_type, &rule_id)) {
    169     return false;
    170   }
    171 
    172   std::unique_ptr<UniLib::RegexMatcher> matcher =
    173       rules_[rule_id]->Matcher(input);
    174   if (!matcher) {
    175     return false;
    176   }
    177 
    178   int status;
    179   if (!matcher->Find(&status)) {
    180     return false;
    181   }
    182 
    183   if (match_result != nullptr) {
    184     *match_result = matcher->Group(&status);
    185     if (status != UniLib::RegexMatcher::kNoError) {
    186       return false;
    187     }
    188   }
    189   return true;
    190 }
    191 
    192 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
    193                                            UnicodeText* result) const {
    194   int status;
    195   *result = matcher_.Group(group_id, &status);
    196   if (status != UniLib::RegexMatcher::kNoError) {
    197     return false;
    198   }
    199   return true;
    200 }
    201 
    202 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
    203                                         CodepointSpan* span) const {
    204   int status;
    205   const int match_start = matcher_.Start(group_id, &status);
    206   if (status != UniLib::RegexMatcher::kNoError) {
    207     return false;
    208   }
    209   const int match_end = matcher_.End(group_id, &status);
    210   if (status != UniLib::RegexMatcher::kNoError) {
    211     return false;
    212   }
    213   if (span->first == kInvalidIndex || span->first > match_start) {
    214     span->first = match_start;
    215   }
    216   if (span->second == kInvalidIndex || span->second < match_end) {
    217     span->second = match_end;
    218   }
    219 
    220   return true;
    221 }
    222 
    223 template <typename T>
    224 bool DatetimeExtractor::MapInput(
    225     const UnicodeText& input,
    226     const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
    227     T* result) const {
    228   for (const auto& type_value_pair : mapping) {
    229     if (ExtractType(input, type_value_pair.first)) {
    230       *result = type_value_pair.second;
    231       return true;
    232     }
    233   }
    234   return false;
    235 }
    236 
    237 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
    238                                            int* parsed_number) const {
    239   std::vector<std::pair<int, int>> found_numbers;
    240   for (const auto& type_value_pair :
    241        std::vector<std::pair<DatetimeExtractorType, int>>{
    242            {DatetimeExtractorType_ZERO, 0},
    243            {DatetimeExtractorType_ONE, 1},
    244            {DatetimeExtractorType_TWO, 2},
    245            {DatetimeExtractorType_THREE, 3},
    246            {DatetimeExtractorType_FOUR, 4},
    247            {DatetimeExtractorType_FIVE, 5},
    248            {DatetimeExtractorType_SIX, 6},
    249            {DatetimeExtractorType_SEVEN, 7},
    250            {DatetimeExtractorType_EIGHT, 8},
    251            {DatetimeExtractorType_NINE, 9},
    252            {DatetimeExtractorType_TEN, 10},
    253            {DatetimeExtractorType_ELEVEN, 11},
    254            {DatetimeExtractorType_TWELVE, 12},
    255            {DatetimeExtractorType_THIRTEEN, 13},
    256            {DatetimeExtractorType_FOURTEEN, 14},
    257            {DatetimeExtractorType_FIFTEEN, 15},
    258            {DatetimeExtractorType_SIXTEEN, 16},
    259            {DatetimeExtractorType_SEVENTEEN, 17},
    260            {DatetimeExtractorType_EIGHTEEN, 18},
    261            {DatetimeExtractorType_NINETEEN, 19},
    262            {DatetimeExtractorType_TWENTY, 20},
    263            {DatetimeExtractorType_THIRTY, 30},
    264            {DatetimeExtractorType_FORTY, 40},
    265            {DatetimeExtractorType_FIFTY, 50},
    266            {DatetimeExtractorType_SIXTY, 60},
    267            {DatetimeExtractorType_SEVENTY, 70},
    268            {DatetimeExtractorType_EIGHTY, 80},
    269            {DatetimeExtractorType_NINETY, 90},
    270            {DatetimeExtractorType_HUNDRED, 100},
    271            {DatetimeExtractorType_THOUSAND, 1000},
    272        }) {
    273     int rule_id;
    274     if (!RuleIdForType(type_value_pair.first, &rule_id)) {
    275       return false;
    276     }
    277 
    278     std::unique_ptr<UniLib::RegexMatcher> matcher =
    279         rules_[rule_id]->Matcher(input);
    280     if (!matcher) {
    281       return false;
    282     }
    283 
    284     int status;
    285     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
    286       int span_start = matcher->Start(&status);
    287       if (status != UniLib::RegexMatcher::kNoError) {
    288         return false;
    289       }
    290       found_numbers.push_back({span_start, type_value_pair.second});
    291     }
    292   }
    293 
    294   std::sort(found_numbers.begin(), found_numbers.end(),
    295             [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
    296               return a.first < b.first;
    297             });
    298 
    299   int sum = 0;
    300   int running_value = -1;
    301   // Simple math to make sure we handle written numerical modifiers correctly
    302   // so that :="fifty one  thousand and one" maps to 51001 and not 50 1 1000 1.
    303   for (const std::pair<int, int> position_number_pair : found_numbers) {
    304     if (running_value >= 0) {
    305       if (running_value > position_number_pair.second) {
    306         sum += running_value;
    307         running_value = position_number_pair.second;
    308       } else {
    309         running_value *= position_number_pair.second;
    310       }
    311     } else {
    312       running_value = position_number_pair.second;
    313     }
    314   }
    315   sum += running_value;
    316   *parsed_number = sum;
    317   return true;
    318 }
    319 
    320 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
    321                                     int* parsed_digits) const {
    322   UnicodeText digit;
    323   if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
    324     return false;
    325   }
    326 
    327   if (!unilib_.ParseInt32(digit, parsed_digits)) {
    328     return false;
    329   }
    330   return true;
    331 }
    332 
    333 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
    334                                   int* parsed_year) const {
    335   if (!ParseDigits(input, parsed_year)) {
    336     return false;
    337   }
    338 
    339   if (*parsed_year < 100) {
    340     if (*parsed_year < 50) {
    341       *parsed_year += 2000;
    342     } else {
    343       *parsed_year += 1900;
    344     }
    345   }
    346 
    347   return true;
    348 }
    349 
    350 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
    351                                    int* parsed_month) const {
    352   if (ParseDigits(input, parsed_month)) {
    353     return true;
    354   }
    355 
    356   if (MapInput(input,
    357                {
    358                    {DatetimeExtractorType_JANUARY, 1},
    359                    {DatetimeExtractorType_FEBRUARY, 2},
    360                    {DatetimeExtractorType_MARCH, 3},
    361                    {DatetimeExtractorType_APRIL, 4},
    362                    {DatetimeExtractorType_MAY, 5},
    363                    {DatetimeExtractorType_JUNE, 6},
    364                    {DatetimeExtractorType_JULY, 7},
    365                    {DatetimeExtractorType_AUGUST, 8},
    366                    {DatetimeExtractorType_SEPTEMBER, 9},
    367                    {DatetimeExtractorType_OCTOBER, 10},
    368                    {DatetimeExtractorType_NOVEMBER, 11},
    369                    {DatetimeExtractorType_DECEMBER, 12},
    370                },
    371                parsed_month)) {
    372     return true;
    373   }
    374 
    375   return false;
    376 }
    377 
    378 bool DatetimeExtractor::ParseAMPM(const UnicodeText& input,
    379                                   int* parsed_ampm) const {
    380   return MapInput(input,
    381                   {
    382                       {DatetimeExtractorType_AM, DateParseData::AMPM::AM},
    383                       {DatetimeExtractorType_PM, DateParseData::AMPM::PM},
    384                   },
    385                   parsed_ampm);
    386 }
    387 
    388 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
    389                                               int* parsed_distance) const {
    390   if (ParseDigits(input, parsed_distance)) {
    391     return true;
    392   }
    393   if (ParseWrittenNumber(input, parsed_distance)) {
    394     return true;
    395   }
    396   return false;
    397 }
    398 
    399 bool DatetimeExtractor::ParseRelation(
    400     const UnicodeText& input, DateParseData::Relation* parsed_relation) const {
    401   return MapInput(
    402       input,
    403       {
    404           {DatetimeExtractorType_NOW, DateParseData::Relation::NOW},
    405           {DatetimeExtractorType_YESTERDAY, DateParseData::Relation::YESTERDAY},
    406           {DatetimeExtractorType_TOMORROW, DateParseData::Relation::TOMORROW},
    407           {DatetimeExtractorType_NEXT, DateParseData::Relation::NEXT},
    408           {DatetimeExtractorType_NEXT_OR_SAME,
    409            DateParseData::Relation::NEXT_OR_SAME},
    410           {DatetimeExtractorType_LAST, DateParseData::Relation::LAST},
    411           {DatetimeExtractorType_PAST, DateParseData::Relation::PAST},
    412           {DatetimeExtractorType_FUTURE, DateParseData::Relation::FUTURE},
    413       },
    414       parsed_relation);
    415 }
    416 
    417 bool DatetimeExtractor::ParseRelationType(
    418     const UnicodeText& input,
    419     DateParseData::RelationType* parsed_relation_type) const {
    420   return MapInput(
    421       input,
    422       {
    423           {DatetimeExtractorType_MONDAY, DateParseData::MONDAY},
    424           {DatetimeExtractorType_TUESDAY, DateParseData::TUESDAY},
    425           {DatetimeExtractorType_WEDNESDAY, DateParseData::WEDNESDAY},
    426           {DatetimeExtractorType_THURSDAY, DateParseData::THURSDAY},
    427           {DatetimeExtractorType_FRIDAY, DateParseData::FRIDAY},
    428           {DatetimeExtractorType_SATURDAY, DateParseData::SATURDAY},
    429           {DatetimeExtractorType_SUNDAY, DateParseData::SUNDAY},
    430           {DatetimeExtractorType_DAY, DateParseData::DAY},
    431           {DatetimeExtractorType_WEEK, DateParseData::WEEK},
    432           {DatetimeExtractorType_MONTH, DateParseData::MONTH},
    433           {DatetimeExtractorType_YEAR, DateParseData::YEAR},
    434       },
    435       parsed_relation_type);
    436 }
    437 
    438 bool DatetimeExtractor::ParseTimeUnit(const UnicodeText& input,
    439                                       int* parsed_time_unit) const {
    440   return MapInput(input,
    441                   {
    442                       {DatetimeExtractorType_DAYS, DateParseData::DAYS},
    443                       {DatetimeExtractorType_WEEKS, DateParseData::WEEKS},
    444                       {DatetimeExtractorType_MONTHS, DateParseData::MONTHS},
    445                       {DatetimeExtractorType_HOURS, DateParseData::HOURS},
    446                       {DatetimeExtractorType_MINUTES, DateParseData::MINUTES},
    447                       {DatetimeExtractorType_SECONDS, DateParseData::SECONDS},
    448                       {DatetimeExtractorType_YEARS, DateParseData::YEARS},
    449                   },
    450                   parsed_time_unit);
    451 }
    452 
    453 bool DatetimeExtractor::ParseWeekday(const UnicodeText& input,
    454                                      int* parsed_weekday) const {
    455   return MapInput(
    456       input,
    457       {
    458           {DatetimeExtractorType_MONDAY, DateParseData::MONDAY},
    459           {DatetimeExtractorType_TUESDAY, DateParseData::TUESDAY},
    460           {DatetimeExtractorType_WEDNESDAY, DateParseData::WEDNESDAY},
    461           {DatetimeExtractorType_THURSDAY, DateParseData::THURSDAY},
    462           {DatetimeExtractorType_FRIDAY, DateParseData::FRIDAY},
    463           {DatetimeExtractorType_SATURDAY, DateParseData::SATURDAY},
    464           {DatetimeExtractorType_SUNDAY, DateParseData::SUNDAY},
    465       },
    466       parsed_weekday);
    467 }
    468 
    469 }  // namespace libtextclassifier2
    470