Home | History | Annotate | Download | only in libtextclassifier
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
#include "strip-unpaired-brackets.h"

#include <algorithm>
#include <iterator>

#include "util/base/logging.h"
#include "util/utf8/unicodetext.h"
     23 
     24 namespace libtextclassifier2 {
     25 namespace {
     26 
     27 // Returns true if given codepoint is contained in the given span in context.
     28 bool IsCodepointInSpan(const char32 codepoint,
     29                        const UnicodeText& context_unicode,
     30                        const CodepointSpan span) {
     31   auto begin_it = context_unicode.begin();
     32   std::advance(begin_it, span.first);
     33   auto end_it = context_unicode.begin();
     34   std::advance(end_it, span.second);
     35 
     36   return std::find(begin_it, end_it, codepoint) != end_it;
     37 }
     38 
     39 // Returns the first codepoint of the span.
     40 char32 FirstSpanCodepoint(const UnicodeText& context_unicode,
     41                           const CodepointSpan span) {
     42   auto it = context_unicode.begin();
     43   std::advance(it, span.first);
     44   return *it;
     45 }
     46 
     47 // Returns the last codepoint of the span.
     48 char32 LastSpanCodepoint(const UnicodeText& context_unicode,
     49                          const CodepointSpan span) {
     50   auto it = context_unicode.begin();
     51   std::advance(it, span.second - 1);
     52   return *it;
     53 }
     54 
     55 }  // namespace
     56 
     57 CodepointSpan StripUnpairedBrackets(const std::string& context,
     58                                     CodepointSpan span, const UniLib& unilib) {
     59   const UnicodeText context_unicode =
     60       UTF8ToUnicodeText(context, /*do_copy=*/false);
     61   return StripUnpairedBrackets(context_unicode, span, unilib);
     62 }
     63 
     64 // If the first or the last codepoint of the given span is a bracket, the
     65 // bracket is stripped if the span does not contain its corresponding paired
     66 // version.
     67 CodepointSpan StripUnpairedBrackets(const UnicodeText& context_unicode,
     68                                     CodepointSpan span, const UniLib& unilib) {
     69   if (context_unicode.empty() || !ValidNonEmptySpan(span)) {
     70     return span;
     71   }
     72 
     73   const char32 begin_char = FirstSpanCodepoint(context_unicode, span);
     74   const char32 paired_begin_char = unilib.GetPairedBracket(begin_char);
     75   if (paired_begin_char != begin_char) {
     76     if (!unilib.IsOpeningBracket(begin_char) ||
     77         !IsCodepointInSpan(paired_begin_char, context_unicode, span)) {
     78       ++span.first;
     79     }
     80   }
     81 
     82   if (span.first == span.second) {
     83     return span;
     84   }
     85 
     86   const char32 end_char = LastSpanCodepoint(context_unicode, span);
     87   const char32 paired_end_char = unilib.GetPairedBracket(end_char);
     88   if (paired_end_char != end_char) {
     89     if (!unilib.IsClosingBracket(end_char) ||
     90         !IsCodepointInSpan(paired_end_char, context_unicode, span)) {
     91       --span.second;
     92     }
     93   }
     94 
     95   // Should not happen, but let's make sure.
     96   if (span.first > span.second) {
     97     TC_LOG(WARNING) << "Inverse indices result: " << span.first << ", "
     98                     << span.second;
     99     span.second = span.first;
    100   }
    101 
    102   return span;
    103 }
    104 
    105 }  // namespace libtextclassifier2
    106