Home | History | Annotate | Download | only in spellchecker
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Defines an iterator class that enumerates words supported by our spellchecker
      6 // from multi-language text. This class is used for filtering out characters
      7 // not supported by our spellchecker.
      8 
      9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
     10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
     11 
     12 #include <string>
     13 
     14 #include "base/basictypes.h"
     15 #include "base/memory/scoped_ptr.h"
     16 #include "base/strings/string16.h"
     17 #include "third_party/icu/source/common/unicode/uscript.h"
     18 
     19 namespace base {
     20 namespace i18n {
     21 class BreakIterator;  // Forward declaration; used only as a scoped_ptr<> target.
     22 } // namespace i18n
     23 } // namespace base
     24 
     25 // A class which encapsulates the language-specific operations used by
     26 // SpellcheckWordIterator. When we set the spellchecker language, this class
     27 // creates rule-sets that filter out the characters not supported by the
     28 // spellchecker. (Please read the comment on the SpellcheckWordIterator class
     29 // for an example of how to use this class.)
     30 class SpellcheckCharAttribute {
     31  public:
     32   SpellcheckCharAttribute();
     33   ~SpellcheckCharAttribute();
     34 
     35   // Sets the language of the spellchecker. When this function is called with an
     36   // ISO language code, it creates the custom rule-sets used by the ICU break
     37   // iterator so that it extracts only the words used by that language.
     38   // GetRuleSet() returns the rule-sets created in this function.
     39   void SetDefaultLanguage(const std::string& language);
     40 
     41   // Returns a custom rule-set string used by the ICU break iterator. This class
     42   // has two rule-sets, chosen by 'allow_contraction': one splits a contraction
     43   // and the other does not, so we can split a concatenated word (e.g.
     44   // "seven-year-old") into words (e.g. "seven", "year", and "old") and check
     45   // their spellings. The result is UTF-16 since ICU needs UTF-16 strings.
     46   base::string16 GetRuleSet(bool allow_contraction) const;
     47 
     48   // Outputs a character only if it is a word character. (Please read the
     49   // comment on CreateRuleSets() for why we need this function.)
     50   bool OutputChar(UChar c, base::string16* output) const;
     51 
     52  private:
     53   // Creates the rule-sets that return words possibly used by the given
     54   // language. Unfortunately, these rule-sets are not perfect and have some
     55   // false-positives. For example, they return combined accent marks even though
     56   // we need English words only. We call OutputChar() to filter out such
     57   // false-positive characters.
     58   void CreateRuleSets(const std::string& language);
     59 
     60   // Output a character only if it is one used by the given language. These
     61   // functions are called from OutputChar().
     62   bool OutputArabic(UChar c, base::string16* output) const;
     63   bool OutputHangul(UChar c, base::string16* output) const;
     64   bool OutputHebrew(UChar c, base::string16* output) const;
     65   bool OutputDefault(UChar c, base::string16* output) const;
     66 
     67   // The custom rule-set strings used by the ICU break iterator. Since it is not
     68   // so easy to create custom rule-sets from an ISO language code, this class
     69   // saves the rule-set strings created when we set the language.
     70   base::string16 ruleset_allow_contraction_;
     71   base::string16 ruleset_disallow_contraction_;
     72 
     73   // The script code used by this language.
     74   UScriptCode script_code_;
     75 
     76   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
     77 };
     78 
     79 // A class which extracts words that can be checked for spelling from a
     80 // multi-language string. The ICU word-break iterator does not discard some
     81 // punctuation characters attached to a word. For example, when we set a word
     82 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
     83 // it discard characters not used by the language. For example, it returns
     84 // Russian words even though we need English words only. To extract only the
     85 // words whose spellings our spellchecker can check, this class uses custom
     86 // rule-sets created by the SpellcheckCharAttribute class. Also, this class
     87 // normalizes extracted words so our spellchecker can check the spellings of
     88 // words that include ligatures, combined characters, full-width characters,
     89 // etc. This class uses UTF-16 strings as its input and output strings since
     90 // UTF-16 is the native encoding of ICU and this avoids unnecessary conversions
     91 // when changing the encoding of this string for our spellchecker. (Chrome can
     92 // use two or more spellcheckers and we cannot assume their encodings.)
     93 // The following snippet is an example that extracts words with this class.
     94 //
     95 //   // Creates the language-specific attributes for US English.
     96 //   SpellcheckCharAttribute attribute;
     97 //   attribute.SetDefaultLanguage("en-US");
     98 //
     99 //   // Set up a SpellcheckWordIterator object which extracts English words,
    100 //   // and retrieve them.
    101 //   SpellcheckWordIterator iterator;
    102 //   base::string16 text(base::UTF8ToUTF16("this is a test."));
    103 //   iterator.Initialize(&attribute, true);
    104 //   iterator.SetText(text.c_str(), text.length());
    105 //
    106 //   base::string16 word;
    107 //   int offset;
    108 //   int length;
    109 //   while (iterator.GetNextWord(&word, &offset, &length)) {
    110 //     ...
    111 //   }
    112 //
    113 class SpellcheckWordIterator {
    114  public:
    115   SpellcheckWordIterator();
    116   ~SpellcheckWordIterator();
    117 
    118   // Initializes a word-iterator object with the language-specific attribute. If
    119   // we need to split contractions and concatenated words, call this function
    120   // with its 'allow_contraction' parameter false. (This function uses lots of
    121   // temporary memory to compile a custom word-break rule into an automaton.)
    122   bool Initialize(const SpellcheckCharAttribute* attribute,
    123                   bool allow_contraction);
    124 
    125   // Returns whether this word iterator is initialized.
    126   bool IsInitialized() const;
    127 
    128   // Sets the text to iterate. (This text does not have to be NULL-terminated.)
    129   // This function also resets internal state so we can reuse this iterator
    130   // without calling Initialize().
    131   bool SetText(const base::char16* text, size_t length);
    132 
    133   // Retrieves a word (or a contraction), stores its copy to 'word_string', and
    134   // stores the position and the length of the input word to 'word_start' and
    135   // 'word_length'. Since this function normalizes the output word, the length
    136   // of 'word_string' may differ from 'word_length'. Therefore, when we call
    137   // functions that change the input text, such as string16::replace(), we need
    138   // to use 'word_start' and 'word_length' as in the following snippet.
    139   //
    140   //   while (iterator.GetNextWord(&word, &offset, &length))
    141   //     text.replace(offset, length, word);
    142   //
    143   bool GetNextWord(base::string16* word_string,
    144                    int* word_start,
    145                    int* word_length);
    146 
    147   // Releases all the resources attached to this object.
    148   void Reset();
    149 
    150  private:
    151   // Normalizes a non-terminated string returned from an ICU word-break
    152   // iterator. A word returned from an ICU break iterator may include characters
    153   // not supported by our spellchecker, e.g. ligatures, combining characters,
    154   // full-width letters, etc. This function replaces such characters with
    155   // alternative characters supported by our spellchecker. This function also
    156   // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
    157   // characters.
    158   bool Normalize(int input_start,
    159                  int input_length,
    160                  base::string16* output_string) const;
    161 
    162   // The pointer to the input string from which we are extracting words.
    163   const base::char16* text_;  // NOTE(review): presumably not owned; confirm.
    164 
    165   // The language-specific attributes used for filtering out non-word
    166   // characters.
    167   const SpellcheckCharAttribute* attribute_;  // NOTE(review): likely not owned.
    168 
    169   // The break iterator.
    170   scoped_ptr<base::i18n::BreakIterator> iterator_;
    171 
    172   DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
    173 };
    174 
    175 #endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
    176 
    177