Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
      6 #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
      7 
      8 #include "encodings/compact_lang_det/letterscript_enum.h"
      9 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
     10 
     11 namespace getone {
     12   static const int kMaxScriptBuffer = 4096;
     13   static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
     14   static const int kMaxScriptBytes = kMaxScriptBuffer- 8;   // Leave some room
     15   static const int kMaxAnswerBuffer = 256;
     16 
     17   typedef enum UnicodeLScript ULScript;
     18 
     19   typedef struct {
     20     char* text;             // Pointer to the span, somewhere
     21     int text_bytes;         // Number of bytes of text in the span
     22     int offset;             // Offset of start of span in original input buffer
     23     ULScript script;        // Script of all the letters in this span
     24     Language lang;          // Language identified for this span
     25     bool truncated;         // true if buffer filled up before a
     26                             // different script or EOF was found
     27   } LangSpan;
     28 
     29 
     30   static inline bool IsContinuationByte(char c) {
     31     return static_cast<signed char>(c) < -64;
     32   }
     33 
     34   // Gets lscript number for letters; always returns
     35   //   0 (common script) for non-letters
     36   int GetUTF8LetterScriptNum(const char* src);
     37 
     38 
     39   // Update src pointer to point to next quadgram, +2..+5
     40   // Looks at src[0..4]
     41   const char* AdvanceQuad(const char* src);
     42 }     // end namespace getone
     43 
     44 
     45 
     46 
     47 
     48 
     49 class ScriptScanner {
     50  public:
     51   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
     52   ~ScriptScanner();
     53 
     54   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
     55   bool GetOneScriptSpan(getone::LangSpan* span);
     56 
     57   // Force Latin and Cyrillic scripts to be lowercase
     58   void LowerScriptSpan(getone::LangSpan* span);
     59 
     60   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
     61   // Force Latin and Cyrillic scripts to be lowercase
     62   bool GetOneScriptSpanLower(getone::LangSpan* span);
     63 
     64  private:
     65   int SkipToFrontOfSpan(const char* src, int len, int* script);
     66 
     67   const char* start_byte_;
     68   const char* next_byte_;
     69   const char* next_byte_limit_;
     70   int byte_length_;
     71   bool is_plain_text_;
     72   char* script_buffer_;           // Holds text with expanded entities
     73   char* script_buffer_lower_;     // Holds lowercased text
     74 };
     75 
     76 
     77 class LangScanner {
     78  public:
     79   LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
     80               getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
     81               int maxlangs, int minlangspan);
     82   ~LangScanner();
     83 
     84 
     85   int script() {return script_;}
     86 
     87   // Use new text
     88   // Keep smoothing state if same script, otherwise reinit smoothing
     89   void NewText(getone::LangSpan* spn);
     90 
     91   bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
     92   bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping
     93 
     94   // The real ones
     95   bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
     96                            getone::LangSpan* span);
     97   bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
     98                       getone::LangSpan* span);
     99 
    100   // Increases language bias by delta
    101   void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
    102                        Language key, int delta);
    103 
    104   // For debugging output
    105   int next_answer_;
    106   char answer_buffer_[getone::kMaxAnswerBuffer];
    107   char answer_buffer2_[getone::kMaxAnswerBuffer];
    108   char answer_buffer3_[getone::kMaxAnswerBuffer];
    109   char answer_buffer4_[getone::kMaxAnswerBuffer];
    110 
    111  private:
    112   const char* start_byte_;
    113   const char* next_byte_limit_;
    114   const char* next_byte_;
    115   const char* onelangspan_begin_;
    116   int byte_length_;
    117   int script_;
    118   Language spanlang_;
    119   int smoothwidth_;
    120   int smoothwidth_2_;
    121   int smoothcandidates_;
    122   int maxlangs_;
    123   int minlangspan_;
    124   int rb_size_;
    125   int next_rb_;
    126   int rb_mask_;
    127   uint32* rb_;
    128   int* offset_rb_;
    129 };
    130 
    131 #endif  // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
    132