Home | History | Annotate | Download | only in sbcs
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ***********************************************************************
      5  * Copyright (C) 2005, International Business Machines Corporation and *
      6  * others. All Rights Reserved.                                        *
      7  ***********************************************************************
      8  *
      9  */
     10 
     11 package com.ibm.icu.dev.tool.charsetdet.sbcs;
     12 
     13 import com.ibm.icu.text.UnicodeSet;
     14 
     15 /**
     16  * @author emader
     17  *
     18  * TODO To change the template for this generated type comment go to
     19  * Window - Preferences - Java - Code Style - Code Templates
     20  */
     21 public class NGramParser
     22 {
     23 
     24     public interface NGramParserClient
     25     {
     26         char nextChar();
     27         void handleNGram(String key);
     28     }
     29 
     30     private static final int A_NULL = 0;
     31     private static final int A_ADDC = 1;
     32     private static final int A_ADDS = 2;
     33 
     34     /*
     35      * Character classes
     36      */
     37     public static final int C_IGNORE = 0;
     38     public static final int C_LETTER = 1;
     39     public static final int C_PUNCT  = 2;
     40 
     41     private static final int S_START  = 0;
     42     private static final int S_LETTER = 1;
     43     private static final int S_PUNCT  = 2;
     44 
     45     static final class StateEntry
     46     {
     47         private int newState;
     48         private int action;
     49 
     50         StateEntry(int theState, int theAction)
     51         {
     52             newState = theState;
     53             action   = theAction;
     54         }
     55 
     56         public int getNewState()
     57         {
     58             return newState;
     59         }
     60 
     61         public int getAction()
     62         {
     63             return action;
     64         }
     65     }
     66 
     67     private StateEntry[][] stateTable = {
     68             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
     69             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
     70             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
     71     };
     72 
     73     protected final int N_GRAM_SIZE = 3;
     74 
     75     private char[] letters = new char[N_GRAM_SIZE];
     76     private int letterCount;
     77 
     78     private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
     79 
     80     private NGramParserClient client;
     81 
     82     /**
     83      *
     84      */
     85     public NGramParser(NGramParserClient theClient)
     86     {
     87         client = theClient;
     88         letterCount = 0;
     89     }
     90 
     91     public void setClient(NGramParserClient theClient)
     92     {
     93         client = theClient;
     94     }
     95 
     96     // TODO Is this good enough, or are there other C_IGNORE characters?
     97     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
     98     public static int getCharClass(char ch)
     99     {
    100         if (ch == '\'' || ch == '\uFEFF') {
    101             return C_IGNORE;
    102         }
    103 
    104         if (letterSet.contains(ch)) {
    105             return C_LETTER;
    106         }
    107 
    108         return C_PUNCT;
    109     }
    110 
    111     public void reset()
    112     {
    113         letterCount = 0;
    114     }
    115 
    116     public void addLetter(char letter)
    117     {
    118         // somewhat clever stuff goes here...
    119         letters[letterCount++] = letter;
    120 
    121         if (letterCount >= N_GRAM_SIZE) {
    122             String key = new String(letters);
    123 
    124             client.handleNGram(key);
    125 
    126             letterCount = N_GRAM_SIZE - 1;
    127             for (int i = 0; i < letterCount; i += 1) {
    128                 letters[i] = letters[i + 1];
    129             }
    130         }
    131     }
    132 
    133     public void parse()
    134     {
    135         char ch;
    136         int state = 0;
    137 
    138         // this is where the clever stuff goes...
    139         while ((ch = client.nextChar()) != 0) {
    140             int charClass = getCharClass(ch);
    141             StateEntry entry = stateTable[state][charClass];
    142 
    143             state = entry.getNewState();
    144 
    145             switch (entry.getAction())
    146             {
    147             case A_ADDC:
    148                 addLetter(Character.toLowerCase(ch));
    149                 break;
    150 
    151             case A_ADDS:
    152                 addLetter(' ');
    153                 break;
    154 
    155             case A_NULL:
    156             default:
    157                 break;
    158             }
    159         }
    160 
    161         addLetter(' ');
    162     }
    163 }
    164