Home | History | Annotate | Download | only in sbcs
      1 /*
      2  ***********************************************************************
      3  * Copyright (C) 2005, International Business Machines Corporation and *
      4  * others. All Rights Reserved.                                        *
      5  ***********************************************************************
      6  *
      7  */
      8 
      9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
     10 
     11 import com.ibm.icu.text.UnicodeSet;
     12 
     13 /**
     14  * @author emader
     15  *
     16  * TODO To change the template for this generated type comment go to
     17  * Window - Preferences - Java - Code Style - Code Templates
     18  */
     19 public class NGramParser
     20 {
     21 
     22     public interface NGramParserClient
     23     {
     24         char nextChar();
     25         void handleNGram(String key);
     26     }
     27 
     28     private static final int A_NULL = 0;
     29     private static final int A_ADDC = 1;
     30     private static final int A_ADDS = 2;
     31 
     32     /*
     33      * Character classes
     34      */
     35     public static final int C_IGNORE = 0;
     36     public static final int C_LETTER = 1;
     37     public static final int C_PUNCT  = 2;
     38 
     39     private static final int S_START  = 0;
     40     private static final int S_LETTER = 1;
     41     private static final int S_PUNCT  = 2;
     42 
     43     static final class StateEntry
     44     {
     45         private int newState;
     46         private int action;
     47 
     48         StateEntry(int theState, int theAction)
     49         {
     50             newState = theState;
     51             action   = theAction;
     52         }
     53 
     54         public int getNewState()
     55         {
     56             return newState;
     57         }
     58 
     59         public int getAction()
     60         {
     61             return action;
     62         }
     63     }
     64 
     65     private StateEntry[][] stateTable = {
     66             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
     67             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
     68             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
     69     };
     70 
     71     protected final int N_GRAM_SIZE = 3;
     72 
     73     private char[] letters = new char[N_GRAM_SIZE];
     74     private int letterCount;
     75 
     76     private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
     77 
     78     private NGramParserClient client;
     79 
     80     /**
     81      *
     82      */
     83     public NGramParser(NGramParserClient theClient)
     84     {
     85         client = theClient;
     86         letterCount = 0;
     87     }
     88 
     89     public void setClient(NGramParserClient theClient)
     90     {
     91         client = theClient;
     92     }
     93 
     94     // TODO Is this good enough, or are there other C_IGNORE characters?
     95     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
     96     public static int getCharClass(char ch)
     97     {
     98         if (ch == '\'' || ch == '\uFEFF') {
     99             return C_IGNORE;
    100         }
    101 
    102         if (letterSet.contains(ch)) {
    103             return C_LETTER;
    104         }
    105 
    106         return C_PUNCT;
    107     }
    108 
    109     public void reset()
    110     {
    111         letterCount = 0;
    112     }
    113 
    114     public void addLetter(char letter)
    115     {
    116         // somewhat clever stuff goes here...
    117         letters[letterCount++] = letter;
    118 
    119         if (letterCount >= N_GRAM_SIZE) {
    120             String key = new String(letters);
    121 
    122             client.handleNGram(key);
    123 
    124             letterCount = N_GRAM_SIZE - 1;
    125             for (int i = 0; i < letterCount; i += 1) {
    126                 letters[i] = letters[i + 1];
    127             }
    128         }
    129     }
    130 
    131     public void parse()
    132     {
    133         char ch;
    134         int state = 0;
    135 
    136         // this is where the clever stuff goes...
    137         while ((ch = client.nextChar()) != 0) {
    138             int charClass = getCharClass(ch);
    139             StateEntry entry = stateTable[state][charClass];
    140 
    141             state = entry.getNewState();
    142 
    143             switch (entry.getAction())
    144             {
    145             case A_ADDC:
    146                 addLetter(Character.toLowerCase(ch));
    147                 break;
    148 
    149             case A_ADDS:
    150                 addLetter(' ');
    151                 break;
    152 
    153             case A_NULL:
    154             default:
    155                 break;
    156             }
    157         }
    158 
    159         addLetter(' ');
    160     }
    161 }
    162