Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2011-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ppucd.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2011dec11
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #ifndef __PPUCD_H__
     16 #define __PPUCD_H__
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/uniset.h"
     20 #include "unicode/unistr.h"
     21 
     22 #include <stdio.h>
     23 
     24 /** Additions to the uchar.h enum UProperty. */
     25 enum {
     26     /** Name_Alias */
     27     PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
     28     PPUCD_CONDITIONAL_CASE_MAPPINGS,
     29     PPUCD_TURKIC_CASE_FOLDING
     30 };
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 class U_TOOLUTIL_API PropertyNames {
     35 public:
     36     virtual ~PropertyNames();
     37     virtual int32_t getPropertyEnum(const char *name) const;
     38     virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
     39 };
     40 
     41 struct U_TOOLUTIL_API UniProps {
     42     UniProps();
     43     ~UniProps();
     44 
     45     int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
     46 
     47     UChar32 start, end;
     48     UBool binProps[UCHAR_BINARY_LIMIT];
     49     int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
     50     UVersionInfo age;
     51     UChar32 bmg;
     52     UChar32 scf, slc, stc, suc;
     53     int32_t digitValue;
     54     const char *numericValue;
     55     const char *name;
     56     const char *nameAlias;
     57     UnicodeString cf, lc, tc, uc;
     58     UnicodeSet scx;
     59 };
     60 
     61 class U_TOOLUTIL_API PreparsedUCD {
     62 public:
     63     enum LineType {
     64         /** No line, end of file. */
     65         NO_LINE,
     66         /** Empty line. (Might contain a comment.) */
     67         EMPTY_LINE,
     68 
     69         /** ucd;6.1.0 */
     70         UNICODE_VERSION_LINE,
     71 
     72         /** property;Binary;Alpha;Alphabetic */
     73         PROPERTY_LINE,
     74         /** binary;N;No;F;False */
     75         BINARY_LINE,
     76         /** value;gc;Zs;Space_Separator */
     77         VALUE_LINE,
     78 
     79         /** defaults;0000..10FFFF;age=NA;bc=L;... */
     80         DEFAULTS_LINE,
     81         /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
     82         BLOCK_LINE,
     83         /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
     84         CP_LINE,
     85 
     86         /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
     87         ALG_NAMES_RANGE_LINE,
     88 
     89         LINE_TYPE_COUNT
     90     };
     91 
     92     /**
     93      * Constructor.
     94      * Prepare this object for a new, empty package.
     95      */
     96     PreparsedUCD(const char *filename, UErrorCode &errorCode);
     97 
     98     /** Destructor. */
     99     ~PreparsedUCD();
    100 
    101     /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
    102     void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
    103 
    104     /**
    105      * Reads a line from the preparsed UCD file.
    106      * Splits the line by replacing each ';' with a NUL.
    107      */
    108     LineType readLine(UErrorCode &errorCode);
    109 
    110     /** Returns the number of the line read by readLine(). */
    111     int32_t getLineNumber() const { return lineNumber; }
    112 
    113     /** Returns the line's next field, or NULL. */
    114     const char *nextField();
    115 
    116     /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
    117     const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
    118 
    119     /** Returns TRUE if the current line has property values. */
    120     UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }
    121 
    122     /**
    123      * Parses properties from the current line.
    124      * Clears newValues and sets UProperty codes for property values mentioned
    125      * on the current line (as opposed to being inherited).
    126      * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
    127      * The returned UniProps are usable until the next line of the same type is read.
    128      */
    129     const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
    130 
    131     /**
    132      * Returns the code point range for the current algnamesrange line.
    133      * Calls & parses nextField().
    134      * Further nextField() calls will yield the range's type & prefix string.
    135      * Returns U_SUCCESS(errorCode).
    136      */
    137     UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
    138 
    139 private:
    140     UBool isLineBufferAvailable(int32_t i) {
    141         return defaultLineIndex!=i && blockLineIndex!=i;
    142     }
    143 
    144     /** Resets the field iterator and returns the line's first field (the line type field). */
    145     const char *firstField();
    146 
    147     UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
    148                         UErrorCode &errorCode);
    149     UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
    150     UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
    151     void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
    152     void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
    153 
    154     static const int32_t kNumLineBuffers=3;
    155 
    156     PropertyNames *icuPnames;  // owned
    157     const PropertyNames *pnames;  // aliased
    158     FILE *file;
    159     int32_t defaultLineIndex, blockLineIndex, lineIndex;
    160     int32_t lineNumber;
    161     LineType lineType;
    162     char *fieldLimit;
    163     char *lineLimit;
    164 
    165     UVersionInfo ucdVersion;
    166     UniProps defaultProps, blockProps, cpProps;
    167     // Multiple lines so that default and block properties can maintain pointers
    168     // into their line buffers.
    169     char lines[kNumLineBuffers][4096];
    170 };
    171 
    172 U_NAMESPACE_END
    173 
    174 #endif  // __PPUCD_H__
    175