1 /* 2 ******************************************************************************* 3 * Copyright (C) 2011-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ppucd.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2011dec11 12 * created by: Markus W. Scherer 13 */ 14 15 #ifndef __PPUCD_H__ 16 #define __PPUCD_H__ 17 18 #include "unicode/utypes.h" 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 22 #include <stdio.h> 23 24 /** Additions to the uchar.h enum UProperty. */ 25 enum { 26 /** Name_Alias */ 27 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, 28 PPUCD_CONDITIONAL_CASE_MAPPINGS, 29 PPUCD_TURKIC_CASE_FOLDING 30 }; 31 32 U_NAMESPACE_BEGIN 33 34 class U_TOOLUTIL_API PropertyNames { 35 public: 36 virtual ~PropertyNames(); 37 virtual int32_t getPropertyEnum(const char *name) const; 38 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; 39 }; 40 41 struct U_TOOLUTIL_API UniProps { 42 UniProps(); 43 ~UniProps(); 44 45 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } 46 47 UChar32 start, end; 48 UBool binProps[UCHAR_BINARY_LIMIT]; 49 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; 50 UVersionInfo age; 51 UChar32 bmg; 52 UChar32 scf, slc, stc, suc; 53 int32_t digitValue; 54 const char *numericValue; 55 const char *name; 56 const char *nameAlias; 57 UnicodeString cf, lc, tc, uc; 58 UnicodeSet scx; 59 }; 60 61 class U_TOOLUTIL_API PreparsedUCD { 62 public: 63 enum LineType { 64 /** No line, end of file. */ 65 NO_LINE, 66 /** Empty line. (Might contain a comment.) */ 67 EMPTY_LINE, 68 69 /** ucd;6.1.0 */ 70 UNICODE_VERSION_LINE, 71 72 /** property;Binary;Alpha;Alphabetic */ 73 PROPERTY_LINE, 74 /** binary;N;No;F;False */ 75 BINARY_LINE, 76 /** value;gc;Zs;Space_Separator */ 77 VALUE_LINE, 78 79 /** defaults;0000..10FFFF;age=NA;bc=L;... */ 80 DEFAULTS_LINE, 81 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ 82 BLOCK_LINE, 83 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ 84 CP_LINE, 85 86 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ 87 ALG_NAMES_RANGE_LINE, 88 89 LINE_TYPE_COUNT 90 }; 91 92 /** 93 * Constructor. 94 * Prepare this object for a new, empty package. 95 */ 96 PreparsedUCD(const char *filename, UErrorCode &errorCode); 97 98 /** Destructor. */ 99 ~PreparsedUCD(); 100 101 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ 102 void setPropertyNames(const PropertyNames *pn) { pnames=pn; } 103 104 /** 105 * Reads a line from the preparsed UCD file. 106 * Splits the line by replacing each ';' with a NUL. 107 */ 108 LineType readLine(UErrorCode &errorCode); 109 110 /** Returns the number of the line read by readLine(). */ 111 int32_t getLineNumber() const { return lineNumber; } 112 113 /** Returns the line's next field, or NULL. */ 114 const char *nextField(); 115 116 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ 117 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } 118 119 /** Returns TRUE if the current line has property values. */ 120 UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; } 121 122 /** 123 * Parses properties from the current line. 124 * Clears newValues and sets UProperty codes for property values mentioned 125 * on the current line (as opposed to being inherited). 126 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. 127 * The returned UniProps are usable until the next line of the same type is read. 128 */ 129 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); 130 131 /** 132 * Returns the code point range for the current algnamesrange line. 133 * Calls & parses nextField(). 134 * Further nextField() calls will yield the range's type & prefix string. 135 * Returns U_SUCCESS(errorCode). 136 */ 137 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); 138 139 private: 140 UBool isLineBufferAvailable(int32_t i) { 141 return defaultLineIndex!=i && blockLineIndex!=i; 142 } 143 144 /** Resets the field iterator and returns the line's first field (the line type field). */ 145 const char *firstField(); 146 147 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 148 UErrorCode &errorCode); 149 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); 150 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); 151 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); 152 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); 153 154 static const int32_t kNumLineBuffers=3; 155 156 PropertyNames *icuPnames; // owned 157 const PropertyNames *pnames; // aliased 158 FILE *file; 159 int32_t defaultLineIndex, blockLineIndex, lineIndex; 160 int32_t lineNumber; 161 LineType lineType; 162 char *fieldLimit; 163 char *lineLimit; 164 165 UVersionInfo ucdVersion; 166 UniProps defaultProps, blockProps, cpProps; 167 // Multiple lines so that default and block properties can maintain pointers 168 // into their line buffers. 169 char lines[kNumLineBuffers][4096]; 170 }; 171 172 U_NAMESPACE_END 173 174 #endif // __PPUCD_H__ 175