1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 1999-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************/ 8 9 10 /*------------------------------------------------------------------------------ 11 * 12 * UCommonData An abstract interface for dealing with ICU Common Data Files. 13 * ICU Common Data Files are a grouping of a number of individual 14 * data items (resources, converters, tables, anything) into a 15 * single file or dll. The combined format includes a table of 16 * contents for locating the individual items by name. 17 * 18 * Two formats for the table of contents are supported, which is 19 * why there is an abstract inteface involved. 20 * 21 */ 22 23 #include "unicode/utypes.h" 24 #include "unicode/udata.h" 25 #include "cstring.h" 26 #include "ucmndata.h" 27 #include "udatamem.h" 28 29 #if defined(UDATA_DEBUG) || defined(UDATA_DEBUG_DUMP) 30 # include <stdio.h> 31 #endif 32 33 U_CFUNC uint16_t 34 udata_getHeaderSize(const DataHeader *udh) { 35 if(udh==NULL) { 36 return 0; 37 } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) { 38 /* same endianness */ 39 return udh->dataHeader.headerSize; 40 } else { 41 /* opposite endianness */ 42 uint16_t x=udh->dataHeader.headerSize; 43 return (uint16_t)((x<<8)|(x>>8)); 44 } 45 } 46 47 U_CFUNC uint16_t 48 udata_getInfoSize(const UDataInfo *info) { 49 if(info==NULL) { 50 return 0; 51 } else if(info->isBigEndian==U_IS_BIG_ENDIAN) { 52 /* same endianness */ 53 return info->size; 54 } else { 55 /* opposite endianness */ 56 uint16_t x=info->size; 57 return (uint16_t)((x<<8)|(x>>8)); 58 } 59 } 60 61 /*-----------------------------------------------------------------------------* 62 * * 63 * Pointer TOCs. TODO: This form of table-of-contents should be removed * 64 * because DLLs must be relocated on loading to correct the * 65 * pointer values and this operation makes shared memory * 66 * mapping of the data much less likely to work. * 67 * * 68 *-----------------------------------------------------------------------------*/ 69 typedef struct { 70 const char *entryName; 71 const DataHeader *pHeader; 72 } PointerTOCEntry; 73 74 75 typedef struct { 76 uint32_t count; 77 uint32_t reserved; 78 PointerTOCEntry entry[2]; /* Actual size is from count. */ 79 } PointerTOC; 80 81 82 /* definition of OffsetTOC struct types moved to ucmndata.h */ 83 84 /*-----------------------------------------------------------------------------* 85 * * 86 * entry point lookup implementations * 87 * * 88 *-----------------------------------------------------------------------------*/ 89 90 #ifndef MIN 91 #define MIN(a,b) (((a)<(b)) ? (a) : (b)) 92 #endif 93 94 /** 95 * Compare strings where we know the shared prefix length, 96 * and advance the prefix length as we find that the strings share even more characters. 97 */ 98 static int32_t 99 strcmpAfterPrefix(const char *s1, const char *s2, int32_t *pPrefixLength) { 100 int32_t pl=*pPrefixLength; 101 int32_t cmp=0; 102 s1+=pl; 103 s2+=pl; 104 for(;;) { 105 int32_t c1=(uint8_t)*s1++; 106 int32_t c2=(uint8_t)*s2++; 107 cmp=c1-c2; 108 if(cmp!=0 || c1==0) { /* different or done */ 109 break; 110 } 111 ++pl; /* increment shared same-prefix length */ 112 } 113 *pPrefixLength=pl; 114 return cmp; 115 } 116 117 static int32_t 118 offsetTOCPrefixBinarySearch(const char *s, const char *names, 119 const UDataOffsetTOCEntry *toc, int32_t count) { 120 int32_t start=0; 121 int32_t limit=count; 122 /* 123 * Remember the shared prefix between s, start and limit, 124 * and don't compare that shared prefix again. 125 * The shared prefix should get longer as we narrow the [start, limit[ range. 126 */ 127 int32_t startPrefixLength=0; 128 int32_t limitPrefixLength=0; 129 if(count==0) { 130 return -1; 131 } 132 /* 133 * Prime the prefix lengths so that we don't keep prefixLength at 0 until 134 * both the start and limit indexes have moved. 135 * At the same time, we find if s is one of the start and (limit-1) names, 136 * and if not, exclude them from the actual binary search. 137 */ 138 if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, &startPrefixLength)) { 139 return 0; 140 } 141 ++start; 142 --limit; 143 if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, &limitPrefixLength)) { 144 return limit; 145 } 146 while(start<limit) { 147 int32_t i=(start+limit)/2; 148 int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 149 int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, &prefixLength); 150 if(cmp<0) { 151 limit=i; 152 limitPrefixLength=prefixLength; 153 } else if(cmp==0) { 154 return i; 155 } else { 156 start=i+1; 157 startPrefixLength=prefixLength; 158 } 159 } 160 return -1; 161 } 162 163 static int32_t 164 pointerTOCPrefixBinarySearch(const char *s, const PointerTOCEntry *toc, int32_t count) { 165 int32_t start=0; 166 int32_t limit=count; 167 /* 168 * Remember the shared prefix between s, start and limit, 169 * and don't compare that shared prefix again. 170 * The shared prefix should get longer as we narrow the [start, limit[ range. 171 */ 172 int32_t startPrefixLength=0; 173 int32_t limitPrefixLength=0; 174 if(count==0) { 175 return -1; 176 } 177 /* 178 * Prime the prefix lengths so that we don't keep prefixLength at 0 until 179 * both the start and limit indexes have moved. 180 * At the same time, we find if s is one of the start and (limit-1) names, 181 * and if not, exclude them from the actual binary search. 182 */ 183 if(0==strcmpAfterPrefix(s, toc[0].entryName, &startPrefixLength)) { 184 return 0; 185 } 186 ++start; 187 --limit; 188 if(0==strcmpAfterPrefix(s, toc[limit].entryName, &limitPrefixLength)) { 189 return limit; 190 } 191 while(start<limit) { 192 int32_t i=(start+limit)/2; 193 int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 194 int32_t cmp=strcmpAfterPrefix(s, toc[i].entryName, &prefixLength); 195 if(cmp<0) { 196 limit=i; 197 limitPrefixLength=prefixLength; 198 } else if(cmp==0) { 199 return i; 200 } else { 201 start=i+1; 202 startPrefixLength=prefixLength; 203 } 204 } 205 return -1; 206 } 207 208 static uint32_t offsetTOCEntryCount(const UDataMemory *pData) { 209 int32_t retVal=0; 210 const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 211 if (toc != NULL) { 212 retVal = toc->count; 213 } 214 return retVal; 215 } 216 217 static const DataHeader * 218 offsetTOCLookupFn(const UDataMemory *pData, 219 const char *tocEntryName, 220 int32_t *pLength, 221 UErrorCode *pErrorCode) { 222 const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 223 if(toc!=NULL) { 224 const char *base=(const char *)toc; 225 int32_t number, count=(int32_t)toc->count; 226 227 /* perform a binary search for the data in the common data's table of contents */ 228 #if defined (UDATA_DEBUG_DUMP) 229 /* list the contents of the TOC each time .. not recommended */ 230 for(number=0; number<count; ++number) { 231 fprintf(stderr, "\tx%d: %s\n", number, &base[toc->entry[number].nameOffset]); 232 } 233 #endif 234 number=offsetTOCPrefixBinarySearch(tocEntryName, base, toc->entry, count); 235 if(number>=0) { 236 /* found it */ 237 const UDataOffsetTOCEntry *entry=toc->entry+number; 238 #ifdef UDATA_DEBUG 239 fprintf(stderr, "%s: Found.\n", tocEntryName); 240 #endif 241 if((number+1) < count) { 242 *pLength = (int32_t)(entry[1].dataOffset - entry->dataOffset); 243 } else { 244 *pLength = -1; 245 } 246 return (const DataHeader *)(base+entry->dataOffset); 247 } else { 248 #ifdef UDATA_DEBUG 249 fprintf(stderr, "%s: Not found.\n", tocEntryName); 250 #endif 251 return NULL; 252 } 253 } else { 254 #ifdef UDATA_DEBUG 255 fprintf(stderr, "returning header\n"); 256 #endif 257 258 return pData->pHeader; 259 } 260 } 261 262 263 static uint32_t pointerTOCEntryCount(const UDataMemory *pData) { 264 const PointerTOC *toc = (PointerTOC *)pData->toc; 265 return (uint32_t)((toc != NULL) ? (toc->count) : 0); 266 } 267 268 269 static const DataHeader *pointerTOCLookupFn(const UDataMemory *pData, 270 const char *name, 271 int32_t *pLength, 272 UErrorCode *pErrorCode) { 273 if(pData->toc!=NULL) { 274 const PointerTOC *toc = (PointerTOC *)pData->toc; 275 int32_t number, count=(int32_t)toc->count; 276 277 #if defined (UDATA_DEBUG_DUMP) 278 /* list the contents of the TOC each time .. not recommended */ 279 for(number=0; number<count; ++number) { 280 fprintf(stderr, "\tx%d: %s\n", number, toc->entry[number].entryName); 281 } 282 #endif 283 number=pointerTOCPrefixBinarySearch(name, toc->entry, count); 284 if(number>=0) { 285 /* found it */ 286 #ifdef UDATA_DEBUG 287 fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName); 288 #endif 289 *pLength=-1; 290 return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader); 291 } else { 292 #ifdef UDATA_DEBUG 293 fprintf(stderr, "%s: Not found.\n", name); 294 #endif 295 return NULL; 296 } 297 } else { 298 return pData->pHeader; 299 } 300 } 301 302 static const commonDataFuncs CmnDFuncs = {offsetTOCLookupFn, offsetTOCEntryCount}; 303 static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCount}; 304 305 306 307 /*----------------------------------------------------------------------* 308 * * 309 * checkCommonData Validate the format of a common data file. * 310 * Fill in the virtual function ptr based on TOC type * 311 * If the data is invalid, close the UDataMemory * 312 * and set the appropriate error code. * 313 * * 314 *----------------------------------------------------------------------*/ 315 U_CFUNC void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) { 316 if (U_FAILURE(*err)) { 317 return; 318 } 319 320 if(udm==NULL || udm->pHeader==NULL) { 321 *err=U_INVALID_FORMAT_ERROR; 322 } else if(!(udm->pHeader->dataHeader.magic1==0xda && 323 udm->pHeader->dataHeader.magic2==0x27 && 324 udm->pHeader->info.isBigEndian==U_IS_BIG_ENDIAN && 325 udm->pHeader->info.charsetFamily==U_CHARSET_FAMILY) 326 ) { 327 /* header not valid */ 328 *err=U_INVALID_FORMAT_ERROR; 329 } 330 else if (udm->pHeader->info.dataFormat[0]==0x43 && 331 udm->pHeader->info.dataFormat[1]==0x6d && 332 udm->pHeader->info.dataFormat[2]==0x6e && 333 udm->pHeader->info.dataFormat[3]==0x44 && 334 udm->pHeader->info.formatVersion[0]==1 335 ) { 336 /* dataFormat="CmnD" */ 337 udm->vFuncs = &CmnDFuncs; 338 udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 339 } 340 else if(udm->pHeader->info.dataFormat[0]==0x54 && 341 udm->pHeader->info.dataFormat[1]==0x6f && 342 udm->pHeader->info.dataFormat[2]==0x43 && 343 udm->pHeader->info.dataFormat[3]==0x50 && 344 udm->pHeader->info.formatVersion[0]==1 345 ) { 346 /* dataFormat="ToCP" */ 347 udm->vFuncs = &ToCPFuncs; 348 udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 349 } 350 else { 351 /* dataFormat not recognized */ 352 *err=U_INVALID_FORMAT_ERROR; 353 } 354 355 if (U_FAILURE(*err)) { 356 /* If the data is no good and we memory-mapped it ourselves, 357 * close the memory mapping so it doesn't leak. Note that this has 358 * no effect on non-memory mapped data, other than clearing fields in udm. 359 */ 360 udata_close(udm); 361 } 362 } 363 364 /* 365 * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package 366 * header but not its sub-items. 367 * This function will be needed for automatic runtime swapping. 368 * Sub-items should not be swapped to limit the swapping to the parts of the 369 * package that are actually used. 370 * 371 * Since lengths of items are implicit in the order and offsets of their 372 * ToC entries, and since offsets are relative to the start of the ToC, 373 * a swapped version may need to generate a different data structure 374 * with pointers to the original data items and with their lengths 375 * (-1 for the last one if it is not known), and maybe even pointers to the 376 * swapped versions of the items. 377 * These pointers to swapped versions would establish a cache; 378 * instead, each open data item could simply own the storage for its swapped 379 * data. This fits better with the current design. 380 * 381 * markus 2003sep18 Jitterbug 2235 382 */ 383