1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2003-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Author: Alan Liu 9 * Created: July 10 2003 10 * Since: ICU 2.8 11 ********************************************************************** 12 */ 13 #include "tzfile.h" // from Olson tzcode archive, copied to this dir 14 15 #ifdef WIN32 16 17 #include <windows.h> 18 #undef min // windows.h/STL conflict 19 #undef max // windows.h/STL conflict 20 // "identifier was truncated to 'number' characters" warning 21 #pragma warning(disable: 4786) 22 23 #else 24 25 #include <unistd.h> 26 #include <stdio.h> 27 #include <dirent.h> 28 #include <string.h> 29 #include <sys/stat.h> 30 31 #endif 32 33 #include <algorithm> 34 #include <cassert> 35 #include <ctime> 36 #include <fstream> 37 #include <iomanip> 38 #include <iostream> 39 #include <iterator> 40 #include <limits> 41 #include <map> 42 #include <set> 43 #include <sstream> 44 #include <sstream> 45 #include <stdexcept> 46 #include <string> 47 #include <vector> 48 49 #include "tz2icu.h" 50 #include "unicode/uversion.h" 51 52 using namespace std; 53 54 bool ICU44PLUS = TRUE; 55 string TZ_RESOURCE_NAME = ICU_TZ_RESOURCE; 56 57 //-------------------------------------------------------------------- 58 // Time utilities 59 //-------------------------------------------------------------------- 60 61 const int64_t SECS_PER_YEAR = 31536000; // 365 days 62 const int64_t SECS_PER_LEAP_YEAR = 31622400; // 366 days 63 const int64_t LOWEST_TIME32 = (int64_t)((int32_t)0x80000000); 64 const int64_t HIGHEST_TIME32 = (int64_t)((int32_t)0x7fffffff); 65 66 bool isLeap(int32_t y) { 67 return (y%4 == 0) && ((y%100 != 0) || (y%400 == 0)); // Gregorian 68 } 69 70 int64_t secsPerYear(int32_t y) { 71 return isLeap(y) ? SECS_PER_LEAP_YEAR : SECS_PER_YEAR; 72 } 73 74 /** 75 * Given a calendar year, return the GMT epoch seconds for midnight 76 * GMT of January 1 of that year. yearToSeconds(1970) == 0. 77 */ 78 int64_t yearToSeconds(int32_t year) { 79 // inefficient but foolproof 80 int64_t s = 0; 81 int32_t y = 1970; 82 while (y < year) { 83 s += secsPerYear(y++); 84 } 85 while (y > year) { 86 s -= secsPerYear(--y); 87 } 88 return s; 89 } 90 91 /** 92 * Given 1970 GMT epoch seconds, return the calendar year containing 93 * that time. secondsToYear(0) == 1970. 94 */ 95 int32_t secondsToYear(int64_t seconds) { 96 // inefficient but foolproof 97 int32_t y = 1970; 98 int64_t s = 0; 99 if (seconds >= 0) { 100 for (;;) { 101 s += secsPerYear(y++); 102 if (s > seconds) break; 103 } 104 --y; 105 } else { 106 for (;;) { 107 s -= secsPerYear(--y); 108 if (s <= seconds) break; 109 } 110 } 111 return y; 112 } 113 114 //-------------------------------------------------------------------- 115 // Types 116 //-------------------------------------------------------------------- 117 118 struct FinalZone; 119 struct FinalRule; 120 struct SimplifiedZoneType; 121 122 // A transition from one ZoneType to another 123 // Minimal size = 5 bytes (4+1) 124 struct Transition { 125 int64_t time; // seconds, 1970 epoch 126 int32_t type; // index into 'ZoneInfo.types' 0..255 127 Transition(int64_t _time, int32_t _type) { 128 time = _time; 129 type = _type; 130 } 131 }; 132 133 // A behavior mode (what zic calls a 'type') of a time zone. 134 // Minimal size = 6 bytes (4+1+3bits) 135 // SEE: SimplifiedZoneType 136 struct ZoneType { 137 int64_t rawoffset; // raw seconds offset from GMT 138 int64_t dstoffset; // dst seconds offset from GMT 139 140 // We don't really need any of the following, but they are 141 // retained for possible future use. See SimplifiedZoneType. 142 int32_t abbr; // index into ZoneInfo.abbrs 0..n-1 143 bool isdst; 144 bool isstd; 145 bool isgmt; 146 147 ZoneType(const SimplifiedZoneType&); // used by optimizeTypeList 148 149 ZoneType() : rawoffset(-1), dstoffset(-1), abbr(-1) {} 150 151 // A restricted equality, of just the raw and dst offset 152 bool matches(const ZoneType& other) { 153 return rawoffset == other.rawoffset && 154 dstoffset == other.dstoffset; 155 } 156 }; 157 158 // A collection of transitions from one ZoneType to another, together 159 // with a list of the ZoneTypes. A ZoneInfo object may have a long 160 // list of transitions between a smaller list of ZoneTypes. 161 // 162 // This object represents the contents of a single zic-created 163 // zoneinfo file. 164 struct ZoneInfo { 165 vector<Transition> transitions; 166 vector<ZoneType> types; 167 vector<string> abbrs; 168 169 string finalRuleID; 170 int32_t finalOffset; 171 int32_t finalYear; // -1 if none 172 173 // If this is an alias, then all other fields are meaningless, and 174 // this field will point to the "real" zone 0..n-1. 175 int32_t aliasTo; // -1 if this is a "real" zone 176 177 // If there are aliases TO this zone, then the following set will 178 // contain their index numbers (each index >= 0). 179 set<int32_t> aliases; 180 181 ZoneInfo() : finalYear(-1), aliasTo(-1) {} 182 183 void mergeFinalData(const FinalZone& fz); 184 185 void optimizeTypeList(); 186 187 // Set this zone to be an alias TO another zone. 188 void setAliasTo(int32_t index); 189 190 // Clear the list of aliases OF this zone. 191 void clearAliases(); 192 193 // Add an alias to the list of aliases OF this zone. 194 void addAlias(int32_t index); 195 196 // Is this an alias to another zone? 197 bool isAlias() const { 198 return aliasTo >= 0; 199 } 200 201 // Retrieve alias list 202 const set<int32_t>& getAliases() const { 203 return aliases; 204 } 205 206 void print(ostream& os, const string& id) const; 207 }; 208 209 void ZoneInfo::clearAliases() { 210 assert(aliasTo < 0); 211 aliases.clear(); 212 } 213 214 void ZoneInfo::addAlias(int32_t index) { 215 assert(aliasTo < 0 && index >= 0 && aliases.find(index) == aliases.end()); 216 aliases.insert(index); 217 } 218 219 void ZoneInfo::setAliasTo(int32_t index) { 220 assert(index >= 0); 221 assert(aliases.size() == 0); 222 aliasTo = index; 223 } 224 225 typedef map<string, ZoneInfo> ZoneMap; 226 227 typedef ZoneMap::const_iterator ZoneMapIter; 228 229 //-------------------------------------------------------------------- 230 // ZONEINFO 231 //-------------------------------------------------------------------- 232 233 // Global map holding all our ZoneInfo objects, indexed by id. 234 ZoneMap ZONEINFO; 235 236 //-------------------------------------------------------------------- 237 // zoneinfo file parsing 238 //-------------------------------------------------------------------- 239 240 // Read zic-coded 32-bit integer from file 241 int64_t readcoded(ifstream& file, int64_t minv=numeric_limits<int64_t>::min(), 242 int64_t maxv=numeric_limits<int64_t>::max()) { 243 unsigned char buf[4]; // must be UNSIGNED 244 int64_t val=0; 245 file.read((char*)buf, 4); 246 for(int32_t i=0,shift=24;i<4;++i,shift-=8) { 247 val |= buf[i] << shift; 248 } 249 if (val < minv || val > maxv) { 250 ostringstream os; 251 os << "coded value out-of-range: " << val << ", expected [" 252 << minv << ", " << maxv << "]"; 253 throw out_of_range(os.str()); 254 } 255 return val; 256 } 257 258 // Read zic-coded 64-bit integer from file 259 int64_t readcoded64(ifstream& file, int64_t minv=numeric_limits<int64_t>::min(), 260 int64_t maxv=numeric_limits<int64_t>::max()) { 261 unsigned char buf[8]; // must be UNSIGNED 262 int64_t val=0; 263 file.read((char*)buf, 8); 264 for(int32_t i=0,shift=56;i<8;++i,shift-=8) { 265 val |= (int64_t)buf[i] << shift; 266 } 267 if (val < minv || val > maxv) { 268 ostringstream os; 269 os << "coded value out-of-range: " << val << ", expected [" 270 << minv << ", " << maxv << "]"; 271 throw out_of_range(os.str()); 272 } 273 return val; 274 } 275 276 // Read a boolean value 277 bool readbool(ifstream& file) { 278 char c; 279 file.read(&c, 1); 280 if (c!=0 && c!=1) { 281 ostringstream os; 282 os << "boolean value out-of-range: " << (int32_t)c; 283 throw out_of_range(os.str()); 284 } 285 return (c!=0); 286 } 287 288 /** 289 * Read the zoneinfo file structure (see tzfile.h) into a ZoneInfo 290 * @param file an already-open file stream 291 */ 292 void readzoneinfo(ifstream& file, ZoneInfo& info, bool is64bitData) { 293 int32_t i; 294 295 // Check for TZ_ICU_MAGIC signature at file start. If we get a 296 // signature mismatch, it means we're trying to read a file which 297 // isn't a ICU-modified-zic-created zoneinfo file. Typically this 298 // means the user is passing in a "normal" zoneinfo directory, or 299 // a zoneinfo directory that is polluted with other files, or that 300 // the user passed in the wrong directory. 301 char buf[32]; 302 file.read(buf, 4); 303 if (strncmp(buf, TZ_ICU_MAGIC, 4) != 0) { 304 throw invalid_argument("TZ_ICU_MAGIC signature missing"); 305 } 306 // skip additional Olson byte version 307 file.read(buf, 1); 308 // if '\0', we have just one copy of data, if '2' or '3', there is additional 309 // 64 bit version at the end. 310 if(buf[0]!=0 && buf[0]!='2' && buf[0]!='3') { 311 throw invalid_argument("Bad Olson version info"); 312 } 313 314 // Read reserved bytes. The first of these will be a version byte. 315 file.read(buf, 15); 316 if (*(ICUZoneinfoVersion*)&buf != TZ_ICU_VERSION) { 317 throw invalid_argument("File version mismatch"); 318 } 319 320 // Read array sizes 321 int64_t isgmtcnt = readcoded(file, 0); 322 int64_t isdstcnt = readcoded(file, 0); 323 int64_t leapcnt = readcoded(file, 0); 324 int64_t timecnt = readcoded(file, 0); 325 int64_t typecnt = readcoded(file, 0); 326 int64_t charcnt = readcoded(file, 0); 327 328 // Confirm sizes that we assume to be equal. These assumptions 329 // are drawn from a reading of the zic source (2003a), so they 330 // should hold unless the zic source changes. 331 if (isgmtcnt != typecnt || isdstcnt != typecnt) { 332 throw invalid_argument("count mismatch between tzh_ttisgmtcnt, tzh_ttisdstcnt, tth_typecnt"); 333 } 334 335 // Used temporarily to store transition times and types. We need 336 // to do this because the times and types are stored in two 337 // separate arrays. 338 vector<int64_t> transitionTimes(timecnt, -1); // temporary 339 vector<int32_t> transitionTypes(timecnt, -1); // temporary 340 341 // Read transition times 342 for (i=0; i<timecnt; ++i) { 343 if (is64bitData) { 344 transitionTimes[i] = readcoded64(file); 345 } else { 346 transitionTimes[i] = readcoded(file); 347 } 348 } 349 350 // Read transition types 351 for (i=0; i<timecnt; ++i) { 352 unsigned char c; 353 file.read((char*) &c, 1); 354 int32_t t = (int32_t) c; 355 if (t < 0 || t >= typecnt) { 356 ostringstream os; 357 os << "illegal type: " << t << ", expected [0, " << (typecnt-1) << "]"; 358 throw out_of_range(os.str()); 359 } 360 transitionTypes[i] = t; 361 } 362 363 // Build transitions vector out of corresponding times and types. 364 bool insertInitial = false; 365 if (is64bitData && !ICU44PLUS) { 366 if (timecnt > 0) { 367 int32_t minidx = -1; 368 for (i=0; i<timecnt; ++i) { 369 if (transitionTimes[i] < LOWEST_TIME32) { 370 if (minidx == -1 || transitionTimes[i] > transitionTimes[minidx]) { 371 // Preserve the latest transition before the 32bit minimum time 372 minidx = i; 373 } 374 } else if (transitionTimes[i] > HIGHEST_TIME32) { 375 // Skipping the rest of the transition data. We cannot put such 376 // transitions into zoneinfo.res, because data is limited to singed 377 // 32bit int by the ICU resource bundle. 378 break; 379 } else { 380 info.transitions.push_back(Transition(transitionTimes[i], transitionTypes[i])); 381 } 382 } 383 384 if (minidx != -1) { 385 // If there are any transitions before the 32bit minimum time, 386 // put the type information with the 32bit minimum time 387 vector<Transition>::iterator itr = info.transitions.begin(); 388 info.transitions.insert(itr, Transition(LOWEST_TIME32, transitionTypes[minidx])); 389 } else { 390 // Otherwise, we need insert the initial type later 391 insertInitial = true; 392 } 393 } 394 } else { 395 for (i=0; i<timecnt; ++i) { 396 info.transitions.push_back(Transition(transitionTimes[i], transitionTypes[i])); 397 } 398 } 399 400 // Read types (except for the isdst and isgmt flags, which come later (why??)) 401 for (i=0; i<typecnt; ++i) { 402 ZoneType type; 403 404 type.rawoffset = readcoded(file); 405 type.dstoffset = readcoded(file); 406 type.isdst = readbool(file); 407 408 unsigned char c; 409 file.read((char*) &c, 1); 410 type.abbr = (int32_t) c; 411 412 if (type.isdst != (type.dstoffset != 0)) { 413 throw invalid_argument("isdst does not reflect dstoffset"); 414 } 415 416 info.types.push_back(type); 417 } 418 419 assert(info.types.size() == (unsigned) typecnt); 420 421 if (insertInitial) { 422 assert(timecnt > 0); 423 assert(typecnt > 0); 424 425 int32_t initialTypeIdx = -1; 426 427 // Check if the first type is not dst 428 if (info.types.at(0).dstoffset != 0) { 429 // Initial type's rawoffset is same with the rawoffset after the 430 // first transition, but no DST is observed. 431 int64_t rawoffset0 = (info.types.at(info.transitions.at(0).type)).rawoffset; 432 // Look for matching type 433 for (i=0; i<(int32_t)info.types.size(); ++i) { 434 if (info.types.at(i).rawoffset == rawoffset0 435 && info.types.at(i).dstoffset == 0) { 436 initialTypeIdx = i; 437 break; 438 } 439 } 440 } else { 441 initialTypeIdx = 0; 442 } 443 assert(initialTypeIdx >= 0); 444 // Add the initial type associated with the lowest int32 time 445 vector<Transition>::iterator itr = info.transitions.begin(); 446 info.transitions.insert(itr, Transition(LOWEST_TIME32, initialTypeIdx)); 447 } 448 449 450 // Read the abbreviation string 451 if (charcnt) { 452 // All abbreviations are concatenated together, with a 0 at 453 // the end of each abbr. 454 char* str = new char[charcnt + 8]; 455 file.read(str, charcnt); 456 457 // Split abbreviations apart into individual strings. Record 458 // offset of each abbr in a vector. 459 vector<int32_t> abbroffset; 460 char *limit=str+charcnt; 461 for (char* p=str; p<limit; ++p) { 462 char* start = p; 463 while (*p != 0) ++p; 464 info.abbrs.push_back(string(start, p-start)); 465 abbroffset.push_back(start-str); 466 } 467 468 // Remap all the abbrs. Old value is offset into concatenated 469 // raw abbr strings. New value is index into vector of 470 // strings. E.g., 0,5,10,14 => 0,1,2,3. 471 472 // Keep track of which abbreviations get used. 473 vector<bool> abbrseen(abbroffset.size(), false); 474 475 for (vector<ZoneType>::iterator it=info.types.begin(); 476 it!=info.types.end(); 477 ++it) { 478 vector<int32_t>::const_iterator x= 479 find(abbroffset.begin(), abbroffset.end(), it->abbr); 480 if (x==abbroffset.end()) { 481 // TODO: Modify code to add a new string to the end of 482 // the abbr list when a middle offset is given, e.g., 483 // "abc*def*" where * == '\0', take offset of 1 and 484 // make the array "abc", "def", "bc", and translate 1 485 // => 2. NOT CRITICAL since we don't even use the 486 // abbr at this time. 487 #if 0 488 // TODO: Re-enable this warning if we start using 489 // the Olson abbr data, or if the above TODO is completed. 490 ostringstream os; 491 os << "Warning: unusual abbr offset " << it->abbr 492 << ", expected one of"; 493 for (vector<int32_t>::const_iterator y=abbroffset.begin(); 494 y!=abbroffset.end(); ++y) { 495 os << ' ' << *y; 496 } 497 cerr << os.str() << "; using 0" << endl; 498 #endif 499 it->abbr = 0; 500 } else { 501 int32_t index = x - abbroffset.begin(); 502 it->abbr = index; 503 abbrseen[index] = true; 504 } 505 } 506 507 for (int32_t ii=0;ii<(int32_t) abbrseen.size();++ii) { 508 if (!abbrseen[ii]) { 509 cerr << "Warning: unused abbreviation: " << ii << endl; 510 } 511 } 512 } 513 514 // Read leap second info, if any. 515 // *** We discard leap second data. *** 516 for (i=0; i<leapcnt; ++i) { 517 readcoded(file); // transition time 518 readcoded(file); // total correction after above 519 } 520 521 // Read isstd flags 522 for (i=0; i<typecnt; ++i) info.types[i].isstd = readbool(file); 523 524 // Read isgmt flags 525 for (i=0; i<typecnt; ++i) info.types[i].isgmt = readbool(file); 526 } 527 528 //-------------------------------------------------------------------- 529 // Directory and file reading 530 //-------------------------------------------------------------------- 531 532 /** 533 * Process a single zoneinfo file, adding the data to ZONEINFO 534 * @param path the full path to the file, e.g., ".\zoneinfo\America\Los_Angeles" 535 * @param id the zone ID, e.g., "America/Los_Angeles" 536 */ 537 void handleFile(string path, string id) { 538 // Check for duplicate id 539 if (ZONEINFO.find(id) != ZONEINFO.end()) { 540 ostringstream os; 541 os << "duplicate zone ID: " << id; 542 throw invalid_argument(os.str()); 543 } 544 545 ifstream file(path.c_str(), ios::in | ios::binary); 546 if (!file) { 547 throw invalid_argument("can't open file"); 548 } 549 550 // eat 32bit data part 551 ZoneInfo info; 552 readzoneinfo(file, info, false); 553 554 // Check for errors 555 if (!file) { 556 throw invalid_argument("read error"); 557 } 558 559 // we only use 64bit part 560 ZoneInfo info64; 561 readzoneinfo(file, info64, true); 562 563 bool alldone = false; 564 int64_t eofPos = (int64_t) file.tellg(); 565 566 // '\n' + <envvar string> + '\n' after the 64bit version data 567 char ch = file.get(); 568 if (ch == 0x0a) { 569 bool invalidchar = false; 570 while (file.get(ch)) { 571 if (ch == 0x0a) { 572 break; 573 } 574 if (ch < 0x20) { 575 // must be printable ascii 576 invalidchar = true; 577 break; 578 } 579 } 580 if (!invalidchar) { 581 eofPos = (int64_t) file.tellg(); 582 file.seekg(0, ios::end); 583 eofPos = eofPos - (int64_t) file.tellg(); 584 if (eofPos == 0) { 585 alldone = true; 586 } 587 } 588 } 589 if (!alldone) { 590 ostringstream os; 591 os << (-eofPos) << " unprocessed bytes at end"; 592 throw invalid_argument(os.str()); 593 } 594 595 ZONEINFO[id] = info64; 596 } 597 598 /** 599 * Recursively scan the given directory, calling handleFile() for each 600 * file in the tree. The user should call with the root directory and 601 * a prefix of "". The function will call itself with non-empty 602 * prefix values. 603 */ 604 #ifdef WIN32 605 606 void scandir(string dirname, string prefix="") { 607 HANDLE hList; 608 WIN32_FIND_DATA FileData; 609 610 // Get the first file 611 hList = FindFirstFile((dirname + "\\*").c_str(), &FileData); 612 if (hList == INVALID_HANDLE_VALUE) { 613 cerr << "Error: Invalid directory: " << dirname << endl; 614 exit(1); 615 } 616 for (;;) { 617 string name(FileData.cFileName); 618 string path(dirname + "\\" + name); 619 if (FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { 620 if (name != "." && name != "..") { 621 scandir(path, prefix + name + "/"); 622 } 623 } else { 624 try { 625 string id = prefix + name; 626 handleFile(path, id); 627 } catch (const exception& e) { 628 cerr << "Error: While processing \"" << path << "\", " 629 << e.what() << endl; 630 exit(1); 631 } 632 } 633 634 if (!FindNextFile(hList, &FileData)) { 635 if (GetLastError() == ERROR_NO_MORE_FILES) { 636 break; 637 } // else...? 638 } 639 } 640 FindClose(hList); 641 } 642 643 #else 644 645 void scandir(string dir, string prefix="") { 646 DIR *dp; 647 struct dirent *dir_entry; 648 struct stat stat_info; 649 char pwd[512]; 650 vector<string> subdirs; 651 vector<string> subfiles; 652 653 if ((dp = opendir(dir.c_str())) == NULL) { 654 cerr << "Error: Invalid directory: " << dir << endl; 655 exit(1); 656 } 657 if (!getcwd(pwd, sizeof(pwd))) { 658 cerr << "Error: Directory name too long" << endl; 659 exit(1); 660 } 661 chdir(dir.c_str()); 662 while ((dir_entry = readdir(dp)) != NULL) { 663 string name = dir_entry->d_name; 664 string path = dir + "/" + name; 665 lstat(dir_entry->d_name,&stat_info); 666 if (S_ISDIR(stat_info.st_mode)) { 667 if (name != "." && name != "..") { 668 subdirs.push_back(path); 669 subdirs.push_back(prefix + name + "/"); 670 // scandir(path, prefix + name + "/"); 671 } 672 } else { 673 try { 674 string id = prefix + name; 675 subfiles.push_back(path); 676 subfiles.push_back(id); 677 // handleFile(path, id); 678 } catch (const exception& e) { 679 cerr << "Error: While processing \"" << path << "\", " 680 << e.what() << endl; 681 exit(1); 682 } 683 } 684 } 685 closedir(dp); 686 chdir(pwd); 687 688 for(int32_t i=0;i<(int32_t)subfiles.size();i+=2) { 689 try { 690 handleFile(subfiles[i], subfiles[i+1]); 691 } catch (const exception& e) { 692 cerr << "Error: While processing \"" << subfiles[i] << "\", " 693 << e.what() << endl; 694 exit(1); 695 } 696 } 697 for(int32_t i=0;i<(int32_t)subdirs.size();i+=2) { 698 scandir(subdirs[i], subdirs[i+1]); 699 } 700 } 701 702 #endif 703 704 //-------------------------------------------------------------------- 705 // Final zone and rule info 706 //-------------------------------------------------------------------- 707 708 /** 709 * Read and discard the current line. 710 */ 711 void consumeLine(istream& in) { 712 int32_t c; 713 do { 714 c = in.get(); 715 } while (c != EOF && c != '\n'); 716 } 717 718 enum { 719 DOM = 0, 720 DOWGEQ = 1, 721 DOWLEQ = 2 722 }; 723 724 const char* TIME_MODE[] = {"w", "s", "u"}; 725 726 // Allow 29 days in February because zic outputs February 29 727 // for rules like "last Sunday in February". 728 const int32_t MONTH_LEN[] = {31,29,31,30,31,30,31,31,30,31,30,31}; 729 730 const int32_t HOUR = 3600; 731 732 struct FinalZone { 733 int32_t offset; // raw offset 734 int32_t year; // takes effect for y >= year 735 string ruleid; 736 set<string> aliases; 737 FinalZone(int32_t _offset, int32_t _year, const string& _ruleid) : 738 offset(_offset), year(_year), ruleid(_ruleid) { 739 if (offset <= -16*HOUR || offset >= 16*HOUR) { 740 ostringstream os; 741 os << "Invalid input offset " << offset 742 << " for year " << year 743 << " and rule ID " << ruleid; 744 throw invalid_argument(os.str()); 745 } 746 if (year < 1900) { 747 ostringstream os; 748 os << "Invalid input year " << year 749 << " with offset " << offset 750 << " and rule ID " << ruleid; 751 throw invalid_argument(os.str()); 752 } 753 } 754 FinalZone() : offset(-1), year(-1) {} 755 void addLink(const string& alias) { 756 if (aliases.find(alias) != aliases.end()) { 757 ostringstream os; 758 os << "Duplicate alias " << alias; 759 throw invalid_argument(os.str()); 760 } 761 aliases.insert(alias); 762 } 763 }; 764 765 struct FinalRulePart { 766 int32_t mode; 767 int32_t month; 768 int32_t dom; 769 int32_t dow; 770 int32_t time; 771 int32_t offset; // dst offset, usually either 0 or 1:00 772 773 // Isstd and isgmt only have 3 valid states, corresponding to local 774 // wall time, local standard time, and GMT standard time. 775 // Here is how the isstd & isgmt flags are set by zic: 776 //| case 's': /* Standard */ 777 //| rp->r_todisstd = TRUE; 778 //| rp->r_todisgmt = FALSE; 779 //| case 'w': /* Wall */ 780 //| rp->r_todisstd = FALSE; 781 //| rp->r_todisgmt = FALSE; 782 //| case 'g': /* Greenwich */ 783 //| case 'u': /* Universal */ 784 //| case 'z': /* Zulu */ 785 //| rp->r_todisstd = TRUE; 786 //| rp->r_todisgmt = TRUE; 787 bool isstd; 788 bool isgmt; 789 790 bool isset; // used during building; later ignored 791 792 FinalRulePart() : isset(false) {} 793 void set(const string& id, 794 const string& _mode, 795 int32_t _month, 796 int32_t _dom, 797 int32_t _dow, 798 int32_t _time, 799 bool _isstd, 800 bool _isgmt, 801 int32_t _offset) { 802 if (isset) { 803 throw invalid_argument("FinalRulePart set twice"); 804 } 805 isset = true; 806 if (_mode == "DOWLEQ") { 807 mode = DOWLEQ; 808 } else if (_mode == "DOWGEQ") { 809 mode = DOWGEQ; 810 } else if (_mode == "DOM") { 811 mode = DOM; 812 } else { 813 throw invalid_argument("Unrecognized FinalRulePart mode"); 814 } 815 month = _month; 816 dom = _dom; 817 dow = _dow; 818 time = _time; 819 isstd = _isstd; 820 isgmt = _isgmt; 821 offset = _offset; 822 823 ostringstream os; 824 if (month < 0 || month >= 12) { 825 os << "Invalid input month " << month; 826 } 827 if (dom < 1 || dom > MONTH_LEN[month]) { 828 os << "Invalid input day of month " << dom; 829 } 830 if (mode != DOM && (dow < 0 || dow >= 7)) { 831 os << "Invalid input day of week " << dow; 832 } 833 if (offset < (-1 * HOUR) || offset > (2 * HOUR)) { 834 os << "Invalid input offset " << offset; 835 } 836 if (isgmt && !isstd) { 837 os << "Invalid input isgmt && !isstd"; 838 } 839 if (!os.str().empty()) { 840 os << " for rule " 841 << id 842 << _mode 843 << month << dom << dow << time 844 << isstd << isgmt 845 << offset; 846 throw invalid_argument(os.str()); 847 } 848 } 849 850 /** 851 * Return the time mode as an ICU SimpleTimeZone int from 0..2; 852 * see simpletz.h. 853 */ 854 int32_t timemode() const { 855 if (isgmt) { 856 assert(isstd); 857 return 2; // gmt standard 858 } 859 if (isstd) { 860 return 1; // local standard 861 } 862 return 0; // local wall 863 } 864 865 // The SimpleTimeZone encoding method for rules is as follows: 866 // stz_dowim stz_dow 867 // DOM: dom 0 868 // DOWGEQ: dom -(dow+1) 869 // DOWLEQ: -dom -(dow+1) 870 // E.g., to encode Mon>=7, use stz_dowim=7, stz_dow=-2 871 // to encode Mon<=7, use stz_dowim=-7, stz_dow=-2 872 // to encode 7, use stz_dowim=7, stz_dow=0 873 // Note that for this program and for SimpleTimeZone, 0==Jan, 874 // but for this program 0==Sun while for SimpleTimeZone 1==Sun. 875 876 /** 877 * Return a "dowim" param suitable for SimpleTimeZone. 878 */ 879 int32_t stz_dowim() const { 880 return (mode == DOWLEQ) ? -dom : dom; 881 } 882 883 /** 884 * Return a "dow" param suitable for SimpleTimeZone. 885 */ 886 int32_t stz_dow() const { 887 return (mode == DOM) ? 0 : -(dow+1); 888 } 889 }; 890 891 struct FinalRule { 892 FinalRulePart part[2]; 893 894 bool isset() const { 895 return part[0].isset && part[1].isset; 896 } 897 898 void print(ostream& os) const; 899 }; 900 901 map<string,FinalZone> finalZones; 902 map<string,FinalRule> finalRules; 903 904 map<string, set<string> > links; 905 map<string, string> reverseLinks; 906 907 /** 908 * Predicate used to find FinalRule objects that do not have both 909 * sub-parts set (indicating an error in the input file). 910 */ 911 bool isNotSet(const pair<const string,FinalRule>& p) { 912 return !p.second.isset(); 913 } 914 915 /** 916 * Predicate used to find FinalZone objects that do not map to a known 917 * rule (indicating an error in the input file). 918 */ 919 bool mapsToUnknownRule(const pair<const string,FinalZone>& p) { 920 return finalRules.find(p.second.ruleid) == finalRules.end(); 921 } 922 923 /** 924 * This set is used to make sure each rule in finalRules is used at 925 * least once. First we populate it with all the rules from 926 * finalRules; then we remove all the rules referred to in 927 * finaleZones. 928 */ 929 set<string> ruleIDset; 930 931 void insertRuleID(const pair<string,FinalRule>& p) { 932 ruleIDset.insert(p.first); 933 } 934 935 void eraseRuleID(const pair<string,FinalZone>& p) { 936 ruleIDset.erase(p.second.ruleid); 937 } 938 939 /** 940 * Populate finalZones and finalRules from the given istream. 941 */ 942 void readFinalZonesAndRules(istream& in) { 943 944 for (;;) { 945 string token; 946 in >> token; 947 if (in.eof() || !in) { 948 break; 949 } else if (token == "zone") { 950 // zone Africa/Cairo 7200 1995 Egypt # zone Africa/Cairo, offset 7200, year >= 1995, rule Egypt (0) 951 string id, ruleid; 952 int32_t offset, year; 953 in >> id >> offset >> year >> ruleid; 954 consumeLine(in); 955 finalZones[id] = FinalZone(offset, year, ruleid); 956 } else if (token == "rule") { 957 // rule US DOWGEQ 3 1 0 7200 0 0 3600 # 52: US, file data/northamerica, line 119, mode DOWGEQ, April, dom 1, Sunday, time 7200, isstd 0, isgmt 0, offset 3600 958 // rule US DOWLEQ 9 31 0 7200 0 0 0 # 53: US, file data/northamerica, line 114, mode DOWLEQ, October, dom 31, Sunday, time 7200, isstd 0, isgmt 0, offset 0 959 string id, mode; 960 int32_t month, dom, dow, time, offset; 961 bool isstd, isgmt; 962 in >> id >> mode >> month >> dom >> dow >> time >> isstd >> isgmt >> offset; 963 consumeLine(in); 964 FinalRule& fr = finalRules[id]; 965 int32_t p = fr.part[0].isset ? 1 : 0; 966 fr.part[p].set(id, mode, month, dom, dow, time, isstd, isgmt, offset); 967 } else if (token == "link") { 968 string fromid, toid; // fromid == "real" zone, toid == alias 969 in >> fromid >> toid; 970 // DO NOT consumeLine(in); 971 if (finalZones.find(toid) != finalZones.end()) { 972 throw invalid_argument("Bad link: `to' id is a \"real\" zone"); 973 } 974 975 links[fromid].insert(toid); 976 reverseLinks[toid] = fromid; 977 } else if (token.length() > 0 && token[0] == '#') { 978 consumeLine(in); 979 } else { 980 throw invalid_argument("Unrecognized keyword"); 981 } 982 } 983 984 if (!in.eof() && !in) { 985 throw invalid_argument("Parse failure"); 986 } 987 988 // Perform validity check: Each rule should have data for 2 parts. 989 if (count_if(finalRules.begin(), finalRules.end(), isNotSet) != 0) { 990 throw invalid_argument("One or more incomplete rule pairs"); 991 } 992 993 // Perform validity check: Each zone should map to a known rule. 994 if (count_if(finalZones.begin(), finalZones.end(), mapsToUnknownRule) != 0) { 995 throw invalid_argument("One or more zones refers to an unknown rule"); 996 } 997 998 // Perform validity check: Each rule should be referred to by a zone. 999 ruleIDset.clear(); 1000 for_each(finalRules.begin(), finalRules.end(), insertRuleID); 1001 for_each(finalZones.begin(), finalZones.end(), eraseRuleID); 1002 if (ruleIDset.size() != 0) { 1003 throw invalid_argument("Unused rules"); 1004 } 1005 } 1006 1007 //-------------------------------------------------------------------- 1008 // Resource bundle output 1009 //-------------------------------------------------------------------- 1010 1011 // SEE olsontz.h FOR RESOURCE BUNDLE DATA LAYOUT 1012 1013 void ZoneInfo::print(ostream& os, const string& id) const { 1014 // Implement compressed format #2: 1015 os << " /* " << id << " */ "; 1016 1017 if (aliasTo >= 0) { 1018 assert(aliases.size() == 0); 1019 os << ":int { " << aliasTo << " } "; // No endl - save room for comment. 1020 return; 1021 } 1022 1023 if (ICU44PLUS) { 1024 os << ":table {" << endl; 1025 } else { 1026 os << ":array {" << endl; 1027 } 1028 1029 vector<Transition>::const_iterator trn; 1030 vector<ZoneType>::const_iterator typ; 1031 1032 bool first; 1033 1034 if (ICU44PLUS) { 1035 trn = transitions.begin(); 1036 1037 // pre 32bit transitions 1038 if (trn != transitions.end() && trn->time < LOWEST_TIME32) { 1039 os << " transPre32:intvector { "; 1040 for (first = true; trn != transitions.end() && trn->time < LOWEST_TIME32; ++trn) { 1041 if (!first) { 1042 os<< ", "; 1043 } 1044 first = false; 1045 os << (int32_t)(trn->time >> 32) << ", " << (int32_t)(trn->time & 0x00000000ffffffff); 1046 } 1047 os << " }" << endl; 1048 } 1049 1050 // 32bit transtions 1051 if (trn != transitions.end() && trn->time < HIGHEST_TIME32) { 1052 os << " trans:intvector { "; 1053 for (first = true; trn != transitions.end() && trn->time < HIGHEST_TIME32; ++trn) { 1054 if (!first) { 1055 os << ", "; 1056 } 1057 first = false; 1058 os << trn->time; 1059 } 1060 os << " }" << endl; 1061 } 1062 1063 // post 32bit transitons 1064 if (trn != transitions.end()) { 1065 os << " transPost32:intvector { "; 1066 for (first = true; trn != transitions.end(); ++trn) { 1067 if (!first) { 1068 os<< ", "; 1069 } 1070 first = false; 1071 os << (int32_t)(trn->time >> 32) << ", " << (int32_t)(trn->time & 0x00000000ffffffff); 1072 } 1073 os << " }" << endl; 1074 } 1075 } else { 1076 os << " :intvector { "; 1077 for (trn = transitions.begin(), first = true; trn != transitions.end(); ++trn) { 1078 if (!first) os << ", "; 1079 first = false; 1080 os << trn->time; 1081 } 1082 os << " }" << endl; 1083 } 1084 1085 1086 first=true; 1087 if (ICU44PLUS) { 1088 os << " typeOffsets:intvector { "; 1089 } else { 1090 os << " :intvector { "; 1091 } 1092 for (typ = types.begin(); typ != types.end(); ++typ) { 1093 if (!first) os << ", "; 1094 first = false; 1095 os << typ->rawoffset << ", " << typ->dstoffset; 1096 } 1097 os << " }" << endl; 1098 1099 if (ICU44PLUS) { 1100 if (transitions.size() != 0) { 1101 os << " typeMap:bin { \"" << hex << setfill('0'); 1102 for (trn = transitions.begin(); trn != transitions.end(); ++trn) { 1103 os << setw(2) << trn->type; 1104 } 1105 os << dec << "\" }" << endl; 1106 } 1107 } else { 1108 os << " :bin { \"" << hex << setfill('0'); 1109 for (trn = transitions.begin(); trn != transitions.end(); ++trn) { 1110 os << setw(2) << trn->type; 1111 } 1112 os << dec << "\" }" << endl; 1113 } 1114 1115 // Final zone info, if any 1116 if (finalYear != -1) { 1117 if (ICU44PLUS) { 1118 os << " finalRule { \"" << finalRuleID << "\" }" << endl; 1119 os << " finalRaw:int { " << finalOffset << " }" << endl; 1120 os << " finalYear:int { " << finalYear << " }" << endl; 1121 } else { 1122 os << " \"" << finalRuleID << "\"" << endl; 1123 os << " :intvector { " << finalOffset << ", " 1124 << finalYear << " }" << endl; 1125 } 1126 } 1127 1128 // Alias list, if any 1129 if (aliases.size() != 0) { 1130 first = true; 1131 if (ICU44PLUS) { 1132 os << " links:intvector { "; 1133 } else { 1134 os << " :intvector { "; 1135 } 1136 for (set<int32_t>::const_iterator i=aliases.begin(); i!=aliases.end(); ++i) { 1137 if (!first) os << ", "; 1138 first = false; 1139 os << *i; 1140 } 1141 os << " }" << endl; 1142 } 1143 1144 os << " } "; // no trailing 'endl', so comments can be placed. 1145 } 1146 1147 inline ostream& 1148 operator<<(ostream& os, const ZoneMap& zoneinfo) { 1149 int32_t c = 0; 1150 for (ZoneMapIter it = zoneinfo.begin(); 1151 it != zoneinfo.end(); 1152 ++it) { 1153 if(c && !ICU44PLUS) os << ","; 1154 it->second.print(os, it->first); 1155 os << "//Z#" << c++ << endl; 1156 } 1157 return os; 1158 } 1159 1160 // print the string list 1161 ostream& printStringList( ostream& os, const ZoneMap& zoneinfo) { 1162 int32_t n = 0; // count 1163 int32_t col = 0; // column 1164 os << " Names {" << endl 1165 << " "; 1166 for (ZoneMapIter it = zoneinfo.begin(); 1167 it != zoneinfo.end(); 1168 ++it) { 1169 if(n) { 1170 os << ","; 1171 col ++; 1172 } 1173 const string& id = it->first; 1174 os << "\"" << id << "\""; 1175 col += id.length() + 2; 1176 if(col >= 50) { 1177 os << " // " << n << endl 1178 << " "; 1179 col = 0; 1180 } 1181 n++; 1182 } 1183 os << " // " << (n-1) << endl 1184 << " }" << endl; 1185 1186 return os; 1187 } 1188 1189 //-------------------------------------------------------------------- 1190 // main 1191 //-------------------------------------------------------------------- 1192 1193 // Unary predicate for finding transitions after a given time 1194 bool isAfter(const Transition t, int64_t thresh) { 1195 return t.time >= thresh; 1196 } 1197 1198 /** 1199 * A zone type that contains only the raw and dst offset. Used by the 1200 * optimizeTypeList() method. 1201 */ 1202 struct SimplifiedZoneType { 1203 int64_t rawoffset; 1204 int64_t dstoffset; 1205 SimplifiedZoneType() : rawoffset(-1), dstoffset(-1) {} 1206 SimplifiedZoneType(const ZoneType& t) : rawoffset(t.rawoffset), 1207 dstoffset(t.dstoffset) {} 1208 bool operator<(const SimplifiedZoneType& t) const { 1209 return rawoffset < t.rawoffset || 1210 (rawoffset == t.rawoffset && 1211 dstoffset < t.dstoffset); 1212 } 1213 }; 1214 1215 /** 1216 * Construct a ZoneType from a SimplifiedZoneType. Note that this 1217 * discards information; the new ZoneType will have meaningless 1218 * (empty) abbr, isdst, isstd, and isgmt flags; this is appropriate, 1219 * since ignoring these is how we do optimization (we have no use for 1220 * these in historical transitions). 1221 */ 1222 ZoneType::ZoneType(const SimplifiedZoneType& t) : 1223 rawoffset(t.rawoffset), dstoffset(t.dstoffset), 1224 abbr(-1), isdst(false), isstd(false), isgmt(false) {} 1225 1226 /** 1227 * Optimize the type list to remove excess entries. The type list may 1228 * contain entries that are distinct only in terms of their dst, std, 1229 * or gmt flags. Since we don't care about those flags, we can reduce 1230 * the type list to a set of unique raw/dst offset pairs, and remap 1231 * the type indices in the transition list, which stores, for each 1232 * transition, a transition time and a type index. 1233 */ 1234 void ZoneInfo::optimizeTypeList() { 1235 // Assemble set of unique types; only those in the `transitions' 1236 // list, since there may be unused types in the `types' list 1237 // corresponding to transitions that have been trimmed (during 1238 // merging of final data). 1239 1240 if (aliasTo >= 0) return; // Nothing to do for aliases 1241 1242 if (!ICU44PLUS) { 1243 // This is the old logic which has a bug, which occasionally removes 1244 // the type before the first transition. The problem was fixed 1245 // by inserting the dummy transition indirectly. 1246 1247 // If there are zero transitions and one type, then leave that as-is. 1248 if (transitions.size() == 0) { 1249 if (types.size() != 1) { 1250 cerr << "Error: transition count = 0, type count = " << types.size() << endl; 1251 } 1252 return; 1253 } 1254 1255 set<SimplifiedZoneType> simpleset; 1256 for (vector<Transition>::const_iterator i=transitions.begin(); 1257 i!=transitions.end(); ++i) { 1258 assert(i->type < (int32_t)types.size()); 1259 simpleset.insert(types[i->type]); 1260 } 1261 1262 // Map types to integer indices 1263 map<SimplifiedZoneType,int32_t> simplemap; 1264 int32_t n=0; 1265 for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); 1266 i!=simpleset.end(); ++i) { 1267 simplemap[*i] = n++; 1268 } 1269 1270 // Remap transitions 1271 for (vector<Transition>::iterator i=transitions.begin(); 1272 i!=transitions.end(); ++i) { 1273 assert(i->type < (int32_t)types.size()); 1274 ZoneType oldtype = types[i->type]; 1275 SimplifiedZoneType newtype(oldtype); 1276 assert(simplemap.find(newtype) != simplemap.end()); 1277 i->type = simplemap[newtype]; 1278 } 1279 1280 // Replace type list 1281 types.clear(); 1282 copy(simpleset.begin(), simpleset.end(), back_inserter(types)); 1283 1284 } else { 1285 if (types.size() > 1) { 1286 // Note: localtime uses the very first non-dst type as initial offsets. 1287 // If all types are DSTs, the very first type is treated as the initial offsets. 1288 1289 // Decide a type used as the initial offsets. ICU put the type at index 0. 1290 ZoneType initialType = types[0]; 1291 for (vector<ZoneType>::const_iterator i=types.begin(); i!=types.end(); ++i) { 1292 if (i->dstoffset == 0) { 1293 initialType = *i; 1294 break; 1295 } 1296 } 1297 1298 SimplifiedZoneType initialSimplifiedType(initialType); 1299 1300 // create a set of unique types, but ignoring fields which we're not interested in 1301 set<SimplifiedZoneType> simpleset; 1302 simpleset.insert(initialSimplifiedType); 1303 for (vector<Transition>::const_iterator i=transitions.begin(); i!=transitions.end(); ++i) { 1304 assert(i->type < (int32_t)types.size()); 1305 simpleset.insert(types[i->type]); 1306 } 1307 1308 // Map types to integer indices, however, keeping the first type at offset 0 1309 map<SimplifiedZoneType,int32_t> simplemap; 1310 simplemap[initialSimplifiedType] = 0; 1311 int32_t n = 1; 1312 for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); i!=simpleset.end(); ++i) { 1313 if (*i < initialSimplifiedType || initialSimplifiedType < *i) { 1314 simplemap[*i] = n++; 1315 } 1316 } 1317 1318 // Remap transitions 1319 for (vector<Transition>::iterator i=transitions.begin(); 1320 i!=transitions.end(); ++i) { 1321 assert(i->type < (int32_t)types.size()); 1322 ZoneType oldtype = types[i->type]; 1323 SimplifiedZoneType newtype(oldtype); 1324 assert(simplemap.find(newtype) != simplemap.end()); 1325 i->type = simplemap[newtype]; 1326 } 1327 1328 // Replace type list 1329 types.clear(); 1330 types.push_back(initialSimplifiedType); 1331 for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); i!=simpleset.end(); ++i) { 1332 if (*i < initialSimplifiedType || initialSimplifiedType < *i) { 1333 types.push_back(*i); 1334 } 1335 } 1336 1337 // Reiterating transitions to remove any transitions which 1338 // do not actually change the raw/dst offsets 1339 int32_t prevTypeIdx = 0; 1340 for (vector<Transition>::iterator i=transitions.begin(); i!=transitions.end();) { 1341 if (i->type == prevTypeIdx) { 1342 // this is not a time transition, probably just name change 1343 // e.g. America/Resolute after 2006 in 2010b 1344 transitions.erase(i); 1345 } else { 1346 prevTypeIdx = i->type; 1347 i++; 1348 } 1349 } 1350 } 1351 } 1352 1353 } 1354 1355 /** 1356 * Merge final zone data into this zone. 1357 */ 1358 void ZoneInfo::mergeFinalData(const FinalZone& fz) { 1359 int32_t year = fz.year; 1360 int64_t seconds = yearToSeconds(year); 1361 1362 if (!ICU44PLUS) { 1363 if (seconds > HIGHEST_TIME32) { 1364 // Avoid transitions beyond signed 32bit max second. 1365 // This may result incorrect offset computation around 1366 // HIGHEST_TIME32. This is a limitation of ICU 1367 // before 4.4. 1368 seconds = HIGHEST_TIME32; 1369 } 1370 } 1371 1372 vector<Transition>::iterator it = 1373 find_if(transitions.begin(), transitions.end(), 1374 bind2nd(ptr_fun(isAfter), seconds)); 1375 transitions.erase(it, transitions.end()); 1376 1377 if (finalYear != -1) { 1378 throw invalid_argument("Final zone already merged in"); 1379 } 1380 finalYear = fz.year; 1381 finalOffset = fz.offset; 1382 finalRuleID = fz.ruleid; 1383 } 1384 1385 /** 1386 * Merge the data from the given final zone into the core zone data by 1387 * calling the ZoneInfo member function mergeFinalData. 1388 */ 1389 void mergeOne(const string& zoneid, const FinalZone& fz) { 1390 if (ZONEINFO.find(zoneid) == ZONEINFO.end()) { 1391 throw invalid_argument("Unrecognized final zone ID"); 1392 } 1393 ZONEINFO[zoneid].mergeFinalData(fz); 1394 } 1395 1396 /** 1397 * Visitor function that merges the final zone data into the main zone 1398 * data structures. It calls mergeOne for each final zone and its 1399 * list of aliases. 1400 */ 1401 void mergeFinalZone(const pair<string,FinalZone>& p) { 1402 const string& id = p.first; 1403 const FinalZone& fz = p.second; 1404 1405 mergeOne(id, fz); 1406 } 1407 1408 /** 1409 * Print this rule in resource bundle format to os. ID and enclosing 1410 * braces handled elsewhere. 1411 */ 1412 void FinalRule::print(ostream& os) const { 1413 // First print the rule part that enters DST; then the rule part 1414 // that exits it. 1415 int32_t whichpart = (part[0].offset != 0) ? 0 : 1; 1416 assert(part[whichpart].offset != 0); 1417 assert(part[1-whichpart].offset == 0); 1418 1419 os << " "; 1420 for (int32_t i=0; i<2; ++i) { 1421 const FinalRulePart& p = part[whichpart]; 1422 whichpart = 1-whichpart; 1423 os << p.month << ", " << p.stz_dowim() << ", " << p.stz_dow() << ", " 1424 << p.time << ", " << p.timemode() << ", "; 1425 } 1426 os << part[whichpart].offset << endl; 1427 } 1428 1429 #define ICU_ZONE_OVERRIDE_SUFFIX "--ICU" 1430 #define ICU_ZONE_OVERRIDE_SUFFIX_LEN 5 1431 1432 int main(int argc, char *argv[]) { 1433 string rootpath, zonetab, version; 1434 bool validArgs = FALSE; 1435 1436 if (argc == 4 || argc == 5) { 1437 validArgs = TRUE; 1438 rootpath = argv[1]; 1439 zonetab = argv[2]; 1440 version = argv[3]; 1441 if (argc == 5) { 1442 if (strcmp(argv[4], "--old") == 0) { 1443 ICU44PLUS = FALSE; 1444 TZ_RESOURCE_NAME = ICU_TZ_RESOURCE_OLD; 1445 } else { 1446 validArgs = FALSE; 1447 } 1448 } 1449 } 1450 if (!validArgs) { 1451 cout << "Usage: tz2icu <dir> <cmap> <tzver> [--old]" << endl 1452 << " <dir> path to zoneinfo file tree generated by" << endl 1453 << " ICU-patched version of zic" << endl 1454 << " <cmap> country map, from tzdata archive," << endl 1455 << " typically named \"zone.tab\"" << endl 1456 << " <tzver> version string, such as \"2003e\"" << endl 1457 << " --old generating resource format before ICU4.4" << endl; 1458 exit(1); 1459 } 1460 1461 cout << "Olson data version: " << version << endl; 1462 cout << "ICU 4.4+ format: " << (ICU44PLUS ? "Yes" : "No") << endl; 1463 1464 try { 1465 ifstream finals(ICU_ZONE_FILE); 1466 if (finals) { 1467 readFinalZonesAndRules(finals); 1468 1469 cout << "Finished reading " << finalZones.size() 1470 << " final zones and " << finalRules.size() 1471 << " final rules from " ICU_ZONE_FILE << endl; 1472 } else { 1473 cerr << "Error: Unable to open " ICU_ZONE_FILE << endl; 1474 return 1; 1475 } 1476 } catch (const exception& error) { 1477 cerr << "Error: While reading " ICU_ZONE_FILE ": " << error.what() << endl; 1478 return 1; 1479 } 1480 1481 try { 1482 // Recursively scan all files below the given path, accumulating 1483 // their data into ZONEINFO. All files must be TZif files. Any 1484 // failure along the way will result in a call to exit(1). 1485 scandir(rootpath); 1486 } catch (const exception& error) { 1487 cerr << "Error: While scanning " << rootpath << ": " << error.what() << endl; 1488 return 1; 1489 } 1490 1491 cout << "Finished reading " << ZONEINFO.size() << " zoneinfo files [" 1492 << (ZONEINFO.begin())->first << ".." 1493 << (--ZONEINFO.end())->first << "]" << endl; 1494 1495 // Overrides TZ database zones with ICU custom zone definition. 1496 // These ICU zone overrides are defined in icuzones, with suffix --ICU. 1497 // If there is a matching TZ database zone, the zoneinfo is replaced 1498 // with the ICU definition. Then, the zone ID with --ICU suffix 1499 // will be deleted from the final list. 1500 // For example, zoneinfo for Europe/Dublin imported from the TZ database 1501 // will be replaced with the zone definition for Europe/Dublin--ICU 1502 // in icuzones. 1503 1504 // Collect zone IDs to be modified with ICU definition. 1505 vector<string> customZones; 1506 for (ZoneMapIter i = ZONEINFO.begin(); i != ZONEINFO.end(); ++i) { 1507 const string& id = i->first; 1508 size_t idx = id.rfind(ICU_ZONE_OVERRIDE_SUFFIX); 1509 if (idx != string::npos && idx == id.length() - ICU_ZONE_OVERRIDE_SUFFIX_LEN) { 1510 cout << "ICU zone override: " << id << endl; 1511 customZones.push_back(id.substr(0, idx)); 1512 } 1513 } 1514 1515 // 1516 // BEGIN ICU Custom ZoneInfo Override Handling 1517 // 1518 1519 // Replace zoneinfo with ICU definition, then remove ICU zone ID with 1520 // the special suffix. 1521 for (vector<string>::iterator i = customZones.begin(); i != customZones.end(); i++) { 1522 string& origId = *i; 1523 string custId = origId + ICU_ZONE_OVERRIDE_SUFFIX; 1524 1525 map<string,ZoneInfo>::iterator origZi = ZONEINFO.find(origId); 1526 map<string,ZoneInfo>::iterator custZi = ZONEINFO.find(custId); 1527 if (origZi != ZONEINFO.end() && custZi != ZONEINFO.end()) { 1528 // replace original zone info with custom override, 1529 // then delete one custom ID 1530 cout << "Replacing ZoneInfo " << origId << " with " << custId << endl; 1531 origZi->second = custZi->second; 1532 ZONEINFO.erase(custZi); 1533 } 1534 1535 // Also replace final rule 1536 map<string,FinalZone>::iterator origFz = finalZones.find(origId); 1537 map<string,FinalZone>::iterator custFz = finalZones.find(custId); 1538 if (origFz != finalZones.end() && custFz != finalZones.end()) { 1539 // replace original final zone with custom override, 1540 // then delete one for custom ID 1541 cout << "Replacing FinalZone for " << origId << " with " << custId << endl; 1542 origFz->second = custFz->second; 1543 finalZones.erase(custFz); 1544 } 1545 } 1546 1547 // Also remove aliases for ICU custom zoneinfo overrides. 1548 for (map<string,set<string>>::const_iterator i = links.begin(); i != links.end(); ) { 1549 const string& id = i->first; 1550 size_t idx = id.rfind(ICU_ZONE_OVERRIDE_SUFFIX); 1551 if (idx != string::npos && idx == id.length() - ICU_ZONE_OVERRIDE_SUFFIX_LEN) { 1552 const set<string>& aliases = i->second; 1553 // Also remove all revserse links 1554 for (set<string>::const_iterator j = aliases.begin(); j != aliases.end(); j++) { 1555 const string& alias = *j; 1556 cout << "Removing alias " << alias << endl; 1557 reverseLinks.erase(alias); 1558 } 1559 1560 links.erase(i++); 1561 } else { 1562 i++; 1563 } 1564 } 1565 1566 1567 // 1568 // END ICU Custom ZoneInfo Override Handling 1569 // 1570 1571 try { 1572 for_each(finalZones.begin(), finalZones.end(), mergeFinalZone); 1573 } catch (const exception& error) { 1574 cerr << "Error: While merging final zone data: " << error.what() << endl; 1575 return 1; 1576 } 1577 1578 // Process links (including ICU aliases). For each link set we have 1579 // a canonical ID (e.g., America/Los_Angeles) and a set of one or more 1580 // aliases (e.g., PST, PST8PDT, ...). 1581 1582 // 1. Add all aliases as zone objects in ZONEINFO 1583 for (map<string,set<string> >::const_iterator i = links.begin(); 1584 i!=links.end(); ++i) { 1585 const string& olson = i->first; 1586 const set<string>& aliases = i->second; 1587 if (ZONEINFO.find(olson) == ZONEINFO.end()) { 1588 cerr << "Error: Invalid 'Link' to non-existent \"" 1589 << olson << "\"" << endl; 1590 return 1; 1591 } 1592 for (set<string>::const_iterator j=aliases.begin(); 1593 j!=aliases.end(); ++j) { 1594 ZONEINFO[*j] = ZoneInfo(); 1595 } 1596 } 1597 1598 // 2. Create a mapping from zones to index numbers 0..n-1. 1599 map<string,int32_t> zoneIDs; 1600 vector<string> zoneIDlist; 1601 int32_t z=0; 1602 for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { 1603 zoneIDs[i->first] = z++; 1604 zoneIDlist.push_back(i->first); 1605 } 1606 assert(z == (int32_t) ZONEINFO.size()); 1607 1608 // 3. Merge aliases. Sometimes aliases link to other aliases; we 1609 // resolve these into simplest possible sets. 1610 map<string,set<string> > links2; 1611 map<string,string> reverse2; 1612 for (map<string,set<string> >::const_iterator i = links.begin(); 1613 i!=links.end(); ++i) { 1614 string olson = i->first; 1615 while (reverseLinks.find(olson) != reverseLinks.end()) { 1616 olson = reverseLinks[olson]; 1617 } 1618 for (set<string>::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) { 1619 links2[olson].insert(*j); 1620 reverse2[*j] = olson; 1621 } 1622 } 1623 links = links2; 1624 reverseLinks = reverse2; 1625 1626 if (false) { // Debugging: Emit link map 1627 for (map<string,set<string> >::const_iterator i = links.begin(); 1628 i!=links.end(); ++i) { 1629 cout << i->first << ": "; 1630 for (set<string>::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) { 1631 cout << *j << ", "; 1632 } 1633 cout << endl; 1634 } 1635 } 1636 1637 // 4. Update aliases 1638 for (map<string,set<string> >::const_iterator i = links.begin(); 1639 i!=links.end(); ++i) { 1640 const string& olson = i->first; 1641 const set<string>& aliases = i->second; 1642 ZONEINFO[olson].clearAliases(); 1643 ZONEINFO[olson].addAlias(zoneIDs[olson]); 1644 for (set<string>::const_iterator j=aliases.begin(); 1645 j!=aliases.end(); ++j) { 1646 assert(zoneIDs.find(olson) != zoneIDs.end()); 1647 assert(zoneIDs.find(*j) != zoneIDs.end()); 1648 assert(ZONEINFO.find(*j) != ZONEINFO.end()); 1649 ZONEINFO[*j].setAliasTo(zoneIDs[olson]); 1650 ZONEINFO[olson].addAlias(zoneIDs[*j]); 1651 } 1652 } 1653 1654 // Once merging of final data is complete, we can optimize the type list 1655 for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { 1656 i->second.optimizeTypeList(); 1657 } 1658 1659 // Create the country map 1660 map<string, string> icuRegions; // ICU's custom zone -> country override 1661 map<string, set<string> > countryMap; // country -> set of zones 1662 map<string, string> reverseCountryMap; // zone -> country 1663 1664 try { 1665 // Read icuregions file to collect ICU's own zone-region mapping data. 1666 ifstream frg(ICU_REGIONS); 1667 if (frg) { 1668 string line; 1669 while (getline(frg, line)) { 1670 if (line[0] == '#') continue; 1671 1672 string zone, country; 1673 istringstream is(line); 1674 is >> zone >> country; 1675 if (zone.size() == 0) continue; 1676 if (country.size() < 2) { 1677 cerr << "Error: Can't parse " << line << " in " << ICU_REGIONS << endl; 1678 return 1; 1679 } 1680 icuRegions[zone] = country; 1681 } 1682 } else { 1683 cout << "No custom region map [icuregions]" << endl; 1684 } 1685 } catch (const exception& error) { 1686 cerr << "Error: While reading " << ICU_REGIONS << ": " << error.what() << endl; 1687 return 1; 1688 } 1689 1690 try { 1691 ifstream f(zonetab.c_str()); 1692 if (!f) { 1693 cerr << "Error: Unable to open " << zonetab << endl; 1694 return 1; 1695 } 1696 int32_t n = 0; 1697 string line; 1698 while (getline(f, line)) { 1699 string::size_type lb = line.find('#'); 1700 if (lb != string::npos) { 1701 line.resize(lb); // trim comments 1702 } 1703 string country, coord, zone; 1704 istringstream is(line); 1705 is >> country >> coord >> zone; 1706 if (country.size() == 0) continue; 1707 if (country.size() != 2 || zone.size() < 1) { 1708 cerr << "Error: Can't parse " << line << " in " << zonetab << endl; 1709 return 1; 1710 } 1711 if (ZONEINFO.find(zone) == ZONEINFO.end()) { 1712 cerr << "Error: Country maps to invalid zone " << zone 1713 << " in " << zonetab << endl; 1714 return 1; 1715 } 1716 if (icuRegions.find(zone) != icuRegions.end()) { 1717 // Custom override 1718 string customCountry = icuRegions[zone]; 1719 cout << "Region Mapping: custom override for " << zone 1720 << " " << country << " -> " << customCountry << endl; 1721 country = customCountry; 1722 } 1723 countryMap[country].insert(zone); 1724 reverseCountryMap[zone] = country; 1725 //cerr << (n+1) << ": " << country << " <=> " << zone << endl; 1726 ++n; 1727 } 1728 cout << "Finished reading " << n 1729 << " country entries from " << zonetab << endl; 1730 } catch (const exception& error) { 1731 cerr << "Error: While reading " << zonetab << ": " << error.what() << endl; 1732 return 1; 1733 } 1734 1735 // Merge ICU's own zone-region mapping data 1736 for (map<string,string>::const_iterator i = icuRegions.begin(); 1737 i != icuRegions.end(); ++i) { 1738 const string& zid(i->first); 1739 if (reverseCountryMap.find(zid) != reverseCountryMap.end()) { 1740 continue; 1741 } 1742 cout << "Region Mapping: custom data zone=" << zid 1743 << ", region=" << i->second << endl; 1744 1745 reverseCountryMap[zid] = i->second; 1746 countryMap[i->second].insert(zid); 1747 } 1748 1749 // Merge ICU aliases into country map. Don't merge any alias 1750 // that already has a country map, since that doesn't make sense. 1751 // E.g. "Link Europe/Oslo Arctic/Longyearbyen" doesn't mean we 1752 // should cross-map the countries between these two zones. 1753 for (map<string,set<string> >::const_iterator i = links.begin(); 1754 i!=links.end(); ++i) { 1755 const string& olson(i->first); 1756 if (reverseCountryMap.find(olson) == reverseCountryMap.end()) { 1757 continue; 1758 } 1759 string c = reverseCountryMap[olson]; 1760 const set<string>& aliases(i->second); 1761 for (set<string>::const_iterator j=aliases.begin(); 1762 j != aliases.end(); ++j) { 1763 if (reverseCountryMap.find(*j) == reverseCountryMap.end()) { 1764 countryMap[c].insert(*j); 1765 reverseCountryMap[*j] = c; 1766 //cerr << "Aliased country: " << c << " <=> " << *j << endl; 1767 } 1768 } 1769 } 1770 1771 // Create a pseudo-country containing all zones belonging to no country 1772 set<string> nocountry; 1773 for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { 1774 if (reverseCountryMap.find(i->first) == reverseCountryMap.end()) { 1775 nocountry.insert(i->first); 1776 } 1777 } 1778 countryMap[""] = nocountry; 1779 1780 // Get local time & year for below 1781 time_t sec; 1782 time(&sec); 1783 struct tm* now = localtime(&sec); 1784 int32_t thisYear = now->tm_year + 1900; 1785 1786 string filename = TZ_RESOURCE_NAME + ".txt"; 1787 // Write out a resource-bundle source file containing data for 1788 // all zones. 1789 ofstream file(filename.c_str()); 1790 if (file) { 1791 file << "//---------------------------------------------------------" << endl 1792 << "// Copyright (C) 2016 and later: Unicode, Inc. and others." << endl 1793 << "// License & terms of use: http://www.unicode.org/copyright.html#License" << endl 1794 << "//---------------------------------------------------------" << endl 1795 << "// Build tool: tz2icu" << endl 1796 << "// Build date: " << asctime(now) /* << endl -- asctime emits CR */ 1797 << "// tz database: ftp://ftp.iana.org/tz/" << endl 1798 << "// tz version: " << version << endl 1799 << "// ICU version: " << U_ICU_VERSION << endl 1800 << "//---------------------------------------------------------" << endl 1801 << "// >> !!! >> THIS IS A MACHINE-GENERATED FILE << !!! <<" << endl 1802 << "// >> !!! >>> DO NOT EDIT <<< !!! <<" << endl 1803 << "//---------------------------------------------------------" << endl 1804 << endl 1805 << TZ_RESOURCE_NAME << ":table(nofallback) {" << endl 1806 << " TZVersion { \"" << version << "\" }" << endl 1807 << " Zones:array { " << endl 1808 << ZONEINFO // Zones (the actual data) 1809 << " }" << endl; 1810 1811 // Names correspond to the Zones list, used for binary searching. 1812 printStringList ( file, ZONEINFO ); // print the Names list 1813 1814 // Final Rules are used if requested by the zone 1815 file << " Rules { " << endl; 1816 // Emit final rules 1817 int32_t frc = 0; 1818 for(map<string,FinalRule>::iterator i=finalRules.begin(); 1819 i!=finalRules.end(); ++i) { 1820 const string& id = i->first; 1821 const FinalRule& r = i->second; 1822 file << " " << id << ":intvector {" << endl; 1823 r.print(file); 1824 file << " } //_#" << frc++ << endl; 1825 } 1826 file << " }" << endl; 1827 1828 // Emit country (region) map. 1829 if (ICU44PLUS) { 1830 file << " Regions:array {" << endl; 1831 int32_t zn = 0; 1832 for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { 1833 map<string, string>::iterator cit = reverseCountryMap.find(i->first); 1834 if (cit == reverseCountryMap.end()) { 1835 file << " \"001\","; 1836 } else { 1837 file << " \"" << cit->second << "\", "; 1838 } 1839 file << "//Z#" << zn++ << " " << i->first << endl; 1840 } 1841 file << " }" << endl; 1842 } else { 1843 file << " Regions { " << endl; 1844 int32_t rc = 0; 1845 for (map<string, set<string> >::const_iterator i=countryMap.begin(); 1846 i != countryMap.end(); ++i) { 1847 string country = i->first; 1848 const set<string>& zones(i->second); 1849 file << " "; 1850 if(country[0]==0) { 1851 file << "Default"; 1852 } 1853 file << country << ":intvector { "; 1854 bool first = true; 1855 for (set<string>::const_iterator j=zones.begin(); 1856 j != zones.end(); ++j) { 1857 if (!first) file << ", "; 1858 first = false; 1859 if (zoneIDs.find(*j) == zoneIDs.end()) { 1860 cerr << "Error: Nonexistent zone in country map: " << *j << endl; 1861 return 1; 1862 } 1863 file << zoneIDs[*j]; // emit the zone's index number 1864 } 1865 file << " } //R#" << rc++ << endl; 1866 } 1867 file << " }" << endl; 1868 } 1869 1870 file << "}" << endl; 1871 } 1872 1873 file.close(); 1874 1875 if (file) { // recheck error bit 1876 cout << "Finished writing " << TZ_RESOURCE_NAME << ".txt" << endl; 1877 } else { 1878 cerr << "Error: Unable to open/write to " << TZ_RESOURCE_NAME << ".txt" << endl; 1879 return 1; 1880 } 1881 } 1882 //eof 1883