Home | History | Annotate | Download | only in tool
      1 package org.unicode.cldr.tool;
      2 
      3 import java.io.IOException;
      4 import java.text.ParseException;
      5 import java.util.ArrayList;
      6 import java.util.Iterator;
      7 import java.util.List;
      8 import java.util.Locale;
      9 import java.util.Set;
     10 import java.util.TreeSet;
     11 
     12 import org.unicode.cldr.util.CldrUtility;
     13 import org.unicode.cldr.util.CldrUtility.LineHandler;
     14 import org.unicode.cldr.util.Counter2;
     15 import org.unicode.cldr.util.StandardCodes;
     16 
     17 import com.ibm.icu.text.NumberFormat;
     18 import com.ibm.icu.text.UnicodeSet;
     19 import com.ibm.icu.util.ULocale;
     20 
     21 public class AddPopulationData {
     22     static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
     23     static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
     24 
     25     enum WBLine {
     26         // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
     27 
     28         Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017;
     29         String get(String[] pieces) {
     30             return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY;
     31         }
     32     }
     33 
     34     enum FBLine {
     35         Rank, Country, Value, Year;
     36         String get(String[] pieces) {
     37             return pieces[ordinal()];
     38         }
     39     }
     40 
     41     enum FBLiteracy {
     42         Rank, Country, Percent;
     43         String get(String[] pieces) {
     44             return pieces[ordinal()];
     45         }
     46     }
     47 
     48     private static final String GCP = "NY.GNP.MKTP.PP.CD";
     49     private static final String POP = "SP.POP.TOTL";
     50     private static final String EMPTY = "..";
     51     private static Counter2<String> worldbank_gdp = new Counter2<String>();
     52     private static Counter2<String> worldbank_population = new Counter2<String>();
     53     private static Counter2<String> un_literacy = new Counter2<String>();
     54 
     55     private static Counter2<String> factbook_gdp = new Counter2<String>();
     56     private static Counter2<String> factbook_population = new Counter2<String>();
     57     private static Counter2<String> factbook_literacy = new Counter2<String>();
     58 
     59     private static CountryData other = new CountryData();
     60 
     61     static class CountryData {
     62         private static Counter2<String> population = new Counter2<String>();
     63         private static Counter2<String> gdp = new Counter2<String>();
     64         private static Counter2<String> literacy = new Counter2<String>();
     65     }
     66 
     67     public static void main(String[] args) throws IOException {
     68 
     69         System.out.println("Code"
     70             + "\t" + "Name"
     71             + "\t" + "Pop"
     72             + "\t" + "GDP-PPP"
     73             + "\t" + "UN Literacy");
     74 
     75         for (String country : StandardCodes.make().getGoodCountries()) {
     76             showCountryData(country);
     77         }
     78         Set<String> outliers = new TreeSet<String>();
     79         outliers.addAll(factbook_population.keySet());
     80         outliers.addAll(worldbank_population.keySet());
     81         outliers.addAll(factbook_gdp.keySet());
     82         outliers.addAll(worldbank_gdp.keySet());
     83         outliers.addAll(un_literacy.keySet());
     84         for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
     85             if (StandardCodes.isCountry(it.next())) {
     86                 it.remove();
     87             }
     88         }
     89         // outliers.remove("AN");
     90         if (outliers.size() != 0) {
     91             System.out.println("Mistakes: data for non-UN codes");
     92             for (String country : outliers) {
     93                 showCountryData(country);
     94             }
     95             throw new IllegalArgumentException("Mistakes: data for non-country codes");
     96         }
     97         Set<String> altNames = new TreeSet<String>();
     98         String oldCode = "";
     99         for (String display : CountryCodeConverter.names()) {
    100             String code = CountryCodeConverter.getCodeFromName(display);
    101             String icu = ULocale.getDisplayCountry("und-" + code, "en");
    102             if (!display.equalsIgnoreCase(icu)) {
    103                 altNames.add(code + "\t" + display + "\t" + icu);
    104             }
    105         }
    106         oldCode = "";
    107         if (SHOW_ALTERNATE_NAMES) {
    108             for (String altName : altNames) {
    109                 String[] pieces = altName.split("\t");
    110                 String code = pieces[0];
    111                 if (code.equals("ZZ")) continue;
    112                 if (!code.equals(oldCode)) {
    113                     oldCode = code;
    114                     System.out.println();
    115                 }
    116                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
    117                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
    118                 // "</territory> <!-- " + pieces[2] + " -->");
    119             }
    120         }
    121     }
    122 
    123     private static void showCountryData(String country) {
    124         number.setMaximumFractionDigits(0);
    125         System.out.println(country
    126             + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
    127             + "\t" + number.format(getPopulation(country))
    128             + "\t" + number.format(getGdp(country))
    129             + "\t" + percent.format(getLiteracy(country) / 100));
    130     }
    131 
    132     public static Double getLiteracy(String country) {
    133         return firstNonZero(factbook_literacy.getCount(country),
    134             un_literacy.getCount(country),
    135             CountryData.literacy.getCount(country));
    136     }
    137 
    138     public static Double getGdp(String country) {
    139         return firstNonZero(factbook_gdp.getCount(country),
    140             worldbank_gdp.getCount(country),
    141             CountryData.gdp.getCount(country));
    142     }
    143 
    144     public static Double getPopulation(String country) {
    145         return firstNonZero(factbook_population.getCount(country),
    146             worldbank_population.getCount(country),
    147             CountryData.population.getCount(country));
    148     }
    149 
    150     private static Double firstNonZero(Double... items) {
    151         for (Double item : items) {
    152             if (item.doubleValue() != 0) {
    153                 return item;
    154             }
    155         }
    156         return 0.0;
    157     }
    158 
    159     static String[] splitCommaSeparated(String line) {
    160         // items are separated by ','
    161         // each item is of the form abc...
    162         // or "..." (required if a comma or quote is contained)
    163         // " in a field is represented by ""
    164         List<String> result = new ArrayList<String>();
    165         StringBuilder item = new StringBuilder();
    166         boolean inQuote = false;
    167         for (int i = 0; i < line.length(); ++i) {
    168             char ch = line.charAt(i); // don't worry about supplementaries
    169             switch (ch) {
    170             case '"':
    171                 inQuote = !inQuote;
    172                 // at start or end, that's enough
    173                 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
    174                 if (inQuote && item.length() != 0) {
    175                     item.append('"');
    176                     inQuote = true;
    177                 }
    178                 break;
    179             case ',':
    180                 if (!inQuote) {
    181                     result.add(item.toString());
    182                     item.setLength(0);
    183                 } else {
    184                     item.append(ch);
    185                 }
    186                 break;
    187             default:
    188                 item.append(ch);
    189                 break;
    190             }
    191         }
    192         result.add(item.toString());
    193         return result.toArray(new String[result.size()]);
    194     }
    195 
    196     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
    197         CldrUtility.handleFile(filename, new LineHandler() {
    198             public boolean handle(String line) {
    199                 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
    200                     || line.startsWith(" This file")) {
    201                     return false;
    202                 }
    203                 String[] pieces = line.split("\\s{2,}");
    204                 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces));
    205                 if (code == null) {
    206                     return false;
    207                 }
    208                 if (!StandardCodes.isCountry(code)) {
    209                     if (ADD_POP) {
    210                         System.out.println("Skipping factbook info for: " + code);
    211                     }
    212                     return false;
    213                 }
    214                 code = code.toUpperCase(Locale.ENGLISH);
    215                 String valueString = FBLine.Value.get(pieces).trim();
    216                 if (valueString.startsWith("$")) {
    217                     valueString = valueString.substring(1);
    218                 }
    219                 valueString = valueString.replace(",", "");
    220                 double value = Double.parseDouble(valueString.trim());
    221                 factbookGdp.add(code, value);
    222                 if (ADD_POP) {
    223                     System.out.println("Factbook gdp:\t" + code + "\t" + value);
    224                 }
    225                 return true;
    226             }
    227         });
    228     }
    229 
    230     static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
    231     static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
    232     static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
    233 
    234     static class MyLineHandler implements LineHandler {
    235         CountryData countryData;
    236 
    237         public MyLineHandler(CountryData countryData) {
    238             super();
    239             this.countryData = countryData;
    240         }
    241 
    242         public boolean handle(String line) throws ParseException {
    243             if (line.startsWith("#")) return true;
    244             if (line.length() == 0) {
    245                 return true;
    246             }
    247             String[] pieces = line.split(";");
    248             final String code = pieces[0].trim();
    249             if (code.equals("Code")) {
    250                 return false;
    251             }
    252             // Code;Name;Type;Data;Source
    253             final String typeString = pieces[2].trim();
    254             final String data = pieces[3].trim();
    255             if (typeString.equals("gdp-ppp")) {
    256                 if (StandardCodes.isCountry(data)) {
    257                     Double otherPop = getPopulation(data);
    258                     Double otherGdp = getGdp(data);
    259                     Double myPop = getPopulation(code);
    260                     if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
    261                         otherPop = getPopulation(data);
    262                         otherGdp = getPopulation(data);
    263                         myPop = getPopulation(code);
    264                         throw new IllegalArgumentException("Zero population");
    265                     }
    266                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
    267                 } else {
    268                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
    269                 }
    270             } else if (typeString.equals("population")) {
    271                 if (StandardCodes.isCountry(data)) {
    272                     throw new IllegalArgumentException("Population can't use other country's");
    273                 }
    274                 CountryData.population.add(code, number.parse(data).doubleValue());
    275             } else if (typeString.equals("literacy")) {
    276                 if (StandardCodes.isCountry(data)) {
    277                     Double otherPop = getLiteracy(data);
    278                     CountryData.literacy.add(code, otherPop);
    279                 } else {
    280                     CountryData.literacy.add(code, number.parse(data).doubleValue());
    281                 }
    282             } else {
    283                 throw new IllegalArgumentException("Illegal type");
    284             }
    285             return true;
    286         }
    287     }
    288 
    289     static final UnicodeSet DIGITS = (UnicodeSet) new UnicodeSet("[:Nd:]").freeze();
    290 
    291     private static void loadFactbookLiteracy() throws IOException {
    292         final String filename = "external/factbook_literacy.txt";
    293         CldrUtility.handleFile(filename, new LineHandler() {
    294             public boolean handle(String line) {
    295                 String[] pieces = line.split("\\t");
    296                 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces));
    297                 if (code == null) {
    298                     return false;
    299                 }
    300                 if (!StandardCodes.isCountry(code)) {
    301                     if (ADD_POP) {
    302                         System.out.println("Skipping factbook literacy for: " + code);
    303                     }
    304                     return false;
    305                 }
    306                 code = code.toUpperCase(Locale.ENGLISH);
    307                 String valueString = FBLiteracy.Percent.get(pieces).trim();
    308                 double percent = Double.parseDouble(valueString);
    309                 factbook_literacy.put(code, percent);
    310                 if (ADD_POP) {
    311                     System.out.println("Factbook literacy:\t" + code + "\t" + percent);
    312                 }
    313                 code = null;
    314                 return true;
    315             }
    316         });
    317     }
    318 
    319     private static void loadWorldBankInfo() throws IOException {
    320         final String filename = "external/world_bank_data.csv";
    321 
    322         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
    323 
    324         CldrUtility.handleFile(filename, new LineHandler() {
    325             public boolean handle(String line) {
    326                 if (line.contains("Series Code")) {
    327                     return false;
    328                 }
    329                 String[] pieces = splitCommaSeparated(line);
    330 
    331                 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
    332 
    333                 final String seriesCode = WBLine.Series_Code.get(pieces);
    334 
    335                 String last = null;
    336                 for (WBLine i : WBLine.values()) {
    337                     if (i.compareTo(WBLine.YR2000) >= 0) {
    338                         String current = i.get(pieces);
    339                         if (current.length() != 0 && !current.equals(EMPTY)) {
    340                             last = current;
    341                         }
    342                     }
    343                 }
    344                 if (last == null) {
    345                     return false;
    346                 }
    347                 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces));
    348                 if (country == null) {
    349                     return false;
    350                 }
    351                 if (!StandardCodes.isCountry(country)) {
    352                     if (ADD_POP) {
    353                         System.out.println("Skipping worldbank info for: " + country);
    354                     }
    355                     return false;
    356                 }
    357                 double value;
    358                 try {
    359                     value = Double.parseDouble(last);
    360                 } catch (NumberFormatException e) {
    361                     throw new IllegalArgumentException("File changed format: need to modify code");
    362                 }
    363                 if (seriesCode.equals(GCP)) {
    364                     worldbank_gdp.add(country, value);
    365                 } else if (seriesCode.equals(POP)) {
    366                     worldbank_population.add(country, value);
    367                 } else {
    368                     throw new IllegalArgumentException();
    369                 }
    370                 return true;
    371             }
    372         });
    373     }
    374 
    375     private static void loadUnLiteracy() throws IOException {
    376         CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
    377             public boolean handle(String line) {
    378                 // Afghanistan,2000, ,28,43,13,,34,51,18
    379                 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,,"         Youth (15-24) literacy rate",,,,
    380                 // ,,,Total,Men,Women,,Total,Men,Women
    381                 // "Albania",2008,,96,,97,,95,,99,,99,,99
    382                 String[] pieces = splitCommaSeparated(line);
    383                 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
    384                     return false;
    385                 }
    386                 String code = CountryCodeConverter.getCodeFromName(pieces[0]);
    387                 if (code == null) {
    388                     return false;
    389                 }
    390                 if (!StandardCodes.isCountry(code)) {
    391                     if (ADD_POP) {
    392                         System.out.println("Skipping UN info for: " + code);
    393                     }
    394                     return false;
    395                 }
    396                 String totalLiteracy = pieces[3];
    397                 if (totalLiteracy.equals("") || totalLiteracy.equals("") || totalLiteracy.isEmpty()) {
    398                     return true;
    399                 }
    400                 double percent = Double.parseDouble(totalLiteracy);
    401                 un_literacy.add(code, percent);
    402                 return true;
    403             }
    404         });
    405     }
    406 
    407     static {
    408         try {
    409             loadFactbookLiteracy();
    410             loadUnLiteracy();
    411 
    412             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
    413             loadFactbookInfo("external/factbook_population.txt", factbook_population);
    414             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
    415 
    416             loadWorldBankInfo();
    417             StandardCodes sc = StandardCodes.make();
    418             StringBuilder myErrors = new StringBuilder();
    419             for (String territory : sc.getGoodAvailableCodes("territory")) {
    420                 if (!StandardCodes.isCountry(territory)) {
    421                     continue;
    422                 }
    423                 double gdp = getGdp(territory);
    424                 double literacy = getLiteracy(territory);
    425                 double population = getPopulation(territory);
    426                 if (gdp == 0) {
    427                     // AX;Aland Islands;population;26,200;www.aland.ax
    428                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
    429                 }
    430                 if (literacy == 0) {
    431                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
    432                 }
    433                 if (population == 0) {
    434                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
    435                         + ";population;0;reason");
    436                 }
    437             }
    438             if (myErrors.length() != 0) {
    439                 throw new IllegalArgumentException(
    440                     "Missing Country values, the following and add to external/other_country_data to fix:"
    441                         + myErrors);
    442             }
    443         } catch (IOException e) {
    444         }
    445     }
    446 }
    447