Home | History | Annotate | Download | only in src
      1 // Copyright (C) 2014 Google Inc.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include <libaddressinput/address_formatter.h>
     16 
     17 #include <libaddressinput/address_data.h>
     18 #include <libaddressinput/address_field.h>
     19 #include <libaddressinput/util/basictypes.h>
     20 
     21 #include <algorithm>
     22 #include <cassert>
     23 #include <cstddef>
     24 #include <functional>
     25 #include <string>
     26 #include <vector>
     27 
     28 #include "format_element.h"
     29 #include "language.h"
     30 #include "region_data_constants.h"
     31 #include "rule.h"
     32 #include "util/cctype_tolower_equal.h"
     33 
     34 namespace i18n {
     35 namespace addressinput {
     36 
     37 namespace {
     38 
     39 const char kCommaSeparator[] = ", ";
     40 const char kSpaceSeparator[] = " ";
     41 const char kArabicCommaSeparator[] = "\xD8\x8C" " ";  /* " " */
     42 
     43 const char* kLanguagesThatUseSpace[] = {
     44   "th",
     45   "ko"
     46 };
     47 
     48 const char* kLanguagesThatHaveNoSeparator[] = {
     49   "ja",
     50   "zh"  // All Chinese variants.
     51 };
     52 
     53 // This data is based on CLDR, for languages that are in official use in some
     54 // country, where Arabic is the most likely script tag.
     55 // TODO: Consider supporting variants such as tr-Arab by detecting the script
     56 // code.
     57 const char* kLanguagesThatUseAnArabicComma[] = {
     58   "ar",
     59   "az",
     60   "fa",
     61   "kk",
     62   "ku",
     63   "ky",
     64   "ps",
     65   "tg",
     66   "tk",
     67   "ur",
     68   "uz"
     69 };
     70 
     71 std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
     72   Language address_language(language_tag);
     73 
     74   // First deal with explicit script tags.
     75   if (address_language.has_latin_script) {
     76     return kCommaSeparator;
     77   }
     78 
     79   // Now guess something appropriate based on the base language.
     80   const std::string& base_language = address_language.base;
     81   if (std::find_if(kLanguagesThatUseSpace,
     82                    kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace),
     83                    std::bind2nd(EqualToTolowerString(), base_language)) !=
     84       kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace)) {
     85     return kSpaceSeparator;
     86   } else if (std::find_if(
     87                  kLanguagesThatHaveNoSeparator,
     88                  kLanguagesThatHaveNoSeparator +
     89                      arraysize(kLanguagesThatHaveNoSeparator),
     90                  std::bind2nd(EqualToTolowerString(), base_language)) !=
     91              kLanguagesThatHaveNoSeparator +
     92                  arraysize(kLanguagesThatHaveNoSeparator)) {
     93     return "";
     94   } else if (std::find_if(
     95                  kLanguagesThatUseAnArabicComma,
     96                  kLanguagesThatUseAnArabicComma +
     97                      arraysize(kLanguagesThatUseAnArabicComma),
     98                  std::bind2nd(EqualToTolowerString(), base_language)) !=
     99              kLanguagesThatUseAnArabicComma +
    100                  arraysize(kLanguagesThatUseAnArabicComma)) {
    101     return kArabicCommaSeparator;
    102   }
    103   // Either the language is a Latin-script language, or no language was
    104   // specified. In the latter case we still return ", " as the most common
    105   // separator in use. In countries that don't use this, e.g. Thailand,
    106   // addresses are often written in Latin script where this would still be
    107   // appropriate, so this is a reasonable default in the absence of information.
    108   return kCommaSeparator;
    109 }
    110 
    111 void CombineLinesForLanguage(const std::vector<std::string>& lines,
    112                              const std::string& language_tag,
    113                              std::string* line) {
    114   line->clear();
    115   std::string separator = GetLineSeparatorForLanguage(language_tag);
    116   for (std::vector<std::string>::const_iterator it = lines.begin();
    117        it != lines.end();
    118        ++it) {
    119     if (it != lines.begin()) {
    120       line->append(separator);
    121     }
    122     line->append(*it);
    123   }
    124 }
    125 
    126 }  // namespace
    127 
    128 void GetFormattedNationalAddress(
    129     const AddressData& address_data, std::vector<std::string>* lines) {
    130   assert(lines != NULL);
    131   lines->clear();
    132 
    133   Rule rule;
    134   rule.CopyFrom(Rule::GetDefault());
    135   // TODO: Eventually, we should get the best rule for this country and
    136   // language, rather than just for the country.
    137   rule.ParseSerializedRule(RegionDataConstants::GetRegionData(
    138       address_data.region_code));
    139 
    140   Language language(address_data.language_code);
    141 
    142   // If Latin-script rules are available and the |language_code| of this address
    143   // is explicitly tagged as being Latin, then use the Latin-script formatting
    144   // rules.
    145   const std::vector<FormatElement>& format =
    146       language.has_latin_script && !rule.GetLatinFormat().empty()
    147           ? rule.GetLatinFormat()
    148           : rule.GetFormat();
    149 
    150   // Address format without the unnecessary elements (based on which address
    151   // fields are empty). We assume all literal strings that are not at the start
    152   // or end of a line are separators, and therefore only relevant if the
    153   // surrounding fields are filled in. This works with the data we have
    154   // currently.
    155   std::vector<FormatElement> pruned_format;
    156   for (std::vector<FormatElement>::const_iterator
    157        element_it = format.begin();
    158        element_it != format.end();
    159        ++element_it) {
    160     // Always keep the newlines.
    161     if (element_it->IsNewline() ||
    162         // Always keep the non-empty address fields.
    163         (element_it->IsField() &&
    164          !address_data.IsFieldEmpty(element_it->GetField())) ||
    165         // Only keep literals that satisfy these 2 conditions:
    166         (!element_it->IsField() &&
    167          // (1) Not preceding an empty field.
    168          (element_it + 1 == format.end() ||
    169           !(element_it + 1)->IsField() ||
    170           !address_data.IsFieldEmpty((element_it + 1)->GetField())) &&
    171          // (2) Not following a removed field.
    172          (element_it == format.begin() ||
    173           !(element_it - 1)->IsField() ||
    174           (!pruned_format.empty() && pruned_format.back().IsField())))) {
    175       pruned_format.push_back(*element_it);
    176     }
    177   }
    178 
    179   std::string line;
    180   for (std::vector<FormatElement>::const_iterator
    181        element_it = pruned_format.begin();
    182        element_it != pruned_format.end();
    183        ++element_it) {
    184     if (element_it->IsNewline()) {
    185       if (!line.empty()) {
    186         lines->push_back(line);
    187         line.clear();
    188       }
    189     } else if (element_it->IsField()) {
    190       AddressField field = element_it->GetField();
    191       if (field == STREET_ADDRESS) {
    192         // The field "street address" represents the street address lines of an
    193         // address, so there can be multiple values.
    194         if (!address_data.IsFieldEmpty(field)) {
    195           line.append(address_data.address_line.front());
    196           if (address_data.address_line.size() > 1U) {
    197             lines->push_back(line);
    198             line.clear();
    199             lines->insert(lines->end(),
    200                           address_data.address_line.begin() + 1,
    201                           address_data.address_line.end());
    202           }
    203         }
    204       } else {
    205         line.append(address_data.GetFieldValue(field));
    206       }
    207     } else {
    208       line.append(element_it->GetLiteral());
    209     }
    210   }
    211   if (!line.empty()) {
    212     lines->push_back(line);
    213   }
    214 }
    215 
    216 void GetFormattedNationalAddressLine(
    217     const AddressData& address_data, std::string* line) {
    218   std::vector<std::string> address_lines;
    219   GetFormattedNationalAddress(address_data, &address_lines);
    220   CombineLinesForLanguage(address_lines, address_data.language_code, line);
    221 }
    222 
    223 void GetStreetAddressLinesAsSingleLine(
    224     const AddressData& address_data, std::string* line) {
    225   CombineLinesForLanguage(
    226       address_data.address_line, address_data.language_code, line);
    227 }
    228 
    229 }  // namespace addressinput
    230 }  // namespace i18n
    231