1 // Copyright (C) 2014 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include <libaddressinput/address_formatter.h> 16 17 #include <libaddressinput/address_data.h> 18 #include <libaddressinput/address_field.h> 19 #include <libaddressinput/util/basictypes.h> 20 21 #include <algorithm> 22 #include <cassert> 23 #include <cstddef> 24 #include <functional> 25 #include <string> 26 #include <vector> 27 28 #include "format_element.h" 29 #include "language.h" 30 #include "region_data_constants.h" 31 #include "rule.h" 32 #include "util/cctype_tolower_equal.h" 33 34 namespace i18n { 35 namespace addressinput { 36 37 namespace { 38 39 const char kCommaSeparator[] = ", "; 40 const char kSpaceSeparator[] = " "; 41 const char kArabicCommaSeparator[] = "\xD8\x8C" " "; /* " " */ 42 43 const char* kLanguagesThatUseSpace[] = { 44 "th", 45 "ko" 46 }; 47 48 const char* kLanguagesThatHaveNoSeparator[] = { 49 "ja", 50 "zh" // All Chinese variants. 51 }; 52 53 // This data is based on CLDR, for languages that are in official use in some 54 // country, where Arabic is the most likely script tag. 55 // TODO: Consider supporting variants such as tr-Arab by detecting the script 56 // code. 57 const char* kLanguagesThatUseAnArabicComma[] = { 58 "ar", 59 "az", 60 "fa", 61 "kk", 62 "ku", 63 "ky", 64 "ps", 65 "tg", 66 "tk", 67 "ur", 68 "uz" 69 }; 70 71 std::string GetLineSeparatorForLanguage(const std::string& language_tag) { 72 Language address_language(language_tag); 73 74 // First deal with explicit script tags. 75 if (address_language.has_latin_script) { 76 return kCommaSeparator; 77 } 78 79 // Now guess something appropriate based on the base language. 80 const std::string& base_language = address_language.base; 81 if (std::find_if(kLanguagesThatUseSpace, 82 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace), 83 std::bind2nd(EqualToTolowerString(), base_language)) != 84 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace)) { 85 return kSpaceSeparator; 86 } else if (std::find_if( 87 kLanguagesThatHaveNoSeparator, 88 kLanguagesThatHaveNoSeparator + 89 arraysize(kLanguagesThatHaveNoSeparator), 90 std::bind2nd(EqualToTolowerString(), base_language)) != 91 kLanguagesThatHaveNoSeparator + 92 arraysize(kLanguagesThatHaveNoSeparator)) { 93 return ""; 94 } else if (std::find_if( 95 kLanguagesThatUseAnArabicComma, 96 kLanguagesThatUseAnArabicComma + 97 arraysize(kLanguagesThatUseAnArabicComma), 98 std::bind2nd(EqualToTolowerString(), base_language)) != 99 kLanguagesThatUseAnArabicComma + 100 arraysize(kLanguagesThatUseAnArabicComma)) { 101 return kArabicCommaSeparator; 102 } 103 // Either the language is a Latin-script language, or no language was 104 // specified. In the latter case we still return ", " as the most common 105 // separator in use. In countries that don't use this, e.g. Thailand, 106 // addresses are often written in Latin script where this would still be 107 // appropriate, so this is a reasonable default in the absence of information. 108 return kCommaSeparator; 109 } 110 111 void CombineLinesForLanguage(const std::vector<std::string>& lines, 112 const std::string& language_tag, 113 std::string* line) { 114 line->clear(); 115 std::string separator = GetLineSeparatorForLanguage(language_tag); 116 for (std::vector<std::string>::const_iterator it = lines.begin(); 117 it != lines.end(); 118 ++it) { 119 if (it != lines.begin()) { 120 line->append(separator); 121 } 122 line->append(*it); 123 } 124 } 125 126 } // namespace 127 128 void GetFormattedNationalAddress( 129 const AddressData& address_data, std::vector<std::string>* lines) { 130 assert(lines != NULL); 131 lines->clear(); 132 133 Rule rule; 134 rule.CopyFrom(Rule::GetDefault()); 135 // TODO: Eventually, we should get the best rule for this country and 136 // language, rather than just for the country. 137 rule.ParseSerializedRule(RegionDataConstants::GetRegionData( 138 address_data.region_code)); 139 140 Language language(address_data.language_code); 141 142 // If Latin-script rules are available and the |language_code| of this address 143 // is explicitly tagged as being Latin, then use the Latin-script formatting 144 // rules. 145 const std::vector<FormatElement>& format = 146 language.has_latin_script && !rule.GetLatinFormat().empty() 147 ? rule.GetLatinFormat() 148 : rule.GetFormat(); 149 150 // Address format without the unnecessary elements (based on which address 151 // fields are empty). We assume all literal strings that are not at the start 152 // or end of a line are separators, and therefore only relevant if the 153 // surrounding fields are filled in. This works with the data we have 154 // currently. 155 std::vector<FormatElement> pruned_format; 156 for (std::vector<FormatElement>::const_iterator 157 element_it = format.begin(); 158 element_it != format.end(); 159 ++element_it) { 160 // Always keep the newlines. 161 if (element_it->IsNewline() || 162 // Always keep the non-empty address fields. 163 (element_it->IsField() && 164 !address_data.IsFieldEmpty(element_it->GetField())) || 165 // Only keep literals that satisfy these 2 conditions: 166 (!element_it->IsField() && 167 // (1) Not preceding an empty field. 168 (element_it + 1 == format.end() || 169 !(element_it + 1)->IsField() || 170 !address_data.IsFieldEmpty((element_it + 1)->GetField())) && 171 // (2) Not following a removed field. 172 (element_it == format.begin() || 173 !(element_it - 1)->IsField() || 174 (!pruned_format.empty() && pruned_format.back().IsField())))) { 175 pruned_format.push_back(*element_it); 176 } 177 } 178 179 std::string line; 180 for (std::vector<FormatElement>::const_iterator 181 element_it = pruned_format.begin(); 182 element_it != pruned_format.end(); 183 ++element_it) { 184 if (element_it->IsNewline()) { 185 if (!line.empty()) { 186 lines->push_back(line); 187 line.clear(); 188 } 189 } else if (element_it->IsField()) { 190 AddressField field = element_it->GetField(); 191 if (field == STREET_ADDRESS) { 192 // The field "street address" represents the street address lines of an 193 // address, so there can be multiple values. 194 if (!address_data.IsFieldEmpty(field)) { 195 line.append(address_data.address_line.front()); 196 if (address_data.address_line.size() > 1U) { 197 lines->push_back(line); 198 line.clear(); 199 lines->insert(lines->end(), 200 address_data.address_line.begin() + 1, 201 address_data.address_line.end()); 202 } 203 } 204 } else { 205 line.append(address_data.GetFieldValue(field)); 206 } 207 } else { 208 line.append(element_it->GetLiteral()); 209 } 210 } 211 if (!line.empty()) { 212 lines->push_back(line); 213 } 214 } 215 216 void GetFormattedNationalAddressLine( 217 const AddressData& address_data, std::string* line) { 218 std::vector<std::string> address_lines; 219 GetFormattedNationalAddress(address_data, &address_lines); 220 CombineLinesForLanguage(address_lines, address_data.language_code, line); 221 } 222 223 void GetStreetAddressLinesAsSingleLine( 224 const AddressData& address_data, std::string* line) { 225 CombineLinesForLanguage( 226 address_data.address_line, address_data.language_code, line); 227 } 228 229 } // namespace addressinput 230 } // namespace i18n 231