1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include <stdio.h> 5 #include <string> 6 #include <stdlib.h> 7 #include <unistd.h> 8 #include <errno.h> 9 #include <string.h> 10 #include <iostream> 11 #include <fstream> 12 13 // with caution: 14 #include "unicode/utf8.h" 15 16 static const char 17 kSPACE = 0x20, 18 kTAB = 0x09, 19 kLF = 0x0A, 20 kCR = 0x0D; 21 // kHASH = 0x23, 22 // kSLASH = 0x2f, 23 // kSTAR = 0x2A, 24 25 # include "cptbl.h" 26 27 # define cp1047_to_8859(c) cp1047_8859_1[c] 28 29 std::string prog; 30 31 void usage() { 32 fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); 33 } 34 35 36 int cleanup(const std::string &outfile) { 37 const char *outstr = outfile.c_str(); 38 if(outstr && *outstr) { 39 int rc = unlink(outstr); 40 if(rc == 0) { 41 fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); 42 return 0; 43 } else { 44 if( errno == ENOENT ) { 45 return 0; // File did not exist - no error. 46 } else { 47 perror("unlink"); 48 return 1; 49 } 50 } 51 } 52 return 0; 53 } 54 55 // inline bool hasNonAscii(const char *line, size_t len) { 56 // const unsigned char *uline = reinterpret_cast<const unsigned char*>(line); 57 // for(size_t i=0;i<len; i++) { 58 // if( uline[i] > 0x7F) { 59 // return true; 60 // } 61 // } 62 // return false; 63 // } 64 65 inline const char *skipws(const char *p, const char *e) { 66 for(;p<e;p++) { 67 switch(*p) { 68 case kSPACE: 69 case kTAB: 70 case kLF: 71 case kCR: 72 break; 73 default: 74 return p; // non ws 75 } 76 } 77 return p; 78 } 79 80 // inline bool isCommentOrEmpty(const char* line, size_t len) { 81 // const char *p = line; 82 // const char *e = line+len; 83 // p = skipws(p,e); 84 // if(p==e) { 85 // return true; // whitespace only 86 // } 87 // p++; 88 // switch(*p) { 89 // case kHASH: return true; // #directive 90 // case kSLASH: 91 // p++; 92 // if(p==e) return false; // single slash 93 // switch(*p) { 94 // case kSLASH: // '/ /' 95 // case kSTAR: // '/ *' 96 // return true; // start of comment 97 // default: return false; // something else 98 // } 99 // default: return false; // something else 100 // } 101 // /*NOTREACHED*/ 102 // } 103 104 void appendByte(std::string &outstr, 105 uint8_t byte) { 106 char tmp2[5]; 107 sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte)); 108 outstr += tmp2; 109 } 110 111 /** 112 * @return true on failure 113 */ 114 bool appendUtf8(std::string &outstr, 115 const std::string &linestr, 116 size_t &pos, 117 size_t chars) { 118 char tmp[9]; 119 for(size_t i=0;i<chars;i++) { 120 tmp[i] = linestr[++pos]; 121 } 122 tmp[chars] = 0; 123 unsigned int c; 124 sscanf(tmp, "%X", &c); 125 UChar32 ch = c & 0x1FFFFF; 126 127 // now to append \\x%% etc 128 uint8_t bytesNeeded = U8_LENGTH(ch); 129 if(bytesNeeded == 0) { 130 fprintf(stderr, "Illegal code point U+%X\n", ch); 131 return true; 132 } 133 uint8_t bytes[4]; 134 uint8_t *s = bytes; 135 size_t i = 0; 136 U8_APPEND_UNSAFE(s, i, ch); 137 for(size_t t = 0; t<i; t++) { 138 appendByte(outstr, s[t]); 139 } 140 return false; 141 } 142 143 /** 144 * @param linestr string to mutate. Already escaped into \u format. 145 * @param origpos beginning, points to 'u8"' 146 * @param pos end, points to " 147 * @return false for no-problem, true for failure! 148 */ 149 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { 150 size_t pos = origpos + 3; 151 std::string outstr; 152 outstr += '\"'; // local encoding 153 for(;pos<endpos;pos++) { 154 char c = linestr[pos]; 155 if(c == '\\') { 156 char c2 = linestr[++pos]; 157 switch(c2) { 158 case '\'': 159 case '"': 160 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 161 c2 = cp1047_to_8859(c2); 162 #endif 163 appendByte(outstr, c2); 164 break; 165 case 'u': 166 appendUtf8(outstr, linestr, pos, 4); 167 break; 168 case 'U': 169 appendUtf8(outstr, linestr, pos, 8); 170 break; 171 } 172 } else { 173 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 174 c = cp1047_to_8859(c); 175 #endif 176 appendByte(outstr, c); 177 } 178 } 179 outstr += ('\"'); 180 181 linestr.replace(origpos, (endpos-origpos+1), outstr); 182 183 return false; // OK 184 } 185 186 /** 187 * fix the string at the position 188 * false = no err 189 * true = had err 190 */ 191 bool fixAt(std::string &linestr, size_t pos) { 192 size_t origpos = pos; 193 194 if(linestr[pos] != 'u') { 195 fprintf(stderr, "Not a 'u'?"); 196 return true; 197 } 198 199 pos++; // past 'u' 200 201 bool utf8 = false; 202 203 if(linestr[pos] == '8') { // u8" 204 utf8 = true; 205 pos++; 206 } 207 208 char quote = linestr[pos]; 209 210 if(quote != '\'' && quote != '\"') { 211 fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); 212 return true; 213 } 214 215 if(quote == '\'' && utf8) { 216 fprintf(stderr, "Cannot do u8'...'\n"); 217 return true; 218 } 219 220 pos ++; 221 222 //printf("u%c%c\n", quote, quote); 223 224 for(; pos < linestr.size(); pos++) { 225 if(linestr[pos] == quote) { 226 if(utf8) { 227 return fixu8(linestr, origpos, pos); // fix u8"..." 228 } else { 229 return false; // end of quote 230 } 231 } 232 if(linestr[pos] == '\\') { 233 pos++; 234 if(linestr[pos] == quote) continue; // quoted quote 235 if(linestr[pos] == 'u') continue; // for now ... unicode escape 236 if(linestr[pos] == '\\') continue; 237 // some other escape ignore 238 } else { 239 size_t old_pos = pos; 240 int32_t i = pos; 241 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 242 // mogrify 1-4 bytes from 1047 'back' to utf-8 243 char old_byte = linestr[pos]; 244 linestr[pos] = cp1047_to_8859(linestr[pos]); 245 // how many more? 246 int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); 247 for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { 248 linestr[pos2] = cp1047_to_8859(linestr[pos2]); 249 if(linestr[pos2] == 0x0A) { 250 linestr[pos2] = 0x85; // NL is ambiguous here 251 } 252 } 253 #endif 254 255 // Proceed to decode utf-8 256 const uint8_t *s = (const uint8_t*) (linestr.c_str()); 257 int32_t length = linestr.size(); 258 UChar32 c; 259 if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { 260 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 261 linestr[pos] = old_byte; // put it back 262 #endif 263 continue; // single code point not previously legal for \u escaping 264 } 265 266 // otherwise, convert it to \u / \U 267 { 268 U8_NEXT(s, i, length, c); 269 } 270 if(c<0) { 271 fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); 272 fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); 273 return true; 274 } 275 276 size_t seqLen = (i-pos); 277 278 //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); 279 280 char newSeq[20]; 281 if( c <= 0xFFFF) { 282 sprintf(newSeq, "\\u%04X", c); 283 } else { 284 sprintf(newSeq, "\\U%08X", c); 285 } 286 linestr.replace(pos, seqLen, newSeq); 287 pos += strlen(newSeq) - 1; 288 } 289 } 290 291 return false; 292 } 293 294 /** 295 * false = no err 296 * true = had err 297 */ 298 bool fixLine(int /*no*/, std::string &linestr) { 299 const char *line = linestr.c_str(); 300 size_t len = linestr.size(); 301 302 // no u' in the line? 303 if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { 304 return false; // Nothing to do. No u' or u" detected 305 } 306 307 // lines such as u8"\u0308" are all ASCII. 308 // // Quick Check: all ascii? 309 // if(!hasNonAscii(line, len)) { 310 // return false; // ASCII 311 // } 312 313 // // comment or empty line? 314 // if(isCommentOrEmpty(line, len)) { 315 // return false; // Comment or just empty 316 // } 317 318 // start from the end and find all u" cases 319 size_t pos = len = linestr.size(); 320 while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { 321 //printf("found doublequote at %d\n", pos); 322 if(fixAt(linestr, pos)) return true; 323 if(pos == 0) break; 324 pos--; 325 } 326 327 // reset and find all u' cases 328 pos = len = linestr.size(); 329 while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { 330 //printf("found singlequote at %d\n", pos); 331 if(fixAt(linestr, pos)) return true; 332 if(pos == 0) break; 333 pos--; 334 } 335 336 // reset and find all u8" cases 337 pos = len = linestr.size(); 338 while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { 339 if(fixAt(linestr, pos)) return true; 340 if(pos == 0) break; 341 pos--; 342 } 343 344 //fprintf(stderr, "%d - fixed\n", no); 345 return false; 346 } 347 348 int convert(const std::string &infile, const std::string &outfile) { 349 fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); 350 351 std::ifstream inf; 352 353 inf.open(infile.c_str(), std::ios::in); 354 355 if(!inf.is_open()) { 356 fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); 357 cleanup(outfile); 358 return 1; 359 } 360 361 std::ofstream outf; 362 363 outf.open(outfile.c_str(), std::ios::out); 364 365 if(!outf.is_open()) { 366 fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); 367 return 1; 368 } 369 370 // TODO: any platform variations of #line? 371 outf << "#line 1 \"" << infile << "\"" << '\n'; 372 373 int no = 0; 374 std::string linestr; 375 while( getline( inf, linestr)) { 376 no++; 377 if(fixLine(no, linestr)) { 378 outf.close(); 379 fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); 380 cleanup(outfile); 381 return 1; 382 } 383 outf << linestr << '\n'; 384 } 385 386 return 0; 387 } 388 389 int main(int argc, const char *argv[]) { 390 prog = argv[0]; 391 392 if(argc != 3) { 393 usage(); 394 return 1; 395 } 396 397 std::string infile = argv[1]; 398 std::string outfile = argv[2]; 399 400 return convert(infile, outfile); 401 } 402 403 404 #include "utf_impl.cpp" 405