Home | History | Annotate | Download | only in escapesrc
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include <stdio.h>
      5 #include <string>
      6 #include <stdlib.h>
      7 #include <unistd.h>
      8 #include <errno.h>
      9 #include <string.h>
     10 #include <iostream>
     11 #include <fstream>
     12 
     13 // with caution:
     14 #include "unicode/utf8.h"
     15 
     16 static const char
     17   kSPACE   = 0x20,
     18   kTAB     = 0x09,
     19   kLF      = 0x0A,
     20   kCR      = 0x0D;
     21   // kHASH    = 0x23,
     22   // kSLASH   = 0x2f,
     23   // kSTAR    = 0x2A,
     24 
     25 # include "cptbl.h"
     26 
     27 # define cp1047_to_8859(c) cp1047_8859_1[c]
     28 
     29 std::string prog;
     30 
     31 void usage() {
     32   fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
     33 }
     34 
     35 
     36 int cleanup(const std::string &outfile) {
     37   const char *outstr = outfile.c_str();
     38   if(outstr && *outstr) {
     39     int rc = unlink(outstr);
     40     if(rc == 0) {
     41       fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
     42       return 0;
     43     } else {
     44       if( errno == ENOENT ) {
     45         return 0; // File did not exist - no error.
     46       } else {
     47         perror("unlink");
     48         return 1;
     49       }
     50     }
     51   }
     52   return 0;
     53 }
     54 
     55 // inline bool hasNonAscii(const char *line, size_t len) {
     56 //   const unsigned char *uline = reinterpret_cast<const unsigned char*>(line);
     57 //   for(size_t i=0;i<len; i++) {
     58 //     if( uline[i] > 0x7F) {
     59 //       return true;
     60 //     }
     61 //   }
     62 //   return false;
     63 // }
     64 
     65 inline const char *skipws(const char *p, const char *e) {
     66   for(;p<e;p++) {
     67     switch(*p) {
     68     case kSPACE:
     69     case kTAB:
     70     case kLF:
     71     case kCR:
     72       break;
     73     default:
     74       return p; // non ws
     75     }
     76   }
     77   return p;
     78 }
     79 
     80 // inline bool isCommentOrEmpty(const char* line, size_t len) {
     81 //   const char *p = line;
     82 //   const char *e = line+len;
     83 //   p = skipws(p,e);
     84 //   if(p==e) {
     85 //     return true; // whitespace only
     86 //   }
     87 //   p++;
     88 //   switch(*p) {
     89 //   case kHASH: return true; // #directive
     90 //   case kSLASH:
     91 //     p++;
     92 //     if(p==e) return false; // single slash
     93 //     switch(*p) {
     94 //     case kSLASH: // '/ /'
     95 //     case kSTAR: // '/ *'
     96 //       return true; // start of comment
     97 //     default: return false; // something else
     98 //     }
     99 //   default: return false; // something else
    100 //   }
    101 //   /*NOTREACHED*/
    102 // }
    103 
    104 void appendByte(std::string &outstr,
    105                 uint8_t byte) {
    106     char tmp2[5];
    107     sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte));
    108     outstr += tmp2;
    109 }
    110 
    111 /**
    112  * @return true on failure
    113  */
    114 bool appendUtf8(std::string &outstr,
    115                 const std::string &linestr,
    116                 size_t &pos,
    117                 size_t chars) {
    118   char tmp[9];
    119   for(size_t i=0;i<chars;i++) {
    120     tmp[i] = linestr[++pos];
    121   }
    122   tmp[chars] = 0;
    123   unsigned int c;
    124   sscanf(tmp, "%X", &c);
    125   UChar32 ch = c & 0x1FFFFF;
    126 
    127   // now to append \\x%% etc
    128   uint8_t bytesNeeded = U8_LENGTH(ch);
    129   if(bytesNeeded == 0) {
    130     fprintf(stderr, "Illegal code point U+%X\n", ch);
    131     return true;
    132   }
    133   uint8_t bytes[4];
    134   uint8_t *s = bytes;
    135   size_t i = 0;
    136   U8_APPEND_UNSAFE(s, i, ch);
    137   for(size_t t = 0; t<i; t++) {
    138     appendByte(outstr, s[t]);
    139   }
    140   return false;
    141 }
    142 
    143 /**
    144  * @param linestr string to mutate. Already escaped into \u format.
    145  * @param origpos beginning, points to 'u8"'
    146  * @param pos end, points to "
    147  * @return false for no-problem, true for failure!
    148  */
    149 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
    150   size_t pos = origpos + 3;
    151   std::string outstr;
    152   outstr += '\"'; // local encoding
    153   for(;pos<endpos;pos++) {
    154     char c = linestr[pos];
    155     if(c == '\\') {
    156       char c2 = linestr[++pos];
    157       switch(c2) {
    158       case '\'':
    159       case '"':
    160 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    161         c2 = cp1047_to_8859(c2);
    162 #endif
    163         appendByte(outstr, c2);
    164         break;
    165       case 'u':
    166         appendUtf8(outstr, linestr, pos, 4);
    167         break;
    168       case 'U':
    169         appendUtf8(outstr, linestr, pos, 8);
    170         break;
    171       }
    172     } else {
    173 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    174       c = cp1047_to_8859(c);
    175 #endif
    176       appendByte(outstr, c);
    177     }
    178   }
    179   outstr += ('\"');
    180 
    181   linestr.replace(origpos, (endpos-origpos+1), outstr);
    182 
    183   return false; // OK
    184 }
    185 
    186 /**
    187  * fix the string at the position
    188  * false = no err
    189  * true = had err
    190  */
    191 bool fixAt(std::string &linestr, size_t pos) {
    192   size_t origpos = pos;
    193 
    194   if(linestr[pos] != 'u') {
    195     fprintf(stderr, "Not a 'u'?");
    196     return true;
    197   }
    198 
    199   pos++; // past 'u'
    200 
    201   bool utf8 = false;
    202 
    203   if(linestr[pos] == '8') { // u8"
    204     utf8 = true;
    205     pos++;
    206   }
    207 
    208   char quote = linestr[pos];
    209 
    210   if(quote != '\'' && quote != '\"') {
    211     fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
    212     return true;
    213   }
    214 
    215   if(quote == '\'' && utf8) {
    216     fprintf(stderr, "Cannot do u8'...'\n");
    217     return true;
    218   }
    219 
    220   pos ++;
    221 
    222   //printf("u%c%c\n", quote, quote);
    223 
    224   for(; pos < linestr.size(); pos++) {
    225     if(linestr[pos] == quote) {
    226       if(utf8) {
    227         return fixu8(linestr, origpos, pos); // fix u8"..."
    228       } else {
    229         return false; // end of quote
    230       }
    231     }
    232     if(linestr[pos] == '\\') {
    233       pos++;
    234       if(linestr[pos] == quote) continue; // quoted quote
    235       if(linestr[pos] == 'u') continue; // for now ... unicode escape
    236       if(linestr[pos] == '\\') continue;
    237       // some other escape ignore
    238     } else {
    239       size_t old_pos = pos;
    240       int32_t i = pos;
    241 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    242       // mogrify 1-4 bytes from 1047 'back' to utf-8
    243       char old_byte = linestr[pos];
    244       linestr[pos] = cp1047_to_8859(linestr[pos]);
    245       // how many more?
    246       int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
    247       for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
    248         linestr[pos2] = cp1047_to_8859(linestr[pos2]);
    249         if(linestr[pos2] == 0x0A) {
    250           linestr[pos2] = 0x85; // NL is ambiguous here
    251         }
    252       }
    253 #endif
    254 
    255       // Proceed to decode utf-8
    256       const uint8_t *s = (const uint8_t*) (linestr.c_str());
    257       int32_t length = linestr.size();
    258       UChar32 c;
    259       if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
    260 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    261         linestr[pos] = old_byte; // put it back
    262 #endif
    263         continue; // single code point not previously legal for \u escaping
    264       }
    265 
    266       // otherwise, convert it to \u / \U
    267       {
    268         U8_NEXT(s, i, length, c);
    269       }
    270       if(c<0) {
    271         fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos);
    272         fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
    273         return true;
    274       }
    275 
    276       size_t seqLen = (i-pos);
    277 
    278       //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);
    279 
    280       char newSeq[20];
    281       if( c <= 0xFFFF) {
    282         sprintf(newSeq, "\\u%04X", c);
    283       } else {
    284         sprintf(newSeq, "\\U%08X", c);
    285       }
    286       linestr.replace(pos, seqLen, newSeq);
    287       pos += strlen(newSeq) - 1;
    288     }
    289   }
    290 
    291   return false;
    292 }
    293 
    294 /**
    295  * false = no err
    296  * true = had err
    297  */
    298 bool fixLine(int /*no*/, std::string &linestr) {
    299   const char *line = linestr.c_str();
    300   size_t len = linestr.size();
    301 
    302   // no u' in the line?
    303   if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
    304     return false; // Nothing to do. No u' or u" detected
    305   }
    306 
    307   // lines such as u8"\u0308" are all ASCII.
    308   // // Quick Check: all ascii?
    309   // if(!hasNonAscii(line, len)) {
    310   //   return false; // ASCII
    311   // }
    312 
    313   // // comment or empty line?
    314   // if(isCommentOrEmpty(line, len)) {
    315   //   return false; // Comment or just empty
    316   // }
    317 
    318   // start from the end and find all u" cases
    319   size_t pos = len = linestr.size();
    320   while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
    321     //printf("found doublequote at %d\n", pos);
    322     if(fixAt(linestr, pos)) return true;
    323     if(pos == 0) break;
    324     pos--;
    325   }
    326 
    327   // reset and find all u' cases
    328   pos = len = linestr.size();
    329   while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
    330     //printf("found singlequote at %d\n", pos);
    331     if(fixAt(linestr, pos)) return true;
    332     if(pos == 0) break;
    333     pos--;
    334   }
    335 
    336   // reset and find all u8" cases
    337   pos = len = linestr.size();
    338   while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
    339     if(fixAt(linestr, pos)) return true;
    340     if(pos == 0) break;
    341     pos--;
    342   }
    343 
    344   //fprintf(stderr, "%d - fixed\n", no);
    345   return false;
    346 }
    347 
    348 int convert(const std::string &infile, const std::string &outfile) {
    349   fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
    350 
    351   std::ifstream inf;
    352 
    353   inf.open(infile.c_str(), std::ios::in);
    354 
    355   if(!inf.is_open()) {
    356     fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
    357     cleanup(outfile);
    358     return 1;
    359   }
    360 
    361   std::ofstream outf;
    362 
    363   outf.open(outfile.c_str(), std::ios::out);
    364 
    365   if(!outf.is_open()) {
    366     fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
    367     return 1;
    368   }
    369 
    370   // TODO: any platform variations of #line?
    371   outf << "#line 1 \"" << infile << "\"" << '\n';
    372 
    373   int no = 0;
    374   std::string linestr;
    375   while( getline( inf, linestr)) {
    376     no++;
    377     if(fixLine(no, linestr)) {
    378       outf.close();
    379       fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
    380       cleanup(outfile);
    381       return 1;
    382     }
    383     outf << linestr << '\n';
    384   }
    385 
    386   return 0;
    387 }
    388 
    389 int main(int argc, const char *argv[]) {
    390   prog = argv[0];
    391 
    392   if(argc != 3) {
    393     usage();
    394     return 1;
    395   }
    396 
    397   std::string infile = argv[1];
    398   std::string outfile = argv[2];
    399 
    400   return convert(infile, outfile);
    401 }
    402 
    403 
    404 #include "utf_impl.cpp"
    405