Home | History | Annotate | Download | only in base
      1 /*
      2  * libjingle
      3  * Copyright 2011, Google Inc.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions are met:
      7  *
      8  *  1. Redistributions of source code must retain the above copyright notice,
      9  *     this list of conditions and the following disclaimer.
     10  *  2. Redistributions in binary form must reproduce the above copyright notice,
     11  *     this list of conditions and the following disclaimer in the documentation
     12  *     and/or other materials provided with the distribution.
     13  *  3. The name of the author may not be used to endorse or promote products
     14  *     derived from this software without specific prior written permission.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
     17  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     18  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
     19  * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     20  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
     22  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
     24  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
     25  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "talk/base/stringencode.h"
     29 
     30 #include <cstdio>
     31 #include <cstdlib>
     32 
     33 #include "talk/base/basictypes.h"
     34 #include "talk/base/common.h"
     35 #include "talk/base/stringutils.h"
     36 
     37 namespace talk_base {
     38 
     39 /////////////////////////////////////////////////////////////////////////////
     40 // String Encoding Utilities
     41 /////////////////////////////////////////////////////////////////////////////
     42 
// Lowercase hexadecimal digits, indexed by nibble value (0-15).
static const char HEX[] = "0123456789abcdef";
     44 
     45 char hex_encode(unsigned char val) {
     46   ASSERT(val < 16);
     47   return (val < 16) ? HEX[val] : '!';
     48 }
     49 
     50 bool hex_decode(char ch, unsigned char* val) {
     51   if ((ch >= '0') && (ch <= '9')) {
     52     *val = ch - '0';
     53   } else if ((ch >= 'A') && (ch <= 'Z')) {
     54     *val = (ch - 'A') + 10;
     55   } else if ((ch >= 'a') && (ch <= 'z')) {
     56     *val = (ch - 'a') + 10;
     57   } else {
     58     return false;
     59   }
     60   return true;
     61 }
     62 
     63 size_t escape(char * buffer, size_t buflen,
     64               const char * source, size_t srclen,
     65               const char * illegal, char escape) {
     66   ASSERT(NULL != buffer);  // TODO: estimate output size
     67   if (buflen <= 0)
     68     return 0;
     69 
     70   size_t srcpos = 0, bufpos = 0;
     71   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
     72     char ch = source[srcpos++];
     73     if ((ch == escape) || ::strchr(illegal, ch)) {
     74       if (bufpos + 2 >= buflen)
     75         break;
     76       buffer[bufpos++] = escape;
     77     }
     78     buffer[bufpos++] = ch;
     79   }
     80 
     81   buffer[bufpos] = '\0';
     82   return bufpos;
     83 }
     84 
     85 size_t unescape(char * buffer, size_t buflen,
     86                 const char * source, size_t srclen,
     87                 char escape) {
     88   ASSERT(NULL != buffer);  // TODO: estimate output size
     89   if (buflen <= 0)
     90     return 0;
     91 
     92   size_t srcpos = 0, bufpos = 0;
     93   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
     94     char ch = source[srcpos++];
     95     if ((ch == escape) && (srcpos < srclen)) {
     96       ch = source[srcpos++];
     97     }
     98     buffer[bufpos++] = ch;
     99   }
    100   buffer[bufpos] = '\0';
    101   return bufpos;
    102 }
    103 
    104 size_t encode(char * buffer, size_t buflen,
    105               const char * source, size_t srclen,
    106               const char * illegal, char escape) {
    107   ASSERT(NULL != buffer);  // TODO: estimate output size
    108   if (buflen <= 0)
    109     return 0;
    110 
    111   size_t srcpos = 0, bufpos = 0;
    112   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    113     char ch = source[srcpos++];
    114     if ((ch != escape) && !::strchr(illegal, ch)) {
    115       buffer[bufpos++] = ch;
    116     } else if (bufpos + 3 >= buflen) {
    117       break;
    118     } else {
    119       buffer[bufpos+0] = escape;
    120       buffer[bufpos+1] = hex_encode((static_cast<unsigned char>(ch) >> 4) & 0xF);
    121       buffer[bufpos+2] = hex_encode((static_cast<unsigned char>(ch)     ) & 0xF);
    122       bufpos += 3;
    123     }
    124   }
    125   buffer[bufpos] = '\0';
    126   return bufpos;
    127 }
    128 
    129 size_t decode(char * buffer, size_t buflen,
    130               const char * source, size_t srclen,
    131               char escape) {
    132   if (buflen <= 0)
    133     return 0;
    134 
    135   unsigned char h1, h2;
    136   size_t srcpos = 0, bufpos = 0;
    137   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    138     char ch = source[srcpos++];
    139     if ((ch == escape)
    140         && (srcpos + 1 < srclen)
    141         && hex_decode(source[srcpos], &h1)
    142         && hex_decode(source[srcpos+1], &h2)) {
    143       buffer[bufpos++] = (h1 << 4) | h2;
    144       srcpos += 2;
    145     } else {
    146       buffer[bufpos++] = ch;
    147     }
    148   }
    149   buffer[bufpos] = '\0';
    150   return bufpos;
    151 }
    152 
// Returns a NUL-terminated string listing the characters that are unsafe in
// a file name. Only the WIN32 build supplies a list; every other build hits
// ASSERT(false) and returns an empty string.
const char* unsafe_filename_characters() {
  // It might be better to have a single specification which is the union of
  // all operating systems, unless one system is overly restrictive.
#ifdef WIN32
  return "\\/:*?\"<>|";
#else  // !WIN32
  // TODO: provide the character list for non-Windows platforms.
  ASSERT(false);
  return "";
#endif  // !WIN32
}
    164 
// Bit flags stored in ASCII_CLASS. XML_UNSAFE and HTML_UNSAFE share one bit.
const unsigned char URL_UNSAFE  = 0x1; // 0-32 "#$%&+,/:;<=>?@[\]^`{|} 127
const unsigned char XML_UNSAFE  = 0x2; // "&'<>
const unsigned char HTML_UNSAFE = 0x2; // "&'<>

// Characters 32-127 in table order; each line below corresponds to one of
// the last three rows of ASCII_CLASS (codes 32-63, 64-95 and 96-126).
//  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
//@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
//` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~

// Per-character classification for ASCII codes 0-127: each entry is the
// bitwise OR of the *_UNSAFE flags that apply to that code (32 per row).
const unsigned char ASCII_CLASS[128] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,0,3,1,1,1,3,2,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,3,1,3,1,
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,
};
    179 
    180 size_t url_encode(char * buffer, size_t buflen,
    181                   const char * source, size_t srclen) {
    182   if (NULL == buffer)
    183     return srclen * 3 + 1;
    184   if (buflen <= 0)
    185     return 0;
    186 
    187   size_t srcpos = 0, bufpos = 0;
    188   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    189     unsigned char ch = source[srcpos++];
    190     if ((ch < 128) && (ASCII_CLASS[ch] & URL_UNSAFE)) {
    191       if (bufpos + 3 >= buflen) {
    192         break;
    193       }
    194       buffer[bufpos+0] = '%';
    195       buffer[bufpos+1] = hex_encode((ch >> 4) & 0xF);
    196       buffer[bufpos+2] = hex_encode((ch     ) & 0xF);
    197       bufpos += 3;
    198     } else {
    199       buffer[bufpos++] = ch;
    200     }
    201   }
    202   buffer[bufpos] = '\0';
    203   return bufpos;
    204 }
    205 
    206 size_t url_decode(char * buffer, size_t buflen,
    207                   const char * source, size_t srclen) {
    208   if (NULL == buffer)
    209     return srclen + 1;
    210   if (buflen <= 0)
    211     return 0;
    212 
    213   unsigned char h1, h2;
    214   size_t srcpos = 0, bufpos = 0;
    215   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    216     unsigned char ch = source[srcpos++];
    217     if (ch == '+') {
    218       buffer[bufpos++] = ' ';
    219     } else if ((ch == '%')
    220                && (srcpos + 1 < srclen)
    221                && hex_decode(source[srcpos], &h1)
    222                && hex_decode(source[srcpos+1], &h2))
    223     {
    224       buffer[bufpos++] = (h1 << 4) | h2;
    225       srcpos += 2;
    226     } else {
    227       buffer[bufpos++] = ch;
    228     }
    229   }
    230   buffer[bufpos] = '\0';
    231   return bufpos;
    232 }
    233 
    234 size_t utf8_decode(const char* source, size_t srclen, unsigned long* value) {
    235   const unsigned char* s = reinterpret_cast<const unsigned char*>(source);
    236   if ((s[0] & 0x80) == 0x00) {                    // Check s[0] == 0xxxxxxx
    237     *value = s[0];
    238     return 1;
    239   }
    240   if ((srclen < 2) || ((s[1] & 0xC0) != 0x80)) {  // Check s[1] != 10xxxxxx
    241     return 0;
    242   }
    243   // Accumulate the trailer byte values in value16, and combine it with the
    244   // relevant bits from s[0], once we've determined the sequence length.
    245   unsigned long value16 = (s[1] & 0x3F);
    246   if ((s[0] & 0xE0) == 0xC0) {                    // Check s[0] == 110xxxxx
    247     *value = ((s[0] & 0x1F) << 6) | value16;
    248     return 2;
    249   }
    250   if ((srclen < 3) || ((s[2] & 0xC0) != 0x80)) {  // Check s[2] != 10xxxxxx
    251     return 0;
    252   }
    253   value16 = (value16 << 6) | (s[2] & 0x3F);
    254   if ((s[0] & 0xF0) == 0xE0) {                    // Check s[0] == 1110xxxx
    255     *value = ((s[0] & 0x0F) << 12) | value16;
    256     return 3;
    257   }
    258   if ((srclen < 4) || ((s[3] & 0xC0) != 0x80)) {  // Check s[3] != 10xxxxxx
    259     return 0;
    260   }
    261   value16 = (value16 << 6) | (s[3] & 0x3F);
    262   if ((s[0] & 0xF8) == 0xF0) {                    // Check s[0] == 11110xxx
    263     *value = ((s[0] & 0x07) << 18) | value16;
    264     return 4;
    265   }
    266   return 0;
    267 }
    268 
    269 size_t utf8_encode(char* buffer, size_t buflen, unsigned long value) {
    270   if ((value <= 0x7F) && (buflen >= 1)) {
    271     buffer[0] = static_cast<unsigned char>(value);
    272     return 1;
    273   }
    274   if ((value <= 0x7FF) && (buflen >= 2)) {
    275     buffer[0] = 0xC0 | static_cast<unsigned char>(value >> 6);
    276     buffer[1] = 0x80 | static_cast<unsigned char>(value & 0x3F);
    277     return 2;
    278   }
    279   if ((value <= 0xFFFF) && (buflen >= 3)) {
    280     buffer[0] = 0xE0 | static_cast<unsigned char>(value >> 12);
    281     buffer[1] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F);
    282     buffer[2] = 0x80 | static_cast<unsigned char>(value & 0x3F);
    283     return 3;
    284   }
    285   if ((value <= 0x1FFFFF) && (buflen >= 4)) {
    286     buffer[0] = 0xF0 | static_cast<unsigned char>(value >> 18);
    287     buffer[1] = 0x80 | static_cast<unsigned char>((value >> 12) & 0x3F);
    288     buffer[2] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F);
    289     buffer[3] = 0x80 | static_cast<unsigned char>(value & 0x3F);
    290     return 4;
    291   }
    292   return 0;
    293 }
    294 
    295 size_t html_encode(char * buffer, size_t buflen,
    296                    const char * source, size_t srclen) {
    297   ASSERT(NULL != buffer);  // TODO: estimate output size
    298   if (buflen <= 0)
    299     return 0;
    300 
    301   size_t srcpos = 0, bufpos = 0;
    302   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    303     unsigned char ch = source[srcpos];
    304     if (ch < 128) {
    305       srcpos += 1;
    306       if (ASCII_CLASS[ch] & HTML_UNSAFE) {
    307         const char * escseq = 0;
    308         size_t esclen = 0;
    309         switch (ch) {
    310           case '<':  escseq = "&lt;";   esclen = 4; break;
    311           case '>':  escseq = "&gt;";   esclen = 4; break;
    312           case '\'': escseq = "&#39;";  esclen = 5; break;
    313           case '\"': escseq = "&quot;"; esclen = 6; break;
    314           case '&':  escseq = "&amp;";  esclen = 5; break;
    315           default: ASSERT(false);
    316         }
    317         if (bufpos + esclen >= buflen) {
    318           break;
    319         }
    320         memcpy(buffer + bufpos, escseq, esclen);
    321         bufpos += esclen;
    322       } else {
    323         buffer[bufpos++] = ch;
    324       }
    325     } else {
    326       // Largest value is 0x1FFFFF => &#2097151;  (10 characters)
    327       char escseq[11];
    328       unsigned long val;
    329       if (size_t vallen = utf8_decode(&source[srcpos], srclen - srcpos, &val)) {
    330         srcpos += vallen;
    331       } else {
    332         // Not a valid utf8 sequence, just use the raw character.
    333         val = static_cast<unsigned char>(source[srcpos++]);
    334       }
    335       size_t esclen = sprintfn(escseq, ARRAY_SIZE(escseq), "&#%lu;", val);
    336       if (bufpos + esclen >= buflen) {
    337         break;
    338       }
    339       memcpy(buffer + bufpos, escseq, esclen);
    340       bufpos += esclen;
    341     }
    342   }
    343   buffer[bufpos] = '\0';
    344   return bufpos;
    345 }
    346 
    347 size_t html_decode(char * buffer, size_t buflen,
    348                    const char * source, size_t srclen) {
    349   ASSERT(NULL != buffer);  // TODO: estimate output size
    350   return xml_decode(buffer, buflen, source, srclen);
    351 }
    352 
    353 size_t xml_encode(char * buffer, size_t buflen,
    354                   const char * source, size_t srclen) {
    355   ASSERT(NULL != buffer);  // TODO: estimate output size
    356   if (buflen <= 0)
    357     return 0;
    358 
    359   size_t srcpos = 0, bufpos = 0;
    360   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    361     unsigned char ch = source[srcpos++];
    362     if ((ch < 128) && (ASCII_CLASS[ch] & XML_UNSAFE)) {
    363       const char * escseq = 0;
    364       size_t esclen = 0;
    365       switch (ch) {
    366         case '<':  escseq = "&lt;";   esclen = 4; break;
    367         case '>':  escseq = "&gt;";   esclen = 4; break;
    368         case '\'': escseq = "&apos;"; esclen = 6; break;
    369         case '\"': escseq = "&quot;"; esclen = 6; break;
    370         case '&':  escseq = "&amp;";  esclen = 5; break;
    371         default: ASSERT(false);
    372       }
    373       if (bufpos + esclen >= buflen) {
    374         break;
    375       }
    376       memcpy(buffer + bufpos, escseq, esclen);
    377       bufpos += esclen;
    378     } else {
    379       buffer[bufpos++] = ch;
    380     }
    381   }
    382   buffer[bufpos] = '\0';
    383   return bufpos;
    384 }
    385 
    386 size_t xml_decode(char * buffer, size_t buflen,
    387                   const char * source, size_t srclen) {
    388   ASSERT(NULL != buffer);  // TODO: estimate output size
    389   if (buflen <= 0)
    390     return 0;
    391 
    392   size_t srcpos = 0, bufpos = 0;
    393   while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
    394     unsigned char ch = source[srcpos++];
    395     if (ch != '&') {
    396       buffer[bufpos++] = ch;
    397     } else if ((srcpos + 2 < srclen)
    398                && (memcmp(source + srcpos, "lt;", 3) == 0)) {
    399       buffer[bufpos++] = '<';
    400       srcpos += 3;
    401     } else if ((srcpos + 2 < srclen)
    402                && (memcmp(source + srcpos, "gt;", 3) == 0)) {
    403       buffer[bufpos++] = '>';
    404       srcpos += 3;
    405     } else if ((srcpos + 4 < srclen)
    406                && (memcmp(source + srcpos, "apos;", 5) == 0)) {
    407       buffer[bufpos++] = '\'';
    408       srcpos += 5;
    409     } else if ((srcpos + 4 < srclen)
    410                && (memcmp(source + srcpos, "quot;", 5) == 0)) {
    411       buffer[bufpos++] = '\"';
    412       srcpos += 5;
    413     } else if ((srcpos + 3 < srclen)
    414                && (memcmp(source + srcpos, "amp;", 4) == 0)) {
    415       buffer[bufpos++] = '&';
    416       srcpos += 4;
    417     } else if ((srcpos < srclen) && (source[srcpos] == '#')) {
    418       int int_base = 10;
    419       if ((srcpos + 1 < srclen) && (source[srcpos+1] == 'x')) {
    420         int_base = 16;
    421         srcpos += 1;
    422       }
    423       char * ptr;
    424       // TODO: Fix hack (ptr may go past end of data)
    425       unsigned long val = strtoul(source + srcpos + 1, &ptr, int_base);
    426       if ((static_cast<size_t>(ptr - source) < srclen) && (*ptr == ';')) {
    427         srcpos = ptr - source + 1;
    428       } else {
    429         // Not a valid escape sequence.
    430         break;
    431       }
    432       if (size_t esclen = utf8_encode(buffer + bufpos, buflen - bufpos, val)) {
    433         bufpos += esclen;
    434       } else {
    435         // Not enough room to encode the character, or illegal character
    436         break;
    437       }
    438     } else {
    439       // Unrecognized escape sequence.
    440       break;
    441     }
    442   }
    443   buffer[bufpos] = '\0';
    444   return bufpos;
    445 }
    446 
    447 std::string hex_encode(const char * source, size_t srclen) {
    448   const size_t kBufferSize = srclen * 2 + 1;
    449   char* buffer = STACK_ARRAY(char, kBufferSize);
    450   size_t length = hex_encode(buffer, kBufferSize, source, srclen);
    451   return std::string(buffer, length);
    452 }
    453 
    454 size_t hex_encode(char * buffer, size_t buflen,
    455                   const char * csource, size_t srclen) {
    456   ASSERT(NULL != buffer);  // TODO: estimate output size
    457   if (buflen <= 0)
    458     return 0;
    459 
    460   const unsigned char * bsource =
    461     reinterpret_cast<const unsigned char *>(csource);
    462 
    463   size_t srcpos = 0, bufpos = 0;
    464   srclen = _min(srclen, (buflen - 1) / 2);
    465   while (srcpos < srclen) {
    466     unsigned char ch = bsource[srcpos++];
    467     buffer[bufpos  ] = hex_encode((ch >> 4) & 0xF);
    468     buffer[bufpos+1] = hex_encode((ch     ) & 0xF);
    469     bufpos += 2;
    470   }
    471   buffer[bufpos] = '\0';
    472   return bufpos;
    473 }
    474 
    475 size_t hex_decode(char * cbuffer, size_t buflen,
    476                   const char * source, size_t srclen) {
    477   ASSERT(NULL != cbuffer);  // TODO: estimate output size
    478   if (buflen <= 0)
    479     return 0;
    480 
    481   unsigned char * bbuffer = reinterpret_cast<unsigned char *>(cbuffer);
    482 
    483   unsigned char h1, h2;
    484   size_t srcpos = 0, bufpos = 0;
    485   while ((srcpos + 1 < srclen)
    486          && (bufpos + 1 < buflen)
    487          && hex_decode(source[srcpos], &h1)
    488          && hex_decode(source[srcpos+1], &h2))
    489   {
    490     bbuffer[bufpos++] = (h1 << 4) | h2;
    491     srcpos += 2;
    492   }
    493   bbuffer[bufpos] = '\0';
    494   return bufpos;
    495 }
    496 
    497 size_t transform(std::string& value, size_t maxlen, const std::string& source,
    498                  Transform t) {
    499   char* buffer = STACK_ARRAY(char, maxlen + 1);
    500   size_t length = t(buffer, maxlen + 1, source.data(), source.length());
    501   value.assign(buffer, length);
    502   return length;
    503 }
    504 
    505 std::string s_transform(const std::string& source, Transform t) {
    506   // Ask transformation function to approximate the destination size (returns upper bound)
    507   size_t maxlen = t(NULL, 0, source.data(), source.length());
    508   char * buffer = STACK_ARRAY(char, maxlen);
    509   size_t len = t(buffer, maxlen, source.data(), source.length());
    510   std::string result(buffer, len);
    511   return result;
    512 }
    513 
    514 size_t tokenize(const std::string& source, char delimiter,
    515                 std::vector<std::string>* fields) {
    516   ASSERT(NULL != fields);
    517   fields->clear();
    518   size_t last = 0;
    519   for (size_t i = 0; i < source.length(); ++i) {
    520     if (source[i] == delimiter) {
    521       if (i != last) {
    522         fields->push_back(source.substr(last, i - last));
    523       }
    524       last = i + 1;
    525     }
    526   }
    527   if (last != source.length()) {
    528     fields->push_back(source.substr(last, source.length() - last));
    529   }
    530   return fields->size();
    531 }
    532 
    533 size_t split(const std::string& source, char delimiter,
    534              std::vector<std::string>* fields) {
    535   ASSERT(NULL != fields);
    536   fields->clear();
    537   size_t last = 0;
    538   for (size_t i = 0; i < source.length(); ++i) {
    539     if (source[i] == delimiter) {
    540       fields->push_back(source.substr(last, i - last));
    541       last = i + 1;
    542     }
    543   }
    544   fields->push_back(source.substr(last, source.length() - last));
    545   return fields->size();
    546 }
    547 
    548 char make_char_safe_for_filename(char c) {
    549   if (c < 32)
    550     return '_';
    551 
    552   switch (c) {
    553     case '<':
    554     case '>':
    555     case ':':
    556     case '"':
    557     case '/':
    558     case '\\':
    559     case '|':
    560     case '*':
    561     case '?':
    562       return '_';
    563 
    564     default:
    565       return c;
    566   }
    567 }
    568 
    569 /*
    570 void sprintf(std::string& value, size_t maxlen, const char * format, ...) {
    571   char * buffer = STACK_ARRAY(char, maxlen + 1);
    572   va_list args;
    573   va_start(args, format);
    574   value.assign(buffer, vsprintfn(buffer, maxlen + 1, format, args));
    575   va_end(args);
    576 }
    577 */
    578 
    579 /////////////////////////////////////////////////////////////////////////////
    580 
    581 }  // namespace talk_base
    582