Home | History | Annotate | Download | only in src
      1 /**
      2  * \file unicode.c
      3  *
      4  * This file contains general Unicode string manipulation functions.
      5  * It mainly consists of functions for converting between UCS-2 (used on
      6  * the devices) and UTF-8 (used by several applications).
      7  *
      8  * For a deeper understanding of Unicode encoding formats see the
      9  * Wikipedia entries for
     10  * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
     11  * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
     12  *
     13  * Copyright (C) 2005-2009 Linus Walleij <triad (at) df.lth.se>
     14  *
     15  * This library is free software; you can redistribute it and/or
     16  * modify it under the terms of the GNU Lesser General Public
     17  * License as published by the Free Software Foundation; either
     18  * version 2 of the License, or (at your option) any later version.
     19  *
     20  * This library is distributed in the hope that it will be useful,
     21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23  * Lesser General Public License for more details.
     24  *
     25  * You should have received a copy of the GNU Lesser General Public
     26  * License along with this library; if not, write to the
     27  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     28  * Boston, MA 02111-1307, USA.
     29  *
     30  */
     31 
     32 #include <config.h>
     33 #include <stdlib.h>
     34 #include <string.h>
     35 #ifdef HAVE_ICONV
     36 #include "iconv.h"
     37 #else
     38 #error "libmtp unicode.c needs fixing to work without iconv()!"
     39 #endif
     40 #include "libmtp.h"
     41 #include "unicode.h"
     42 #include "util.h"
     43 #include "ptp.h"
     44 
/**
 * The size of the buffer (in characters) used for creating string copies.
 *
 * The conversion routines in this file size their on-stack scratch
 * buffers from this value: utf16_to_utf8() reserves three output bytes
 * per character and utf8_to_utf16() reserves two. When a converted
 * string does not fit, iconv() stops early and the routines return the
 * partial result that was produced so far.
 */
#define STRING_BUFFER_LENGTH 1024
     49 
     50 /**
     51  * Gets the length (in characters, not bytes) of a unicode
     52  * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
     53  * will return a value of 1.
     54  *
     55  * @param unicstr a UCS-2 Unicode string
     56  * @return the length of the string, in number of characters. If you
     57  *         want to know the length in bytes, multiply this by two and
     58  *         add two (for zero terminator).
     59  */
     60 int ucs2_strlen(uint16_t const * const unicstr)
     61 {
     62   int length;
     63 
     64   /* Unicode strings are terminated with 2 * 0x00 */
     65   for(length = 0; unicstr[length] != 0x0000U; length ++);
     66   return length;
     67 }
     68 
     69 /**
     70  * Converts a big-endian UTF-16 2-byte string
     71  * to a UTF-8 string. Actually just a UCS-2 internal conversion
     72  * routine that strips off the BOM if there is one.
     73  *
     74  * @param device a pointer to the current device.
     75  * @param unicstr the UTF-16 unicode string to convert
     76  * @return a UTF-8 string.
     77  */
     78 char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
     79 {
     80   PTPParams *params = (PTPParams *) device->params;
     81   char *stringp = (char *) unicstr;
     82   char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
     83   char *locp = loclstr;
     84   size_t nconv;
     85   size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
     86   size_t convmax = STRING_BUFFER_LENGTH*3;
     87 
     88   loclstr[0]='\0';
     89   /* Do the conversion.  */
     90   nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
     91   if (nconv == (size_t) -1) {
     92     // Return partial string anyway.
     93     *locp = '\0';
     94   }
     95   loclstr[STRING_BUFFER_LENGTH*3] = '\0';
     96   // Strip off any BOM, it's totally useless...
     97   if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
     98     return strdup(loclstr+3);
     99   }
    100   return strdup(loclstr);
    101 }
    102 
    103 /**
    104  * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
    105  * Actually just a UCS-2 internal conversion.
    106  *
    107  * @param device a pointer to the current device.
    108  * @param localstr the UTF-8 unicode string to convert
    109  * @return a UTF-16 string.
    110  */
    111 uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
    112 {
    113   PTPParams *params = (PTPParams *) device->params;
    114   char *stringp = (char *) localstr; // cast away "const"
    115   char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
    116   char *unip = unicstr;
    117   size_t nconv = 0;
    118   size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
    119   size_t convmax = STRING_BUFFER_LENGTH*2;
    120 
    121   unicstr[0]='\0';
    122   unicstr[1]='\0';
    123 
    124   /* Do the conversion.  */
    125   nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
    126 
    127   if (nconv == (size_t) -1) {
    128     // Return partial string anyway.
    129     unip[0] = '\0';
    130     unip[1] = '\0';
    131   }
    132   // make sure the string is null terminated
    133   unicstr[STRING_BUFFER_LENGTH*2] = '\0';
    134   unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
    135 
    136   // allocate the string to be returned
    137   // Note: can't use strdup since every other byte is a null byte
    138   int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
    139   uint16_t* ret = malloc(ret_len);
    140   memcpy(ret,unicstr,(size_t)ret_len);
    141   return ret;
    142 }
    143 
    144 /**
    145  * This helper function simply removes any consecutive chars
    146  * > 0x7F and replace then with an underscore. In UTF-8
    147  * consequtive chars > 0x7F represent one single character so
    148  * it has to be done like this (and it's elegant). It will only
    149  * shrink the string in size so no copying is needed.
    150  */
    151 void strip_7bit_from_utf8(char *str)
    152 {
    153   int i,j,k;
    154   i = 0;
    155   j = 0;
    156   k = strlen(str);
    157   while (i < k) {
    158     if ((uint8_t) str[i] > 0x7FU) {
    159       str[j] = '_';
    160       i++;
    161       // Skip over any consequtive > 0x7F chars.
    162       while((uint8_t) str[i] > 0x7FU) {
    163 	i++;
    164       }
    165     } else {
    166       str[j] = str[i];
    167       i++;
    168     }
    169     j++;
    170   }
    171   // Terminate stripped string...
    172   str[j] = '\0';
    173 }
    174