1 /** 2 * \file unicode.c 3 * 4 * This file contains general Unicode string manipulation functions. 5 * It mainly consist of functions for converting between UCS-2 (used on 6 * the devices) and UTF-8 (used by several applications). 7 * 8 * For a deeper understanding of Unicode encoding formats see the 9 * Wikipedia entries for 10 * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a> 11 * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. 12 * 13 * Copyright (C) 2005-2009 Linus Walleij <triad (at) df.lth.se> 14 * 15 * This library is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU Lesser General Public 17 * License as published by the Free Software Foundation; either 18 * version 2 of the License, or (at your option) any later version. 19 * 20 * This library is distributed in the hope that it will be useful, 21 * but WITHOUT ANY WARRANTY; without even the implied warranty of 22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 23 * Lesser General Public License for more details. 24 * 25 * You should have received a copy of the GNU Lesser General Public 26 * License along with this library; if not, write to the 27 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 28 * Boston, MA 02111-1307, USA. 29 * 30 */ 31 32 #include "config.h" 33 34 #include <stdlib.h> 35 #include <string.h> 36 #ifdef HAVE_ICONV 37 #include "iconv.h" 38 #else 39 #error "libmtp unicode.c needs fixing to work without iconv()!" 40 #endif 41 #include "libmtp.h" 42 #include "unicode.h" 43 #include "util.h" 44 #include "ptp.h" 45 46 /** 47 * The size of the buffer (in characters) used for creating string copies. 48 */ 49 #define STRING_BUFFER_LENGTH 1024 50 51 /** 52 * Gets the length (in characters, not bytes) of a unicode 53 * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00 54 * will return a value of 1. 55 * 56 * @param unicstr a UCS-2 Unicode string 57 * @return the length of the string, in number of characters. If you 58 * want to know the length in bytes, multiply this by two and 59 * add two (for zero terminator). 60 */ 61 int ucs2_strlen(uint16_t const * const unicstr) 62 { 63 int length; 64 65 /* Unicode strings are terminated with 2 * 0x00 */ 66 for(length = 0; unicstr[length] != 0x0000U; length ++); 67 return length; 68 } 69 70 /** 71 * Converts a big-endian UTF-16 2-byte string 72 * to a UTF-8 string. Actually just a UCS-2 internal conversion 73 * routine that strips off the BOM if there is one. 74 * 75 * @param device a pointer to the current device. 76 * @param unicstr the UTF-16 unicode string to convert 77 * @return a UTF-8 string. 78 */ 79 char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr) 80 { 81 PTPParams *params = (PTPParams *) device->params; 82 char *stringp = (char *) unicstr; 83 char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char. 84 char *locp = loclstr; 85 size_t nconv; 86 size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator 87 size_t convmax = STRING_BUFFER_LENGTH*3; 88 89 loclstr[0]='\0'; 90 /* Do the conversion. */ 91 nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax); 92 if (nconv == (size_t) -1) { 93 // Return partial string anyway. 94 *locp = '\0'; 95 } 96 loclstr[STRING_BUFFER_LENGTH*3] = '\0'; 97 // Strip off any BOM, it's totally useless... 98 if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) { 99 return strdup(loclstr+3); 100 } 101 return strdup(loclstr); 102 } 103 104 /** 105 * Converts a UTF-8 string to a big-endian UTF-16 2-byte string 106 * Actually just a UCS-2 internal conversion. 107 * 108 * @param device a pointer to the current device. 109 * @param localstr the UTF-8 unicode string to convert 110 * @return a UTF-16 string. 111 */ 112 uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr) 113 { 114 PTPParams *params = (PTPParams *) device->params; 115 char *stringp = (char *) localstr; // cast away "const" 116 char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char. 117 char *unip = unicstr; 118 size_t nconv = 0; 119 size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator 120 size_t convmax = STRING_BUFFER_LENGTH*2; 121 122 unicstr[0]='\0'; 123 unicstr[1]='\0'; 124 125 /* Do the conversion. */ 126 nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax); 127 128 if (nconv == (size_t) -1) { 129 // Return partial string anyway. 130 unip[0] = '\0'; 131 unip[1] = '\0'; 132 } 133 // make sure the string is null terminated 134 unicstr[STRING_BUFFER_LENGTH*2] = '\0'; 135 unicstr[STRING_BUFFER_LENGTH*2+1] = '\0'; 136 137 // allocate the string to be returned 138 // Note: can't use strdup since every other byte is a null byte 139 int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2; 140 uint16_t* ret = malloc(ret_len); 141 memcpy(ret,unicstr,(size_t)ret_len); 142 return ret; 143 } 144 145 /** 146 * This helper function simply removes any consecutive chars 147 * > 0x7F and replace then with an underscore. In UTF-8 148 * consequtive chars > 0x7F represent one single character so 149 * it has to be done like this (and it's elegant). It will only 150 * shrink the string in size so no copying is needed. 151 */ 152 void strip_7bit_from_utf8(char *str) 153 { 154 int i,j,k; 155 i = 0; 156 j = 0; 157 k = strlen(str); 158 while (i < k) { 159 if ((uint8_t) str[i] > 0x7FU) { 160 str[j] = '_'; 161 i++; 162 // Skip over any consequtive > 0x7F chars. 163 while((uint8_t) str[i] > 0x7FU) { 164 i++; 165 } 166 } else { 167 str[j] = str[i]; 168 i++; 169 } 170 j++; 171 } 172 // Terminate stripped string... 173 str[j] = '\0'; 174 } 175