1 /* 2 * Copyright (c) 1995, 2006, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.net; 27 28 import java.io.ByteArrayOutputStream; 29 import java.io.BufferedWriter; 30 import java.io.OutputStreamWriter; 31 import java.io.IOException; 32 import java.io.UnsupportedEncodingException; 33 import java.io.CharArrayWriter; 34 import java.nio.charset.Charset; 35 import java.nio.charset.IllegalCharsetNameException; 36 import java.nio.charset.UnsupportedCharsetException ; 37 import java.util.BitSet; 38 import java.security.AccessController; 39 import java.security.PrivilegedAction; 40 import sun.security.action.GetBooleanAction; 41 import sun.security.action.GetPropertyAction; 42 43 /** 44 * Utility class for HTML form encoding. This class contains static methods 45 * for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME 46 * format. For more information about HTML form encoding, consult the HTML 47 * <A HREF="http://www.w3.org/TR/html4/">specification</A>. 48 * 49 * <p> 50 * When encoding a String, the following rules apply: 51 * 52 * <p> 53 * <ul> 54 * <li>The alphanumeric characters "<code>a</code>" through 55 * "<code>z</code>", "<code>A</code>" through 56 * "<code>Z</code>" and "<code>0</code>" 57 * through "<code>9</code>" remain the same. 58 * <li>The special characters "<code>.</code>", 59 * "<code>-</code>", "<code>*</code>", and 60 * "<code>_</code>" remain the same. 61 * <li>The space character "<code> </code>" is 62 * converted into a plus sign "<code>+</code>". 63 * <li>All other characters are unsafe and are first converted into 64 * one or more bytes using some encoding scheme. Then each byte is 65 * represented by the 3-character string 66 * "<code>%<i>xy</i></code>", where <i>xy</i> is the 67 * two-digit hexadecimal representation of the byte. 68 * The recommended encoding scheme to use is UTF-8. However, 69 * for compatibility reasons, if an encoding is not specified, 70 * then the default encoding of the platform is used. 71 * </ul> 72 * 73 * <p> 74 * For example using UTF-8 as the encoding scheme the string "The 75 * string ü@foo-bar" would get converted to 76 * "The+string+%C3%BC%40foo-bar" because in UTF-8 the character 77 * ü is encoded as two bytes C3 (hex) and BC (hex), and the 78 * character @ is encoded as one byte 40 (hex). 79 * 80 * @author Herb Jellinek 81 * @since JDK1.0 82 */ 83 public class URLEncoder { 84 static BitSet dontNeedEncoding; 85 static final int caseDiff = ('a' - 'A'); 86 static String dfltEncName = null; 87 88 static { 89 90 /* The list of characters that are not encoded has been 91 * determined as follows: 92 * 93 * RFC 2396 states: 94 * ----- 95 * Data characters that are allowed in a URI but do not have a 96 * reserved purpose are called unreserved. These include upper 97 * and lower case letters, decimal digits, and a limited set of 98 * punctuation marks and symbols. 99 * 100 * unreserved = alphanum | mark 101 * 102 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" 103 * 104 * Unreserved characters can be escaped without changing the 105 * semantics of the URI, but this should not be done unless the 106 * URI is being used in a context that does not allow the 107 * unescaped character to appear. 108 * ----- 109 * 110 * It appears that both Netscape and Internet Explorer escape 111 * all special characters from this list with the exception 112 * of "-", "_", ".", "*". While it is not clear why they are 113 * escaping the other characters, perhaps it is safest to 114 * assume that there might be contexts in which the others 115 * are unsafe if not escaped. Therefore, we will use the same 116 * list. It is also noteworthy that this is consistent with 117 * O'Reilly's "HTML: The Definitive Guide" (page 164). 118 * 119 * As a last note, Intenet Explorer does not encode the "@" 120 * character which is clearly not unreserved according to the 121 * RFC. We are being consistent with the RFC in this matter, 122 * as is Netscape. 123 * 124 */ 125 126 dontNeedEncoding = new BitSet(256); 127 int i; 128 for (i = 'a'; i <= 'z'; i++) { 129 dontNeedEncoding.set(i); 130 } 131 for (i = 'A'; i <= 'Z'; i++) { 132 dontNeedEncoding.set(i); 133 } 134 for (i = '0'; i <= '9'; i++) { 135 dontNeedEncoding.set(i); 136 } 137 dontNeedEncoding.set(' '); /* encoding a space to a + is done 138 * in the encode() method */ 139 dontNeedEncoding.set('-'); 140 dontNeedEncoding.set('_'); 141 dontNeedEncoding.set('.'); 142 dontNeedEncoding.set('*'); 143 144 dfltEncName = AccessController.doPrivileged( 145 new GetPropertyAction("file.encoding") 146 ); 147 } 148 149 /** 150 * You can't call the constructor. 151 */ 152 private URLEncoder() { } 153 154 /** 155 * Translates a string into <code>x-www-form-urlencoded</code> 156 * format. This method uses the platform's default encoding 157 * as the encoding scheme to obtain the bytes for unsafe characters. 158 * 159 * @param s <code>String</code> to be translated. 160 * @deprecated The resulting string may vary depending on the platform's 161 * default encoding. Instead, use the encode(String,String) 162 * method to specify the encoding. 163 * @return the translated <code>String</code>. 164 */ 165 @Deprecated 166 public static String encode(String s) { 167 168 String str = null; 169 170 try { 171 str = encode(s, dfltEncName); 172 } catch (UnsupportedEncodingException e) { 173 // The system should always have the platform default 174 } 175 176 return str; 177 } 178 179 /** 180 * Translates a string into <code>application/x-www-form-urlencoded</code> 181 * format using a specific encoding scheme. This method uses the 182 * supplied encoding scheme to obtain the bytes for unsafe 183 * characters. 184 * <p> 185 * <em><strong>Note:</strong> The <a href= 186 * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars"> 187 * World Wide Web Consortium Recommendation</a> states that 188 * UTF-8 should be used. Not doing so may introduce 189 * incompatibilites.</em> 190 * 191 * @param s <code>String</code> to be translated. 192 * @param enc The name of a supported 193 * <a href="../lang/package-summary.html#charenc">character 194 * encoding</a>. 195 * @return the translated <code>String</code>. 196 * @exception UnsupportedEncodingException 197 * If the named encoding is not supported 198 * @see URLDecoder#decode(java.lang.String, java.lang.String) 199 * @since 1.4 200 */ 201 public static String encode(String s, String enc) 202 throws UnsupportedEncodingException { 203 204 boolean needToChange = false; 205 StringBuffer out = new StringBuffer(s.length()); 206 Charset charset; 207 CharArrayWriter charArrayWriter = new CharArrayWriter(); 208 209 if (enc == null) 210 throw new NullPointerException("charsetName"); 211 212 try { 213 charset = Charset.forName(enc); 214 } catch (IllegalCharsetNameException e) { 215 throw new UnsupportedEncodingException(enc); 216 } catch (UnsupportedCharsetException e) { 217 throw new UnsupportedEncodingException(enc); 218 } 219 220 for (int i = 0; i < s.length();) { 221 int c = (int) s.charAt(i); 222 //System.out.println("Examining character: " + c); 223 if (dontNeedEncoding.get(c)) { 224 if (c == ' ') { 225 c = '+'; 226 needToChange = true; 227 } 228 //System.out.println("Storing: " + c); 229 out.append((char)c); 230 i++; 231 } else { 232 // convert to external encoding before hex conversion 233 do { 234 charArrayWriter.write(c); 235 /* 236 * If this character represents the start of a Unicode 237 * surrogate pair, then pass in two characters. It's not 238 * clear what should be done if a bytes reserved in the 239 * surrogate pairs range occurs outside of a legal 240 * surrogate pair. For now, just treat it as if it were 241 * any other character. 242 */ 243 if (c >= 0xD800 && c <= 0xDBFF) { 244 /* 245 System.out.println(Integer.toHexString(c) 246 + " is high surrogate"); 247 */ 248 if ( (i+1) < s.length()) { 249 int d = (int) s.charAt(i+1); 250 /* 251 System.out.println("\tExamining " 252 + Integer.toHexString(d)); 253 */ 254 if (d >= 0xDC00 && d <= 0xDFFF) { 255 /* 256 System.out.println("\t" 257 + Integer.toHexString(d) 258 + " is low surrogate"); 259 */ 260 charArrayWriter.write(d); 261 i++; 262 } 263 } 264 } 265 i++; 266 } while (i < s.length() && !dontNeedEncoding.get((c = (int) s.charAt(i)))); 267 268 charArrayWriter.flush(); 269 String str = new String(charArrayWriter.toCharArray()); 270 byte[] ba = str.getBytes(charset); 271 for (int j = 0; j < ba.length; j++) { 272 out.append('%'); 273 char ch = Character.forDigit((ba[j] >> 4) & 0xF, 16); 274 // converting to use uppercase letter as part of 275 // the hex value if ch is a letter. 276 if (Character.isLetter(ch)) { 277 ch -= caseDiff; 278 } 279 out.append(ch); 280 ch = Character.forDigit(ba[j] & 0xF, 16); 281 if (Character.isLetter(ch)) { 282 ch -= caseDiff; 283 } 284 out.append(ch); 285 } 286 charArrayWriter.reset(); 287 needToChange = true; 288 } 289 } 290 291 return (needToChange? out.toString() : s); 292 } 293 } 294