1 /* 2 * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $ 3 * $Revision: 602520 $ 4 * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $ 5 * 6 * ==================================================================== 7 * Licensed to the Apache Software Foundation (ASF) under one 8 * or more contributor license agreements. See the NOTICE file 9 * distributed with this work for additional information 10 * regarding copyright ownership. The ASF licenses this file 11 * to you under the Apache License, Version 2.0 (the 12 * "License"); you may not use this file except in compliance 13 * with the License. You may obtain a copy of the License at 14 * 15 * http://www.apache.org/licenses/LICENSE-2.0 16 * 17 * Unless required by applicable law or agreed to in writing, 18 * software distributed under the License is distributed on an 19 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 20 * KIND, either express or implied. See the License for the 21 * specific language governing permissions and limitations 22 * under the License. 23 * ==================================================================== 24 * 25 * This software consists of voluntary contributions made by many 26 * individuals on behalf of the Apache Software Foundation. For more 27 * information on the Apache Software Foundation, please see 28 * <http://www.apache.org/>. 29 * 30 */ 31 32 package org.apache.http.message; 33 34 import java.util.NoSuchElementException; 35 36 import org.apache.http.HeaderIterator; 37 import org.apache.http.ParseException; 38 import org.apache.http.TokenIterator; 39 40 /** 41 * Basic implementation of a {@link TokenIterator}. 42 * This implementation parses <tt>#token<tt> sequences as 43 * defined by RFC 2616, section 2. 44 * It extends that definition somewhat beyond US-ASCII. 45 * 46 * @version $Revision: 602520 $ 47 */ 48 public class BasicTokenIterator implements TokenIterator { 49 50 /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */ 51 // the order of the characters here is adjusted to put the 52 // most likely candidates at the beginning of the collection 53 public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t"; 54 55 56 /** The iterator from which to obtain the next header. */ 57 protected final HeaderIterator headerIt; 58 59 /** 60 * The value of the current header. 61 * This is the header value that includes {@link #currentToken}. 62 * Undefined if the iteration is over. 63 */ 64 protected String currentHeader; 65 66 /** 67 * The token to be returned by the next call to {@link #currentToken}. 68 * <code>null</code> if the iteration is over. 69 */ 70 protected String currentToken; 71 72 /** 73 * The position after {@link #currentToken} in {@link #currentHeader}. 74 * Undefined if the iteration is over. 75 */ 76 protected int searchPos; 77 78 79 /** 80 * Creates a new instance of {@link BasicTokenIterator}. 81 * 82 * @param headerIterator the iterator for the headers to tokenize 83 */ 84 public BasicTokenIterator(final HeaderIterator headerIterator) { 85 if (headerIterator == null) { 86 throw new IllegalArgumentException 87 ("Header iterator must not be null."); 88 } 89 90 this.headerIt = headerIterator; 91 this.searchPos = findNext(-1); 92 } 93 94 95 // non-javadoc, see interface TokenIterator 96 public boolean hasNext() { 97 return (this.currentToken != null); 98 } 99 100 101 /** 102 * Obtains the next token from this iteration. 103 * 104 * @return the next token in this iteration 105 * 106 * @throws NoSuchElementException if the iteration is already over 107 * @throws ParseException if an invalid header value is encountered 108 */ 109 public String nextToken() 110 throws NoSuchElementException, ParseException { 111 112 if (this.currentToken == null) { 113 throw new NoSuchElementException("Iteration already finished."); 114 } 115 116 final String result = this.currentToken; 117 // updates currentToken, may trigger ParseException: 118 this.searchPos = findNext(this.searchPos); 119 120 return result; 121 } 122 123 124 /** 125 * Returns the next token. 126 * Same as {@link #nextToken}, but with generic return type. 127 * 128 * @return the next token in this iteration 129 * 130 * @throws NoSuchElementException if there are no more tokens 131 * @throws ParseException if an invalid header value is encountered 132 */ 133 public final Object next() 134 throws NoSuchElementException, ParseException { 135 return nextToken(); 136 } 137 138 139 /** 140 * Removing tokens is not supported. 141 * 142 * @throws UnsupportedOperationException always 143 */ 144 public final void remove() 145 throws UnsupportedOperationException { 146 147 throw new UnsupportedOperationException 148 ("Removing tokens is not supported."); 149 } 150 151 152 /** 153 * Determines the next token. 154 * If found, the token is stored in {@link #currentToken}. 155 * The return value indicates the position after the token 156 * in {@link #currentHeader}. If necessary, the next header 157 * will be obtained from {@link #headerIt}. 158 * If not found, {@link #currentToken} is set to <code>null</code>. 159 * 160 * @param from the position in the current header at which to 161 * start the search, -1 to search in the first header 162 * 163 * @return the position after the found token in the current header, or 164 * negative if there was no next token 165 * 166 * @throws ParseException if an invalid header value is encountered 167 */ 168 protected int findNext(int from) 169 throws ParseException { 170 171 if (from < 0) { 172 // called from the constructor, initialize the first header 173 if (!this.headerIt.hasNext()) { 174 return -1; 175 } 176 this.currentHeader = this.headerIt.nextHeader().getValue(); 177 from = 0; 178 } else { 179 // called after a token, make sure there is a separator 180 from = findTokenSeparator(from); 181 } 182 183 int start = findTokenStart(from); 184 if (start < 0) { 185 this.currentToken = null; 186 return -1; // nothing found 187 } 188 189 int end = findTokenEnd(start); 190 this.currentToken = createToken(this.currentHeader, start, end); 191 return end; 192 } 193 194 195 /** 196 * Creates a new token to be returned. 197 * Called from {@link #findNext findNext} after the token is identified. 198 * The default implementation simply calls 199 * {@link java.lang.String#substring String.substring}. 200 * <br/> 201 * If header values are significantly longer than tokens, and some 202 * tokens are permanently referenced by the application, there can 203 * be problems with garbage collection. A substring will hold a 204 * reference to the full characters of the original string and 205 * therefore occupies more memory than might be expected. 206 * To avoid this, override this method and create a new string 207 * instead of a substring. 208 * 209 * @param value the full header value from which to create a token 210 * @param start the index of the first token character 211 * @param end the index after the last token character 212 * 213 * @return a string representing the token identified by the arguments 214 */ 215 protected String createToken(String value, int start, int end) { 216 return value.substring(start, end); 217 } 218 219 220 /** 221 * Determines the starting position of the next token. 222 * This method will iterate over headers if necessary. 223 * 224 * @param from the position in the current header at which to 225 * start the search 226 * 227 * @return the position of the token start in the current header, 228 * negative if no token start could be found 229 */ 230 protected int findTokenStart(int from) { 231 if (from < 0) { 232 throw new IllegalArgumentException 233 ("Search position must not be negative: " + from); 234 } 235 236 boolean found = false; 237 while (!found && (this.currentHeader != null)) { 238 239 final int to = this.currentHeader.length(); 240 while (!found && (from < to)) { 241 242 final char ch = this.currentHeader.charAt(from); 243 if (isTokenSeparator(ch) || isWhitespace(ch)) { 244 // whitspace and token separators are skipped 245 from++; 246 } else if (isTokenChar(this.currentHeader.charAt(from))) { 247 // found the start of a token 248 found = true; 249 } else { 250 throw new ParseException 251 ("Invalid character before token (pos " + from + 252 "): " + this.currentHeader); 253 } 254 } 255 if (!found) { 256 if (this.headerIt.hasNext()) { 257 this.currentHeader = this.headerIt.nextHeader().getValue(); 258 from = 0; 259 } else { 260 this.currentHeader = null; 261 } 262 } 263 } // while headers 264 265 return found ? from : -1; 266 } 267 268 269 /** 270 * Determines the position of the next token separator. 271 * Because of multi-header joining rules, the end of a 272 * header value is a token separator. This method does 273 * therefore not need to iterate over headers. 274 * 275 * @param from the position in the current header at which to 276 * start the search 277 * 278 * @return the position of a token separator in the current header, 279 * or at the end 280 * 281 * @throws ParseException 282 * if a new token is found before a token separator. 283 * RFC 2616, section 2.1 explicitly requires a comma between 284 * tokens for <tt>#</tt>. 285 */ 286 protected int findTokenSeparator(int from) { 287 if (from < 0) { 288 throw new IllegalArgumentException 289 ("Search position must not be negative: " + from); 290 } 291 292 boolean found = false; 293 final int to = this.currentHeader.length(); 294 while (!found && (from < to)) { 295 final char ch = this.currentHeader.charAt(from); 296 if (isTokenSeparator(ch)) { 297 found = true; 298 } else if (isWhitespace(ch)) { 299 from++; 300 } else if (isTokenChar(ch)) { 301 throw new ParseException 302 ("Tokens without separator (pos " + from + 303 "): " + this.currentHeader); 304 } else { 305 throw new ParseException 306 ("Invalid character after token (pos " + from + 307 "): " + this.currentHeader); 308 } 309 } 310 311 return from; 312 } 313 314 315 /** 316 * Determines the ending position of the current token. 317 * This method will not leave the current header value, 318 * since the end of the header value is a token boundary. 319 * 320 * @param from the position of the first character of the token 321 * 322 * @return the position after the last character of the token. 323 * The behavior is undefined if <code>from</code> does not 324 * point to a token character in the current header value. 325 */ 326 protected int findTokenEnd(int from) { 327 if (from < 0) { 328 throw new IllegalArgumentException 329 ("Token start position must not be negative: " + from); 330 } 331 332 final int to = this.currentHeader.length(); 333 int end = from+1; 334 while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) { 335 end++; 336 } 337 338 return end; 339 } 340 341 342 /** 343 * Checks whether a character is a token separator. 344 * RFC 2616, section 2.1 defines comma as the separator for 345 * <tt>#token</tt> sequences. The end of a header value will 346 * also separate tokens, but that is not a character check. 347 * 348 * @param ch the character to check 349 * 350 * @return <code>true</code> if the character is a token separator, 351 * <code>false</code> otherwise 352 */ 353 protected boolean isTokenSeparator(char ch) { 354 return (ch == ','); 355 } 356 357 358 /** 359 * Checks whether a character is a whitespace character. 360 * RFC 2616, section 2.2 defines space and horizontal tab as whitespace. 361 * The optional preceeding line break is irrelevant, since header 362 * continuation is handled transparently when parsing messages. 363 * 364 * @param ch the character to check 365 * 366 * @return <code>true</code> if the character is whitespace, 367 * <code>false</code> otherwise 368 */ 369 protected boolean isWhitespace(char ch) { 370 371 // we do not use Character.isWhitspace(ch) here, since that allows 372 // many control characters which are not whitespace as per RFC 2616 373 return ((ch == '\t') || Character.isSpaceChar(ch)); 374 } 375 376 377 /** 378 * Checks whether a character is a valid token character. 379 * Whitespace, control characters, and HTTP separators are not 380 * valid token characters. The HTTP specification (RFC 2616, section 2.2) 381 * defines tokens only for the US-ASCII character set, this 382 * method extends the definition to other character sets. 383 * 384 * @param ch the character to check 385 * 386 * @return <code>true</code> if the character is a valid token start, 387 * <code>false</code> otherwise 388 */ 389 protected boolean isTokenChar(char ch) { 390 391 // common sense extension of ALPHA + DIGIT 392 if (Character.isLetterOrDigit(ch)) 393 return true; 394 395 // common sense extension of CTL 396 if (Character.isISOControl(ch)) 397 return false; 398 399 // no common sense extension for this 400 if (isHttpSeparator(ch)) 401 return false; 402 403 // RFC 2616, section 2.2 defines a token character as 404 // "any CHAR except CTLs or separators". The controls 405 // and separators are included in the checks above. 406 // This will yield unexpected results for Unicode format characters. 407 // If that is a problem, overwrite isHttpSeparator(char) to filter 408 // out the false positives. 409 return true; 410 } 411 412 413 /** 414 * Checks whether a character is an HTTP separator. 415 * The implementation in this class checks only for the HTTP separators 416 * defined in RFC 2616, section 2.2. If you need to detect other 417 * separators beyond the US-ASCII character set, override this method. 418 * 419 * @param ch the character to check 420 * 421 * @return <code>true</code> if the character is an HTTP separator 422 */ 423 protected boolean isHttpSeparator(char ch) { 424 return (HTTP_SEPARATORS.indexOf(ch) >= 0); 425 } 426 427 428 } // class BasicTokenIterator 429 430