Home | History | Annotate | Download | only in message
      1 /*
      2  * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
      3  * $Revision: 602520 $
      4  * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
      5  *
      6  * ====================================================================
      7  * Licensed to the Apache Software Foundation (ASF) under one
      8  * or more contributor license agreements.  See the NOTICE file
      9  * distributed with this work for additional information
     10  * regarding copyright ownership.  The ASF licenses this file
     11  * to you under the Apache License, Version 2.0 (the
     12  * "License"); you may not use this file except in compliance
     13  * with the License.  You may obtain a copy of the License at
     14  *
     15  *   http://www.apache.org/licenses/LICENSE-2.0
     16  *
     17  * Unless required by applicable law or agreed to in writing,
     18  * software distributed under the License is distributed on an
     19  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
     20  * KIND, either express or implied.  See the License for the
     21  * specific language governing permissions and limitations
     22  * under the License.
     23  * ====================================================================
     24  *
     25  * This software consists of voluntary contributions made by many
     26  * individuals on behalf of the Apache Software Foundation.  For more
     27  * information on the Apache Software Foundation, please see
     28  * <http://www.apache.org/>.
     29  *
     30  */
     31 
     32 package org.apache.http.message;
     33 
     34 import java.util.NoSuchElementException;
     35 
     36 import org.apache.http.HeaderIterator;
     37 import org.apache.http.ParseException;
     38 import org.apache.http.TokenIterator;
     39 
     40 /**
     41  * Basic implementation of a {@link TokenIterator}.
     42  * This implementation parses <tt>#token<tt> sequences as
     43  * defined by RFC 2616, section 2.
     44  * It extends that definition somewhat beyond US-ASCII.
     45  *
     46  * @version $Revision: 602520 $
     47  */
     48 public class BasicTokenIterator implements TokenIterator {
     49 
     50     /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
     51     // the order of the characters here is adjusted to put the
     52     // most likely candidates at the beginning of the collection
     53     public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
     54 
     55 
     56     /** The iterator from which to obtain the next header. */
     57     protected final HeaderIterator headerIt;
     58 
     59     /**
     60      * The value of the current header.
     61      * This is the header value that includes {@link #currentToken}.
     62      * Undefined if the iteration is over.
     63      */
     64     protected String currentHeader;
     65 
     66     /**
     67      * The token to be returned by the next call to {@link #currentToken}.
     68      * <code>null</code> if the iteration is over.
     69      */
     70     protected String currentToken;
     71 
     72     /**
     73      * The position after {@link #currentToken} in {@link #currentHeader}.
     74      * Undefined if the iteration is over.
     75      */
     76     protected int searchPos;
     77 
     78 
     79     /**
     80      * Creates a new instance of {@link BasicTokenIterator}.
     81      *
     82      * @param headerIterator    the iterator for the headers to tokenize
     83      */
     84     public BasicTokenIterator(final HeaderIterator headerIterator) {
     85         if (headerIterator == null) {
     86             throw new IllegalArgumentException
     87                 ("Header iterator must not be null.");
     88         }
     89 
     90         this.headerIt = headerIterator;
     91         this.searchPos = findNext(-1);
     92     }
     93 
     94 
     95     // non-javadoc, see interface TokenIterator
     96     public boolean hasNext() {
     97         return (this.currentToken != null);
     98     }
     99 
    100 
    101     /**
    102      * Obtains the next token from this iteration.
    103      *
    104      * @return  the next token in this iteration
    105      *
    106      * @throws NoSuchElementException   if the iteration is already over
    107      * @throws ParseException   if an invalid header value is encountered
    108      */
    109     public String nextToken()
    110         throws NoSuchElementException, ParseException {
    111 
    112         if (this.currentToken == null) {
    113             throw new NoSuchElementException("Iteration already finished.");
    114         }
    115 
    116         final String result = this.currentToken;
    117         // updates currentToken, may trigger ParseException:
    118         this.searchPos = findNext(this.searchPos);
    119 
    120         return result;
    121     }
    122 
    123 
    124     /**
    125      * Returns the next token.
    126      * Same as {@link #nextToken}, but with generic return type.
    127      *
    128      * @return  the next token in this iteration
    129      *
    130      * @throws NoSuchElementException   if there are no more tokens
    131      * @throws ParseException   if an invalid header value is encountered
    132      */
    133     public final Object next()
    134         throws NoSuchElementException, ParseException {
    135         return nextToken();
    136     }
    137 
    138 
    139     /**
    140      * Removing tokens is not supported.
    141      *
    142      * @throws UnsupportedOperationException    always
    143      */
    144     public final void remove()
    145         throws UnsupportedOperationException {
    146 
    147         throw new UnsupportedOperationException
    148             ("Removing tokens is not supported.");
    149     }
    150 
    151 
    152     /**
    153      * Determines the next token.
    154      * If found, the token is stored in {@link #currentToken}.
    155      * The return value indicates the position after the token
    156      * in {@link #currentHeader}. If necessary, the next header
    157      * will be obtained from {@link #headerIt}.
    158      * If not found, {@link #currentToken} is set to <code>null</code>.
    159      *
    160      * @param from      the position in the current header at which to
    161      *                  start the search, -1 to search in the first header
    162      *
    163      * @return  the position after the found token in the current header, or
    164      *          negative if there was no next token
    165      *
    166      * @throws ParseException   if an invalid header value is encountered
    167      */
    168     protected int findNext(int from)
    169         throws ParseException {
    170 
    171         if (from < 0) {
    172             // called from the constructor, initialize the first header
    173             if (!this.headerIt.hasNext()) {
    174                 return -1;
    175             }
    176             this.currentHeader = this.headerIt.nextHeader().getValue();
    177             from = 0;
    178         } else {
    179             // called after a token, make sure there is a separator
    180             from = findTokenSeparator(from);
    181         }
    182 
    183         int start = findTokenStart(from);
    184         if (start < 0) {
    185             this.currentToken = null;
    186             return -1; // nothing found
    187         }
    188 
    189         int end = findTokenEnd(start);
    190         this.currentToken = createToken(this.currentHeader, start, end);
    191         return end;
    192     }
    193 
    194 
    195     /**
    196      * Creates a new token to be returned.
    197      * Called from {@link #findNext findNext} after the token is identified.
    198      * The default implementation simply calls
    199      * {@link java.lang.String#substring String.substring}.
    200      * <br/>
    201      * If header values are significantly longer than tokens, and some
    202      * tokens are permanently referenced by the application, there can
    203      * be problems with garbage collection. A substring will hold a
    204      * reference to the full characters of the original string and
    205      * therefore occupies more memory than might be expected.
    206      * To avoid this, override this method and create a new string
    207      * instead of a substring.
    208      *
    209      * @param value     the full header value from which to create a token
    210      * @param start     the index of the first token character
    211      * @param end       the index after the last token character
    212      *
    213      * @return  a string representing the token identified by the arguments
    214      */
    215     protected String createToken(String value, int start, int end) {
    216         return value.substring(start, end);
    217     }
    218 
    219 
    220     /**
    221      * Determines the starting position of the next token.
    222      * This method will iterate over headers if necessary.
    223      *
    224      * @param from      the position in the current header at which to
    225      *                  start the search
    226      *
    227      * @return  the position of the token start in the current header,
    228      *          negative if no token start could be found
    229      */
    230     protected int findTokenStart(int from) {
    231         if (from < 0) {
    232             throw new IllegalArgumentException
    233                 ("Search position must not be negative: " + from);
    234         }
    235 
    236         boolean found = false;
    237         while (!found && (this.currentHeader != null)) {
    238 
    239             final int to = this.currentHeader.length();
    240             while (!found && (from < to)) {
    241 
    242                 final char ch = this.currentHeader.charAt(from);
    243                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
    244                     // whitspace and token separators are skipped
    245                     from++;
    246                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
    247                     // found the start of a token
    248                     found = true;
    249                 } else {
    250                     throw new ParseException
    251                         ("Invalid character before token (pos " + from +
    252                          "): " + this.currentHeader);
    253                 }
    254             }
    255             if (!found) {
    256                 if (this.headerIt.hasNext()) {
    257                     this.currentHeader = this.headerIt.nextHeader().getValue();
    258                     from = 0;
    259                 } else {
    260                     this.currentHeader = null;
    261                 }
    262             }
    263         } // while headers
    264 
    265         return found ? from : -1;
    266     }
    267 
    268 
    269     /**
    270      * Determines the position of the next token separator.
    271      * Because of multi-header joining rules, the end of a
    272      * header value is a token separator. This method does
    273      * therefore not need to iterate over headers.
    274      *
    275      * @param from      the position in the current header at which to
    276      *                  start the search
    277      *
    278      * @return  the position of a token separator in the current header,
    279      *          or at the end
    280      *
    281      * @throws ParseException
    282      *         if a new token is found before a token separator.
    283      *         RFC 2616, section 2.1 explicitly requires a comma between
    284      *         tokens for <tt>#</tt>.
    285      */
    286     protected int findTokenSeparator(int from) {
    287         if (from < 0) {
    288             throw new IllegalArgumentException
    289                 ("Search position must not be negative: " + from);
    290         }
    291 
    292         boolean found = false;
    293         final int to = this.currentHeader.length();
    294         while (!found && (from < to)) {
    295             final char ch = this.currentHeader.charAt(from);
    296             if (isTokenSeparator(ch)) {
    297                 found = true;
    298             } else if (isWhitespace(ch)) {
    299                 from++;
    300             } else if (isTokenChar(ch)) {
    301                 throw new ParseException
    302                     ("Tokens without separator (pos " + from +
    303                      "): " + this.currentHeader);
    304             } else {
    305                 throw new ParseException
    306                     ("Invalid character after token (pos " + from +
    307                      "): " + this.currentHeader);
    308             }
    309         }
    310 
    311         return from;
    312     }
    313 
    314 
    315     /**
    316      * Determines the ending position of the current token.
    317      * This method will not leave the current header value,
    318      * since the end of the header value is a token boundary.
    319      *
    320      * @param from      the position of the first character of the token
    321      *
    322      * @return  the position after the last character of the token.
    323      *          The behavior is undefined if <code>from</code> does not
    324      *          point to a token character in the current header value.
    325      */
    326     protected int findTokenEnd(int from) {
    327         if (from < 0) {
    328             throw new IllegalArgumentException
    329                 ("Token start position must not be negative: " + from);
    330         }
    331 
    332         final int to = this.currentHeader.length();
    333         int end = from+1;
    334         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
    335             end++;
    336         }
    337 
    338         return end;
    339     }
    340 
    341 
    342     /**
    343      * Checks whether a character is a token separator.
    344      * RFC 2616, section 2.1 defines comma as the separator for
    345      * <tt>#token</tt> sequences. The end of a header value will
    346      * also separate tokens, but that is not a character check.
    347      *
    348      * @param ch        the character to check
    349      *
    350      * @return  <code>true</code> if the character is a token separator,
    351      *          <code>false</code> otherwise
    352      */
    353     protected boolean isTokenSeparator(char ch) {
    354         return (ch == ',');
    355     }
    356 
    357 
    358     /**
    359      * Checks whether a character is a whitespace character.
    360      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
    361      * The optional preceeding line break is irrelevant, since header
    362      * continuation is handled transparently when parsing messages.
    363      *
    364      * @param ch        the character to check
    365      *
    366      * @return  <code>true</code> if the character is whitespace,
    367      *          <code>false</code> otherwise
    368      */
    369     protected boolean isWhitespace(char ch) {
    370 
    371         // we do not use Character.isWhitspace(ch) here, since that allows
    372         // many control characters which are not whitespace as per RFC 2616
    373         return ((ch == '\t') || Character.isSpaceChar(ch));
    374     }
    375 
    376 
    377     /**
    378      * Checks whether a character is a valid token character.
    379      * Whitespace, control characters, and HTTP separators are not
    380      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
    381      * defines tokens only for the US-ASCII character set, this
    382      * method extends the definition to other character sets.
    383      *
    384      * @param ch        the character to check
    385      *
    386      * @return  <code>true</code> if the character is a valid token start,
    387      *          <code>false</code> otherwise
    388      */
    389     protected boolean isTokenChar(char ch) {
    390 
    391         // common sense extension of ALPHA + DIGIT
    392         if (Character.isLetterOrDigit(ch))
    393             return true;
    394 
    395         // common sense extension of CTL
    396         if (Character.isISOControl(ch))
    397             return false;
    398 
    399         // no common sense extension for this
    400         if (isHttpSeparator(ch))
    401             return false;
    402 
    403         // RFC 2616, section 2.2 defines a token character as
    404         // "any CHAR except CTLs or separators". The controls
    405         // and separators are included in the checks above.
    406         // This will yield unexpected results for Unicode format characters.
    407         // If that is a problem, overwrite isHttpSeparator(char) to filter
    408         // out the false positives.
    409         return true;
    410     }
    411 
    412 
    413     /**
    414      * Checks whether a character is an HTTP separator.
    415      * The implementation in this class checks only for the HTTP separators
    416      * defined in RFC 2616, section 2.2. If you need to detect other
    417      * separators beyond the US-ASCII character set, override this method.
    418      *
    419      * @param ch        the character to check
    420      *
    421      * @return  <code>true</code> if the character is an HTTP separator
    422      */
    423     protected boolean isHttpSeparator(char ch) {
    424         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
    425     }
    426 
    427 
    428 } // class BasicTokenIterator
    429 
    430