/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/
     19 
     20 package org.apache.james.mime4j;
     21 
     22 import org.apache.james.mime4j.decoder.Base64InputStream;
     23 import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;
     24 
     25 import java.io.IOException;
     26 import java.io.InputStream;
     27 import java.util.BitSet;
     28 import java.util.LinkedList;
     29 
     30 /**
     31  * <p>
     32  * Parses MIME (or RFC822) message streams of bytes or characters and reports
     33  * parsing events to a <code>ContentHandler</code> instance.
     34  * </p>
     35  * <p>
     36  * Typical usage:<br/>
     37  * <pre>
     38  *      ContentHandler handler = new MyHandler();
     39  *      MimeStreamParser parser = new MimeStreamParser();
     40  *      parser.setContentHandler(handler);
     41  *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
     42  * </pre>
     43  * <strong>NOTE:</strong> All lines must end with CRLF
     44  * (<code>\r\n</code>). If you are unsure of the line endings in your stream
     45  * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
     46  *
     47  *
     48  * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
     49  */
     50 public class MimeStreamParser {
     51     private static final Log log = LogFactory.getLog(MimeStreamParser.class);
     52 
     53     private static BitSet fieldChars = null;
     54 
     55     private RootInputStream rootStream = null;
     56     private LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
     57     private ContentHandler handler = null;
     58     private boolean raw = false;
     59     private boolean prematureEof = false;
     60 
     61     static {
     62         fieldChars = new BitSet();
     63         for (int i = 0x21; i <= 0x39; i++) {
     64             fieldChars.set(i);
     65         }
     66         for (int i = 0x3b; i <= 0x7e; i++) {
     67             fieldChars.set(i);
     68         }
     69     }
     70 
     71     /**
     72      * Creates a new <code>MimeStreamParser</code> instance.
     73      */
     74     public MimeStreamParser() {
     75     }
     76 
     77     /**
     78      * Parses a stream of bytes containing a MIME message.
     79      *
     80      * @param is the stream to parse.
     81      * @throws IOException on I/O errors.
     82      */
     83     public void parse(InputStream is) throws IOException {
     84         rootStream = new RootInputStream(is);
     85         parseMessage(rootStream);
     86     }
     87 
     88     /**
     89      * Determines if this parser is currently in raw mode.
     90      *
     91      * @return <code>true</code> if in raw mode, <code>false</code>
     92      *         otherwise.
     93      * @see #setRaw(boolean)
     94      */
     95     public boolean isRaw() {
     96         return raw;
     97     }
     98 
     99     /**
    100      * Enables or disables raw mode. In raw mode all future entities
    101      * (messages or body parts) in the stream will be reported to the
    102      * {@link ContentHandler#raw(InputStream)} handler method only.
    103      * The stream will contain the entire unparsed entity contents
    104      * including header fields and whatever is in the body.
    105      *
    106      * @param raw <code>true</code> enables raw mode, <code>false</code>
    107      *        disables it.
    108      */
    109     public void setRaw(boolean raw) {
    110         this.raw = raw;
    111     }
    112 
    113     /**
    114      * Finishes the parsing and stops reading lines.
    115      * NOTE: No more lines will be parsed but the parser
    116      * will still call
    117      * {@link ContentHandler#endMultipart()},
    118      * {@link ContentHandler#endBodyPart()},
    119      * {@link ContentHandler#endMessage()}, etc to match previous calls
    120      * to
    121      * {@link ContentHandler#startMultipart(BodyDescriptor)},
    122      * {@link ContentHandler#startBodyPart()},
    123      * {@link ContentHandler#startMessage()}, etc.
    124      */
    125     public void stop() {
    126         rootStream.truncate();
    127     }
    128 
    129     /**
    130      * Parses an entity which consists of a header followed by a body containing
    131      * arbitrary data, body parts or an embedded message.
    132      *
    133      * @param is the stream to parse.
    134      * @throws IOException on I/O errors.
    135      */
    136     private void parseEntity(InputStream is) throws IOException {
    137         BodyDescriptor bd = parseHeader(is);
    138 
    139         if (bd.isMultipart()) {
    140             bodyDescriptors.addFirst(bd);
    141 
    142             handler.startMultipart(bd);
    143 
    144             MimeBoundaryInputStream tempIs =
    145                 new MimeBoundaryInputStream(is, bd.getBoundary());
    146             handler.preamble(new CloseShieldInputStream(tempIs));
    147             tempIs.consume();
    148 
    149             while (tempIs.hasMoreParts()) {
    150                 tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
    151                 parseBodyPart(tempIs);
    152                 tempIs.consume();
    153                 if (tempIs.parentEOF()) {
    154                     prematureEof = true;
    155 //                    if (log.isWarnEnabled()) {
    156 //                        log.warn("Line " + rootStream.getLineNumber()
    157 //                                + ": Body part ended prematurely. "
    158 //                                + "Higher level boundary detected or "
    159 //                                + "EOF reached.");
    160 //                    }
    161                     break;
    162                 }
    163             }
    164 
    165             handler.epilogue(new CloseShieldInputStream(is));
    166 
    167             handler.endMultipart();
    168 
    169             bodyDescriptors.removeFirst();
    170 
    171         } else if (bd.isMessage()) {
    172             if (bd.isBase64Encoded()) {
    173                 log.warn("base64 encoded message/rfc822 detected");
    174                 is = new EOLConvertingInputStream(
    175                         new Base64InputStream(is));
    176             } else if (bd.isQuotedPrintableEncoded()) {
    177                 log.warn("quoted-printable encoded message/rfc822 detected");
    178                 is = new EOLConvertingInputStream(
    179                         new QuotedPrintableInputStream(is));
    180             }
    181             bodyDescriptors.addFirst(bd);
    182             parseMessage(is);
    183             bodyDescriptors.removeFirst();
    184         } else {
    185             handler.body(bd, new CloseShieldInputStream(is));
    186         }
    187 
    188         /*
    189          * Make sure the stream has been consumed.
    190          */
    191         while (is.read() != -1) {
    192         }
    193     }
    194 
    195     private void parseMessage(InputStream is) throws IOException {
    196         if (raw) {
    197             handler.raw(new CloseShieldInputStream(is));
    198         } else {
    199             handler.startMessage();
    200             parseEntity(is);
    201             handler.endMessage();
    202         }
    203     }
    204 
    205     public boolean getPrematureEof() {
    206         return prematureEof;
    207     }
    208 
    209     private void parseBodyPart(InputStream is) throws IOException {
    210         if (raw) {
    211             handler.raw(new CloseShieldInputStream(is));
    212         } else {
    213             handler.startBodyPart();
    214             parseEntity(is);
    215             handler.endBodyPart();
    216         }
    217     }
    218 
    219     /**
    220      * Parses a header.
    221      *
    222      * @param is the stream to parse.
    223      * @return a <code>BodyDescriptor</code> describing the body following
    224      *         the header.
    225      */
    226     private BodyDescriptor parseHeader(InputStream is) throws IOException {
    227         BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
    228                         ? null : (BodyDescriptor) bodyDescriptors.getFirst());
    229 
    230         handler.startHeader();
    231 
    232         int lineNumber = rootStream.getLineNumber();
    233 
    234         StringBuffer sb = new StringBuffer();
    235         int curr = 0;
    236         int prev = 0;
    237         while ((curr = is.read()) != -1) {
    238             if (curr == '\n' && (prev == '\n' || prev == 0)) {
    239                 /*
    240                  * [\r]\n[\r]\n or an immediate \r\n have been seen.
    241                  */
    242                 sb.deleteCharAt(sb.length() - 1);
    243                 break;
    244             }
    245             sb.append((char) curr);
    246             prev = curr == '\r' ? prev : curr;
    247         }
    248 
    249 //        if (curr == -1 && log.isWarnEnabled()) {
    250 //            log.warn("Line " + rootStream.getLineNumber()
    251 //                    + ": Unexpected end of headers detected. "
    252 //                    + "Boundary detected in header or EOF reached.");
    253 //        }
    254 
    255         int start = 0;
    256         int pos = 0;
    257         int startLineNumber = lineNumber;
    258         while (pos < sb.length()) {
    259             while (pos < sb.length() && sb.charAt(pos) != '\r') {
    260                 pos++;
    261             }
    262             if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
    263                 pos++;
    264                 continue;
    265             }
    266 
    267             if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {
    268 
    269                 /*
    270                  * field should be the complete field data excluding the
    271                  * trailing \r\n.
    272                  */
    273                 String field = sb.substring(start, pos);
    274                 start = pos + 2;
    275 
    276                 /*
    277                  * Check for a valid field.
    278                  */
    279                 int index = field.indexOf(':');
    280                 boolean valid = false;
    281                 if (index != -1 && fieldChars.get(field.charAt(0))) {
    282                     valid = true;
    283                     String fieldName = field.substring(0, index).trim();
    284                     for (int i = 0; i < fieldName.length(); i++) {
    285                         if (!fieldChars.get(fieldName.charAt(i))) {
    286                             valid = false;
    287                             break;
    288                         }
    289                     }
    290 
    291                     if (valid) {
    292                         handler.field(field);
    293                         bd.addField(fieldName, field.substring(index + 1));
    294                     }
    295                 }
    296 
    297                 if (!valid && log.isWarnEnabled()) {
    298                     log.warn("Line " + startLineNumber
    299                             + ": Ignoring invalid field: '" + field.trim() + "'");
    300                 }
    301 
    302                 startLineNumber = lineNumber;
    303             }
    304 
    305             pos += 2;
    306             lineNumber++;
    307         }
    308 
    309         handler.endHeader();
    310 
    311         return bd;
    312     }
    313 
    314     /**
    315      * Sets the <code>ContentHandler</code> to use when reporting
    316      * parsing events.
    317      *
    318      * @param h the <code>ContentHandler</code>.
    319      */
    320     public void setContentHandler(ContentHandler h) {
    321         this.handler = h;
    322     }
    323 
    324 }