/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.mime4j;

import com.android.mail.utils.LoggingInputStream;

import org.apache.james.mime4j.decoder.Base64InputStream;
import org.apache.james.mime4j.decoder.QuotedPrintableInputStream;

import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.LinkedList;

/**
 * <p>
 * Parses MIME (or RFC822) message streams of bytes or characters and reports
 * parsing events to a <code>ContentHandler</code> instance.
 * </p>
 * <p>
 * Typical usage:<br/>
 * <pre>
 *      ContentHandler handler = new MyHandler();
 *      MimeStreamParser parser = new MimeStreamParser();
 *      parser.setContentHandler(handler);
 *      parser.parse(new BufferedInputStream(new FileInputStream("mime.msg")));
 * </pre>
 * <strong>NOTE:</strong> All lines must end with CRLF
 * (<code>\r\n</code>). If you are unsure of the line endings in your stream
 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance.
 *
 *
 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $
 */
public class MimeStreamParser {
    private static final Log log = LogFactory.getLog(MimeStreamParser.class);

    private static final boolean DEBUG_LOG_MESSAGE = false;  //DO NOT RELEASE AS 'TRUE'

    /**
     * Characters accepted in a header field name: 0x21-0x7e except
     * <code>':'</code> (0x3a). The same set is used to decide whether the
     * character following a CRLF starts a new field or continues the
     * current one.
     */
    private static final BitSet fieldChars = new BitSet();

    /** Root of the stream being parsed; <code>null</code> until {@link #parse(InputStream)} is called. */
    private RootInputStream rootStream = null;
    /** Stack of descriptors of the enclosing entities, innermost first. */
    private final LinkedList<BodyDescriptor> bodyDescriptors = new LinkedList<BodyDescriptor>();
    private ContentHandler handler = null;
    private boolean raw = false;

    static {
        for (int i = 0x21; i <= 0x39; i++) {
            fieldChars.set(i);
        }
        for (int i = 0x3b; i <= 0x7e; i++) {
            fieldChars.set(i);
        }
    }

    /**
     * Creates a new <code>MimeStreamParser</code> instance.
     */
    public MimeStreamParser() {
    }

    /**
     * Parses a stream of bytes containing a MIME message.
     *
     * @param is the stream to parse.
     * @throws IOException on I/O errors.
     */
    public void parse(InputStream is) throws IOException {
        if (DEBUG_LOG_MESSAGE) {
            is = new LoggingInputStream(is, "MIME", true);
        }
        rootStream = new RootInputStream(is);
        parseMessage(rootStream);
    }

    /**
     * Determines if this parser is currently in raw mode.
     *
     * @return <code>true</code> if in raw mode, <code>false</code>
     *         otherwise.
     * @see #setRaw(boolean)
     */
    public boolean isRaw() {
        return raw;
    }

    /**
     * Enables or disables raw mode. In raw mode all future entities
     * (messages or body parts) in the stream will be reported to the
     * {@link ContentHandler#raw(InputStream)} handler method only.
     * The stream will contain the entire unparsed entity contents
     * including header fields and whatever is in the body.
     *
     * @param raw <code>true</code> enables raw mode, <code>false</code>
     *        disables it.
     */
    public void setRaw(boolean raw) {
        this.raw = raw;
    }

    /**
     * Finishes the parsing and stops reading lines.
     * NOTE: No more lines will be parsed but the parser
     * will still call
     * {@link ContentHandler#endMultipart()},
     * {@link ContentHandler#endBodyPart()},
     * {@link ContentHandler#endMessage()}, etc to match previous calls
     * to
     * {@link ContentHandler#startMultipart(BodyDescriptor)},
     * {@link ContentHandler#startBodyPart()},
     * {@link ContentHandler#startMessage()}, etc.
     * <p>
     * Calling this method before {@link #parse(InputStream)} has been
     * invoked has no effect.
     */
    public void stop() {
        // rootStream is only assigned in parse(); there is nothing to
        // truncate if parsing has not started yet.
        if (rootStream != null) {
            rootStream.truncate();
        }
    }

    /**
     * Parses an entity which consists of a header followed by a body containing
     * arbitrary data, body parts or an embedded message.
     *
     * @param is the stream to parse.
     * @throws IOException on I/O errors.
     */
    private void parseEntity(InputStream is) throws IOException {
        BodyDescriptor bd = parseHeader(is);

        if (bd.isMultipart()) {
            bodyDescriptors.addFirst(bd);

            handler.startMultipart(bd);

            MimeBoundaryInputStream tempIs =
                new MimeBoundaryInputStream(is, bd.getBoundary());
            handler.preamble(new CloseShieldInputStream(tempIs));
            // Skip whatever the handler left unread before looking for parts.
            tempIs.consume();

            while (tempIs.hasMoreParts()) {
                tempIs = new MimeBoundaryInputStream(is, bd.getBoundary());
                parseBodyPart(tempIs);
                tempIs.consume();
                if (tempIs.parentEOF()) {
//                    if (log.isWarnEnabled()) {
//                        log.warn("Line " + rootStream.getLineNumber()
//                                + ": Body part ended prematurely. "
//                                + "Higher level boundary detected or "
//                                + "EOF reached.");
//                    }
                    break;
                }
            }

            handler.epilogue(new CloseShieldInputStream(is));

            handler.endMultipart();

            bodyDescriptors.removeFirst();

        } else if (bd.isMessage()) {
            // An embedded message may itself be transfer-encoded; decode it
            // before parsing it as a nested message.
            if (bd.isBase64Encoded()) {
                log.warn("base64 encoded message/rfc822 detected");
                is = new EOLConvertingInputStream(
                        new Base64InputStream(is));
            } else if (bd.isQuotedPrintableEncoded()) {
                log.warn("quoted-printable encoded message/rfc822 detected");
                is = new EOLConvertingInputStream(
                        new QuotedPrintableInputStream(is));
            }
            bodyDescriptors.addFirst(bd);
            parseMessage(is);
            bodyDescriptors.removeFirst();
        } else {
            handler.body(bd, new CloseShieldInputStream(is));
        }

        /*
         * Make sure the stream has been consumed.
         */
        while (is.read() != -1) {
        }
    }

    /**
     * Reports a message to the handler. In raw mode the stream is handed
     * to {@link ContentHandler#raw(InputStream)} unparsed; otherwise the
     * entity is parsed between <code>startMessage()</code> and
     * <code>endMessage()</code> events.
     *
     * @param is the stream positioned at the start of the message.
     * @throws IOException on I/O errors.
     */
    private void parseMessage(InputStream is) throws IOException {
        if (raw) {
            handler.raw(new CloseShieldInputStream(is));
        } else {
            handler.startMessage();
            parseEntity(is);
            handler.endMessage();
        }
    }

    /**
     * Reports a body part to the handler. In raw mode the stream is handed
     * to {@link ContentHandler#raw(InputStream)} unparsed; otherwise the
     * entity is parsed between <code>startBodyPart()</code> and
     * <code>endBodyPart()</code> events.
     *
     * @param is the stream positioned at the start of the body part.
     * @throws IOException on I/O errors.
     */
    private void parseBodyPart(InputStream is) throws IOException {
        if (raw) {
            handler.raw(new CloseShieldInputStream(is));
        } else {
            handler.startBodyPart();
            parseEntity(is);
            handler.endBodyPart();
        }
    }

    /**
     * Parses a header.
     * <p>
     * The header is first read up to (and excluding) the blank line that
     * terminates it, then split into fields at each CRLF that is followed
     * by a field-name character (or by the end of the header). A CRLF
     * followed by any other character is kept as part of the current
     * field. Each valid field is reported to the handler and added to the
     * returned descriptor; invalid fields are logged and skipped.
     *
     * @param is the stream to parse.
     * @return a <code>BodyDescriptor</code> describing the body following
     *         the header.
     * @throws IOException on I/O errors.
     */
    private BodyDescriptor parseHeader(InputStream is) throws IOException {
        BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty()
                        ? null : bodyDescriptors.getFirst());

        handler.startHeader();

        int lineNumber = rootStream.getLineNumber();

        StringBuilder sb = new StringBuilder();
        int curr = 0;
        int prev = 0;
        while ((curr = is.read()) != -1) {
            if (curr == '\n' && (prev == '\n' || prev == 0)) {
                /*
                 * [\r]\n[\r]\n or an immediate [\r]\n have been seen.
                 * Drop the last buffered character, which belongs to the
                 * blank line. The buffer is empty when the entity starts
                 * with a bare \n, so the delete must be guarded.
                 */
                if (sb.length() > 0) {
                    sb.deleteCharAt(sb.length() - 1);
                }
                break;
            }
            sb.append((char) curr);
            // \r is ignored for the purpose of blank-line detection.
            prev = curr == '\r' ? prev : curr;
        }

//        if (curr == -1 && log.isWarnEnabled()) {
//            log.warn("Line " + rootStream.getLineNumber()
//                    + ": Unexpected end of headers detected. "
//                    + "Boundary detected in header or EOF reached.");
//        }

        int start = 0;
        int pos = 0;
        int startLineNumber = lineNumber;
        while (pos < sb.length()) {
            // Advance to the next \r (or to the end of the buffer).
            while (pos < sb.length() && sb.charAt(pos) != '\r') {
                pos++;
            }
            // A \r that is not followed by \n is ordinary field content.
            if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') {
                pos++;
                continue;
            }

            // The field ends here if this is the end of the header or the
            // next line starts with a field-name character; otherwise the
            // next line continues the current field.
            if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) {

                /*
                 * field should be the complete field data excluding the
                 * trailing \r\n.
                 */
                String field = sb.substring(start, pos);
                start = pos + 2;

                /*
                 * Check for a valid field.
                 */
                int index = field.indexOf(':');
                boolean valid = false;
                if (index != -1 && fieldChars.get(field.charAt(0))) {
                    valid = true;
                    String fieldName = field.substring(0, index).trim();
                    for (int i = 0; i < fieldName.length(); i++) {
                        if (!fieldChars.get(fieldName.charAt(i))) {
                            valid = false;
                            break;
                        }
                    }

                    if (valid) {
                        handler.field(field);
                        bd.addField(fieldName, field.substring(index + 1));
                    }
                }

                if (!valid && log.isWarnEnabled()) {
                    log.warn("Line " + startLineNumber
                            + ": Ignoring invalid field: '" + field.trim() + "'");
                }

                startLineNumber = lineNumber;
            }

            pos += 2;
            lineNumber++;
        }

        handler.endHeader();

        return bd;
    }

    /**
     * Sets the <code>ContentHandler</code> to use when reporting
     * parsing events.
     *
     * @param h the <code>ContentHandler</code>.
     */
    public void setContentHandler(ContentHandler h) {
        this.handler = h;
    }

}