Home | History | Annotate | Download | only in impl
      1 // =================================================================================================
      2 // ADOBE SYSTEMS INCORPORATED
      3 // Copyright 2006 Adobe Systems Incorporated
      4 // All Rights Reserved
      5 //
      6 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
      7 // of the Adobe license agreement accompanying it.
      8 // =================================================================================================
      9 
     10 package com.adobe.xmp.impl;
     11 
     12 import java.io.IOException;
     13 import java.io.InputStream;
     14 import java.io.InputStreamReader;
     15 import java.io.Reader;
     16 import java.io.StringReader;
     17 import java.io.UnsupportedEncodingException;
     18 
     19 import javax.xml.XMLConstants;
     20 import javax.xml.parsers.DocumentBuilder;
     21 import javax.xml.parsers.DocumentBuilderFactory;
     22 import javax.xml.parsers.ParserConfigurationException;
     23 
     24 import org.w3c.dom.Document;
     25 import org.w3c.dom.Node;
     26 import org.w3c.dom.NodeList;
     27 import org.w3c.dom.ProcessingInstruction;
     28 import org.xml.sax.InputSource;
     29 import org.xml.sax.SAXException;
     30 
     31 import com.adobe.xmp.XMPConst;
     32 import com.adobe.xmp.XMPError;
     33 import com.adobe.xmp.XMPException;
     34 import com.adobe.xmp.XMPMeta;
     35 import com.adobe.xmp.options.ParseOptions;
     36 
     37 
     38 /**
     39  * This class replaces the <code>ExpatAdapter.cpp</code> and does the
     40  * XML-parsing and fixes the prefix. After the parsing several normalisations
     41  * are applied to the XMPTree.
     42  *
     43  * @since 01.02.2006
     44  */
     45 public class XMPMetaParser
     46 {
     47 	/**  */
     48 	private static final Object XMP_RDF = new Object();
     49 	/** the DOM Parser Factory, options are set */
     50 	private static DocumentBuilderFactory factory = createDocumentBuilderFactory();
     51 
     52 	/**
     53 	 * Hidden constructor, initialises the SAX parser handler.
     54 	 */
     55 	private XMPMetaParser()
     56 	{
     57 		// EMPTY
     58 	}
     59 
     60 
     61 
     62 	/**
     63 	 * Parses the input source into an XMP metadata object, including
     64 	 * de-aliasing and normalisation.
     65 	 *
     66 	 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
     67 	 * 			a byte buffer containing the XMP packet.
     68 	 * @param options the parse options
     69 	 * @return Returns the resulting XMP metadata object
     70 	 * @throws XMPException Thrown if parsing or normalisation fails.
     71 	 */
     72 	public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
     73 	{
     74 		ParameterAsserts.assertNotNull(input);
     75 		options = options != null ? options : new ParseOptions();
     76 
     77 		Document document = parseXml(input, options);
     78 
     79 		boolean xmpmetaRequired = options.getRequireXMPMeta();
     80 		Object[] result = new Object[3];
     81 		result = findRootNode(document, xmpmetaRequired, result);
     82 
     83 		if (result != null  &&  result[1] == XMP_RDF)
     84 		{
     85 			XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]);
     86 			xmp.setPacketHeader((String) result[2]);
     87 
     88 			// Check if the XMP object shall be normalized
     89 			if (!options.getOmitNormalization())
     90 			{
     91 				return XMPNormalizer.process(xmp, options);
     92 			}
     93 			else
     94 			{
     95 				return xmp;
     96 			}
     97 		}
     98 		else
     99 		{
    100 			// no appropriate root node found, return empty metadata object
    101 			return new XMPMetaImpl();
    102 		}
    103 	}
    104 
    105 
    106 	/**
    107 	 * Parses the raw XML metadata packet considering the parsing options.
    108 	 * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
    109 	 * (some old toolkits versions such packets). The stream is
    110 	 * then wrapped in another stream that converts Latin-1 to UTF-8.
    111 	 * <p>
    112 	 * If control characters shall be fixed, a reader is used that fixes the chars to spaces
    113 	 * (if the input is a byte stream is has to be read as character stream).
    114 	 * <p>
    115 	 * Both options reduce the performance of the parser.
    116 	 *
    117 	 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
    118 	 * 			a byte buffer containing the XMP packet.
    119 	 * @param options the parsing options
    120 	 * @return Returns the parsed XML document or an exception.
    121 	 * @throws XMPException Thrown if the parsing fails for different reasons
    122 	 */
    123 	private static Document parseXml(Object input, ParseOptions options)
    124 			throws XMPException
    125 	{
    126 		if (input instanceof InputStream)
    127 		{
    128 			return parseXmlFromInputStream((InputStream) input, options);
    129 		}
    130 		else if (input instanceof byte[])
    131 		{
    132 			return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options);
    133 		}
    134 		else
    135 		{
    136 			return parseXmlFromString((String) input, options);
    137 		}
    138 	}
    139 
    140 
    141 	/**
    142 	 * Parses XML from an {@link InputStream},
    143 	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
    144 	 *
    145 	 * @param stream an <code>InputStream</code>
    146 	 * @param options the parsing options
    147 	 * @return Returns an XML DOM-Document.
    148 	 * @throws XMPException Thrown when the parsing fails.
    149 	 */
    150 	private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options)
    151 			throws XMPException
    152 	{
    153 		if (!options.getAcceptLatin1()  &&  !options.getFixControlChars())
    154 		{
    155 			return parseInputSource(new InputSource(stream));
    156 		}
    157 		else
    158 		{
    159 			// load stream into bytebuffer
    160 			try
    161 			{
    162 				ByteBuffer buffer = new ByteBuffer(stream);
    163 				return parseXmlFromBytebuffer(buffer, options);
    164 			}
    165 			catch (IOException e)
    166 			{
    167 				throw new XMPException("Error reading the XML-file",
    168 						XMPError.BADSTREAM, e);
    169 			}
    170 		}
    171 	}
    172 
    173 
    174 	/**
    175 	 * Parses XML from a byte buffer,
    176 	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
    177 	 *
    178 	 * @param buffer a byte buffer containing the XMP packet
    179 	 * @param options the parsing options
    180 	 * @return Returns an XML DOM-Document.
    181 	 * @throws XMPException Thrown when the parsing fails.
    182 	 */
    183 	private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options)
    184 		throws XMPException
    185 	{
    186 		InputSource source = new InputSource(buffer.getByteStream());
    187 		try
    188 		{
    189 			return parseInputSource(source);
    190 		}
    191 		catch (XMPException e)
    192 		{
    193 			if (e.getErrorCode() == XMPError.BADXML  ||
    194 				e.getErrorCode() == XMPError.BADSTREAM)
    195 			{
    196 				if (options.getAcceptLatin1())
    197 				{
    198 					buffer = Latin1Converter.convert(buffer);
    199 				}
    200 
    201 				if (options.getFixControlChars())
    202 				{
    203 					try
    204 					{
    205 						String encoding = buffer.getEncoding();
    206 						Reader fixReader = new FixASCIIControlsReader(
    207 							new InputStreamReader(
    208 								buffer.getByteStream(), encoding));
    209 						return parseInputSource(new InputSource(fixReader));
    210 					}
    211 					catch (UnsupportedEncodingException e1)
    212 					{
    213 						// can normally not happen as the encoding is provided by a util function
    214 						throw new XMPException("Unsupported Encoding",
    215 								XMPError.INTERNALFAILURE, e);
    216 					}
    217 				}
    218 				source = new InputSource(buffer.getByteStream());
    219 				return parseInputSource(source);
    220 			}
    221 			else
    222 			{
    223 				throw e;
    224 			}
    225 		}
    226 	}
    227 
    228 
    229 	/**
    230 	 * Parses XML from a {@link String},
    231 	 * fixing the illegal control character optionally.
    232 	 *
    233 	 * @param input a <code>String</code> containing the XMP packet
    234 	 * @param options the parsing options
    235 	 * @return Returns an XML DOM-Document.
    236 	 * @throws XMPException Thrown when the parsing fails.
    237 	 */
    238 	private static Document parseXmlFromString(String input, ParseOptions options)
    239 			throws XMPException
    240 	{
    241 		InputSource source = new InputSource(new StringReader(input));
    242 		try
    243 		{
    244 			return parseInputSource(source);
    245 		}
    246 		catch (XMPException e)
    247 		{
    248 			if (e.getErrorCode() == XMPError.BADXML  &&  options.getFixControlChars())
    249 			{
    250 				source = new InputSource(new FixASCIIControlsReader(new StringReader(input)));
    251 				return parseInputSource(source);
    252 			}
    253 			else
    254 			{
    255 				throw e;
    256 			}
    257 		}
    258 	}
    259 
    260 
    261 	/**
    262 	 * Runs the XML-Parser.
    263 	 * @param source an <code>InputSource</code>
    264 	 * @return Returns an XML DOM-Document.
    265 	 * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException.
    266 	 */
    267 	private static Document parseInputSource(InputSource source) throws XMPException
    268 	{
    269 		try
    270 		{
    271 			DocumentBuilder builder = factory.newDocumentBuilder();
    272 			builder.setErrorHandler(null);
    273 			return builder.parse(source);
    274 		}
    275 		catch (SAXException e)
    276 		{
    277 			throw new XMPException("XML parsing failure", XMPError.BADXML, e);
    278 		}
    279 		catch (ParserConfigurationException e)
    280 		{
    281 			throw new XMPException("XML Parser not correctly configured",
    282 					XMPError.UNKNOWN, e);
    283 		}
    284 		catch (IOException e)
    285 		{
    286 			throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
    287 		}
    288 	}
    289 
    290 
    291 	/**
    292 	 * Find the XML node that is the root of the XMP data tree. Generally this
    293 	 * will be an outer node, but it could be anywhere if a general XML document
    294 	 * is parsed (e.g. SVG). The XML parser counted all rdf:RDF and
    295 	 * pxmp:XMP_Packet nodes, and kept a pointer to the last one. If there is
    296 	 * more than one possible root use PickBestRoot to choose among them.
    297 	 * <p>
    298 	 * If there is a root node, try to extract the version of the previous XMP
    299 	 * toolkit.
    300 	 * <p>
    301 	 * Pick the first x:xmpmeta among multiple root candidates. If there aren't
    302 	 * any, pick the first bare rdf:RDF if that is allowed. The returned root is
    303 	 * the rdf:RDF child if an x:xmpmeta element was chosen. The search is
    304 	 * breadth first, so a higher level candiate is chosen over a lower level
    305 	 * one that was textually earlier in the serialized XML.
    306 	 *
    307 	 * @param root the root of the xml document
    308 	 * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set
    309 	 * 		initially to <code>true</code>, if the parse option "REQUIRE_XMP_META" is set
    310 	 * @param result The result array that is filled during the recursive process.
    311 	 * @return Returns an array that contains the result or <code>null</code>.
    312 	 * 		   The array contains:
    313 	 * <ol>
    314 	 * 		<li>the rdf:RDF-node
    315 	 * 		<li>an object that is either XMP_RDF or XMP_PLAIN
    316 	 * 		<li>a flag that is true if a <?xpacket..> processing instruction has been found
    317 	 * 		<li>the body text of the xpacket-instruction.
    318 	 * </ol>
    319 	 *
    320 	 */
    321 	private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result)
    322 	{
    323 		// Look among this parent's content for x:xapmeta or x:xmpmeta.
    324 		// The recursion for x:xmpmeta is broader than the strictly defined choice,
    325 		// but gives us smaller code.
    326 		NodeList children = root.getChildNodes();
    327 		for (int i = 0; i < children.getLength(); i++)
    328 		{
    329 			root = children.item(i);
    330 			if (Node.PROCESSING_INSTRUCTION_NODE == root.getNodeType()  &&
    331 				((ProcessingInstruction) root).getTarget() == XMPConst.XMP_PI)
    332 			{
    333 				// Store the processing instructions content
    334 				if (result != null)
    335 				{
    336 					result[2] = ((ProcessingInstruction) root).getData();
    337 				}
    338 			}
    339 			else if (Node.TEXT_NODE != root.getNodeType()  &&
    340 				Node.PROCESSING_INSTRUCTION_NODE != root.getNodeType())
    341 			{
    342 				String rootNS = root.getNamespaceURI();
    343 				String rootLocal = root.getLocalName();
    344 				if (
    345 						(
    346 							XMPConst.TAG_XMPMETA.equals(rootLocal)  ||
    347 							XMPConst.TAG_XAPMETA.equals(rootLocal)
    348 						)  &&
    349 						XMPConst.NS_X.equals(rootNS)
    350 				   )
    351 				{
    352 					// by not passing the RequireXMPMeta-option, the rdf-Node will be valid
    353 					return findRootNode(root, false, result);
    354 				}
    355 				else if (!xmpmetaRequired  &&
    356 						"RDF".equals(rootLocal)  &&
    357 						 XMPConst.NS_RDF.equals(rootNS))
    358 				{
    359 					if (result != null)
    360 					{
    361 						result[0] = root;
    362 						result[1] = XMP_RDF;
    363 					}
    364 					return result;
    365 				}
    366 				else
    367 				{
    368 					// continue searching
    369 					Object[] newResult = findRootNode(root, xmpmetaRequired, result);
    370 					if (newResult != null)
    371 					{
    372 						return newResult;
    373 					}
    374 					else
    375 					{
    376 						continue;
    377 					}
    378 				}
    379 			}
    380 		}
    381 
    382 		// no appropriate node has been found
    383 		return null;
    384 		//     is extracted here in the C++ Toolkit
    385 	}
    386 
    387 
    388 	/**
    389 	 * @return Creates, configures and returnes the document builder factory for
    390 	 *         the Metadata Parser.
    391 	 */
    392 	private static DocumentBuilderFactory createDocumentBuilderFactory()
    393 	{
    394 		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    395 		factory.setNamespaceAware(true);
    396 		factory.setIgnoringComments(true);
    397 
    398 		try
    399 		{
    400 			// honor System parsing limits, e.g.
    401 			// System.setProperty("entityExpansionLimit", "10");
    402 			factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
    403 		}
    404 		catch (Exception e)
    405 		{
    406 			// Ignore IllegalArgumentException and ParserConfigurationException
    407 			// in case the configured XML-Parser does not implement the feature.
    408 		}
    409 		return factory;
    410 	}
    411 }