1 // ================================================================================================= 2 // ADOBE SYSTEMS INCORPORATED 3 // Copyright 2006 Adobe Systems Incorporated 4 // All Rights Reserved 5 // 6 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms 7 // of the Adobe license agreement accompanying it. 8 // ================================================================================================= 9 10 package com.adobe.xmp.impl; 11 12 import java.io.IOException; 13 import java.io.InputStream; 14 import java.io.InputStreamReader; 15 import java.io.Reader; 16 import java.io.StringReader; 17 import java.io.UnsupportedEncodingException; 18 19 import javax.xml.XMLConstants; 20 import javax.xml.parsers.DocumentBuilder; 21 import javax.xml.parsers.DocumentBuilderFactory; 22 import javax.xml.parsers.ParserConfigurationException; 23 24 import org.w3c.dom.Document; 25 import org.w3c.dom.Node; 26 import org.w3c.dom.NodeList; 27 import org.w3c.dom.ProcessingInstruction; 28 import org.xml.sax.InputSource; 29 import org.xml.sax.SAXException; 30 31 import com.adobe.xmp.XMPConst; 32 import com.adobe.xmp.XMPError; 33 import com.adobe.xmp.XMPException; 34 import com.adobe.xmp.XMPMeta; 35 import com.adobe.xmp.options.ParseOptions; 36 37 38 /** 39 * This class replaces the <code>ExpatAdapter.cpp</code> and does the 40 * XML-parsing and fixes the prefix. After the parsing several normalisations 41 * are applied to the XMPTree. 42 * 43 * @since 01.02.2006 44 */ 45 public class XMPMetaParser 46 { 47 /** */ 48 private static final Object XMP_RDF = new Object(); 49 /** the DOM Parser Factory, options are set */ 50 private static DocumentBuilderFactory factory = createDocumentBuilderFactory(); 51 52 /** 53 * Hidden constructor, initialises the SAX parser handler. 54 */ 55 private XMPMetaParser() 56 { 57 // EMPTY 58 } 59 60 61 62 /** 63 * Parses the input source into an XMP metadata object, including 64 * de-aliasing and normalisation. 65 * 66 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or 67 * a byte buffer containing the XMP packet. 68 * @param options the parse options 69 * @return Returns the resulting XMP metadata object 70 * @throws XMPException Thrown if parsing or normalisation fails. 71 */ 72 public static XMPMeta parse(Object input, ParseOptions options) throws XMPException 73 { 74 ParameterAsserts.assertNotNull(input); 75 options = options != null ? options : new ParseOptions(); 76 77 Document document = parseXml(input, options); 78 79 boolean xmpmetaRequired = options.getRequireXMPMeta(); 80 Object[] result = new Object[3]; 81 result = findRootNode(document, xmpmetaRequired, result); 82 83 if (result != null && result[1] == XMP_RDF) 84 { 85 XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]); 86 xmp.setPacketHeader((String) result[2]); 87 88 // Check if the XMP object shall be normalized 89 if (!options.getOmitNormalization()) 90 { 91 return XMPNormalizer.process(xmp, options); 92 } 93 else 94 { 95 return xmp; 96 } 97 } 98 else 99 { 100 // no appropriate root node found, return empty metadata object 101 return new XMPMetaImpl(); 102 } 103 } 104 105 106 /** 107 * Parses the raw XML metadata packet considering the parsing options. 108 * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream 109 * (some old toolkits versions such packets). The stream is 110 * then wrapped in another stream that converts Latin-1 to UTF-8. 111 * <p> 112 * If control characters shall be fixed, a reader is used that fixes the chars to spaces 113 * (if the input is a byte stream is has to be read as character stream). 114 * <p> 115 * Both options reduce the performance of the parser. 116 * 117 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or 118 * a byte buffer containing the XMP packet. 119 * @param options the parsing options 120 * @return Returns the parsed XML document or an exception. 121 * @throws XMPException Thrown if the parsing fails for different reasons 122 */ 123 private static Document parseXml(Object input, ParseOptions options) 124 throws XMPException 125 { 126 if (input instanceof InputStream) 127 { 128 return parseXmlFromInputStream((InputStream) input, options); 129 } 130 else if (input instanceof byte[]) 131 { 132 return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options); 133 } 134 else 135 { 136 return parseXmlFromString((String) input, options); 137 } 138 } 139 140 141 /** 142 * Parses XML from an {@link InputStream}, 143 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally. 144 * 145 * @param stream an <code>InputStream</code> 146 * @param options the parsing options 147 * @return Returns an XML DOM-Document. 148 * @throws XMPException Thrown when the parsing fails. 149 */ 150 private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options) 151 throws XMPException 152 { 153 if (!options.getAcceptLatin1() && !options.getFixControlChars()) 154 { 155 return parseInputSource(new InputSource(stream)); 156 } 157 else 158 { 159 // load stream into bytebuffer 160 try 161 { 162 ByteBuffer buffer = new ByteBuffer(stream); 163 return parseXmlFromBytebuffer(buffer, options); 164 } 165 catch (IOException e) 166 { 167 throw new XMPException("Error reading the XML-file", 168 XMPError.BADSTREAM, e); 169 } 170 } 171 } 172 173 174 /** 175 * Parses XML from a byte buffer, 176 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally. 177 * 178 * @param buffer a byte buffer containing the XMP packet 179 * @param options the parsing options 180 * @return Returns an XML DOM-Document. 181 * @throws XMPException Thrown when the parsing fails. 182 */ 183 private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options) 184 throws XMPException 185 { 186 InputSource source = new InputSource(buffer.getByteStream()); 187 try 188 { 189 return parseInputSource(source); 190 } 191 catch (XMPException e) 192 { 193 if (e.getErrorCode() == XMPError.BADXML || 194 e.getErrorCode() == XMPError.BADSTREAM) 195 { 196 if (options.getAcceptLatin1()) 197 { 198 buffer = Latin1Converter.convert(buffer); 199 } 200 201 if (options.getFixControlChars()) 202 { 203 try 204 { 205 String encoding = buffer.getEncoding(); 206 Reader fixReader = new FixASCIIControlsReader( 207 new InputStreamReader( 208 buffer.getByteStream(), encoding)); 209 return parseInputSource(new InputSource(fixReader)); 210 } 211 catch (UnsupportedEncodingException e1) 212 { 213 // can normally not happen as the encoding is provided by a util function 214 throw new XMPException("Unsupported Encoding", 215 XMPError.INTERNALFAILURE, e); 216 } 217 } 218 source = new InputSource(buffer.getByteStream()); 219 return parseInputSource(source); 220 } 221 else 222 { 223 throw e; 224 } 225 } 226 } 227 228 229 /** 230 * Parses XML from a {@link String}, 231 * fixing the illegal control character optionally. 232 * 233 * @param input a <code>String</code> containing the XMP packet 234 * @param options the parsing options 235 * @return Returns an XML DOM-Document. 236 * @throws XMPException Thrown when the parsing fails. 237 */ 238 private static Document parseXmlFromString(String input, ParseOptions options) 239 throws XMPException 240 { 241 InputSource source = new InputSource(new StringReader(input)); 242 try 243 { 244 return parseInputSource(source); 245 } 246 catch (XMPException e) 247 { 248 if (e.getErrorCode() == XMPError.BADXML && options.getFixControlChars()) 249 { 250 source = new InputSource(new FixASCIIControlsReader(new StringReader(input))); 251 return parseInputSource(source); 252 } 253 else 254 { 255 throw e; 256 } 257 } 258 } 259 260 261 /** 262 * Runs the XML-Parser. 263 * @param source an <code>InputSource</code> 264 * @return Returns an XML DOM-Document. 265 * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException. 266 */ 267 private static Document parseInputSource(InputSource source) throws XMPException 268 { 269 try 270 { 271 DocumentBuilder builder = factory.newDocumentBuilder(); 272 builder.setErrorHandler(null); 273 return builder.parse(source); 274 } 275 catch (SAXException e) 276 { 277 throw new XMPException("XML parsing failure", XMPError.BADXML, e); 278 } 279 catch (ParserConfigurationException e) 280 { 281 throw new XMPException("XML Parser not correctly configured", 282 XMPError.UNKNOWN, e); 283 } 284 catch (IOException e) 285 { 286 throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e); 287 } 288 } 289 290 291 /** 292 * Find the XML node that is the root of the XMP data tree. Generally this 293 * will be an outer node, but it could be anywhere if a general XML document 294 * is parsed (e.g. SVG). The XML parser counted all rdf:RDF and 295 * pxmp:XMP_Packet nodes, and kept a pointer to the last one. If there is 296 * more than one possible root use PickBestRoot to choose among them. 297 * <p> 298 * If there is a root node, try to extract the version of the previous XMP 299 * toolkit. 300 * <p> 301 * Pick the first x:xmpmeta among multiple root candidates. If there aren't 302 * any, pick the first bare rdf:RDF if that is allowed. The returned root is 303 * the rdf:RDF child if an x:xmpmeta element was chosen. The search is 304 * breadth first, so a higher level candiate is chosen over a lower level 305 * one that was textually earlier in the serialized XML. 306 * 307 * @param root the root of the xml document 308 * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set 309 * initially to <code>true</code>, if the parse option "REQUIRE_XMP_META" is set 310 * @param result The result array that is filled during the recursive process. 311 * @return Returns an array that contains the result or <code>null</code>. 312 * The array contains: 313 * <ol> 314 * <li>the rdf:RDF-node 315 * <li>an object that is either XMP_RDF or XMP_PLAIN 316 * <li>a flag that is true if a <?xpacket..> processing instruction has been found 317 * <li>the body text of the xpacket-instruction. 318 * </ol> 319 * 320 */ 321 private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result) 322 { 323 // Look among this parent's content for x:xapmeta or x:xmpmeta. 324 // The recursion for x:xmpmeta is broader than the strictly defined choice, 325 // but gives us smaller code. 326 NodeList children = root.getChildNodes(); 327 for (int i = 0; i < children.getLength(); i++) 328 { 329 root = children.item(i); 330 if (Node.PROCESSING_INSTRUCTION_NODE == root.getNodeType() && 331 ((ProcessingInstruction) root).getTarget() == XMPConst.XMP_PI) 332 { 333 // Store the processing instructions content 334 if (result != null) 335 { 336 result[2] = ((ProcessingInstruction) root).getData(); 337 } 338 } 339 else if (Node.TEXT_NODE != root.getNodeType() && 340 Node.PROCESSING_INSTRUCTION_NODE != root.getNodeType()) 341 { 342 String rootNS = root.getNamespaceURI(); 343 String rootLocal = root.getLocalName(); 344 if ( 345 ( 346 XMPConst.TAG_XMPMETA.equals(rootLocal) || 347 XMPConst.TAG_XAPMETA.equals(rootLocal) 348 ) && 349 XMPConst.NS_X.equals(rootNS) 350 ) 351 { 352 // by not passing the RequireXMPMeta-option, the rdf-Node will be valid 353 return findRootNode(root, false, result); 354 } 355 else if (!xmpmetaRequired && 356 "RDF".equals(rootLocal) && 357 XMPConst.NS_RDF.equals(rootNS)) 358 { 359 if (result != null) 360 { 361 result[0] = root; 362 result[1] = XMP_RDF; 363 } 364 return result; 365 } 366 else 367 { 368 // continue searching 369 Object[] newResult = findRootNode(root, xmpmetaRequired, result); 370 if (newResult != null) 371 { 372 return newResult; 373 } 374 else 375 { 376 continue; 377 } 378 } 379 } 380 } 381 382 // no appropriate node has been found 383 return null; 384 // is extracted here in the C++ Toolkit 385 } 386 387 388 /** 389 * @return Creates, configures and returnes the document builder factory for 390 * the Metadata Parser. 391 */ 392 private static DocumentBuilderFactory createDocumentBuilderFactory() 393 { 394 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 395 factory.setNamespaceAware(true); 396 factory.setIgnoringComments(true); 397 398 try 399 { 400 // honor System parsing limits, e.g. 401 // System.setProperty("entityExpansionLimit", "10"); 402 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); 403 } 404 catch (Exception e) 405 { 406 // Ignore IllegalArgumentException and ParserConfigurationException 407 // in case the configured XML-Parser does not implement the feature. 408 } 409 return factory; 410 } 411 }