Home | History | Annotate | Download | only in tagsoup
      1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
      2 //
      3 // TagSoup is licensed under the Apache License,
      4 // Version 2.0.  You may obtain a copy of this license at
      5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
      6 // additional legal rights not granted by this license.
      7 //
      8 // TagSoup is distributed in the hope that it will be useful, but
      9 // unless required by applicable law or agreed to in writing, TagSoup
     10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
     11 // OF ANY KIND, either express or implied; not even the implied warranty
     12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     13 //
     14 //
     15 // The TagSoup command line UI
     16 
     17 package org.ccil.cowan.tagsoup;
     18 import java.util.Hashtable;
     19 import java.util.Enumeration;
     20 import java.io.*;
     21 import java.net.URL;
     22 import java.net.URLConnection;
     23 import org.xml.sax.*;
     24 import org.xml.sax.helpers.DefaultHandler;
     25 import org.xml.sax.ext.LexicalHandler;
     26 
     27 
     28 /**
     29 The stand-alone TagSoup program.
     30 **/
     31 public class CommandLine {
     32 
     33 	static Hashtable options = new Hashtable(); static {
     34 		options.put("--nocdata", Boolean.FALSE); // CDATA elements are normal
     35 		options.put("--files", Boolean.FALSE);	// process arguments as separate files
     36 		options.put("--reuse", Boolean.FALSE);	// reuse a single Parser
     37 		options.put("--nons", Boolean.FALSE);	// no namespaces
     38 		options.put("--nobogons", Boolean.FALSE);  // suppress unknown elements
     39 		options.put("--any", Boolean.FALSE);	// unknowns have ANY content model
     40 		options.put("--emptybogons", Boolean.FALSE);	// unknowns have EMPTY content model
     41 		options.put("--norootbogons", Boolean.FALSE);	// unknowns can't be the root
     42 		options.put("--pyxin", Boolean.FALSE);	// input is PYX
     43 		options.put("--lexical", Boolean.FALSE); // output comments
     44 		options.put("--pyx", Boolean.FALSE);	// output is PYX
     45 		options.put("--html", Boolean.FALSE);	// output is HTML
     46 		options.put("--method=", Boolean.FALSE); // output method
     47 		options.put("--doctype-public=", Boolean.FALSE); // override public id
     48 		options.put("--doctype-system=", Boolean.FALSE); // override system id
     49 		options.put("--output-encoding=", Boolean.FALSE); // output encoding
     50 		options.put("--omit-xml-declaration", Boolean.FALSE); // omit XML decl
     51 		options.put("--encoding=", Boolean.FALSE); // specify encoding
     52 		options.put("--help", Boolean.FALSE); 	// display help
     53 		options.put("--version", Boolean.FALSE);	// display version
     54 		options.put("--nodefaults", Boolean.FALSE); // no default attrs
     55 		options.put("--nocolons", Boolean.FALSE); // colon to underscore
     56 		options.put("--norestart", Boolean.FALSE); // no restartable elements
     57 		options.put("--ignorable", Boolean.FALSE);  // return ignorable whitespace
     58 		}
     59 
     60 	/**
     61 	Main method.  Processes specified files or standard input.
     62 	**/
     63 
     64 	public static void main(String[] argv) throws IOException, SAXException {
     65 		int optind = getopts(options, argv);
     66 		if (hasOption(options, "--help")) {
     67 			doHelp();
     68 			return;
     69 			}
     70 		if (hasOption(options, "--version")) {
     71 			System.err.println("TagSoup version 1.2");
     72 			return;
     73 			}
     74 		if (argv.length == optind) {
     75 			process("", System.out);
     76 			}
     77 		else if (hasOption(options, "--files")) {
     78 			for (int i = optind; i < argv.length; i++) {
     79 				String src = argv[i];
     80 				String dst;
     81 				int j = src.lastIndexOf('.');
     82 				if (j == -1)
     83 					dst = src + ".xhtml";
     84 				else if (src.endsWith(".xhtml"))
     85 					dst = src + "_";
     86 				else
     87 					dst = src.substring(0, j) + ".xhtml";
     88 				System.err.println("src: " + src + " dst: " + dst);
     89 				OutputStream os = new FileOutputStream(dst);
     90 				process(src, os);
     91 				}
     92 			}
     93 		else {
     94 			for (int i = optind; i < argv.length; i++) {
     95 				System.err.println("src: " + argv[i]);
     96 				process(argv[i], System.out);
     97 				}
     98 			}
     99 		}
    100 
    101 	// Print the help message
    102 
    103 	private static void doHelp() {
    104 		System.err.print("usage: java -jar tagsoup-*.jar ");
    105 		System.err.print(" [ ");
    106 		boolean first = true;
    107 		for (Enumeration e = options.keys(); e.hasMoreElements(); ) {
    108 			if (!first) {
    109 				System.err.print("| ");
    110 				}
    111 			first = false;
    112 			String key = (String)(e.nextElement());
    113 			System.err.print(key);
    114 			if (key.endsWith("="))
    115 				System.err.print("?");
    116 				System.err.print(" ");
    117 			}
    118 		System.err.println("]*");
    119 	}
    120 
    121 	private static Parser theParser = null;
    122 	private static HTMLSchema theSchema = null;
    123 	private static String theOutputEncoding = null;
    124 
    125 	// Process one source onto an output stream.
    126 
    127 	private static void process(String src, OutputStream os)
    128 			throws IOException, SAXException {
    129 		XMLReader r;
    130 		if (hasOption(options, "--reuse")) {
    131 			if (theParser == null) theParser = new Parser();
    132 			r = theParser;
    133 			}
    134 		else {
    135 			r = new Parser();
    136 			}
    137 		theSchema = new HTMLSchema();
    138 		r.setProperty(Parser.schemaProperty, theSchema);
    139 
    140 		if (hasOption(options, "--nocdata")) {
    141 			r.setFeature(Parser.CDATAElementsFeature, false);
    142 			}
    143 
    144 		if (hasOption(options, "--nons") || hasOption(options, "--html")) {
    145 			r.setFeature(Parser.namespacesFeature, false);
    146 			}
    147 
    148 		if (hasOption(options, "--nobogons")) {
    149 			r.setFeature(Parser.ignoreBogonsFeature, true);
    150 			}
    151 
    152 		if (hasOption(options, "--any")) {
    153 			r.setFeature(Parser.bogonsEmptyFeature, false);
    154 			}
    155 		else if (hasOption(options, "--emptybogons")) {
    156 			r.setFeature(Parser.bogonsEmptyFeature, true);
    157 			}
    158 
    159 		if (hasOption(options, "--norootbogons")) {
    160 			r.setFeature(Parser.rootBogonsFeature, false);
    161 			}
    162 
    163 		if (hasOption(options, "--nodefaults")) {
    164 			r.setFeature(Parser.defaultAttributesFeature, false);
    165 			}
    166 		if (hasOption(options, "--nocolons")) {
    167 			r.setFeature(Parser.translateColonsFeature, true);
    168 			}
    169 
    170 		if (hasOption(options, "--norestart")) {
    171 			r.setFeature(Parser.restartElementsFeature, false);
    172 			}
    173 
    174 		if (hasOption(options, "--ignorable")) {
    175 			r.setFeature(Parser.ignorableWhitespaceFeature, true);
    176 			}
    177 
    178 		if (hasOption(options, "--pyxin")) {
    179 			r.setProperty(Parser.scannerProperty, new PYXScanner());
    180 			}
    181 
    182 		Writer w;
    183 		if (theOutputEncoding == null) {
    184 			w = new OutputStreamWriter(os);
    185 			}
    186 		else {
    187 			w = new OutputStreamWriter(os, theOutputEncoding);
    188 			}
    189 		ContentHandler h = chooseContentHandler(w);
    190 		r.setContentHandler(h);
    191 		if (hasOption(options, "--lexical") && h instanceof LexicalHandler) {
    192 			r.setProperty(Parser.lexicalHandlerProperty, h);
    193 			}
    194 		InputSource s = new InputSource();
    195 		if (src != "") {
    196 			s.setSystemId(src);
    197 			}
    198 		else {
    199 			s.setByteStream(System.in);
    200 			}
    201 		if (hasOption(options, "--encoding=")) {
    202 //			System.out.println("%% Found --encoding");
    203 			String encoding = (String)options.get("--encoding=");
    204 			if (encoding != null) s.setEncoding(encoding);
    205 			}
    206 		r.parse(s);
    207 		}
    208 
    209 	// Pick a content handler to generate the desired format.
    210 
    211 	private static ContentHandler chooseContentHandler(Writer w) {
    212 		XMLWriter x;
    213 		if (hasOption(options, "--pyx")) {
    214 			return new PYXWriter(w);
    215 			}
    216 
    217 		x = new XMLWriter(w);
    218 		if (hasOption(options, "--html")) {
    219 			x.setOutputProperty(XMLWriter.METHOD, "html");
    220 			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
    221 			}
    222 		if (hasOption(options, "--method=")) {
    223 			String method = (String)options.get("--method=");
    224 			if (method != null) {
    225 				x.setOutputProperty(XMLWriter.METHOD, method);
    226 				}
    227 			}
    228 		if (hasOption(options, "--doctype-public=")) {
    229 			String doctype_public = (String)options.get("--doctype-public=");
    230 			if (doctype_public != null) {
    231 				x.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC, doctype_public);
    232 				}
    233 			}
    234 		if (hasOption(options, "--doctype-system=")) {
    235 			String doctype_system = (String)options.get("--doctype-system=");
    236 			if (doctype_system != null) {
    237 				x.setOutputProperty(XMLWriter.DOCTYPE_SYSTEM, doctype_system);
    238 				}
    239 			}
    240 		if (hasOption(options, "--output-encoding=")) {
    241 			theOutputEncoding = (String)options.get("--output-encoding=");
    242 //			System.err.println("%%%% Output encoding is " + theOutputEncoding);
    243 			if (theOutputEncoding != null) {
    244 				x.setOutputProperty(XMLWriter.ENCODING, theOutputEncoding);
    245 				}
    246 			}
    247 		if (hasOption(options, "--omit-xml-declaration")) {
    248 			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
    249 			}
    250 		x.setPrefix(theSchema.getURI(), "");
    251 		return x;
    252 		}
    253 
    254 	// Options processing
    255 
    256 	private static int getopts(Hashtable options, String[] argv) {
    257 		int optind;
    258 		for (optind = 0; optind < argv.length; optind++) {
    259 			String arg = argv[optind];
    260 			String value = null;
    261 			if (arg.charAt(0) != '-') break;
    262 			int eqsign = arg.indexOf('=');
    263 			if (eqsign != -1) {
    264 				value = arg.substring(eqsign + 1, arg.length());
    265 				arg = arg.substring(0, eqsign + 1);
    266 				}
    267 			if (options.containsKey(arg)) {
    268 				if (value == null) options.put(arg, Boolean.TRUE);
    269 				else options.put(arg, value);
    270 //				System.out.println("%% Parsed [" + arg + "]=[" + value + "]");
    271 				}
    272 			else {
    273 				System.err.print("Unknown option ");
    274 				System.err.println(arg);
    275 				System.exit(1);
    276 				}
    277 			}
    278 		return optind;
    279 		}
    280 
    281 	// Return true if an option exists.
    282 
    283 	private static boolean hasOption(Hashtable options, String option) {
    284 		if (Boolean.getBoolean(option)) return true;
    285 		else if (options.get(option) != Boolean.FALSE) return true;
    286 		return false;
    287 		}
    288 
    289 	}
    290