Home | History | Annotate | Download | only in codegen
      1 /*
      2  * [The "BSD license"]
      3  *  Copyright (c) 2010 Terence Parr
      4  *  All rights reserved.
      5  *
      6  *  Redistribution and use in source and binary forms, with or without
      7  *  modification, are permitted provided that the following conditions
      8  *  are met:
      9  *  1. Redistributions of source code must retain the above copyright
     10  *      notice, this list of conditions and the following disclaimer.
     11  *  2. Redistributions in binary form must reproduce the above copyright
     12  *      notice, this list of conditions and the following disclaimer in the
     13  *      documentation and/or other materials provided with the distribution.
     14  *  3. The name of the author may not be used to endorse or promote products
     15  *      derived from this software without specific prior written permission.
     16  *
     17  *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     18  *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     19  *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     22  *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 package org.antlr.codegen;
     29 
     30 import org.antlr.Tool;
     31 import org.antlr.analysis.Label;
     32 import org.antlr.runtime.Token;
     33 import org.stringtemplate.v4.ST;
     34 import org.antlr.tool.Grammar;
     35 
     36 import java.io.IOException;
     37 import java.util.List;
     38 
     39 /** The code generator for ANTLR can usually be retargeted just by providing
     40  *  a new X.stg file for language X, however, sometimes the files that must
     41  *  be generated vary enough that some X-specific functionality is required.
     42  *  For example, in C, you must generate header files whereas in Java you do not.
     43  *  Other languages may want to keep DFA separate from the main
     44  *  generated recognizer file.
     45  *
     46  *  The notion of a Code Generator target abstracts out the creation
     47  *  of the various files.  As new language targets get added to the ANTLR
     48  *  system, this target class may have to be altered to handle more
     49  *  functionality.  Eventually, just about all language generation issues
     50  *  will be expressible in terms of these methods.
     51  *
     52  *  If org.antlr.codegen.XTarget class exists, it is used else
     53  *  Target base class is used.  I am using a superclass rather than an
     54  *  interface for this target concept because I can add functionality
     55  *  later without breaking previously written targets (extra interface
     56  *  methods would force adding dummy functions to all code generator
     57  *  target classes).
     58  *
     59  */
     60 public class Target {
     61 
     62 	/** For pure strings of Java 16-bit unicode char, how can we display
     63 	 *  it in the target language as a literal.  Useful for dumping
     64 	 *  predicates and such that may refer to chars that need to be escaped
     65 	 *  when represented as strings.  Also, templates need to be escaped so
     66 	 *  that the target language can hold them as a string.
     67 	 *
     68 	 *  I have defined (via the constructor) the set of typical escapes,
     69 	 *  but your Target subclass is free to alter the translated chars or
     70 	 *  add more definitions.  This is nonstatic so each target can have
     71 	 *  a different set in memory at same time.
     72 	 */
     73 	protected String[] targetCharValueEscape = new String[255];
     74 
     75 	public Target() {
     76 		targetCharValueEscape['\n'] = "\\n";
     77 		targetCharValueEscape['\r'] = "\\r";
     78 		targetCharValueEscape['\t'] = "\\t";
     79 		targetCharValueEscape['\b'] = "\\b";
     80 		targetCharValueEscape['\f'] = "\\f";
     81 		targetCharValueEscape['\\'] = "\\\\";
     82 		targetCharValueEscape['\''] = "\\'";
     83 		targetCharValueEscape['"'] = "\\\"";
     84 	}
     85 
     86 	protected void genRecognizerFile(Tool tool,
     87 									 CodeGenerator generator,
     88 									 Grammar grammar,
     89 									 ST outputFileST)
     90 		throws IOException
     91 	{
     92 		String fileName =
     93 			generator.getRecognizerFileName(grammar.name, grammar.type);
     94 		generator.write(outputFileST, fileName);
     95 	}
     96 
     97 	protected void genRecognizerHeaderFile(Tool tool,
     98 										   CodeGenerator generator,
     99 										   Grammar grammar,
    100 										   ST headerFileST,
    101 										   String extName) // e.g., ".h"
    102 		throws IOException
    103 	{
    104 		// no header file by default
    105 	}
    106 
    107 	protected void performGrammarAnalysis(CodeGenerator generator,
    108 										  Grammar grammar)
    109 	{
    110 		// Build NFAs from the grammar AST
    111 		grammar.buildNFA();
    112 
    113 		// Create the DFA predictors for each decision
    114 		grammar.createLookaheadDFAs();
    115 	}
    116 
    117 	/** Is scope in @scope::name {action} valid for this kind of grammar?
    118 	 *  Targets like C++ may want to allow new scopes like headerfile or
    119 	 *  some such.  The action names themselves are not policed at the
    120 	 *  moment so targets can add template actions w/o having to recompile
    121 	 *  ANTLR.
    122 	 */
    123 	public boolean isValidActionScope(int grammarType, String scope) {
    124 		switch (grammarType) {
    125 			case Grammar.LEXER :
    126 				if ( scope.equals("lexer") ) {return true;}
    127 				break;
    128 			case Grammar.PARSER :
    129 				if ( scope.equals("parser") ) {return true;}
    130 				break;
    131 			case Grammar.COMBINED :
    132 				if ( scope.equals("parser") ) {return true;}
    133 				if ( scope.equals("lexer") ) {return true;}
    134 				break;
    135 			case Grammar.TREE_PARSER :
    136 				if ( scope.equals("treeparser") ) {return true;}
    137 				break;
    138 		}
    139 		return false;
    140 	}
    141 
    142 	/** Target must be able to override the labels used for token types */
    143 	public String getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype) {
    144 		String name = generator.grammar.getTokenDisplayName(ttype);
    145 		// If name is a literal, return the token type instead
    146 		if ( name.charAt(0)=='\'' ) {
    147 			return String.valueOf(ttype);
    148 		}
    149 		return name;
    150 	}
    151 
    152 	/** Convert from an ANTLR char literal found in a grammar file to
    153 	 *  an equivalent char literal in the target language.  For most
    154 	 *  languages, this means leaving 'x' as 'x'.  Actually, we need
    155 	 *  to escape '\u000A' so that it doesn't get converted to \n by
    156 	 *  the compiler.  Convert the literal to the char value and then
    157 	 *  to an appropriate target char literal.
    158 	 *
    159 	 *  Expect single quotes around the incoming literal.
    160 	 */
    161 	public String getTargetCharLiteralFromANTLRCharLiteral(
    162 		CodeGenerator generator,
    163 		String literal)
    164 	{
    165 		StringBuffer buf = new StringBuffer();
    166 		buf.append('\'');
    167 		int c = Grammar.getCharValueFromGrammarCharLiteral(literal);
    168 		if ( c<Label.MIN_CHAR_VALUE ) {
    169 			return "'\u0000'";
    170 		}
    171 		if ( c<targetCharValueEscape.length &&
    172 			 targetCharValueEscape[c]!=null )
    173 		{
    174 			buf.append(targetCharValueEscape[c]);
    175 		}
    176 		else if ( Character.UnicodeBlock.of((char)c)==
    177 				  Character.UnicodeBlock.BASIC_LATIN &&
    178 				  !Character.isISOControl((char)c) )
    179 		{
    180 			// normal char
    181 			buf.append((char)c);
    182 		}
    183 		else {
    184 			// must be something unprintable...use \\uXXXX
    185 			// turn on the bit above max "\\uFFFF" value so that we pad with zeros
    186 			// then only take last 4 digits
    187 			String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
    188 			buf.append("\\u");
    189 			buf.append(hex);
    190 		}
    191 
    192 		buf.append('\'');
    193 		return buf.toString();
    194 	}
    195 
    196 	/** Convert from an ANTLR string literal found in a grammar file to
    197 	 *  an equivalent string literal in the target language.  For Java, this
    198 	 *  is the translation 'a\n"' -> "a\n\"".  Expect single quotes
    199 	 *  around the incoming literal.  Just flip the quotes and replace
    200 	 *  double quotes with \"
    201      *
    202      *  Note that we have decided to allow poeple to use '\"' without
    203      *  penalty, so we must build the target string in a loop as Utils.replae
    204      *  cannot handle both \" and " without a lot of messing around.
    205      *
    206 	 */
    207 	public String getTargetStringLiteralFromANTLRStringLiteral(
    208 		CodeGenerator generator,
    209 		String literal)
    210 	{
    211         StringBuilder sb = new StringBuilder();
    212         StringBuffer is = new StringBuffer(literal);
    213 
    214         // Opening quote
    215         //
    216         sb.append('"');
    217 
    218         for (int i = 1; i < is.length() -1; i++) {
    219             if  (is.charAt(i) == '\\') {
    220                 // Anything escaped is what it is! We assume that
    221                 // people know how to escape characters correctly. However
    222                 // we catch anything that does not need an escape in Java (which
    223                 // is what the default implementation is dealing with and remove
    224                 // the escape. The C target does this for instance.
    225                 //
    226                 switch (is.charAt(i+1)) {
    227                     // Pass through any escapes that Java also needs
    228                     //
    229                     case    '"':
    230                     case    'n':
    231                     case    'r':
    232                     case    't':
    233                     case    'b':
    234                     case    'f':
    235                     case    '\\':
    236                     case    'u':    // Assume unnnn
    237                         sb.append('\\');    // Pass the escape through
    238                         break;
    239                     default:
    240                         // Remove the escape by virtue of not adding it here
    241                         // Thus \' becomes ' and so on
    242                         //
    243                         break;
    244                 }
    245 
    246                 // Go past the \ character
    247                 //
    248                 i++;
    249             } else {
    250                 // Chracters that don't need \ in ANTLR 'strings' but do in Java
    251                 //
    252                 if (is.charAt(i) == '"') {
    253                     // We need to escape " in Java
    254                     //
    255                     sb.append('\\');
    256                 }
    257             }
    258             // Add in the next character, which may have been escaped
    259             //
    260             sb.append(is.charAt(i));
    261         }
    262 
    263         // Append closing " and return
    264         //
    265         sb.append('"');
    266 
    267 		return sb.toString();
    268 	}
    269 
    270 	/** Given a random string of Java unicode chars, return a new string with
    271 	 *  optionally appropriate quote characters for target language and possibly
    272 	 *  with some escaped characters.  For example, if the incoming string has
    273 	 *  actual newline characters, the output of this method would convert them
    274 	 *  to the two char sequence \n for Java, C, C++, ...  The new string has
    275 	 *  double-quotes around it as well.  Example String in memory:
    276 	 *
    277 	 *     a"[newlinechar]b'c[carriagereturnchar]d[tab]e\f
    278 	 *
    279 	 *  would be converted to the valid Java s:
    280 	 *
    281 	 *     "a\"\nb'c\rd\te\\f"
    282 	 *
    283 	 *  or
    284 	 *
    285 	 *     a\"\nb'c\rd\te\\f
    286 	 *
    287 	 *  depending on the quoted arg.
    288 	 */
    289 	public String getTargetStringLiteralFromString(String s, boolean quoted) {
    290 		if ( s==null ) {
    291 			return null;
    292 		}
    293 
    294 		StringBuffer buf = new StringBuffer();
    295 		if ( quoted ) {
    296 			buf.append('"');
    297 		}
    298 		for (int i=0; i<s.length(); i++) {
    299 			int c = s.charAt(i);
    300 			if ( c!='\'' && // don't escape single quotes in strings for java
    301 				 c<targetCharValueEscape.length &&
    302 				 targetCharValueEscape[c]!=null )
    303 			{
    304 				buf.append(targetCharValueEscape[c]);
    305 			}
    306 			else {
    307 				buf.append((char)c);
    308 			}
    309 		}
    310 		if ( quoted ) {
    311 			buf.append('"');
    312 		}
    313 		return buf.toString();
    314 	}
    315 
    316 	public String getTargetStringLiteralFromString(String s) {
    317 		return getTargetStringLiteralFromString(s, false);
    318 	}
    319 
    320 	/** Convert long to 0xNNNNNNNNNNNNNNNN by default for spitting out
    321 	 *  with bitsets.  I.e., convert bytes to hex string.
    322 	 */
    323 	public String getTarget64BitStringFromValue(long word) {
    324 		int numHexDigits = 8*2;
    325 		StringBuffer buf = new StringBuffer(numHexDigits+2);
    326 		buf.append("0x");
    327 		String digits = Long.toHexString(word);
    328 		digits = digits.toUpperCase();
    329 		int padding = numHexDigits - digits.length();
    330 		// pad left with zeros
    331 		for (int i=1; i<=padding; i++) {
    332 			buf.append('0');
    333 		}
    334 		buf.append(digits);
    335 		return buf.toString();
    336 	}
    337 
    338 	public String encodeIntAsCharEscape(int v) {
    339 		if ( v<=127 ) {
    340 			return "\\"+Integer.toOctalString(v);
    341 		}
    342 		String hex = Integer.toHexString(v|0x10000).substring(1,5);
    343 		return "\\u"+hex;
    344 	}
    345 
    346 	/** Some targets only support ASCII or 8-bit chars/strings.  For example,
    347 	 *  C++ will probably want to return 0xFF here.
    348 	 */
    349 	public int getMaxCharValue(CodeGenerator generator) {
    350 		return Label.MAX_CHAR_VALUE;
    351 	}
    352 
    353 	/** Give target a chance to do some postprocessing on actions.
    354 	 *  Python for example will have to fix the indention.
    355 	 */
    356 	public List postProcessAction(List chunks, Token actionToken) {
    357 		return chunks;
    358 	}
    359 
    360 }
    361