Home | History | Annotate | Download | only in scripts
      1 """ TeXcheck.py -- rough syntax checking on Python style LaTeX documents.
      2 
      3    Written by Raymond D. Hettinger <python at rcn.com>
      4    Copyright (c) 2003 Python Software Foundation.  All rights reserved.
      5 
      6 Designed to catch common markup errors including:
      7 * Unbalanced or mismatched parentheses, brackets, and braces.
      8 * Unbalanced or mismatched \\begin and \\end blocks.
      9 * Misspelled or invalid LaTeX commands.
     10 * Use of forward slashes instead of backslashes for commands.
     11 * Table line size mismatches.
     12 
     13 Sample command line usage:
     14     python texcheck.py -k chapterheading -m lib/librandomtex *.tex
     15 
     16 Options:
     17     -m          Munge parentheses and brackets. [0,n) would normally mismatch.
     18     -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
     19     -d:         Delimiter check only (useful for non-LaTeX files).
     20     -h:         Help
     21     -s lineno:  Start at lineno (useful for skipping complex sections).
     22     -v:         Verbose.  Trace the matching of \\begin and \\end blocks.
     23 """
     24 
     25 import re
     26 import sys
     27 import getopt
     28 from itertools import izip, count, islice
     29 import glob
     30 
# Whitespace-separated list of the LaTeX commands (each with its leading
# backslash) that the checker accepts as valid.  checkit() splits this
# string into a set, so the layout and any repeated entries (for example
# \menuselection) do not matter.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
    \textbar \C \seelink
"""
     62 
     63 def matchclose(c_lineno, c_symbol, openers, pairmap):
     64     "Verify that closing delimiter matches most recent opening delimiter"
     65     try:
     66         o_lineno, o_symbol = openers.pop()
     67     except IndexError:
     68         print "\nDelimiter mismatch.  On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol)
     69         return
     70     if o_symbol in pairmap.get(c_symbol, [c_symbol]): return
     71     print "\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno)
     72     return
     73 
     74 def checkit(source, opts, morecmds=[]):
     75     """Check the LaTeX formatting in a sequence of lines.
     76 
     77     Opts is a mapping of options to option values if any:
     78         -m          munge parenthesis and brackets
     79         -d          delimiters only checking
     80         -v          verbose trace of delimiter matching
     81         -s lineno:  linenumber to start scan (default is 1).
     82 
     83     Morecmds is a sequence of LaTeX commands (without backslashes) that
     84     are to be considered valid in the scan.
     85     """
     86 
     87     texcmd = re.compile(r'\\[A-Za-z]+')
     88     falsetexcmd = re.compile(r'\/([A-Za-z]+)') # Mismarked with forward slash
     89 
     90     validcmds = set(cmdstr.split())
     91     for cmd in morecmds:
     92         validcmds.add('\\' + cmd)
     93 
     94     if '-m' in opts:
     95         pairmap = {']':'[(', ')':'(['}      # Munged openers
     96     else:
     97         pairmap = {']':'[', ')':'('}        # Normal opener for a given closer
     98     openpunct = set('([')                   # Set of valid openers
     99 
    100     delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    101     braces = re.compile(r'({)|(})')
    102     doubledwords = re.compile(r'(\b[A-za-z]+\b) \b\1\b')
    103     spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')
    104 
    105     openers = []                            # Stack of pending open delimiters
    106     bracestack = []                         # Stack of pending open braces
    107 
    108     tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    109     tableline = re.compile(r'\\line([iv]+){')
    110     tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    111     tablelevel = ''
    112     tablestartline = 0
    113 
    114     startline = int(opts.get('-s', '1'))
    115     lineno = 0
    116 
    117     for lineno, line in izip(count(startline), islice(source, startline-1, None)):
    118         line = line.rstrip()
    119 
    120         # Check balancing of open/close parenthesis, brackets, and begin/end blocks
    121         for begend, name, punct in delimiters.findall(line):
    122             if '-v' in opts:
    123                 print lineno, '|', begend, name, punct,
    124             if begend == 'begin' and '-d' not in opts:
    125                 openers.append((lineno, name))
    126             elif punct in openpunct:
    127                 openers.append((lineno, punct))
    128             elif begend == 'end' and '-d' not in opts:
    129                 matchclose(lineno, name, openers, pairmap)
    130             elif punct in pairmap:
    131                 matchclose(lineno, punct, openers, pairmap)
    132             if '-v' in opts:
    133                 print '   --> ', openers
    134 
    135         # Balance opening and closing braces
    136         for open, close in braces.findall(line):
    137             if open == '{':
    138                 bracestack.append(lineno)
    139             if close == '}':
    140                 try:
    141                     bracestack.pop()
    142                 except IndexError:
    143                     print r'Warning, unmatched } on line %s.' % (lineno,)
    144 
    145         # Optionally, skip LaTeX specific checks
    146         if '-d' in opts:
    147             continue
    148 
    149         # Warn whenever forward slashes encountered with a LaTeX command
    150         for cmd in falsetexcmd.findall(line):
    151             if '822' in line or '.html' in line:
    152                 continue    # Ignore false positives for urls and for /rfc822
    153             if '\\' + cmd in validcmds:
    154                 print 'Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd)
    155 
    156         # Check for markup requiring {} for correct spacing
    157         for cmd in spacingmarkup.findall(line):
    158             print r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno)
    159 
    160         # Validate commands
    161         nc = line.find(r'\newcommand')
    162         if nc != -1:
    163             start = line.find('{', nc)
    164             end = line.find('}', start)
    165             validcmds.add(line[start+1:end])
    166         for cmd in texcmd.findall(line):
    167             if cmd not in validcmds:
    168                 print r'Warning, unknown tex cmd on line %d: \%s' % (lineno, cmd)
    169 
    170         # Check table levels (make sure lineii only inside tableii)
    171         m = tablestart.search(line)
    172         if m:
    173             tablelevel = m.group(1)
    174             tablestartline = lineno
    175         m = tableline.search(line)
    176         if m and m.group(1) != tablelevel:
    177             print r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline)
    178         if tableend.search(line):
    179             tablelevel = ''
    180 
    181         # Style guide warnings
    182         if 'e.g.' in line or 'i.e.' in line:
    183             print r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,)
    184 
    185         for dw in doubledwords.findall(line):
    186             print r'Doubled word warning.  "%s" on line %d' % (dw, lineno)
    187 
    188     lastline = lineno
    189     for lineno, symbol in openers:
    190         print "Unmatched open delimiter '%s' on line %d" % (symbol, lineno)
    191     for lineno in bracestack:
    192         print "Unmatched { on line %d" % (lineno,)
    193     print 'Done checking %d lines.' % (lastline,)
    194     return 0
    195 
    196 def main(args=None):
    197     if args is None:
    198         args = sys.argv[1:]
    199     optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    200     opts = dict(optitems)
    201     if '-h' in opts or args==[]:
    202         print __doc__
    203         return 0
    204 
    205     if len(arglist) < 1:
    206         print 'Please specify a file to be checked'
    207         return 1
    208 
    209     for i, filespec in enumerate(arglist):
    210         if '*' in filespec or '?' in filespec:
    211             arglist[i:i+1] = glob.glob(filespec)
    212 
    213     morecmds = [v for k,v in optitems if k=='-k']
    214     err = []
    215 
    216     for filename in arglist:
    217         print '=' * 30
    218         print "Checking", filename
    219         try:
    220             f = open(filename)
    221         except IOError:
    222             print 'Cannot open file %s.' % arglist[0]
    223             return 2
    224 
    225         try:
    226             err.append(checkit(f, opts, morecmds))
    227         finally:
    228             f.close()
    229 
    230     return max(err)
    231 
# When run as a script, check the files named on the command line and use
# main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
    234