# Home | History | Annotate | Download | only in Lib  (source-browser navigation residue; not part of the module)
      1 """Text wrapping and filling.
      2 """
      3 
      4 # Copyright (C) 1999-2001 Gregory P. Ward.
      5 # Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>
      7 
      8 import re
      9 
     10 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
     11 
     12 # Hardcode the recognized whitespace characters to the US-ASCII
     13 # whitespace characters.  The main reason for doing this is that
     14 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
     15 _whitespace = '\t\n\x0b\x0c\r '
     16 
     17 class TextWrapper:
     18     """
     19     Object for wrapping/filling text.  The public interface consists of
     20     the wrap() and fill() methods; the other methods are just there for
     21     subclasses to override in order to tweak the default behaviour.
     22     If you want to completely replace the main wrapping algorithm,
     23     you'll probably have to override _wrap_chunks().
     24 
     25     Several instance attributes control various aspects of wrapping:
     26       width (default: 70)
     27         the maximum width of wrapped lines (unless break_long_words
     28         is false)
     29       initial_indent (default: "")
     30         string that will be prepended to the first line of wrapped
     31         output.  Counts towards the line's width.
     32       subsequent_indent (default: "")
     33         string that will be prepended to all lines save the first
     34         of wrapped output; also counts towards each line's width.
     35       expand_tabs (default: true)
     36         Expand tabs in input text to spaces before further processing.
     37         Each tab will become 0 .. 'tabsize' spaces, depending on its position
     38         in its line.  If false, each tab is treated as a single character.
     39       tabsize (default: 8)
     40         Expand tabs in input text to 0 .. 'tabsize' spaces, unless
     41         'expand_tabs' is false.
     42       replace_whitespace (default: true)
     43         Replace all whitespace characters in the input text by spaces
     44         after tab expansion.  Note that if expand_tabs is false and
     45         replace_whitespace is true, every tab will be converted to a
     46         single space!
     47       fix_sentence_endings (default: false)
     48         Ensure that sentence-ending punctuation is always followed
     49         by two spaces.  Off by default because the algorithm is
     50         (unavoidably) imperfect.
     51       break_long_words (default: true)
     52         Break words longer than 'width'.  If false, those words will not
     53         be broken, and some lines might be longer than 'width'.
     54       break_on_hyphens (default: true)
     55         Allow breaking hyphenated words. If true, wrapping will occur
     56         preferably on whitespaces and right after hyphens part of
     57         compound words.
     58       drop_whitespace (default: true)
     59         Drop leading and trailing whitespace from lines.
     60       max_lines (default: None)
     61         Truncate wrapped lines.
     62       placeholder (default: ' [...]')
     63         Append to the last line of truncated text.
     64     """
     65 
     66     unicode_whitespace_trans = {}
     67     uspace = ord(' ')
     68     for x in _whitespace:
     69         unicode_whitespace_trans[ord(x)] = uspace
     70 
     71     # This funky little regex is just the trick for splitting
     72     # text up into word-wrappable chunks.  E.g.
     73     #   "Hello there -- you goof-ball, use the -b option!"
     74     # splits into
     75     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
     76     # (after stripping out empty strings).
     77     word_punct = r'[\w!"\'&.,?]'
     78     letter = r'[^\d\W]'
     79     whitespace = r'[%s]' % re.escape(_whitespace)
     80     nowhitespace = '[^' + whitespace[1:]
     81     wordsep_re = re.compile(r'''
     82         ( # any whitespace
     83           %(ws)s+
     84         | # em-dash between words
     85           (?<=%(wp)s) -{2,} (?=\w)
     86         | # word, possibly hyphenated
     87           %(nws)s+? (?:
     88             # hyphenated word
     89               -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
     90               (?= %(lt)s -? %(lt)s)
     91             | # end of word
     92               (?=%(ws)s|\Z)
     93             | # em-dash
     94               (?<=%(wp)s) (?=-{2,}\w)
     95             )
     96         )''' % {'wp': word_punct, 'lt': letter,
     97                 'ws': whitespace, 'nws': nowhitespace},
     98         re.VERBOSE)
     99     del word_punct, letter, nowhitespace
    100 
    101     # This less funky little regex just split on recognized spaces. E.g.
    102     #   "Hello there -- you goof-ball, use the -b option!"
    103     # splits into
    104     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    105     wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    106     del whitespace
    107 
    108     # XXX this is not locale- or charset-aware -- string.lowercase
    109     # is US-ASCII only (and therefore English-only)
    110     sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
    111                                  r'[\.\!\?]'          # sentence-ending punct.
    112                                  r'[\"\']?'           # optional end-of-quote
    113                                  r'\Z')               # end of chunk
    114 
    115     def __init__(self,
    116                  width=70,
    117                  initial_indent="",
    118                  subsequent_indent="",
    119                  expand_tabs=True,
    120                  replace_whitespace=True,
    121                  fix_sentence_endings=False,
    122                  break_long_words=True,
    123                  drop_whitespace=True,
    124                  break_on_hyphens=True,
    125                  tabsize=8,
    126                  *,
    127                  max_lines=None,
    128                  placeholder=' [...]'):
    129         self.width = width
    130         self.initial_indent = initial_indent
    131         self.subsequent_indent = subsequent_indent
    132         self.expand_tabs = expand_tabs
    133         self.replace_whitespace = replace_whitespace
    134         self.fix_sentence_endings = fix_sentence_endings
    135         self.break_long_words = break_long_words
    136         self.drop_whitespace = drop_whitespace
    137         self.break_on_hyphens = break_on_hyphens
    138         self.tabsize = tabsize
    139         self.max_lines = max_lines
    140         self.placeholder = placeholder
    141 
    142 
    143     # -- Private methods -----------------------------------------------
    144     # (possibly useful for subclasses to override)
    145 
    146     def _munge_whitespace(self, text):
    147         """_munge_whitespace(text : string) -> string
    148 
    149         Munge whitespace in text: expand tabs and convert all other
    150         whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
    151         becomes " foo    bar  baz".
    152         """
    153         if self.expand_tabs:
    154             text = text.expandtabs(self.tabsize)
    155         if self.replace_whitespace:
    156             text = text.translate(self.unicode_whitespace_trans)
    157         return text
    158 
    159 
    160     def _split(self, text):
    161         """_split(text : string) -> [string]
    162 
    163         Split the text to wrap into indivisible chunks.  Chunks are
    164         not quite the same as words; see _wrap_chunks() for full
    165         details.  As an example, the text
    166           Look, goof-ball -- use the -b option!
    167         breaks into the following chunks:
    168           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
    169           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
    170         if break_on_hyphens is True, or in:
    171           'Look,', ' ', 'goof-ball', ' ', '--', ' ',
    172           'use', ' ', 'the', ' ', '-b', ' ', option!'
    173         otherwise.
    174         """
    175         if self.break_on_hyphens is True:
    176             chunks = self.wordsep_re.split(text)
    177         else:
    178             chunks = self.wordsep_simple_re.split(text)
    179         chunks = [c for c in chunks if c]
    180         return chunks
    181 
    182     def _fix_sentence_endings(self, chunks):
    183         """_fix_sentence_endings(chunks : [string])
    184 
    185         Correct for sentence endings buried in 'chunks'.  Eg. when the
    186         original text contains "... foo.\\nBar ...", munge_whitespace()
    187         and split() will convert that to [..., "foo.", " ", "Bar", ...]
    188         which has one too few spaces; this method simply changes the one
    189         space to two.
    190         """
    191         i = 0
    192         patsearch = self.sentence_end_re.search
    193         while i < len(chunks)-1:
    194             if chunks[i+1] == " " and patsearch(chunks[i]):
    195                 chunks[i+1] = "  "
    196                 i += 2
    197             else:
    198                 i += 1
    199 
    200     def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
    201         """_handle_long_word(chunks : [string],
    202                              cur_line : [string],
    203                              cur_len : int, width : int)
    204 
    205         Handle a chunk of text (most likely a word, not whitespace) that
    206         is too long to fit in any line.
    207         """
    208         # Figure out when indent is larger than the specified width, and make
    209         # sure at least one character is stripped off on every pass
    210         if width < 1:
    211             space_left = 1
    212         else:
    213             space_left = width - cur_len
    214 
    215         # If we're allowed to break long words, then do so: put as much
    216         # of the next chunk onto the current line as will fit.
    217         if self.break_long_words:
    218             cur_line.append(reversed_chunks[-1][:space_left])
    219             reversed_chunks[-1] = reversed_chunks[-1][space_left:]
    220 
    221         # Otherwise, we have to preserve the long word intact.  Only add
    222         # it to the current line if there's nothing already there --
    223         # that minimizes how much we violate the width constraint.
    224         elif not cur_line:
    225             cur_line.append(reversed_chunks.pop())
    226 
    227         # If we're not allowed to break long words, and there's already
    228         # text on the current line, do nothing.  Next time through the
    229         # main loop of _wrap_chunks(), we'll wind up here again, but
    230         # cur_len will be zero, so the next line will be entirely
    231         # devoted to the long word that we can't handle right now.
    232 
    233     def _wrap_chunks(self, chunks):
    234         """_wrap_chunks(chunks : [string]) -> [string]
    235 
    236         Wrap a sequence of text chunks and return a list of lines of
    237         length 'self.width' or less.  (If 'break_long_words' is false,
    238         some lines may be longer than this.)  Chunks correspond roughly
    239         to words and the whitespace between them: each chunk is
    240         indivisible (modulo 'break_long_words'), but a line break can
    241         come between any two chunks.  Chunks should not have internal
    242         whitespace; ie. a chunk is either all whitespace or a "word".
    243         Whitespace chunks will be removed from the beginning and end of
    244         lines, but apart from that whitespace is preserved.
    245         """
    246         lines = []
    247         if self.width <= 0:
    248             raise ValueError("invalid width %r (must be > 0)" % self.width)
    249         if self.max_lines is not None:
    250             if self.max_lines > 1:
    251                 indent = self.subsequent_indent
    252             else:
    253                 indent = self.initial_indent
    254             if len(indent) + len(self.placeholder.lstrip()) > self.width:
    255                 raise ValueError("placeholder too large for max width")
    256 
    257         # Arrange in reverse order so items can be efficiently popped
    258         # from a stack of chucks.
    259         chunks.reverse()
    260 
    261         while chunks:
    262 
    263             # Start the list of chunks that will make up the current line.
    264             # cur_len is just the length of all the chunks in cur_line.
    265             cur_line = []
    266             cur_len = 0
    267 
    268             # Figure out which static string will prefix this line.
    269             if lines:
    270                 indent = self.subsequent_indent
    271             else:
    272                 indent = self.initial_indent
    273 
    274             # Maximum width for this line.
    275             width = self.width - len(indent)
    276 
    277             # First chunk on line is whitespace -- drop it, unless this
    278             # is the very beginning of the text (ie. no lines started yet).
    279             if self.drop_whitespace and chunks[-1].strip() == '' and lines:
    280                 del chunks[-1]
    281 
    282             while chunks:
    283                 l = len(chunks[-1])
    284 
    285                 # Can at least squeeze this chunk onto the current line.
    286                 if cur_len + l <= width:
    287                     cur_line.append(chunks.pop())
    288                     cur_len += l
    289 
    290                 # Nope, this line is full.
    291                 else:
    292                     break
    293 
    294             # The current line is full, and the next chunk is too big to
    295             # fit on *any* line (not just this one).
    296             if chunks and len(chunks[-1]) > width:
    297                 self._handle_long_word(chunks, cur_line, cur_len, width)
    298                 cur_len = sum(map(len, cur_line))
    299 
    300             # If the last chunk on this line is all whitespace, drop it.
    301             if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
    302                 cur_len -= len(cur_line[-1])
    303                 del cur_line[-1]
    304 
    305             if cur_line:
    306                 if (self.max_lines is None or
    307                     len(lines) + 1 < self.max_lines or
    308                     (not chunks or
    309                      self.drop_whitespace and
    310                      len(chunks) == 1 and
    311                      not chunks[0].strip()) and cur_len <= width):
    312                     # Convert current line back to a string and store it in
    313                     # list of all lines (return value).
    314                     lines.append(indent + ''.join(cur_line))
    315                 else:
    316                     while cur_line:
    317                         if (cur_line[-1].strip() and
    318                             cur_len + len(self.placeholder) <= width):
    319                             cur_line.append(self.placeholder)
    320                             lines.append(indent + ''.join(cur_line))
    321                             break
    322                         cur_len -= len(cur_line[-1])
    323                         del cur_line[-1]
    324                     else:
    325                         if lines:
    326                             prev_line = lines[-1].rstrip()
    327                             if (len(prev_line) + len(self.placeholder) <=
    328                                     self.width):
    329                                 lines[-1] = prev_line + self.placeholder
    330                                 break
    331                         lines.append(indent + self.placeholder.lstrip())
    332                     break
    333 
    334         return lines
    335 
    336     def _split_chunks(self, text):
    337         text = self._munge_whitespace(text)
    338         return self._split(text)
    339 
    340     # -- Public interface ----------------------------------------------
    341 
    342     def wrap(self, text):
    343         """wrap(text : string) -> [string]
    344 
    345         Reformat the single paragraph in 'text' so it fits in lines of
    346         no more than 'self.width' columns, and return a list of wrapped
    347         lines.  Tabs in 'text' are expanded with string.expandtabs(),
    348         and all other whitespace characters (including newline) are
    349         converted to space.
    350         """
    351         chunks = self._split_chunks(text)
    352         if self.fix_sentence_endings:
    353             self._fix_sentence_endings(chunks)
    354         return self._wrap_chunks(chunks)
    355 
    356     def fill(self, text):
    357         """fill(text : string) -> string
    358 
    359         Reformat the single paragraph in 'text' to fit in lines of no
    360         more than 'self.width' columns, and return a new string
    361         containing the entire wrapped paragraph.
    362         """
    363         return "\n".join(self.wrap(text))
    364 
    365 
    366 # -- Convenience interface ---------------------------------------------
    367 
    368 def wrap(text, width=70, **kwargs):
    369     """Wrap a single paragraph of text, returning a list of wrapped lines.
    370 
    371     Reformat the single paragraph in 'text' so it fits in lines of no
    372     more than 'width' columns, and return a list of wrapped lines.  By
    373     default, tabs in 'text' are expanded with string.expandtabs(), and
    374     all other whitespace characters (including newline) are converted to
    375     space.  See TextWrapper class for available keyword args to customize
    376     wrapping behaviour.
    377     """
    378     w = TextWrapper(width=width, **kwargs)
    379     return w.wrap(text)
    380 
    381 def fill(text, width=70, **kwargs):
    382     """Fill a single paragraph of text, returning a new string.
    383 
    384     Reformat the single paragraph in 'text' to fit in lines of no more
    385     than 'width' columns, and return a new string containing the entire
    386     wrapped paragraph.  As with wrap(), tabs are expanded and other
    387     whitespace characters converted to space.  See TextWrapper class for
    388     available keyword args to customize wrapping behaviour.
    389     """
    390     w = TextWrapper(width=width, **kwargs)
    391     return w.fill(text)
    392 
    393 def shorten(text, width, **kwargs):
    394     """Collapse and truncate the given text to fit in the given width.
    395 
    396     The text first has its whitespace collapsed.  If it then fits in
    397     the *width*, it is returned as is.  Otherwise, as many words
    398     as possible are joined and then the placeholder is appended::
    399 
    400         >>> textwrap.shorten("Hello  world!", width=12)
    401         'Hello world!'
    402         >>> textwrap.shorten("Hello  world!", width=11)
    403         'Hello [...]'
    404     """
    405     w = TextWrapper(width=width, max_lines=1, **kwargs)
    406     return w.fill(' '.join(text.strip().split()))
    407 
    408 
    409 # -- Loosely related functionality -------------------------------------
    410 
    411 _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
    412 _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
    413 
    414 def dedent(text):
    415     """Remove any common leading whitespace from every line in `text`.
    416 
    417     This can be used to make triple-quoted strings line up with the left
    418     edge of the display, while still presenting them in the source code
    419     in indented form.
    420 
    421     Note that tabs and spaces are both treated as whitespace, but they
    422     are not equal: the lines "  hello" and "\\thello" are
    423     considered to have no common leading whitespace.  (This behaviour is
    424     new in Python 2.5; older versions of this module incorrectly
    425     expanded tabs before searching for common leading whitespace.)
    426     """
    427     # Look for the longest leading string of spaces and tabs common to
    428     # all lines.
    429     margin = None
    430     text = _whitespace_only_re.sub('', text)
    431     indents = _leading_whitespace_re.findall(text)
    432     for indent in indents:
    433         if margin is None:
    434             margin = indent
    435 
    436         # Current line more deeply indented than previous winner:
    437         # no change (previous winner is still on top).
    438         elif indent.startswith(margin):
    439             pass
    440 
    441         # Current line consistent with and no deeper than previous winner:
    442         # it's the new winner.
    443         elif margin.startswith(indent):
    444             margin = indent
    445 
    446         # Find the largest common whitespace between current line and previous
    447         # winner.
    448         else:
    449             for i, (x, y) in enumerate(zip(margin, indent)):
    450                 if x != y:
    451                     margin = margin[:i]
    452                     break
    453             else:
    454                 margin = margin[:len(indent)]
    455 
    456     # sanity check (testing/debugging only)
    457     if 0 and margin:
    458         for line in text.split("\n"):
    459             assert not line or line.startswith(margin), \
    460                    "line = %r, margin = %r" % (line, margin)
    461 
    462     if margin:
    463         text = re.sub(r'(?m)^' + margin, '', text)
    464     return text
    465 
    466 
    467 def indent(text, prefix, predicate=None):
    468     """Adds 'prefix' to the beginning of selected lines in 'text'.
    469 
    470     If 'predicate' is provided, 'prefix' will only be added to the lines
    471     where 'predicate(line)' is True. If 'predicate' is not provided,
    472     it will default to adding 'prefix' to all non-empty lines that do not
    473     consist solely of whitespace characters.
    474     """
    475     if predicate is None:
    476         def predicate(line):
    477             return line.strip()
    478 
    479     def prefixed_lines():
    480         for line in text.splitlines(True):
    481             yield (prefix + line if predicate(line) else line)
    482     return ''.join(prefixed_lines())
    483 
    484 
    485 if __name__ == "__main__":
    486     #print dedent("\tfoo\n\tbar")
    487     #print dedent("  \thello there\n  \t  how are you?")
    488     print(dedent("Hello there.\n  This is indented."))
    489