"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward (at] python.net>

__revision__ = "$Id$"

import re
import string

try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support (or is Python 3, where
    # the name no longer exists), the unicode type will not exist.
    # Fake one so the isinstance() checks below are simply never true.
    class _unicode(object):
        pass

try:
    _maketrans = string.maketrans
except AttributeError:
    # string.maketrans() is not available on Python 3; str.maketrans()
    # builds the equivalent table for str.translate() there.
    _maketrans = str.maketrans

# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for byte/native strings: every recognized
    # whitespace character maps to a plain space.
    whitespace_trans = _maketrans(_whitespace, ' ' * len(_whitespace))

    # Same mapping keyed by code point, for unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- it deliberately
    # recognizes US-ASCII lowercase letters only (and is therefore
    # English-only).  string.ascii_lowercase is used so the character
    # class does not vary with the locale.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.ascii_lowercase)

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options; see the class docstring for the
        meaning of each one.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # The two string types need differently shaped tables.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Drop the empty strings re.split() leaves around adjacent
        # separators.  A real list is required here: _wrap_chunks()
        # reverses it and pops from it in place.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the separator just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  The chunk in question is the
        last element of 'reversed_chunks'; both 'reversed_chunks' and
        'cur_line' are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive.  Note that
        'chunks' is consumed: it is reversed and emptied in place.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))


# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)


def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)


# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    # Lines holding only spaces/tabs are blanked first so they do not
    # take part in the margin computation.
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Current line and previous winner have no common whitespace:
        # there is no margin.
        else:
            margin = ""
            break

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)

    if margin:
        # margin holds only spaces and tabs, so it needs no regex escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text


if __name__ == "__main__":
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent(" \thello there\n \t how are you?"))
    print(dedent("Hello there.\n This is indented."))