#! /usr/bin/env python

      2 
      3 class Markov:
      4     def __init__(self, histsize, choice):
      5         self.histsize = histsize
      6         self.choice = choice
      7         self.trans = {}
      8 
      9     def add(self, state, next):
     10         self.trans.setdefault(state, []).append(next)
     11 
     12     def put(self, seq):
     13         n = self.histsize
     14         add = self.add
     15         add(None, seq[:0])
     16         for i in range(len(seq)):
     17             add(seq[max(0, i-n):i], seq[i:i+1])
     18         add(seq[len(seq)-n:], None)
     19 
     20     def get(self):
     21         choice = self.choice
     22         trans = self.trans
     23         n = self.histsize
     24         seq = choice(trans[None])
     25         while True:
     26             subseq = seq[max(0, len(seq)-n):]
     27             options = trans[subseq]
     28             next = choice(options)
     29             if not next:
     30                 break
     31             seq += next
     32         return seq
     33 
     34 
     35 def test():
     36     import sys, random, getopt
     37     args = sys.argv[1:]
     38     try:
     39         opts, args = getopt.getopt(args, '0123456789cdwq')
     40     except getopt.error:
     41         print 'Usage: %s [-#] [-cddqw] [file] ...' % sys.argv[0]
     42         print 'Options:'
     43         print '-#: 1-digit history size (default 2)'
     44         print '-c: characters (default)'
     45         print '-w: words'
     46         print '-d: more debugging output'
     47         print '-q: no debugging output'
     48         print 'Input files (default stdin) are split in paragraphs'
     49         print 'separated blank lines and each paragraph is split'
     50         print 'in words by whitespace, then reconcatenated with'
     51         print 'exactly one space separating words.'
     52         print 'Output consists of paragraphs separated by blank'
     53         print 'lines, where lines are no longer than 72 characters.'
     54         sys.exit(2)
     55     histsize = 2
     56     do_words = False
     57     debug = 1
     58     for o, a in opts:
     59         if '-0' <= o <= '-9': histsize = int(o[1:])
     60         if o == '-c': do_words = False
     61         if o == '-d': debug += 1
     62         if o == '-q': debug = 0
     63         if o == '-w': do_words = True
     64     if not args:
     65         args = ['-']
     66 
     67     m = Markov(histsize, random.choice)
     68     try:
     69         for filename in args:
     70             if filename == '-':
     71                 f = sys.stdin
     72                 if f.isatty():
     73                     print 'Sorry, need stdin from file'
     74                     continue
     75             else:
     76                 f = open(filename, 'r')
     77             if debug: print 'processing', filename, '...'
     78             text = f.read()
     79             f.close()
     80             paralist = text.split('\n\n')
     81             for para in paralist:
     82                 if debug > 1: print 'feeding ...'
     83                 words = para.split()
     84                 if words:
     85                     if do_words:
     86                         data = tuple(words)
     87                     else:
     88                         data = ' '.join(words)
     89                     m.put(data)
     90     except KeyboardInterrupt:
     91         print 'Interrupted -- continue with data read so far'
     92     if not m.trans:
     93         print 'No valid input files'
     94         return
     95     if debug: print 'done.'
     96 
     97     if debug > 1:
     98         for key in m.trans.keys():
     99             if key is None or len(key) < histsize:
    100                 print repr(key), m.trans[key]
    101         if histsize == 0: print repr(''), m.trans['']
    102         print
    103     while True:
    104         data = m.get()
    105         if do_words:
    106             words = data
    107         else:
    108             words = data.split()
    109         n = 0
    110         limit = 72
    111         for w in words:
    112             if n + len(w) > limit:
    113                 print
    114                 n = 0
    115             print w,
    116             n += len(w) + 1
    117         print
    118         print
    119 
# Run the command-line driver only when executed as a script, not on import.
if __name__ == "__main__":
    test()
    122