#! /usr/bin/python

# This is a script to extract given named nodes from a dot file, together
# with the associated edges.  An edge is kept iff, for edge x -> y,
# x and y are both nodes specified to be kept.
#
# Known issues: if a line contains '->' and is not an edge line, problems
# will occur (hand edit the file if this happens).  If node labels do not
# begin with Node this also will not work.  Since this is designed to work
# on DSA dot output and not general dot files this is ok.  If you want to
# use this on other files, rename the node labels to Node[.*] with a script
# or something.  Edge endpoints are recognised as the text before the first
# whitespace, ':', ';' or '[', so the script no longer depends on node names
# being exactly 13 characters long (as they are in 32-bit DSA dot output).
#
# Note that the name of the node can be any substring of the actual name in
# the dot file (each name is in fact used as a regular expression).  Thus if
# you specify COLLAPSED as a parameter this script will pull out all
# COLLAPSED nodes in the file.
#
# Specifying escape characters in the name like \n also will not work, as
# Python will make it \\n; I'm not really sure how to fix this.
#
# Currently the script prints the names it is searching for to STDOUT, so
# you can check to see if they are what you intend.
#
# The code below is written to run unchanged under both Python 2 and 3.

import re
import string
import sys


USAGE = ('usage is ./DSAextract <dot_file_to_modify> '
         '<output_file> [list of nodes to extract]')

# Used to see what kind of line we are on.
NODE_RE = re.compile('Node')
# Used to check whether the current line is an edge line.
ARROW_RE = re.compile('->')
# Separates the leading fields of a node line.
WHITESPACE_RE = re.compile(r'\s+')
# Leading node id of an edge endpoint: stops at a port (':'), an attribute
# list ('['), the statement terminator (';') or any whitespace.
ENDPOINT_RE = re.compile(r'[^\s:;\[]+')


def dot_variable(line):
    """Return the dot variable name found on a node line.

    The line is split on whitespace runs (at most twice) and the second
    field is returned; for DSA output, whose node lines start with
    whitespace, that field is the Node<hex number> identifier.  An empty
    string is returned when the line has no second field, instead of
    raising IndexError.
    """
    fields = WHITESPACE_RE.split(line, maxsplit=2)
    if len(fields) > 1:
        return fields[1]
    return ''


def endpoint_id(text):
    """Return the node id at the start of one side of an edge line.

    Surrounding whitespace is ignored, and any port suffix (':s0'),
    attribute list or trailing ';' is dropped.  Returns an empty string
    if nothing is left.
    """
    match = ENDPOINT_RE.match(text.strip())
    if match:
        return match.group(0)
    return ''


def collect_node_ids(lines, names):
    """Return the set of dot variable names of the nodes to keep.

    lines -- iterable of lines of the dot file
    names -- iterable of strings; each is compiled as a regular expression
             and searched for in every non-edge line

    A non-edge line matching any of the names contributes the value of
    dot_variable() for that line.
    """
    patterns = [re.compile(name) for name in names]
    node_ids = set()
    for line in lines:
        # Edge lines never define a node, so skip the name checks on them.
        if ARROW_RE.search(line):
            continue
        if any(pattern.search(line) for pattern in patterns):
            node_ids.add(dot_variable(line))
    return node_ids


def select_lines(lines, node_ids):
    """Yield the lines of the dot file that belong in the extracted graph.

    There are three types of lines:
    1) edge lines (contain '->'): kept iff the first two endpoints are
       both in node_ids.  Only the first two pieces are examined, so a
       '->' inside an edge's attribute list does not affect the decision;
    2) node lines (contain 'Node'): kept iff their dot variable name is
       in node_ids;
    3) support lines (page size, braces, etc.): always kept.
    """
    for line in lines:
        if ARROW_RE.search(line):
            endpoints = ARROW_RE.split(line)
            keep = (endpoint_id(endpoints[0]) in node_ids and
                    endpoint_id(endpoints[1]) in node_ids)
        elif NODE_RE.search(line):
            keep = dot_variable(line) in node_ids
        else:
            keep = True
        if keep:
            yield line


def main(argv=None):
    """Run the extraction for a command line (defaults to sys.argv).

    argv[1] is the dot file to read, argv[2] the output file to write and
    argv[3:] the node names to extract.  Exits with the usage message on
    stderr when the two file arguments are missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        # Stop here: continuing would only fail on the missing arguments.
        sys.exit(USAGE)

    node_names = set(argv[3:])

    # Read the whole input once so both passes see the same lines and the
    # file is closed before the output (possibly the same path) is opened.
    with open(argv[1], 'r') as dot_file:
        lines = dot_file.readlines()

    node_ids = collect_node_ids(lines, node_names)

    # Show the names being searched for so the user can check them.
    print(node_names)

    with open(argv[2], 'w') as out_file:
        out_file.writelines(select_lines(lines, node_ids))


if __name__ == '__main__':
    main()