1 // 2 // A utility for finding substring embeddings in patterns 3 4 #include <stdio.h> 5 #include <string.h> 6 #include <stdlib.h> 7 8 #define MAXPATHS (256*1024) 9 10 // 11 // 12 static void die( 13 const char*msg 14 ) { 15 fprintf(stderr,"%s\n",msg); 16 exit(1); 17 } 18 19 20 // Finds the index of an entry, only used on xxx_key arrays 21 // Caveat: the table has to be sorted 22 static int find_in( 23 char *tab[], 24 int max, 25 const char *pat 26 ) { 27 int left=0, right=max-1; 28 while (left <= right) { 29 int mid = ((right-left)/2)+left; 30 int v = strcmp(pat,tab[mid]); 31 if (v>0) { 32 left = mid + 1; 33 } else if (v<0) { 34 right = mid -1; 35 } else { 36 return mid; 37 } 38 } 39 return -1; 40 } 41 42 43 // used by partition (which is used by qsort_arr) 44 // 45 static void swap2( 46 char *a[], 47 char *b[], 48 int i, 49 int j 50 ) { 51 if (i==j) return; 52 char*v; 53 v=a[i]; a[i]=a[j]; a[j]=v; 54 v=b[i]; b[i]=b[j]; b[j]=v; 55 } 56 57 58 // used by qsort_arr 59 // 60 static int partition( 61 char *a[], 62 char *b[], 63 int left, 64 int right, 65 int p 66 ) { 67 const char *pivotValue = a[p]; 68 int i; 69 swap2(a,b,p,right); // Move pivot to end 70 p = left; 71 for (i=left; i<right; i++) { 72 if (strcmp(a[i],pivotValue)<=0) { 73 swap2(a,b,p,i); 74 p++; 75 } 76 } 77 swap2(a,b,right,p); // Move pivot to its final place 78 return p; 79 } 80 81 82 // 83 // 84 static void qsort_arr( 85 char *a[], 86 char *b[], 87 int left, 88 int right 89 ) { 90 while (right > left) { 91 int p = left + (right-left)/2; //select a pivot 92 p = partition(a,b, left, right, p); 93 if ((p-1) - left < right - (p+1)) { 94 qsort_arr(a,b, left, p-1); 95 left = p+1; 96 } else { 97 qsort_arr(a,b, p+1, right); 98 right = p-1; 99 } 100 } 101 } 102 103 104 // Removes extra '0' entries from the string 105 // 106 static char* compact( 107 char *expr 108 ) { 109 int l=strlen(expr); 110 int i,j; 111 for (i=0,j=0; i<l; i++) { 112 if (expr[i]!='0') expr[j++] = expr[i]; 113 } 114 expr[j]=0; 115 return expr; 116 } 117 118 119 // convert 'n1im' to 0n1i0m0 expressed as a string 120 // 121 static void expand( 122 char *expr, 123 const char *pat, 124 int l 125 ) { 126 int el = 0; 127 char last = '.'; 128 int i; 129 for (i=0; i<l; i++) { 130 char c = pat[i]; 131 if ( (last<'0' || last>'9') 132 && (c <'0' || c >'9') 133 ) { 134 expr[el++] = '0'; 135 } 136 expr[el++] = c; 137 last = c; 138 } 139 if (last<'0' || last>'9') expr[el++] = '0'; 140 expr[el]=0; 141 } 142 143 144 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der 145 // The second pattern needs to be a right side match of the first 146 // (modulo digits) 147 static char *combine( 148 char *expr, 149 const char *subexpr 150 ) { 151 int l1 = strlen(expr); 152 int l2 = strlen(subexpr); 153 int off = l1-l2; 154 int j; 155 // this works also for utf8 sequences because the substring is identical 156 // to the last substring-length bytes of expr except for the (single byte) 157 // hyphenation encoders 158 for (j=0; j<l2; j++) { 159 if (subexpr[j]>expr[off+j]) { 160 expr[off+j] = subexpr[j]; 161 } 162 } 163 return expr; 164 } 165 166 167 // 168 // 169 int main(int argc, const char* argv[]) { 170 FILE *in, *out; 171 char *pattab_key[MAXPATHS]; 172 char *pattab_val[MAXPATHS]; 173 int patterns = 0; 174 char *newpattab_key[MAXPATHS]; 175 char *newpattab_val[MAXPATHS]; 176 int newpatterns = 0; 177 char format[132]; // 64+65+newline+zero+spare 178 int p; 179 if (argc!=3) die("Usage: <orig-file> <new-file>\n"); 180 if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input"); 181 if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output"); 182 // read all patterns and split in pure text (_key) & expanded patterns (_val) 183 while(fgets(format,132,in)) { 184 int l = strlen(format); 185 if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp 186 if (format[0]=='%' || format[0]==0) { 187 // skip 188 } else { 189 if (format[l-1]=='%') { 190 l--; 191 format[l] = 0; // remove '%' 192 } 193 int i,j; 194 char *pat = (char*) malloc(l+1); 195 char *org = (char*) malloc(l*2+1); 196 expand(org,format,l); 197 // remove hyphenation encoders (digits) from pat 198 for (i=0,j=0; i<l; i++) { 199 // odd, but utf-8 proof 200 char c = format[i]; 201 if (c<'0' || c>'9') pat[j++]=c; 202 } 203 pat[j]=0; 204 p = patterns; 205 pattab_key[patterns] = pat; 206 pattab_val[patterns++] = org; 207 if (patterns>MAXPATHS) die("to many base patterns"); 208 } 209 } 210 fclose(in); 211 // As we use binairy search, make sure it is sorted 212 qsort_arr(pattab_key,pattab_val,0,patterns-1); 213 214 for (p=0; p<patterns; p++) { 215 char *pat = pattab_key[p]; 216 int patsize = strlen(pat); 217 int j,l; 218 for (l=1; l<=patsize; l++) { 219 for (j=1; j<=l; j++) { 220 int i = l-j; 221 int subpat_ndx; 222 char subpat[132]; 223 strncpy(subpat,pat+i,j); subpat[j]=0; 224 if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) { 225 int newpat_ndx; 226 char *newpat=malloc(l+1); 227 //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]); 228 strncpy(newpat, pat+0,l); newpat[l]=0; 229 if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) { 230 char *neworg = malloc(132); // TODO: compute exact length 231 expand(neworg,newpat,l); 232 newpattab_key[newpatterns] = newpat; 233 newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]); 234 if (newpatterns>MAXPATHS) die("to many new patterns"); 235 //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg); 236 } else { 237 free(newpat); 238 newpattab_val[newpat_ndx] = combine( 239 newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); 240 } 241 } 242 } 243 } 244 } 245 246 /* for some tiny extra speed, one could forget the free()s 247 * as the memory is freed anyway on exit(). 248 * However, the gain is minimal and now the code can be cleanly 249 * incorporated into other code */ 250 for (p=0; p<newpatterns; p++) { 251 fprintf(out,"%s\n",compact(newpattab_val[p])); 252 free(newpattab_key[p]); 253 free(newpattab_val[p]); 254 } 255 fclose(out); 256 257 for (p=0; p<patterns; p++) { 258 free(pattab_key[p]); 259 free(pattab_val[p]); 260 } 261 return 0; 262 } 263