Home | History | Annotate | Download | only in hyphenation
      1 //
      2 // A utility for finding substring embeddings in patterns
      3 
      4 #include <stdio.h>
      5 #include <string.h>
      6 #include <stdlib.h>
      7 
      8 #define MAXPATHS (256*1024)
      9 
     10 //
     11 //
     12 static void die(
     13   const char*msg
     14 ) {
     15   fprintf(stderr,"%s\n",msg);
     16   exit(1);
     17 }
     18 
     19 
     20 // Finds the index of an entry, only used on xxx_key arrays
     21 // Caveat: the table has to be sorted
     22 static int find_in(
     23   char *tab[],
     24   int max,
     25   const char *pat
     26 ) {
     27   int left=0, right=max-1;
     28   while (left <= right) {
     29     int mid = ((right-left)/2)+left;
     30     int v   = strcmp(pat,tab[mid]);
     31     if (v>0) {
     32       left = mid + 1;
     33     } else if (v<0) {
     34       right = mid -1;
     35     } else {
     36       return mid;
     37     }
     38   }
     39   return -1;
     40 }
     41 
     42 
     43 // used by partition (which is used by qsort_arr)
     44 //
     45 static void swap2(
     46   char *a[],
     47   char *b[],
     48   int i,
     49   int j
     50 ) {
     51   if (i==j) return;
     52   char*v;
     53   v=a[i]; a[i]=a[j]; a[j]=v;
     54   v=b[i]; b[i]=b[j]; b[j]=v;
     55 }
     56 
     57 
     58 // used by qsort_arr
     59 //
     60 static int partition(
     61   char *a[],
     62   char *b[],
     63   int left,
     64   int right,
     65   int p
     66 ) {
     67   const char *pivotValue = a[p];
     68   int i;
     69   swap2(a,b,p,right); // Move pivot to end
     70   p = left;
     71   for (i=left; i<right; i++) {
     72     if (strcmp(a[i],pivotValue)<=0) {
     73       swap2(a,b,p,i);
     74       p++;
     75     }
     76   }
     77   swap2(a,b,right,p); // Move pivot to its final place
     78   return p;
     79 }
     80 
     81 
     82 //
     83 //
     84 static void qsort_arr(
     85   char *a[],
     86   char *b[],
     87   int left,
     88   int right
     89 ) {
     90   while (right > left) {
     91     int p = left + (right-left)/2; //select a pivot
     92     p = partition(a,b, left, right, p);
     93     if ((p-1) - left < right - (p+1)) {
     94       qsort_arr(a,b, left, p-1);
     95       left  = p+1;
     96     } else {
     97       qsort_arr(a,b, p+1, right);
     98       right = p-1;
     99     }
    100   }
    101 }
    102 
    103 
    104 // Removes extra '0' entries from the string
    105 //
    106 static char* compact(
    107   char *expr
    108 ) {
    109   int l=strlen(expr);
    110   int i,j;
    111   for (i=0,j=0; i<l; i++) {
    112     if (expr[i]!='0') expr[j++] = expr[i];
    113   }
    114   expr[j]=0;
    115   return expr;
    116 }
    117 
    118 
    119 // convert 'n1im' to 0n1i0m0 expressed as a string
    120 //
    121 static void expand(
    122   char *expr,
    123   const char *pat,
    124   int l
    125 ) {
    126   int  el = 0;
    127   char last = '.';
    128   int  i;
    129   for (i=0; i<l; i++) {
    130     char c = pat[i];
    131     if ( (last<'0' || last>'9')
    132       && (c   <'0' || c   >'9')
    133       ) {
    134       expr[el++] = '0';
    135     }
    136     expr[el++] = c;
    137     last = c;
    138   }
    139   if (last<'0' || last>'9') expr[el++] = '0';
    140   expr[el]=0;
    141 }
    142 
    143 
    144 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
    145 // The second pattern needs to be a right side match of the first
    146 // (modulo digits)
    147 static char *combine(
    148   char *expr,
    149   const char *subexpr
    150 ) {
    151   int l1 = strlen(expr);
    152   int l2 = strlen(subexpr);
    153   int off = l1-l2;
    154   int j;
    155   // this works also for utf8 sequences because the substring is identical
    156   // to the last substring-length bytes of expr except for the (single byte)
    157   // hyphenation encoders
    158   for (j=0; j<l2; j++) {
    159     if (subexpr[j]>expr[off+j]) {
    160       expr[off+j] = subexpr[j];
    161     }
    162   }
    163   return expr;
    164 }
    165 
    166 
    167 //
    168 //
    169 int main(int argc, const char* argv[]) {
    170   FILE *in, *out;
    171   char *pattab_key[MAXPATHS];
    172   char *pattab_val[MAXPATHS];
    173   int   patterns = 0;
    174   char *newpattab_key[MAXPATHS];
    175   char *newpattab_val[MAXPATHS];
    176   int   newpatterns = 0;
    177   char format[132]; // 64+65+newline+zero+spare
    178   int p;
    179   if (argc!=3) die("Usage: <orig-file> <new-file>\n");
    180   if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
    181   if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
    182   // read all patterns and split in pure text (_key) & expanded patterns (_val)
    183   while(fgets(format,132,in)) {
    184     int l = strlen(format);
    185     if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
    186     if (format[0]=='%' || format[0]==0) {
    187       // skip
    188     } else {
    189       if (format[l-1]=='%') {
    190         l--;
    191         format[l] = 0; // remove '%'
    192       }
    193       int i,j;
    194       char *pat = (char*) malloc(l+1);
    195       char *org = (char*) malloc(l*2+1);
    196       expand(org,format,l);
    197       // remove hyphenation encoders (digits) from pat
    198       for (i=0,j=0; i<l; i++) {
    199         // odd, but utf-8 proof
    200         char c = format[i];
    201         if (c<'0' || c>'9') pat[j++]=c;
    202       }
    203       pat[j]=0;
    204       p = patterns;
    205       pattab_key[patterns]   = pat;
    206       pattab_val[patterns++] = org;
    207       if (patterns>MAXPATHS) die("to many base patterns");
    208     }
    209   }
    210   fclose(in);
    211   // As we use binairy search, make sure it is sorted
    212   qsort_arr(pattab_key,pattab_val,0,patterns-1);
    213 
    214   for (p=0; p<patterns; p++) {
    215     char *pat = pattab_key[p];
    216     int   patsize = strlen(pat);
    217     int   j,l;
    218     for (l=1; l<=patsize; l++) {
    219       for (j=1; j<=l; j++) {
    220         int i = l-j;
    221         int  subpat_ndx;
    222         char subpat[132];
    223         strncpy(subpat,pat+i,j); subpat[j]=0;
    224         if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
    225           int   newpat_ndx;
    226           char *newpat=malloc(l+1);
    227       //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
    228           strncpy(newpat, pat+0,l); newpat[l]=0;
    229           if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
    230             char *neworg = malloc(132); // TODO: compute exact length
    231             expand(neworg,newpat,l);
    232             newpattab_key[newpatterns]   = newpat;
    233             newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
    234             if (newpatterns>MAXPATHS) die("to many new patterns");
    235     //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
    236           } else {
    237             free(newpat);
    238             newpattab_val[newpat_ndx] = combine(
    239               newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
    240           }
    241         }
    242       }
    243     }
    244   }
    245 
    246   /* for some tiny extra speed, one could forget the free()s
    247    * as the memory is freed anyway on exit().
    248    * However, the gain is minimal and now the code can be cleanly
    249    * incorporated into other code */
    250   for (p=0; p<newpatterns; p++) {
    251     fprintf(out,"%s\n",compact(newpattab_val[p]));
    252     free(newpattab_key[p]);
    253     free(newpattab_val[p]);
    254   }
    255   fclose(out);
    256 
    257   for (p=0; p<patterns; p++) {
    258     free(pattab_key[p]);
    259     free(pattab_val[p]);
    260   }
    261   return 0;
    262 }
    263