Home | History | Annotate | Download | only in misc
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucdmerge.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003feb20
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Simple tool for Unicode Character Database files with semicolon-delimited fields.
     17 *   Merges adjacent, identical per-code point data lines into one line with range syntax.
     18 *
     19 *   To compile, just call a C compiler/linker with this source file.
     20 *   On Windows: cl ucdmerge.c
     21 */
     22 
     23 #include <stdio.h>
     24 #include <string.h>
     25 #include <stdlib.h>
     26 
     27 static const char *
     28 skipWhitespace(const char *s) {
     29     while(*s==' ' || *s=='\t') {
     30         ++s;
     31     }
     32     return s;
     33 }
     34 
     35 /* return the first character position after the end of the data */
     36 static char *
     37 endOfData(const char *l) {
     38     char *end;
     39     char c;
     40 
     41     end=strchr(l, '#');
     42     if(end!=NULL) {
     43         /* ignore whitespace before the comment */
     44         while(l!=end && ((c=*(end-1))==' ' || c=='\t')) {
     45             --end;
     46         }
     47     } else {
     48         end=strchr(l, 0);
     49     }
     50     return end;
     51 }
     52 
     53 static int
     54 sameData(const char *l1, const char *l2) {
     55     char *end1, *end2;
     56     int length;
     57 
     58     /* find the first semicolon in each line - there must be one */
     59     l1=strchr(l1, ';')+1;
     60     l2=strchr(l2, ';')+1;
     61 
     62     /* find the end of data: end of string or start of comment */
     63     end1=endOfData(l1);
     64     end2=endOfData(l2);
     65 
     66     /* compare the line data portions */
     67     length=end1-l1;
     68     return length==(end2-l2) && 0==memcmp(l1, l2, length);
     69 }
     70 
     71 extern int
     72 main(int argc, const char *argv[]) {
     73     static char line[2000], firstLine[2000], lastLine[2000];
     74     char *end;
     75     long first, last, c;
     76     int finished;
     77 
     78     first=last=-1;
     79     finished=0;
     80 
     81     for(;;) {
     82         if(gets(line)!=NULL) {
     83             /* parse the initial code point, if any */
     84             c=strtol(line, &end, 16);
     85             if(end!=line && *skipWhitespace(end)==';') {
     86                 /* single code point followed by semicolon and data, keep c */
     87             } else {
     88                 c=-1;
     89             }
     90         } else {
     91             line[0]=0;
     92             c=-1;
     93             finished=1;
     94         }
     95 
     96         if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
     97             /* output the current range */
     98             if(first==last) {
     99                 /* there was no range, just output the one line we found */
    100                 puts(firstLine);
    101             } else {
    102                 /* there was a real range, merge their lines */
    103                 end=strchr(lastLine, '#');
    104                 if(end==NULL) {
    105                     /* no comment in second line */
    106                     printf("%04lX..%04lX%s\n",
    107                             first, last,            /* code point range */
    108                             strchr(firstLine, ';'));/* first line starting from the first ; */
    109                 } else if(strchr(firstLine, '#')==NULL) {
    110                     /* no comment in first line */
    111                     printf("%04lX..%04lX%s%s\n",
    112                             first, last,            /* code point range */
    113                             strchr(firstLine, ';'), /* first line starting from the first ; */
    114                             end);                   /* comment from second line */
    115                 } else {
    116                     /* merge comments from both lines */
    117                     printf("%04lX..%04lX%s..%s\n",
    118                             first, last,            /* code point range */
    119                             strchr(firstLine, ';'), /* first line starting from the first ; */
    120                             skipWhitespace(end+1)); /* comment from second line, after # and spaces */
    121                 }
    122             }
    123             first=last=-1;
    124         }
    125 
    126         if(c<0) {
    127             if(finished) {
    128                 break;
    129             }
    130 
    131             /* no data on this line, output as is */
    132             puts(line);
    133         } else {
    134             /* data on this line, store for possible range compaction */
    135             if(last<0) {
    136                 /* set as the first line in a possible range */
    137                 first=last=c;
    138                 strcpy(firstLine, line);
    139                 lastLine[0]=0;
    140             } else /* must be c==(last+1) && sameData() because of previous conditions */ {
    141                 /* continue with the current range */
    142                 last=c;
    143                 strcpy(lastLine, line);
    144             }
    145         }
    146     }
    147 
    148     return 0;
    149 }
    150