// // A utility for finding substring embeddings in patterns #include <stdio.h> #include <string.h> #include <stdlib.h> #define MAXPATHS (256*1024) // // static void die( const char*msg ) { fprintf(stderr,"%s\n",msg); exit(1); } // Finds the index of an entry, only used on xxx_key arrays // Caveat: the table has to be sorted static int find_in( char *tab[], int max, const char *pat ) { int left=0, right=max-1; while (left <= right) { int mid = ((right-left)/2)+left; int v = strcmp(pat,tab[mid]); if (v>0) { left = mid + 1; } else if (v<0) { right = mid -1; } else { return mid; } } return -1; } // used by partition (which is used by qsort_arr) // static void swap2( char *a[], char *b[], int i, int j ) { if (i==j) return; char*v; v=a[i]; a[i]=a[j]; a[j]=v; v=b[i]; b[i]=b[j]; b[j]=v; } // used by qsort_arr // static int partition( char *a[], char *b[], int left, int right, int p ) { const char *pivotValue = a[p]; int i; swap2(a,b,p,right); // Move pivot to end p = left; for (i=left; i<right; i++) { if (strcmp(a[i],pivotValue)<=0) { swap2(a,b,p,i); p++; } } swap2(a,b,right,p); // Move pivot to its final place return p; } // // static void qsort_arr( char *a[], char *b[], int left, int right ) { while (right > left) { int p = left + (right-left)/2; //select a pivot p = partition(a,b, left, right, p); if ((p-1) - left < right - (p+1)) { qsort_arr(a,b, left, p-1); left = p+1; } else { qsort_arr(a,b, p+1, right); right = p-1; } } } // Removes extra '0' entries from the string // static char* compact( char *expr ) { int l=strlen(expr); int i,j; for (i=0,j=0; i<l; i++) { if (expr[i]!='0') expr[j++] = expr[i]; } expr[j]=0; return expr; } // convert 'n1im' to 0n1i0m0 expressed as a string // static void expand( char *expr, const char *pat, int l ) { int el = 0; char last = '.'; int i; for (i=0; i<l; i++) { char c = pat[i]; if ( (last<'0' || last>'9') && (c <'0' || c >'9') ) { expr[el++] = '0'; } expr[el++] = c; last = c; } if (last<'0' || last>'9') expr[el++] = '0'; expr[el]=0; } // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der // The second pattern needs to be a right side match of the first // (modulo digits) static char *combine( char *expr, const char *subexpr ) { int l1 = strlen(expr); int l2 = strlen(subexpr); int off = l1-l2; int j; // this works also for utf8 sequences because the substring is identical // to the last substring-length bytes of expr except for the (single byte) // hyphenation encoders for (j=0; j<l2; j++) { if (subexpr[j]>expr[off+j]) { expr[off+j] = subexpr[j]; } } return expr; } // // int main(int argc, const char* argv[]) { FILE *in, *out; char *pattab_key[MAXPATHS]; char *pattab_val[MAXPATHS]; int patterns = 0; char *newpattab_key[MAXPATHS]; char *newpattab_val[MAXPATHS]; int newpatterns = 0; char format[132]; // 64+65+newline+zero+spare int p; if (argc!=3) die("Usage: <orig-file> <new-file>\n"); if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input"); if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output"); // read all patterns and split in pure text (_key) & expanded patterns (_val) while(fgets(format,132,in)) { int l = strlen(format); if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp if (format[0]=='%' || format[0]==0) { // skip } else { if (format[l-1]=='%') { l--; format[l] = 0; // remove '%' } int i,j; char *pat = (char*) malloc(l+1); char *org = (char*) malloc(l*2+1); expand(org,format,l); // remove hyphenation encoders (digits) from pat for (i=0,j=0; i<l; i++) { // odd, but utf-8 proof char c = format[i]; if (c<'0' || c>'9') pat[j++]=c; } pat[j]=0; p = patterns; pattab_key[patterns] = pat; pattab_val[patterns++] = org; if (patterns>MAXPATHS) die("to many base patterns"); } } fclose(in); // As we use binairy search, make sure it is sorted qsort_arr(pattab_key,pattab_val,0,patterns-1); for (p=0; p<patterns; p++) { char *pat = pattab_key[p]; int patsize = strlen(pat); int j,l; for (l=1; l<=patsize; l++) { for (j=1; j<=l; j++) { int i = l-j; int subpat_ndx; char subpat[132]; strncpy(subpat,pat+i,j); subpat[j]=0; if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) { int newpat_ndx; char *newpat=malloc(l+1); //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]); strncpy(newpat, pat+0,l); newpat[l]=0; if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) { char *neworg = malloc(132); // TODO: compute exact length expand(neworg,newpat,l); newpattab_key[newpatterns] = newpat; newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]); if (newpatterns>MAXPATHS) die("to many new patterns"); //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg); } else { free(newpat); newpattab_val[newpat_ndx] = combine( newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); } } } } } /* for some tiny extra speed, one could forget the free()s * as the memory is freed anyway on exit(). * However, the gain is minimal and now the code can be cleanly * incorporated into other code */ for (p=0; p<newpatterns; p++) { fprintf(out,"%s\n",compact(newpattab_val[p])); free(newpattab_key[p]); free(newpattab_val[p]); } fclose(out); for (p=0; p<patterns; p++) { free(pattab_key[p]); free(pattab_val[p]); } return 0; }