C++程序  |  263行  |  6.17 KB

//
// A utility for finding substring embeddings in patterns

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAXPATHS (256*1024)

//
//
static void die(
  const char*msg
) {
  fprintf(stderr,"%s\n",msg);
  exit(1);
}


// Finds the index of an entry, only used on xxx_key arrays
// Caveat: the table has to be sorted
static int find_in(
  char *tab[],
  int max,
  const char *pat
) {
  int left=0, right=max-1;
  while (left <= right) {
    int mid = ((right-left)/2)+left;
    int v   = strcmp(pat,tab[mid]);
    if (v>0) {
      left = mid + 1;
    } else if (v<0) {
      right = mid -1;
    } else {
      return mid;
    }
  }
  return -1;
}


// used by partition (which is used by qsort_arr)
//
static void swap2(
  char *a[],
  char *b[],
  int i,
  int j
) {
  if (i==j) return;
  char*v;
  v=a[i]; a[i]=a[j]; a[j]=v;
  v=b[i]; b[i]=b[j]; b[j]=v;
}


// used by qsort_arr
//
static int partition(
  char *a[],
  char *b[],
  int left,
  int right,
  int p
) {
  const char *pivotValue = a[p];
  int i;
  swap2(a,b,p,right); // Move pivot to end
  p = left;
  for (i=left; i<right; i++) {
    if (strcmp(a[i],pivotValue)<=0) {
      swap2(a,b,p,i);
      p++;
    }
  }
  swap2(a,b,right,p); // Move pivot to its final place
  return p;
}


//
//
static void qsort_arr(
  char *a[],
  char *b[],
  int left,
  int right
) {
  while (right > left) {
    int p = left + (right-left)/2; //select a pivot
    p = partition(a,b, left, right, p);
    if ((p-1) - left < right - (p+1)) {
      qsort_arr(a,b, left, p-1);
      left  = p+1;
    } else {
      qsort_arr(a,b, p+1, right);
      right = p-1;
    }
  }
}


// Removes extra '0' entries from the string
//
static char* compact(
  char *expr
) {
  int l=strlen(expr);
  int i,j;
  for (i=0,j=0; i<l; i++) {
    if (expr[i]!='0') expr[j++] = expr[i];
  }
  expr[j]=0;
  return expr;
}


// convert 'n1im' to 0n1i0m0 expressed as a string
//
static void expand(
  char *expr,
  const char *pat,
  int l
) {
  int  el = 0;
  char last = '.';
  int  i;
  for (i=0; i<l; i++) {
    char c = pat[i];
    if ( (last<'0' || last>'9')
      && (c   <'0' || c   >'9')
      ) {
      expr[el++] = '0';
    }
    expr[el++] = c;
    last = c;
  }
  if (last<'0' || last>'9') expr[el++] = '0';
  expr[el]=0;
}


// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
// The second pattern needs to be a right side match of the first
// (modulo digits)
static char *combine(
  char *expr,
  const char *subexpr
) {
  int l1 = strlen(expr);
  int l2 = strlen(subexpr);
  int off = l1-l2;
  int j;
  // this works also for utf8 sequences because the substring is identical
  // to the last substring-length bytes of expr except for the (single byte)
  // hyphenation encoders
  for (j=0; j<l2; j++) {
    if (subexpr[j]>expr[off+j]) {
      expr[off+j] = subexpr[j];
    }
  }
  return expr;
}


//
//
int main(int argc, const char* argv[]) {
  FILE *in, *out;
  char *pattab_key[MAXPATHS];
  char *pattab_val[MAXPATHS];
  int   patterns = 0;
  char *newpattab_key[MAXPATHS];
  char *newpattab_val[MAXPATHS];
  int   newpatterns = 0;
  char format[132]; // 64+65+newline+zero+spare
  int p;
  if (argc!=3) die("Usage: <orig-file> <new-file>\n");
  if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
  if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
  // read all patterns and split in pure text (_key) & expanded patterns (_val)
  while(fgets(format,132,in)) {
    int l = strlen(format);
    if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
    if (format[0]=='%' || format[0]==0) {
      // skip
    } else {
      if (format[l-1]=='%') {
        l--;
        format[l] = 0; // remove '%'
      }
      int i,j;
      char *pat = (char*) malloc(l+1);
      char *org = (char*) malloc(l*2+1);
      expand(org,format,l);
      // remove hyphenation encoders (digits) from pat
      for (i=0,j=0; i<l; i++) {
        // odd, but utf-8 proof
        char c = format[i];
        if (c<'0' || c>'9') pat[j++]=c;
      }
      pat[j]=0;
      p = patterns;
      pattab_key[patterns]   = pat;
      pattab_val[patterns++] = org;
      if (patterns>MAXPATHS) die("to many base patterns");
    }
  }
  fclose(in);
  // As we use binairy search, make sure it is sorted
  qsort_arr(pattab_key,pattab_val,0,patterns-1);

  for (p=0; p<patterns; p++) {
    char *pat = pattab_key[p];
    int   patsize = strlen(pat);
    int   j,l;
    for (l=1; l<=patsize; l++) {
      for (j=1; j<=l; j++) {
        int i = l-j;
        int  subpat_ndx;
        char subpat[132];
        strncpy(subpat,pat+i,j); subpat[j]=0;
        if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
          int   newpat_ndx;
          char *newpat=malloc(l+1);
      //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
          strncpy(newpat, pat+0,l); newpat[l]=0;
          if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
            char *neworg = malloc(132); // TODO: compute exact length
            expand(neworg,newpat,l);
            newpattab_key[newpatterns]   = newpat;
            newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
            if (newpatterns>MAXPATHS) die("to many new patterns");
    //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
          } else {
            free(newpat);
            newpattab_val[newpat_ndx] = combine(
              newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); 
          }
        }
      }
    }
  }

  /* for some tiny extra speed, one could forget the free()s
   * as the memory is freed anyway on exit().
   * However, the gain is minimal and now the code can be cleanly
   * incorporated into other code */
  for (p=0; p<newpatterns; p++) {
    fprintf(out,"%s\n",compact(newpattab_val[p]));
    free(newpattab_key[p]);
    free(newpattab_val[p]);
  }
  fclose(out);

  for (p=0; p<patterns; p++) {
    free(pattab_key[p]);
    free(pattab_val[p]);
  }
  return 0;
}