/* ******************************************************************************* * * Copyright (C) 1999-2008, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: genprops.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999dec08 * created by: Markus W. Scherer * * This program reads several of the Unicode character database text files, * parses them, and extracts most of the properties for each character. * It then writes a binary file containing the properties * that is designed to be used directly for random-access to * the properties of each Unicode character. */ #include <stdio.h> #include <stdlib.h> #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/putil.h" #include "unicode/uclean.h" #include "cmemory.h" #include "cstring.h" #include "unewdata.h" #include "uoptions.h" #include "uparse.h" #include "uprops.h" #include "propsvec.h" U_CDECL_BEGIN #include "genprops.h" U_CDECL_END #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) UBool beVerbose=FALSE, haveCopyright=TRUE; /* prototypes --------------------------------------------------------------- */ static void parseDB(const char *filename, UErrorCode *pErrorCode); /* -------------------------------------------------------------------------- */ enum { HELP_H, HELP_QUESTION_MARK, VERBOSE, COPYRIGHT, DESTDIR, SOURCEDIR, UNICODE_VERSION, ICUDATADIR, CSOURCE }; /* Keep these values in sync with the above enums */ static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_SOURCEDIR, UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), UOPTION_ICUDATADIR, UOPTION_DEF("csource", 'C', UOPT_NO_ARG) }; extern int main(int argc, char* argv[]) { char filename[300]; const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; char *basename=NULL; UErrorCode errorCode=U_ZERO_ERROR; U_MAIN_INIT_ARGS(argc, argv); /* preset then read command line options */ options[DESTDIR].value=u_getDataDirectory(); options[SOURCEDIR].value=""; options[UNICODE_VERSION].value=""; options[ICUDATADIR].value=u_getDataDirectory(); argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); /* error handling, printing usage message */ if(argc<0) { fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); } if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { /* * Broken into chucks because the C89 standard says the minimum * required supported string length is 509 bytes. */ fprintf(stderr, "Usage: %s [-options] [suffix]\n" "\n" "read the UnicodeData.txt file and other Unicode properties files and\n" "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" "\n", argv[0]); fprintf(stderr, "Options:\n" "\t-h or -? or --help this usage text\n" "\t-v or --verbose verbose output\n" "\t-c or --copyright include a copyright notice\n" "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" "\t-C or --csource generate a .c source file rather than the .icu binary\n"); fprintf(stderr, "\t-d or --destdir destination directory, followed by the path\n" "\t-s or --sourcedir source directory, followed by the path\n" "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" "\t followed by path, defaults to %s\n" "\tsuffix suffix that is to be appended with a '-'\n" "\t to the source file basenames before opening;\n" "\t 'genprops new' will read UnicodeData-new.txt etc.\n", u_getDataDirectory()); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } /* get the options values */ beVerbose=options[VERBOSE].doesOccur; haveCopyright=options[COPYRIGHT].doesOccur; srcDir=options[SOURCEDIR].value; destDir=options[DESTDIR].value; if(argc>=2) { suffix=argv[1]; } else { suffix=NULL; } if(options[UNICODE_VERSION].doesOccur) { setUnicodeVersion(options[UNICODE_VERSION].value); } /* else use the default dataVersion in store.c */ if (options[ICUDATADIR].doesOccur) { u_setDataDirectory(options[ICUDATADIR].value); } /* prepare the filename beginning with the source dir */ uprv_strcpy(filename, srcDir); basename=filename+uprv_strlen(filename); if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { *basename++=U_FILE_SEP_CHAR; } /* initialize */ initStore(); /* process UnicodeData.txt */ writeUCDFilename(basename, "UnicodeData", suffix); parseDB(filename, &errorCode); /* process additional properties files */ *basename=0; generateAdditionalProperties(filename, suffix, &errorCode); /* process parsed data */ if(U_SUCCESS(errorCode)) { /* write the properties data file */ generateData(destDir, options[CSOURCE].doesOccur); } exitStore(); u_cleanup(); return errorCode; } U_CFUNC void writeUCDFilename(char *basename, const char *filename, const char *suffix) { int32_t length=(int32_t)uprv_strlen(filename); uprv_strcpy(basename, filename); if(suffix!=NULL) { basename[length++]='-'; uprv_strcpy(basename+length, suffix); length+=(int32_t)uprv_strlen(suffix); } uprv_strcpy(basename+length, ".txt"); } U_CFUNC UBool isToken(const char *token, const char *s) { const char *z; int32_t j; s=u_skipWhitespace(s); for(j=0;; ++j) { if(token[j]!=0) { if(s[j]!=token[j]) { break; } } else { z=u_skipWhitespace(s+j); if(*z==';' || *z==0) { return TRUE; } else { break; } } } return FALSE; } U_CFUNC int32_t getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { const char *t, *z; int32_t i, j; s=u_skipWhitespace(s); for(i=0; i<countTokens; ++i) { t=tokens[i]; if(t!=NULL) { for(j=0;; ++j) { if(t[j]!=0) { if(s[j]!=t[j]) { break; } } else { z=u_skipWhitespace(s+j); if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') { return i; } else { break; } } } } } return -1; } /* parser for UnicodeData.txt ----------------------------------------------- */ /* general categories */ const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={ "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Co", "Cs", "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", "Pi", "Pf" }; const char *const decompositionTypeNames[U_DT_COUNT]={ NULL, NULL, "compat", "circle", "final", "font", "fraction", "initial", "isolated", "medial", "narrow", "noBreak", "small", "square", "sub", "super", "vertical", "wide" }; static struct { uint32_t first, last, props; char name[80]; } unicodeAreas[32]; static int32_t unicodeAreaIndex=0; static void U_CALLCONV unicodeDataLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { Props p; char *end; static uint32_t prevCode=0; uint32_t value; int32_t i; /* reset the properties */ uprv_memset(&p, 0, sizeof(Props)); /* get the character code, field 0 */ p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); if(end<=fields[0][0] || end!=fields[0][1]) { fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* get general category, field 2 */ i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); if(i>=0) { p.generalCategory=(uint8_t)i; } else { fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", fields[2][0], (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* get decomposition type, field 5 */ if(fields[5][0]<fields[5][1]) { /* there is some decomposition */ if(*fields[5][0]!='<') { /* canonical */ i=U_DT_CANONICAL; } else { /* get compatibility type */ end=fields[5][0]+1; while(end<fields[5][1] && *end!='>') { ++end; } *end='#'; i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1); if(i<0) { fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n", fields[5][0], (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } } upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode); if(U_FAILURE(*pErrorCode)) { fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode)); exit(*pErrorCode); } } /* decimal digit value, field 6 */ if(fields[6][0]<fields[6][1]) { value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10); if(end!=fields[6][1] || value>0x7fff) { fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } p.numericValue=(int32_t)value; p.numericType=1; } /* digit value, field 7 */ if(fields[7][0]<fields[7][1]) { value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10); if(end!=fields[7][1] || value>0x7fff) { fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } if(p.numericType==0) { p.numericValue=(int32_t)value; p.numericType=2; } else if((int32_t)value!=p.numericValue) { fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } } /* numeric value, field 8 */ if(fields[8][0]<fields[8][1]) { char *s=fields[8][0]; UBool isNegative; /* get a possible minus sign */ if(*s=='-') { isNegative=TRUE; ++s; } else { isNegative=FALSE; } value=(uint32_t)uprv_strtoul(s, &end, 10); if(value>0 && *end=='/') { /* field 8 may contain a fractional value, get the denominator */ if(p.numericType>0) { fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10); if(p.denominator==0) { fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } } if(end!=fields[8][1] || value>0x7fffffff) { fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } if(p.numericType==0) { if(isNegative) { p.numericValue=-(int32_t)value; } else { p.numericValue=(int32_t)value; } p.numericType=3; } else if((int32_t)value!=p.numericValue) { fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } } value=makeProps(&p); if(*fields[1][0]=='<') { /* first or last entry of a Unicode area */ size_t length=fields[1][1]-fields[1][0]; if(length<9) { /* name too short for an area name */ } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) { /* set the current area */ if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) { length-=9; unicodeAreas[unicodeAreaIndex].first=p.code; unicodeAreas[unicodeAreaIndex].props=value; uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length); unicodeAreas[unicodeAreaIndex].name[length]=0; } else { /* error: a previous area is incomplete */ fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } return; } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) { /* check that the current area matches, and complete it with the last code point */ length-=8; if( unicodeAreas[unicodeAreaIndex].props==value && 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) && unicodeAreas[unicodeAreaIndex].name[length]==0 && unicodeAreas[unicodeAreaIndex].first<p.code ) { unicodeAreas[unicodeAreaIndex].last=p.code; if(beVerbose) { printf("Unicode area U+%04lx..U+%04lx \"%s\"\n", (unsigned long)unicodeAreas[unicodeAreaIndex].first, (unsigned long)unicodeAreas[unicodeAreaIndex].last, unicodeAreas[unicodeAreaIndex].name); } unicodeAreas[++unicodeAreaIndex].first=0xffffffff; } else { /* error: different properties between first & last, different area name, first>=last */ fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } return; } else { /* not an area name */ } } /* check for non-character code points */ if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n", (unsigned long)p.code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* check that the code points (p.code) are in ascending order */ if(p.code<=prevCode && p.code>0) { fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", (unsigned long)p.code, (unsigned long)prevCode); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } prevCode=p.code; /* properties for a single code point */ addProps(p.code, value); } /* set repeated properties for the areas */ static void repeatAreaProps() { uint32_t puaProps; int32_t i; UBool hasPlane15PUA, hasPlane16PUA; UErrorCode errorCode; /* * UnicodeData.txt before 3.0.1 did not contain the PUAs on * planes 15 and 16. * If that is the case, then we add them here, using the properties * from the BMP PUA. */ puaProps=0; hasPlane15PUA=hasPlane16PUA=FALSE; for(i=0; i<unicodeAreaIndex; ++i) { repeatProps(unicodeAreas[i].first, unicodeAreas[i].last, unicodeAreas[i].props); if(unicodeAreas[i].first==0xe000) { puaProps=unicodeAreas[i].props; } else if(unicodeAreas[i].first==0xf0000) { hasPlane15PUA=TRUE; } else if(unicodeAreas[i].first==0x100000) { hasPlane16PUA=TRUE; } } if(puaProps!=0) { if(!hasPlane15PUA) { repeatProps(0xf0000, 0xffffd, puaProps); } if(!hasPlane16PUA) { repeatProps(0x100000, 0x10fffd, puaProps); } } /* Hangul have canonical decompositions */ errorCode=U_ZERO_ERROR; upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode)); exit(errorCode); } } static void parseDB(const char *filename, UErrorCode *pErrorCode) { char *fields[15][2]; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */ unicodeAreas[0].first=0xffffffff; u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) { fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n", unicodeAreas[unicodeAreaIndex].name, (unsigned long)unicodeAreas[unicodeAreaIndex].first); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } repeatAreaProps(); if(U_FAILURE(*pErrorCode)) { return; } } /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */