/*
*******************************************************************************
*
*   Copyright (C) 1999-2009, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  gennames.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep30
*   created by: Markus W. Scherer
*
*   This program reads the Unicode character database text file,
*   parses it, and extracts the character code,
*   the "modern" character name, and optionally the
*   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
*   It then tokenizes and compresses the names and builds
*   compact binary tables for random-access lookup
*   in a u_charName() API function.
*
* unames.icu file format (after UDataInfo header etc. - see udata.c)
* (all data is static const)
*
* UDataInfo fields:
*   dataFormat "unam"
*   formatVersion 1.0
*   dataVersion = Unicode version from -u or --unicode command line option,
*                 defaults to 4.1 (the preset in main())
*
* -- data-based names
* uint32_t tokenStringOffset,
*          groupsOffset,
*          groupStringOffset,
*          algNamesOffset;
*
* uint16_t tokenCount;
* uint16_t tokenTable[tokenCount];
*
* char     tokenStrings[]; -- padded to even count
*
* -- strings (groupStrings) are tokenized as follows:
*   for each character c
*       if(c>=tokenCount) write that character c directly
*       else
*           token=tokenTable[c];
*           if(token==0xfffe) -- lead byte of double-byte token
*               token=tokenTable[c<<8|next character];
*           if(token==-1)
*               write c directly
*           else
*               tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
*               append zero-terminated tokenString;
*
*   Different strings for a code point - normal name, 1.0 name, and ISO comment -
*   are separated by ';'.
*
* uint16_t groupCount;
* struct {
*   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
*   uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32 bit-offset
*   uint16_t offsetLow;
* } groupTable[groupCount];
*
* char     groupStrings[]; -- padded to 4-count
*
* -- The actual, tokenized group strings are not zero-terminated because
*   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
*   For each of the 32 group strings, one or two nibbles are stored for its length.
*   Nibbles (4-bit values, half-bytes) are read MSB first.
*   A nibble with a value of 0..11 directly indicates the length of the name string.
*   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
*   by (((n-12)<<4)|m)+12, reaching values of 12..75.
*   These lengths are stored sequentially, one for each tokenized string, not for the de-tokenized result.
*   For the de-tokenizing, see token description above; the strings immediately follow the
*   32 lengths.
*
* -- algorithmic names
*
* typedef struct AlgorithmicRange {
*     uint32_t rangeStart, rangeEnd;
*     uint8_t algorithmType, algorithmVariant;
*     uint16_t rangeSize;
* } AlgorithmicRange;
*
* uint32_t algRangesCount; -- number of data blocks for ranges of
*               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
*
* struct {
*     AlgorithmicRange algRange;
*     uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
* -- not a real array because each part has a different size
*    of algRange.rangeSize (including AlgorithmicRange)
*
* -- algorithmic range types:
*
* 0 Names are formed from a string prefix that is stored in
*   the algRangeData (zero-terminated), followed by the Unicode code point
*   of the character in hexadecimal digits;
*   algRange.algorithmVariant digits are written
*
* 1 Names are formed by calculating modulo-factors of the code point value as follows:
*   algRange.algorithmVariant is the count of modulo factors
*   algRangeData contains
*       uint16_t factors[algRange.algorithmVariant];
*       char strings[];
*   the first zero-terminated string is written as the prefix; then:
*
*   The rangeStart is subtracted; with the difference, here "code":
*   for(i=algRange.algorithmVariant-1 to 0 step -1)
*       index[i]=code%factors[i];
*       code/=factors[i];
*
*   The strings after the prefix are short pieces that are then appended to the result
*   according to index[0..algRange.algorithmVariant-1].
*/ #include <stdio.h> #include "unicode/utypes.h" #include "unicode/putil.h" #include "unicode/uclean.h" #include "unicode/udata.h" #include "cmemory.h" #include "cstring.h" #include "uarrsort.h" #include "unewdata.h" #include "uoptions.h" #include "uparse.h" #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) #define STRING_STORE_SIZE 1000000 #define GROUP_STORE_SIZE 5000 #define GROUP_SHIFT 5 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT) #define GROUP_MASK (LINES_PER_GROUP-1) #define MAX_LINE_COUNT 50000 #define MAX_WORD_COUNT 20000 #define MAX_GROUP_COUNT 5000 #define DATA_NAME "unames" #define DATA_TYPE "icu" #define VERSION_STRING "unam" #define NAME_SEPARATOR_CHAR ';' #define ISO_DATA_NAME "ucomment" /* Unicode versions --------------------------------------------------------- */ enum { UNI_1_0, UNI_1_1, UNI_2_0, UNI_3_0, UNI_3_1, UNI_3_2, UNI_4_0, UNI_4_0_1, UNI_4_1, UNI_5_0, UNI_5_1, UNI_5_2, UNI_VER_COUNT }; static const UVersionInfo unicodeVersions[]={ { 1, 0, 0, 0 }, { 1, 1, 0, 0 }, { 2, 0, 0, 0 }, { 3, 0, 0, 0 }, { 3, 1, 0, 0 }, { 3, 2, 0, 0 }, { 4, 0, 0, 0 }, { 4, 0, 1, 0 }, { 4, 1, 0, 0 }, { 5, 0, 0, 0 }, { 5, 1, 0, 0 }, { 5, 2, 0, 0 } }; static int32_t ucdVersion=UNI_5_2; static int32_t findUnicodeVersion(const UVersionInfo version) { int32_t i; for(i=0; /* while(version>unicodeVersions[i]) {} */ i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; ++i) {} if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ } return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ } /* generator data ----------------------------------------------------------- */ /* UDataInfo cf. 
udata.h */
/*
 * Header information for the output file.
 * The dataVersion below is only a placeholder: main() overwrites it with the
 * version parsed from the --unicode option.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};

/* command line flags; main() sets all three from the parsed options */
static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;

/* Selects which of the name fields of a UnicodeData line are stored. */
typedef struct Options {
    UBool storeNames;           /* the regular ("modern") character names */
    UBool store10Names;         /* the Unicode 1.0 names */
    UBool storeISOComments;     /* the ISO 10646 comments */
} Options;

/*
 * Pair of code point and name alias.
 * Try to keep sizeof(CpNameAlias) a multiple of 4 to avoid padding.
 */
typedef struct CpNameAlias {
    uint32_t code;
    char nameAlias[124];    /* zero-terminated */
} CpNameAlias;

/*
 * Name aliases in ascending code point order.
 * cpNameAliasesTop is the number of entries stored by nameAliasesLineFn();
 * cpNameAliasesIndex is the next entry that lineFn() has yet to match up
 * with a UnicodeData line.
 */
static CpNameAlias cpNameAliases[50];

static uint32_t cpNameAliasesIndex=0, cpNameAliasesTop=0;

/*
 * stringStore[] holds the raw lines, later the compressed groups and the token
 * strings; groupStore[] is the work buffer for the group being compressed;
 * lineLengths[] collects the length nibbles of the current group
 * (presumably filled by appendLineLength(), which is not defined in this part
 * of the file - TODO confirm).
 */
static uint8_t stringStore[STRING_STORE_SIZE],
               groupStore[GROUP_STORE_SIZE],
               lineLengths[LINES_PER_GROUP];

/*
 * lineTop: limit of the used bottom part of stringStore[].
 * groupBottom: set to lineTop when compression starts, i.e., start of the groups.
 * wordBottom: starts at the top end of stringStore[]
 *             (presumably words are allocated downwards from there - TODO confirm).
 * lineLengthsTop: reset to 0 at the start of each group.
 */
static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;

/* One stored line (before compression) or one stored group (after compression). */
typedef struct {
    uint32_t code;      /* code point; for a group: the groupMSB */
    int16_t length;
    uint8_t *s;
} Line;

/* One dictionary word, a candidate for becoming a token. */
typedef struct {
    int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;
    int16_t length;
    uint8_t *s;
} Word;

static Line lines[MAX_LINE_COUNT];
static Word words[MAX_WORD_COUNT];

static uint32_t lineCount=0, wordCount=0;

/* number of lead bytes for double-byte tokens; set by compress() */
static int16_t leadByteCount;

#define LEADBYTE_LIMIT 16

/*
 * Token table, indexed by one byte or by (lead byte<<8 | trail byte).
 * While compressing: -1 marks a byte that is written literally (not a token),
 * -2 marks a lead byte, any other value is an index into words[].
 * generateData() replaces the word indexes with token string offsets.
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
static uint32_t tokenCount;

/* prototypes --------------------------------------------------------------- */

static void
init(void);

static void
parseNameAliases(const char *filename, Options *options);

static void
parseDB(const char *filename, Options *options);

static void
parseName(char *name, int16_t length);

static int16_t
skipNoise(char *line, int16_t start, int16_t limit);

static int16_t
getWord(char *line, int16_t start, int16_t limit);

static void
compress(void);

static void
compressLines(void);

static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);

static int32_t
compareWords(const void *context, const void *word1, const void *word2);

static void
generateData(const char *dataDir, Options *options);

static uint32_t
generateAlgorithmicData(UNewDataMemory *pData, Options *options);

static int16_t
findToken(uint8_t *s, int16_t length);

static Word *
findWord(char *s, int16_t length);

static Word *
addWord(char *s, int16_t length);

static void
countWord(Word *word);

static void
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);

static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);

static uint32_t
addToken(uint8_t *s, int16_t length);

static void
appendLineLength(int16_t length);

static void
appendLineLengthNibble(uint8_t nibble);

static uint8_t *
allocLine(int32_t length);

static uint8_t *
allocWord(uint32_t length);

/* -------------------------------------------------------------------------- */

/* Indexes into options[]; the order must match the initializers below. */
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    QUIET,
    COPYRIGHT,
    DESTDIR,
    UNICODE,
    UNICODE1_NAMES,
    NO_ISO_COMMENTS,
    ONLY_ISO_COMMENTS
};

static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_QUIET,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
    { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
    { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
};

/*
 * Program entry point: parses the command line, reads the optional
 * NameAliases file (argv[2]) and the UnicodeData file (argv[1]),
 * compresses the names and writes the binary data file.
 *
 * Returns 0 on success, U_ILLEGAL_ARGUMENT_ERROR after a command line error
 * (or when no input file is given), and U_ZERO_ERROR when only the usage text
 * was requested. Parse and output errors terminate the program via exit()
 * in the functions that detect them.
 */
extern int
main(int argc, char* argv[]) {
    UVersionInfo version;
    /* by default: store names and ISO comments but not Unicode 1.0 names */
    Options moreOptions={ TRUE, FALSE, TRUE };
    UErrorCode errorCode = U_ZERO_ERROR;

    U_MAIN_INIT_ARGS(argc, argv);

    /* Initialize ICU */
    u_init(&errorCode);
    if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
        /* Note: u_init() will try to open ICU property data.
         *       failures here are expected when building ICU from scratch.
         *       ignore them.
         */
        fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
            argv[0], u_errorName(errorCode));
        exit(1);
    }

    /* preset then read command line options */
    options[DESTDIR].value=u_getDataDirectory();
    options[UNICODE].value="4.1";
    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);

    /* error handling, printing usage message */
    if(argc<0) {
        /* a negative result is the negated index of the offending argument */
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    } else if(argc<2) {
        /*
         * No input file name: treated like a command line error.
         * NOTE: because of this, argc>=2 always holds when parseDB() is
         * called below, so its "-" fallback is never used.
         */
        argc=-1;
    }
    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
        /*
         * Broken into chunks because the C89 standard says the minimum
         * required supported string length is 509 bytes.
         */
        fprintf(stderr,
            "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] [filename_ud [filename_na]]\n"
            "\n"
            "Read the UnicodeData.txt file and \n"
            "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
            "\n"
            "\tfilename_ud absolute path/filename for the UnicodeData.txt file\n"
            "\t (default: standard input)\n"
            "\tfilename_na absolute path/filename for the NameAliases.txt file\n"
            "\t (default: no name aliases)\n"
            "\n",
            argv[0]);
        fprintf(stderr,
            "Options:\n"
            "\t-h or -? or --help this usage text\n"
            "\t-v or --verbose verbose output\n"
            "\t-q or --quiet no output\n"
            "\t-c or --copyright include a copyright notice\n"
            "\t-d or --destdir destination directory, followed by the path\n"
            "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
        fprintf(stderr,
            "\t-1 or --unicode1-names store Unicode 1.0 character names\n"
            "\t --no-iso-comments do not store ISO comments\n"
            "\t --only-iso-comments write ucomment.icu with only ISO comments\n");
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[VERBOSE].doesOccur;
    beQuiet=options[QUIET].doesOccur;
    haveCopyright=options[COPYRIGHT].doesOccur;
    moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
    moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
    if(options[ONLY_ISO_COMMENTS].doesOccur) {
        /* --only-iso-comments overrides -1 and --no-iso-comments */
        moreOptions.storeNames=moreOptions.store10Names=FALSE;
        moreOptions.storeISOComments=TRUE;
    }

    /* set the Unicode version */
    u_versionFromString(version, options[UNICODE].value);
    uprv_memcpy(dataInfo.dataVersion, version, 4);
    ucdVersion=findUnicodeVersion(version);

    init();
    /* the aliases must be read first: lineFn() merges them into the UnicodeData lines */
    if(argc>=3) {
        parseNameAliases(argv[2], &moreOptions);
    }
    parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
    compress();
    generateData(options[DESTDIR].value, &moreOptions);

    u_cleanup();
    return 0;
}

/* Reset the single-byte part (first 256 entries) of the token table to 0. */
static void
init() {
    int i;

    for(i=0; i<256; ++i) {
        tokens[i]=0;
    }
}

/* parsing ------------------------------------------------------------------ */

/*
 * get a name, strip leading and trailing whitespace
 *
 * pStart: in/out; points to the start of the field on input and
 *         is set to the first non-whitespace character on output
 * limit:  end (exclusive) of the field
 * returns the length of the stripped name
 */
static int16_t
getName(char **pStart, char *limit) {
    /* strip leading whitespace */
    char *start=(char *)u_skipWhitespace(*pStart);

    /* strip trailing whitespace (only spaces and tabs are removed here) */
    while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
        --limit;
    }

    /* return results */
    *pStart=start;
    return (int16_t)(limit-start);
}

/*
 * Line callback that parseNameAliases() passes to u_parseDelimitedFile().
 * Field 0 is the code point in hexadecimal, field 1 is the name alias.
 * Each alias is appended to cpNameAliases[] and its words are counted
 * via parseName(). All errors are fatal: the function prints a message
 * and calls exit(U_PARSE_ERROR).
 */
static void U_CALLCONV
nameAliasesLineFn(void *context,
       char *fields[][2], int32_t fieldCount,
       UErrorCode *pErrorCode) {
    char *name;
    int16_t length=0;
    static uint32_t prevCode=0;     /* code point of the previous line, for the order check */
    uint32_t code=0;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    /* get the character code */
    code=uprv_strtoul(fields[0][0], NULL, 16);

    /* get the character name */
    name=fields[1][0];
    length=getName(&name, fields[1][1]);
    /* the alias must leave room for the terminating NUL in nameAlias[] */
    if(length==0 || length>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
        fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
                name, (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check for non-character
code points */ if(!U_IS_UNICODE_CHAR(code)) { fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n", (unsigned long)code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* check that the code points (code) are in ascending order */ if(code<=prevCode && code>0) { fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n", (unsigned long)code, (unsigned long)prevCode); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } prevCode=code; if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) { fprintf(stderr, "gennames: error - too many name aliases\n"); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } cpNameAliases[cpNameAliasesTop].code=code; uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length); cpNameAliases[cpNameAliasesTop].nameAlias[length]=0; ++cpNameAliasesTop; parseName(name, length); } static void U_CALLCONV lineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { Options *storeOptions=(Options *)context; char *names[4]; int16_t lengths[4]={ 0, 0, 0, 0 }; static uint32_t prevCode=0; uint32_t code=0; if(U_FAILURE(*pErrorCode)) { return; } /* get the character code */ code=uprv_strtoul(fields[0][0], NULL, 16); /* get the character name */ if(storeOptions->storeNames) { names[0]=fields[1][0]; lengths[0]=getName(names+0, fields[1][1]); if(names[0][0]=='<') { /* do not store pseudo-names in <> brackets */ lengths[0]=0; } } /* store 1.0 names */ /* get the second character name, the one from Unicode 1.0 */ if(storeOptions->store10Names) { names[1]=fields[10][0]; lengths[1]=getName(names+1, fields[10][1]); if(names[1][0]=='<') { /* do not store pseudo-names in <> brackets */ lengths[1]=0; } } /* get the ISO 10646 comment */ if(storeOptions->storeISOComments) { names[2]=fields[11][0]; lengths[2]=getName(names+2, fields[11][1]); } if(lengths[0]+lengths[1]+lengths[2]==0) { return; } /* check for non-character code points */ if(!U_IS_UNICODE_CHAR(code)) { 
fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n", (unsigned long)code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* check that the code points (code) are in ascending order */ if(code<=prevCode && code>0) { fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", (unsigned long)code, (unsigned long)prevCode); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } prevCode=code; parseName(names[0], lengths[0]); parseName(names[1], lengths[1]); parseName(names[2], lengths[2]); if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) { if(code==cpNameAliases[cpNameAliasesIndex].code) { names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias; lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias); ++cpNameAliasesIndex; } else { fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n", (unsigned long)code); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } } /* * set the count argument to * 1: only store regular names, or only store ISO 10646 comments * 2: store regular and 1.0 names * 3: store names and ISO 10646 comment * 4: also store name alias * * addLine() will ignore empty trailing names */ if(storeOptions->storeNames) { /* store names and comments as parsed according to storeOptions */ addLine(code, names, lengths, LENGTHOF(names)); } else { /* store only ISO 10646 comments */ addLine(code, names+2, lengths+2, 1); } } static void parseNameAliases(const char *filename, Options *storeOptions) { char *fields[2][2]; UErrorCode errorCode=U_ZERO_ERROR; if(!storeOptions->storeNames) { return; } u_parseDelimitedFile(filename, ';', fields, 2, nameAliasesLineFn, NULL, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); exit(errorCode); } if(!beQuiet) { printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop); } } static void parseDB(const char 
*filename, Options *storeOptions) { char *fields[15][2]; UErrorCode errorCode=U_ZERO_ERROR; u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); exit(errorCode); } if(cpNameAliasesIndex<cpNameAliasesTop) { fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n", (unsigned long)cpNameAliases[cpNameAliasesIndex].code); exit(U_PARSE_ERROR); } if(!beQuiet) { printf("size of all names in the database: %lu\n", (unsigned long)lineTop); printf("number of named Unicode characters: %lu\n", (unsigned long)lineCount); printf("number of words in the dictionary from these names: %lu\n", (unsigned long)wordCount); } } static void parseName(char *name, int16_t length) { int16_t start=0, limit, wordLength/*, prevStart=-1*/; Word *word; while(start<length) { /* skip any "noise" characters */ limit=skipNoise(name, start, length); if(start<limit) { /*prevStart=-1;*/ start=limit; } if(start==length) { break; } /* get a word and add it if it is longer than 1 */ limit=getWord(name, start, length); wordLength=(int16_t)(limit-start); if(wordLength>1) { word=findWord(name+start, wordLength); if(word==NULL) { word=addWord(name+start, wordLength); } countWord(word); } #if 0 /* * if there was a word before this * (with no noise in between), then add the pair of words, too */ if(prevStart!=-1) { wordLength=limit-prevStart; word=findWord(name+prevStart, wordLength); if(word==NULL) { word=addWord(name+prevStart, wordLength); } countWord(word); } #endif /*prevStart=start;*/ start=limit; } } static UBool U_INLINE isWordChar(char c) { return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */ ('J'<=c && c<='R') || ('S'<=c && c<='Z') || ('a'<=c && c<='i') || /* lowercase letters for ISO comments */ ('j'<=c && c<='r') || ('s'<=c && c<='z') || ('0'<=c && c<='9'); } static int16_t skipNoise(char *line, int16_t start, int16_t limit) { /* skip 
anything that is not part of a word in this sense */ while(start<limit && !isWordChar(line[start])) { ++start; } return start; } static int16_t getWord(char *line, int16_t start, int16_t limit) { char c=0; /* initialize to avoid a compiler warning although the code was safe */ /* a unicode character name word consists of A-Z0-9 */ while(start<limit && isWordChar(line[start])) { ++start; } /* include a following space or dash */ if(start<limit && ((c=line[start])==' ' || c=='-')) { ++start; } return start; } /* compressing -------------------------------------------------------------- */ static void compress() { uint32_t i, letterCount; int16_t wordNumber; UErrorCode errorCode; /* sort the words in reverse order by weight */ errorCode=U_ZERO_ERROR; uprv_sortArray(words, wordCount, sizeof(Word), compareWords, NULL, FALSE, &errorCode); /* remove the words that do not save anything */ while(wordCount>0 && words[wordCount-1].weight<1) { --wordCount; } /* count the letters in the token range */ letterCount=0; for(i=LEADBYTE_LIMIT; i<256; ++i) { if(tokens[i]==-1) { ++letterCount; } } if(!beQuiet) { printf("number of letters used in the names: %d\n", (int)letterCount); } /* do we need double-byte tokens? */ if(wordCount+letterCount<=256) { /* no, single-byte tokens are enough */ leadByteCount=0; for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { if(tokens[i]!=-1) { tokens[i]=wordNumber; if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } ++wordNumber; } } tokenCount=i; } else { /* * The tokens that need two token bytes * get their weight reduced by their count * because they save less. 
*/
        /* number of single-byte slots: 256 minus the bytes that are written literally */
        tokenCount=256-letterCount;
        for(i=tokenCount; i<wordCount; ++i) {
            words[i].weight-=words[i].count;
        }

        /* sort these words in reverse order by weight */
        errorCode=U_ZERO_ERROR;
        uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
                        compareWords, NULL, FALSE, &errorCode);

        /* remove the words that do not save anything */
        while(wordCount>0 && words[wordCount-1].weight<1) {
            --wordCount;
        }

        /* how many tokens and lead bytes do we have now? */
        /* first assume the maximum number of lead bytes, corrected below */
        tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
        /*
         * adjust upwards to take into account that
         * double-byte tokens must not
         * use NAME_SEPARATOR_CHAR as a second byte
         */
        tokenCount+=(tokenCount-256+254)/255;

        leadByteCount=(int16_t)(tokenCount>>8);
        if(leadByteCount<LEADBYTE_LIMIT) {
            /* adjust for the real number of lead bytes */
            tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
        } else {
            /* limit the number of lead bytes */
            leadByteCount=LEADBYTE_LIMIT-1;
            tokenCount=LEADBYTE_LIMIT*256;
            /* drop the lowest-weight words that do not fit into the table */
            wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
            /* adjust again to skip double-byte tokens with ';' */
            wordCount-=(tokenCount-256+254)/255;
        }

        /* set token 0 to word 0 */
        tokens[0]=0;
        if(beVerbose) {
            printf("tokens[0x000]: word%8ld \"%.*s\"\n",
                    (long)words[0].weight,
                    words[0].length, words[0].s);
        }
        wordNumber=1;

        /* set the lead byte tokens */
        for(i=1; (int16_t)i<=leadByteCount; ++i) {
            tokens[i]=-2;
        }

        /* set the tokens */
        for(; i<256; ++i) {
            /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
            if(tokens[i]!=-1) {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }

        /* continue above 255 where there are no letters */
        for(; (uint32_t)wordNumber<wordCount; ++i) {
            if((i&0xff)==NAME_SEPARATOR_CHAR) {
                tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
            } else {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }
        tokenCount=i; /* should be already tokenCount={i or i+1} */
    }

    if(!beQuiet) {
        printf("number of lead bytes: %d\n", leadByteCount);
        printf("number of single-byte tokens: %lu\n",
            (unsigned long)256-letterCount-leadByteCount);
        printf("number of tokens: %lu\n", (unsigned long)tokenCount);
    }

    /* the token table is complete: now replace the words in the lines with tokens */
    compressLines();
}

/*
 * Compress all stored lines with the tokens assigned by compress() and
 * collect them into groups of LINES_PER_GROUP consecutive code points.
 * Each group is assembled in groupStore[] and then stored with addGroup().
 * Code points without a stored line get a line length of 0.
 * On return, lineCount is the number of groups rather than of lines
 * ("number of groups" is printed from it; presumably addGroup() counts
 * it up again after the reset below - TODO confirm).
 * Exits with U_BUFFER_OVERFLOW_ERROR if a group does not fit into groupStore[].
 */
static void
compressLines() {
    Line *line=NULL;
    /*
     * inLine:  code point of the line being read
     * outLine: code point of the last line length that was written
     * groupMSB: code point>>GROUP_SHIFT of the current group;
     *           0xffff means that there is no group yet
     */
    uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
             groupMSB=0xffff, lineCount2;
    int16_t groupTop=0;     /* number of bytes used in groupStore[] */

    /* store the groups like lines, with compressed data after raw strings */
    groupBottom=lineTop;
    lineCount2=lineCount;
    lineCount=0;

    /* loop over all lines */
    while(i<lineCount2) {
        line=lines+i++;
        inLine=line->code;

        /* segment the lines to groups of 32 */
        if(inLine>>GROUP_SHIFT!=groupMSB) {
            /* finish the current group with empty lines */
            while((++outLine&GROUP_MASK)!=0) {
                appendLineLength(0);
            }

            /* store the group like a line */
            if(groupTop>0) {
                if(groupTop>GROUP_STORE_SIZE) {
                    fprintf(stderr, "gennames: group store overflow\n");
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                addGroup(groupMSB, groupStore, groupTop);
            }

            /* start the new group */
            lineLengthsTop=0;
            groupTop=0;
            groupMSB=inLine>>GROUP_SHIFT;
            /* one before the first code point of the group, for the pre-increments below */
            outLine=(inLine&~GROUP_MASK)-1;
        }

        /* write empty lines between the previous line in the group and this one */
        while(++outLine<inLine) {
            appendLineLength(0);
        }

        /* write characters and tokens for this line */
        appendLineLength(compressLine(line->s, line->length, &groupTop));
    }

    /* finish and store the last group */
    if(line && groupMSB!=0xffff) {
        /* finish the current group with empty lines */
        while((++outLine&GROUP_MASK)!=0) {
            appendLineLength(0);
        }

        /* store the group like a line */
        if(groupTop>0) {
            if(groupTop>GROUP_STORE_SIZE) {
                fprintf(stderr, "gennames: group store overflow\n");
                exit(U_BUFFER_OVERFLOW_ERROR);
            }
            addGroup(groupMSB, groupStore, groupTop);
        }
    }

    if(!beQuiet) {
        printf("number of groups: %lu\n", (unsigned long)lineCount);
} } static int16_t compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { int16_t start, limit, token, groupTop=*pGroupTop; start=0; do { /* write any "noise" characters */ limit=skipNoise((char *)s, start, length); while(start<limit) { groupStore[groupTop++]=s[start++]; } if(start==length) { break; } /* write a word, as token or directly */ limit=getWord((char *)s, start, length); if(limit-start==1) { groupStore[groupTop++]=s[start++]; } else { token=findToken(s+start, (int16_t)(limit-start)); if(token!=-1) { if(token>0xff) { groupStore[groupTop++]=(uint8_t)(token>>8); } groupStore[groupTop++]=(uint8_t)token; start=limit; } else { while(start<limit) { groupStore[groupTop++]=s[start++]; } } } } while(start<length); length=(int16_t)(groupTop-*pGroupTop); *pGroupTop=groupTop; return length; } static int32_t compareWords(const void *context, const void *word1, const void *word2) { /* reverse sort by word weight */ return ((Word *)word2)->weight-((Word *)word1)->weight; } /* generate output data ----------------------------------------------------- */ static void generateData(const char *dataDir, Options *storeOptions) { UNewDataMemory *pData; UErrorCode errorCode=U_ZERO_ERROR; uint16_t groupWords[3]; uint32_t i, groupTop=lineTop, offset, size, tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; long dataLength; int16_t token; pData=udata_create(dataDir, DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME, &dataInfo, haveCopyright ? 
U_COPYRIGHT_STRING : NULL,
                       &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
        exit(errorCode);
    }

    /* first, see how much space we need, and prepare the token strings */
    /*
     * Replace each word index in tokens[] with the offset of its token string
     * relative to groupTop, which is where the token strings are written from
     * further below. (addToken() is expected to append the string to
     * stringStore[] and to return its position there - it is not defined
     * in this part of the file, TODO confirm.)
     */
    for(i=0; i<tokenCount; ++i) {
        token=tokens[i];
        if(token!=-1 && token!=-2) {
            tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
        }
    }

    /*
     * Required padding for data swapping:
     * The token table undergoes a permutation during data swapping when the
     * input and output charsets are different.
     * The token table cannot grow during swapping, so we need to make sure that
     * the table is long enough for successful in-place permutation.
     *
     * We simply round up tokenCount to the next multiple of 256 to account for
     * all possible permutations.
     *
     * An optimization is possible if we only ever swap between ASCII and EBCDIC:
     *
     * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
     * and will be swapped between ASCII and EBCDIC between
     * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
     * This should be the only -1 entry in tokens[256..511] on which the data
     * swapper bases its trail byte permutation map (trailMap[]).
     *
     * It would be sufficient to increase tokenCount so that its lower 8 bits
     * are at least 0x5e+1 to make room for swapping between the two semicolons.
     * For values higher than 0x5e, the trail byte permutation map (trailMap[])
     * should always be an identity map, where we do not need additional room.
     */
    i=tokenCount;
    tokenCount=(tokenCount+0xff)&~0xff;
    if(!beQuiet && i<tokenCount) {
        printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
    }
    for(; i<tokenCount; ++i) {
        if((i&0xff)==NAME_SEPARATOR_CHAR) {
            tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
        } else {
            tokens[i]=0; /* unused token for padding */
        }
    }

    /*
     * Calculate the total size in bytes of the data including:
     * - the offset to the token strings, uint32_t (4)
     * - the offset to the group table, uint32_t (4)
     * - the offset to the group strings, uint32_t (4)
     * - the offset to the algorithmic names, uint32_t (4)
     *
     * - the number of tokens, uint16_t (2)
     * - the token table, uint16_t[tokenCount] (2*tokenCount)
     *
     * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
     *
     * - the number of groups, uint16_t (2)
     * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[groupCount] (6*groupCount)
     *
     * - the group strings (groupTop-groupBottom), 4-padded
     *
     * - the size of the data for the algorithmic names
     */
    tokenStringOffset=4+4+4+4+2+2*tokenCount;
    groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
    groupStringOffset=groupsOffset+2+6*lineCount;
    algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;

    /* with a NULL pData, generateAlgorithmicData() writes nothing; its return value is used as the size */
    offset=generateAlgorithmicData(NULL, storeOptions);
    size=algNamesOffset+offset;

    if(!beQuiet) {
        printf("size of the Unicode Names data:\n"
               "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
                (unsigned long)size, (unsigned long)(lineTop-groupTop),
                (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
    }

    /* write the data to the file */
    /* offsets */
    udata_write32(pData, tokenStringOffset);
    udata_write32(pData, groupsOffset);
    udata_write32(pData, groupStringOffset);
    udata_write32(pData, algNamesOffset);

    /* token table */
    udata_write16(pData, (uint16_t)tokenCount);
    udata_writeBlock(pData, tokens, 2*tokenCount);

    /* token strings */
    udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
    if((lineTop-groupTop)&1) {
        /* 2-padding */
        udata_writePadding(pData, 1);
    }

    /* group table */
    /* after compressLines(), lines[0..lineCount[ describe the groups */
    udata_write16(pData, (uint16_t)lineCount);
    for(i=0; i<lineCount; ++i) {
        /* groupMSB */
        groupWords[0]=(uint16_t)lines[i].code;

        /* offset */
        /* relative to the start of the group strings, split into two 16-bit halves */
        offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
        groupWords[1]=(uint16_t)(offset>>16);
        groupWords[2]=(uint16_t)(offset);
        udata_writeBlock(pData, groupWords, 6);
    }

    /* group strings */
    udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);

    /* 4-align the algorithmic names data */
    udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));

    generateAlgorithmicData(pData, storeOptions);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* cross-check the bytes actually written against the size calculated above */
    if(dataLength!=(long)size) {
        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
dataLength, (unsigned long)size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}

/* the structure for algorithmic names needs to be 4-aligned */
typedef struct AlgorithmicRange {
    uint32_t rangeStart, rangeEnd;
    uint8_t algorithmType, algorithmVariant;
    uint16_t rangeSize;
} AlgorithmicRange;

static uint32_t
generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
    static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
#   define PREFIX_LENGTH 23
#   define PREFIX_LENGTH_4 24
    uint32_t countAlgRanges;

    static AlgorithmicRange cjkExtA={
        0x3400, 0x4db5,
        0, 4,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };
    static AlgorithmicRange cjk={
        0x4e00, 0x9fa5,
        0, 4,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };
    static AlgorithmicRange cjkExtB={
        0x20000, 0x2a6d6,
        0, 5,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };
    static AlgorithmicRange cjkExtC={
        0x2a700, 0x2b734,
        0, 5,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };

    static char jamo[]=
        "HANGUL SYLLABLE \0"

        "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
"S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" "YU\0EU\0YI\0I\0" "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" "S\0SS\0NG\0J\0C\0K\0T\0P\0H" ; static AlgorithmicRange hangul={ 0xac00, 0xd7a3, 1, 3, sizeof(AlgorithmicRange)+6+sizeof(jamo) }; /* modulo factors, maximum 8 */ /* 3 factors: 19, 21, 28, most-to-least-significant */ static uint16_t hangulFactors[3]={ 19, 21, 28 }; uint32_t size; size=0; if(ucdVersion>=UNI_5_2) { /* Unicode 5.2 and up has a longer CJK Unihan range than before */ cjk.rangeEnd=0x9FCB; } else if(ucdVersion>=UNI_5_1) { /* Unicode 5.1 and up has a longer CJK Unihan range than before */ cjk.rangeEnd=0x9FC3; } else if(ucdVersion>=UNI_4_1) { /* Unicode 4.1 and up has a longer CJK Unihan range than before */ cjk.rangeEnd=0x9FBB; } /* number of ranges of algorithmic names */ if(!storeOptions->storeNames) { countAlgRanges=0; } else if(ucdVersion>=UNI_5_2) { /* Unicode 5.2 and up has 5 ranges including CJK Extension C */ countAlgRanges=5; } else if(ucdVersion>=UNI_3_1) { /* Unicode 3.1 and up has 4 ranges including CJK Extension B */ countAlgRanges=4; } else if(ucdVersion>=UNI_3_0) { /* Unicode 3.0 has 3 ranges including CJK Extension A */ countAlgRanges=3; } else { /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */ countAlgRanges=2; } if(pData!=NULL) { udata_write32(pData, countAlgRanges); } else { size+=4; } if(countAlgRanges==0) { return size; } /* * each range: * uint32_t rangeStart * uint32_t rangeEnd * uint8_t algorithmType * uint8_t algorithmVariant * uint16_t size of range data * uint8_t[size] data */ /* range 0: cjk extension a */ if(countAlgRanges>=3) { if(pData!=NULL) { udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange)); udata_writeString(pData, prefix, PREFIX_LENGTH); if(PREFIX_LENGTH<PREFIX_LENGTH_4) { udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); } } else { size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; } } /* range 
1: cjk */ if(pData!=NULL) { udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange)); udata_writeString(pData, prefix, PREFIX_LENGTH); if(PREFIX_LENGTH<PREFIX_LENGTH_4) { udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); } } else { size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; } /* range 2: hangul syllables */ if(pData!=NULL) { udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange)); udata_writeBlock(pData, hangulFactors, 6); udata_writeString(pData, jamo, sizeof(jamo)); } else { size+=sizeof(AlgorithmicRange)+6+sizeof(jamo); } /* range 3: cjk extension b */ if(countAlgRanges>=4) { if(pData!=NULL) { udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange)); udata_writeString(pData, prefix, PREFIX_LENGTH); if(PREFIX_LENGTH<PREFIX_LENGTH_4) { udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); } } else { size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; } } /* range 4: cjk extension c */ if(countAlgRanges>=5) { if(pData!=NULL) { udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange)); udata_writeString(pData, prefix, PREFIX_LENGTH); if(PREFIX_LENGTH<PREFIX_LENGTH_4) { udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); } } else { size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; } } return size; } /* helpers ------------------------------------------------------------------ */ static int16_t findToken(uint8_t *s, int16_t length) { int16_t i, token; for(i=0; i<(int16_t)tokenCount; ++i) { token=tokens[i]; if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) { return i; } } return -1; } static Word * findWord(char *s, int16_t length) { uint32_t i; for(i=0; i<wordCount; ++i) { if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) { return words+i; } } return NULL; } static Word * addWord(char *s, int16_t length) { uint8_t *stringStart; Word *word; if(wordCount==MAX_WORD_COUNT) { fprintf(stderr, "gennames: too many words\n"); exit(U_BUFFER_OVERFLOW_ERROR); } stringStart=allocWord(length); 
uprv_memcpy(stringStart, s, length); word=words+wordCount; /* * Initialize the weight with the costs for this token: * a zero-terminated string and a 16-bit offset. */ word->weight=-(length+1+2); word->count=0; word->length=length; word->s=stringStart; ++wordCount; return word; } static void countWord(Word *word) { /* add to the weight the savings: the length of the word minus 1 byte for the token */ word->weight+=word->length-1; ++word->count; } static void addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) { uint8_t *stringStart; Line *line; int16_t i, length; if(lineCount==MAX_LINE_COUNT) { fprintf(stderr, "gennames: too many lines\n"); exit(U_BUFFER_OVERFLOW_ERROR); } /* find the last non-empty name */ while(count>0 && lengths[count-1]==0) { --count; } if(count==0) { return; /* should not occur: caller should not have called */ } /* there will be (count-1) separator characters */ i=count; length=count-1; /* add lengths of strings */ while(i>0) { length+=lengths[--i]; } /* allocate line memory */ stringStart=allocLine(length); /* copy all strings into the line memory */ length=0; /* number of chars copied so far */ for(i=0; i<count; ++i) { if(i>0) { stringStart[length++]=NAME_SEPARATOR_CHAR; } if(lengths[i]>0) { uprv_memcpy(stringStart+length, names[i], lengths[i]); length+=lengths[i]; } } line=lines+lineCount; line->code=code; line->length=length; line->s=stringStart; ++lineCount; /* prevent a character value that is actually in a name from becoming a token */ while(length>0) { tokens[stringStart[--length]]=-1; } } static void addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { uint8_t *stringStart; Line *line; if(lineCount==MAX_LINE_COUNT) { fprintf(stderr, "gennames: too many groups\n"); exit(U_BUFFER_OVERFLOW_ERROR); } /* store the line lengths first, then the strings */ lineLengthsTop=(lineLengthsTop+1)/2; stringStart=allocLine(lineLengthsTop+length); uprv_memcpy(stringStart, lineLengths, lineLengthsTop); 
uprv_memcpy(stringStart+lineLengthsTop, strings, length); line=lines+lineCount; line->code=groupMSB; line->length=length; line->s=stringStart; ++lineCount; } static uint32_t addToken(uint8_t *s, int16_t length) { uint8_t *stringStart; stringStart=allocLine(length+1); uprv_memcpy(stringStart, s, length); stringStart[length]=0; return (uint32_t)(stringStart - stringStore); } static void appendLineLength(int16_t length) { if(length>=76) { fprintf(stderr, "gennames: compressed line too long\n"); exit(U_BUFFER_OVERFLOW_ERROR); } if(length>=12) { length-=12; appendLineLengthNibble((uint8_t)((length>>4)|12)); } appendLineLengthNibble((uint8_t)length); } static void appendLineLengthNibble(uint8_t nibble) { if((lineLengthsTop&1)==0) { lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4); } else { lineLengths[lineLengthsTop/2]|=nibble&0xf; } ++lineLengthsTop; } static uint8_t * allocLine(int32_t length) { uint32_t top=lineTop+length; uint8_t *p; if(top>wordBottom) { fprintf(stderr, "gennames: out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } p=stringStore+lineTop; lineTop=top; return p; } static uint8_t * allocWord(uint32_t length) { uint32_t bottom=wordBottom-length; if(lineTop>bottom) { fprintf(stderr, "gennames: out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } wordBottom=bottom; return stringStore+bottom; } /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */