/* ******************************************************************************** * * Copyright (C) 1998-2007, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************** * * * makeconv.c: * tool creating a binary (compressed) representation of the conversion mapping * table (IBM NLTC ucmap format). * * 05/04/2000 helena Added fallback mapping into the picture... * 06/29/2000 helena Major rewrite of the callback APIs. */ #include <stdio.h> #include "unicode/putil.h" #include "unicode/ucnv_err.h" #include "ucnv_bld.h" #include "ucnv_imp.h" #include "ucnv_cnv.h" #include "cstring.h" #include "cmemory.h" #include "uinvchar.h" #include "filestrm.h" #include "toolutil.h" #include "uoptions.h" #include "unicode/udata.h" #include "unewdata.h" #include "uparse.h" #include "ucm.h" #include "makeconv.h" #include "genmbcs.h" #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) #define DEBUG 0 typedef struct ConvData { UCMFile *ucm; NewConverter *cnvData, *extData; UConverterSharedData sharedData; UConverterStaticData staticData; } ConvData; static void initConvData(ConvData *data) { uprv_memset(data, 0, sizeof(ConvData)); data->sharedData.structSize=sizeof(UConverterSharedData); data->staticData.structSize=sizeof(UConverterStaticData); data->sharedData.staticData=&data->staticData; } static void cleanupConvData(ConvData *data) { if(data!=NULL) { if(data->cnvData!=NULL) { data->cnvData->close(data->cnvData); data->cnvData=NULL; } if(data->extData!=NULL) { data->extData->close(data->extData); data->extData=NULL; } ucm_close(data->ucm); data->ucm=NULL; } } /* * from ucnvstat.c - static prototypes of data-based converters */ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]; /* * Global - verbosity */ UBool VERBOSE = FALSE; UBool SMALL = FALSE; static void createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); /* * Set up the UNewData and write the converter.. */ static void writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); UBool haveCopyright=TRUE; static UDataInfo dataInfo={ sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, sizeof(UChar), 0, {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */ {6, 2, 0, 0}, /* formatVersion */ {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ }; static void writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) { UNewDataMemory *mem = NULL; uint32_t sz2; uint32_t size = 0; int32_t tableType; if(U_FAILURE(*status)) { return; } tableType=TABLE_NONE; if(data->cnvData!=NULL) { tableType|=TABLE_BASE; } if(data->extData!=NULL) { tableType|=TABLE_EXT; } mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status); if(U_FAILURE(*status)) { fprintf(stderr, "Couldn't create the udata %s.%s: %s\n", cnvName, "cnv", u_errorName(*status)); return; } if(VERBOSE) { printf("- Opened udata %s.%s\n", cnvName, "cnv"); } /* all read only, clean, platform independent data. Mmmm. :) */ udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ /* Now, write the table */ if(tableType&TABLE_BASE) { size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); } if(tableType&TABLE_EXT) { size += data->extData->write(data->extData, &data->staticData, mem, tableType); } sz2 = udata_finish(mem, status); if(size != sz2) { fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size); *status=U_INTERNAL_PROGRAM_ERROR; } if(VERBOSE) { printf("- Wrote %u bytes to the udata.\n", (int)sz2); } } enum { OPT_HELP_H, OPT_HELP_QUESTION_MARK, OPT_COPYRIGHT, OPT_VERSION, OPT_DESTDIR, OPT_VERBOSE, OPT_SMALL, OPT_COUNT }; static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_COPYRIGHT, UOPTION_VERSION, UOPTION_DESTDIR, UOPTION_VERBOSE, { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } }; int main(int argc, char* argv[]) { ConvData data; UErrorCode err = U_ZERO_ERROR, localError; char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; const char* destdir, *arg; size_t destdirlen; char* dot = NULL, *outBasename; char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; char cnvNameWithPkg[UCNV_MAX_FULL_FILE_NAME_LENGTH]; UVersionInfo icuVersion; UBool printFilename; err = U_ZERO_ERROR; U_MAIN_INIT_ARGS(argc, argv); /* Set up the ICU version number */ u_getVersion(icuVersion); uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); /* preset then read command line options */ options[OPT_DESTDIR].value=u_getDataDirectory(); argc=u_parseArgs(argc, argv, LENGTHOF(options), options); /* error handling, printing usage message */ if(argc<0) { fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); } else if(argc<2) { argc=-1; } if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { FILE *stdfile=argc<0 ? stderr : stdout; fprintf(stdfile, "usage: %s [-options] files...\n" "\tread .ucm codepage mapping files and write .cnv files\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" "\t-c or --copyright include a copyright notice\n" "\t-d or --destdir destination directory, followed by the path\n" "\t-v or --verbose Turn on verbose output\n", argv[0]); fprintf(stdfile, "\t --small Generate smaller .cnv files. They will be\n" "\t significantly smaller but may not be compatible with\n" "\t older versions of ICU and will require heap memory\n" "\t allocation when loaded.\n"); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } if(options[OPT_VERSION].doesOccur) { printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n", dataInfo.formatVersion[0], dataInfo.formatVersion[1]); printf("%s\n", U_COPYRIGHT_STRING); exit(0); } /* get the options values */ haveCopyright = options[OPT_COPYRIGHT].doesOccur; destdir = options[OPT_DESTDIR].value; VERBOSE = options[OPT_VERBOSE].doesOccur; SMALL = options[OPT_SMALL].doesOccur; if (destdir != NULL && *destdir != 0) { uprv_strcpy(outFileName, destdir); destdirlen = uprv_strlen(destdir); outBasename = outFileName + destdirlen; if (*(outBasename - 1) != U_FILE_SEP_CHAR) { *outBasename++ = U_FILE_SEP_CHAR; ++destdirlen; } } else { destdirlen = 0; outBasename = outFileName; } #if DEBUG { int i; printf("makeconv: processing %d files...\n", argc - 1); for(i=1; i<argc; ++i) { printf("%s ", argv[i]); } printf("\n"); fflush(stdout); } #endif err = U_ZERO_ERROR; printFilename = (UBool) (argc > 2 || VERBOSE); for (++argv; --argc; ++argv) { arg = getLongPathname(*argv); /*produces the right destination path for display*/ if (destdirlen != 0) { const char *basename; /* find the last file sepator */ basename = findBasename(arg); uprv_strcpy(outBasename, basename); } else { uprv_strcpy(outFileName, arg); } /*removes the extension if any is found*/ dot = uprv_strrchr(outBasename, '.'); if (dot) { *dot = '\0'; } /* the basename without extension is the converter name */ uprv_strcpy(cnvName, outBasename); /*Adds the target extension*/ uprv_strcat(outBasename, CONVERTER_FILE_EXTENSION); #if DEBUG printf("makeconv: processing %s ...\n", arg); fflush(stdout); #endif localError = U_ZERO_ERROR; initConvData(&data); createConverter(&data, arg, &localError); if (U_FAILURE(localError)) { /* if an error is found, print out an error msg and keep going */ fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg, u_errorName(localError)); if(U_SUCCESS(err)) { err = localError; } } else { /* Make the static data name equal to the file name */ if( /*VERBOSE && */ uprv_stricmp(cnvName,data.staticData.name)) { fprintf(stderr, "Warning: %s%s claims to be '%s'\n", cnvName, CONVERTER_FILE_EXTENSION, data.staticData.name); } uprv_strcpy((char*)data.staticData.name, cnvName); if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { fprintf(stderr, "Error: A converter name must contain only invariant characters.\n" "%s is not a valid converter name.\n", data.staticData.name); if(U_SUCCESS(err)) { err = U_INVALID_TABLE_FORMAT; } } uprv_strcpy(cnvNameWithPkg, cnvName); localError = U_ZERO_ERROR; writeConverterData(&data, cnvNameWithPkg, destdir, &localError); if(U_FAILURE(localError)) { /* if an error is found, print out an error msg and keep going*/ fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName, arg, u_errorName(localError)); if(U_SUCCESS(err)) { err = localError; } } else if (printFilename) { puts(outBasename); } } fflush(stdout); fflush(stderr); cleanupConvData(&data); } return err; } static void getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) { if( (name[0]=='i' || name[0]=='I') && (name[1]=='b' || name[1]=='B') && (name[2]=='m' || name[2]=='M') ) { name+=3; if(*name=='-') { ++name; } *pPlatform=UCNV_IBM; *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10); } else { *pPlatform=UCNV_UNKNOWN; *pCCSID=0; } } static void readHeader(ConvData *data, FileStream* convFile, const char* converterName, UErrorCode *pErrorCode) { char line[200]; char *s, *key, *value; const UConverterStaticData *prototype; UConverterStaticData *staticData; if(U_FAILURE(*pErrorCode)) { return; } staticData=&data->staticData; staticData->platform=UCNV_IBM; staticData->subCharLen=0; while(T_FileStream_readLine(convFile, line, sizeof(line))) { /* basic parsing and handling of state-related items */ if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { continue; } /* stop at the beginning of the mapping section */ if(uprv_strcmp(line, "CHARMAP")==0) { break; } /* collect the information from the header field, ignore unknown keys */ if(uprv_strcmp(key, "code_set_name")==0) { if(*value!=0) { uprv_strcpy((char *)staticData->name, value); getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); } } else if(uprv_strcmp(key, "subchar")==0) { uint8_t bytes[UCNV_EXT_MAX_BYTES]; int8_t length; s=value; length=ucm_parseBytes(bytes, line, (const char **)&s); if(1<=length && length<=4 && *s==0) { staticData->subCharLen=length; uprv_memcpy(staticData->subChar, bytes, length); } else { fprintf(stderr, "error: illegal <subchar> %s\n", value); *pErrorCode=U_INVALID_TABLE_FORMAT; return; } } else if(uprv_strcmp(key, "subchar1")==0) { uint8_t bytes[UCNV_EXT_MAX_BYTES]; s=value; if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) { staticData->subChar1=bytes[0]; } else { fprintf(stderr, "error: illegal <subchar1> %s\n", value); *pErrorCode=U_INVALID_TABLE_FORMAT; return; } } } /* copy values from the UCMFile to the static data */ staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength; staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength; staticData->conversionType=data->ucm->states.conversionType; if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; return; } /* * Now that we know the type, copy any 'default' values from the table. * We need not check the type any further because the parser only * recognizes what we have prototypes for. * * For delta (extension-only) tables, copy values from the base file * instead, see createConverter(). */ if(data->ucm->baseName[0]==0) { prototype=ucnv_converterStaticData[staticData->conversionType]; if(prototype!=NULL) { if(staticData->name[0]==0) { uprv_strcpy((char *)staticData->name, prototype->name); } if(staticData->codepage==0) { staticData->codepage=prototype->codepage; } if(staticData->platform==0) { staticData->platform=prototype->platform; } if(staticData->minBytesPerChar==0) { staticData->minBytesPerChar=prototype->minBytesPerChar; } if(staticData->maxBytesPerChar==0) { staticData->maxBytesPerChar=prototype->maxBytesPerChar; } if(staticData->subCharLen==0) { staticData->subCharLen=prototype->subCharLen; if(prototype->subCharLen>0) { uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); } } } } if(data->ucm->states.outputType<0) { data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1; } if( staticData->subChar1!=0 && (staticData->minBytesPerChar>1 || (staticData->conversionType!=UCNV_MBCS && staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) ) { fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } } /* return TRUE if a base table was read, FALSE for an extension table */ static UBool readFile(ConvData *data, const char* converterName, UErrorCode *pErrorCode) { char line[200]; char *end; FileStream *convFile; UCMStates *baseStates; UBool dataIsBase; if(U_FAILURE(*pErrorCode)) { return FALSE; } data->ucm=ucm_open(); convFile=T_FileStream_open(converterName, "r"); if(convFile==NULL) { *pErrorCode=U_FILE_ACCESS_ERROR; return FALSE; } readHeader(data, convFile, converterName, pErrorCode); if(U_FAILURE(*pErrorCode)) { return FALSE; } if(data->ucm->baseName[0]==0) { dataIsBase=TRUE; baseStates=&data->ucm->states; ucm_processStates(baseStates); } else { dataIsBase=FALSE; baseStates=NULL; } /* read the base table */ ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); if(U_FAILURE(*pErrorCode)) { return FALSE; } /* read an extension table if there is one */ while(T_FileStream_readLine(convFile, line, sizeof(line))) { end=uprv_strchr(line, 0); while(line<end && (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) { --end; } *end=0; if(line[0]=='#' || u_skipWhitespace(line)==end) { continue; /* ignore empty and comment lines */ } if(0==uprv_strcmp(line, "CHARMAP")) { /* read the extension table */ ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode); } else { fprintf(stderr, "unexpected text after the base mapping table\n"); } break; } T_FileStream_close(convFile); if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } return dataIsBase; } static void createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { ConvData baseData; UBool dataIsBase; UConverterStaticData *staticData; UCMStates *states, *baseStates; if(U_FAILURE(*pErrorCode)) { return; } initConvData(data); dataIsBase=readFile(data, converterName, pErrorCode); if(U_FAILURE(*pErrorCode)) { return; } staticData=&data->staticData; states=&data->ucm->states; if(dataIsBase) { /* * Build a normal .cnv file with a base table * and an optional extension table. */ data->cnvData=MBCSOpen(data->ucm); if(data->cnvData==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; } else if(!data->cnvData->isValid(data->cnvData, staticData->subChar, staticData->subCharLen) ) { fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } else if(staticData->subChar1!=0 && !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1) ) { fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } else if( data->ucm->ext->mappingsLength>0 && !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ) { *pErrorCode=U_INVALID_TABLE_FORMAT; } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) { /* sort the table so that it can be turned into UTF-8-friendly data */ ucm_sortTable(data->ucm->base); } if(U_SUCCESS(*pErrorCode)) { if( /* add the base table after ucm_checkBaseExt()! */ !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) ) { *pErrorCode=U_INVALID_TABLE_FORMAT; } else { /* * addTable() may have requested moving more mappings to the extension table * if they fit into the base toUnicode table but not into the * base fromUnicode table. * (Especially for UTF-8-friendly fromUnicode tables.) * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them * to be excluded from the extension toUnicode data. * See MBCSOkForBaseFromUnicode() for which mappings do not fit into * the base fromUnicode table. */ ucm_moveMappings(data->ucm->base, data->ucm->ext); ucm_sortTable(data->ucm->ext); if(data->ucm->ext->mappingsLength>0) { /* prepare the extension table, if there is one */ data->extData=CnvExtOpen(data->ucm); if(data->extData==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; } else if( !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) ) { *pErrorCode=U_INVALID_TABLE_FORMAT; } } } } } else { /* Build an extension-only .cnv file. */ char baseFilename[500]; char *basename; initConvData(&baseData); /* assemble a path/filename for data->ucm->baseName */ uprv_strcpy(baseFilename, converterName); basename=(char *)findBasename(baseFilename); uprv_strcpy(basename, data->ucm->baseName); uprv_strcat(basename, ".ucm"); /* read the base table */ dataIsBase=readFile(&baseData, baseFilename, pErrorCode); if(U_FAILURE(*pErrorCode)) { return; } else if(!dataIsBase) { fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename); *pErrorCode=U_INVALID_TABLE_FORMAT; } else { /* prepare the extension table */ data->extData=CnvExtOpen(data->ucm); if(data->extData==NULL) { *pErrorCode=U_MEMORY_ALLOCATION_ERROR; } else { /* fill in gaps in extension file header fields */ UCMapping *m, *mLimit; uint8_t fallbackFlags; baseStates=&baseData.ucm->states; if(states->conversionType==UCNV_DBCS) { staticData->minBytesPerChar=(int8_t)(states->minCharLength=2); } else if(states->minCharLength==0) { staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength); } if(states->maxCharLength<states->minCharLength) { staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength); } if(staticData->subCharLen==0) { uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4); staticData->subCharLen=baseData.staticData.subCharLen; } /* * do not copy subChar1 - * only use what is explicitly specified * because it cannot be unset in the extension file header */ /* get the fallback flags */ fallbackFlags=0; for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; m<mLimit && fallbackFlags!=3; ++m ) { if(m->f==1) { fallbackFlags|=1; } else if(m->f==3) { fallbackFlags|=2; } } if(fallbackFlags&1) { staticData->hasFromUnicodeFallback=TRUE; } if(fallbackFlags&2) { staticData->hasToUnicodeFallback=TRUE; } if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) { fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } else if(1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } else if( !ucm_checkValidity(data->ucm->ext, baseStates) || !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ) { *pErrorCode=U_INVALID_TABLE_FORMAT; } else { if(states->maxCharLength>1) { /* * When building a normal .cnv file with a base table * for an MBCS (not SBCS) table with explicit precision flags, * the MBCSAddTable() function marks some mappings for moving * to the extension table. * They fit into the base toUnicode table but not into the * base fromUnicode table. * (Note: We do have explicit precision flags because they are * required for extension table generation, and * ucm_checkBaseExt() verified it.) * * We do not call MBCSAddTable() here (we probably could) * so we need to do the analysis before building the extension table. * We assume that MBCSAddTable() will build a UTF-8-friendly table. * Redundant mappings in the extension table are ok except they cost some size. * * Do this after ucm_checkBaseExt(). */ const MBCSData *mbcsData=MBCSGetDummy(); int32_t needsMove=0; for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; m<mLimit; ++m ) { if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { m->f|=MBCS_FROM_U_EXT_FLAG; m->moveFlag=UCM_MOVE_TO_EXT; ++needsMove; } } if(needsMove!=0) { ucm_moveMappings(baseData.ucm->base, data->ucm->ext); ucm_sortTable(data->ucm->ext); } } if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) { *pErrorCode=U_INVALID_TABLE_FORMAT; } } } } cleanupConvData(&baseData); } } /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */