// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include <stdio.h> #include <string> #include <stdlib.h> #include <errno.h> #include <string.h> #include <iostream> #include <fstream> // We only use U8_* macros, which are entirely inline. #include "unicode/utf8.h" // This contains a codepage and ISO 14882:1998 illegality table. // Use "make gen-table" to rebuild it. #include "cptbl.h" /** * What is this? * * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code * in utf-8 into something consumable by certain compilers (Solaris, xlC) * which aren't quite standards compliant. * * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN' * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc. * (some compilers do not support the u8 prefix correctly.) * - if the system is EBCDIC-based, that is used to correct the input characters. * * Usage: * escapesrc infile.cpp outfile.cpp * Normally this is invoked by the build stage, with a rule such as: * * _%.cpp: $(srcdir)/%.cpp * @$(BINDIR)/escapesrc$(EXEEXT) $< $@ * %.o: _%.cpp * $(COMPILE.cc) ... $@ $< * * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp * from being itself escaped. */ static const char kSPACE = 0x20, kTAB = 0x09, kLF = 0x0A, kCR = 0x0D; // For convenience # define cp1047_to_8859(c) cp1047_8859_1[c] // Our app's name std::string prog; /** * Give the usual 1-line documentation and exit */ void usage() { fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); } /** * Delete the output file (if any) * We want to delete even if we didn't generate, because it might be stale. */ int cleanup(const std::string &outfile) { const char *outstr = outfile.c_str(); if(outstr && *outstr) { int rc = std::remove(outstr); if(rc == 0) { fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); return 0; } else { if( errno == ENOENT ) { return 0; // File did not exist - no error. } else { perror("std::remove"); return 1; } } } return 0; } /** * Skip across any known whitespace. * @param p startpoint * @param e limit * @return first non-whitespace char */ inline const char *skipws(const char *p, const char *e) { for(;p<e;p++) { switch(*p) { case kSPACE: case kTAB: case kLF: case kCR: break; default: return p; // non ws } } return p; } /** * Append a byte, hex encoded * @param outstr sstring to append to * @param byte the byte to append */ void appendByte(std::string &outstr, uint8_t byte) { char tmp2[5]; sprintf(tmp2, "\\x%02X", 0xFF & (int)(byte)); outstr += tmp2; } /** * Append the bytes from 'linestr' into outstr, with escaping * @param outstr the output buffer * @param linestr the input buffer * @param pos in/out: the current char under consideration * @param chars the number of chars to consider * @return true on failure */ bool appendUtf8(std::string &outstr, const std::string &linestr, size_t &pos, size_t chars) { char tmp[9]; for(size_t i=0;i<chars;i++) { tmp[i] = linestr[++pos]; } tmp[chars] = 0; unsigned int c; sscanf(tmp, "%X", &c); UChar32 ch = c & 0x1FFFFF; // now to append \\x%% etc uint8_t bytesNeeded = U8_LENGTH(ch); if(bytesNeeded == 0) { fprintf(stderr, "Illegal code point U+%X\n", ch); return true; } uint8_t bytes[4]; uint8_t *s = bytes; size_t i = 0; U8_APPEND_UNSAFE(s, i, ch); for(size_t t = 0; t<i; t++) { appendByte(outstr, s[t]); } return false; } /** * Fixup u8"x" * @param linestr string to mutate. Already escaped into \u format. * @param origpos beginning, points to 'u8"' * @param pos end, points to " * @return false for no-problem, true for failure! */ bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { size_t pos = origpos + 3; std::string outstr; outstr += '\"'; // local encoding for(;pos<endpos;pos++) { char c = linestr[pos]; if(c == '\\') { char c2 = linestr[++pos]; switch(c2) { case '\'': case '"': #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) c2 = cp1047_to_8859(c2); #endif appendByte(outstr, c2); break; case 'u': appendUtf8(outstr, linestr, pos, 4); break; case 'U': appendUtf8(outstr, linestr, pos, 8); break; } } else { #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) c = cp1047_to_8859(c); #endif appendByte(outstr, c); } } outstr += ('\"'); linestr.replace(origpos, (endpos-origpos+1), outstr); return false; // OK } /** * fix the u"x"/u'x'/u8"x" string at the position * u8'x' is not supported, sorry. * @param linestr the input string * @param pos the position * @return false = no err, true = had err */ bool fixAt(std::string &linestr, size_t pos) { size_t origpos = pos; if(linestr[pos] != 'u') { fprintf(stderr, "Not a 'u'?"); return true; } pos++; // past 'u' bool utf8 = false; if(linestr[pos] == '8') { // u8" utf8 = true; pos++; } char quote = linestr[pos]; if(quote != '\'' && quote != '\"') { fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); return true; } if(quote == '\'' && utf8) { fprintf(stderr, "Cannot do u8'...'\n"); return true; } pos ++; //printf("u%c…%c\n", quote, quote); for(; pos < linestr.size(); pos++) { if(linestr[pos] == quote) { if(utf8) { return fixu8(linestr, origpos, pos); // fix u8"..." } else { return false; // end of quote } } if(linestr[pos] == '\\') { pos++; if(linestr[pos] == quote) continue; // quoted quote if(linestr[pos] == 'u') continue; // for now ... unicode escape if(linestr[pos] == '\\') continue; // some other escape… ignore } else { size_t old_pos = pos; int32_t i = pos; #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) // mogrify 1-4 bytes from 1047 'back' to utf-8 char old_byte = linestr[pos]; linestr[pos] = cp1047_to_8859(linestr[pos]); // how many more? int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { linestr[pos2] = cp1047_to_8859(linestr[pos2]); if(linestr[pos2] == 0x0A) { linestr[pos2] = 0x85; // NL is ambiguous here } } #endif // Proceed to decode utf-8 const uint8_t *s = (const uint8_t*) (linestr.c_str()); int32_t length = linestr.size(); UChar32 c; if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) linestr[pos] = old_byte; // put it back #endif continue; // single code point not previously legal for \u escaping } // otherwise, convert it to \u / \U { U8_NEXT(s, i, length, c); } if(c<0) { fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); return true; } size_t seqLen = (i-pos); //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); char newSeq[20]; if( c <= 0xFFFF) { sprintf(newSeq, "\\u%04X", c); } else { sprintf(newSeq, "\\U%08X", c); } linestr.replace(pos, seqLen, newSeq); pos += strlen(newSeq) - 1; } } return false; } /** * Fixup an entire line * false = no err * true = had err * @param no the line number (not used) * @param linestr the string to fix * @return true if any err, else false */ bool fixLine(int /*no*/, std::string &linestr) { const char *line = linestr.c_str(); size_t len = linestr.size(); // no u' in the line? if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { return false; // Nothing to do. No u' or u" detected } // start from the end and find all u" cases size_t pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { //printf("found doublequote at %d\n", pos); if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } // reset and find all u' cases pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { //printf("found singlequote at %d\n", pos); if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } // reset and find all u8" cases pos = len = linestr.size(); while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { if(fixAt(linestr, pos)) return true; if(pos == 0) break; pos--; } //fprintf(stderr, "%d - fixed\n", no); return false; } /** * Convert a whole file * @param infile * @param outfile * @return 1 on err, 0 otherwise */ int convert(const std::string &infile, const std::string &outfile) { fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); std::ifstream inf; inf.open(infile.c_str(), std::ios::in); if(!inf.is_open()) { fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); cleanup(outfile); return 1; } std::ofstream outf; outf.open(outfile.c_str(), std::ios::out); if(!outf.is_open()) { fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); return 1; } // TODO: any platform variations of #line? outf << "#line 1 \"" << infile << "\"" << '\n'; int no = 0; std::string linestr; while( getline( inf, linestr)) { no++; if(fixLine(no, linestr)) { outf.close(); fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); cleanup(outfile); return 1; } outf << linestr << '\n'; } return 0; } /** * Main function */ int main(int argc, const char *argv[]) { prog = argv[0]; if(argc != 3) { usage(); return 1; } std::string infile = argv[1]; std::string outfile = argv[2]; return convert(infile, outfile); }