#!/usr/bin/python # coding=utf-8 # # Copyright 2008 The RE2 Authors. All Rights Reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. # See unicode_casefold.h for description of case folding tables. """Generate C++ table for Unicode case folding.""" import unicode, sys _header = """ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. // make_unicode_casefold.py >unicode_casefold.cc #include "re2/unicode_casefold.h" namespace re2 { """ _trailer = """ } // namespace re2 """ def _Delta(a, b): """Compute the delta for b - a. Even/odd and odd/even are handled specially, as described above.""" if a+1 == b: if a%2 == 0: return 'EvenOdd' else: return 'OddEven' if a == b+1: if a%2 == 0: return 'OddEven' else: return 'EvenOdd' return b - a def _AddDelta(a, delta): """Return a + delta, handling EvenOdd and OddEven specially.""" if type(delta) == int: return a+delta if delta == 'EvenOdd': if a%2 == 0: return a+1 else: return a-1 if delta == 'OddEven': if a%2 == 1: return a+1 else: return a-1 print >>sys.stderr, "Bad Delta: ", delta raise "Bad Delta" def _MakeRanges(pairs): """Turn a list like [(65,97), (66, 98), ..., (90,122)] into [(65, 90, +32)].""" ranges = [] last = -100 def evenodd(last, a, b, r): if a != last+1 or b != _AddDelta(a, r[2]): return False r[1] = a return True def evenoddpair(last, a, b, r): if a != last+2: return False delta = r[2] d = delta if type(delta) is not str: return False if delta.endswith('Skip'): d = delta[:-4] else: delta = d + 'Skip' if b != _AddDelta(a, d): return False r[1] = a r[2] = delta return True for a, b in pairs: if ranges and evenodd(last, a, b, ranges[-1]): pass elif ranges and evenoddpair(last, a, b, ranges[-1]): pass else: ranges.append([a, a, _Delta(a, b)]) last = a return ranges # The maximum size of a case-folding group. # Case folding is implemented in parse.cc by a recursive process # with a recursion depth equal to the size of the largest # case-folding group, so it is important that this bound be small. # The current tables have no group bigger than 4. # If there are ever groups bigger than 10 or so, it will be # time to rework the code in parse.cc. MaxCasefoldGroup = 4 def main(): lowergroups, casegroups = unicode.CaseGroups() foldpairs = [] seen = {} for c in casegroups: if len(c) > MaxCasefoldGroup: raise unicode.Error("casefold group too long: %s" % (c,)) for i in range(len(c)): if c[i-1] in seen: raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) seen[c[i-1]] = True foldpairs.append([c[i-1], c[i]]) lowerpairs = [] for lower, group in lowergroups.iteritems(): for g in group: if g != lower: lowerpairs.append([g, lower]) def printpairs(name, foldpairs): foldpairs.sort() foldranges = _MakeRanges(foldpairs) print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) print "CaseFold unicode_%s[] = {" % (name,) for lo, hi, delta in foldranges: print "\t{ %d, %d, %s }," % (lo, hi, delta) print "};" print "int num_unicode_%s = %d;" % (name, len(foldranges),) print "" print _header printpairs("casefold", foldpairs) printpairs("tolower", lowerpairs) print _trailer if __name__ == '__main__': main()