#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

# NOTE: every print(...) call in this file passes exactly one argument.
# With a single parenthesized argument the output is the same whether
# `print` is the Python 2 statement or the Python 3 function.

import sys
import unicode

# Text emitted before the generated tables.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text emitted after the generated tables.
_trailer = """

} // namespace re2

"""

# Running totals of emitted 16-bit and 32-bit ranges, updated by
# PrintGroup and reported by main.
n16 = 0
n32 = 0


def MakeRanges(codes):
    """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

    Args:
        codes: iterable of integers.  Each value that is exactly one more
            than the previous value extends the current range; any other
            value starts a new range.

    Returns:
        A list of [lo, hi] two-element lists (inclusive bounds).
    """
    ranges = []
    for c in codes:
        # Extend the open range when c directly follows its upper bound;
        # checking `ranges` first avoids needing a sentinel "last" value.
        if ranges and c == ranges[-1][1] + 1:
            ranges[-1][1] = c
        else:
            ranges.append([c, c])
    return ranges


def PrintRanges(type, name, ranges):
    """Print the ranges as an array of type named name.

    Args:
        type: C++ element type name used in the array declaration.
        name: C++ identifier of the array.
        ranges: iterable of (lo, hi) integer pairs, one per array entry.
    """
    print("static %s %s[] = {" % (type, name,))
    for lo, hi in ranges:
        print("\t{ %d, %d }," % (lo, hi))
    print("};")


# Disabled: printer for the singleton-code arrays (see PrintGroup).
# def PrintCodes(type, name, codes):
#     """Print the codes as an array of type named name."""
#     print "static %s %s[] = {" % (type, name,)
#     for c in codes:
#         print "\t%d," % (c,)
#     print "};"


def PrintGroup(name, codes):
    """Print the data structures for the group of codes.

    Prints a URange16 array and/or a URange32 array for the group (only
    the non-empty ones) and adds their lengths to the module-level
    counters n16 and n32.

    Args:
        name: group name; used as the quoted name in the returned literal
            and as the prefix of the emitted array identifiers.
        codes: iterable of integer code points belonging to the group.

    Returns:
        A UGroup literal for the group, as a string.
    """
    global n16
    global n32

    # See unicode_groups.h for a description of the data structure.

    # Split codes into 16-bit ranges and 32-bit ranges.
    range16 = MakeRanges([c for c in codes if c < 65536])
    range32 = MakeRanges([c for c in codes if c >= 65536])

    # Disabled: pull singleton ranges out of range16.
    # code16 = [lo for lo, hi in range16 if lo == hi]
    # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

    n16 += len(range16)
    n32 += len(range32)

    ugroup = "{ \"%s\", +1" % (name,)

    # Disabled: emit the singleton-code array.
    # if len(code16) > 0:
    #     PrintCodes("uint16", name+"_code16", code16)
    #     ugroup += ", %s_code16, %d" % (name, len(code16))
    # else:
    #     ugroup += ", 0, 0"

    # An empty table is written as ", 0, 0" in the literal and no array
    # is printed for it.
    if range16:
        PrintRanges("URange16", name + "_range16", range16)
        ugroup += ", %s_range16, %d" % (name, len(range16))
    else:
        ugroup += ", 0, 0"

    if range32:
        PrintRanges("URange32", name + "_range32", range32)
        ugroup += ", %s_range32, %d" % (name, len(range32))
    else:
        ugroup += ", 0, 0"

    ugroup += " }"
    return ugroup


def main():
    """Write the generated C++ source for all groups to standard output."""
    print(_header)

    ugroups = []
    # Visit groups in sorted-name order so the emitted arrays appear in a
    # reproducible order from run to run.
    for name, codes in sorted(unicode.Categories().items()):
        ugroups.append(PrintGroup(name, codes))
    for name, codes in sorted(unicode.Scripts().items()):
        ugroups.append(PrintGroup(name, codes))

    print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
    print("UGroup unicode_groups[] = {")
    ugroups.sort()
    for ug in ugroups:
        print("\t%s," % (ug,))
    print("};")
    print("int num_unicode_groups = %d;" % (len(ugroups),))
    print(_trailer)


if __name__ == '__main__':
    main()