// ucol_tok.cpp - Android社区 - https://www.androidos.net.cn/

/*
*******************************************************************************
*
*   Copyright (C) 2001-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucol_tok.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created 02/22/2001
*   created by: Vladimir Weinstein
*
* This module reads a tailoring rule string and produces a list of
* tokens that will be turned into collation elements
*
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"

#include "ucol_tok.h"
#include "ucol_bld.h"
#include "cmemory.h"
#include "../common/util.h"

U_CDECL_BEGIN
/*
 * Hash callback for a table keyed by UColToken pointers.
 *
 * The token's `source` field packs the length of the token text into its top
 * 8 bits and the offset of the text (relative to rulesToParse) into the low
 * 24 bits. A NULL key hashes to 0.
 */
static int32_t U_CALLCONV
uhash_hashTokens(const UHashTok k)
{
    UColToken *token = (UColToken *)k.pointer;
    int32_t result = 0;

    if (token == 0) {
        return result;
    }

    int32_t length = (token->source & 0xFF000000)>>24;
    /* The stride grows by one for every additional 32 characters, so long */
    /* tokens are sampled instead of being hashed character by character.  */
    int32_t stride = ((length - 32) / 32) + 1;
    const UChar *cursor = (token->source & 0x00FFFFFF) + token->rulesToParse;
    const UChar *stop = cursor + length;

    for (; cursor < stop; cursor += stride) {
        result = (result * 37) + *cursor;
    }
    return result;
}

/*
 * Equality callback for a table keyed by UColToken pointers.
 *
 * Two tokens are equal when they are the same object, when their packed
 * `source` words are identical, or when their source texts have the same
 * length and the same characters. A token whose `source` is 0 never equals
 * a different token.
 */
static UBool U_CALLCONV
uhash_compareTokens(const UHashTok key1, const UHashTok key2)
{
    UColToken *lhs = (UColToken *)key1.pointer;
    UColToken *rhs = (UColToken *)key2.pointer;
    /* low 24 bits of `source`: offset of the text; top 8 bits: its length */
    const UChar *lhsText = (lhs->source & 0x00FFFFFF) + lhs->rulesToParse;
    const UChar *rhsText = (rhs->source & 0x00FFFFFF) + rhs->rulesToParse;
    uint32_t lhsLen = ((lhs->source & 0xFF000000) >> 24);
    uint32_t rhsLen = ((rhs->source & 0xFF000000) >> 24);
    const UChar *lastChar = lhsText+lhsLen-1;

    if (lhs == rhs) {
        return TRUE;
    }
    if (lhs->source == 0 || rhs->source == 0 || lhsLen != rhsLen) {
        return FALSE;
    }
    if (lhs->source == rhs->source) {
        return TRUE;
    }

    /* Advance over the common prefix, stopping on the last character at the latest. */
    for (; (lhsText < lastChar) && *lhsText == *rhsText; ++lhsText, ++rhsText) {
    }

    /* Either the first mismatch or the final character decides the result. */
    return (*lhsText == *rhsText) ? TRUE : FALSE;
}
U_CDECL_END

/*static inline void U_CALLCONV
uhash_freeBlockWrapper(void *obj) {
    uhash_freeBlock(obj);
}*/

/* One entry of the indirect-positioning table (ucolIndirectBoundaries).     */
/* Entries are written by setIndirectBoundaries(); the start pair is read by */
/* ucol_tok_doSetTop() when a reset names an indirect position.              */
typedef struct {
    uint32_t startCE;       /* copied from start[0] */
    uint32_t startContCE;   /* copied from start[1]; 0 means "no continuation" to ucol_tok_doSetTop() */
    uint32_t limitCE;       /* copied from end[0], or 0 when no end was supplied */
    uint32_t limitContCE;   /* copied from end[1], or 0 when no end was supplied */
} indirectBoundaries;

/* These values are used for finding CE values for indirect positioning.  */
/* Indirect positioning is a mechanism for allowing resets on symbolic    */
/* values. It only works for resets and you cannot tailor indirect names. */
/* An indirect name can define either an anchor point or a range. An      */
/* anchor point behaves in exactly the same way as a code point in reset  */
/* would, except that it cannot be tailored. A range (we currently only   */
/* know of the [top] range) will explicitly set the upper bound for       */
/* generated CEs, thus allowing for better control over how many CEs can  */
/* be squeezed in the range without performance penalty.                  */
/* In that respect, we use [top] for tailoring of locales that use CJK    */
/* characters. Other indirect values are currently a pure convenience,    */
/* they can be used to assure that the CEs will be always positioned in   */
/* the same place relative to a point with known properties (e.g. first   */
/* primary ignorable).                                                    */
/*                                                                        */
/* Index 0 is used for [top]; [first X] / [last X] use indices            */
/* 1 + 2*j and 2 + 2*j, where j is the position of X in firstLastSub      */
/* (7 entries), which gives 15 slots in total.                            */
static indirectBoundaries ucolIndirectBoundaries[15];
/*
static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_RESET_TOP_VALUE,               0,
UCOL_NEXT_TOP_VALUE,                0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
0,                                  0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
0,                                  0 },
{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_FIRST_VARIABLE,                0,
0,                                  0 },
{ UCOL_LAST_VARIABLE,                 0,
0,                                  0 },
{ UCOL_FIRST_NON_VARIABLE,            0,
0,                                  0 },
{ UCOL_LAST_NON_VARIABLE,             0,
0,                                  0 },
};
*/

/*
 * Stores the boundary CEs for slot `indexR` of ucolIndirectBoundaries.
 *
 * start - two values: the start CE and its continuation.
 * end   - two values: the limit CE and its continuation, or NULL, in which
 *         case both limit fields are cleared to 0.
 */
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    // Set values for the top - TODO: once we have values for all the indirects, we are going
    // to initialize here.
    indirectBoundaries *slot = &ucolIndirectBoundaries[indexR];

    slot->startCE = start[0];
    slot->startContCE = start[1];
    slot->limitCE = end ? end[0] : 0;
    slot->limitContCE = end ? end[1] : 0;
}

static inline
void syntaxError(const UChar* rules,
                 int32_t pos,
                 int32_t rulesLen,
                 UParseError* parseError)
{
    parseError->offset = pos;
    parseError->line = 0 ; /* we are not using line numbers */

// for pre-context
    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    int32_t stop  = pos;

u_memcpy(parseError->preContext,rules+start,stop-start);
    //null terminate the buffer
    parseError->preContext[stop-start] = 0;

//for post-context
    start = pos+1;
    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    rulesLen;

if(start < stop) {
        u_memcpy(parseError->postContext,rules+start,stop-start);
        //null terminate the buffer
        parseError->postContext[stop-start]= 0;
    } else {
        parseError->postContext[0] = 0;
    }
}

/*
 * Stores `value` into the field of `opts` that corresponds to `attrib`.
 * Attributes without a matching field (including UCOL_ATTRIBUTE_COUNT) are
 * silently ignored.
 */
static
void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    if(attrib == UCOL_HIRAGANA_QUATERNARY_MODE) {
        opts->hiraganaQ = value;
    } else if(attrib == UCOL_FRENCH_COLLATION) {
        opts->frenchCollation = value;
    } else if(attrib == UCOL_ALTERNATE_HANDLING) {
        opts->alternateHandling = value;
    } else if(attrib == UCOL_CASE_FIRST) {
        opts->caseFirst = value;
    } else if(attrib == UCOL_CASE_LEVEL) {
        opts->caseLevel = value;
    } else if(attrib == UCOL_NORMALIZATION_MODE) {
        opts->normalizationMode = value;
    } else if(attrib == UCOL_STRENGTH) {
        opts->strength = value;
    } else if(attrib == UCOL_NUMERIC_COLLATION) {
        opts->numericCollation = value;
    }
    /* UCOL_ATTRIBUTE_COUNT and anything else: nothing to store */
}

/* Number of entries in rulesOptions[] (one per option_NN string below). */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE by ucol_uprv_tok_initData() after the U_STRING_INIT calls have run. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */

/* Each declaration below has a matching U_STRING_INIT, with the same text and */
/* length, in ucol_uprv_tok_initData().                                        */

/* values of the "alternate" option (alternateSub) */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

/* values used by caseFirstSub and onOffSub; the digits and "I" are used by */
/* frenchSub, beforeSub and strengthSub                                      */
U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

/* arguments of the "first" and "last" options (firstLastSub) */
U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);

/* option names; rulesOptions[] lists them in OptionNumber order, which is */
/* not the numbering used here                                             */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);

/*
[last variable] last variable value
[last primary ignorable] largest CE for primary ignorable
[last secondary ignorable] largest CE for secondary ignorable
[last tertiary ignorable] largest CE for tertiary ignorable
[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
*/

/* Suboption tables. Each entry is {name, name length, attribute value}; the */
/* tables are referenced from rulesOptions[] together with their sizes.      */

/* values accepted by "alternate" */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* values accepted by "caseFirst" */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* values accepted by the on/off options (caseLevel, normalization, hiraganaQ, numericOrdering) */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* "backwards" only accepts "2", which switches the attribute on */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* strengths accepted by "before"; ucol_uprv_tok_readAndSetOption() returns the value + 1 in its result */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* values accepted by "strength" */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* arguments of "first" and "last". ucol_uprv_tok_readAndSetOption() uses only */
/* the position of the matching entry (to compute the indirect index); the     */
/* attribute value column is not read for these options.                       */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};

/* Index of each option inside rulesOptions[]. ucol_uprv_tok_readOption()  */
/* returns the index of the matching rulesOptions[] entry and              */
/* ucol_uprv_tok_readAndSetOption() switches on it using these names, so   */
/* the order here has to stay in step with the order of rulesOptions[].    */
enum OptionNumber {
    /* options that map directly onto a collator attribute */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias for the last attribute-backed option */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /* options whose rulesOptions[] entry carries UCOL_ATTRIBUTE_COUNT as attribute */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,   /* OPTION_LAST must directly follow: the indirect index computation relies on it */
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    /* reported as U_UNSUPPORTED_ERROR by ucol_uprv_tok_readAndSetOption() */
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET
} ;

/* Table of all recognized rule options, indexed by OptionNumber.            */
/* Each entry is {name, name length, suboption table, suboption count,       */
/* attribute}. Entries that do not set a collator attribute carry            */
/* UCOL_ATTRIBUTE_COUNT. Lookup is a linear scan that stops at the first     */
/* name matching as a case-insensitive prefix, so "charsetname" has to come  */
/* before "charset".                                                         */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
};

/*
 * Compares at most `n` code units of s1 and s2 after lower-casing each one
 * with u_tolower().
 *
 * Returns the difference of the first pair of lower-cased code units that
 * differ, or 0 when the first `n` units match, when s1 ends (NUL) on a
 * matching unit, or when n <= 0.
 */
static
int32_t u_strncmpNoCase(const UChar     *s1,
                        const UChar     *s2,
                        int32_t     n)
{
    for(; n > 0; --n, ++s1, ++s2) {
        int32_t diff = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
        /* stop on the first difference or at the end of s1 */
        if(diff != 0 || *s1 == 0) {
            return diff;
        }
    }
    return 0;
}

/*
 * One-time initialization of the option and suboption name strings declared
 * with U_STRING_DECL. Calls after the first one return immediately because
 * didInit is already set.
 */
static
void ucol_uprv_tok_initData() {
    if(didInit) {
        return;
    }

    /* suboption names */
    U_STRING_INIT(suboption_00, "non-ignorable", 13);
    U_STRING_INIT(suboption_01, "shifted",        7);

    U_STRING_INIT(suboption_02, "lower",          5);
    U_STRING_INIT(suboption_03, "upper",          5);
    U_STRING_INIT(suboption_04, "off",            3);
    U_STRING_INIT(suboption_05, "on",             2);

    U_STRING_INIT(suboption_06, "1",              1);
    U_STRING_INIT(suboption_07, "2",              1);
    U_STRING_INIT(suboption_08, "3",              1);
    U_STRING_INIT(suboption_09, "4",              1);
    U_STRING_INIT(suboption_10, "I",              1);

    U_STRING_INIT(suboption_11, "primary",        7);
    U_STRING_INIT(suboption_12, "secondary",      9);
    U_STRING_INIT(suboption_13, "tertiary",       8);
    U_STRING_INIT(suboption_14, "variable",       8);
    U_STRING_INIT(suboption_15, "regular",        7);
    U_STRING_INIT(suboption_16, "implicit",       8);
    U_STRING_INIT(suboption_17, "trailing",       8);

    /* option names */
    U_STRING_INIT(option_00, "undefined",      9);
    U_STRING_INIT(option_01, "rearrange",      9);
    U_STRING_INIT(option_02, "alternate",      9);
    U_STRING_INIT(option_03, "backwards",      9);
    U_STRING_INIT(option_04, "variable top",  12);
    U_STRING_INIT(option_05, "top",            3);
    U_STRING_INIT(option_06, "normalization", 13);
    U_STRING_INIT(option_07, "caseLevel",      9);
    U_STRING_INIT(option_08, "caseFirst",      9);
    U_STRING_INIT(option_09, "scriptOrder",   11);
    U_STRING_INIT(option_10, "charsetname",   11);
    U_STRING_INIT(option_11, "charset",        7);
    U_STRING_INIT(option_12, "before",         6);
    U_STRING_INIT(option_13, "hiraganaQ",      9);
    U_STRING_INIT(option_14, "strength",       8);
    U_STRING_INIT(option_15, "first",          5);
    U_STRING_INIT(option_16, "last",           4);
    U_STRING_INIT(option_17, "optimize",       8);
    U_STRING_INIT(option_18, "suppressContractions",         20);
    U_STRING_INIT(option_19, "numericOrdering",      15);

    didInit = TRUE;
}

// This function reads basic options to set in the runtime collator
// used by data driven tests. Should not support build time options
//
// Parses one "[option value]" group from the range [start, end).
//
// start, end - the text to parse; end is exclusive and nothing at or beyond
//              it is read.
// attrib     - receives the attribute of the recognized option.
// value      - receives the attribute value of the recognized suboption.
// status     - set to U_ILLEGAL_ARGUMENT_ERROR when the text does not start
//              with '[', names an unknown option, has no recognized value,
//              or is not closed by ']' before end.
//
// Returns a pointer just past the closing ']' on success. Returns NULL on
// error, and also (with status untouched) when the range is empty or holds
// only whitespace.
U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar *start, const UChar *end,
                         UColAttribute *attrib, UColAttributeValue *value,
                         UErrorCode *status)
{
    uint32_t i = 0;
    int32_t j=0;
    const UChar *optionArg = NULL;

    ucol_uprv_tok_initData();

    while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
        start++;
    }
    if(start >= end) {
        return NULL;
    }
    /* skip opening '[' */
    if(*start == 0x005b) {
        start++;
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
        return NULL;
    }

    /* Find the option name. The name has to be followed by at least one more  */
    /* character inside [start, end), so that the comparison and the separator */
    /* skip below never touch anything at or beyond end.                       */
    for(i = 0; i < UTOK_OPTION_COUNT; i++) {
        int32_t nameLen = rulesOptions[i].optionLen;
        if(end - start > nameLen
            && u_strncmpNoCase(start, rulesOptions[i].optionName, nameLen) == 0) {
            optionArg = start+nameLen+1; /* start of the options, skip space */
            while(optionArg < end && u_isWhitespace(*optionArg)) { /* eat whitespace */
                optionArg++;
            }
            break;
        }
    }

    if(optionArg == NULL) {
        /* unknown option, or nothing follows the option name */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    for(j = 0; j<rulesOptions[i].subSize; j++) {
        const ucolTokSuboption *sub = &rulesOptions[i].subopts[j];
        if(end - optionArg >= sub->subLen
            && u_strncmpNoCase(optionArg, sub->subName, sub->subLen) == 0) {
            *attrib = rulesOptions[i].attr;
            *value = sub->attrVal;
            optionArg += sub->subLen;
            while(optionArg < end && u_isWhitespace(*optionArg)) { /* eat whitespace */
                optionArg++;
            }
            if(optionArg < end && *optionArg == 0x005d) {
                return optionArg+1; /* just past the closing ']' */
            }
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* missing closing ']' */
            return NULL;
        }
    }

    /* no suboption matched (or the option does not take one) */
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
}

/*
 * Finds the first '[' in [start, end), collects the balanced bracket
 * expression that begins there and opens it as a USet with uset_openPattern().
 *
 * Returns the new set, or NULL with *status set to U_ILLEGAL_ARGUMENT_ERROR
 * when no '[' is found before end, when the brackets are not balanced before
 * end, or when no further ']' follows the set pattern.
 */
static
USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    while(start < end && *start != 0x005b) { /* advance until we find the first '[' */
        start++;
    }
    if(start >= end) {
        /* no set pattern before end; do not scan past the buffer looking for one */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    // now we need to get a balanced set of '[]'. The problem is that a set can have
    // many, and *end points to the first closing ']'
    int32_t noOpenBraces = 1;
    int32_t current = 1; // skip the opening brace
    while(start+current < end && noOpenBraces != 0) {
        if(start[current] == 0x005b) {
            noOpenBraces++;
        } else if(start[current] == 0x005D) { // closing brace
            noOpenBraces--;
        }
        current++;
    }

    // the pattern must be balanced and must be followed by one more ']'
    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    return uset_openPattern(start, current, status);
}

/*
 * Identifies the option whose name begins at `start` (after optional
 * whitespace).
 *
 * Returns the index of the first rulesOptions[] entry whose name matches as
 * a case-insensitive prefix, or -1 when none matches. When more text follows
 * the name before `end`, *optionArg is set to the first non-whitespace
 * character after the name; otherwise *optionArg is left untouched.
 */
static
int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    ucol_uprv_tok_initData();

    for(; u_isWhitespace(*start); ++start) { /* eat whitespace */
    }

    for(int32_t idx = 0; idx < UTOK_OPTION_COUNT; ++idx) {
        if(u_strncmpNoCase(start, rulesOptions[idx].optionName, rulesOptions[idx].optionLen) != 0) {
            continue;
        }
        if(end - start > rulesOptions[idx].optionLen) {
            const UChar *arg = start+rulesOptions[idx].optionLen; /* start of the options*/
            while(u_isWhitespace(*arg)) { /* eat whitespace */
                ++arg;
            }
            *optionArg = arg;
        }
        return idx;
    }

    return -1; // didn't find an option
}

// reads and conforms to various options in rules
// end is the position of the first closing ']'
// However, some of the options take an UnicodeSet definition
// which needs to duplicate the closing ']'
// for example: '[copy [\uAC00-\uD7FF]]'
// These options will move end to the second ']' and the
// caller will set the current to it.
//
// On entry src->current points at the opening '['. On return src->current
// points at the ']' that closes the option; if there is no such ']' before
// src->end, src->current is set to src->end and *status to
// U_ILLEGAL_ARGUMENT_ERROR.
//
// Returns 0 on failure, otherwise UCOL_TOK_SUCCESS, possibly combined with
// UCOL_TOK_VARIABLE_TOP, UCOL_TOK_TOP or, for "before", the strength value + 1.
// *status is set to U_ILLEGAL_ARGUMENT_ERROR for an unknown option or a
// missing/unknown argument and to U_UNSUPPORTED_ERROR for a recognized option
// that is not handled here.
static
uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    const UChar* start = src->current;
    int32_t i = 0;
    int32_t j=0;
    const UChar *optionArg = NULL;
    const UChar *closing = NULL;

    uint8_t result = 0;

    start++; /*skip opening '['*/
    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    if(optionArg) {
        src->current = optionArg;
    }

    if(i < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        int32_t noOpenBraces = 1;
        switch(i) {
        case OPTION_ALTERNATE_HANDLING:
        case OPTION_FRENCH_COLLATION:
        case OPTION_CASE_LEVEL:
        case OPTION_CASE_FIRST:
        case OPTION_NORMALIZATION_MODE:
        case OPTION_HIRAGANA_QUATERNARY:
        case OPTION_STRENGTH:
        case OPTION_NUMERIC_COLLATION:
            /* plain attribute options: store the value in the option image */
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
                        result =  UCOL_TOK_SUCCESS;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_VARIABLE_TOP:
            result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
            break;
        case OPTION_REARRANGE:
            result = UCOL_TOK_SUCCESS;
            break;
        case OPTION_BEFORE:
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        /* the strength is reported as value + 1 in the result bits */
                        result = (uint8_t)(UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1));
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
            /* index to this array will be src->parsedToken.indirectIndex*/
            src->parsedToken.indirectIndex = 0;
            result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
            break;
        case OPTION_FIRST:
        case OPTION_LAST: /* first, last */
            /* without an argument there is nothing to compare against */
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
                        // element of indirect boundaries is reserved for top.
                        src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
                        result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_OPTIMIZE:
        case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
            // we need to move end here
            src->current++; // skip opening brace
            while(src->current < src->end && noOpenBraces != 0) {
                if(*src->current == 0x005b) {
                    noOpenBraces++;
                } else if(*src->current == 0x005D) { // closing brace
                    noOpenBraces--;
                }
                src->current++;
            }
            result = UCOL_TOK_SUCCESS;
            break;
        default:
            *status = U_UNSUPPORTED_ERROR;
            break;
        }
    }

    /* Move to the ']' that closes the option. Never leave src->current NULL: */
    /* the caller keeps advancing and dereferencing it.                       */
    closing = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
    if(closing == NULL) {
        if(U_SUCCESS(*status)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        src->current = src->end;
        return 0;
    }
    src->current = closing;
    return result;
}

/*
 * Appends `len` code units from `stuff` at src->extraCurrent and advances
 * src->extraCurrent past them.
 *
 * When the data would reach src->extraEnd, the buffer starting at src->source
 * is reallocated (its capacity is doubled until the data fits) and every
 * parser pointer into the buffer is rebased onto the new block. If the
 * reallocation fails, *status is set to U_MEMORY_ALLOCATION_ERROR, the parser
 * is left unchanged and nothing is appended.
 */
inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    if(src->extraCurrent+len >= src->extraEnd) {
        /* reallocate */
        /* Take all offsets before the old block can be released. */
        int32_t currentOffset = (int32_t)(src->current - src->source);
        int32_t extraOffset = (int32_t)(src->extraCurrent - src->source);
        int32_t endOffset = (int32_t)(src->end - src->source);
        int32_t sourceCurrentOffset = (int32_t)(src->sourceCurrent - src->source);
        int32_t newCapacity = (int32_t)(src->extraEnd - src->source);

        /* Double at least once, and keep doubling until the new data fits. */
        do {
            newCapacity = (newCapacity > 0) ? newCapacity*2 : 1;
        } while(extraOffset+len >= newCapacity);

        UChar *newSrc = (UChar *)uprv_realloc(src->source, newCapacity*sizeof(UChar));
        if(newSrc == NULL) {
            /* The old buffer is still valid but too small: do not write into it. */
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->current = newSrc + currentOffset;
        src->extraCurrent = newSrc + extraOffset;
        src->end = newSrc + endOffset;
        src->extraEnd = newSrc + newCapacity;
        src->sourceCurrent = newSrc + sourceCurrentOffset;
        src->source = newSrc;
    }
    if(len == 1) {
        *src->extraCurrent++ = *stuff;
    } else {
        uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
        src->extraCurrent += len;
    }
}

/*
 * Writes the encoded form of the indirect boundary selected by
 * src->parsedToken.indirectIndex into the extra buffer and makes the parsed
 * token's chars refer to it.
 *
 * The encoding is 0xFFFE followed by the start CE as two 16-bit halves and,
 * when the continuation CE is non-zero, by the continuation CE as two more
 * halves (3 or 5 code units in total). Always returns TRUE.
 */
inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    const indirectBoundaries *boundary = &ucolIndirectBoundaries[src->parsedToken.indirectIndex];
    UChar encoded[5];

    /* the token text starts where the next appended unit will land */
    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

    encoded[0] = 0xFFFE;
    encoded[1] = (UChar)(boundary->startCE >> 16);
    encoded[2] = (UChar)(boundary->startCE & 0xFFFF);

    if(boundary->startContCE != 0) {
        encoded[3] = (UChar)(boundary->startContCE >> 16);
        encoded[4] = (UChar)(boundary->startContCE & 0xFFFF);
        src->parsedToken.charsLen = 5;
        ucol_tok_addToExtraCurrent(src, encoded, 5, status);
    } else {
        src->parsedToken.charsLen = 3;
        ucol_tok_addToExtraCurrent(src, encoded, 3, status);
    }
    return TRUE;
}

/* Returns TRUE when c is one of the line-break characters LF, CR, FF, NEL, */
/* LS or PS, and FALSE for anything else.                                   */
static UBool isCharNewLine(UChar c){
    if(c == 0x000A /* LF  */
        || c == 0x000D /* CR  */
        || c == 0x000C /* FF  */
        || c == 0x0085 /* NEL */
        || c == 0x2028 /* LS  */
        || c == 0x2029 /* PS  */) {
        return TRUE;
    }
    return FALSE;
}

U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser *src,
                        UBool startOfRules,
                        UParseError *parseError,
                        UErrorCode *status)
{
    /* parsing part */
    UBool variableTop = FALSE;
    UBool top = FALSE;
    UBool inChars = TRUE;
    UBool inQuote = FALSE;
    UBool wasInQuote = FALSE;
    uint8_t before = 0;
    UBool isEscaped = FALSE;
    // TODO: replace these variables with src->parsedToken counterparts
    // no need to use them anymore since we have src->parsedToken.
    // Ideally, token parser would be a nice class... Once, when I have
    // more time (around 2020 probably).
    uint32_t newExtensionLen = 0;
    uint32_t extensionOffset = 0;
    uint32_t newStrength = UCOL_TOK_UNSET;
    UChar buff[10];

src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    src->parsedToken.indirectIndex = 0;

while (src->current < src->end) {
        UChar ch = *(src->current);

if (inQuote) {
            if (ch == 0x0027/*'\''*/) {
                inQuote = FALSE;
            } else {
                if ((src->parsedToken.charsLen == 0) || inChars) {
                    if(src->parsedToken.charsLen == 0) {
                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                    }
                    src->parsedToken.charsLen++;
                } else {
                    if(newExtensionLen == 0) {
                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
                    }
                    newExtensionLen++;
                }
            }
        }else if(isEscaped){
            isEscaped =FALSE;
            if (newStrength == UCOL_TOK_UNSET) {
                *status = U_INVALID_FORMAT_ERROR;
                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                return NULL;
                // enabling rules to start with non-tokens a < b
                // newStrength = UCOL_TOK_RESET;
            }
            if(ch != 0x0000  && src->current != src->end) {
                if (inChars) {
                    if(src->parsedToken.charsLen == 0) {
                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
                    }
                    src->parsedToken.charsLen++;
                } else {
                    if(newExtensionLen == 0) {
                        extensionOffset = (uint32_t)(src->current - src->source);
                    }
                    newExtensionLen++;
                }
            }
        }else {
            if(!uprv_isRuleWhiteSpace(ch)) {
                /* Sets the strength for this entry */
                switch (ch) {
                case 0x003D/*'='*/ :
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

case 0x002C/*','*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

case  0x003B/*';'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

case 0x003C/*'<'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

/* if we start with strength, we'll reset to top */
                    if(startOfRules == TRUE) {
                        src->parsedToken.indirectIndex = 5;
                        top = ucol_tok_doSetTop(src, status);
                        newStrength = UCOL_TOK_RESET;
                        goto EndOfLoop;
                    }
                    /* before this, do a scan to verify whether this is */
                    /* another strength */
                    if(*(src->current+1) == 0x003C) {
                        src->current++;
                        if(*(src->current+1) == 0x003C) {
                            src->current++; /* three in a row! */
                            newStrength = UCOL_TERTIARY;
                        } else { /* two in a row */
                            newStrength = UCOL_SECONDARY;
                        }
                    } else { /* just one */
                        newStrength = UCOL_PRIMARY;
                    }
                    break;

case 0x0026/*'&'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        /**/
                        goto EndOfLoop;
                    }

newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
                    break;

case 0x005b/*'['*/:
                    /* options - read an option, analyze it */
                    if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
                        if(U_SUCCESS(*status)) {
                            if(result & UCOL_TOK_TOP) {
                                if(newStrength == UCOL_TOK_RESET) {
                                    top = ucol_tok_doSetTop(src, status);
                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
                                        src->parsedToken.charsLen+=2;
                                        buff[0] = 0x002d;
                                        buff[1] = before;
                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
                                    }

src->current++;
                                    goto EndOfLoop;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                                }
                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
                                    variableTop = TRUE;
                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                                    src->parsedToken.charsLen = 1;
                                    buff[0] = 0xFFFF;
                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
                                    src->current++;
                                    goto EndOfLoop;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                                }
                            } else if (result & UCOL_TOK_BEFORE){
                                if(newStrength == UCOL_TOK_RESET) {
                                    before = result & UCOL_TOK_BEFORE;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

}
                            }
                        } else {
                            *status = U_INVALID_FORMAT_ERROR;
                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                            return NULL;
                        }
                    }
                    break;
                case 0x0021/*! skip java thai modifier reordering*/:
                    break;
                case 0x002F/*'/'*/:
                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
                    inChars = FALSE; /* we're now processing expansion */
                    break;
                case 0x005C /* back slash for escaped chars */:
                    isEscaped = TRUE;
                    break;
                    /* found a quote, we're gonna start copying */
                case 0x0027/*'\''*/:
                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                        // enabling rules to start with a non-token character a < b
                        // newStrength = UCOL_TOK_RESET;
                    }

inQuote = TRUE;

if(inChars) { /* we're doing characters */
                        if(wasInQuote == FALSE) {
                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                        }
                        if (src->parsedToken.charsLen != 0) {
                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
                        }
                        src->parsedToken.charsLen++;
                    } else { /* we're doing an expansion */
                        if(wasInQuote == FALSE) {
                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
                        }
                        if (newExtensionLen != 0) {
                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
                        }
                        newExtensionLen++;
                    }

wasInQuote = TRUE;

ch = *(++(src->current));
                    if(ch == 0x0027) { /* copy the double quote */
                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
                        inQuote = FALSE;
                    }
                    break;

/* '@' is french only if the strength is not currently set */
                    /* if it is, it's just a regular character in collation rules */
                case 0x0040/*'@'*/:
                    if (newStrength == UCOL_TOK_UNSET) {
                        src->opts->frenchCollation = UCOL_ON;
                        break;
                    }

case 0x007C /*|*/: /* this means we have actually been reading prefix part */
                    // we want to store read characters to the prefix part and continue reading
                    // the characters (proper way would be to restart reading the chars, but in
                    // that case we would have to complicate the token hasher, which I do not
                    // intend to play with. Instead, we will do prefixes when prefixes are due
                    // (before adding the elements).
                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
                    src->parsedToken.prefixLen = src->parsedToken.charsLen;

wasInQuote = TRUE;

do {
                        ch = *(++(src->current));
                        // skip whitespace between '|' and the character
                    } while (uprv_isRuleWhiteSpace(ch));
                    break;

//charsOffset = 0;
                    //newCharsLen = 0;
                    //break; // We want to store the whole prefix/character sequence. If we break
                    // the '|' is going to get lost.
                case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
                    do {
                        ch = *(++(src->current));
                    } while (!isCharNewLine(ch));

break;
                default:
                    if (newStrength == UCOL_TOK_UNSET) {
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                    }

if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                    }

if(ch == 0x0000 && src->current+1 == src->end) {
                        break;
                    }

if (inChars) {
                        if(src->parsedToken.charsLen == 0) {
                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
                        }
                        src->parsedToken.charsLen++;
                    } else {
                        if(newExtensionLen == 0) {
                            extensionOffset = (uint32_t)(src->current - src->source);
                        }
                        newExtensionLen++;
                    }

break;
                }
            }
        }

if(wasInQuote) {
            if(ch != 0x27) {
                if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
                }
            }
        }

src->current++;
    }

EndOfLoop:
    wasInQuote = FALSE;
    if (newStrength == UCOL_TOK_UNSET) {
        return NULL;
    }

if (src->parsedToken.charsLen == 0 && top == FALSE) {
        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

src->parsedToken.strength = newStrength;
    src->parsedToken.extensionOffset = extensionOffset;
    src->parsedToken.extensionLen = newExtensionLen;
    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;

return src->current;
}

/*
Processing Description
1 Build a ListList. Each list has a header, which contains two lists (positive
and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
reset may be null.
2 As you process, you keep a LAST pointer that points to the last token you
handled.
*/

/*
 * Creates the reset token for the text currently described by src->parsedToken,
 * claims the next list header (src->lh[src->resultLen]) for it and registers the
 * token in the src->tailored hash.
 *
 * src         parser state; src->lh may be reallocated, src->resultLen is incremented
 * expand      if not NULL and the parsed text is longer than one code unit, the position
 *             inside the rules at which the reset text is split: the part before it stays
 *             the reset's source, the rest is reported through expandNext
 * expandNext  out: (length << 24) | offset of the part split off, or 0 if nothing was split
 * parseError  receives the error position when the reset carries a prefix
 * status      set to U_MEMORY_ALLOCATION_ERROR when an allocation fails and to
 *             U_INVALID_FORMAT_ERROR when the parsed token has a prefix
 *
 * Returns the new token, or NULL on one of the failures above. A failure reported by
 * uhash_put is only visible through status; the token is still returned in that case.
 */
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
                                      UParseError *parseError, UErrorCode *status)
{
    if(src->resultLen == src->listCapacity) {
        // Unfortunately, growing is not fully safe: tokens keep the address of their
        // list header (see listHeader below), so if the array moves, the headers stored
        // in previously created tokens are left pointing into the old array.
        //
        // Grow through a temporary so that a failed reallocation neither leaks the
        // existing array nor leaves src->lh NULL with a doubled capacity.
        UColTokListHeader *grown = (UColTokListHeader *)uprv_realloc(src->lh, 2*src->listCapacity*sizeof(UColTokListHeader));
        if(grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        src->lh = grown;
        src->listCapacity *= 2;
    }

    /* do the reset thing */
    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
    /* test for NULL */
    if (sourceToken == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* source and expansion are packed as (length << 24) | offset into the rules */
    sourceToken->rulesToParse = src->source;
    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

    // keep the flags around so that we know about before
    sourceToken->flags = src->parsedToken.flags;

    if(src->parsedToken.prefixOffset != 0) {
        // a reset with a prefix is a syntax error
        *status = U_INVALID_FORMAT_ERROR;
        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
        uprv_free(sourceToken);
        return NULL;
    }
    sourceToken->prefix = 0;

    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
    sourceToken->strength = UCOL_TOK_RESET;
    sourceToken->next = NULL;
    sourceToken->previous = NULL;
    sourceToken->noOfCEs = 0;
    sourceToken->noOfExpCEs = 0;
    sourceToken->listHeader = &src->lh[src->resultLen];

    /* the new list starts out empty, anchored at this reset */
    src->lh[src->resultLen].first = NULL;
    src->lh[src->resultLen].last = NULL;
    src->lh[src->resultLen].reset = sourceToken;

    /*
    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
    First convert all expansions into normal form. Examples:
    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
    d * ... into &x * c/y * d * ...
    Note: reset values can never have expansions, although they can cause the
    very next item to have one. They may be contractions, if they are found
    earlier in the list.
    */
    *expandNext = 0;
    if(expand != NULL) {
        /* check to see if there is an expansion */
        if(src->parsedToken.charsLen > 1) {
            /* split at 'expand': the head stays the reset, the tail goes to expandNext */
            uint32_t resetCharsOffset;
            resetCharsOffset = (uint32_t)(expand - src->source);
            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
        }
    }

    src->resultLen++;

    /* the token serves as both key and value in the tailored map */
    uhash_put(src->tailored, sourceToken, sourceToken, status);

    return sourceToken;
}

static
inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }
    /* this is a virgin before - we need to fish the anchor from the UCA */
    collIterate s;
    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
    uint32_t CE, SecondCE;
    uint32_t invPos;
    if(sourceToken != NULL) {
        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
    } else {
        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
    }

baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
    baseContCE = ucol_getNextCE(src->UCA, &s, status);
    if(baseContCE == UCOL_NO_MORE_CES) {
        baseContCE = 0;
    }

UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    uint32_t ch = 0;
    uint32_t expandNext = 0;
    UColToken key;

if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
        ch = uprv_uca_getCodePointFromRaw(raw-1);
        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
        CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
        SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;

src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
        *src->extraCurrent++ = 0xFFFE;
        *src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsLen++;

key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParse = src->source;

//sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);

if(sourceToken == NULL) {
            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
            if(isContinuation(SecondCE)) {
                src->lh[src->resultLen].baseContCE = SecondCE;
            } else {
                src->lh[src->resultLen].baseContCE = 0;
            }
            src->lh[src->resultLen].nextCE = 0;
            src->lh[src->resultLen].nextContCE = 0;
            src->lh[src->resultLen].previousCE = 0;
            src->lh[src->resultLen].previousContCE = 0;

src->lh[src->resultLen].indirect = FALSE;

sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        }

} else {
        invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

// we got the previous CE. Now we need to see if the difference between
        // the two CEs is really of the requested strength.
        // if it's a bigger difference (we asked for secondary and got primary), we
        // need to modify the CE.
        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
            // adjust the strength
            // now we are in the situation where our baseCE should actually be modified in
            // order to get the CE in the right position.
            if(strength == UCOL_SECONDARY) {
                CE = baseCE - 0x0200;
            } else { // strength == UCOL_TERTIARY
                CE = baseCE - 0x02;
            }
            if(baseContCE) {
                if(strength == UCOL_SECONDARY) {
                    SecondCE = baseContCE - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    SecondCE = baseContCE - 0x02;
                }
            }
        }

#if 0
        // the code below relies on getting a code point from the inverse table, in order to be
        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
        // 1. There are many code points that have the same CE
        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
        // Also, in case when there is no equivalent strength before an element, we have to actually
        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
        // before a is a primary difference.

//uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);

ch = CETable[3*invPos+2];

if((ch &  UCOL_INV_SIZEMASK) != 0) {
            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
            ch = conts[offset];
        }

*src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
        src->parsedToken.charsLen = 1;

// We got an UCA before. However, this might have been tailored.
        // example:
        // &\u30ca = \u306a
        // &[before 3]\u306a<<<\u306a|\u309d

// uint32_t key = (*newCharsLen << 24) | *charsOffset;
        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParse = src->source;

//sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
#endif

// here is how it should be. The situation such as &[before 1]a < x, should be
        // resolved exactly as if we wrote &a > x.
        // therefore, I don't really care if the UCA value before a has been changed.
        // However, I do care if the strength between my element and the previous element
        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
        // have to construct the base CE.

// if we found a tailored thing, we have to use the UCA value and construct
        // a new reset token with constructed name
        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
        // character to which we want to anchor is already tailored.
        // We need to construct a new token which will be the anchor
        // point
        //*(src->extraCurrent-1) = 0xFFFE;
        //*src->extraCurrent++ = (UChar)ch;
        // grab before
        src->parsedToken.charsOffset -= 10;
        src->parsedToken.charsLen += 10;
        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
        if(isContinuation(SecondCE)) {
            src->lh[src->resultLen].baseContCE = SecondCE;
        } else {
            src->lh[src->resultLen].baseContCE = 0;
        }
        src->lh[src->resultLen].nextCE = 0;
        src->lh[src->resultLen].nextContCE = 0;
        src->lh[src->resultLen].previousCE = 0;
        src->lh[src->resultLen].previousContCE = 0;

src->lh[src->resultLen].indirect = FALSE;

sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        //}
    }

return sourceToken;

}

uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
    UColToken *lastToken = NULL;
    const UChar *parseEnd = NULL;
    uint32_t expandNext = 0;
    UBool variableTop = FALSE;
    UBool top = FALSE;
    uint16_t specs = 0;
    UColTokListHeader *ListList = NULL;

src->parsedToken.strength = UCOL_TOK_UNSET;

ListList = src->lh;

if(U_FAILURE(*status)) {
        return 0;
    }

while(src->current < src->end) {
        src->parsedToken.prefixOffset = 0;

parseEnd = ucol_tok_parseNextToken(src,
            (UBool)(lastToken == NULL),
            parseError,
            status);

specs = src->parsedToken.flags;

variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
        top = ((specs & UCOL_TOK_TOP) != 0);

if(U_SUCCESS(*status) && parseEnd != NULL) {
            UColToken *sourceToken = NULL;
            //uint32_t key = 0;
            uint32_t lastStrength = UCOL_TOK_UNSET;

if(lastToken != NULL ) {
                lastStrength = lastToken->strength;
            }

//key = newCharsLen << 24 | charsOffset;
            UColToken key;
            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
            key.rulesToParse = src->source;

/*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
            sourceToken = (UColToken *)uhash_get(src->tailored, &key);

if(src->parsedToken.strength != UCOL_TOK_RESET) {
                if(lastToken == NULL) { /* this means that rules haven't started properly */
                    *status = U_INVALID_FORMAT_ERROR;
                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                    return 0;
                }
                /*  6 Otherwise (when relation != reset) */
                if(sourceToken == NULL) {
                    /* If sourceToken is null, create new one, */
                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
                    /* test for NULL */
                    if (sourceToken == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return 0;
                    }
                    sourceToken->rulesToParse = src->source;
                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;

sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);

sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
                    sourceToken->next = NULL;
                    sourceToken->previous = NULL;
                    sourceToken->noOfCEs = 0;
                    sourceToken->noOfExpCEs = 0;
                    // keep the flags around so that we know about before
                    sourceToken->flags = src->parsedToken.flags;
                    uhash_put(src->tailored, sourceToken, sourceToken, status);
                    if(U_FAILURE(*status)) {
                        return 0;
                    }
                } else {
                    /* we could have fished out a reset here */
                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
                        /* otherwise remove sourceToken from where it was. */
                        if(sourceToken->next != NULL) {
                            if(sourceToken->next->strength > sourceToken->strength) {
                                sourceToken->next->strength = sourceToken->strength;
                            }
                            sourceToken->next->previous = sourceToken->previous;
                        } else {
                            sourceToken->listHeader->last = sourceToken->previous;
                        }

if(sourceToken->previous != NULL) {
                            sourceToken->previous->next = sourceToken->next;
                        } else {
                            sourceToken->listHeader->first = sourceToken->next;
                        }
                        sourceToken->next = NULL;
                        sourceToken->previous = NULL;
                    }
                }

sourceToken->strength = src->parsedToken.strength;
                sourceToken->listHeader = lastToken->listHeader;

/*
                1.  Find the strongest strength in each list, and set strongestP and strongestN
                accordingly in the headers.
                */
                if(lastStrength == UCOL_TOK_RESET
                    || sourceToken->listHeader->first == 0) {
                        /* If LAST is a reset
                        insert sourceToken in the list. */
                        if(sourceToken->listHeader->first == 0) {
                            sourceToken->listHeader->first = sourceToken;
                            sourceToken->listHeader->last = sourceToken;
                        } else { /* we need to find a place for us */
                            /* and we'll get in front of the same strength */
                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
                                sourceToken->next = sourceToken->listHeader->first;
                                sourceToken->next->previous = sourceToken;
                                sourceToken->listHeader->first = sourceToken;
                                sourceToken->previous = NULL;
                            } else {
                                lastToken = sourceToken->listHeader->first;
                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                    lastToken = lastToken->next;
                                }
                                if(lastToken->next != NULL) {
                                    lastToken->next->previous = sourceToken;
                                } else {
                                    sourceToken->listHeader->last = sourceToken;
                                }
                                sourceToken->previous = lastToken;
                                sourceToken->next = lastToken->next;
                                lastToken->next = sourceToken;
                            }
                        }
                    } else {
                        /* Otherwise (when LAST is not a reset)
                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
                        otherwise insert before.
                        when inserting after or before, search to the next position with the same
                        strength in that direction. (This is called postpone insertion).         */
                        if(sourceToken != lastToken) {
                            if(lastToken->polarity == sourceToken->polarity) {
                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                    lastToken = lastToken->next;
                                }
                                sourceToken->previous = lastToken;
                                if(lastToken->next != NULL) {
                                    lastToken->next->previous = sourceToken;
                                } else {
                                    sourceToken->listHeader->last = sourceToken;
                                }

sourceToken->next = lastToken->next;
                                lastToken->next = sourceToken;
                            } else {
                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
                                    lastToken = lastToken->previous;
                                }
                                sourceToken->next = lastToken;
                                if(lastToken->previous != NULL) {
                                    lastToken->previous->next = sourceToken;
                                } else {
                                    sourceToken->listHeader->first = sourceToken;
                                }
                                sourceToken->previous = lastToken->previous;
                                lastToken->previous = sourceToken;
                            }
                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
                            if(lastStrength < sourceToken->strength) {
                                sourceToken->strength = lastStrength;
                            }
                        }
                    }

/* if the token was a variable top, we're gonna put it in */
                    if(variableTop == TRUE && src->varTop == NULL) {
                        variableTop = FALSE;
                        src->varTop = sourceToken;
                    }

// Treat the expansions.
                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
                    // (&abc * d * e <=> &ab * d / c * e / c)
                    // if both of them are in effect for a token, they are combined.

sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

if(expandNext != 0) {
                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
                            expandNext = 0;
                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
                            sourceToken->expansion = expandNext;
                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
                        }
                    }

// This is just for debugging purposes
                    if(sourceToken->expansion != 0) {
                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
                    } else {
                        sourceToken->debugExpansion = 0;
                    }
                    // if the previous token was a reset before, the strength of this
                    // token must match the strength of before. Otherwise we have an
                    // undefined situation.
                    // In other words, we currently have a cludge which we use to
                    // represent &a >> x. This is written as &[before 2]a << x.
                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
                        if(beforeStrength != sourceToken->strength) {
                            *status = U_INVALID_FORMAT_ERROR;
                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                            return 0;
                        }
                    }
            } else {
                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
                    /* if the previous token was also a reset, */
                    /*this means that we have two consecutive resets */
                    /* and we want to remove the previous one if empty*/
                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
                        src->resultLen--;
                    }
                }

if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
                    uint32_t searchCharsLen = src->parsedToken.charsLen;
                    while(searchCharsLen > 1 && sourceToken == NULL) {
                        searchCharsLen--;
                        //key = searchCharsLen << 24 | charsOffset;
                        UColToken key;
                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
                        key.rulesToParse = src->source;
                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
                    }
                    if(sourceToken != NULL) {
                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
                    }
                }

if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
                    if(top == FALSE) { /* there is no indirection */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
                                sourceToken = sourceToken->previous;
                            }
                            /* here, either we hit the strength or NULL */
                            if(sourceToken->strength == strength) {
                                if(sourceToken->previous != NULL) {
                                    sourceToken = sourceToken->previous;
                                } else { /* start of list */
                                    sourceToken = sourceToken->listHeader->reset;
                                }
                            } else { /* we hit NULL */
                                /* we should be doing the else part */
                                sourceToken = sourceToken->listHeader->reset;
                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                            }
                        } else {
                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                        }
                    } else { /* this is both before and indirection */
                        top = FALSE;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = TRUE;
                        /* we need to do slightly more work. we need to get the baseCE using the */
                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
                        /* in ucol_bld */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
                            uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
                            CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
                            SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
                        } else {
                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
                        }

ListList[src->resultLen].baseCE = CE;
                        ListList[src->resultLen].baseContCE = SecondCE;
                        ListList[src->resultLen].nextCE = 0;
                        ListList[src->resultLen].nextContCE = 0;

sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
                    }
                }

/*  5 If the relation is a reset:
                If sourceToken is null
                Create new list, create new sourceToken, make the baseCE from source, put
                the sourceToken in ListHeader of the new list */
                if(sourceToken == NULL) {
                    /*
                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
                    First convert all expansions into normal form. Examples:
                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
                    d * ... into &x * c/y * d * ...
                    Note: reset values can never have expansions, although they can cause the
                    very next item to have one. They may be contractions, if they are found
                    earlier in the list.
                    */
                    if(top == FALSE) {
                        collIterate s;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);

CE = ucol_getNextCE(src->UCA, &s, status);
                        UChar *expand = s.pos;
                        SecondCE = ucol_getNextCE(src->UCA, &s, status);

ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
                        if(isContinuation(SecondCE)) {
                            ListList[src->resultLen].baseContCE = SecondCE;
                        } else {
                            ListList[src->resultLen].baseContCE = 0;
                        }
                        ListList[src->resultLen].nextCE = 0;
                        ListList[src->resultLen].nextContCE = 0;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = FALSE;
                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
                    } else { /* top == TRUE */
                        /* just use the supplied values */
                        top = FALSE;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = TRUE;
                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;

sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

}
                } else { /* reset to something already in rules */
                    top = FALSE;
                }
            }
            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
            lastToken = sourceToken;
        } else {
            if(U_FAILURE(*status)) {
                return 0;
            }
        }
    }

if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
        src->resultLen--;
    }
    return src->resultLen;
}

/**
 * Prepares a token parser for the given tailoring rules.
 *
 * The function zeroes *src, pre-scans the raw rules for the "optimize" and
 * "suppressContractions" options (collecting their sets into src->copySet and
 * src->removeSet), normalizes the rules to NFD into a heap buffer owned by
 * src->source, opens the src->tailored hashtable, copies the UCA option set
 * into src->opts, allocates the list-header array src->lh and fills the
 * file-level ucolIndirectBoundaries table from the UCA constants.
 *
 * @param src         parser state to initialize; overwritten entirely.
 * @param rules       rule string, rulesLength code units long.
 * @param rulesLength number of UChars in rules.
 * @param UCA         collator whose options and constants are read.
 * @param status      in/out error code; nothing is done if it already
 *                    indicates failure. On failure the function returns
 *                    early, leaving *src partially filled; members that were
 *                    not reached stay zero, which is what
 *                    ucol_tok_closeTokenList tests before releasing them.
 */
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
    U_NAMESPACE_USE

    uint32_t nSize = 0;
    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
    if(U_FAILURE(*status)) {
        return;
    }

    // set everything to zero, so that we can clean up gracefully
    uprv_memset(src, 0, sizeof(UColTokenParser));

    // first we need to find options that don't like to be normalized,
    // like copy and remove...
    int32_t optionNumber = -1;
    const UChar *setStart = NULL;
    uint32_t i = 0;
    while(i < rulesLength) {
        if(rules[i] == 0x005B) { /* '[' opens an option */
            optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
            if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
                if(U_SUCCESS(*status)) {
                    if(src->copySet == NULL) {
                        src->copySet = newSet;
                    } else {
                        /* several [optimize ...] options accumulate into one set */
                        uset_addAll(src->copySet, newSet);
                        uset_close(newSet);
                    }
                } else {
                    return;
                }
            } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
                USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
                if(U_SUCCESS(*status)) {
                    if(src->removeSet == NULL) {
                        src->removeSet = newSet;
                    } else {
                        /* several [suppressContractions ...] options accumulate into one set */
                        uset_addAll(src->removeSet, newSet);
                        uset_close(newSet);
                    }
                } else {
                    return;
                }
            }
        }
        i++;
    }

    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
    /* test for NULL */
    if (src->source == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
    if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
        *status = U_ZERO_ERROR;
        /* Track the real capacity so that extraEnd below marks the true end of the buffer. */
        estimatedSize = nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
        /* Keep the old pointer until the reallocation succeeds; otherwise the
           original buffer would be lost (and leaked) on failure. */
        UChar *grown = (UChar *)uprv_realloc(src->source, estimatedSize*sizeof(UChar));
        /* test for NULL */
        if (grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->source = grown;
        /* match the zero-filled state of the first-attempt buffer */
        uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
    }
    src->current = src->source;
    src->end = src->source+nSize;
    src->sourceCurrent = src->source;
    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
    src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
    src->varTop = NULL;
    src->UCA = UCA;
    src->invUCA = ucol_initInverseUCA(status);
    src->parsedToken.charsLen = 0;
    src->parsedToken.charsOffset = 0;
    src->parsedToken.extensionLen = 0;
    src->parsedToken.extensionOffset = 0;
    src->parsedToken.prefixLen = 0;
    src->parsedToken.prefixOffset = 0;
    src->parsedToken.flags = 0;
    src->parsedToken.strength = UCOL_TOK_UNSET;
    src->buildCCTabFlag = FALSE;

    if(U_FAILURE(*status)) {
        return;
    }
    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
    if(U_FAILURE(*status)) {
        return;
    }
    /* the hashtable owns its values and releases them with uhash_freeBlock */
    uhash_setValueDeleter(src->tailored, uhash_freeBlock);

    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
    /* test for NULL */
    if (src->opts == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    /* start from the UCA's options; a private copy is kept in src->opts */
    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

    src->lh = 0;
    src->listCapacity = 1024;
    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
    //Test for NULL
    if (src->lh == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
    src->resultLen = 0;

    /* the constants block is located at byte offset UCAConsts inside the UCA image */
    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    // UCOL_RESET_TOP_VALUE
    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_PRIMARY_IGNORABLE
    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
    // UCOL_LAST_PRIMARY_IGNORABLE
    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
    // UCOL_FIRST_SECONDARY_IGNORABLE
    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
    // UCOL_LAST_SECONDARY_IGNORABLE
    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
    // UCOL_FIRST_TERTIARY_IGNORABLE
    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
    // UCOL_LAST_TERTIARY_IGNORABLE
    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
    // UCOL_FIRST_VARIABLE
    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
    // UCOL_LAST_VARIABLE
    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
    // UCOL_FIRST_NON_VARIABLE
    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
    // UCOL_LAST_NON_VARIABLE
    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_IMPLICIT
    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
    // UCOL_LAST_IMPLICIT
    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
    // UCOL_FIRST_TRAILING
    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
    // UCOL_LAST_TRAILING
    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
    /* the last-trailing entry gets an explicit limit: the special-primary lead byte moved into the top byte */
    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
}

/**
 * Releases everything a token parser owns: the copy/remove sets, the
 * tailored-token hashtable, the list-header array, the normalized rule
 * buffer and the option set.
 *
 * Each member is reset to NULL after it is released, so calling this
 * function again on the same parser is harmless instead of freeing the
 * same memory twice.
 *
 * @param src parser previously set up by ucol_tok_initTokenList; the
 *            structure itself is not freed.
 */
void ucol_tok_closeTokenList(UColTokenParser *src) {
    if(src->copySet != NULL) {
        uset_close(src->copySet);
        src->copySet = NULL;
    }
    if(src->removeSet != NULL) {
        uset_close(src->removeSet);
        src->removeSet = NULL;
    }
    if(src->tailored != NULL) {
        uhash_close(src->tailored);
        src->tailored = NULL;
    }
    if(src->lh != NULL) {
        uprv_free(src->lh);
        src->lh = NULL;
    }
    if(src->source != NULL) {
        uprv_free(src->source);
        src->source = NULL;
    }
    if(src->opts != NULL) {
        uprv_free(src->opts);
        src->opts = NULL;
    }
}

#endif /* #if !UCONFIG_NO_COLLATION */

/* C++ source | 1916 lines | 79 KB */

/*
*******************************************************************************
*
*   Copyright (C) 2001-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucol_tok.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created 02/22/2001
*   created by: Vladimir Weinstein
*
* This module reads a tailoring rule string and produces a list of
* tokens that will be turned into collation elements
*
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"

#include "ucol_tok.h"
#include "ucol_bld.h"
#include "cmemory.h"
#include "../common/util.h"

U_CDECL_BEGIN
/*
 * Hash function for the tailored-token hashtable.
 *
 * The key is a UColToken whose 'source' word packs the token length in the
 * top 8 bits and the offset into 'rulesToParse' in the low 24 bits. The
 * hash samples the token text with a stride of ((len - 32) / 32) + 1, so
 * tokens shorter than 64 code units are hashed in full and longer ones are
 * sampled. A null key hashes to 0.
 *
 * The accumulator is unsigned so that the multiply-and-add wraps with
 * defined behaviour; the sampled characters and the resulting bit pattern
 * are the same as with a wrapping signed accumulator.
 */
static int32_t U_CALLCONV
uhash_hashTokens(const UHashTok k)
{
    uint32_t hash = 0;
    const UColToken *key = (const UColToken *)k.pointer;
    if (key != 0) {
        const int32_t len = (key->source & 0xFF000000)>>24;
        const int32_t inc = ((len - 32) / 32) + 1;
        const UChar *text = (key->source & 0x00FFFFFF) + key->rulesToParse;

        /* index-based walk: never forms a pointer beyond the token's end */
        for (int32_t i = 0; i < len; i += inc) {
            hash = (hash * 37U) + text[i];
        }
    }
    return (int32_t)hash;
}

/*
 * Key comparator for the tailored-token hashtable.
 *
 * Both keys are UColToken pointers whose 'source' word packs the token
 * length (top 8 bits) and the offset into 'rulesToParse' (low 24 bits).
 * Returns TRUE when the keys are the same object, when their packed
 * 'source' words are identical, or when both tokens have the same length
 * and identical text. A key with source == 0 never equals a different key.
 *
 * Null keys are tolerated (as in uhash_hashTokens): two null keys are
 * equal, a null and a non-null key are not. Nothing is dereferenced and no
 * pointer arithmetic is done before those checks, and exactly 'length'
 * code units are compared, so no character outside a token is read.
 */
static UBool U_CALLCONV
uhash_compareTokens(const UHashTok key1, const UHashTok key2)
{
    const UColToken *p1 = (const UColToken *)key1.pointer;
    const UColToken *p2 = (const UColToken *)key2.pointer;

    if (p1 == p2) {
        return TRUE;
    }
    if (p1 == NULL || p2 == NULL) {
        return FALSE;
    }
    if (p1->source == 0 || p2->source == 0) {
        return FALSE;
    }

    const uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
    const uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
    if(s1L != s2L) {
        return FALSE;
    }
    /* same length and same offset: the packed words are identical */
    if(p1->source == p2->source) {
        return TRUE;
    }

    const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
    const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
    for (uint32_t i = 0; i < s1L; ++i) {
        if (s1[i] != s2[i]) {
            return FALSE;
        }
    }
    return TRUE;
}
U_CDECL_END

/*static inline void U_CALLCONV
uhash_freeBlockWrapper(void *obj) {
    uhash_freeBlock(obj);
}*/


/* Collation-element bounds for one indirect (symbolic) reset position. */
/* Entries are written by setIndirectBoundaries().                      */
typedef struct {
    uint32_t startCE;      /* first word of the starting CE (start[0]) */
    uint32_t startContCE;  /* second word of the starting CE (start[1]) */
    uint32_t limitCE;      /* first word of the limit CE (end[0]), 0 when no limit was given */
    uint32_t limitContCE;  /* second word of the limit CE (end[1]), 0 when no limit was given */
} indirectBoundaries;

/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism that allows resets on symbolic    */
/* values. It only works for resets, and indirect names cannot be        */
/* tailored. An indirect name can define either an anchor point or a     */
/* range. An anchor point behaves exactly as a code point in a reset     */
/* would, except that it cannot be tailored. A range (currently only     */
/* [top] is known to be one) explicitly sets the upper bound for         */
/* generated CEs, thus allowing better control over how many CEs can be  */
/* squeezed into the range without a performance penalty.                */
/* In that respect, [top] is used for tailoring locales that use CJK     */
/* characters. Other indirect values are currently a pure convenience:   */
/* they can be used to ensure that the CEs are always positioned in the  */
/* same place relative to a point with known properties (e.g. first      */
/* primary ignorable).                                                   */
/* The table is filled at run time: ucol_tok_initTokenList() calls       */
/* setIndirectBoundaries() for indices 0 through 14.                     */
static indirectBoundaries ucolIndirectBoundaries[15];
/*
static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_RESET_TOP_VALUE,               0,
UCOL_NEXT_TOP_VALUE,                0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
0,                                  0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE,     0,
0,                                  0 },
{ UCOL_LAST_SECONDARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE,      0,
0,                                  0 },
{ UCOL_LAST_TERTIARY_IGNORABLE,       0,
0,                                  0 },
{ UCOL_FIRST_VARIABLE,                0,
0,                                  0 },
{ UCOL_LAST_VARIABLE,                 0,
0,                                  0 },
{ UCOL_FIRST_NON_VARIABLE,            0,
0,                                  0 },
{ UCOL_LAST_NON_VARIABLE,             0,
0,                                  0 },
};
*/

/*
 * Stores the bounds of one indirect position in ucolIndirectBoundaries.
 *
 * indexR selects the table entry. 'start' must point at two words (CE and
 * continuation CE) that become the entry's start pair. 'end' may be null;
 * when given, its two words become the limit pair, otherwise the limit
 * pair is cleared to 0.
 *
 * TODO (from the original author): once values exist for all the
 * indirects, the table is going to be initialized here.
 */
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    indirectBoundaries *slot = &ucolIndirectBoundaries[indexR];

    slot->startCE = start[0];
    slot->startContCE = start[1];
    slot->limitCE = end ? end[0] : 0;
    slot->limitContCE = end ? end[1] : 0;
}


static inline
void syntaxError(const UChar* rules,
                 int32_t pos,
                 int32_t rulesLen,
                 UParseError* parseError)
{
    parseError->offset = pos;
    parseError->line = 0 ; /* we are not using line numbers */

    // for pre-context
    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    int32_t stop  = pos;

    u_memcpy(parseError->preContext,rules+start,stop-start);
    //null terminate the buffer
    parseError->preContext[stop-start] = 0;

    //for post-context
    start = pos+1;
    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
    rulesLen;

    if(start < stop) {
        u_memcpy(parseError->postContext,rules+start,stop-start);
        //null terminate the buffer
        parseError->postContext[stop-start]= 0;
    } else {
        parseError->postContext[0] = 0;
    }
}

static
void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    /*
     * Writes 'value' into the member of *opts that corresponds to 'attrib'.
     * Attributes without a matching member (including UCOL_ATTRIBUTE_COUNT)
     * leave *opts untouched.
     */
    if(attrib == UCOL_HIRAGANA_QUATERNARY_MODE) {
        opts->hiraganaQ = value;
    } else if(attrib == UCOL_FRENCH_COLLATION) {
        opts->frenchCollation = value;
    } else if(attrib == UCOL_ALTERNATE_HANDLING) {
        opts->alternateHandling = value;
    } else if(attrib == UCOL_CASE_FIRST) {
        opts->caseFirst = value;
    } else if(attrib == UCOL_CASE_LEVEL) {
        opts->caseLevel = value;
    } else if(attrib == UCOL_NORMALIZATION_MODE) {
        opts->normalizationMode = value;
    } else if(attrib == UCOL_STRENGTH) {
        opts->strength = value;
    } else if(attrib == UCOL_NUMERIC_COLLATION) {
        opts->numericCollation = value;
    }
}

/* Number of entries in the rulesOptions table. */
#define UTOK_OPTION_COUNT 20

/* Becomes TRUE once ucol_uprv_tok_initData() has run U_STRING_INIT for */
/* every string declared below.                                         */
static UBool didInit = FALSE;
/* Parsing can be strict or lenient. The original author's preference:  */
/* be lenient with the option arguments, and maybe even with options.   */

/* Suboption (argument) strings; the number is the string length. */
/* Arguments of "alternate". */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted",        7);

/* Arguments of "caseFirst", on/off switches, and level digits. */
U_STRING_DECL(suboption_02, "lower",          5);
U_STRING_DECL(suboption_03, "upper",          5);
U_STRING_DECL(suboption_04, "off",            3);
U_STRING_DECL(suboption_05, "on",             2);
U_STRING_DECL(suboption_06, "1",              1);
U_STRING_DECL(suboption_07, "2",              1);
U_STRING_DECL(suboption_08, "3",              1);
U_STRING_DECL(suboption_09, "4",              1);
U_STRING_DECL(suboption_10, "I",              1);

/* Arguments of "first" and "last". */
U_STRING_DECL(suboption_11, "primary",        7);
U_STRING_DECL(suboption_12, "secondary",      9);
U_STRING_DECL(suboption_13, "tertiary",       8);
U_STRING_DECL(suboption_14, "variable",       8);
U_STRING_DECL(suboption_15, "regular",        7);
U_STRING_DECL(suboption_16, "implicit",       8);
U_STRING_DECL(suboption_17, "trailing",       8);


/* Option names. The numbering here is independent of the order of the */
/* rulesOptions table, which refers to these strings by name.          */
U_STRING_DECL(option_00,    "undefined",      9);
U_STRING_DECL(option_01,    "rearrange",      9);
U_STRING_DECL(option_02,    "alternate",      9);
U_STRING_DECL(option_03,    "backwards",      9);
U_STRING_DECL(option_04,    "variable top",  12);
U_STRING_DECL(option_05,    "top",            3);
U_STRING_DECL(option_06,    "normalization", 13);
U_STRING_DECL(option_07,    "caseLevel",      9);
U_STRING_DECL(option_08,    "caseFirst",      9);
U_STRING_DECL(option_09,    "scriptOrder",   11);
U_STRING_DECL(option_10,    "charsetname",   11);
U_STRING_DECL(option_11,    "charset",        7);
U_STRING_DECL(option_12,    "before",         6);
U_STRING_DECL(option_13,    "hiraganaQ",      9);
U_STRING_DECL(option_14,    "strength",       8);
U_STRING_DECL(option_15,    "first",          5);
U_STRING_DECL(option_16,    "last",           4);
U_STRING_DECL(option_17,    "optimize",       8);
U_STRING_DECL(option_18,    "suppressContractions",         20);
U_STRING_DECL(option_19,    "numericOrdering",              15);


/*
[last variable] last variable value
[last primary ignorable] largest CE for primary ignorable
[last secondary ignorable] largest CE for secondary ignorable
[last tertiary ignorable] largest CE for tertiary ignorable
[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
*/


/* Suboption tables. Each entry lists a suboption string, its length and */
/* the attribute value associated with it. The tables are attached to    */
/* options in rulesOptions.                                              */

/* Arguments of "alternate": "non-ignorable" or "shifted". */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01,  7, UCOL_SHIFTED}
};

/* Arguments of "caseFirst": "lower", "upper" or "off". */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03,  5, UCOL_UPPER_FIRST},
    {suboption_04,  3, UCOL_OFF},
};

/* Shared "off"/"on" arguments (caseLevel, normalization, hiraganaQ, numericOrdering). */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* Argument of "backwards": only "2" is listed, and it maps to UCOL_ON. */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* Arguments of "before": the digits "1", "2", "3" select the strength. */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* Arguments of "strength": "1" to "4", plus "I" for identical. */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* Arguments of "first" and "last". Every entry carries UCOL_PRIMARY, so */
/* the value does not tell the entries apart.                            */
/* NOTE(review): presumably callers use the index of the matched entry   */
/* rather than the value - confirm against the option parser.            */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};

/* Symbolic names for the rule options. The enumerators are listed in the */
/* same order as the entries of rulesOptions (OPTION_ALTERNATE_HANDLING   */
/* is 0 and matches entry 00, OPTION_CHARSET is 19 and matches entry 19), */
/* so the two lists must be kept in step.                                 */
enum OptionNumber {
    /* options up to OPTION_NORMAL_OPTIONS_LIMIT have a collation attribute in rulesOptions */
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias for the last attribute-backed option; does not consume a new value */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    /* the remaining options carry UCOL_ATTRIBUTE_COUNT in rulesOptions */
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET
} ;

/* Table of the options recognized inside "[...]" in collation rules.     */
/* Each entry gives, in order: the option name, its length, the table of  */
/* accepted suboptions (NULL when there is none), the number of entries   */
/* in that table, and the collation attribute the option sets             */
/* (UCOL_ATTRIBUTE_COUNT when the option is not a plain attribute).       */
/* The entry order matches enum OptionNumber. "variable top" is listed    */
/* before "top", and "charsetname" before "charset", i.e. the longer name */
/* comes first where two names share a prefix or suffix.                  */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
    /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
    /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
    /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
    /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
    /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
    /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
    /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
    /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
};

static
int32_t u_strncmpNoCase(const UChar     *s1,
                        const UChar     *s2,
                        int32_t     n)
{
    /*
     * Compares at most n code units of s1 and s2 after lowercasing each
     * unit with u_tolower. Returns the difference of the first pair of
     * lowercased units that differ, or 0 when the first n units match, when
     * s1 ends (NUL) on a matching position, or when n is not positive.
     */
    for(; n > 0; --n, ++s1, ++s2) {
        const int32_t diff = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
        if(diff != 0 || *s1 == 0) {
            return diff;
        }
    }
    return 0;
}

static
void ucol_uprv_tok_initData() {
    /*
     * One-time setup of the option and suboption strings declared with
     * U_STRING_DECL. The didInit flag makes every call after the first a
     * no-op.
     */
    if(didInit) {
        return;
    }

    /* suboption (argument) strings */
    U_STRING_INIT(suboption_00, "non-ignorable", 13);
    U_STRING_INIT(suboption_01, "shifted",        7);

    U_STRING_INIT(suboption_02, "lower",          5);
    U_STRING_INIT(suboption_03, "upper",          5);
    U_STRING_INIT(suboption_04, "off",            3);
    U_STRING_INIT(suboption_05, "on",             2);

    U_STRING_INIT(suboption_06, "1",              1);
    U_STRING_INIT(suboption_07, "2",              1);
    U_STRING_INIT(suboption_08, "3",              1);
    U_STRING_INIT(suboption_09, "4",              1);
    U_STRING_INIT(suboption_10, "I",              1);

    U_STRING_INIT(suboption_11, "primary",        7);
    U_STRING_INIT(suboption_12, "secondary",      9);
    U_STRING_INIT(suboption_13, "tertiary",       8);
    U_STRING_INIT(suboption_14, "variable",       8);
    U_STRING_INIT(suboption_15, "regular",        7);
    U_STRING_INIT(suboption_16, "implicit",       8);
    U_STRING_INIT(suboption_17, "trailing",       8);

    /* option names */
    U_STRING_INIT(option_00, "undefined",      9);
    U_STRING_INIT(option_01, "rearrange",      9);
    U_STRING_INIT(option_02, "alternate",      9);
    U_STRING_INIT(option_03, "backwards",      9);
    U_STRING_INIT(option_04, "variable top",  12);
    U_STRING_INIT(option_05, "top",            3);
    U_STRING_INIT(option_06, "normalization", 13);
    U_STRING_INIT(option_07, "caseLevel",      9);
    U_STRING_INIT(option_08, "caseFirst",      9);
    U_STRING_INIT(option_09, "scriptOrder",   11);
    U_STRING_INIT(option_10, "charsetname",   11);
    U_STRING_INIT(option_11, "charset",        7);
    U_STRING_INIT(option_12, "before",         6);
    U_STRING_INIT(option_13, "hiraganaQ",      9);
    U_STRING_INIT(option_14, "strength",       8);
    U_STRING_INIT(option_15, "first",          5);
    U_STRING_INIT(option_16, "last",           4);
    U_STRING_INIT(option_17, "optimize",       8);
    U_STRING_INIT(option_18, "suppressContractions",         20);
    U_STRING_INIT(option_19, "numericOrdering",      15);

    didInit = TRUE;
}


// Reads one basic "[option suboption]" argument to set in the runtime collator.
// Used by data driven tests. Should not support build time options.
//
// start  - first code unit to look at; leading whitespace is skipped.
// end    - limit of the text; nothing at or beyond end is inspected by the
//          whitespace scans or the closing-bracket test in this function.
// attrib - receives rulesOptions[i].attr of the matched option.
// value  - receives the attrVal of the matched suboption.
// status - set to U_ILLEGAL_ARGUMENT_ERROR when the opening '[' is missing, when
//          no option or suboption matches, or when the closing ']' is missing.
//
// Returns a pointer just past the closing ']' on success. Returns NULL on
// error, and also (without touching status) when only whitespace remains
// before end.
U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar *start, const UChar *end,
                         UColAttribute *attrib, UColAttributeValue *value,
                         UErrorCode *status)
{
    uint32_t i = 0;
    int32_t j=0;
    UBool foundOption = FALSE;
    const UChar *optionArg = NULL;

    ucol_uprv_tok_initData();

    while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
        start++;
    }
    if(start >= end) {
        return NULL;
    }
    /* skip opening '[' */
    if(*start == 0x005b) {
        start++;
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
        return NULL;
    }

    while(i < UTOK_OPTION_COUNT) {
        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
            foundOption = TRUE;
            if(end - start > rulesOptions[i].optionLen) {
                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
                /* eat whitespace, but never walk past the caller supplied limit */
                while(optionArg < end && u_isWhitespace(*optionArg)) {
                    optionArg++;
                }
            }
            break;
        }
        i++;
    }

    if(!foundOption) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(optionArg) {
        for(j = 0; j<rulesOptions[i].subSize; j++) {
            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
                *attrib = rulesOptions[i].attr;
                *value = rulesOptions[i].subopts[j].attrVal;
                optionArg += rulesOptions[i].subopts[j].subLen;
                /* eat whitespace, again bounded by end */
                while(optionArg < end && u_isWhitespace(*optionArg)) {
                    optionArg++;
                }
                /* the closing ']' has to be inside [start, end) */
                if(optionArg < end && *optionArg == 0x005d) {
                    optionArg++;
                    return optionArg;
                } else {
                    *status = U_ILLEGAL_ARGUMENT_ERROR;
                    return NULL;
                }
            }
        }
    }
    /* option without an argument, or no suboption matched */
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
}

/*
 * Locates a UnicodeSet pattern that begins at or after start and opens it with
 * uset_openPattern().
 *
 * start  - where to begin looking for the opening '[' of the set.
 * end    - limit for the scan; the opening '[' and the balanced pattern must
 *          lie before it.
 * status - set to U_ILLEGAL_ARGUMENT_ERROR when start is NULL, when no '[' is
 *          found before end, when the brackets are not balanced before end, or
 *          when no ']' follows the pattern (the one that would close the
 *          enclosing option). uset_openPattern() receives the same status.
 *
 * Returns the result of uset_openPattern(), or NULL on the errors above.
 */
static
USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    if(start == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    while(start < end && *start != 0x005b) { /* advance while we find the first '[' */
        start++;
    }
    if(start >= end) {
        /* there is no '[' before the limit, so there is no set to read */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    // now we need to get a balanced set of '[]'. A set pattern can contain
    // nested sets, so the first ']' is not necessarily the one that closes it.
    int32_t noOpenBraces = 1;
    int32_t current = 1; // skip the opening brace
    while(start+current < end && noOpenBraces != 0) {
        if(start[current] == 0x005b) {
            noOpenBraces++;
        } else if(start[current] == 0x005D) { // closing brace
            noOpenBraces--;
        }
        current++;
    }

    // start[0..current) is now the candidate pattern; one more ']' has to follow it
    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    return uset_openPattern(start, current, status);
}

static
int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
    int32_t i = 0;
    ucol_uprv_tok_initData();

    while(u_isWhitespace(*start)) { /* eat whitespace */
        start++;
    }
    while(i < UTOK_OPTION_COUNT) {
        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
            if(end - start > rulesOptions[i].optionLen) {
                *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
                while(u_isWhitespace(**optionArg)) { /* eat whitespace */
                    (*optionArg)++;
                }
            }
            break;
        }
        i++;
    }
    if(i == UTOK_OPTION_COUNT) {
        i = -1; // didn't find an option
    }
    return i;
}


// Reads the option that starts at src->current (which points at the opening '[')
// and acts on it.
//
// Attribute style options (alternate, backwards, caseLevel, caseFirst,
// normalization, hiraganaQ, strength, numericOrdering) are written into
// src->opts through ucol_uprv_tok_setOptionInImage(). The other options are
// reported to the caller through flag bits in the return value:
//   UCOL_TOK_SUCCESS                     the option was understood
//   UCOL_TOK_VARIABLE_TOP                [variable top]
//   UCOL_TOK_TOP                         [top], [first ...], [last ...];
//                                        src->parsedToken.indirectIndex is set
//   (attrVal + 1) of the suboption       [before n]
//
// Some options take a UnicodeSet definition which duplicates the closing ']',
// for example: '[optimize [\uAC00-\uD7FF]]'. For those the balanced inner set is
// skipped first, so that the ']' found at the end is the one closing the option.
//
// On return src->current points at the closing ']' of the option. If no ']'
// exists before src->end, src->current is left on a valid position inside the
// rules, *status is set to U_ILLEGAL_ARGUMENT_ERROR (unless it already holds a
// failure) and 0 is returned.
//
// *status is set to U_ILLEGAL_ARGUMENT_ERROR for an unknown option or suboption
// and to U_UNSUPPORTED_ERROR for a known option that is not handled here; the
// return value is 0 in those cases.
static
uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    const UChar* start = src->current;
    int32_t i = 0;
    int32_t j=0;
    const UChar *optionArg = NULL;
    const UChar *closingBracket = NULL;

    uint8_t result = 0;

    start++; /*skip opening '['*/
    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    if(optionArg) {
        src->current = optionArg;
    }

    if(i < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        int32_t noOpenBraces = 1;
        switch(i) {
        case OPTION_ALTERNATE_HANDLING:
        case OPTION_FRENCH_COLLATION:
        case OPTION_CASE_LEVEL:
        case OPTION_CASE_FIRST:
        case OPTION_NORMALIZATION_MODE:
        case OPTION_HIRAGANA_QUATERNARY:
        case OPTION_STRENGTH:
        case OPTION_NUMERIC_COLLATION:
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
                        result =  UCOL_TOK_SUCCESS;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_VARIABLE_TOP:
            result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
            break;
        case OPTION_REARRANGE:
            result = UCOL_TOK_SUCCESS;
            break;
        case OPTION_BEFORE:
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        /* '+' binds tighter than '|'; the parentheses only make that visible */
                        result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
            /* index to this array will be src->parsedToken.indirectIndex*/
            src->parsedToken.indirectIndex = 0;
            result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
            break;
        case OPTION_FIRST:
        case OPTION_LAST: /* first, last */
            /* optionArg stays NULL when nothing follows the option name; do not compare against it */
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
                        // element of indirect boundaries is reserved for top.
                        src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
                        result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_OPTIMIZE:
        case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
            // we need to move end here
            src->current++; // skip opening brace
            while(src->current < src->end && noOpenBraces != 0) {
                if(*src->current == 0x005b) {
                    noOpenBraces++;
                } else if(*src->current == 0x005D) { // closing brace
                    noOpenBraces--;
                }
                src->current++;
            }
            result = UCOL_TOK_SUCCESS;
            break;
        default:
            *status = U_UNSUPPORTED_ERROR;
            break;
        }
    }

    /* move to the ']' that closes the option */
    closingBracket = u_memchr(src->current, 0x005d, src->end-src->current);
    if(closingBracket == NULL) {
        /* No closing bracket before src->end. Keep src->current pointing into the
           rules so that the caller can still compute an error offset from it. */
        if(U_SUCCESS(*status)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return 0;
    }
    src->current = closingBracket;
    return result;
}


/*
 * Appends len code units from stuff at src->extraCurrent and advances
 * src->extraCurrent by len.
 *
 * When the append would reach src->extraEnd, the buffer at src->source is
 * reallocated to at least twice its size (more if len needs it) and
 * src->current, src->extraCurrent, src->end, src->extraEnd, src->sourceCurrent
 * and src->source are rebased onto the new allocation.
 *
 * stuff may point into the rule buffer itself (the parser passes
 * src->current - n); it is rebased too when the buffer moves.
 *
 * status is set to U_MEMORY_ALLOCATION_ERROR when the reallocation fails. In
 * that case nothing is appended and src is left unchanged.
 */
inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    if(src->extraCurrent+len >= src->extraEnd) {
        /* reallocate */
        /* Take every offset before calling realloc: afterwards the old base pointer
           must not be used any more, not even for pointer arithmetic. */
        const int32_t currentOffset = (int32_t)(src->current - src->source);
        const int32_t extraOffset = (int32_t)(src->extraCurrent - src->source);
        const int32_t endOffset = (int32_t)(src->end - src->source);
        const int32_t sourceCurrentOffset = (int32_t)(src->sourceCurrent - src->source);
        const int32_t oldCapacity = (int32_t)(src->extraEnd - src->source);
        const UBool stuffInBuffer = (UBool)(stuff >= src->source && stuff < src->extraEnd);
        const int32_t stuffOffset = stuffInBuffer ? (int32_t)(stuff - src->source) : 0;

        /* double the buffer, but make sure that the new data really fits */
        int32_t newCapacity = oldCapacity * 2;
        if(newCapacity < extraOffset + len + 1) {
            newCapacity = extraOffset + len + 1;
        }

        UChar *newSrc = (UChar *)uprv_realloc(src->source, newCapacity*sizeof(UChar));
        if(newSrc == NULL) {
            /* the old buffer is still valid but full - writing would overflow it */
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->current = newSrc + currentOffset;
        src->extraCurrent = newSrc + extraOffset;
        src->end = newSrc + endOffset;
        src->extraEnd = newSrc + newCapacity;
        src->sourceCurrent = newSrc + sourceCurrentOffset;
        src->source = newSrc;
        if(stuffInBuffer) {
            /* the data to copy moved together with the buffer */
            stuff = newSrc + stuffOffset;
        }
    }
    if(len == 1) {
        *src->extraCurrent++ = *stuff;
    } else {
        uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
        src->extraCurrent += len;
    }
}

/*
 * Makes the parsed token refer to a synthetic character sequence appended to
 * the extra area: 0xFFFE followed by the high and low 16 bits of
 * ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE and, when
 * startContCE of that entry is not zero, the high and low 16 bits of
 * startContCE as well.
 *
 * src->parsedToken.charsOffset and charsLen (3 or 5) are set accordingly.
 * status is only handed on to ucol_tok_addToExtraCurrent().
 * Always returns TRUE.
 */
inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    const uint32_t boundaryIndex = src->parsedToken.indirectIndex;
    UChar encoded[5];
    uint32_t encodedLen = 3;

    /* the sequence is going to start where the extra area currently ends */
    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

    encoded[0] = 0xFFFE;
    encoded[1] = (UChar)(ucolIndirectBoundaries[boundaryIndex].startCE >> 16);
    encoded[2] = (UChar)(ucolIndirectBoundaries[boundaryIndex].startCE & 0xFFFF);

    if(ucolIndirectBoundaries[boundaryIndex].startContCE != 0) {
        /* there is a continuation CE - it takes two more code units */
        encoded[3] = (UChar)(ucolIndirectBoundaries[boundaryIndex].startContCE >> 16);
        encoded[4] = (UChar)(ucolIndirectBoundaries[boundaryIndex].startContCE & 0xFFFF);
        encodedLen = 5;
    }

    src->parsedToken.charsLen = encodedLen;
    ucol_tok_addToExtraCurrent(src, encoded, (int32_t)encodedLen, status);
    return TRUE;
}

/*
 * Returns TRUE when c is one of U+000A (LF), U+000C (FF), U+000D (CR),
 * U+0085 (NEL), U+2028 (LS) or U+2029 (PS), and FALSE for everything else.
 */
static UBool isCharNewLine(UChar c){
    if(c == 0x000A || c == 0x000C || c == 0x000D) {
        return TRUE;    /* LF, FF, CR */
    }
    if(c == 0x0085) {
        return TRUE;    /* NEL */
    }
    if(c == 0x2028 || c == 0x2029) {
        return TRUE;    /* LS, PS */
    }
    return FALSE;
}

/*
 * Scans the rules from src->current towards src->end and collects the next
 * token into src->parsedToken.
 *
 * A token consists of a strength (reset '&', or one of '=', ',', ';', '<',
 * '<<', '<<<'), the characters that follow it, an optional prefix (before '|')
 * and an optional expansion (after '/'). Scanning stops when the next strength
 * character is seen, when an indirect position or variable top option completes
 * the token, or when src->end is reached.
 *
 * Characters of a token are referenced as offset and length relative to
 * src->source. Text that cannot be referenced in place (quoted text, prefixes,
 * indirect positions) is copied to the extra area with
 * ucol_tok_addToExtraCurrent(), which may reallocate src->source.
 *
 * src          - parser state; current, parsedToken, opts and the extra area
 *                are updated.
 * startOfRules - if TRUE and the rules begin with a strength character instead
 *                of a reset, a reset to ucolIndirectBoundaries[5] is produced
 *                and the strength character is left for the next call.
 * parseError   - handed to syntaxError() on malformed input.
 * status       - set to U_INVALID_FORMAT_ERROR on malformed input.
 *
 * Returns src->current (the position where the next call continues) when a
 * token was read. Returns NULL on a syntax error, and also - without touching
 * status - when the end of the rules is reached before any strength was seen.
 */
U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser *src,
                        UBool startOfRules,
                        UParseError *parseError,
                        UErrorCode *status)
{
    /* parsing part */
    UBool variableTop = FALSE;
    UBool top = FALSE;
    UBool inChars = TRUE;       /* TRUE while reading the characters, FALSE after '/' (expansion) */
    UBool inQuote = FALSE;      /* TRUE between an opening and a closing apostrophe */
    UBool wasInQuote = FALSE;   /* TRUE once the token is being copied to the extra area */
    uint8_t before = 0;
    UBool isEscaped = FALSE;    /* TRUE when the previous character was a backslash */
    // TODO: replace these variables with src->parsedToken counterparts
    // no need to use them anymore since we have src->parsedToken.
    // Ideally, token parser would be a nice class... Once, when I have
    // more time (around 2020 probably).
    uint32_t newExtensionLen = 0;
    uint32_t extensionOffset = 0;
    uint32_t newStrength = UCOL_TOK_UNSET;
    UChar buff[10];

    src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
    src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
    src->parsedToken.indirectIndex = 0;

    while (src->current < src->end) {
        UChar ch = *(src->current);

        if (inQuote) {
            if (ch == 0x0027/*'\''*/) {
                inQuote = FALSE;
            } else {
                /* only count here; the copy to the extra area happens at the bottom of the loop */
                if ((src->parsedToken.charsLen == 0) || inChars) {
                    if(src->parsedToken.charsLen == 0) {
                        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                    }
                    src->parsedToken.charsLen++;
                } else {
                    if(newExtensionLen == 0) {
                        extensionOffset = (uint32_t)(src->extraCurrent - src->source);
                    }
                    newExtensionLen++;
                }
            }
        }else if(isEscaped){
            isEscaped =FALSE;
            if (newStrength == UCOL_TOK_UNSET) {
                *status = U_INVALID_FORMAT_ERROR;
                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                return NULL;
                // enabling rules to start with non-tokens a < b
                // newStrength = UCOL_TOK_RESET;
            }
            if(ch != 0x0000  && src->current != src->end) {
                if (inChars) {
                    if(src->parsedToken.charsLen == 0) {
                        src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
                    }
                    src->parsedToken.charsLen++;
                } else {
                    if(newExtensionLen == 0) {
                        extensionOffset = (uint32_t)(src->current - src->source);
                    }
                    newExtensionLen++;
                }
            }
        }else {
            if(!uprv_isRuleWhiteSpace(ch)) {
                /* Sets the strength for this entry */
                switch (ch) {
                case 0x003D/*'='*/ :
                    /* a second strength character starts the next token */
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

                    /* if we start with strength, we'll reset to top */
                    if(startOfRules == TRUE) {
                        src->parsedToken.indirectIndex = 5;
                        top = ucol_tok_doSetTop(src, status);
                        newStrength = UCOL_TOK_RESET;
                        goto EndOfLoop;
                    }
                    newStrength = UCOL_IDENTICAL;
                    break;

                case 0x002C/*','*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

                    /* if we start with strength, we'll reset to top */
                    if(startOfRules == TRUE) {
                        src->parsedToken.indirectIndex = 5;
                        top = ucol_tok_doSetTop(src, status);
                        newStrength = UCOL_TOK_RESET;
                        goto EndOfLoop;
                    }
                    newStrength = UCOL_TERTIARY;
                    break;

                case  0x003B/*';'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

                    /* if we start with strength, we'll reset to top */
                    if(startOfRules == TRUE) {
                        src->parsedToken.indirectIndex = 5;
                        top = ucol_tok_doSetTop(src, status);
                        newStrength = UCOL_TOK_RESET;
                        goto EndOfLoop;
                    }
                    newStrength = UCOL_SECONDARY;
                    break;

                case 0x003C/*'<'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        goto EndOfLoop;
                    }

                    /* if we start with strength, we'll reset to top */
                    if(startOfRules == TRUE) {
                        src->parsedToken.indirectIndex = 5;
                        top = ucol_tok_doSetTop(src, status);
                        newStrength = UCOL_TOK_RESET;
                        goto EndOfLoop;
                    }
                    /* before this, do a scan to verify whether this is */
                    /* another strength */
                    if(*(src->current+1) == 0x003C) {
                        src->current++;
                        if(*(src->current+1) == 0x003C) {
                            src->current++; /* three in a row! */
                            newStrength = UCOL_TERTIARY;
                        } else { /* two in a row */
                            newStrength = UCOL_SECONDARY;
                        }
                    } else { /* just one */
                        newStrength = UCOL_PRIMARY;
                    }
                    break;

                case 0x0026/*'&'*/:
                    if (newStrength != UCOL_TOK_UNSET) {
                        /**/
                        goto EndOfLoop;
                    }

                    newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
                    break;

                case 0x005b/*'['*/:
                    /* options - read an option, analyze it */
                    if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
                        uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
                        if(U_SUCCESS(*status)) {
                            if(result & UCOL_TOK_TOP) {
                                /* indirect positions are only allowed right after a reset */
                                if(newStrength == UCOL_TOK_RESET) {
                                    top = ucol_tok_doSetTop(src, status);
                                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
                                        src->parsedToken.charsLen+=2;
                                        buff[0] = 0x002d;
                                        buff[1] = before;
                                        ucol_tok_addToExtraCurrent(src, buff, 2, status);
                                    }

                                    src->current++;
                                    goto EndOfLoop;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                                }
                            } else if(result & UCOL_TOK_VARIABLE_TOP) {
                                /* variable top needs a relation in front of it, not a reset */
                                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
                                    variableTop = TRUE;
                                    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                                    src->parsedToken.charsLen = 1;
                                    buff[0] = 0xFFFF;
                                    ucol_tok_addToExtraCurrent(src, buff, 1, status);
                                    src->current++;
                                    goto EndOfLoop;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                                }
                            } else if (result & UCOL_TOK_BEFORE){
                                if(newStrength == UCOL_TOK_RESET) {
                                    before = result & UCOL_TOK_BEFORE;
                                } else {
                                    *status = U_INVALID_FORMAT_ERROR;
                                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);

                                }
                            }
                        } else {
                            *status = U_INVALID_FORMAT_ERROR;
                            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                            return NULL;
                        }
                    }
                    break;
                case 0x0021/*! skip java thai modifier reordering*/:
                    break;
                case 0x002F/*'/'*/:
                    wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
                    inChars = FALSE; /* we're now processing expansion */
                    break;
                case 0x005C /* back slash for escaped chars */:
                    isEscaped = TRUE;
                    break;
                    /* found a quote, we're gonna start copying */
                case 0x0027/*'\''*/:
                    if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                        // enabling rules to start with a non-token character a < b
                        // newStrength = UCOL_TOK_RESET;
                    }

                    inQuote = TRUE;

                    /* what has been read in place so far is moved to the extra area, so that it
                       stays contiguous with the quoted text that is going to be copied there */
                    if(inChars) { /* we're doing characters */
                        if(wasInQuote == FALSE) {
                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                        }
                        if (src->parsedToken.charsLen != 0) {
                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
                        }
                        src->parsedToken.charsLen++;
                    } else { /* we're doing an expansion */
                        if(wasInQuote == FALSE) {
                            extensionOffset = (uint32_t)(src->extraCurrent - src->source);
                        }
                        if (newExtensionLen != 0) {
                            ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
                        }
                        newExtensionLen++;
                    }

                    wasInQuote = TRUE;

                    ch = *(++(src->current));
                    if(ch == 0x0027) { /* copy the double quote */
                        ucol_tok_addToExtraCurrent(src, &ch, 1, status);
                        inQuote = FALSE;
                    }
                    break;

                    /* '@' is french only if the strength is not currently set */
                    /* if it is, it's just a regular character in collation rules */
                case 0x0040/*'@'*/:
                    if (newStrength == UCOL_TOK_UNSET) {
                        src->opts->frenchCollation = UCOL_ON;
                        break;
                    }
                    /* NOTE(review): there is no break here, so a '@' that follows a strength
                       is handled by the '|' code below - confirm that this is intended */

                case 0x007C /*|*/: /* this means we have actually been reading prefix part */
                    // we want to store read characters to the prefix part and continue reading
                    // the characters (proper way would be to restart reading the chars, but in
                    // that case we would have to complicate the token hasher, which I do not
                    // intend to play with. Instead, we will do prefixes when prefixes are due
                    // (before adding the elements).
                    src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
                    src->parsedToken.prefixLen = src->parsedToken.charsLen;

                    if(inChars) { /* we're doing characters */
                        if(wasInQuote == FALSE) {
                            src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
                        }
                        if (src->parsedToken.charsLen != 0) {
                            ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
                        }
                        src->parsedToken.charsLen++;
                    }

                    wasInQuote = TRUE;

                    do {
                        ch = *(++(src->current));
                        // skip whitespace between '|' and the character
                    } while (uprv_isRuleWhiteSpace(ch));
                    break;

                    //charsOffset = 0;
                    //newCharsLen = 0;
                    //break; // We want to store the whole prefix/character sequence. If we break
                    // the '|' is going to get lost.
                case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
                    do {
                        src->current++;
                        if(src->current >= src->end) {
                            /* The comment runs to the end of the rules. Stop here instead of
                               searching for a newline beyond src->end. */
                            goto EndOfLoop;
                        }
                        ch = *(src->current);
                    } while (!isCharNewLine(ch));

                    break;
                default:
                    if (newStrength == UCOL_TOK_UNSET) {
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                    }

                    if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                        return NULL;
                    }

                    /* a NUL as the very last code unit is not part of the token */
                    if(ch == 0x0000 && src->current+1 == src->end) {
                        break;
                    }

                    if (inChars) {
                        if(src->parsedToken.charsLen == 0) {
                            src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
                        }
                        src->parsedToken.charsLen++;
                    } else {
                        if(newExtensionLen == 0) {
                            extensionOffset = (uint32_t)(src->current - src->source);
                        }
                        newExtensionLen++;
                    }

                    break;
                }
            }
        }

        /* once the token lives in the extra area every further character of it is copied
           there too: everything inside quotes, and non-whitespace outside of them */
        if(wasInQuote) {
            if(ch != 0x27) {
                if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
                    ucol_tok_addToExtraCurrent(src, &ch, 1, status);
                }
            }
        }

        src->current++;
    }

EndOfLoop:
    wasInQuote = FALSE;
    if (newStrength == UCOL_TOK_UNSET) {
        /* nothing but whitespace, comments or options was left */
        return NULL;
    }

    /* a strength without any characters is an error, unless it is an indirect position */
    if (src->parsedToken.charsLen == 0 && top == FALSE) {
        syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    src->parsedToken.strength = newStrength;
    src->parsedToken.extensionOffset = extensionOffset;
    src->parsedToken.extensionLen = newExtensionLen;
    src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;

    return src->current;
}

/*
Processing Description
1 Build a ListList. Each list has a header, which contains two lists (positive
and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
reset may be null.
2 As you process, you keep a LAST pointer that points to the last token you
handled.
*/

/*
 * Creates the reset token for the token that was just parsed
 * (src->parsedToken) and starts a new token list for it in
 * src->lh[src->resultLen].
 *
 * src        - parser state; lh, listCapacity and resultLen are updated and the
 *              new token is put into the src->tailored hash table.
 * expand     - if not NULL and the reset consists of more than one code unit,
 *              the reset is cut at this position (a pointer into src->source):
 *              the part before it stays the reset, the rest is reported through
 *              expandNext.
 * expandNext - receives (length << 24 | offset) of the part that was cut off,
 *              or 0 when nothing was cut off.
 * parseError - handed to syntaxError() when the reset has a prefix.
 * status     - U_MEMORY_ALLOCATION_ERROR when memory runs out,
 *              U_INVALID_FORMAT_ERROR when the reset has a prefix; also handed
 *              to uhash_put().
 *
 * Returns the new token, or NULL on the errors listed above.
 */
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
                                      UParseError *parseError, UErrorCode *status)
{
    if(src->resultLen == src->listCapacity) {
        // Unfortunately, this won't work, as we store addresses of lhs in token
        // NOTE(review): every token keeps &src->lh[n] in its listHeader member (see
        // below), so tokens created earlier point into the old array if realloc
        // moves it. That is not solved here.
        //
        // Grow through a temporary: if the allocation fails, src->lh and
        // src->listCapacity still describe the old array, which is still valid.
        UColTokListHeader *grown = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*2*sizeof(UColTokListHeader));
        if(grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        src->lh = grown;
        src->listCapacity *= 2;
    }
    /* do the reset thing */
    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
    /* test for NULL */
    if (sourceToken == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    sourceToken->rulesToParse = src->source;
    /* length goes into the top 8 bits, the offset into src->source into the lower 24 */
    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

    // keep the flags around so that we know about before
    sourceToken->flags = src->parsedToken.flags;

    if(src->parsedToken.prefixOffset != 0) {
        // this is a syntax error
        *status = U_INVALID_FORMAT_ERROR;
        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
        uprv_free(sourceToken);
        return 0;
    } else {
        sourceToken->prefix = 0;
    }

    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
    sourceToken->strength = UCOL_TOK_RESET;
    sourceToken->next = NULL;
    sourceToken->previous = NULL;
    sourceToken->noOfCEs = 0;
    sourceToken->noOfExpCEs = 0;
    sourceToken->listHeader = &src->lh[src->resultLen];

    /* the new list is empty, it only has its reset */
    src->lh[src->resultLen].first = NULL;
    src->lh[src->resultLen].last = NULL;

    src->lh[src->resultLen].reset = sourceToken;

    /*
    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
    First convert all expansions into normal form. Examples:
    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
    d * ... into &x * c/y * d * ...
    Note: reset values can never have expansions, although they can cause the
    very next item to have one. They may be contractions, if they are found
    earlier in the list.
    */
    *expandNext = 0;
    if(expand != NULL) {
        /* check to see if there is an expansion */
        if(src->parsedToken.charsLen > 1) {
            uint32_t resetCharsOffset;
            resetCharsOffset = (uint32_t)(expand - src->source);
            /* the reset keeps the code units in front of expand ... */
            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
            /* ... and the remaining ones are reported to the caller */
            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
        }
    }

    src->resultLen++;

    uhash_put(src->tailored, sourceToken, sourceToken, status);

    return sourceToken;
}

static
inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }
    /* this is a virgin before - we need to fish the anchor from the UCA */
    collIterate s;
    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
    uint32_t CE, SecondCE;
    uint32_t invPos;
    if(sourceToken != NULL) {
        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
    } else {
        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
    }

    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
    baseContCE = ucol_getNextCE(src->UCA, &s, status);
    if(baseContCE == UCOL_NO_MORE_CES) {
        baseContCE = 0;
    }


    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    uint32_t ch = 0;
    uint32_t expandNext = 0;
    UColToken key;

    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
        ch = uprv_uca_getCodePointFromRaw(raw-1);
        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
        CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
        SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;

        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
        *src->extraCurrent++ = 0xFFFE;
        *src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsLen++;

        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParse = src->source;

        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);

        if(sourceToken == NULL) {
            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
            if(isContinuation(SecondCE)) {
                src->lh[src->resultLen].baseContCE = SecondCE;
            } else {
                src->lh[src->resultLen].baseContCE = 0;
            }
            src->lh[src->resultLen].nextCE = 0;
            src->lh[src->resultLen].nextContCE = 0;
            src->lh[src->resultLen].previousCE = 0;
            src->lh[src->resultLen].previousContCE = 0;

            src->lh[src->resultLen].indirect = FALSE;

            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        }

    } else {
        invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

        // we got the previous CE. Now we need to see if the difference between
        // the two CEs is really of the requested strength.
        // if it's a bigger difference (we asked for secondary and got primary), we
        // need to modify the CE.
        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
            // adjust the strength
            // now we are in the situation where our baseCE should actually be modified in
            // order to get the CE in the right position.
            if(strength == UCOL_SECONDARY) {
                CE = baseCE - 0x0200;
            } else { // strength == UCOL_TERTIARY
                CE = baseCE - 0x02;
            }
            if(baseContCE) {
                if(strength == UCOL_SECONDARY) {
                    SecondCE = baseContCE - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    SecondCE = baseContCE - 0x02;
                }
            }
        }

#if 0
        // the code below relies on getting a code point from the inverse table, in order to be
        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
        // 1. There are many code points that have the same CE
        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
        // Also, in case when there is no equivalent strength before an element, we have to actually
        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
        // before a is a primary difference.

        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);


        ch = CETable[3*invPos+2];

        if((ch &  UCOL_INV_SIZEMASK) != 0) {
            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
            ch = conts[offset];
        }

        *src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
        src->parsedToken.charsLen = 1;

        // We got an UCA before. However, this might have been tailored.
        // example:
        // &\u30ca = \u306a
        // &[before 3]\u306a<<<\u306a|\u309d


        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParse = src->source;

        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
#endif

        // here is how it should be. The situation such as &[before 1]a < x, should be
        // resolved exactly as if we wrote &a > x.
        // therefore, I don't really care if the UCA value before a has been changed.
        // However, I do care if the strength between my element and the previous element
        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
        // have to construct the base CE.



        // if we found a tailored thing, we have to use the UCA value and construct
        // a new reset token with constructed name
        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
        // character to which we want to anchor is already tailored.
        // We need to construct a new token which will be the anchor
        // point
        //*(src->extraCurrent-1) = 0xFFFE;
        //*src->extraCurrent++ = (UChar)ch;
        // grab before
        src->parsedToken.charsOffset -= 10;
        src->parsedToken.charsLen += 10;
        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
        if(isContinuation(SecondCE)) {
            src->lh[src->resultLen].baseContCE = SecondCE;
        } else {
            src->lh[src->resultLen].baseContCE = 0;
        }
        src->lh[src->resultLen].nextCE = 0;
        src->lh[src->resultLen].nextContCE = 0;
        src->lh[src->resultLen].previousCE = 0;
        src->lh[src->resultLen].previousContCE = 0;

        src->lh[src->resultLen].indirect = FALSE;

        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        //}
    }

    return sourceToken;

}

/**
 * Walks the rule string held by the parser one token at a time (via
 * ucol_tok_parseNextToken) and links every parsed token into the token
 * lists kept in src->lh.
 *
 * For a non-reset relation the token is looked up in src->tailored (created
 * and registered there if missing, unlinked from its old position if it was
 * already in a list) and then inserted into the list of the preceding token.
 * For a reset the function either reuses a token already present in the
 * tailoring or fills in the next list header and creates a new reset token
 * through ucol_tok_initAReset / getVirginBefore.
 *
 * @param src        parser state; src->current .. src->end is consumed and
 *                   src->lh, src->resultLen, src->tailored, src->varTop and
 *                   src->extraCurrent may be updated.
 * @param parseError passed to the parsing helpers and to syntaxError() when
 *                   the rules are malformed.
 * @param status     ICU error code; nothing is done if it already indicates
 *                   a failure. Set to U_INVALID_FORMAT_ERROR or
 *                   U_MEMORY_ALLOCATION_ERROR by this function on error.
 * @return src->resultLen after dropping a trailing list that has no first
 *         token, or 0 on any failure.
 */
uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
    UColToken *lastToken = NULL;
    const UChar *parseEnd = NULL;
    uint32_t expandNext = 0;
    UBool variableTop = FALSE;
    UBool top = FALSE;
    uint16_t specs = 0;
    UColTokListHeader *ListList = NULL;

    src->parsedToken.strength = UCOL_TOK_UNSET;

    ListList = src->lh;

    if(U_FAILURE(*status)) {
        return 0;
    }

    while(src->current < src->end) {
        src->parsedToken.prefixOffset = 0;

        // the second argument tells the parser whether this is the very first token
        parseEnd = ucol_tok_parseNextToken(src,
            (UBool)(lastToken == NULL),
            parseError,
            status);

        specs = src->parsedToken.flags;


        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
        top = ((specs & UCOL_TOK_TOP) != 0);

        if(U_SUCCESS(*status) && parseEnd != NULL) {
            UColToken *sourceToken = NULL;
            //uint32_t key = 0;
            uint32_t lastStrength = UCOL_TOK_UNSET;

            if(lastToken != NULL ) {
                lastStrength = lastToken->strength;
            }

            //key = newCharsLen << 24 | charsOffset;
            // Lookup key: length in the top 8 bits, offset into src->source in
            // the low 24 bits. This is the layout that uhash_hashTokens and
            // uhash_compareTokens decode.
            UColToken key;
            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
            key.rulesToParse = src->source;

            /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
            sourceToken = (UColToken *)uhash_get(src->tailored, &key);

            if(src->parsedToken.strength != UCOL_TOK_RESET) {
                if(lastToken == NULL) { /* this means that rules haven't started properly */
                    *status = U_INVALID_FORMAT_ERROR;
                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                    return 0;
                }
                /*  6 Otherwise (when relation != reset) */
                if(sourceToken == NULL) {
                    /* If sourceToken is null, create new one, */
                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
                    /* test for NULL */
                    if (sourceToken == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return 0;
                    }
                    sourceToken->rulesToParse = src->source;
                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;

                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);

                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
                    sourceToken->next = NULL;
                    sourceToken->previous = NULL;
                    sourceToken->noOfCEs = 0;
                    sourceToken->noOfExpCEs = 0;
                    // keep the flags around so that we know about before
                    sourceToken->flags = src->parsedToken.flags;
                    // the token serves as both key and value in the table
                    uhash_put(src->tailored, sourceToken, sourceToken, status);
                    if(U_FAILURE(*status)) {
                        return 0;
                    }
                } else {
                    /* we could have fished out a reset here */
                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
                        /* otherwise remove sourceToken from where it was. */
                        if(sourceToken->next != NULL) {
                            // the successor takes over the removed token's
                            // strength when that value is numerically smaller
                            if(sourceToken->next->strength > sourceToken->strength) {
                                sourceToken->next->strength = sourceToken->strength;
                            }
                            sourceToken->next->previous = sourceToken->previous;
                        } else {
                            sourceToken->listHeader->last = sourceToken->previous;
                        }

                        if(sourceToken->previous != NULL) {
                            sourceToken->previous->next = sourceToken->next;
                        } else {
                            sourceToken->listHeader->first = sourceToken->next;
                        }
                        sourceToken->next = NULL;
                        sourceToken->previous = NULL;
                    }
                }

                sourceToken->strength = src->parsedToken.strength;
                sourceToken->listHeader = lastToken->listHeader;

                /*
                1.  Find the strongest strength in each list, and set strongestP and strongestN
                accordingly in the headers.
                */
                if(lastStrength == UCOL_TOK_RESET
                    || sourceToken->listHeader->first == 0) {
                        /* If LAST is a reset
                        insert sourceToken in the list. */
                        if(sourceToken->listHeader->first == 0) {
                            sourceToken->listHeader->first = sourceToken;
                            sourceToken->listHeader->last = sourceToken;
                        } else { /* we need to find a place for us */
                            /* and we'll get in front of the same strength */
                            if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
                                sourceToken->next = sourceToken->listHeader->first;
                                sourceToken->next->previous = sourceToken;
                                sourceToken->listHeader->first = sourceToken;
                                sourceToken->previous = NULL;
                            } else {
                                // lastToken is reused as the scan cursor from here on
                                lastToken = sourceToken->listHeader->first;
                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                    lastToken = lastToken->next;
                                }
                                if(lastToken->next != NULL) {
                                    lastToken->next->previous = sourceToken;
                                } else {
                                    sourceToken->listHeader->last = sourceToken;
                                }
                                sourceToken->previous = lastToken;
                                sourceToken->next = lastToken->next;
                                lastToken->next = sourceToken;
                            }
                        }
                    } else {
                        /* Otherwise (when LAST is not a reset)
                        if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
                        otherwise insert before.
                        when inserting after or before, search to the next position with the same
                        strength in that direction. (This is called postpone insertion).         */
                        if(sourceToken != lastToken) {
                            if(lastToken->polarity == sourceToken->polarity) {
                                while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                    lastToken = lastToken->next;
                                }
                                sourceToken->previous = lastToken;
                                if(lastToken->next != NULL) {
                                    lastToken->next->previous = sourceToken;
                                } else {
                                    sourceToken->listHeader->last = sourceToken;
                                }

                                sourceToken->next = lastToken->next;
                                lastToken->next = sourceToken;
                            } else {
                                while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
                                    lastToken = lastToken->previous;
                                }
                                sourceToken->next = lastToken;
                                if(lastToken->previous != NULL) {
                                    lastToken->previous->next = sourceToken;
                                } else {
                                    sourceToken->listHeader->first = sourceToken;
                                }
                                sourceToken->previous = lastToken->previous;
                                lastToken->previous = sourceToken;
                            }
                        } else { /* repeated one thing twice in rules, stay with the stronger strength */
                            if(lastStrength < sourceToken->strength) {
                                sourceToken->strength = lastStrength;
                            }
                        }
                    }

                    /* if the token was a variable top, we're gonna put it in */
                    /* (only the first such token is recorded in src->varTop) */
                    if(variableTop == TRUE && src->varTop == NULL) {
                        variableTop = FALSE;
                        src->varTop = sourceToken;
                    }

                    // Treat the expansions.
                    // There are two types of expansions: explicit (x / y) and reset based propagating expansions
                    // (&abc * d * e <=> &ab * d / c * e / c)
                    // if both of them are in effect for a token, they are combined.

                    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

                    if(expandNext != 0) {
                        if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
                            expandNext = 0;
                        } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
                            sourceToken->expansion = expandNext;
                        } else { /* there is both explicit and implicit expansion. We need to make a combination */
                            // The implicit part followed by the explicit part is copied
                            // to src->extraCurrent and the token is pointed at that copy.
                            // NOTE(review): nothing in this function checks these writes
                            // against src->extraEnd - confirm the buffer is guaranteed
                            // to be large enough.
                            uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
                            uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
                            sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
                            src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
                        }
                    }

                    // This is just for debugging purposes
                    if(sourceToken->expansion != 0) {
                        sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
                    } else {
                        sourceToken->debugExpansion = 0;
                    }
                    // if the previous token was a reset before, the strength of this
                    // token must match the strength of before. Otherwise we have an
                    // undefined situation.
                    // In other words, we currently have a kludge which we use to
                    // represent &a >> x. This is written as &[before 2]a << x.
                    // NOTE(review): lastToken may have been advanced by the insertion
                    // scans above, so it is not necessarily the token that preceded
                    // sourceToken in the rules - confirm this is intended.
                    if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
                        uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
                        if(beforeStrength != sourceToken->strength) {
                            *status = U_INVALID_FORMAT_ERROR;
                            syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                            return 0;
                        }
                    }
            } else {
                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
                    /* if the previous token was also a reset, */
                    /*this means that we have two consecutive resets */
                    /* and we want to remove the previous one if empty*/
                    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
                        src->resultLen--;
                    }
                }

                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
                    // retry the lookup with successively shorter prefixes of the reset string
                    uint32_t searchCharsLen = src->parsedToken.charsLen;
                    while(searchCharsLen > 1 && sourceToken == NULL) {
                        searchCharsLen--;
                        //key = searchCharsLen << 24 | charsOffset;
                        UColToken key;
                        key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
                        key.rulesToParse = src->source;
                        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
                    }
                    if(sourceToken != NULL) {
                        // the unmatched tail of the reset string becomes the implicit
                        // expansion (length << 24 | offset) for the tokens that follow
                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
                    }
                }

                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
                    if(top == FALSE) { /* there is no indirection */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
                                sourceToken = sourceToken->previous;
                            }
                            /* here, either we hit the strength or NULL */
                            if(sourceToken->strength == strength) {
                                if(sourceToken->previous != NULL) {
                                    sourceToken = sourceToken->previous;
                                } else { /* start of list */
                                    sourceToken = sourceToken->listHeader->reset;
                                }
                            } else { /* we hit NULL */
                                /* we should be doing the else part */
                                sourceToken = sourceToken->listHeader->reset;
                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                            }
                        } else {
                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                        }
                    } else { /* this is both before and indirection */
                        top = FALSE;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = TRUE;
                        /* we need to do slightly more work. we need to get the baseCE using the */
                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
                        /* in ucol_bld */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
                            // combine the primary bits of the CE and its continuation,
                            // step back one raw value, then split the result again
                            uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
                            CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
                            SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
                        } else {
                            /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
                        }

                        ListList[src->resultLen].baseCE = CE;
                        ListList[src->resultLen].baseContCE = SecondCE;
                        ListList[src->resultLen].nextCE = 0;
                        ListList[src->resultLen].nextContCE = 0;

                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
                    }
                }


                /*  5 If the relation is a reset:
                If sourceToken is null
                Create new list, create new sourceToken, make the baseCE from source, put
                the sourceToken in ListHeader of the new list */
                if(sourceToken == NULL) {
                    /*
                    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
                    First convert all expansions into normal form. Examples:
                    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
                    d * ... into &x * c/y * d * ...
                    Note: reset values can never have expansions, although they can cause the
                    very next item to have one. They may be contractions, if they are found
                    earlier in the list.
                    */
                    if(top == FALSE) {
                        collIterate s;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);

                        CE = ucol_getNextCE(src->UCA, &s, status);
                        // iterator position after the first CE; handed to ucol_tok_initAReset below
                        UChar *expand = s.pos;
                        SecondCE = ucol_getNextCE(src->UCA, &s, status);

                        ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
                        if(isContinuation(SecondCE)) {
                            ListList[src->resultLen].baseContCE = SecondCE;
                        } else {
                            ListList[src->resultLen].baseContCE = 0;
                        }
                        ListList[src->resultLen].nextCE = 0;
                        ListList[src->resultLen].nextContCE = 0;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = FALSE;
                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
                    } else { /* top == TRUE */
                        /* just use the supplied values */
                        top = FALSE;
                        ListList[src->resultLen].previousCE = 0;
                        ListList[src->resultLen].previousContCE = 0;
                        ListList[src->resultLen].indirect = TRUE;
                        ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
                        ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
                        ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;

                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

                    }
                } else { /* reset to something already in rules */
                    top = FALSE;
                }
            }
            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
            lastToken = sourceToken;
        } else {
            // parseEnd == NULL without a failure simply continues the loop
            if(U_FAILURE(*status)) {
                return 0;
            }
        }
    }

    // drop a trailing list that never received a token
    if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
        src->resultLen--;
    }
    return src->resultLen;
}

/**
 * Prepares a token parser for a tailoring rule string.
 *
 * The parser structure is zeroed first so that ucol_tok_closeTokenList can
 * always be called afterwards, even when this function returns early.
 * The rules are then scanned for the "optimize" and "suppress contractions"
 * options (collected into src->copySet / src->removeSet), NFD-normalized into
 * a freshly allocated buffer (src->source), and the hash table, option set,
 * list headers and indirect boundary table are set up.
 *
 * @param src         parser state to initialize; all previous content is
 *                    overwritten.
 * @param rules       rule string (not required to be NUL-terminated).
 * @param rulesLength number of UChars in rules.
 * @param UCA         collator whose options and constants are used.
 * @param status      ICU error code; nothing is done if it already indicates
 *                    a failure. Set to U_MEMORY_ALLOCATION_ERROR when an
 *                    allocation fails.
 */
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
    U_NAMESPACE_USE

    uint32_t nSize = 0;
    // Capacity of src->source in UChars; updated below if the buffer must grow.
    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
    if(U_FAILURE(*status)) {
        return;
    }

    // set everything to zero, so that we can clean up gracefully
    uprv_memset(src, 0, sizeof(UColTokenParser));

    // first we need to find options that don't like to be normalized,
    // like copy and remove...
    int32_t optionNumber = -1;
    const UChar *setStart = NULL;
    for(uint32_t i = 0; i < rulesLength; i++) {
        if(rules[i] != 0x005B) { // only '[' can start an option
            continue;
        }
        optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);

        // pick the set that this option contributes to, if any
        USet **target = NULL;
        if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
            target = &src->copySet;
        } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
            target = &src->removeSet;
        }
        if(target == NULL) {
            continue;
        }

        USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
        if(U_FAILURE(*status)) {
            return;
        }
        if(*target == NULL) {
            // first occurrence: the parser takes ownership of the set
            *target = newSet;
        } else {
            // later occurrences are merged into the existing set
            uset_addAll(*target, newSet);
            uset_close(newSet);
        }
    }

    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
    /* test for NULL */
    if (src->source == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
    // Grow when the result did not fit, and also when it fits exactly: in
    // that case there is no room for the terminating zero and extraCurrent
    // (end+1) would point past the allocation.
    if(nSize >= estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
        *status = U_ZERO_ERROR;
        uint32_t grownSize = nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
        // Keep the old pointer until the reallocation is known to have
        // succeeded; on failure it stays in src->source so that
        // ucol_tok_closeTokenList can still release it.
        UChar *grown = (UChar *)uprv_realloc(src->source, grownSize*sizeof(UChar));
        /* test for NULL */
        if (grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->source = grown;
        // remember the real capacity so that extraEnd is computed correctly
        estimatedSize = grownSize;
        uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
    }
    src->current = src->source;
    src->end = src->source+nSize;
    src->sourceCurrent = src->source;
    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
    src->extraEnd = src->source+estimatedSize; // one past the end of the allocated buffer
    src->varTop = NULL;
    src->UCA = UCA;
    src->invUCA = ucol_initInverseUCA(status);
    src->parsedToken.charsLen = 0;
    src->parsedToken.charsOffset = 0;
    src->parsedToken.extensionLen = 0;
    src->parsedToken.extensionOffset = 0;
    src->parsedToken.prefixLen = 0;
    src->parsedToken.prefixOffset = 0;
    src->parsedToken.flags = 0;
    src->parsedToken.strength = UCOL_TOK_UNSET;
    src->buildCCTabFlag = FALSE;

    if(U_FAILURE(*status)) {
        return;
    }
    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
    if(U_FAILURE(*status)) {
        return;
    }
    // the table owns its values (the tokens) and frees them on close
    uhash_setValueDeleter(src->tailored, uhash_freeBlock);

    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
    /* test for NULL */
    if (src->opts == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // start from a private copy of the base collator's options
    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

    src->listCapacity = 1024;
    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
    //Test for NULL
    if (src->lh == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
    src->resultLen = 0;

    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    // UCOL_RESET_TOP_VALUE
    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_PRIMARY_IGNORABLE
    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
    // UCOL_LAST_PRIMARY_IGNORABLE
    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
    // UCOL_FIRST_SECONDARY_IGNORABLE
    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
    // UCOL_LAST_SECONDARY_IGNORABLE
    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
    // UCOL_FIRST_TERTIARY_IGNORABLE
    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
    // UCOL_LAST_TERTIARY_IGNORABLE
    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
    // UCOL_FIRST_VARIABLE
    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
    // UCOL_LAST_VARIABLE
    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
    // UCOL_FIRST_NON_VARIABLE
    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
    // UCOL_LAST_NON_VARIABLE
    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_IMPLICIT
    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
    // UCOL_LAST_IMPLICIT
    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
    // UCOL_FIRST_TRAILING
    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
    // UCOL_LAST_TRAILING
    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
    // the limit of the last-trailing boundary is set explicitly afterwards
    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
}


/**
 * Releases everything owned by a token parser: the copy/remove sets, the
 * tailored-token hash table, the list headers, the normalized rule buffer
 * and the option set.
 *
 * Each owned pointer is reset to NULL after it is released, so calling this
 * function again on the same structure is harmless. A NULL src is ignored.
 *
 * @param src parser state to clean up; the structure itself is not freed.
 */
void ucol_tok_closeTokenList(UColTokenParser *src) {
    if(src == NULL) {
        return;
    }
    if(src->copySet != NULL) {
        uset_close(src->copySet);
        src->copySet = NULL;
    }
    if(src->removeSet != NULL) {
        uset_close(src->removeSet);
        src->removeSet = NULL;
    }
    if(src->tailored != NULL) {
        uhash_close(src->tailored);
        src->tailored = NULL;
    }
    if(src->lh != NULL) {
        uprv_free(src->lh);
        src->lh = NULL;
    }
    if(src->source != NULL) {
        uprv_free(src->source);
        src->source = NULL;
    }
    if(src->opts != NULL) {
        uprv_free(src->opts);
        src->opts = NULL;
    }
}

#endif /* #if !UCONFIG_NO_COLLATION */

登录后可以享受更多权益