/*
* Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "CharacterClassConstructor.h"
#if ENABLE(WREC)
#include "pcre_internal.h"
#include <wtf/ASCIICType.h>
using namespace WTF;
namespace JSC { namespace WREC {
void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
{
unsigned pos = 0;
unsigned range = matches.size();
// binary chop, find position to insert char.
while (range) {
unsigned index = range >> 1;
int val = matches[pos+index] - ch;
if (!val)
return;
else if (val > 0)
range = index;
else {
pos += (index+1);
range -= (index+1);
}
}
if (pos == matches.size())
matches.append(ch);
else
matches.insert(pos, ch);
}
void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
{
unsigned end = ranges.size();
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
for (unsigned i = 0; i < end; ++i) {
// does the new range fall before the current position in the array
if (hi < ranges[i].begin) {
// optional optimization: concatenate appending ranges? - may not be worthwhile.
if (hi == (ranges[i].begin - 1)) {
ranges[i].begin = lo;
return;
}
CharacterRange r = {lo, hi};
ranges.insert(i, r);
return;
}
// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
// end of the last range they concatenate, which is just as good.
if (lo <= (ranges[i].end + 1)) {
// found an intersect! we'll replace this entry in the array.
ranges[i].begin = std::min(ranges[i].begin, lo);
ranges[i].end = std::max(ranges[i].end, hi);
// now check if the new range can subsume any subsequent ranges.
unsigned next = i+1;
// each iteration of the loop we will either remove something from the list, or break the loop.
while (next < ranges.size()) {
if (ranges[next].begin <= (ranges[i].end + 1)) {
// the next entry now overlaps / concatenates this one.
ranges[i].end = std::max(ranges[i].end, ranges[next].end);
ranges.remove(next);
} else
break;
}
return;
}
}
// CharacterRange comes after all existing ranges.
CharacterRange r = {lo, hi};
ranges.append(r);
}
void CharacterClassConstructor::put(UChar ch)
{
// Parsing a regular expression like [a-z], we start in an initial empty state:
// ((m_charBuffer == -1) && !m_isPendingDash)
// When buffer the 'a' sice it may be (and is in this case) part of a range:
// ((m_charBuffer != -1) && !m_isPendingDash)
// Having parsed the hyphen we then record that the dash is also pending:
// ((m_charBuffer != -1) && m_isPendingDash)
// The next change will always take us back to the initial state - either because
// a complete range has been parsed (such as [a-z]), or because a flush is forced,
// due to an early end in the regexp ([a-]), or a character class escape being added
// ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
if (m_charBuffer != -1) {
if (m_isPendingDash) {
// EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
UChar lo = m_charBuffer;
UChar hi = ch;
// Reset back to the inital state.
m_charBuffer = -1;
m_isPendingDash = false;
// This is an error, detected lazily. Do not proceed.
if (lo > hi) {
m_isUpsideDown = true;
return;
}
if (lo <= 0x7f) {
char asciiLo = lo;
char asciiHi = std::min(hi, (UChar)0x7f);
addSortedRange(m_ranges, lo, asciiHi);
if (m_isCaseInsensitive) {
if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
if ((asciiLo <= 'z') && (asciiHi >= 'a'))
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
if (hi >= 0x80) {
UChar unicodeCurr = std::max(lo, (UChar)0x80);
addSortedRange(m_rangesUnicode, unicodeCurr, hi);
if (m_isCaseInsensitive) {
// we're going to scan along, updating the start of the range
while (unicodeCurr <= hi) {
// Spin forwards over any characters that don't have two cases.
for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
// if this was the last character in the range, we're done.
if (unicodeCurr == hi)
return;
}
// if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
UChar rangeStart = unicodeCurr;
UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
// If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,
// and if it's other case value is one greater then the othercase value for the current last
// character included in the range, we can include next into the range.
while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
// increment unicodeCurr; it points to the end of the range.
// increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
++unicodeCurr;
++otherCurr;
}
// otherChar is the last in the range of other case chars, calculate offset to get back to the start.
addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
// unicodeCurr has been added, move on to the next char.
++unicodeCurr;
}
}
}
} else if (ch == '-')
// EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
m_isPendingDash = true;
else {
// EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
flush();
m_charBuffer = ch;
}
} else
// EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
// (the hyphen not treated as a special character in this case, same handling for any char).
m_charBuffer = ch;
}
// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
// array (either ascii or unicode).
// If the pattern is case insensitive we add entries for both cases.
void CharacterClassConstructor::flush()
{
if (m_charBuffer != -1) {
if (m_charBuffer <= 0x7f) {
if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
addSorted(m_matches, toASCIIUpper(m_charBuffer));
addSorted(m_matches, m_charBuffer);
if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
addSorted(m_matches, toASCIILower(m_charBuffer));
} else {
addSorted(m_matchesUnicode, m_charBuffer);
if (m_isCaseInsensitive) {
int other = jsc_pcre_ucp_othercase(m_charBuffer);
if (other != -1)
addSorted(m_matchesUnicode, other);
}
}
m_charBuffer = -1;
}
if (m_isPendingDash) {
addSorted(m_matches, '-');
m_isPendingDash = false;
}
}
void CharacterClassConstructor::append(const CharacterClass& other)
{
// [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
// Need to check the spec, really, but think this matches PCRE behaviour.
flush();
if (other.numMatches) {
for (size_t i = 0; i < other.numMatches; ++i)
addSorted(m_matches, other.matches[i]);
}
if (other.numRanges) {
for (size_t i = 0; i < other.numRanges; ++i)
addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
}
if (other.numMatchesUnicode) {
for (size_t i = 0; i < other.numMatchesUnicode; ++i)
addSorted(m_matchesUnicode, other.matchesUnicode[i]);
}
if (other.numRangesUnicode) {
for (size_t i = 0; i < other.numRangesUnicode; ++i)
addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
}
}
} } // namespace JSC::WREC
#endif // ENABLE(WREC)