// PreloadScanner.cpp - Android Community - https://www.androidos.net.cn/

/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "PreloadScanner.h"

#include "AtomicString.h"
#include "CachedCSSStyleSheet.h"
#include "CachedImage.h"
#include "CachedResource.h"
#include "CachedResourceClient.h"
#include "CachedScript.h"
#include "CSSHelper.h"
#include "CString.h"
#include "DocLoader.h"
#include "Document.h"
#include "Frame.h"
#include "FrameLoader.h"
#include "HTMLLinkElement.h"
#include "HTMLNames.h"
#include <wtf/CurrentTime.h>
#include <wtf/unicode/Unicode.h>

#ifdef __GNUC__
// The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
#include "HTMLEntityNames.c"
#else
// Not inlined for non-GCC compilers
// One entry of the HTML named-entity table: the entity name and the code
// point it stands for.
struct Entity {
    const char* name;
    int code;
};
// Looks up the entity whose name is the first len characters of str. Callers
// in this file treat a null result as "no such entity". The obsolete
// `register` storage class is intentionally omitted from the parameters: it
// was removed in C++17 and is not part of the function's type, so this
// declaration still matches a definition that spells it out.
const struct Entity* findEntity(const char* str, unsigned int len);
#endif

#define PRELOAD_DEBUG 0

using namespace WTF;

namespace WebCore {
    
using namespace HTMLNames;
    
// Builds an idle scanner attached to doc; the in-progress flag starts out
// false and no time has been accounted yet.
PreloadScanner::PreloadScanner(Document* doc)
    : m_inProgress(false)
    , m_timeUsed(0)
    , m_bodySeen(false)
    , m_document(doc)
{
#if PRELOAD_DEBUG
    const CString documentURL = m_document->url().string().latin1();
    printf("CREATING PRELOAD SCANNER FOR %s\n", documentURL.data());
#endif
}
    
// Nothing to release; in PRELOAD_DEBUG builds this logs the document URL and
// the accumulated scanning time.
PreloadScanner::~PreloadScanner()
{
#if PRELOAD_DEBUG
    const CString documentURL = m_document->url().string().latin1();
    printf("DELETING PRELOAD SCANNER FOR %s\n", documentURL.data());
    printf("TOTAL TIME USED %.4fs\n", m_timeUsed);
#endif
}
    
// Opens a scanning session: all tokenizer state is reset first, then the
// scanner is flagged as in progress. Must not be called while a session is
// already open.
void PreloadScanner::begin()
{
    ASSERT(!m_inProgress);

    reset();
    m_inProgress = true;
}
    
// Closes the scanning session opened by begin(). State is left untouched;
// it is only cleared by the next begin().
void PreloadScanner::end()
{
    ASSERT(m_inProgress);

    m_inProgress = false;
}

// Returns every piece of per-session state to its initial value. m_bodySeen,
// m_timeUsed and m_inProgress are deliberately not touched here.
void PreloadScanner::reset()
{
    // Buffered input that was never consumed.
    m_source.clear();

    // HTML tokenizer position.
    m_state = Data;
    m_contentModel = PCDATA;
    m_escape = false;
    m_commentPos = 0;

    // Tag currently being assembled.
    m_closeTag = false;
    m_lastStartTag = AtomicString();
    m_tagName.clear();
    m_attributeName.clear();
    m_attributeValue.clear();

    // Preload request collected from the current tag's attributes.
    m_linkIsStyleSheet = false;
    m_urlToLoad = String();
    m_charset = String();

    // Ring buffer of recently seen characters.
    clearLastCharacters();
    m_lastCharacterIndex = 0;

    // CSS @import scanner.
    m_cssRule.clear();
    m_cssRuleValue.clear();
    m_cssState = CSSInitial;
}
    
// True once the document already has a body element or this scanner has
// itself emitted a <body> start tag.
bool PreloadScanner::scanningBody() const
{
    if (m_document->body())
        return true;
    return m_bodySeen;
}
    
// Feeds another chunk of markup to the tokenizer and adds the elapsed time
// to the running total kept in m_timeUsed.
void PreloadScanner::write(const SegmentedString& source)
{
    const double began = currentTime();
    tokenize(source);
    const double finished = currentTime();

    m_timeUsed += finished - began;
}
    
// Whitespace as far as this scanner is concerned: space, LF, CR and TAB.
static inline bool isWhitespace(UChar c)
{
    switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
        return true;
    default:
        return false;
    }
}
    
// Zeroes every slot of the recent-characters ring buffer.
inline void PreloadScanner::clearLastCharacters()
{
    for (unsigned slot = 0; slot < lastCharactersBufferSize; ++slot)
        m_lastCharacters[slot] = 0;
}
    
// Stores c in the next ring-buffer slot, wrapping around at the end, and
// makes that slot the most recent one.
inline void PreloadScanner::rememberCharacter(UChar c)
{
    const unsigned nextSlot = (m_lastCharacterIndex + 1) % lastCharactersBufferSize;
    m_lastCharacters[nextSlot] = c;
    m_lastCharacterIndex = nextSlot;
}
    
// Reports whether the count most recently remembered characters equal
// chars[0..count-1]. The comparison walks backwards: the newest remembered
// character is checked against chars[count - 1], and so on. chars must hold
// at least count characters.
inline bool PreloadScanner::lastCharactersMatch(const char* chars, unsigned count) const
{
    unsigned ringIndex = m_lastCharacterIndex;
    for (unsigned remaining = count; remaining; --remaining) {
        if (m_lastCharacters[ringIndex] != chars[remaining - 1])
            return false;
        // Step to the previous slot, wrapping from 0 to the last slot.
        ringIndex = ringIndex ? ringIndex - 1 : lastCharactersBufferSize - 1;
    }
    return true;
}
    
// Maps a numeric character reference to the code point to emit: zero, values
// above U+10FFFF and surrogates become U+FFFD, everything else passes through.
static inline unsigned legalEntityFor(unsigned value)
{
    // FIXME There is a table for more exceptions in the HTML5 specification.
    const unsigned replacementCharacter = 0xFFFD;
    if (!value)
        return replacementCharacter;
    if (value > 0x10FFFF)
        return replacementCharacter;
    const bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;
    return isSurrogate ? replacementCharacter : value;
}
    
// Tries to read a character reference from source. As called from tokenize(),
// source is positioned on the character that follows the '&'.
//
// Returns the referenced code point, or 0 when the text is not a character
// reference; in the 0 case everything that was read has been put back into
// source. If the input ends before a decision can be made, every character
// read so far is put back, notEnoughCharacters is set to true and 0 is
// returned so the caller can retry once more input has arrived.
unsigned PreloadScanner::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
{
    enum EntityState {
        Initial,
        NumberType,
        MaybeHex,
        Hex,
        Decimal,
        Named
    };
    // One past the largest Unicode code point. Numeric references saturate
    // here so that an arbitrarily long digit run cannot wrap around the
    // unsigned accumulator and come back as a legal-looking code point;
    // legalEntityFor() maps anything above 0x10FFFF to U+FFFD.
    const unsigned numericOverflow = 0x110000;

    EntityState entityState = Initial;
    unsigned result = 0;
    Vector<UChar, 10> seenChars;
    Vector<char, 10> entityName;
    
    while (!source.isEmpty()) {
        UChar cc = *source;
        seenChars.append(cc);
        switch (entityState) {
        case Initial:
            if (isWhitespace(cc) || cc == '<' || cc == '&')
                return 0;
            else if (cc == '#') 
                entityState = NumberType;
            else if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
                entityName.append(cc);
                entityState = Named;
            } else
                return 0;
            break;
        case NumberType:
            if (cc == 'x' || cc == 'X')
                entityState = MaybeHex;
            else if (cc >= '0' && cc <= '9') {
                entityState = Decimal;
                result = cc - '0';
            } else {
                // "&#" followed by something that is not a number: restore the '#'.
                source.push('#');
                return 0;
            }
            break;
        case MaybeHex:
            if (cc >= '0' && cc <= '9')
                result = cc - '0';
            else if (cc >= 'a' && cc <= 'f')
                result = 10 + cc - 'a';
            else if (cc >= 'A' && cc <= 'F')
                result = 10 + cc - 'A';
            else {
                // "&#x" without a hex digit: restore the 'x'/'X' and the '#'.
                source.push(seenChars[1]);
                source.push('#');
                return 0;
            }
            entityState = Hex;
            break;
        case Hex:
            if (cc >= '0' && cc <= '9')
                result = result * 16 + cc - '0';
            else if (cc >= 'a' && cc <= 'f')
                result = result * 16 + 10 + cc - 'a';
            else if (cc >= 'A' && cc <= 'F')
                result = result * 16 + 10 + cc - 'A';
            else if (cc == ';') {
                source.advance();
                return legalEntityFor(result);
            } else 
                return legalEntityFor(result);
            // Only reached after a digit was accumulated; see numericOverflow.
            if (result > numericOverflow)
                result = numericOverflow;
            break;
        case Decimal:
            if (cc >= '0' && cc <= '9')
                result = result * 10 + cc - '0';
            else if (cc == ';') {
                source.advance();
                return legalEntityFor(result);
            } else
                return legalEntityFor(result);
            // Only reached after a digit was accumulated; see numericOverflow.
            if (result > numericOverflow)
                result = numericOverflow;
            break;               
        case Named:
            // This is the attribute only version, generic version matches somewhat differently
            while (entityName.size() <= 8) {
                if (cc == ';') {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity) {
                        source.advance();
                        return entity->code;
                    }
                    break;
                }
                if (!(cc >= 'a' && cc <= 'z') && !(cc >= 'A' && cc <= 'Z') && !(cc >= '0' && cc <= '9')) {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity)
                        return entity->code;
                    break;
                }
                entityName.append(cc);
                source.advance();
                if (source.isEmpty())
                    goto outOfCharacters;
                cc = *source;
                seenChars.append(cc);
            }
            // No entity matched. The last entry of seenChars is still the
            // current character of source, so only the ones before it are
            // put back.
            if (seenChars.size() == 2)
                source.push(seenChars[0]);
            else if (seenChars.size() == 3) {
                source.push(seenChars[1]);
                source.push(seenChars[0]);
            } else
                source.prepend(SegmentedString(String(seenChars.data(), seenChars.size() - 1)));
            return 0;
        }
        source.advance();
    }
outOfCharacters:
    notEnoughCharacters = true;
    source.prepend(SegmentedString(String(seenChars.data(), seenChars.size())));
    return 0;
}

// Appends source to the pending input and runs the simplified tokenizer over
// it until the input is exhausted. All tokenizer state lives in members, so a
// token may span several calls: whenever a state runs out of characters the
// function simply returns and resumes from m_state on the next call.
//
// Attribute values are handed to processAttribute(), complete tags to
// emitTag(), and character data to emitCharacter().
void PreloadScanner::tokenize(const SegmentedString& source)
{
    ASSERT(m_inProgress);
    
    m_source.append(source);

    // This is a simplified HTML5 Tokenizer
    // http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
    while (!m_source.isEmpty()) {
        UChar cc = *m_source;
        switch (m_state) {
        case Data:
            while (1) {
                rememberCharacter(cc);
                if (cc == '&') {
                    if (m_contentModel == PCDATA || m_contentModel == RCDATA) {
                        m_state = EntityData;
                        break;
                    }
                } else if (cc == '-') {
                    // Inside RCDATA/CDATA an opening "<!--" turns on the escape
                    // flag, which suppresses tag recognition until "-->".
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape) {
                        if (lastCharactersMatch("<!--", 4))
                            m_escape = true;
                    }
                } else if (cc == '<') {
                    // Start of a tag, unless we are inside an escaped section
                    // (or in PLAINTEXT, where nothing is markup any more).
                    if (m_contentModel == PCDATA || ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape)) {
                        m_state = TagOpen;
                        break;
                    }
                } else if (cc == '>') {
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && m_escape) {
                        if (lastCharactersMatch("-->", 3))
                            m_escape = false;
                    }
                }
                emitCharacter(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case EntityData:
            // should try to consume the entity but we only care about entities in attributes
            m_state = Data;
            break;
        case TagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                if (cc == '/')
                    m_state = CloseTagOpen;
                else {
                    m_state = Data;
                    continue;
                }
            } else if (m_contentModel == PCDATA) {
                if (cc == '!')
                    m_state = MarkupDeclarationOpen;
                else if (cc == '/')
                    m_state = CloseTagOpen;
                else if (cc >= 'A' && cc <= 'Z') {
                    m_tagName.clear();
                    m_charset = String();
                    m_tagName.append(cc + 0x20);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc >= 'a' && cc <= 'z') {
                    m_tagName.clear();
                    m_charset = String();
                    m_tagName.append(cc);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc == '>') {
                    m_state = Data;
                } else if (cc == '?') {
                    m_state = BogusComment;
                } else {
                    m_state = Data;
                    continue;
                }
            }
            break;
        case CloseTagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                if (!m_lastStartTag.length()) {
                    m_state = Data;
                    continue;
                }
                // Need the whole tag name plus one delimiter to decide; wait
                // for more input otherwise.
                if (m_source.length() < m_lastStartTag.length() + 1)
                    return;
                Vector<UChar> tmpString;
                UChar tmpChar = 0;
                bool match = true;
                for (unsigned n = 0; n < m_lastStartTag.length() + 1; n++) {
                    tmpChar = Unicode::toLower(*m_source);
                    if (n < m_lastStartTag.length() && tmpChar != m_lastStartTag[n])
                        match = false;
                    tmpString.append(tmpChar);
                    m_source.advance();
                }
                // The look-ahead characters are put back before continuing.
                m_source.prepend(SegmentedString(String(tmpString.data(), tmpString.size())));
                if (!match || (!isWhitespace(tmpChar) && tmpChar != '>' && tmpChar != '/')) {
                    m_state = Data;
                    continue;
                }
            }
            if (cc >= 'A' && cc <= 'Z') {
                m_tagName.clear();
                m_charset = String();
                m_tagName.append(cc + 0x20);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc >= 'a' && cc <= 'z') {
                m_tagName.clear();
                m_charset = String();
                m_tagName.append(cc);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc == '>') {
                m_state = Data;
            } else
                m_state = BogusComment;
            break;
        case TagName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                }
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z')
                    m_tagName.append(cc + 0x20);
                else
                    m_tagName.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case BeforeAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/')
                ;
            else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case AttributeName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = AfterAttributeName;
                    break;
                }
                if (cc == '=') {
                    m_state = BeforeAttributeValue;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                } 
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z')
                    m_attributeName.append(cc + 0x20);
                else
                    m_attributeName.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AfterAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '=')
                m_state = BeforeAttributeValue; 
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/')
                m_state = BeforeAttributeName;
            else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case BeforeAttributeValue:
            if (isWhitespace(cc))
                ;
            else if (cc == '"')
                m_state = AttributeValueDoubleQuoted;
            else if (cc == '&') {
                // Reprocess the '&' in the unquoted state so it is treated as an entity.
                m_state = AttributeValueUnquoted;
                continue;
            } else if (cc == '\'')
                m_state = AttributeValueSingleQuoted;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else {
                m_attributeValue.append(cc);
                m_state = AttributeValueUnquoted;
            }
            break;
        case AttributeValueDoubleQuoted:
            while (1) {
                if (cc == '"') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                } 
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AttributeValueSingleQuoted:
            while (1) {
                if (cc == '\'') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                } 
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AttributeValueUnquoted:
            while (1) {
                if (isWhitespace(cc)) {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                if (cc == '>') {
                    processAttribute();
                    emitTag();
                    m_state = Data;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case EntityInAttributeValue: 
            {
                bool notEnoughCharacters = false; 
                unsigned entity = consumeEntity(m_source, notEnoughCharacters);
                if (notEnoughCharacters)
                    return;
                if (entity > 0xFFFF) {
                    // Outside the BMP: store as a UTF-16 surrogate pair.
                    m_attributeValue.append(U16_LEAD(entity));
                    m_attributeValue.append(U16_TRAIL(entity));
                } else if (entity)
                    m_attributeValue.append(entity);
                else
                    m_attributeValue.append('&');
            }
            m_state = m_stateBeforeEntityInAttributeValue;
            continue;
        case BogusComment:
            while (1) {
                if (cc == '>') {
                    m_state = Data;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case MarkupDeclarationOpen: {
            if (cc == '-') {
                // Need both dashes of "<!--" before deciding.
                if (m_source.length() < 2)
                    return;
                m_source.advance();
                cc = *m_source;
                if (cc == '-')
                    m_state = CommentStart;
                else {
                    m_state = BogusComment;
                    continue;
                }
            // If we cared about the DOCTYPE we would test to enter those states here
            } else {
                m_state = BogusComment;
                continue;
            }
            break;
        }
        case CommentStart:
            if (cc == '-')
                m_state = CommentStartDash;
            else if (cc == '>')
                m_state = Data;
            else
                m_state = Comment;
            break;
        case CommentStartDash:
            if (cc == '-')
                m_state = CommentEnd;
            else if (cc == '>')
                m_state = Data;
            else
                m_state = Comment;
            break;
        case Comment:
            while (1) {
                if (cc == '-') {
                    m_state = CommentEndDash;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case CommentEndDash:
            if (cc == '-')
                m_state = CommentEnd;
            else 
                m_state = Comment;
            break;
        case CommentEnd:
            if (cc == '>')
                m_state = Data;
            else if (cc == '-')
                ;
            else 
                m_state = Comment;
            break;
        }
        m_source.advance();
    }
}
    
void PreloadScanner::processAttribute()
{
    AtomicString tag = AtomicString(m_tagName.data(), m_tagName.size());
    AtomicString attribute = AtomicString(m_attributeName.data(), m_attributeName.size());
    
    String value(m_attributeValue.data(), m_attributeValue.size());
    if (tag == scriptTag || tag == imgTag) {
        if (attribute == srcAttr && m_urlToLoad.isEmpty())
            m_urlToLoad = parseURL(value);
        else if (attribute == charsetAttr)
            m_charset = value;
    } else if (tag == linkTag) {
        if (attribute == hrefAttr && m_urlToLoad.isEmpty())
            m_urlToLoad = parseURL(value);
        else if (attribute == relAttr) {
            bool styleSheet = false;
            bool alternate = false;
            bool icon = false;
            bool dnsPrefetch = false;
            HTMLLinkElement::tokenizeRelAttribute(value, styleSheet, alternate, icon, dnsPrefetch);
            m_linkIsStyleSheet = styleSheet && !alternate && !icon && !dnsPrefetch;
        } else if (attribute == charsetAttr)
            m_charset = value;
    }
}
    
// Character data is only interesting inside a <style> element, where it is
// forwarded to the CSS scanner; everything else is dropped.
inline void PreloadScanner::emitCharacter(UChar c)
{
    if (m_contentModel != CDATA)
        return;
    if (!(m_lastStartTag == styleTag))
        return;
    tokenizeCSS(c);
}
    
// Advances the CSS scanner by one character. This is not a real CSS
// tokenizer: it only recognises "@name value;" at-rules (skipping comments)
// so that @import targets can be preloaded. Searching for other types of
// resources is probably low payoff.
inline void PreloadScanner::tokenizeCSS(UChar c)
{
    const bool blank = isWhitespace(c);
    const bool terminator = c == ';';

    switch (m_cssState) {
    case CSSInitial:
        if (c == '/')
            m_cssState = CSSMaybeComment;
        else if (c == '@')
            m_cssState = CSSRuleStart;
        return;
    case CSSMaybeComment:
        m_cssState = (c == '*') ? CSSComment : CSSInitial;
        return;
    case CSSComment:
        if (c == '*')
            m_cssState = CSSMaybeCommentEnd;
        return;
    case CSSMaybeCommentEnd:
        // A further '*' keeps us waiting for the closing '/'.
        if (c == '/')
            m_cssState = CSSInitial;
        else if (c != '*')
            m_cssState = CSSComment;
        return;
    case CSSRuleStart: {
        const bool letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        if (!letter) {
            m_cssState = CSSInitial;
            return;
        }
        m_cssRule.clear();
        m_cssRuleValue.clear();
        m_cssRule.append(c);
        m_cssState = CSSRule;
        return;
    }
    case CSSRule:
        if (blank)
            m_cssState = CSSAfterRule;
        else if (terminator)
            m_cssState = CSSInitial;
        else
            m_cssRule.append(c);
        return;
    case CSSAfterRule:
        if (blank)
            return;
        if (terminator) {
            m_cssState = CSSInitial;
            return;
        }
        m_cssState = CSSRuleValue;
        m_cssRuleValue.append(c);
        return;
    case CSSRuleValue:
        if (blank) {
            m_cssState = CSSAfterRuleValue;
            return;
        }
        if (!terminator) {
            m_cssRuleValue.append(c);
            return;
        }
        emitCSSRule();
        m_cssState = CSSInitial;
        return;
    case CSSAfterRuleValue:
        if (blank)
            return;
        if (terminator)
            emitCSSRule();
        // FIXME media rules
        m_cssState = CSSInitial;
        return;
    }
}
    
void PreloadScanner::emitTag()
{
    if (m_closeTag) {
        m_contentModel = PCDATA;
        m_cssState = CSSInitial;
        clearLastCharacters();
        return;
    }
    
    AtomicString tag(m_tagName.data(), m_tagName.size());
    m_lastStartTag = tag;
    
    if (tag == textareaTag || tag == titleTag)
        m_contentModel = RCDATA;
    else if (tag == styleTag || tag == xmpTag || tag == scriptTag || tag == iframeTag || tag == noembedTag || tag == noframesTag)
        m_contentModel = CDATA;
    else if (tag == noscriptTag)
        // we wouldn't be here if scripts were disabled
        m_contentModel = CDATA;
    else if (tag == plaintextTag)
        m_contentModel = PLAINTEXT;
    else
        m_contentModel = PCDATA;
    
    if (tag == bodyTag)
        m_bodySeen = true;
    
    if (m_urlToLoad.isEmpty()) {
        m_linkIsStyleSheet = false;
        return;
    }
    
    if (tag == scriptTag)
        m_document->docLoader()->preload(CachedResource::Script, m_urlToLoad, m_charset, scanningBody());
    else if (tag == imgTag) 
        m_document->docLoader()->preload(CachedResource::ImageResource, m_urlToLoad, String(), scanningBody());
    else if (tag == linkTag && m_linkIsStyleSheet) 
        m_document->docLoader()->preload(CachedResource::CSSStyleSheet, m_urlToLoad, m_charset, scanningBody());

m_urlToLoad = String();
    m_charset = String();
    m_linkIsStyleSheet = false;
}
    
void PreloadScanner::emitCSSRule()
{
    String rule(m_cssRule.data(), m_cssRule.size());
    if (equalIgnoringCase(rule, "import") && !m_cssRuleValue.isEmpty()) {
        String value(m_cssRuleValue.data(), m_cssRuleValue.size());
        String url = parseURL(value);
        if (!url.isEmpty())
            m_document->docLoader()->preload(CachedResource::CSSStyleSheet, url, String(), scanningBody());
    }
    m_cssRule.clear();
    m_cssRuleValue.clear();
}
                
}

// C++ source | 854 lines | 27.08 KB

/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "PreloadScanner.h"

#include "AtomicString.h"
#include "CachedCSSStyleSheet.h"
#include "CachedImage.h"
#include "CachedResource.h"
#include "CachedResourceClient.h"
#include "CachedScript.h"
#include "CSSHelper.h"
#include "CString.h"
#include "DocLoader.h"
#include "Document.h"
#include "Frame.h"
#include "FrameLoader.h"
#include "HTMLLinkElement.h"
#include "HTMLNames.h"
#include <wtf/CurrentTime.h>
#include <wtf/unicode/Unicode.h>

#ifdef __GNUC__
// The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
#include "HTMLEntityNames.c"
#else
// Not inlined for non-GCC compilers
// One entry of the HTML named-entity table: the entity name and the code
// point it stands for.
struct Entity {
    const char* name;
    int code;
};
// Looks up the entity whose name is the first len characters of str. The
// obsolete `register` storage class is intentionally omitted from the
// parameters: it was removed in C++17 and is not part of the function's
// type, so this declaration still matches a definition that spells it out.
const struct Entity* findEntity(const char* str, unsigned int len);
#endif

#define PRELOAD_DEBUG 0

using namespace WTF;

namespace WebCore {
    
using namespace HTMLNames;
    
// Builds an idle scanner attached to doc; the in-progress flag starts out
// false and no time has been accounted yet.
PreloadScanner::PreloadScanner(Document* doc)
    : m_inProgress(false)
    , m_timeUsed(0)
    , m_bodySeen(false)
    , m_document(doc)
{
#if PRELOAD_DEBUG
    const CString documentURL = m_document->url().string().latin1();
    printf("CREATING PRELOAD SCANNER FOR %s\n", documentURL.data());
#endif
}
    
// Nothing to release; in PRELOAD_DEBUG builds this logs the document URL and
// the accumulated scanning time.
PreloadScanner::~PreloadScanner()
{
#if PRELOAD_DEBUG
    const CString documentURL = m_document->url().string().latin1();
    printf("DELETING PRELOAD SCANNER FOR %s\n", documentURL.data());
    printf("TOTAL TIME USED %.4fs\n", m_timeUsed);
#endif
}
    
// Opens a scanning session: all tokenizer state is reset first, then the
// scanner is flagged as in progress. Must not be called while a session is
// already open.
void PreloadScanner::begin()
{
    ASSERT(!m_inProgress);

    reset();
    m_inProgress = true;
}
    
// Closes the scanning session opened by begin(). State is left untouched;
// it is only cleared by the next begin().
void PreloadScanner::end()
{
    ASSERT(m_inProgress);

    m_inProgress = false;
}

// Returns every piece of per-session state to its initial value. m_bodySeen,
// m_timeUsed and m_inProgress are deliberately not touched here.
void PreloadScanner::reset()
{
    // Buffered input that was never consumed.
    m_source.clear();

    // HTML tokenizer position.
    m_state = Data;
    m_contentModel = PCDATA;
    m_escape = false;
    m_commentPos = 0;

    // Tag currently being assembled.
    m_closeTag = false;
    m_lastStartTag = AtomicString();
    m_tagName.clear();
    m_attributeName.clear();
    m_attributeValue.clear();

    // Preload request collected from the current tag's attributes.
    m_linkIsStyleSheet = false;
    m_urlToLoad = String();
    m_charset = String();

    // Ring buffer of recently seen characters.
    clearLastCharacters();
    m_lastCharacterIndex = 0;

    // CSS @import scanner.
    m_cssRule.clear();
    m_cssRuleValue.clear();
    m_cssState = CSSInitial;
}
    
// True once the document already has a body element or this scanner has
// itself emitted a <body> start tag.
bool PreloadScanner::scanningBody() const
{
    if (m_document->body())
        return true;
    return m_bodySeen;
}
    
// Feeds another chunk of markup to the tokenizer and adds the elapsed time
// to the running total kept in m_timeUsed.
void PreloadScanner::write(const SegmentedString& source)
{
    const double began = currentTime();
    tokenize(source);
    const double finished = currentTime();

    m_timeUsed += finished - began;
}
    
// Whitespace as far as this scanner is concerned: space, LF, CR and TAB.
static inline bool isWhitespace(UChar c)
{
    switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
        return true;
    default:
        return false;
    }
}
    
// Zeroes every slot of the recent-characters ring buffer.
inline void PreloadScanner::clearLastCharacters()
{
    for (unsigned slot = 0; slot < lastCharactersBufferSize; ++slot)
        m_lastCharacters[slot] = 0;
}
    
// Stores c in the next ring-buffer slot, wrapping around at the end, and
// makes that slot the most recent one.
inline void PreloadScanner::rememberCharacter(UChar c)
{
    const unsigned nextSlot = (m_lastCharacterIndex + 1) % lastCharactersBufferSize;
    m_lastCharacters[nextSlot] = c;
    m_lastCharacterIndex = nextSlot;
}
    
// Reports whether the count most recently remembered characters equal
// chars[0..count-1]. The comparison walks backwards: the newest remembered
// character is checked against chars[count - 1], and so on. chars must hold
// at least count characters.
inline bool PreloadScanner::lastCharactersMatch(const char* chars, unsigned count) const
{
    unsigned ringIndex = m_lastCharacterIndex;
    for (unsigned remaining = count; remaining; --remaining) {
        if (m_lastCharacters[ringIndex] != chars[remaining - 1])
            return false;
        // Step to the previous slot, wrapping from 0 to the last slot.
        ringIndex = ringIndex ? ringIndex - 1 : lastCharactersBufferSize - 1;
    }
    return true;
}
    
// Maps the value of a numeric character reference to the code point to use:
// zero, values above 0x10FFFF and the surrogate range 0xD800-0xDFFF all
// become U+FFFD; everything else is returned unchanged.
static inline unsigned legalEntityFor(unsigned value)
{
    const unsigned replacementCharacter = 0xFFFD;

    // FIXME There is a table for more exceptions in the HTML5 specification.
    if (!value)
        return replacementCharacter;
    if (value > 0x10FFFF)
        return replacementCharacter;

    const bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;
    return isSurrogate ? replacementCharacter : value;
}
    
// Appends |digit| (in the given |base|) to the value of a numeric character
// reference. The value saturates at 0x110000, one past the last code point, so
// an overlong run of digits can never wrap around the unsigned range and turn
// into a valid-looking code point; legalEntityFor() maps the saturated value
// to U+FFFD.
static inline unsigned appendEntityDigit(unsigned value, unsigned base, unsigned digit)
{
    const unsigned outOfRange = 0x110000;
    if (value >= outOfRange)
        return outOfRange;
    value = value * base + digit;
    return value > 0x10FFFF ? outOfRange : value;
}

// Tries to consume a character reference from |source|, which must be
// positioned just after the '&'.
//
// Returns the decoded code point, or 0 when the input is not a character
// reference. On a 0 return with |notEnoughCharacters| left untouched, the
// characters consumed so far are put back into |source|. When |source| runs
// out before a decision can be made, |notEnoughCharacters| is set to true,
// everything consumed is put back, and 0 is returned so the caller can retry
// once more input is available.
unsigned PreloadScanner::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
{
    enum EntityState {
        Initial,
        NumberType,
        MaybeHex,
        Hex,
        Decimal,
        Named
    };
    EntityState entityState = Initial;
    unsigned result = 0;
    // Every character looked at so far. While a character is being examined it
    // is the last element and has not yet been consumed from |source|.
    Vector<UChar, 10> seenChars;
    Vector<char, 10> entityName;

    while (!source.isEmpty()) {
        UChar cc = *source;
        seenChars.append(cc);
        switch (entityState) {
        case Initial:
            // Nothing has been consumed yet, so the failure paths can simply return.
            if (isWhitespace(cc) || cc == '<' || cc == '&')
                return 0;
            else if (cc == '#')
                entityState = NumberType;
            else if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
                entityName.append(cc);
                entityState = Named;
            } else
                return 0;
            break;
        case NumberType:
            if (cc == 'x' || cc == 'X')
                entityState = MaybeHex;
            else if (cc >= '0' && cc <= '9') {
                entityState = Decimal;
                result = cc - '0';
            } else {
                // Only the '#' was consumed; put it back.
                source.push('#');
                return 0;
            }
            break;
        case MaybeHex:
            if (cc >= '0' && cc <= '9')
                result = cc - '0';
            else if (cc >= 'a' && cc <= 'f')
                result = 10 + cc - 'a';
            else if (cc >= 'A' && cc <= 'F')
                result = 10 + cc - 'A';
            else {
                // "#x" (or "#X") was consumed without a hex digit following; put both back.
                // NOTE(review): this order assumes the character pushed last is read
                // first. Confirm against SegmentedString::push(); if push() is
                // first-in-first-out the two calls must be swapped.
                source.push(seenChars[1]);
                source.push('#');
                return 0;
            }
            entityState = Hex;
            break;
        case Hex:
            if (cc >= '0' && cc <= '9')
                result = appendEntityDigit(result, 16, cc - '0');
            else if (cc >= 'a' && cc <= 'f')
                result = appendEntityDigit(result, 16, 10 + cc - 'a');
            else if (cc >= 'A' && cc <= 'F')
                result = appendEntityDigit(result, 16, 10 + cc - 'A');
            else if (cc == ';') {
                // The terminating ';' belongs to the reference.
                source.advance();
                return legalEntityFor(result);
            } else
                // Unterminated reference: the current character is left in |source|.
                return legalEntityFor(result);
            break;
        case Decimal:
            if (cc >= '0' && cc <= '9')
                result = appendEntityDigit(result, 10, cc - '0');
            else if (cc == ';') {
                source.advance();
                return legalEntityFor(result);
            } else
                return legalEntityFor(result);
            break;
        case Named:
            // This is the attribute only version, generic version matches somewhat differently
            // Names are collected up to a fixed length limit (presumably the
            // longest name findEntity() knows; verify against the entity table).
            while (entityName.size() <= 8) {
                if (cc == ';') {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity) {
                        source.advance();
                        return entity->code;
                    }
                    break;
                }
                if (!(cc >= 'a' && cc <= 'z') && !(cc >= 'A' && cc <= 'Z') && !(cc >= '0' && cc <= '9')) {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity)
                        return entity->code;
                    break;
                }
                entityName.append(cc);
                source.advance();
                if (source.isEmpty())
                    goto outOfCharacters;
                cc = *source;
                seenChars.append(cc);
            }
            // No entity matched. Put back everything consumed, which is all of
            // seenChars except its last element (still the current character).
            // NOTE(review): as in MaybeHex, the two-character case assumes the
            // character pushed last is read first. Confirm against SegmentedString::push().
            if (seenChars.size() == 2)
                source.push(seenChars[0]);
            else if (seenChars.size() == 3) {
                source.push(seenChars[1]);
                source.push(seenChars[0]);
            } else
                source.prepend(SegmentedString(String(seenChars.data(), seenChars.size() - 1)));
            return 0;
        }
        source.advance();
    }
outOfCharacters:
    // The input ended in the middle of a possible reference. At this point every
    // character in seenChars has been consumed, so all of them are restored.
    notEnoughCharacters = true;
    source.prepend(SegmentedString(String(seenChars.data(), seenChars.size())));
    return 0;
}

// Appends |source| to the pending input and runs the scanner's simplified HTML
// tokenizer over it. Tags are reported through emitTag(), completed attribute
// values through processAttribute() and text characters through
// emitCharacter(). All tokenizer state lives in members, so a call that runs
// out of input simply returns and the next call resumes where it stopped.
void PreloadScanner::tokenize(const SegmentedString& source)
{
    ASSERT(m_inProgress);

    m_source.append(source);

    // This is a simplified HTML5 Tokenizer
    // http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
    //
    // Conventions inside the loop:
    //  - leaving the switch with 'break' consumes the current character (the
    //    advance() at the bottom of the loop);
    //  - 'continue' re-examines the current character in the new state;
    //  - 'return' stops until more input is written.
    while (!m_source.isEmpty()) {
        UChar cc = *m_source;
        switch (m_state) {
        case Data:
            while (1) {
                // Keep a short history so "<!--" and "-->" can be recognized below.
                rememberCharacter(cc);
                if (cc == '&') {
                    if (m_contentModel == PCDATA || m_contentModel == RCDATA) {
                        m_state = EntityData;
                        break;
                    }
                } else if (cc == '-') {
                    // "<!--" inside RCDATA/CDATA content starts an escaped section.
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape) {
                        if (lastCharactersMatch("<!--", 4))
                            m_escape = true;
                    }
                } else if (cc == '<') {
                    // '<' is not treated as markup while escaped, nor in PLAINTEXT.
                    if (m_contentModel == PCDATA || ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape)) {
                        m_state = TagOpen;
                        break;
                    }
                } else if (cc == '>') {
                    // "-->" ends the escaped section.
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && m_escape) {
                        if (lastCharactersMatch("-->", 3))
                            m_escape = false;
                    }
                }
                emitCharacter(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case EntityData:
            // should try to consume the entity but we only care about entities in attributes
            m_state = Data;
            // The current character (the one after '&') has not been examined
            // yet and may be a '<' that starts a tag, so reprocess it in the
            // Data state instead of consuming it.
            continue;
        case TagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                // Only an end tag can be recognized inside RCDATA/CDATA content.
                if (cc == '/')
                    m_state = CloseTagOpen;
                else {
                    m_state = Data;
                    continue;
                }
            } else if (m_contentModel == PCDATA) {
                if (cc == '!')
                    m_state = MarkupDeclarationOpen;
                else if (cc == '/')
                    m_state = CloseTagOpen;
                else if (cc >= 'A' && cc <= 'Z') {
                    // Tag names are collected in lower case.
                    m_tagName.clear();
                    m_charset = String();
                    m_tagName.append(cc + 0x20);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc >= 'a' && cc <= 'z') {
                    m_tagName.clear();
                    m_charset = String();
                    m_tagName.append(cc);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc == '>') {
                    m_state = Data;
                } else if (cc == '?') {
                    m_state = BogusComment;
                } else {
                    m_state = Data;
                    continue;
                }
            }
            break;
        case CloseTagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                // Inside RCDATA/CDATA only "</" followed by the name of the last
                // start tag and then whitespace, '>' or '/' counts as an end tag.
                if (!m_lastStartTag.length()) {
                    m_state = Data;
                    continue;
                }
                // Wait until the whole name plus one following character is available.
                if (m_source.length() < m_lastStartTag.length() + 1)
                    return;
                Vector<UChar> tmpString;
                UChar tmpChar = 0;
                bool match = true;
                // Look ahead over the candidate name and the character after it...
                for (unsigned n = 0; n < m_lastStartTag.length() + 1; n++) {
                    tmpChar = Unicode::toLower(*m_source);
                    if (n < m_lastStartTag.length() && tmpChar != m_lastStartTag[n])
                        match = false;
                    tmpString.append(tmpChar);
                    m_source.advance();
                }
                // ...then put the characters back. Note that the lower-cased
                // copies are what get prepended.
                m_source.prepend(SegmentedString(String(tmpString.data(), tmpString.size())));
                if (!match || (!isWhitespace(tmpChar) && tmpChar != '>' && tmpChar != '/')) {
                    m_state = Data;
                    continue;
                }
            }
            if (cc >= 'A' && cc <= 'Z') {
                m_tagName.clear();
                m_charset = String();
                m_tagName.append(cc + 0x20);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc >= 'a' && cc <= 'z') {
                m_tagName.clear();
                m_charset = String();
                m_tagName.append(cc);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc == '>') {
                m_state = Data;
            } else
                m_state = BogusComment;
            break;
        case TagName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                }
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z')
                    m_tagName.append(cc + 0x20);
                else
                    m_tagName.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case BeforeAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                // Attribute names are collected in lower case, like tag names.
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/')
                ;
            else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case AttributeName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = AfterAttributeName;
                    break;
                }
                if (cc == '=') {
                    m_state = BeforeAttributeValue;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                }
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z')
                    m_attributeName.append(cc + 0x20);
                else
                    m_attributeName.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AfterAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '=')
                m_state = BeforeAttributeValue;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                // The previous attribute had no value; a new attribute starts here.
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/')
                m_state = BeforeAttributeName;
            else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case BeforeAttributeValue:
            if (isWhitespace(cc))
                ;
            else if (cc == '"')
                m_state = AttributeValueDoubleQuoted;
            else if (cc == '&') {
                // Let the unquoted-value state handle the '&' itself.
                m_state = AttributeValueUnquoted;
                continue;
            } else if (cc == '\'')
                m_state = AttributeValueSingleQuoted;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else {
                m_attributeValue.append(cc);
                m_state = AttributeValueUnquoted;
            }
            break;
        case AttributeValueDoubleQuoted:
            while (1) {
                if (cc == '"') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    // Remember where to come back to once the entity is decoded.
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AttributeValueSingleQuoted:
            while (1) {
                if (cc == '\'') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case AttributeValueUnquoted:
            while (1) {
                if (isWhitespace(cc)) {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                if (cc == '>') {
                    processAttribute();
                    emitTag();
                    m_state = Data;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case EntityInAttributeValue:
            {
                // The '&' has already been consumed; m_source is at the character after it.
                bool notEnoughCharacters = false;
                unsigned entity = consumeEntity(m_source, notEnoughCharacters);
                if (notEnoughCharacters)
                    return;
                if (entity > 0xFFFF) {
                    // Code points beyond the BMP are stored as a surrogate pair.
                    m_attributeValue.append(U16_LEAD(entity));
                    m_attributeValue.append(U16_TRAIL(entity));
                } else if (entity)
                    m_attributeValue.append(entity);
                else
                    // Not a character reference: keep the '&' literally.
                    m_attributeValue.append('&');
            }
            m_state = m_stateBeforeEntityInAttributeValue;
            // consumeEntity() has positioned m_source already, so do not advance.
            continue;
        case BogusComment:
            // Skip everything up to and including the next '>'.
            while (1) {
                if (cc == '>') {
                    m_state = Data;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case MarkupDeclarationOpen: {
            if (cc == '-') {
                // Both characters after "<!" are needed to recognize "<!--".
                if (m_source.length() < 2)
                    return;
                m_source.advance();
                cc = *m_source;
                if (cc == '-')
                    m_state = CommentStart;
                else {
                    m_state = BogusComment;
                    continue;
                }
            // If we cared about the DOCTYPE we would test to enter those states here
            } else {
                m_state = BogusComment;
                continue;
            }
            break;
        }
        case CommentStart:
            if (cc == '-')
                m_state = CommentStartDash;
            else if (cc == '>')
                m_state = Data;
            else
                m_state = Comment;
            break;
        case CommentStartDash:
            if (cc == '-')
                m_state = CommentEnd;
            else if (cc == '>')
                m_state = Data;
            else
                m_state = Comment;
            break;
        case Comment:
            // Skip comment text until a '-' that may begin the closing "-->".
            while (1) {
                if (cc == '-') {
                    m_state = CommentEndDash;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty())
                    return;
                cc = *m_source;
            }
            break;
        case CommentEndDash:
            if (cc == '-')
                m_state = CommentEnd;
            else
                m_state = Comment;
            break;
        case CommentEnd:
            if (cc == '>')
                m_state = Data;
            else if (cc == '-')
                ;
            else
                m_state = Comment;
            break;
        }
        m_source.advance();
    }
}
    
void PreloadScanner::processAttribute()
{
    AtomicString tag = AtomicString(m_tagName.data(), m_tagName.size());
    AtomicString attribute = AtomicString(m_attributeName.data(), m_attributeName.size());
    
    String value(m_attributeValue.data(), m_attributeValue.size());
    if (tag == scriptTag || tag == imgTag) {
        if (attribute == srcAttr && m_urlToLoad.isEmpty())
            m_urlToLoad = parseURL(value);
        else if (attribute == charsetAttr)
            m_charset = value;
    } else if (tag == linkTag) {
        if (attribute == hrefAttr && m_urlToLoad.isEmpty())
            m_urlToLoad = parseURL(value);
        else if (attribute == relAttr) {
            bool styleSheet = false;
            bool alternate = false;
            bool icon = false;
            bool dnsPrefetch = false;
            HTMLLinkElement::tokenizeRelAttribute(value, styleSheet, alternate, icon, dnsPrefetch);
            m_linkIsStyleSheet = styleSheet && !alternate && !icon && !dnsPrefetch;
        } else if (attribute == charsetAttr)
            m_charset = value;
    }
}
    
// Receives one text character from the tokenizer. Characters are only of
// interest while inside a <style> element (CDATA content whose last start tag
// was style); those are handed to the CSS scanner, everything else is dropped.
inline void PreloadScanner::emitCharacter(UChar c)
{
    if (m_contentModel != CDATA)
        return;
    if (m_lastStartTag == styleTag)
        tokenizeCSS(c);
}
    
// Feeds one character of inline style sheet text to a small state machine that
// recognizes "@rule value;" and calls emitCSSRule() for each one found.
inline void PreloadScanner::tokenizeCSS(UChar c)
{
    // We are just interested in @import rules, no need for real tokenization here
    // Searching for other types of resources is probably low payoff
    switch (m_cssState) {
    case CSSInitial:
        // Looking for the start of an at-rule or of a comment.
        if (c == '@')
            m_cssState = CSSRuleStart;
        else if (c == '/')
            m_cssState = CSSMaybeComment;
        return;

    case CSSMaybeComment:
        // Seen '/': only "/*" opens a comment.
        m_cssState = c == '*' ? CSSComment : CSSInitial;
        return;

    case CSSComment:
        if (c == '*')
            m_cssState = CSSMaybeCommentEnd;
        return;

    case CSSMaybeCommentEnd:
        // Seen '*' inside a comment: '/' closes it, another '*' keeps waiting.
        if (c == '/')
            m_cssState = CSSInitial;
        else if (c != '*')
            m_cssState = CSSComment;
        return;

    case CSSRuleStart:
        // The rule name has to begin with an ASCII letter.
        if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
            m_cssState = CSSInitial;
            return;
        }
        m_cssRule.clear();
        m_cssRuleValue.clear();
        m_cssRule.append(c);
        m_cssState = CSSRule;
        return;

    case CSSRule:
        // Collecting the rule name up to whitespace; ';' abandons the rule.
        if (c == ';') {
            m_cssState = CSSInitial;
            return;
        }
        if (isWhitespace(c)) {
            m_cssState = CSSAfterRule;
            return;
        }
        m_cssRule.append(c);
        return;

    case CSSAfterRule:
        // Skipping whitespace between the rule name and its value.
        if (isWhitespace(c))
            return;
        if (c == ';') {
            m_cssState = CSSInitial;
            return;
        }
        m_cssState = CSSRuleValue;
        m_cssRuleValue.append(c);
        return;

    case CSSRuleValue:
        // Collecting the value up to whitespace or the terminating ';'.
        if (isWhitespace(c)) {
            m_cssState = CSSAfterRuleValue;
            return;
        }
        if (c == ';') {
            emitCSSRule();
            m_cssState = CSSInitial;
            return;
        }
        m_cssRuleValue.append(c);
        return;

    case CSSAfterRuleValue:
        // Only whitespace or ';' may follow the value; anything else drops the rule.
        if (isWhitespace(c))
            return;
        if (c == ';')
            emitCSSRule();
        // FIXME media rules
        m_cssState = CSSInitial;
        return;
    }
}
    
void PreloadScanner::emitTag()
{
    if (m_closeTag) {
        m_contentModel = PCDATA;
        m_cssState = CSSInitial;
        clearLastCharacters();
        return;
    }
    
    AtomicString tag(m_tagName.data(), m_tagName.size());
    m_lastStartTag = tag;
    
    if (tag == textareaTag || tag == titleTag)
        m_contentModel = RCDATA;
    else if (tag == styleTag || tag == xmpTag || tag == scriptTag || tag == iframeTag || tag == noembedTag || tag == noframesTag)
        m_contentModel = CDATA;
    else if (tag == noscriptTag)
        // we wouldn't be here if scripts were disabled
        m_contentModel = CDATA;
    else if (tag == plaintextTag)
        m_contentModel = PLAINTEXT;
    else
        m_contentModel = PCDATA;
    
    if (tag == bodyTag)
        m_bodySeen = true;
    
    if (m_urlToLoad.isEmpty()) {
        m_linkIsStyleSheet = false;
        return;
    }
    
    if (tag == scriptTag)
        m_document->docLoader()->preload(CachedResource::Script, m_urlToLoad, m_charset, scanningBody());
    else if (tag == imgTag) 
        m_document->docLoader()->preload(CachedResource::ImageResource, m_urlToLoad, String(), scanningBody());
    else if (tag == linkTag && m_linkIsStyleSheet) 
        m_document->docLoader()->preload(CachedResource::CSSStyleSheet, m_urlToLoad, m_charset, scanningBody());

    m_urlToLoad = String();
    m_charset = String();
    m_linkIsStyleSheet = false;
}
    
void PreloadScanner::emitCSSRule()
{
    String rule(m_cssRule.data(), m_cssRule.size());
    if (equalIgnoringCase(rule, "import") && !m_cssRuleValue.isEmpty()) {
        String value(m_cssRuleValue.data(), m_cssRuleValue.size());
        String url = parseURL(value);
        if (!url.isEmpty())
            m_document->docLoader()->preload(CachedResource::CSSStyleSheet, url, String(), scanningBody());
    }
    m_cssRule.clear();
    m_cssRuleValue.clear();
}
                
}

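// Web page footer captured together with this listing (not part of the source file): "Log in to enjoy more benefits."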