/* * Copyright (C) 2008 Apple Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "PreloadScanner.h" #include "AtomicString.h" #include "CachedCSSStyleSheet.h" #include "CachedImage.h" #include "CachedResource.h" #include "CachedResourceClient.h" #include "CachedScript.h" #include "CSSHelper.h" #include "CString.h" #include "DocLoader.h" #include "Document.h" #include "Frame.h" #include "FrameLoader.h" #include "HTMLLinkElement.h" #include "HTMLNames.h" #include <wtf/CurrentTime.h> #include <wtf/unicode/Unicode.h> #ifdef __GNUC__ // The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined. 
#include "HTMLEntityNames.c"
#else
// Not inlined for non-GCC compilers
struct Entity {
    const char* name;
    int code;
};
const struct Entity* findEntity(register const char* str, register unsigned int len);
#endif

#define PRELOAD_DEBUG 0

using namespace WTF;

namespace WebCore {

using namespace HTMLNames;

// Creates an idle scanner bound to |doc|. The tokenizer state itself is only
// set up by reset(), which begin() calls before scanning starts.
PreloadScanner::PreloadScanner(Document* doc)
    : m_inProgress(false)
    , m_timeUsed(0)
    , m_bodySeen(false)
    , m_document(doc)
{
#if PRELOAD_DEBUG
    printf("CREATING PRELOAD SCANNER FOR %s\n", m_document->url().string().latin1().data());
#endif
}

PreloadScanner::~PreloadScanner()
{
#if PRELOAD_DEBUG
    printf("DELETING PRELOAD SCANNER FOR %s\n", m_document->url().string().latin1().data());
    printf("TOTAL TIME USED %.4fs\n", m_timeUsed);
#endif
}

// Starts a scanning pass from a clean tokenizer state.
void PreloadScanner::begin()
{
    ASSERT(!m_inProgress);
    reset();
    m_inProgress = true;
}

// Ends the current scanning pass.
void PreloadScanner::end()
{
    ASSERT(m_inProgress);
    m_inProgress = false;
}

// Puts every piece of tokenizer state back to its initial value.
void PreloadScanner::reset()
{
    // Pending input and HTML tokenizer state.
    m_source.clear();
    m_state = Data;
    m_contentModel = PCDATA;
    m_escape = false;
    m_commentPos = 0;

    // Tag currently being assembled.
    m_closeTag = false;
    m_tagName.clear();
    m_attributeName.clear();
    m_attributeValue.clear();
    m_lastStartTag = AtomicString();

    // Preload candidate collected from the current tag.
    m_urlToLoad = String();
    m_charset = String();
    m_linkIsStyleSheet = false;

    // Ring buffer of recently seen characters.
    m_lastCharacterIndex = 0;
    clearLastCharacters();

    // Inline CSS (@import) scanner state.
    m_cssState = CSSInitial;
    m_cssRule.clear();
    m_cssRuleValue.clear();
}

// True when the document already has a body element or the scanner has
// itself seen a <body> start tag.
bool PreloadScanner::scanningBody() const
{
    if (m_document->body())
        return true;
    return m_bodySeen;
}

// Feeds |source| to the tokenizer and adds the elapsed time to m_timeUsed.
void PreloadScanner::write(const SegmentedString& source)
{
    const double startTime = currentTime();
    tokenize(source);
    const double endTime = currentTime();
    m_timeUsed += endTime - startTime;
}

// Space, line feed, carriage return and tab count as whitespace.
static inline bool isWhitespace(UChar c)
{
    switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
        return true;
    default:
        return false;
    }
}

// Zero-fills the ring buffer of recently seen characters.
inline void PreloadScanner::clearLastCharacters()
{
    const size_t byteCount = lastCharactersBufferSize * sizeof(UChar);
    memset(m_lastCharacters, 0, byteCount);
}

// Advances the ring buffer index (wrapping around) and stores |c| there.
inline void PreloadScanner::rememberCharacter(UChar c)
{
    m_lastCharacterIndex = (m_lastCharacterIndex + 1) % lastCharactersBufferSize;
    m_lastCharacters[m_lastCharacterIndex] = c;
}
inline bool PreloadScanner::lastCharactersMatch(const char* chars, unsigned count) const { unsigned pos = m_lastCharacterIndex; while (count) { if (chars[count - 1] != m_lastCharacters[pos]) return false; --count; if (!pos) pos = lastCharactersBufferSize; --pos; } return true; } static inline unsigned legalEntityFor(unsigned value) { // FIXME There is a table for more exceptions in the HTML5 specification. if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) return 0xFFFD; return value; } unsigned PreloadScanner::consumeEntity(SegmentedString& source, bool& notEnoughCharacters) { enum EntityState { Initial, NumberType, MaybeHex, Hex, Decimal, Named }; EntityState entityState = Initial; unsigned result = 0; Vector<UChar, 10> seenChars; Vector<char, 10> entityName; while (!source.isEmpty()) { UChar cc = *source; seenChars.append(cc); switch (entityState) { case Initial: if (isWhitespace(cc) || cc == '<' || cc == '&') return 0; else if (cc == '#') entityState = NumberType; else if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { entityName.append(cc); entityState = Named; } else return 0; break; case NumberType: if (cc == 'x' || cc == 'X') entityState = MaybeHex; else if (cc >= '0' && cc <= '9') { entityState = Decimal; result = cc - '0'; } else { source.push('#'); return 0; } break; case MaybeHex: if (cc >= '0' && cc <= '9') result = cc - '0'; else if (cc >= 'a' && cc <= 'f') result = 10 + cc - 'a'; else if (cc >= 'A' && cc <= 'F') result = 10 + cc - 'A'; else { source.push(seenChars[1]); source.push('#'); return 0; } entityState = Hex; break; case Hex: if (cc >= '0' && cc <= '9') result = result * 16 + cc - '0'; else if (cc >= 'a' && cc <= 'f') result = result * 16 + 10 + cc - 'a'; else if (cc >= 'A' && cc <= 'F') result = result * 16 + 10 + cc - 'A'; else if (cc == ';') { source.advance(); return legalEntityFor(result); } else return legalEntityFor(result); break; case Decimal: if (cc >= '0' && cc <= '9') result = result * 10 + cc - 
'0'; else if (cc == ';') { source.advance(); return legalEntityFor(result); } else return legalEntityFor(result); break; case Named: // This is the attribute only version, generic version matches somewhat differently while (entityName.size() <= 8) { if (cc == ';') { const Entity* entity = findEntity(entityName.data(), entityName.size()); if (entity) { source.advance(); return entity->code; } break; } if (!(cc >= 'a' && cc <= 'z') && !(cc >= 'A' && cc <= 'Z') && !(cc >= '0' && cc <= '9')) { const Entity* entity = findEntity(entityName.data(), entityName.size()); if (entity) return entity->code; break; } entityName.append(cc); source.advance(); if (source.isEmpty()) goto outOfCharacters; cc = *source; seenChars.append(cc); } if (seenChars.size() == 2) source.push(seenChars[0]); else if (seenChars.size() == 3) { source.push(seenChars[1]); source.push(seenChars[0]); } else source.prepend(SegmentedString(String(seenChars.data(), seenChars.size() - 1))); return 0; } source.advance(); } outOfCharacters: notEnoughCharacters = true; source.prepend(SegmentedString(String(seenChars.data(), seenChars.size()))); return 0; } void PreloadScanner::tokenize(const SegmentedString& source) { ASSERT(m_inProgress); m_source.append(source); // This is a simplified HTML5 Tokenizer // http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 while (!m_source.isEmpty()) { UChar cc = *m_source; switch (m_state) { case Data: while (1) { rememberCharacter(cc); if (cc == '&') { if (m_contentModel == PCDATA || m_contentModel == RCDATA) { m_state = EntityData; break; } } else if (cc == '-') { if ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape) { if (lastCharactersMatch("<!--", 4)) m_escape = true; } } else if (cc == '<') { if (m_contentModel == PCDATA || ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape)) { m_state = TagOpen; break; } } else if (cc == '>') { if ((m_contentModel == RCDATA || m_contentModel == CDATA) && m_escape) { if 
(lastCharactersMatch("-->", 3)) m_escape = false; } } emitCharacter(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case EntityData: // should try to consume the entity but we only care about entities in attributes m_state = Data; break; case TagOpen: if (m_contentModel == RCDATA || m_contentModel == CDATA) { if (cc == '/') m_state = CloseTagOpen; else { m_state = Data; continue; } } else if (m_contentModel == PCDATA) { if (cc == '!') m_state = MarkupDeclarationOpen; else if (cc == '/') m_state = CloseTagOpen; else if (cc >= 'A' && cc <= 'Z') { m_tagName.clear(); m_charset = String(); m_tagName.append(cc + 0x20); m_closeTag = false; m_state = TagName; } else if (cc >= 'a' && cc <= 'z') { m_tagName.clear(); m_charset = String(); m_tagName.append(cc); m_closeTag = false; m_state = TagName; } else if (cc == '>') { m_state = Data; } else if (cc == '?') { m_state = BogusComment; } else { m_state = Data; continue; } } break; case CloseTagOpen: if (m_contentModel == RCDATA || m_contentModel == CDATA) { if (!m_lastStartTag.length()) { m_state = Data; continue; } if (m_source.length() < m_lastStartTag.length() + 1) return; Vector<UChar> tmpString; UChar tmpChar = 0; bool match = true; for (unsigned n = 0; n < m_lastStartTag.length() + 1; n++) { tmpChar = Unicode::toLower(*m_source); if (n < m_lastStartTag.length() && tmpChar != m_lastStartTag[n]) match = false; tmpString.append(tmpChar); m_source.advance(); } m_source.prepend(SegmentedString(String(tmpString.data(), tmpString.size()))); if (!match || (!isWhitespace(tmpChar) && tmpChar != '>' && tmpChar != '/')) { m_state = Data; continue; } } if (cc >= 'A' && cc <= 'Z') { m_tagName.clear(); m_charset = String(); m_tagName.append(cc + 0x20); m_closeTag = true; m_state = TagName; } else if (cc >= 'a' && cc <= 'z') { m_tagName.clear(); m_charset = String(); m_tagName.append(cc); m_closeTag = true; m_state = TagName; } else if (cc == '>') { m_state = Data; } else m_state = BogusComment; break; 
case TagName: while (1) { if (isWhitespace(cc)) { m_state = BeforeAttributeName; break; } if (cc == '>') { emitTag(); m_state = Data; break; } if (cc == '/') { m_state = BeforeAttributeName; break; } if (cc >= 'A' && cc <= 'Z') m_tagName.append(cc + 0x20); else m_tagName.append(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case BeforeAttributeName: if (isWhitespace(cc)) ; else if (cc == '>') { emitTag(); m_state = Data; } else if (cc >= 'A' && cc <= 'Z') { m_attributeName.clear(); m_attributeValue.clear(); m_attributeName.append(cc + 0x20); m_state = AttributeName; } else if (cc == '/') ; else { m_attributeName.clear(); m_attributeValue.clear(); m_attributeName.append(cc); m_state = AttributeName; } break; case AttributeName: while (1) { if (isWhitespace(cc)) { m_state = AfterAttributeName; break; } if (cc == '=') { m_state = BeforeAttributeValue; break; } if (cc == '>') { emitTag(); m_state = Data; break; } if (cc == '/') { m_state = BeforeAttributeName; break; } if (cc >= 'A' && cc <= 'Z') m_attributeName.append(cc + 0x20); else m_attributeName.append(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case AfterAttributeName: if (isWhitespace(cc)) ; else if (cc == '=') m_state = BeforeAttributeValue; else if (cc == '>') { emitTag(); m_state = Data; } else if (cc >= 'A' && cc <= 'Z') { m_attributeName.clear(); m_attributeValue.clear(); m_attributeName.append(cc + 0x20); m_state = AttributeName; } else if (cc == '/') m_state = BeforeAttributeName; else { m_attributeName.clear(); m_attributeValue.clear(); m_attributeName.append(cc); m_state = AttributeName; } break; case BeforeAttributeValue: if (isWhitespace(cc)) ; else if (cc == '"') m_state = AttributeValueDoubleQuoted; else if (cc == '&') { m_state = AttributeValueUnquoted; continue; } else if (cc == '\'') m_state = AttributeValueSingleQuoted; else if (cc == '>') { emitTag(); m_state = Data; } else { m_attributeValue.append(cc); m_state = 
AttributeValueUnquoted; } break; case AttributeValueDoubleQuoted: while (1) { if (cc == '"') { processAttribute(); m_state = BeforeAttributeName; break; } if (cc == '&') { m_stateBeforeEntityInAttributeValue = m_state; m_state = EntityInAttributeValue; break; } m_attributeValue.append(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case AttributeValueSingleQuoted: while (1) { if (cc == '\'') { processAttribute(); m_state = BeforeAttributeName; break; } if (cc == '&') { m_stateBeforeEntityInAttributeValue = m_state; m_state = EntityInAttributeValue; break; } m_attributeValue.append(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case AttributeValueUnquoted: while (1) { if (isWhitespace(cc)) { processAttribute(); m_state = BeforeAttributeName; break; } if (cc == '&') { m_stateBeforeEntityInAttributeValue = m_state; m_state = EntityInAttributeValue; break; } if (cc == '>') { processAttribute(); emitTag(); m_state = Data; break; } m_attributeValue.append(cc); m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case EntityInAttributeValue: { bool notEnoughCharacters = false; unsigned entity = consumeEntity(m_source, notEnoughCharacters); if (notEnoughCharacters) return; if (entity > 0xFFFF) { m_attributeValue.append(U16_LEAD(entity)); m_attributeValue.append(U16_TRAIL(entity)); } else if (entity) m_attributeValue.append(entity); else m_attributeValue.append('&'); } m_state = m_stateBeforeEntityInAttributeValue; continue; case BogusComment: while (1) { if (cc == '>') { m_state = Data; break; } m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case MarkupDeclarationOpen: { if (cc == '-') { if (m_source.length() < 2) return; m_source.advance(); cc = *m_source; if (cc == '-') m_state = CommentStart; else { m_state = BogusComment; continue; } // If we cared about the DOCTYPE we would test to enter those states here } else { m_state = BogusComment; continue; 
} break; } case CommentStart: if (cc == '-') m_state = CommentStartDash; else if (cc == '>') m_state = Data; else m_state = Comment; break; case CommentStartDash: if (cc == '-') m_state = CommentEnd; else if (cc == '>') m_state = Data; else m_state = Comment; break; case Comment: while (1) { if (cc == '-') { m_state = CommentEndDash; break; } m_source.advance(); if (m_source.isEmpty()) return; cc = *m_source; } break; case CommentEndDash: if (cc == '-') m_state = CommentEnd; else m_state = Comment; break; case CommentEnd: if (cc == '>') m_state = Data; else if (cc == '-') ; else m_state = Comment; break; } m_source.advance(); } } void PreloadScanner::processAttribute() { AtomicString tag = AtomicString(m_tagName.data(), m_tagName.size()); AtomicString attribute = AtomicString(m_attributeName.data(), m_attributeName.size()); String value(m_attributeValue.data(), m_attributeValue.size()); if (tag == scriptTag || tag == imgTag) { if (attribute == srcAttr && m_urlToLoad.isEmpty()) m_urlToLoad = parseURL(value); else if (attribute == charsetAttr) m_charset = value; } else if (tag == linkTag) { if (attribute == hrefAttr && m_urlToLoad.isEmpty()) m_urlToLoad = parseURL(value); else if (attribute == relAttr) { bool styleSheet = false; bool alternate = false; bool icon = false; bool dnsPrefetch = false; HTMLLinkElement::tokenizeRelAttribute(value, styleSheet, alternate, icon, dnsPrefetch); m_linkIsStyleSheet = styleSheet && !alternate && !icon && !dnsPrefetch; } else if (attribute == charsetAttr) m_charset = value; } } inline void PreloadScanner::emitCharacter(UChar c) { if (m_contentModel == CDATA && m_lastStartTag == styleTag) tokenizeCSS(c); } inline void PreloadScanner::tokenizeCSS(UChar c) { // We are just interested in @import rules, no need for real tokenization here // Searching for other types of resources is probably low payoff switch (m_cssState) { case CSSInitial: if (c == '@') m_cssState = CSSRuleStart; else if (c == '/') m_cssState = CSSMaybeComment; break; 
case CSSMaybeComment: if (c == '*') m_cssState = CSSComment; else m_cssState = CSSInitial; break; case CSSComment: if (c == '*') m_cssState = CSSMaybeCommentEnd; break; case CSSMaybeCommentEnd: if (c == '/') m_cssState = CSSInitial; else if (c == '*') ; else m_cssState = CSSComment; break; case CSSRuleStart: if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { m_cssRule.clear(); m_cssRuleValue.clear(); m_cssRule.append(c); m_cssState = CSSRule; } else m_cssState = CSSInitial; break; case CSSRule: if (isWhitespace(c)) m_cssState = CSSAfterRule; else if (c == ';') m_cssState = CSSInitial; else m_cssRule.append(c); break; case CSSAfterRule: if (isWhitespace(c)) ; else if (c == ';') m_cssState = CSSInitial; else { m_cssState = CSSRuleValue; m_cssRuleValue.append(c); } break; case CSSRuleValue: if (isWhitespace(c)) m_cssState = CSSAfterRuleValue; else if (c == ';') { emitCSSRule(); m_cssState = CSSInitial; } else m_cssRuleValue.append(c); break; case CSSAfterRuleValue: if (isWhitespace(c)) ; else if (c == ';') { emitCSSRule(); m_cssState = CSSInitial; } else { // FIXME media rules m_cssState = CSSInitial; } break; } } void PreloadScanner::emitTag() { if (m_closeTag) { m_contentModel = PCDATA; m_cssState = CSSInitial; clearLastCharacters(); return; } AtomicString tag(m_tagName.data(), m_tagName.size()); m_lastStartTag = tag; if (tag == textareaTag || tag == titleTag) m_contentModel = RCDATA; else if (tag == styleTag || tag == xmpTag || tag == scriptTag || tag == iframeTag || tag == noembedTag || tag == noframesTag) m_contentModel = CDATA; else if (tag == noscriptTag) // we wouldn't be here if scripts were disabled m_contentModel = CDATA; else if (tag == plaintextTag) m_contentModel = PLAINTEXT; else m_contentModel = PCDATA; if (tag == bodyTag) m_bodySeen = true; if (m_urlToLoad.isEmpty()) { m_linkIsStyleSheet = false; return; } if (tag == scriptTag) m_document->docLoader()->preload(CachedResource::Script, m_urlToLoad, m_charset, scanningBody()); else if (tag == 
imgTag) m_document->docLoader()->preload(CachedResource::ImageResource, m_urlToLoad, String(), scanningBody()); else if (tag == linkTag && m_linkIsStyleSheet) m_document->docLoader()->preload(CachedResource::CSSStyleSheet, m_urlToLoad, m_charset, scanningBody()); m_urlToLoad = String(); m_charset = String(); m_linkIsStyleSheet = false; } void PreloadScanner::emitCSSRule() { String rule(m_cssRule.data(), m_cssRule.size()); if (equalIgnoringCase(rule, "import") && !m_cssRuleValue.isEmpty()) { String value(m_cssRuleValue.data(), m_cssRuleValue.size()); String url = parseURL(value); if (!url.isEmpty()) m_document->docLoader()->preload(CachedResource::CSSStyleSheet, url, String(), scanningBody()); } m_cssRule.clear(); m_cssRuleValue.clear(); } }