/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 1999 Lars Knoll (knoll@kde.org)
              (C) 1999 Antti Koivisto (koivisto@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
    Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#include "config.h"
#include "HTMLTokenizer.h"

#include "CSSHelper.h"
#include "Cache.h"
#include "CachedScript.h"
#include "DocLoader.h"
#include "DocumentFragment.h"
#include "EventNames.h"
#include "Frame.h"
#include "FrameLoader.h"
#include "FrameView.h"
#include "HTMLElement.h"
#include "HTMLNames.h"
#include "HTMLParser.h"
#include "HTMLScriptElement.h"
#include "HTMLViewSourceDocument.h"
#include "Page.h"
#include "PreloadScanner.h"
#include "ScriptController.h"
#include "ScriptSourceCode.h"
#include "ScriptValue.h"
#include <wtf/ASCIICType.h>
#include <wtf/CurrentTime.h>

#include "HTMLEntityNames.c"

#ifdef ANDROID_INSTRUMENT
#include "TimeCounter.h"
#endif

// Compiles in the PreloadScanner calls made by scriptHandler() and scriptExecution().
#define PRELOAD_SCANNER_ENABLED 1
// Uncomment to print script request/execution timestamps via printf.
// #define INSTRUMENT_LAYOUT_SCHEDULING 1

using namespace WTF;
using namespace std;

namespace WebCore {

using namespace HTMLNames;

// Fallback for m_tokenizerChunkSize; begin() uses it when the Page supplies no custom chunk size.
#if MOBILE
// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.
// This value is used to define how many characters the tokenizer will process before
// yielding control.
static const int defaultTokenizerChunkSize = 256;
#else
static const int defaultTokenizerChunkSize = 4096;
#endif

// Fallback for m_tokenizerTimeDelay, in seconds; begin() uses it when the Page supplies no custom delay.
#if MOBILE
// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
// it will take way too long to load a page.
static const double defaultTokenizerTimeDelay = 0.300;
#else
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
static const double defaultTokenizerTimeDelay = 0.500;
#endif

// Lower-case marker strings. commentStart and doctypeStart are matched one character at a time
// in parseTag(); publicStart and systemStart are matched the same way in parseDoctype().
static const char commentStart [] = "<!--";
static const char doctypeStart [] = "<!doctype";
static const char publicStart [] = "public";
static const char systemStart [] = "system";
// Closing-tag prefixes for the raw-text elements.
// NOTE(review): presumably installed as m_searchStopper by code outside this view -- confirm.
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] = "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";
static const char iframeEnd [] = "</iframe";

// Full support for MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but
// in the standard Microsoft way, these extensions infect hundreds of thousands
// of web pages.  Note that people with non-latin-1 Microsoft extensions
// are SOL.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents

// We only need this for entities. For non-entity text, we handle this in the text encoding.
static const UChar windowsLatin1ExtensionArray[32] = { 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F }; static inline UChar fixUpChar(UChar c) { if ((c & ~0x1F) != 0x0080) return c; return windowsLatin1ExtensionArray[c - 0x80]; } static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length) { for (unsigned i = 0; i != length; ++i) { unsigned char c1 = s1[i]; unsigned char uc1 = toASCIIUpper(static_cast<char>(c1)); UChar c2 = s2[i]; if (c1 != c2 && uc1 != c2) return false; } return true; } inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode) { if (!attrName.isEmpty()) { ASSERT(!attrName.contains('/')); RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue); if (!attrs) { attrs = NamedMappedAttrMap::create(); attrs->reserveCapacity(10); } attrs->insertAttribute(a.release(), viewSourceMode); } attrName = emptyAtom; } // ---------------------------------------------------------------------------- HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors) : Tokenizer() , m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, &HTMLTokenizer::timerFired) , m_doc(doc) , m_parser(new HTMLParser(doc, reportErrors)) , m_inWrite(false) , m_fragment(false) { begin(); } HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc) : Tokenizer(true) , m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, 
&HTMLTokenizer::timerFired) , m_doc(doc) , m_parser(0) , m_inWrite(false) , m_fragment(false) { begin(); } HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag) : m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, &HTMLTokenizer::timerFired) , m_doc(frag->document()) , m_parser(new HTMLParser(frag)) , m_inWrite(false) , m_fragment(true) { begin(); } void HTMLTokenizer::reset() { ASSERT(m_executingScript == 0); while (!m_pendingScripts.isEmpty()) { CachedScript* cs = m_pendingScripts.first().get(); m_pendingScripts.removeFirst(); ASSERT(cache()->disabled() || cs->accessCount() > 0); cs->removeClient(this); } fastFree(m_buffer); m_buffer = m_dest = 0; m_bufferSize = 0; fastFree(m_scriptCode); m_scriptCode = 0; m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0; m_timer.stop(); m_state.setAllowYield(false); m_state.setForceSynchronous(false); m_currentToken.reset(); m_doctypeToken.reset(); m_doctypeSearchCount = 0; m_doctypeSecondarySearchCount = 0; m_hasScriptsWaitingForStylesheets = false; } void HTMLTokenizer::begin() { m_executingScript = 0; m_requestingScript = false; m_hasScriptsWaitingForStylesheets = false; m_state.setLoadingExtScript(false); reset(); m_bufferSize = 254; m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254)); m_dest = m_buffer; tquote = NoQuote; searchCount = 0; m_state.setEntityState(NoEntity); m_scriptTagSrcAttrValue = String(); m_pendingSrc.clear(); m_currentPrependingSrc = 0; m_noMoreData = false; m_brokenComments = false; m_brokenServer = false; m_lineNumber = 0; m_currentScriptTagStartLineNumber = 0; m_currentTagStartLineNumber = 0; m_state.setForceSynchronous(false); Page* page = m_doc->page(); if (page && page->hasCustomHTMLTokenizerTimeDelay()) m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay(); else m_tokenizerTimeDelay = 
defaultTokenizerTimeDelay; if (page && page->hasCustomHTMLTokenizerChunkSize()) m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize(); else m_tokenizerChunkSize = defaultTokenizerChunkSize; } void HTMLTokenizer::setForceSynchronous(bool force) { m_state.setForceSynchronous(force); } HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state) { // This function adds the listing 'list' as // preformatted text-tokens to the token-collection while (!list.isEmpty()) { if (state.skipLF()) { state.setSkipLF(false); if (*list == '\n') { list.advance(); continue; } } checkBuffer(); if (*list == '\n' || *list == '\r') { if (state.discardLF()) // Ignore this LF state.setDiscardLF(false); // We have discarded 1 LF else *m_dest++ = '\n'; /* Check for MS-DOS CRLF sequence */ if (*list == '\r') state.setSkipLF(true); list.advance(); } else { state.setDiscardLF(false); *m_dest++ = *list; list.advance(); } } return state; } HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString& src, State state) { ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState()); ASSERT(!state.hasTagState()); ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1 ); if (state.inScript() && !m_currentScriptTagStartLineNumber) m_currentScriptTagStartLineNumber = m_lineNumber; if (state.inComment()) state = parseComment(src, state); int lastDecodedEntityPosition = -1; while (!src.isEmpty()) { checkScriptBuffer(); UChar ch = *src; if (!m_scriptCodeResync && !m_brokenComments && !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() && m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' 
&& m_scriptCode[m_scriptCodeSize - 1] == '-' && (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
            // The buffered text ends in "<!-" and the current character is '-', so a comment starts here.
            // The lastDecodedEntityPosition test rejects a "<!-" whose start lies at or before the position
            // recorded for the most recent entity.
            state.setInComment(true);
            state = parseComment(src, state);
            continue;
        }
        // m_scriptCodeResync is non-zero once the closing tag name has been matched (see below);
        // an unquoted '>' then finishes the element.
        if (m_scriptCodeResync && !tquote && ch == '>') {
            src.advancePastNonNewline();
            // Truncate the buffer to where the closing tag began (resync was stored as that offset + 1).
            m_scriptCodeSize = m_scriptCodeResync - 1;
            m_scriptCodeResync = 0;
            m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
            if (state.inScript())
                state = scriptHandler(state);
            else {
                // Emit the collected text, then a synthesized end tag for whichever element we were in.
                state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
                processToken();
                if (state.inStyle()) {
                    m_currentToken.tagName = styleTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inTextArea()) {
                    m_currentToken.tagName = textareaTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inTitle()) {
                    m_currentToken.tagName = titleTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inXmp()) {
                    m_currentToken.tagName = xmpTag.localName();
                    m_currentToken.beginTag = false;
                } else if (state.inIFrame()) {
                    m_currentToken.tagName = iframeTag.localName();
                    m_currentToken.beginTag = false;
                }
                processToken();
                // Leave raw-text mode entirely.
                state.setInStyle(false);
                state.setInScript(false);
                state.setInTextArea(false);
                state.setInTitle(false);
                state.setInXmp(false);
                state.setInIFrame(false);
                tquote = NoQuote;
                m_scriptCodeSize = m_scriptCodeResync = 0;
            }
            return state;
        }
        // possible end of tagname, lets check.
        // The buffered text must end with m_searchStopper (case-insensitively, via tagMatch) and be
        // followed by '>', '/' or whitespace.
        if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
             m_scriptCodeSize >= m_searchStopperLength &&
             tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
             (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
            // Record (offset of the closing tag + 1) so that zero keeps meaning "no closing tag seen".
            m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
            tquote = NoQuote;
            // Re-examine the same character with resync now set.
            continue;
        }
        // Inside the closing tag: track quoting so a quoted '>' does not end the element.
        if (m_scriptCodeResync && !state.escaped()) {
            if (ch == '\"')
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ?
SingleQuote : NoQuote);
            else if (ch == '\'')
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                // A line break ends any open quote.
                tquote = NoQuote;
        }
        // A backslash escapes the next character unless the backslash itself was escaped.
        state.setEscaped(!state.escaped() && ch == '\\');
        // Entities are decoded only in textarea, title and iframe content.
        if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
            UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
            src.advancePastNonNewline();
            state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
            // parseEntity advances scriptCodeDest only when it wrote text to the buffer. If it did not
            // move, remember this position so the comment / closing-tag checks above can take it into account.
            if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
                lastDecodedEntityPosition = m_scriptCodeSize;
            else
                m_scriptCodeSize = scriptCodeDest - m_scriptCode;
        } else {
            m_scriptCode[m_scriptCodeSize++] = ch;
            src.advance(m_lineNumber);
        }
    }

    return state;
}

// Called by parseSpecial() when the end of a script element has been reached.
// Emits the script text and its end tag, then either requests the external script
// named by the src attribute or executes the inline script, and finally re-queues
// the source that was pending behind the script.
HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
{
    // We are inside a <script>
    bool doScriptExec = false;
    int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenizer line numbers are 0 based

    // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
    m_currentScriptTagStartLineNumber = 0;

    // (Bugzilla 3837) Scripts following a frameset element should not execute or,
    // in the case of extern scripts, even load.
    bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));

    // Stays null unless an external script is successfully requested below.
    CachedScript* cs = 0;
    // don't load external scripts for standalone documents (for now)
    if (!inViewSourceMode()) {
        if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
            // forget what we just got; load from src url instead
            if (!m_parser->skipMode() && !followingFrameset) {
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
                if (!m_doc->ownerElement())
                    printf("Requesting script at time %d\n", m_doc->elapsedTime());
#endif
                // The parser might have been stopped by for example a window.close call in an earlier script.
                // If so, we don't want to load scripts.
if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
                    // Queue the requested script; it is registered as a client further down.
                    m_pendingScripts.append(cs);
                else
                    m_scriptNode = 0;
            } else
                m_scriptNode = 0;
            // The src attribute has been consumed, whether or not a request was made.
            m_scriptTagSrcAttrValue = String();
        } else {
            // Parse m_scriptCode containing <script> info
#if USE(LOW_BANDWIDTH_DISPLAY)
            if (m_doc->inLowBandwidthDisplay()) {
                // ideal solution is only skipping internal JavaScript if there is external JavaScript.
                // but internal JavaScript can use document.write() to create an external JavaScript,
                // so we have to skip internal JavaScript all the time.
                m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();
                doScriptExec = false;
            } else
#endif
            // NOTE(review): m_scriptNode is dereferenced without a null check here -- confirm it is
            // always set when an inline script reaches this point.
            doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
            m_scriptNode = 0;
        }
    }

    // Emit the script text as a token, keep the resulting node's text for execution,
    // then emit the synthesized </script> end tag.
    state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
    RefPtr<Node> node = processToken();
    String scriptString = node ? node->textContent() : "";
    m_currentToken.tagName = scriptTag.localName();
    m_currentToken.beginTag = false;
    processToken();

    state.setInScript(false);
    m_scriptCodeSize = m_scriptCodeResync = 0;

    // FIXME: The script should be syntax highlighted.
    if (inViewSourceMode())
        return state;

    // Point m_currentPrependingSrc at a local string for the duration of this call;
    // the saved pointer is restored before returning.
    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
    m_currentPrependingSrc = &prependingSrc;

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif

    if (!m_parser->skipMode() && !followingFrameset) {
        if (cs) {
            // External script: park the remaining input until the script has been dealt with.
            if (savedPrependingSrc)
                savedPrependingSrc->append(m_src);
            else
                m_pendingSrc.prepend(m_src);
            setSrc(SegmentedString());

            // the ref() call below may call notifyFinished if the script is already in cache,
            // and that mucks with the state directly, so we must write it back to the object.
m_state = state;
            // Flag that a script request is in progress while addClient() runs, then restore the flag.
            bool savedRequestingScript = m_requestingScript;
            m_requestingScript = true;
            cs->addClient(this);
            m_requestingScript = savedRequestingScript;
            state = m_state;
            // will be 0 if script was already loaded and ref() executed it
            if (!m_pendingScripts.isEmpty())
                state.setLoadingExtScript(true);
        } else if (!m_fragment && doScriptExec) {
            // Inline script: set the remaining input aside, then run the script.
            if (!m_executingScript)
                m_pendingSrc.prepend(m_src);
            else
                prependingSrc = m_src;
            setSrc(SegmentedString());
            state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
        }
    }

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

    if (!m_executingScript && !state.loadingExtScript()) {
        // Nothing is blocking any more: put the parked input back in m_src.
        m_src.append(m_pendingSrc);
        m_pendingSrc.clear();
    } else if (!prependingSrc.isEmpty()) {
        // restore first so that the write appends in the right place
        // (does not hurt to do it again below)
        m_currentPrependingSrc = savedPrependingSrc;

        // we need to do this slightly modified bit of one of the write() cases
        // because we want to prepend to m_pendingSrc rather than appending
        // if there's no previous prependingSrc
        if (!m_pendingScripts.isEmpty()) {
            if (m_currentPrependingSrc)
                m_currentPrependingSrc->append(prependingSrc);
            else
                m_pendingSrc.prepend(prependingSrc);
        } else {
            // write() works on m_state, so sync the local copy in and back out.
            m_state = state;
            write(prependingSrc, false);
            state = m_state;
        }
    }

#if PRELOAD_SCANNER_ENABLED
    // Still waiting on a script: hand the parked input to the preload scanner,
    // creating the scanner on first use.
    if (!m_pendingScripts.isEmpty() && !m_executingScript) {
        if (!m_preloadScanner)
            m_preloadScanner.set(new PreloadScanner(m_doc));
        if (!m_preloadScanner->inProgress()) {
            m_preloadScanner->begin();
            m_preloadScanner->write(m_pendingSrc);
        }
    }
#endif
    m_currentPrependingSrc = savedPrependingSrc;

    return state;
}

// Executes |sourceCode| through the frame loader and then re-queues any source
// collected in the local prepending string during execution. Returns |state|
// unchanged for fragments and for documents without a frame.
HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
{
    if (m_fragment || !m_doc->frame())
        return state;
    // Counts nested executions; decremented again once executeScript() returns.
    m_executingScript++;

    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
m_currentPrependingSrc = &prependingSrc;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("beginning script execution at %d\n", m_doc->elapsedTime());
#endif

    // Publish the local state to m_state before running the script and read it back
    // afterwards, so changes made to m_state during execution are picked up.
    m_state = state;
    m_doc->frame()->loader()->executeScript(sourceCode);
    state = m_state;

    state.setAllowYield(true);

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("ending script execution at %d\n", m_doc->elapsedTime());
#endif

    m_executingScript--;

    if (!m_executingScript && !state.loadingExtScript()) {
        // Outermost script finished and nothing is loading: text collected during
        // execution goes in front of the pending source, and all of it returns to m_src.
        m_pendingSrc.prepend(prependingSrc);
        m_src.append(m_pendingSrc);
        m_pendingSrc.clear();
    } else if (!prependingSrc.isEmpty()) {
        // restore first so that the write appends in the right place
        // (does not hurt to do it again below)
        m_currentPrependingSrc = savedPrependingSrc;

        // we need to do this slightly modified bit of one of the write() cases
        // because we want to prepend to m_pendingSrc rather than appending
        // if there's no previous prependingSrc
        if (!m_pendingScripts.isEmpty()) {
            if (m_currentPrependingSrc)
                m_currentPrependingSrc->append(prependingSrc);
            else
                m_pendingSrc.prepend(prependingSrc);

#if PRELOAD_SCANNER_ENABLED
            // We are stuck waiting for another script. Lets check the source that
            // was just document.write()n for anything to load.
            PreloadScanner documentWritePreloadScanner(m_doc);
            documentWritePreloadScanner.begin();
            documentWritePreloadScanner.write(prependingSrc);
            documentWritePreloadScanner.end();
#endif
        } else {
            // write() works on m_state, so sync the local copy in and back out.
            m_state = state;
            write(prependingSrc, false);
            state = m_state;
        }
    }

    m_currentPrependingSrc = savedPrependingSrc;

    return state;
}

// Appends comment text from |src| to m_scriptCode until a terminator is found.
// Outside the raw-text elements the collected text is then emitted as a comment
// token pair; inside them the text simply stays in m_scriptCode.
HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
{
    // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
checkScriptBuffer(src.length()); while (!src.isEmpty()) { UChar ch = *src; m_scriptCode[m_scriptCodeSize++] = ch; if (ch == '>') { bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle()); int endCharsCount = 1; // start off with one for the '>' character if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') { endCharsCount = 3; } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '!') { // Other browsers will accept --!> as a close comment, even though it's // not technically valid. endCharsCount = 4; } if (handleBrokenComments || endCharsCount > 1) { src.advancePastNonNewline(); if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) { checkScriptBuffer(); m_scriptCode[m_scriptCodeSize] = 0; m_scriptCode[m_scriptCodeSize + 1] = 0; m_currentToken.tagName = commentAtom; m_currentToken.beginTag = true; state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state); processToken(); m_currentToken.tagName = commentAtom; m_currentToken.beginTag = false; processToken(); m_scriptCodeSize = 0; } state.setInComment(false); return state; // Finished parsing comment } } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state) { checkScriptBuffer(src.length()); while (!src.isEmpty()) { UChar ch = *src; m_scriptCode[m_scriptCodeSize++] = ch; if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') { src.advancePastNonNewline(); state.setInServer(false); m_scriptCodeSize = 0; return state; // Finished parsing server include } src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state) { UChar oldchar = 0; while 
(!src.isEmpty()) { UChar chbegin = *src; if (chbegin == '\'') tquote = tquote == SingleQuote ? NoQuote : SingleQuote; else if (chbegin == '\"') tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; // Look for '?>' // Some crappy sites omit the "?" before it, so // we look for an unquoted '>' instead. (IE compatible) else if (chbegin == '>' && (!tquote || oldchar == '?')) { // We got a '?>' sequence state.setInProcessingInstruction(false); src.advancePastNonNewline(); state.setDiscardLF(true); return state; // Finished parsing comment! } src.advance(m_lineNumber); oldchar = chbegin; } return state; } HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state) { while (!src.isEmpty()) { UChar cc = *src; if (state.skipLF()) { state.setSkipLF(false); if (cc == '\n') { src.advancePastNewline(m_lineNumber); continue; } } // do we need to enlarge the buffer? checkBuffer(); if (cc == '\r') { state.setSkipLF(true); *m_dest++ = '\n'; } else *m_dest++ = cc; src.advance(m_lineNumber); } return state; } HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag) { if (start) { cBufferPos = 0; state.setEntityState(SearchEntity); EntityUnicodeValue = 0; } while(!src.isEmpty()) { UChar cc = *src; switch(state.entityState()) { case NoEntity: ASSERT(state.entityState() != NoEntity); return state; case SearchEntity: if (cc == '#') { m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); state.setEntityState(NumericSearch); } else state.setEntityState(EntityName); break; case NumericSearch: if (cc == 'x' || cc == 'X') { m_cBuffer[cBufferPos++] = cc; src.advancePastNonNewline(); state.setEntityState(Hexadecimal); } else if (cc >= '0' && cc <= '9') state.setEntityState(Decimal); else state.setEntityState(SearchSemicolon); break; case Hexadecimal: { int ll = min(src.length(), 10 - cBufferPos); while (ll--) { cc = *src; if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 
'f') || (cc >= 'A' && cc <= 'F'))) {
                    // First non-hex character: the digits are complete.
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                int digit;
                if (cc < 'A')
                    digit = cc - '0';
                else
                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            // Length cap reached: stop collecting digits.
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal:
        {
            // Never let m_cBuffer hold more than 9 characters for a decimal reference.
            int ll = min(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = *src;

                if (!(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName:
        {
            // Collect up to 9 ASCII alphanumeric characters of the name.
            int ll = min(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = *src;

                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            // The name is complete once a terminator or the length cap was hit. Otherwise the
            // input ran out and parsing resumes in this state on the next call.
            if (state.entityState() == SearchSemicolon) {
                if(cBufferPos > 1) {
                    // Since the maximum length of entity name is 9,
                    // so a single char array which is allocated on
                    // the stack, its length is 10, should be OK.
                    // Also if we have an illegal character, we treat it
                    // as illegal entity name.
unsigned testedEntityNameLen = 0;
                    char tmpEntityNameBuffer[10];

                    ASSERT(cBufferPos < 10);
                    // Narrow the collected name to char; stop early at any code unit above 0x7e.
                    for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
                        if (m_cBuffer[testedEntityNameLen] > 0x7e)
                            break;
                        tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
                    }

                    const Entity *e;

                    // Look the name up only if every character was copied.
                    if (testedEntityNameLen == cBufferPos)
                        e = findEntity(tmpEntityNameBuffer, cBufferPos);
                    else
                        e = 0;

                    if(e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible
                    // While parsing a tag, a reference to a value above 255 is only honoured
                    // when it is terminated by ';'.
                    // NOTE(review): *src is read here without an isEmpty() check -- confirm that
                    // dereferencing an exhausted SegmentedString is safe.
                    if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
            // No break here: a completed name falls through into SearchSemicolon.
        }
        case SearchSemicolon:
            // Don't allow values that are more than 21 bits.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (!inViewSourceMode()) {
                    // The terminating ';' is optional.
                    if (*src == ';')
                        src.advancePastNonNewline();
                    // The decoded character is pushed back onto |src| rather than written to |dest|.
                    if (EntityUnicodeValue <= 0xFFFF) {
                        checkBuffer();
                        src.push(fixUpChar(EntityUnicodeValue));
                    } else {
                        // Convert to UTF-16, using surrogate code points.
                        checkBuffer(2);
                        src.push(U16_LEAD(EntityUnicodeValue));
                        src.push(U16_TRAIL(EntityUnicodeValue));
                    }
                } else {
                    // FIXME: We should eventually colorize entities by sending them as a special token.
// 12 bytes required: up to 10 bytes in m_cBuffer plus the // leading '&' and trailing ';' checkBuffer(12); *dest++ = '&'; for (unsigned i = 0; i < cBufferPos; i++) dest[i] = m_cBuffer[i]; dest += cBufferPos; if (*src == ';') { *dest++ = ';'; src.advancePastNonNewline(); } } } else { // 11 bytes required: up to 10 bytes in m_cBuffer plus the // leading '&' checkBuffer(11); // ignore the sequence, add it to the buffer as plaintext *dest++ = '&'; for (unsigned i = 0; i < cBufferPos; i++) dest[i] = m_cBuffer[i]; dest += cBufferPos; } state.setEntityState(NoEntity); return state; } } return state; } HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state) { ASSERT(state.inDoctype()); while (!src.isEmpty() && state.inDoctype()) { UChar c = *src; bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' '; switch (m_doctypeToken.state()) { case DoctypeBegin: { m_doctypeToken.setState(DoctypeBeforeName); if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } break; } case DoctypeBeforeName: { if (c == '>') { // Malformed. Just exit. src.advancePastNonNewline(); state.setInDoctype(false); if (inViewSourceMode()) processDoctypeToken(); } else if (isWhitespace) { src.advance(m_lineNumber); if (inViewSourceMode()) m_doctypeToken.m_source.append(c); } else m_doctypeToken.setState(DoctypeName); break; } case DoctypeName: { if (c == '>') { // Valid doctype. Emit it. 
src.advancePastNonNewline();
                    state.setInDoctype(false);
                    processDoctypeToken();
                } else if (isWhitespace) {
                    // End of the name; prepare to match the PUBLIC / SYSTEM keywords.
                    m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
                    m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
                    m_doctypeToken.setState(DoctypeAfterName);
                    src.advance(m_lineNumber);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else {
                    src.advancePastNonNewline();
                    m_doctypeToken.m_name.append(c);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                }
                break;
            }
            case DoctypeAfterName: {
                if (c == '>') {
                    // Valid doctype. Emit it.
                    src.advancePastNonNewline();
                    state.setInDoctype(false);
                    processDoctypeToken();
                } else if (!isWhitespace) {
                    src.advancePastNonNewline();
                    // Match "public" first (case-insensitively); "system" is only tried
                    // while no character of "public" has matched yet.
                    if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
                        m_doctypeSearchCount++;
                        if (m_doctypeSearchCount == 6)
                            // Found 'PUBLIC' sequence
                            m_doctypeToken.setState(DoctypeBeforePublicID);
                    } else if (m_doctypeSearchCount > 0) {
                        // A partial "public" match was broken.
                        m_doctypeSearchCount = 0;
                        m_doctypeToken.setState(DoctypeBogus);
                    } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
                        m_doctypeSecondarySearchCount++;
                        if (m_doctypeSecondarySearchCount == 6)
                            // Found 'SYSTEM' sequence
                            m_doctypeToken.setState(DoctypeBeforeSystemID);
                    } else {
                        m_doctypeSecondarySearchCount = 0;
                        m_doctypeToken.setState(DoctypeBogus);
                    }
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else {
                    src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                }
                break;
            }
            case DoctypeBeforePublicID: {
                if (c == '\"' || c == '\'') {
                    // Remember which quote opened the ID so only the same quote closes it.
                    tquote = c == '\"' ? DoubleQuote : SingleQuote;
                    m_doctypeToken.setState(DoctypePublicID);
                    src.advancePastNonNewline();
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else if (c == '>') {
                    // Considered bogus.  Don't process the doctype.
src.advancePastNonNewline();
                    state.setInDoctype(false);
                    // The token is still handed on in view-source mode.
                    if (inViewSourceMode())
                        processDoctypeToken();
                } else if (isWhitespace) {
                    src.advance(m_lineNumber);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else
                    m_doctypeToken.setState(DoctypeBogus);
                break;
            }
            case DoctypePublicID: {
                // Only the quote character that opened the ID closes it.
                if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
                    src.advancePastNonNewline();
                    m_doctypeToken.setState(DoctypeAfterPublicID);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else if (c == '>') {
                     // Considered bogus.  Don't process the doctype.
                    src.advancePastNonNewline();
                    state.setInDoctype(false);
                    if (inViewSourceMode())
                        processDoctypeToken();
                } else {
                    m_doctypeToken.m_publicID.append(c);
                    src.advance(m_lineNumber);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                }
                break;
            }
            case DoctypeAfterPublicID:
                // A quoted system ID may follow the public ID directly.
                if (c == '\"' || c == '\'') {
                    tquote = c == '\"' ? DoubleQuote : SingleQuote;
                    m_doctypeToken.setState(DoctypeSystemID);
                    src.advancePastNonNewline();
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else if (c == '>') {
                    // Valid doctype. Emit it now.
                    src.advancePastNonNewline();
                    state.setInDoctype(false);
                    processDoctypeToken();
                } else if (isWhitespace) {
                    src.advance(m_lineNumber);
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else
                    m_doctypeToken.setState(DoctypeBogus);
                break;
            case DoctypeBeforeSystemID:
                if (c == '\"' || c == '\'') {
                    tquote = c == '\"' ? DoubleQuote : SingleQuote;
                    m_doctypeToken.setState(DoctypeSystemID);
                    src.advancePastNonNewline();
                    if (inViewSourceMode())
                        m_doctypeToken.m_source.append(c);
                } else if (c == '>') {
                    // Considered bogus.  Don't process the doctype.
src.advancePastNonNewline();
                state.setInDoctype(false);
            } else if (isWhitespace) {
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            } else
                m_doctypeToken.setState(DoctypeBogus);
            break;
        case DoctypeSystemID:
            if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
                src.advancePastNonNewline();
                m_doctypeToken.setState(DoctypeAfterSystemID);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            } else if (c == '>') {
                // Considered bogus. Don't process the doctype.
                src.advancePastNonNewline();
                state.setInDoctype(false);
                if (inViewSourceMode())
                    processDoctypeToken();
            } else {
                m_doctypeToken.m_systemID.append(c);
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            }
            break;
        case DoctypeAfterSystemID:
            if (c == '>') {
                // Valid doctype. Emit it now.
                src.advancePastNonNewline();
                state.setInDoctype(false);
                processDoctypeToken();
            } else if (isWhitespace) {
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            } else
                m_doctypeToken.setState(DoctypeBogus);
            break;
        case DoctypeBogus:
            if (c == '>') {
                // Done with the bogus doctype.
                src.advancePastNonNewline();
                state.setInDoctype(false);
                if (inViewSourceMode())
                    processDoctypeToken();
            } else {
                src.advance(m_lineNumber); // Just keep scanning for '>'
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            }
            break;
        default:
            break;
        }
    }
    return state;
}

// Tokenizes the inside of a tag (everything after the opening '<'), one sub-state at a time.
//
// The sub-state is re-read from state.tagState() on every loop iteration, so the function can
// stop when src runs dry and be re-entered later with the returned state. It returns when src
// is empty, when the sub-state is NoTag, or once a complete tag has been handed to
// processToken(). Every return path writes the local cBufferPos back to m_cBufferPos.
//
// src:   input to consume; characters are advanced past as they are processed.
// state: tokenizer state on entry (must not carry an entity state, see the ASSERT).
// Returns the updated tokenizer state.
HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
{
    ASSERT(!state.hasEntityState());

    // Work on a local copy of the m_cBuffer write position; it is stored back before returning.
    unsigned cBufferPos = m_cBufferPos;

    // Set in SearchEqual when the previously skipped character was '/'.
    bool lastIsSlash = false;

    while (!src.isEmpty()) {
        checkBuffer();
        switch(state.tagState()) {
        case NoTag:
        {
            m_cBufferPos = cBufferPos;
            return state;
        }
        case TagName:
        {
            // A non-zero searchCount means the characters seen so far still match commentStart.
            if (searchCount > 0) {
                if (*src == commentStart[searchCount]) {
                    searchCount++;
                    if (searchCount == 2)
                        m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
                    else
                        m_doctypeSearchCount = 0;
                    if (searchCount == 4) {
                        // Found '<!--' sequence
                        src.advancePastNonNewline();
                        m_dest = m_buffer; // ignore the previous part of this tag
                        state.setInComment(true);
                        state.setTagState(NoTag);

                        // Fix bug 34302 at kde.bugs.org. Go ahead and treat
                        // <!--> as a valid comment, since both mozilla and IE on windows
                        // can handle this case. Only do this in quirks mode. -dwh
                        if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
                            state.setInComment(false);
                            src.advancePastNonNewline();
                            if (!src.isEmpty())
                                m_cBuffer[cBufferPos++] = *src;
                        } else
                            state = parseComment(src, state);

                        m_cBufferPos = cBufferPos;
                        return state; // Finished parsing tag!
                    }
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    break;
                } else
                    searchCount = 0; // Stop looking for '<!--' sequence
            }

            // Likewise, a non-zero m_doctypeSearchCount means we still match doctypeStart
            // (compared case-insensitively via toASCIILower).
            if (m_doctypeSearchCount > 0) {
                if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
                    m_doctypeSearchCount++;
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    if (m_doctypeSearchCount == 9) {
                        // Found '<!DOCTYPE' sequence
                        state.setInDoctype(true);
                        state.setTagState(NoTag);
                        m_doctypeToken.reset();
                        // View-source mode keeps the raw characters matched so far.
                        if (inViewSourceMode())
                            m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
                        state = parseDoctype(src, state);
                        m_cBufferPos = cBufferPos;
                        return state;
                    }
                    break;
                } else
                    m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
            }

            // Copy tag-name characters into m_cBuffer until whitespace, '<' or '>' is seen,
            // the input runs out, or the buffer is full (ll caps the copy at CBUFLEN).
            bool finish = false;
            unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
                    finish = true;
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                // (View-source mode keeps the original case.)
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;
                src.advancePastNonNewline();
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if (finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                UChar* ptr = m_cBuffer;
                unsigned int len = cBufferPos;
                m_cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/')) {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                }
                else
                    // Start Tag
                    beginTag = true;

                // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".
                if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
                    ptr[--len] = '\0';

                // Now that we've shaved off any invalid / that might have followed the name), make the tag.
                // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
                if (ptr[0] != '!' || inViewSourceMode()) {
                    m_currentToken.tagName = AtomicString(ptr);
                    m_currentToken.beginTag = beginTag;
                }
                m_dest = m_buffer;
                state.setTagState(SearchAttribute);
                cBufferPos = 0;
            }
            break;
        }
        case SearchAttribute:
            // Skip whitespace and stray quotes until the next attribute name or the end of the tag.
            while(!src.isEmpty()) {
                UChar curchar = *src;
                // In this mode just ignore any quotes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
                    if (curchar == '<' || curchar == '>')
                        state.setTagState(SearchEnd);
                    else
                        state.setTagState(AttributeName);

                    cBufferPos = 0;
                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case AttributeName:
        {
            // Accumulate the attribute name in m_cBuffer; ll caps the copy at CBUFLEN.
            int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the
                // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
                if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
                    m_cBuffer[cBufferPos] = '\0';
                    m_attrName = AtomicString(m_cBuffer);
                    m_dest = m_buffer;
                    // Slot 0 of m_buffer is written as 0 here; the value states below
                    // build the attribute value from m_buffer + 1.
                    *m_dest++ = 0;
                    state.setTagState(SearchEqual);
                    if (inViewSourceMode())
                        m_currentToken.addViewSourceChar('a');
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;

                src.advance(m_lineNumber);
            }
            // Buffer full: end the attribute name here, exactly as if a delimiter had been seen.
            if (cBufferPos == CBUFLEN) {
                m_cBuffer[cBufferPos] = '\0';
                m_attrName = AtomicString(m_cBuffer);
                m_dest = m_buffer;
                *m_dest++ = 0;
                state.setTagState(SearchEqual);
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar('a');
            }
            break;
        }
        case SearchEqual:
            // After an attribute name: look for '='; anything else means the attribute has no value.
            while (!src.isEmpty()) {
                UChar curchar = *src;

                if (lastIsSlash && curchar == '>') {
                    // This is a quirk (with a long sad history). We have to do this
                    // since widgets do <script src="foo.js"/> and expect the tag to close.
                    if (m_currentToken.tagName == scriptTag)
                        m_currentToken.selfClosingTag = true;
                    m_currentToken.brokenXMLStyle = true;
                }

                // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
                    if (curchar == '=') {
                        state.setTagState(SearchValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        src.advancePastNonNewline();
                    } else {
                        // Valueless attribute: record it with an empty value and do not
                        // consume the current character.
                        m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        lastIsSlash = false;
                    }
                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);

                lastIsSlash = curchar == '/';

                src.advance(m_lineNumber);
            }
            break;
        case SearchValue:
            // After '=': skip whitespace, then choose between a quoted and an unquoted value.
            while (!src.isEmpty()) {
                UChar curchar = *src;
                if (!isASCIISpace(curchar)) {
                    if (curchar == '\'' || curchar == '\"') {
                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                        state.setTagState(QuotedValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        src.advancePastNonNewline();
                    } else
                        state.setTagState(Value);

                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case QuotedValue:
            // Collect value characters through m_dest until the quote recorded in tquote is seen.
            while (!src.isEmpty()) {
                checkBuffer();

                UChar curchar = *src;
                // Every character tested below ('>', '&', quotes) is <= '>', so this is a cheap pre-filter.
                if (curchar <= '>' && !src.escaped()) {
                    if (curchar == '>' && m_attrName.isEmpty()) {
                        // Handle a case like <img '>. Just go ahead and be willing
                        // to close the whole tag. Don't consume the character and
                        // just go back into SearchEnd while ignoring the whole
                        // value.
                        // FIXME: Note that this is actually not a very good solution.
                        // It doesn't handle the general case of
                        // unmatched quotes among attributes that have names. -dwh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (!attributeValue.contains('/'))
                            m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('x');
                        state.setTagState(SearchAttribute);
                        m_dest = m_buffer;
                        tquote = NoQuote;
                        break;
                    }

                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }

                    if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
                        // some <input type=hidden> rely on trailing spaces. argh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
                            m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
                            if (inViewSourceMode())
                                m_currentToken.addViewSourceChar('x');
                        } else if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');
                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        tquote = NoQuote;
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        src.advancePastNonNewline();
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case Value:
            // Unquoted value: runs until whitespace or '>'.
            while(!src.isEmpty()) {
                checkBuffer();
                UChar curchar = *src;
                if (curchar <= '>' && !src.escaped()) {
                    // parse Entities
                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }
                    // no quotes. Every space means end of value
                    // '/' does not delimit in IE!
                    if (isASCIISpace(curchar) || curchar == '>') {
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case SearchEnd:
        {
            // Skip to the closing '>' (or a new '<'), noting any '/' as a self-closing marker.
            while (!src.isEmpty()) {
                UChar ch = *src;
                if (ch == '>' || ch == '<')
                    break;
                if (ch == '/')
                    m_currentToken.selfClosingTag = true;
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(ch);
                src.advance(m_lineNumber);
            }
            // Ran out of input before the end of the tag: stay in SearchEnd and resume later.
            if (src.isEmpty())
                break;

            searchCount = 0; // Stop looking for '<!--' sequence
            state.setTagState(NoTag);
            tquote = NoQuote;

            // Consume the closing '>'; a '<' is left in place so it is seen again by the caller.
            if (*src != '<')
                src.advance(m_lineNumber);

            if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
                m_cBufferPos = cBufferPos;
                return state;
            }

            // Copy the tag name now: it is still needed after processToken() below.
            AtomicString tagName = m_currentToken.tagName;

            // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
            // compatibility.
            bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
            bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
            if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
                Attribute* a = 0;
                m_scriptTagSrcAttrValue = String();
                m_scriptTagCharsetAttrValue = String();
                // Remember the completed src URL, but only for non-fragment documents
                // whose frame has scripting enabled.
                if (m_currentToken.attrs && !m_fragment) {
                    if (m_doc->frame() && m_doc->frame()->script()->isEnabled()) {
                        if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
                            m_scriptTagSrcAttrValue = m_doc->completeURL(parseURL(a->value())).string();
                    }
                }
            }

            RefPtr<Node> n = processToken();
            m_cBufferPos = cBufferPos;
            if (n || inViewSourceMode()) {
                // For the elements below, a begin tag switches the tokenizer into parseSpecial()
                // with m_searchStopper set to the matching end marker. The m_searchStopperLength
                // values are presumably the lengths of those markers (defined elsewhere) - verify.
                if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
                    if (beginTag)
                        state.setDiscardLF(true); // Discard the first LF after we open a pre.
                } else if (tagName == scriptTag) {
                    ASSERT(!m_scriptNode);
                    m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
                    if (m_scriptNode)
                        m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
                    if (beginTag) {
                        m_searchStopper = scriptEnd;
                        m_searchStopperLength = 8;
                        state.setInScript(true);
                        state = parseSpecial(src, state);
                    } else if (isSelfClosingScript) { // Handle <script src="foo"/>
                        state.setInScript(true);
                        state = scriptHandler(state);
                    }
                } else if (tagName == styleTag) {
                    if (beginTag) {
                        m_searchStopper = styleEnd;
                        m_searchStopperLength = 7;
                        state.setInStyle(true);
                        state = parseSpecial(src, state);
                    }
                } else if (tagName == textareaTag) {
                    if (beginTag) {
                        m_searchStopper = textareaEnd;
                        m_searchStopperLength = 10;
                        state.setInTextArea(true);
                        state = parseSpecial(src, state);
                    }
                } else if (tagName == titleTag) {
                    if (beginTag) {
                        m_searchStopper = titleEnd;
                        m_searchStopperLength = 7;
                        // Snapshot everything parseSpecial() may change so it can be rolled back below.
                        State savedState = state;
                        SegmentedString savedSrc = src;
                        long savedLineno = m_lineNumber;
                        state.setInTitle(true);
                        state = parseSpecial(src, state);
                        if (state.inTitle() && src.isEmpty()) {
                            // We just ate the rest of the document as the title #text node!
                            // Reset the state then retokenize without special title handling.
                            // Let the parser clean up the missing </title> tag.
                            // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
                            // at the end of the document unless m_noMoreData is also true. We need
                            // to detect this case elsewhere, and save the state somewhere other
                            // than a local variable.
                            state = savedState;
                            src = savedSrc;
                            m_lineNumber = savedLineno;
                            m_scriptCodeSize = 0;
                        }
                    }
                } else if (tagName == xmpTag) {
                    if (beginTag) {
                        m_searchStopper = xmpEnd;
                        m_searchStopperLength = 5;
                        state.setInXmp(true);
                        state = parseSpecial(src, state);
                    }
                } else if (tagName == iframeTag) {
                    if (beginTag) {
                        m_searchStopper = iframeEnd;
                        m_searchStopperLength = 8;
                        state.setInIFrame(true);
                        state = parseSpecial(src, state);
                    }
                }
            }
            // Unlike the cases above, this runs even when processToken() returned no node.
            if (tagName == plaintextTag)
                state.setInPlainText(beginTag);
            return state; // Finished parsing tag!
        }
        } // end switch
    }
    m_cBufferPos = cBufferPos;
    return state;
}

// Decides whether the tokenizing loop may keep running or should yield.
//
// processedCount: running count of calls since the last time check; reset to 0 whenever the
//                 elapsed time is checked, incremented when the function returns true.
// startTime:      the currentTime() value against which elapsed time is measured.
// state:          tokenizer state; its allow-yield flag is read and then cleared.
// Returns false when processing should stop for now, true when it should continue.
inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
{
    // We don't want to be checking elapsed time with every character, so we only check after we've
    // processed a certain number of characters.
    bool allowedYield = state.allowYield();
    state.setAllowYield(false);
    if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
        processedCount = 0;
        if (currentTime() - startTime > m_tokenizerTimeDelay) {
            /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
               load, but this hurts overall performance on slower machines. For now turn this
               off.
            || (!m_doc->haveStylesheetsLoaded() &&
                (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
            // Schedule the timer to keep processing as soon as possible.
m_timer.startOneShot(0);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
            if (currentTime() - startTime > m_tokenizerTimeDelay)
                printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
#endif
            return false;
        }
    }

    processedCount++;
    return true;
}

// Feeds str into the tokenizer and runs the main tokenizing loop over m_src.
//
// str:        new input to tokenize.
// appendData: forwarded from the caller; together with m_executingScript it decides whether
//             the data is queued instead of parsed, and whether the preload scanner sees it.
//
// Returns false without tokenizing when there is no buffer, when the parser has been stopped,
// when the input was queued for later (a script is executing and appendData is set, or
// external scripts are pending), or when the resume timer is active. Returns true only when
// this call finished by calling end(), which (per the comment there) deletes this object;
// otherwise returns false.
bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
{
    if (!m_buffer)
        return false;

    if (m_parserStopped)
        return false;

    SegmentedString source(str);
    if (m_executingScript)
        source.setExcludeLineNumbers();

    if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
        // don't parse; we will do this later
        if (m_currentPrependingSrc)
            m_currentPrependingSrc->append(source);
        else {
            m_pendingSrc.append(source);
#if PRELOAD_SCANNER_ENABLED
            if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
                m_preloadScanner->write(source);
#endif
        }
        return false;
    }

#if PRELOAD_SCANNER_ENABLED
    if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
        m_preloadScanner->end();
#endif

    // Append to any input left over from an earlier call; otherwise start afresh.
    if (!m_src.isEmpty())
        m_src.append(source);
    else
        setSrc(source);

    // Once a timer is set, it has control of when the tokenizer continues.
    if (m_timer.isActive())
        return false;

    // write() can be re-entered; remember the outer value so it can be restored on exit.
    bool wasInWrite = m_inWrite;
    m_inWrite = true;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Beginning write at time %d\n", m_doc->elapsedTime());
#endif

    int processedCount = 0;
    double startTime = currentTime();
#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

    Frame* frame = m_doc->frame();

    // Work on a local copy of the state; it is written back to m_state after the loop.
    State state = m_state;

    while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
        if (!continueProcessing(processedCount, startTime, state))
            break;

        // do we need to enlarge the buffer?
        checkBuffer();

        UChar cc = *m_src;

        // skipLF is set after a '\r' (see below) so that the '\n' of a CRLF pair is dropped.
        bool wasSkipLF = state.skipLF();
        if (wasSkipLF)
            state.setSkipLF(false);

        if (wasSkipLF && (cc == '\n'))
            m_src.advance();
        else if (state.needsSpecialWriteHandling()) {
            // it's important to keep needsSpecialWriteHandling with the flags this block tests
            if (state.hasEntityState())
                state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
            else if (state.inPlainText())
                state = parseText(m_src, state);
            else if (state.inAnySpecial())
                state = parseSpecial(m_src, state);
            else if (state.inComment())
                state = parseComment(m_src, state);
            else if (state.inDoctype())
                state = parseDoctype(m_src, state);
            else if (state.inServer())
                state = parseServer(m_src, state);
            else if (state.inProcessingInstruction())
                state = parseProcessingInstruction(m_src, state);
            else if (state.hasTagState())
                state = parseTag(m_src, state);
            else if (state.startTag()) {
                // cc is the first character after a '<' (the start-tag flag is set in the '<' branch below).
                state.setStartTag(false);

                switch(cc) {
                case '/':
                    break;
                case '!': {
                    // <!-- comment --> or <!DOCTYPE ...>
                    searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
                    m_doctypeSearchCount = 1;
                    break;
                }
                case '?': {
                    // xml processing instruction
                    state.setInProcessingInstruction(true);
                    tquote = NoQuote;
                    state = parseProcessingInstruction(m_src, state);
                    // 'continue' targets the enclosing while loop, skipping the tag handling below.
                    continue;

                    break;
                }
                case '%':
                    if (!m_brokenServer) {
                        // <% server stuff, handle as comment %>
                        state.setInServer(true);
                        tquote = NoQuote;
                        state = parseServer(m_src, state);
                        continue;
                    }
                    // else fall through
                default: {
                    if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
                        // Start of a Start-Tag
                    } else {
                        // Invalid tag
                        // Add as is
                        *m_dest = '<';
                        m_dest++;
                        continue;
                    }
                }
                }; // end case

                // Hand whatever has accumulated so far to processToken() before starting the tag.
                processToken();

                m_cBufferPos = 0;
                state.setTagState(TagName);
                state = parseTag(m_src, state);
            }
        } else if (cc == '&' && !m_src.escaped()) {
            m_src.advancePastNonNewline();
            state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
        } else if (cc == '<' && !m_src.escaped()) {
            m_currentTagStartLineNumber = m_lineNumber;
            m_src.advancePastNonNewline();
            state.setStartTag(true);
            state.setDiscardLF(false);
        } else if (cc == '\n' || cc == '\r') {
            if (state.discardLF())
                // Ignore this LF
                state.setDiscardLF(false); // We have discarded 1 LF
            else {
                // Process this LF
                *m_dest++ = '\n';
                // NOTE(review): '\r' is counted by hand here; presumably advance(m_lineNumber)
                // below only counts '\n' - confirm against SegmentedString.
                if (cc == '\r' && !m_src.excludeLineNumbers())
                    m_lineNumber++;
            }

            /* Check for MS-DOS CRLF sequence */
            if (cc == '\r')
                state.setSkipLF(true);
            m_src.advance(m_lineNumber);
        } else {
            state.setDiscardLF(false);
            *m_dest++ = cc;
            m_src.advancePastNonNewline();
        }
    }

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Ending write at time %d\n", m_doc->elapsedTime());
#endif

    m_inWrite = wasInWrite;

    m_state = state;

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif

    if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
        end(); // this actually causes us to be deleted
        return true;
    }
    return false;
}

// Stops parsing: defers to Tokenizer::stopParsing(), cancels the resume timer and, for
// non-fragment documents that have a frame, notifies the frame loader.
void HTMLTokenizer::stopParsing()
{
    Tokenizer::stopParsing();
    m_timer.stop();

    // The part needs to know that the tokenizer has finished with its data,
    // regardless of whether it happened naturally or due to manual intervention.
    if (!m_fragment && m_doc->frame())
        m_doc->frame()->loader()->tokenizerProcessedData();
}

// True while the resume timer is pending or a write() call is in progress.
bool HTMLTokenizer::processingData() const
{
    return m_timer.isActive() || m_inWrite;
}

// Timer callback: resumes tokenizing by calling write() with empty input, unless a layout
// is pending, in which case the timer is simply restarted.
void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
{
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
#endif

#ifdef ANDROID_MOBILE
    if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay() && !m_doc->extraLayoutDelay()) {
#else
    if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
#endif
        // Restart the timer and let layout win.
This is basically a way of ensuring that the layout // timer has higher priority than our timer. m_timer.startOneShot(0); return; } // Invoke write() as though more data came in. This might cause us to get deleted. write(SegmentedString(), true); } void HTMLTokenizer::end() { ASSERT(!m_timer.isActive()); m_timer.stop(); // Only helps if assertion above fires, but do it anyway. if (m_buffer) { // parseTag is using the buffer for different matters if (!m_state.hasTagState()) processToken(); fastFree(m_scriptCode); m_scriptCode = 0; m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0; fastFree(m_buffer); m_buffer = 0; } if (!inViewSourceMode()) m_parser->finished(); else m_doc->finishedParsing(); } void HTMLTokenizer::finish() { // do this as long as we don't find matching comment ends while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) { // we've found an unmatched comment start if (m_state.inComment()) m_brokenComments = true; else m_brokenServer = true; checkScriptBuffer(); m_scriptCode[m_scriptCodeSize] = 0; m_scriptCode[m_scriptCodeSize + 1] = 0; int pos; String food; if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea()) food = String(m_scriptCode, m_scriptCodeSize); else if (m_state.inServer()) { food = "<"; food.append(m_scriptCode, m_scriptCodeSize); } else { pos = find(m_scriptCode, m_scriptCodeSize, '>'); food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1); } fastFree(m_scriptCode); m_scriptCode = 0; m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0; m_state.setInComment(false); m_state.setInServer(false); if (!food.isEmpty()) write(food, true); } // this indicates we will not receive any more data... 
but if we are waiting on // an external script to load, we can't finish parsing until that is done m_noMoreData = true; if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) end(); // this actually causes us to be deleted } PassRefPtr<Node> HTMLTokenizer::processToken() { ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0; if (scriptController && scriptController->isEnabled()) // FIXME: Why isn't this m_currentScriptTagStartLineNumber? I suspect this is wrong. scriptController->setEventHandlerLineno(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based. if (m_dest > m_buffer) { m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer); if (m_currentToken.tagName != commentAtom) m_currentToken.tagName = textAtom; } else if (m_currentToken.tagName == nullAtom) { m_currentToken.reset(); if (scriptController) scriptController->setEventHandlerLineno(m_lineNumber + 1); // Script line numbers are 1 based. 
return 0; } m_dest = m_buffer; RefPtr<Node> n; if (!m_parserStopped) { if (NamedMappedAttrMap* map = m_currentToken.attrs.get()) map->shrinkToLength(); if (inViewSourceMode()) static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken); else // pass the token over to the parser, the parser DOES NOT delete the token n = m_parser->parseToken(&m_currentToken); } m_currentToken.reset(); if (scriptController) scriptController->setEventHandlerLineno(0); return n.release(); } void HTMLTokenizer::processDoctypeToken() { if (inViewSourceMode()) static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken); else m_parser->parseDoctypeToken(&m_doctypeToken); } HTMLTokenizer::~HTMLTokenizer() { ASSERT(!m_inWrite); reset(); } void HTMLTokenizer::enlargeBuffer(int len) { int newSize = max(m_bufferSize * 2, m_bufferSize + len); int oldOffset = m_dest - m_buffer; m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar))); m_dest = m_buffer + oldOffset; m_bufferSize = newSize; } void HTMLTokenizer::enlargeScriptBuffer(int len) { int newSize = max(m_scriptCodeCapacity * 2, m_scriptCodeCapacity + len); m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar))); m_scriptCodeCapacity = newSize; } void HTMLTokenizer::executeScriptsWaitingForStylesheets() { ASSERT(m_doc->haveStylesheetsLoaded()); if (m_hasScriptsWaitingForStylesheets) notifyFinished(0); } void HTMLTokenizer::notifyFinished(CachedResource*) { #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("script loaded at %d\n", m_doc->elapsedTime()); #endif ASSERT(!m_pendingScripts.isEmpty()); // Make external scripts wait for external stylesheets. // FIXME: This needs to be done for inline scripts too. 
m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded(); if (m_hasScriptsWaitingForStylesheets) return; bool finished = false; while (!finished && m_pendingScripts.first()->isLoaded()) { CachedScript* cs = m_pendingScripts.first().get(); m_pendingScripts.removeFirst(); ASSERT(cache()->disabled() || cs->accessCount() > 0); setSrc(SegmentedString()); // make sure we forget about the script before we execute the new one // infinite recursion might happen otherwise ScriptSourceCode sourceCode(cs); bool errorOccurred = cs->errorOccurred(); cs->removeClient(this); RefPtr<Node> n = m_scriptNode.release(); #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("external script beginning execution at %d\n", m_doc->elapsedTime()); #endif if (errorOccurred) EventTargetNodeCast(n.get())->dispatchEventForType(eventNames().errorEvent, true, false); else { if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript()) m_state = scriptExecution(sourceCode, m_state); EventTargetNodeCast(n.get())->dispatchEventForType(eventNames().loadEvent, false, false); } // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution() // call above, so test afterwards. finished = m_pendingScripts.isEmpty(); if (finished) { ASSERT(!m_hasScriptsWaitingForStylesheets); m_state.setLoadingExtScript(false); #ifdef INSTRUMENT_LAYOUT_SCHEDULING if (!m_doc->ownerElement()) printf("external script finished execution at %d\n", m_doc->elapsedTime()); #endif } else if (m_hasScriptsWaitingForStylesheets) { // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution. // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive. finished = true; } // 'm_requestingScript' is true when we are called synchronously from // scriptHandler(). In that case scriptHandler() will take care // of m_pendingSrc. 
if (!m_requestingScript) { SegmentedString rest = m_pendingSrc; m_pendingSrc.clear(); write(rest, false); // we might be deleted at this point, do not access any members. } } } bool HTMLTokenizer::isWaitingForScripts() const { return m_state.loadingExtScript(); } void HTMLTokenizer::setSrc(const SegmentedString& source) { m_src = source; } void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment) { HTMLTokenizer tok(fragment); tok.setForceSynchronous(true); tok.write(source, true); tok.finish(); ASSERT(!tok.processingData()); // make sure we're done (see 3963151) } UChar decodeNamedEntity(const char* name) { const Entity* e = findEntity(name, strlen(name)); return e ? e->code : 0; } }