Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a reset function to the parser so that it can be reused (less mem) #185

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 146 additions & 1 deletion lib/HTMLParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -1947,6 +1947,7 @@ function HTMLParser(address, fragmentContext, options) {
var leftovers = "";
var first_batch = true;
var paused = 0; // Becomes non-zero while loading scripts
var hasParsedAnything = false; // Has parsed anything


// Tokenizer state
Expand Down Expand Up @@ -1999,6 +2000,148 @@ function HTMLParser(address, fragmentContext, options) {
* return value and defines the public API of the parser
*/
var htmlparser = {
reset: function(address, fragmentContext, options) {
if (!hasParsedAnything) return
/***
* These are the parser's state variables
*/
// Scanner state
chars = null;
numchars = 0; // Length of chars
nextchar = 0; // Index of next char
input_complete = false; // Becomes true when end() called.
scanner_skip_newline = false; // If previous char was CR
reentrant_invocations = 0;
saved_scanner_state = [];
leftovers = "";
first_batch = true;
paused = 0; // Becomes non-zero while loading scripts
hasParsedAnything = false;


// Tokenizer state
tokenizer = data_state; // Current tokenizer state
return_state;
character_reference_code;
tagnamebuf = "";
lasttagname = ""; // holds the target end tag for text states
tempbuf = [];
attrnamebuf = "";
attrvaluebuf = "";
commentbuf = [];
doctypenamebuf = [];
doctypepublicbuf = [];
doctypesystembuf = [];
attributes = [];
is_end_tag = false;

// Tree builder state
parser = initial_mode; // Current insertion mode
originalInsertionMode = null; // A saved insertion mode
templateInsertionModes = []; // Stack of template insertion modes.
stack = new HTMLParser.ElementStack(); // Stack of open elements
afe = new HTMLParser.ActiveFormattingElements(); // mis-nested tags
fragment = (fragmentContext!==undefined); // For innerHTML, etc.
head_element_pointer = null;
form_element_pointer = null;
scripting_enabled = true;
if (fragmentContext) {
scripting_enabled = fragmentContext.ownerDocument._scripting_enabled;
}
if (options && options.scripting_enabled === false)
scripting_enabled = false;
frameset_ok = true;
force_quirks = false;
pending_table_text;
text_integration_mode; // XXX a spec bug workaround?

// A single run of characters, buffered up to be sent to
// the parser as a single string.
textrun = [];
textIncludesNUL = false;
ignore_linefeed = false;


// This is the document we'll be building up
doc = new Document(true, address);

// The document needs to know about the parser, for document.write().
// This _parser property will be deleted when we're done parsing.
doc._parser = htmlparser;

// XXX I think that any document we use this parser on should support
// scripts. But I may need to configure that through a parser parameter
// Only documents with windows ("browsing contexts" to be precise)
// allow scripting.
doc._scripting_enabled = scripting_enabled;


/***
* The actual code of the HTMLParser() factory function begins here.
*/

if (fragmentContext) { // for innerHTML parsing
if (fragmentContext.ownerDocument._quirks)
doc._quirks = true;
if (fragmentContext.ownerDocument._limitedQuirks)
doc._limitedQuirks = true;

// Set the initial tokenizer state
if (fragmentContext.namespaceURI === NAMESPACE.HTML) {
switch(fragmentContext.localName) {
case "title":
case "textarea":
tokenizer = rcdata_state;
break;
case "style":
case "xmp":
case "iframe":
case "noembed":
case "noframes":
case "script":
case "plaintext":
tokenizer = plaintext_state;
break;
case "noscript":
if (scripting_enabled)
tokenizer = plaintext_state;
}
}

var root = doc.createElement("html");
doc._appendChild(root);
stack.push(root);
if (fragmentContext instanceof impl.HTMLTemplateElement) {
templateInsertionModes.push(in_template_mode);
}
resetInsertionMode();

for(var e = fragmentContext; e !== null; e = e.parentElement) {
if (e instanceof impl.HTMLFormElement) {
form_element_pointer = e;
break;
}
}
}

// The in_table insertion mode turns on this flag, and that makes
// insertHTMLElement use the foster parenting algorithm for elements
// tags inside a table
foster_parent_mode = false;

// Used by the adoptionAgency() function
BOOKMARK = {localName:"BM"};

bogus_comment_state.lookahead = ">";
markup_declaration_open_state.lookahead = 7;
after_doctype_name_state.lookahead = 6;
// We might need to pause tokenization until we have enough characters
// in the buffer for longest possible character reference.
named_character_reference_state.lookahead = -NAMEDCHARREF_MAXLEN;

insertToken = htmlparser.insertToken = ogInsertToken;
},

// Return the value currently held in the enclosing scope's `doc` variable.
document: function() {
return doc;
},
Expand Down Expand Up @@ -2036,6 +2179,7 @@ function HTMLParser(address, fragmentContext, options) {
// The second argument must not be set for recursive invocations
// from document.write()
parse: function(s, end, shouldPauseFunc) {
hasParsedAnything = true;
var moreToDo;

// If we're paused, remember the text to parse, but
Expand Down Expand Up @@ -2516,9 +2660,10 @@ function HTMLParser(address, fragmentContext, options) {
doc.modclock = 1; // Start tracking modifications
}

var ogInsertToken
// Insert a token, either using the current parser insertion mode
// (for HTML stuff) or using the insertForeignToken() method.
var insertToken = htmlparser.insertToken = function insertToken(t, value, arg3, arg4) {
var insertToken = ogInsertToken = htmlparser.insertToken = function insertToken(t, value, arg3, arg4) {
flushText();
var current = stack.top;

Expand Down
5 changes: 3 additions & 2 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@ exports.createDOMImplementation = function() {
return new DOMImplementation(null);
};

const parser = new HTMLParser();
/**
 * Parse `html` with an HTMLParser and return the parser's document.
 * When `html` is falsy and `force` is not set, return
 * `new DOMImplementation(null).createHTMLDocument("")` instead.
 */
exports.createDocument = function(html, force) {
// Previous API couldn't let you pass '' as a document, and that
// yields a slightly different document than createHTMLDocument('')
// does. The new `force` parameter lets you pass '' if you want to.
if (html || force) {
// NOTE(review): this function-scoped `var parser` shadows the module-level
// `const parser` declared just above this function, so every call still
// builds a new HTMLParser. If the shared parser is meant to be reused,
// this declaration has to go -- confirm which form is intended.
var parser = new HTMLParser();
// NOTE(review): reset() returns early when the parser has not parsed
// anything yet, so on a freshly constructed parser this call has no effect.
parser.reset();
// The second argument (`true`) is passed as parse()'s `end` parameter.
parser.parse(html || '', true);
return parser.document();
}
return new DOMImplementation(null).createHTMLDocument("");
};

exports.createIncrementalHTMLParser = function() {
var parser = new HTMLParser();
parser.reset();
/** API for incremental parser. */
return {
/** Provide an additional chunk of text to be parsed. */
Expand Down