fgnass · a-rasin · May 7, 2024
diff --git a/lib/HTMLParser.js b/lib/HTMLParser.js
@@ -1947,6 +1947,7 @@ function HTMLParser(address, fragmentContext, options) {
   var leftovers = "";
   var first_batch = true;
   var paused = 0; // Becomes non-zero while loading scripts
+  var hasParsedAnything = false; // Set to true by parse(); reset() returns immediately while this is false
 
 
   // Tokenizer state
@@ -1999,6 +2000,148 @@ function HTMLParser(address, fragmentContext, options) {
    * return value and defines the public API of the parser
    */
   var htmlparser = {
+    reset: function(address, fragmentContext, options) {
+      if (!hasParsedAnything) return; // parse() has not run since construction or the last reset; note the arguments are ignored in this case
+      /***
+      * Reinitialize the parser state so this instance can be reused for a new input.
+      */
+      // Scanner state
+      chars = null;
+      numchars = 0; // Length of chars
+      nextchar = 0; // Index of next char
+      input_complete = false; // Becomes true when end() called.
+      scanner_skip_newline = false; // If previous char was CR
+      reentrant_invocations = 0;
+      saved_scanner_state = [];
+      leftovers = "";
+      first_batch = true;
+      paused = 0; // Becomes non-zero while loading scripts
+      hasParsedAnything = false;
+
+
+      // Tokenizer state
+      tokenizer = data_state; // Current tokenizer state
+      return_state = undefined; // Assigned explicitly: a bare `return_state;` statement would keep the previous parse's value
+      character_reference_code = undefined;
+      tagnamebuf = "";
+      lasttagname = ""; // holds the target end tag for text states
+      tempbuf = [];
+      attrnamebuf = "";
+      attrvaluebuf = "";
+      commentbuf = [];
+      doctypenamebuf = [];
+      doctypepublicbuf = [];
+      doctypesystembuf = [];
+      attributes = [];
+      is_end_tag = false;
+
+      // Tree builder state
+      parser = initial_mode; // Current insertion mode
+      originalInsertionMode = null; // A saved insertion mode
+      templateInsertionModes = []; // Stack of template insertion modes.
+      stack = new HTMLParser.ElementStack(); // Stack of open elements
+      afe = new HTMLParser.ActiveFormattingElements(); // mis-nested tags
+      fragment = (fragmentContext!==undefined); // For innerHTML, etc.
+      head_element_pointer = null;
+      form_element_pointer = null;
+      scripting_enabled = true;
+      if (fragmentContext) {
+        scripting_enabled = fragmentContext.ownerDocument._scripting_enabled;
+      }
+      if (options && options.scripting_enabled === false)
+        scripting_enabled = false;
+      frameset_ok = true;
+      force_quirks = false;
+      pending_table_text = undefined; // Cleared explicitly so text buffered by a previous parse cannot leak into this one
+      text_integration_mode = undefined; // XXX a spec bug workaround?
+
+      // A single run of characters, buffered up to be sent to
+      // the parser as a single string.
+      textrun = [];
+      textIncludesNUL = false;
+      ignore_linefeed = false;
+
+
+      // This is the document we'll be building up
+      doc = new Document(true, address);
+
+      // The document needs to know about the parser, for document.write().
+      // This _parser property will be deleted when we're done parsing.
+      doc._parser = htmlparser;
+
+      // XXX I think that any document we use this parser on should support
+      // scripts. But I may need to configure that through a parser parameter
+      // Only documents with windows ("browsing contexts" to be precise)
+      // allow scripting.
+      doc._scripting_enabled = scripting_enabled;
+
+
+      /***
+      * Fragment-parsing setup and the remaining initialization, redone for the new document.
+      */
+
+      if (fragmentContext) { // for innerHTML parsing
+        if (fragmentContext.ownerDocument._quirks)
+          doc._quirks = true;
+        if (fragmentContext.ownerDocument._limitedQuirks)
+          doc._limitedQuirks = true;
+
+        // Set the initial tokenizer state
+        if (fragmentContext.namespaceURI === NAMESPACE.HTML) {
+          switch(fragmentContext.localName) {
+          case "title":
+          case "textarea":
+            tokenizer = rcdata_state;
+            break;
+          case "style":
+          case "xmp":
+          case "iframe":
+          case "noembed":
+          case "noframes":
+          case "script":
+          case "plaintext":
+            tokenizer = plaintext_state;
+            break;
+          case "noscript":
+            if (scripting_enabled)
+              tokenizer = plaintext_state;
+          }
+        }
+
+        var root = doc.createElement("html");
+        doc._appendChild(root);
+        stack.push(root);
+        if (fragmentContext instanceof impl.HTMLTemplateElement) {
+          templateInsertionModes.push(in_template_mode);
+        }
+        resetInsertionMode();
+
+        for(var e = fragmentContext; e !== null; e = e.parentElement) {
+          if (e instanceof impl.HTMLFormElement) {
+            form_element_pointer = e;
+            break;
+          }
+        }
+      }
+
+      // The in_table insertion mode turns on this flag, and that makes
+      // insertHTMLElement use the foster parenting algorithm for elements
+      // tags inside a table
+      foster_parent_mode = false;
+
+      // Used by the adoptionAgency() function
+      BOOKMARK = {localName:"BM"};
+
+      bogus_comment_state.lookahead = ">";
+      markup_declaration_open_state.lookahead = 7;
+      after_doctype_name_state.lookahead = 6;
+      // We might need to pause tokenization until we have enough characters
+      // in the buffer for longest possible character reference.
+      named_character_reference_state.lookahead = -NAMEDCHARREF_MAXLEN;
+
+      insertToken = htmlparser.insertToken = ogInsertToken;
+    },
+
     document: function() {
       return doc;
     },
@@ -2036,6 +2179,7 @@ function HTMLParser(address, fragmentContext, options) {
     // The second argument must not be set for recursive invocations
     // from document.write()
     parse: function(s, end, shouldPauseFunc) {
+      hasParsedAnything = true;
       var moreToDo;
 
       // If we're paused, remember the text to parse, but
@@ -2516,9 +2660,10 @@ function HTMLParser(address, fragmentContext, options) {
     doc.modclock = 1; // Start tracking modifications
   }
 
+  var ogInsertToken // Holds the insertToken function defined below so reset() can reinstall it
   // Insert a token, either using the current parser insertion mode
   // (for HTML stuff) or using the insertForeignToken() method.
-  var insertToken = htmlparser.insertToken = function insertToken(t, value, arg3, arg4) {
+  var insertToken = ogInsertToken = htmlparser.insertToken = function insertToken(t, value, arg3, arg4) {
     flushText();
     var current = stack.top;
 

diff --git a/lib/index.js b/lib/index.js
@@ -7,20 +7,21 @@ exports.createDOMImplementation = function() {
   return new DOMImplementation(null);
 };
 
+const parser = new HTMLParser(); // One parser for the whole module, reset() before each use. NOTE(review): createIncrementalHTMLParser resets this same instance — confirm overlapping use is safe
 exports.createDocument = function(html, force) {
   // Previous API couldn't let you pass '' as a document, and that
   // yields a slightly different document than createHTMLDocument('')
   // does.  The new `force` parameter lets you pass '' if you want to.
   if (html || force) {
-    var parser = new HTMLParser();
+    parser.reset(); // Reinitialize the shared parser; returns immediately if it has not parsed anything yet
     parser.parse(html || '', true);
     return parser.document();
   }
   return new DOMImplementation(null).createHTMLDocument("");
 };
 
 exports.createIncrementalHTMLParser = function() {
-    var parser = new HTMLParser();
+    parser.reset();
     /** API for incremental parser. */
     return {
         /** Provide an additional chunk of text to be parsed. */