diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index e9bca4e..8f3df88 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -322,6 +322,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false) break; } + // Case when no <body> element exists; see the 'Anything else' rule of the section linked below. + // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode + if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) { + $this->startTag('body'); + } + // Special case handling for SVG. if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); @@ -548,21 +554,20 @@ public function comment($cdata) public function text($data) { - // XXX: Hmmm.... should we really be this strict? + // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode if ($this->insertMode < static::IM_IN_HEAD) { // Per '8.2.5.4.3 The "before head" insertion mode' the characters - // " \t\n\r\f" should be ignored but no mention of a parse error. This is - // practical as most documents contain these characters. Other text is not - // expected here so recording a parse error is necessary. + // " \t\n\r\f" should be ignored. $dataTmp = trim($data, " \t\n\r\f"); - if (!empty($dataTmp)) { - // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError('Unexpected text. Ignoring: ' . $dataTmp); + if (! 
empty($dataTmp)) { + $this->startTag('head'); + $this->endTag('head'); + $this->startTag('body'); + } else { + return; } - - return; } - // fprintf(STDOUT, "Appending text %s.", $data); + $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 26d52fe..b181d53 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -480,4 +480,50 @@ public function testCDATA() $res = $this->cycleFragment('a a test. ]]>b'); $this->assertRegExp('| a test\. \]\]>|', $res); } + + /** + * Test for issue #166. + * + * @param $input + * @param $expected + * + * @dataProvider tagOmissionProvider + */ + public function testTagOmission($input, $expected) + { + $doc = $this->html5->loadHTML($input); + + $out = $this->html5->saveHTML($doc); + + $this->assertRegExp("|" . preg_quote($expected, "|") . "|", $out); + } + + /** + * Tag omission test cases. + * + * @return \string[][] + */ + public function tagOmissionProvider() + { + return $provider = array( + array( + 'Hello, This is a test.
Does it work this time?', + 'Hello, This is a test.
Does it work this time?', + ), + // test whitespace (\n) + array( + ' + + + +
+ +', + ' + +
+' + ), + ); + } }