diff --git a/.github/workflows/cs.yaml b/.github/workflows/cs.yaml index 4b7375d..bc6575c 100644 --- a/.github/workflows/cs.yaml +++ b/.github/workflows/cs.yaml @@ -28,5 +28,5 @@ jobs: - name: cs fix run: | - wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.13.1/php-cs-fixer.phar + wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.19.0/php-cs-fixer.phar php php-cs-fixer.phar fix --dry-run --diff diff --git a/.php_cs.dist b/.php_cs.dist index d5e4918..4c16946 100644 --- a/.php_cs.dist +++ b/.php_cs.dist @@ -9,6 +9,7 @@ return PhpCsFixer\Config::create() '@Symfony' => true, 'concat_space' => array('spacing' => 'one'), 'phpdoc_annotation_without_dot' => false, + 'array_syntax' => ['syntax' => 'long'], )) ->setFinder($finder) ; diff --git a/src/HTML5.php b/src/HTML5.php index c857145..49a90da 100644 --- a/src/HTML5.php +++ b/src/HTML5.php @@ -146,7 +146,6 @@ public function hasErrors() * Parse an input string. * * @param string $input - * @param array $options * * @return \DOMDocument */ diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 293d83e..a718fe2 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -231,8 +231,6 @@ public function fragment() * * This is used for handling Processor Instructions as they are * inserted. If omitted, PI's are inserted directly into the DOM tree. 
- * - * @param InstructionProcessor $proc */ public function setInstructionProcessor(InstructionProcessor $proc) { @@ -302,12 +300,28 @@ public function startTag($name, $attributes = array(), $selfClosing = false) case 'head': if ($this->insertMode > static::IM_BEFORE_HEAD) { $this->parseError('Unexpected head tag outside of head context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is one of: "caption", "col", "colgroup", "frame", "head", "tbody", + // "td", "tfoot", "th", "thead", "tr" + // Parse error. Ignore the token. + return 0; } else { $this->insertMode = static::IM_IN_HEAD; } break; case 'body': - $this->insertMode = static::IM_IN_BODY; + if ($this->insertMode >= static::IM_IN_BODY) { + $this->parseError('Unexpected body tag outside of body context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is "body" + // Parse error. + // If the second element on the stack of open elements is not a body element, if the stack of open elements has only one node on it, or if there is a template element on the stack of open elements, then ignore the token. (fragment case) + // Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements, and if it is not, add the attribute and its corresponding value to that element. + return 0; + } else { + $this->insertMode = static::IM_IN_BODY; + } + break; case 'svg': $this->insertMode = static::IM_IN_SVG; @@ -322,6 +336,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false) break; } + // Case when no
<body> exists, note section on 'Anything else' below. + // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode + if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) { + $this->startTag('body'); + } + // Special case handling for SVG. if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); @@ -535,10 +555,18 @@ public function endTag($name) switch ($lname) { case 'head': - $this->insertMode = static::IM_AFTER_HEAD; + if ($this->insertMode <= static::IM_AFTER_HEAD) { + $this->insertMode = static::IM_AFTER_HEAD; + } else { + $this->parseError('Closing head tag encountered but not in head context.'); + } break; case 'body': - $this->insertMode = static::IM_AFTER_BODY; + if ($this->insertMode <= static::IM_AFTER_BODY || $this->insertMode >= static::IM_IN_SVG) { + $this->insertMode = static::IM_AFTER_BODY; + } else { + $this->parseError('Closing body tag encountered but not in body context.'); + } break; case 'svg': case 'mathml': @@ -556,21 +584,20 @@ public function comment($cdata) public function text($data) { - // XXX: Hmmm.... should we really be this strict? + // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode if ($this->insertMode < static::IM_IN_HEAD) { // Per '8.2.5.4.3 The "before head" insertion mode' the characters - // " \t\n\r\f" should be ignored but no mention of a parse error. This is - // practical as most documents contain these characters. Other text is not - // expected here so recording a parse error is necessary. + // " \t\n\r\f" should be ignored. $dataTmp = trim($data, " \t\n\r\f"); if (!empty($dataTmp)) { - // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError('Unexpected text. Ignoring: ' . 
$dataTmp); + $this->startTag('head'); + $this->endTag('head'); + $this->startTag('body'); + } else { + return; } - - return; } - // fprintf(STDOUT, "Appending text %s.", $data); + $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 016919a..db3799d 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -726,6 +726,7 @@ protected function isCommentEnd() // Test for '!>' if ('!' == $this->scanner->current() && '>' == $this->scanner->peek()) { $this->scanner->consume(); // Consume the last '>' + return true; } // Unread '-' and one of '!' or '>'; diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php index 00d3951..4c6983b 100644 --- a/src/HTML5/Parser/TreeBuildingRules.php +++ b/src/HTML5/Parser/TreeBuildingRules.php @@ -80,7 +80,6 @@ public function evaluate($new, $current) case 'thead': case 'tfoot': case 'table': // Spec isn't explicit about this, but it's necessary. - return $this->closeIfCurrentMatches($new, $current, array( 'thead', 'tfoot', diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 1887a8d..ccb7bf8 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -492,4 +492,47 @@ public function testAnchorTargetQueryParam() $res ); } + + /** + * Test for issue #166. + * + * @dataProvider tagOmissionProvider + */ + public function testTagOmission($input, $expected) + { + $doc = $this->html5->loadHTML($input); + $this->assertCount(0, $this->html5->getErrors()); + + $out = $this->html5->saveHTML($doc); + $this->assertRegExp('|' . preg_quote($expected, '|') . '|', $out); + } + + /** + * Tag omission test cases. + * + * @return \string[][] + */ + public function tagOmissionProvider() + { + return array( + array( + 'Hello, This is a test.