diff --git a/.github/workflows/cs.yaml b/.github/workflows/cs.yaml index 4b7375d..bc6575c 100644 --- a/.github/workflows/cs.yaml +++ b/.github/workflows/cs.yaml @@ -28,5 +28,5 @@ jobs: - name: cs fix run: | - wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.13.1/php-cs-fixer.phar + wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.19.0/php-cs-fixer.phar php php-cs-fixer.phar fix --dry-run --diff diff --git a/.php_cs.dist b/.php_cs.dist index d5e4918..4c16946 100644 --- a/.php_cs.dist +++ b/.php_cs.dist @@ -9,6 +9,7 @@ return PhpCsFixer\Config::create() '@Symfony' => true, 'concat_space' => array('spacing' => 'one'), 'phpdoc_annotation_without_dot' => false, + 'array_syntax' => ['syntax' => 'long'], )) ->setFinder($finder) ; diff --git a/src/HTML5.php b/src/HTML5.php index c857145..49a90da 100644 --- a/src/HTML5.php +++ b/src/HTML5.php @@ -146,7 +146,6 @@ public function hasErrors() * Parse an input string. * * @param string $input - * @param array $options * * @return \DOMDocument */ diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 293d83e..a718fe2 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -231,8 +231,6 @@ public function fragment() * * This is used for handling Processor Instructions as they are * inserted. If omitted, PI's are inserted directly into the DOM tree. - * - * @param InstructionProcessor $proc */ public function setInstructionProcessor(InstructionProcessor $proc) { @@ -302,12 +300,28 @@ public function startTag($name, $attributes = array(), $selfClosing = false) case 'head': if ($this->insertMode > static::IM_BEFORE_HEAD) { $this->parseError('Unexpected head tag outside of head context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is one of: "caption", "col", "colgroup", "frame", "head", "tbody", + // "td", "tfoot", "th", "thead", "tr" + // Parse error. Ignore the token. + return 0; } else { $this->insertMode = static::IM_IN_HEAD; } break; case 'body': - $this->insertMode = static::IM_IN_BODY; + if ($this->insertMode >= static::IM_IN_BODY) { + $this->parseError('Unexpected body tag outside of body context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is "body" + // Parse error. + // If the second element on the stack of open elements is not a body element, if the stack of open elements has only one node on it, or if there is a template element on the stack of open elements, then ignore the token. (fragment case) + // Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements, and if it is not, add the attribute and its corresponding value to that element. + return 0; + } else { + $this->insertMode = static::IM_IN_BODY; + } + break; case 'svg': $this->insertMode = static::IM_IN_SVG; @@ -322,6 +336,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false) break; } + // Case when no exists, note section on 'Anything else' below. + // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode + if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) { + $this->startTag('body'); + } + // Special case handling for SVG. if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); @@ -535,10 +555,18 @@ public function endTag($name) switch ($lname) { case 'head': - $this->insertMode = static::IM_AFTER_HEAD; + if ($this->insertMode <= static::IM_AFTER_HEAD) { + $this->insertMode = static::IM_AFTER_HEAD; + } else { + $this->parseError('Closing head tag encountered but not in head context.'); + } break; case 'body': - $this->insertMode = static::IM_AFTER_BODY; + if ($this->insertMode <= static::IM_AFTER_BODY || $this->insertMode >= static::IM_IN_SVG) { + $this->insertMode = static::IM_AFTER_BODY; + } else { + $this->parseError('Closing body tag encountered but not in body context.'); + } break; case 'svg': case 'mathml': @@ -556,21 +584,20 @@ public function comment($cdata) public function text($data) { - // XXX: Hmmm.... should we really be this strict? + // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode if ($this->insertMode < static::IM_IN_HEAD) { // Per '8.2.5.4.3 The "before head" insertion mode' the characters - // " \t\n\r\f" should be ignored but no mention of a parse error. This is - // practical as most documents contain these characters. Other text is not - // expected here so recording a parse error is necessary. + // " \t\n\r\f" should be ignored . $dataTmp = trim($data, " \t\n\r\f"); if (!empty($dataTmp)) { - // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError('Unexpected text. Ignoring: ' . $dataTmp); + $this->startTag('head'); + $this->endTag('head'); + $this->startTag('body'); + } else { + return; } - - return; } - // fprintf(STDOUT, "Appending text %s.", $data); + $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 016919a..db3799d 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -726,6 +726,7 @@ protected function isCommentEnd() // Test for '!>' if ('!' == $this->scanner->current() && '>' == $this->scanner->peek()) { $this->scanner->consume(); // Consume the last '>' + return true; } // Unread '-' and one of '!' or '>'; diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php index 00d3951..4c6983b 100644 --- a/src/HTML5/Parser/TreeBuildingRules.php +++ b/src/HTML5/Parser/TreeBuildingRules.php @@ -80,7 +80,6 @@ public function evaluate($new, $current) case 'thead': case 'tfoot': case 'table': // Spec isn't explicit about this, but it's necessary. - return $this->closeIfCurrentMatches($new, $current, array( 'thead', 'tfoot', diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 1887a8d..ccb7bf8 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -492,4 +492,47 @@ public function testAnchorTargetQueryParam() $res ); } + + /** + * Test for issue #166. + * + * @dataProvider tagOmissionProvider + */ + public function testTagOmission($input, $expected) + { + $doc = $this->html5->loadHTML($input); + $this->assertCount(0, $this->html5->getErrors()); + + $out = $this->html5->saveHTML($doc); + $this->assertRegExp('|' . preg_quote($expected, '|') . '|', $out); + } + + /** + * Tag omission test cases. + * + * @return \string[][] + */ + public function tagOmissionProvider() + { + return array( + array( + 'Hello, This is a test.
Does it work this time?', + 'Hello, This is a test.
Does it work this time?', + ), + // test whitespace (\n) + array( + ' + + + +
+ +', + ' + +
+', + ), + ); + } } diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 659378c..9e9c003 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -6,9 +6,9 @@ namespace Masterminds\HTML5\Tests\Parser; +use Masterminds\HTML5\Parser\DOMTreeBuilder; use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\Tokenizer; -use Masterminds\HTML5\Parser\DOMTreeBuilder; /** * These tests are functional, not necessarily unit tests. @@ -457,14 +457,34 @@ public function testText() $data = $wrapper->childNodes->item(0); $this->assertEquals(XML_TEXT_NODE, $data->nodeType); $this->assertEquals('test', $data->data); + } + public function testTextBeforeHeadNotAllowed() + { // The DomTreeBuilder has special handling for text when in before head mode. - $html = ' - Foo'; + $html = 'Footest'; $doc = $this->parse($html); - $this->assertEquals('Line 0, Col 0: Unexpected text. Ignoring: Foo', $this->errors[0]); - $headElement = $doc->documentElement->firstChild; - $this->assertEquals('head', $headElement->tagName); + + $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'Footest'); + } + + public function testHeadInBodyTriggersParseError() + { + $html = 'test'; + $doc = $this->parse($html); + + $this->assertContains('Line 0, Col 0: Unexpected head tag outside of head context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'test'); + } + + public function testBodyInBodyTriggersParseError() + { + $html = 'testba
z'; + $doc = $this->parse($html); + + $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'testba
z'); } public function testParseErrors() diff --git a/test/HTML5/Parser/ScannerTest.php b/test/HTML5/Parser/ScannerTest.php index 9f75c4d..a181c1b 100644 --- a/test/HTML5/Parser/ScannerTest.php +++ b/test/HTML5/Parser/ScannerTest.php @@ -6,8 +6,8 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\StringInputStream; use Masterminds\HTML5\Parser\Scanner; +use Masterminds\HTML5\Parser\StringInputStream; class ScannerTest extends \Masterminds\HTML5\Tests\TestCase { diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index c9643c5..374896f 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -2,9 +2,9 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\UTF8Utils; use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\Tokenizer; +use Masterminds\HTML5\Parser\UTF8Utils; class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase { diff --git a/test/HTML5/Parser/TreeBuildingRulesTest.php b/test/HTML5/Parser/TreeBuildingRulesTest.php index 45c68bc..ae17a34 100644 --- a/test/HTML5/Parser/TreeBuildingRulesTest.php +++ b/test/HTML5/Parser/TreeBuildingRulesTest.php @@ -6,10 +6,10 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\TreeBuildingRules; -use Masterminds\HTML5\Parser\Tokenizer; -use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\DOMTreeBuilder; +use Masterminds\HTML5\Parser\Scanner; +use Masterminds\HTML5\Parser\Tokenizer; +use Masterminds\HTML5\Parser\TreeBuildingRules; /** * These tests are functional, not necessarily unit tests. diff --git a/test/HTML5/Serializer/OutputRulesTest.php b/test/HTML5/Serializer/OutputRulesTest.php index 5741579..bb09853 100644 --- a/test/HTML5/Serializer/OutputRulesTest.php +++ b/test/HTML5/Serializer/OutputRulesTest.php @@ -2,9 +2,9 @@ namespace Masterminds\HTML5\Tests\Serializer; +use Masterminds\HTML5; use Masterminds\HTML5\Serializer\OutputRules; use Masterminds\HTML5\Serializer\Traverser; -use Masterminds\HTML5; class OutputRulesTest extends \Masterminds\HTML5\Tests\TestCase {