From 8e1f170924cc83b77eb8293edc5416bf0f16f8b4 Mon Sep 17 00:00:00 2001 From: Constantin Hong Date: Thu, 2 May 2024 20:41:31 +0900 Subject: [PATCH 01/37] html_tools/fix: Add forest_transplanting to handle invalid DOM --- changedetectionio/html_tools.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index a03653b9eda..96ffbec997a 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -110,6 +110,24 @@ def elementpath_tostring(obj): return str(obj) +def forest_transplanting(root): + """ + libxml2 violates DOM rules. it means there can be multiple root element + nodes. So I choose just transplating them to a new root by default. + See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 + This will emulate xpath1 of html of libxml2 like '/html[2]/*'. + To make this function work, 'fragment=True' in elementpath.select is required. + """ + from lxml import etree + from itertools import chain + root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] + root_siblings_preceding.reverse() + root_siblings = [s for s in root.itersiblings()] + new_root = etree.Element("new_root") + for node in chain(root_siblings_preceding, [root], root_siblings): + new_root.append(node) + return new_root + # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): from lxml import etree, html @@ -123,9 +141,10 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False parser = etree.XMLParser(strip_cdata=False) tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) + tree = forest_transplanting(tree) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) + r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True) #@note: //title/text() wont work where CDATA.. if type(r) != list: From 1f776ff8f69f1a19060f55ae9a6df13177d97d74 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Thu, 2 May 2024 20:43:45 +0900 Subject: [PATCH 02/37] requirements/fix: Upgrade and pin elementpath to support fragment option --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 76e88c8f42f..3084e5ab20d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,7 +55,7 @@ beautifulsoup4 lxml >=4.8.0,<6 # XPath 2.0-3.1 support - 4.2.0 broke something? -elementpath==4.1.5 +elementpath==4.4.0 selenium~=4.14.0 From bf5c2c7b0055bc25fc37a033504d2698c3a5d7f8 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 3 May 2024 02:16:22 +0900 Subject: [PATCH 03/37] html_tools/fix: --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 96ffbec997a..a3ca75f579a 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -144,7 +144,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False tree = forest_transplanting(tree) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True) + r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True, item=tree[0]) #@note: //title/text() wont work where <title>CDATA.. if type(r) != list: From 9f0cb3544f3316d7324f5bd4938bd41f7dacab46 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 3 May 2024 02:34:32 +0900 Subject: [PATCH 04/37] html_tools/fix: Another option --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index a3ca75f579a..0ea3401bd60 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -144,7 +144,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False tree = forest_transplanting(tree) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True, item=tree[0]) + r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True, item=tree) #@note: //title/text() wont work where <title>CDATA.. if type(r) != list: From 879d0b2c06644826a1042930122f8e5897e1009b Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Tue, 7 May 2024 15:27:52 +0900 Subject: [PATCH 05/37] html_tools/fix: --- changedetectionio/html_tools.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 0ea3401bd60..a53d86fa8a4 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -121,12 +121,28 @@ def forest_transplanting(root): from lxml import etree from itertools import chain root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] - root_siblings_preceding.reverse() root_siblings = [s for s in root.itersiblings()] - new_root = etree.Element("new_root") - for node in chain(root_siblings_preceding, [root], root_siblings): - new_root.append(node) - return new_root + + Is_fragment=False + # If element node exsits in root element node's sibilings, it is fragment. + for node in chain(root_siblings_preceding, root_siblings): + if not hasattr(node.tag, '__name__'): + Is_fragment=True + # early exit. because the root is already root element. + # So, two root element nodes are detected. DOM violation. + break + + if Is_fragment: + new_root = etree.Element("new_root") + root_siblings_preceding.reverse() + #tree = etree.ElementTree(new_root) + for node in chain(root_siblings_preceding, [root], root_siblings): + new_root.append(node) + #print(new_root.getchildren()) + return new_root, True + + return root, False + # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): @@ -141,10 +157,10 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False parser = etree.XMLParser(strip_cdata=False) tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) - tree = forest_transplanting(tree) + tree, is_fragment = forest_transplanting(tree) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True, item=tree) + r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment) #@note: //title/text() wont work where <title>CDATA.. if type(r) != list: From ed2aaf4cab615fe5cdfd9316dc22e035afad99f1 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Tue, 7 May 2024 22:56:52 +0900 Subject: [PATCH 06/37] tests/test_xpath_selector_unit/test: Add test. --- .../tests/test_xpath_selector_unit.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index b4dda08068a..3f08de17ea5 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -201,3 +201,27 @@ def test_trips(html_content, xpath, answer): html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str assert answer in html_content + +DOM_violation_two_html_root_element = ="""<!DOCTYPE html> +<html> + <body> + <h1>Hello absurd world</h1> + <p>First paragraph.</p> + </body> +</html> +<html> + <body> + <h1>Hello absurd world</h1> + <p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p> + <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p> + </body> +</html>""" + +@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) +@pytest.mark.parametrize("xpath, answer", [ + ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ]) +def test_trips(html_content, xpath, answer): + html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) + assert type(html_content) == str + assert answer not in html_content From dd8b4fe9222b4cb129bd12fab9e2805d222e2d49 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Tue, 7 May 2024 22:58:47 +0900 Subject: [PATCH 07/37] html_tools/docs: Remove comments --- changedetectionio/html_tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index a53d86fa8a4..6ae4ef07587 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -135,10 +135,8 @@ def forest_transplanting(root): if Is_fragment: new_root = etree.Element("new_root") root_siblings_preceding.reverse() - #tree = etree.ElementTree(new_root) for node in chain(root_siblings_preceding, [root], root_siblings): new_root.append(node) - #print(new_root.getchildren()) return new_root, True return root, False From fbd55129eddcf3ac3e5cf0964de407f4a3f75f08 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Tue, 7 May 2024 23:09:57 +0900 Subject: [PATCH 08/37] tests/test_xpath_selector_unit/fix: Typo --- changedetectionio/tests/test_xpath_selector_unit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 3f08de17ea5..131054c5610 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -202,7 +202,7 @@ def test_trips(html_content, xpath, answer): assert type(html_content) == str assert answer in html_content -DOM_violation_two_html_root_element = ="""<!DOCTYPE html> +DOM_violation_two_html_root_element = """<!DOCTYPE html> <html> <body> <h1>Hello absurd world</h1> @@ -216,7 +216,6 @@ def test_trips(html_content, xpath, answer): <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p> </body> </html>""" - @pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) @pytest.mark.parametrize("xpath, answer", [ ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), From 20195e7a79b279b3f92ca1b135534066cf5d8d98 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Tue, 7 May 2024 23:25:45 +0900 Subject: [PATCH 09/37] tests/test_xpath_selector_unit/test: Fix test and add more small tests for fragment --- changedetectionio/tests/test_xpath_selector_unit.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 131054c5610..7b9c57d10be 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -205,22 +205,27 @@ def test_trips(html_content, xpath, answer): DOM_violation_two_html_root_element = """<!DOCTYPE html> <html> <body> - <h1>Hello absurd world</h1> + <h1>Hello world</h1> <p>First paragraph.</p> </body> </html> <html> <body> - <h1>Hello absurd world</h1> + <h1>Hello world</h1> <p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p> <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p> </body> </html>""" @pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) @pytest.mark.parametrize("xpath, answer", [ + ("/html/body/p[1]", "First paragraph."), ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("//html/body/p[1]", "First paragraph."), + ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("//body/p[1]", "First paragraph."), + ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) def test_trips(html_content, xpath, answer): html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str - assert answer not in html_content + assert answer in html_content From 220f484ee97e049b11a8901c298c2ad8cb9fec4c Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 00:00:08 +0900 Subject: [PATCH 10/37] tests/test_xpath_selector_unit/test: Check error occurs. --- changedetectionio/tests/test_xpath_selector_unit.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 7b9c57d10be..315a50fbc3b 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -226,6 +226,18 @@ def test_trips(html_content, xpath, answer): ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) def test_trips(html_content, xpath, answer): + + + # In normal situation, DOM's root element node is only one. So Exception occurs. + with pytest.raises(Exception): + from lxml import etree, html + import elementpath + from elementpath.xpath3 import XPath3Parser + parser = etree.HTMLParser() + tree = html.fromstring(bytes(doc, encoding='utf-8'), parser=parser) + # Error will occur. + r = elementpath.select(tree, path.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) + html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str assert answer in html_content From e84b9f1c0f129209b8ef911ff7eca11e0a1f5981 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 00:04:02 +0900 Subject: [PATCH 11/37] tests/test_xpath_selector_unit/test: Fix --- changedetectionio/tests/test_xpath_selector_unit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 315a50fbc3b..e3ea3894023 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -234,9 +234,10 @@ def test_trips(html_content, xpath, answer): import elementpath from elementpath.xpath3 import XPath3Parser parser = etree.HTMLParser() - tree = html.fromstring(bytes(doc, encoding='utf-8'), parser=parser) + tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) + # just example xpath # Error will occur. - r = elementpath.select(tree, path.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) + r = elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str From 60777e429a183e5d5d8e669204ca2ecbb3f4c3c7 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 00:22:02 +0900 Subject: [PATCH 12/37] tests/test_xpath_selector_unit/test: Add more unintuitive tests --- .../tests/test_xpath_selector_unit.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index e3ea3894023..1045a73837c 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -224,11 +224,15 @@ def test_trips(html_content, xpath, answer): ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("//body/p[1]", "First paragraph."), ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("/html[2]/body/p[1]", "First paragraph."), + ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("//html[2]/body/p[1]", "First paragraph."), + ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) def test_trips(html_content, xpath, answer): - # In normal situation, DOM's root element node is only one. So Exception occurs. + # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. with pytest.raises(Exception): from lxml import etree, html import elementpath @@ -242,3 +246,16 @@ def test_trips(html_content, xpath, answer): html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str assert answer in html_content + +@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) +@pytest.mark.parametrize("xpath, answer", [ + ("/html[2]/body/p[1]", "First paragraph."), + ("//html[2]/body/p[1]", "First paragraph."), + ]) +def test_trips(html_content, xpath, answer): + # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. + + html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) + assert type(html_content) == str + # check the answer is not in the html_content + assert answer not in html_content From e325e029672ef3372f8d52c7a68c680b6d0342f6 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 01:04:41 +0900 Subject: [PATCH 13/37] tests/test_xpath_selector_unit/test: Trigger test again --- changedetectionio/tests/test_xpath_selector_unit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 1045a73837c..f976b67ad87 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -254,8 +254,7 @@ def test_trips(html_content, xpath, answer): ]) def test_trips(html_content, xpath, answer): # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. - html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str - # check the answer is not in the html_content + # check the answer is *not in* the html_content assert answer not in html_content From 6a2e1cf9138e7f6d4236922dfc490240175c3b3f Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 01:20:17 +0900 Subject: [PATCH 14/37] tests/test_xpath_selector_unit/fix: Trigger test again. why it doesn't work like my repo --- changedetectionio/tests/test_xpath_selector_unit.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index f976b67ad87..ebbaf23a92d 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -230,8 +230,6 @@ def test_trips(html_content, xpath, answer): ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) def test_trips(html_content, xpath, answer): - - # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. with pytest.raises(Exception): from lxml import etree, html From 55b2c6c63e3e73b408002f4c17713c843be1297d Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 01:36:04 +0900 Subject: [PATCH 15/37] tests/test_xpath_selector_unit/test: Oops fix test name --- changedetectionio/tests/test_xpath_selector_unit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index ebbaf23a92d..95bdb525f47 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -229,7 +229,7 @@ def test_trips(html_content, xpath, answer): ("//html[2]/body/p[1]", "First paragraph."), ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) -def test_trips(html_content, xpath, answer): +def test_broken_DOM_01(html_content, xpath, answer): # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. with pytest.raises(Exception): from lxml import etree, html @@ -250,7 +250,7 @@ def test_trips(html_content, xpath, answer): ("/html[2]/body/p[1]", "First paragraph."), ("//html[2]/body/p[1]", "First paragraph."), ]) -def test_trips(html_content, xpath, answer): +def test_Broken_DOM_02(html_content, xpath, answer): # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str From 93a9585fc6e2f340170796f83f799215ffd4c1f5 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 01:47:10 +0900 Subject: [PATCH 16/37] tests/test_xpath_selector_unit/test: Failed successfully --- changedetectionio/tests/test_xpath_selector_unit.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 95bdb525f47..3d8d84806c2 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -224,9 +224,7 @@ def test_trips(html_content, xpath, answer): ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("//body/p[1]", "First paragraph."), ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), - ("/html[2]/body/p[1]", "First paragraph."), ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), - ("//html[2]/body/p[1]", "First paragraph."), ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ]) def test_broken_DOM_01(html_content, xpath, answer): From e6b13c9ad3b0ec5ead2fc38494bdf1fdb03aaf40 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 02:16:37 +0900 Subject: [PATCH 17/37] tests/test_xpath_selector_unit/test: Add count test --- changedetectionio/tests/test_xpath_selector_unit.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 3d8d84806c2..0cbbb045570 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -220,6 +220,11 @@ def test_trips(html_content, xpath, answer): @pytest.mark.parametrize("xpath, answer", [ ("/html/body/p[1]", "First paragraph."), ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("count(/html/body/p[1])", "2"), + ("count(/html)", "2"), + ("count(//html)", "2"), + ("count(//body)", "2"), + ("count(/html/body)", "2"), ("//html/body/p[1]", "First paragraph."), ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("//body/p[1]", "First paragraph."), From 2e3e7811ef9dc949e17c9c4bbc34320cae1a242c Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 02:30:34 +0900 Subject: [PATCH 18/37] tests/test_xpath_selector_unit/chore: Trigger CICD --- changedetectionio/tests/test_xpath_selector_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 0cbbb045570..958e2e29b9f 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -257,5 +257,5 @@ def test_Broken_DOM_02(html_content, xpath, answer): # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) assert type(html_content) == str - # check the answer is *not in* the html_content + # Check the answer is *not in* the html_content assert answer not in html_content From c295c5e40dce6b58517ba44c34ebb0fbecf96356 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 02:50:21 +0900 Subject: [PATCH 19/37] tests/test_xpath_selector_unit/test: Add same behavior for xpath 1 --- .../tests/test_xpath_selector_unit.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 958e2e29b9f..cfee3080e45 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -259,3 +259,21 @@ def test_Broken_DOM_02(html_content, xpath, answer): assert type(html_content) == str # Check the answer is *not in* the html_content assert answer not in html_content + +@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) +@pytest.mark.parametrize("xpath, answer", [ + ("/html/body/p[1]", 2), + ("/html", 2), + ("//html", 2), + ("//body", 2), + ("/html/body", 2), + ]) +def test_Broken_DOM_03(html_content, xpath, answer): + # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. + + from lxml import etree, html + parser = etree.HTMLParser() + tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) + + # test xpath 1 + assert len(tree.xpath(xpath)) == 2 From 5acd31fb1ec347e8e07f4017940a29a8bcab55e5 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 02:52:04 +0900 Subject: [PATCH 20/37] tests/test_xpath_selector_unit/test: Fix misc --- changedetectionio/tests/test_xpath_selector_unit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index cfee3080e45..0d1ac6d36ea 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -269,8 +269,7 @@ def test_Broken_DOM_02(html_content, xpath, answer): ("/html/body", 2), ]) def test_Broken_DOM_03(html_content, xpath, answer): - # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs. - + """just test for xpath1""" from lxml import etree, html parser = etree.HTMLParser() tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) From de7b66bc8e132211ca67054847e7f81f930cbf71 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 02:54:19 +0900 Subject: [PATCH 21/37] tests/test_xpath_selector_unit/test: Fix answer --- changedetectionio/tests/test_xpath_selector_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 0d1ac6d36ea..3f2b86d282e 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -275,4 +275,4 @@ def test_Broken_DOM_03(html_content, xpath, answer): tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) # test xpath 1 - assert len(tree.xpath(xpath)) == 2 + assert len(tree.xpath(xpath)) == answer From 66a7dae381367ede985202598913b5e97019b5fb Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 8 May 2024 03:00:50 +0900 Subject: [PATCH 22/37] html_tools/docs: Fix old comment --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 6ae4ef07587..276a6219487 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -113,7 +113,7 @@ def elementpath_tostring(obj): def forest_transplanting(root): """ libxml2 violates DOM rules. it means there can be multiple root element - nodes. So I choose just transplating them to a new root by default. + nodes. So I choose just transplating them to a new root when the violation happens. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, 'fragment=True' in elementpath.select is required. From 4d266cac9f33d62ae1c662a3128d043d9a0579fd Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 10 May 2024 00:08:49 +0900 Subject: [PATCH 23/37] tests/test_xpath_selector_unit/feat: Do forest_transplanting by default --- changedetectionio/html_tools.py | 22 +++++-------------- .../tests/test_xpath_selector_unit.py | 3 +++ 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 276a6219487..8a7bbd929d6 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -123,24 +123,12 @@ def forest_transplanting(root): root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] root_siblings = [s for s in root.itersiblings()] - Is_fragment=False - # If element node exsits in root element node's sibilings, it is fragment. - for node in chain(root_siblings_preceding, root_siblings): - if not hasattr(node.tag, '__name__'): - Is_fragment=True - # early exit. because the root is already root element. - # So, two root element nodes are detected. DOM violation. - break - - if Is_fragment: - new_root = etree.Element("new_root") - root_siblings_preceding.reverse() - for node in chain(root_siblings_preceding, [root], root_siblings): - new_root.append(node) - return new_root, True - - return root, False + new_root = etree.Element("new_root") + root_siblings_preceding.reverse() + for node in chain(root_siblings_preceding, [root], root_siblings): + new_root.append(node) + return new_root, True # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 3f2b86d282e..047191ab03a 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -218,6 +218,9 @@ def test_trips(html_content, xpath, answer): </html>""" @pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) @pytest.mark.parametrize("xpath, answer", [ + (".", "First paragraph."), + ("/*", "First paragraph."), + ("/html", "First paragraph."), ("/html/body/p[1]", "First paragraph."), ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("count(/html/body/p[1])", "2"), From ebf7fd4ef3e754520718b54527d207dd4641424e Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 10 May 2024 00:36:27 +0900 Subject: [PATCH 24/37] tests/test_xpath_selector_unit/test: Fix tests --- .../tests/test_xpath_selector_unit.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 047191ab03a..0d839d5afd7 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -68,7 +68,7 @@ ("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"), ("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"), ("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"), - ("let $x := branch[@location = 'California'], $y := branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), + ("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), ("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), ("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"), ("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"), @@ -99,45 +99,45 @@ def test_hotels(html_content, xpath, answer): </branches_to_visit>""" @pytest.mark.parametrize("html_content", [branches_to_visit]) @pytest.mark.parametrize("xpath, answer", [ - ("manager[@name = 'Godot']/branch union manager[@name = 'Freya']/branch", "Area 51"), + ("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"), ("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"), - ("manager[@name = 'Godot']/branch | manager[@name = 'Freya']/branch", "Stalsk12"), + ("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"), ("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"), - ("manager/branch intersect manager[@name = 'Godot']/branch", "A place with no name"), + ("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"), ("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"), - ("manager[@name = 'Godot']/branch intersect manager[@name = 'Freya']/branch", ""), - ("manager/branch except manager[@name = 'Godot']/branch", "Barcelona"), - ("manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), + ("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""), + ("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"), + ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), - ("manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), ("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), - ("manager[@name = 'Godot']/branch[2] eq manager[@name = 'Freya']/branch[2]", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"), ("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"), - ("manager[1]/@room_no lt manager[2]/@room_no", "false"), + ("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"), ("//manager[1]/@room_no lt //manager[2]/@room_no", "false"), - ("manager[1]/@room_no gt manager[2]/@room_no", "true"), + ("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"), ("//manager[1]/@room_no gt //manager[2]/@room_no", "true"), - ("manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), + ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), - ("manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), ("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), - ("manager[@name = 'Godot']/branch = 'Area 51'", "true"), + ("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch = 'Area 51'", "true"), - ("manager[@name = 'Godot']/branch = 'Barcelona'", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"), ("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"), - ("manager[1]/@room_no > manager[2]/@room_no", "true"), + ("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"), ("//manager[1]/@room_no > //manager[2]/@room_no", "true"), - ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[1]", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"), - ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[3]", "true"), + ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"), - ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << manager[1]/branch[1]", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"), - ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> manager[1]/branch[1]", "true"), + ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"), - ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), + ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), - ("manager[1]/@name || manager[2]/@name", "GodotFreya"), + ("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"), ("//manager[1]/@name || //manager[2]/@name", "GodotFreya"), ]) def test_branches_to_visit(html_content, xpath, answer): @@ -170,10 +170,10 @@ def test_branches_to_visit(html_content, xpath, answer): ("(1 + 9 * 9 + 5) div 6", "14.5"), ("23 idiv 3", "7"), ("23 div 3", "7.66666666"), - ("for $i in ./trip return $i/traveler/duration * $i/traveler/price", "21002.04"), - ("for $i in ./trip return $i/traveler/duration ", "4"), + ("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"), + ("for $i in ./trips/trip return $i/traveler/duration ", "4"), ("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"), - ("sum(for $i in ./trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), + ("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), ("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), #("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"), #("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"), From 26e4a58cba1a16aff0fc07cee0928005393d6a0c Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 10 May 2024 01:44:06 +0900 Subject: [PATCH 25/37] tests/test_xpath_selector_unit/test: Add context node related tests --- .../tests/test_xpath_selector_unit.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index 0d839d5afd7..db3d7f03dd4 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -205,22 +205,39 @@ def test_trips(html_content, xpath, answer): DOM_violation_two_html_root_element = """<!DOCTYPE html> <html> <body> - <h1>Hello world</h1> + <h1>Hello world1</h1> <p>First paragraph.</p> </body> </html> <html> <body> - <h1>Hello world</h1> + <h1>Hello world2</h1> <p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p> <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p> </body> </html>""" @pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) @pytest.mark.parametrize("xpath, answer", [ + (".", "Hello world1"), (".", "First paragraph."), + (".", "Hello world2"), + (".", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + (".", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), + ("/*", "Hello world1"), ("/*", "First paragraph."), + ("/*", "Hello world2"), + ("/*", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("/*", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), + ("html", "Hello world1"), + ("html", "First paragraph."), + ("html", "Hello world2"), + ("html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), + ("/html", "Hello world1"), ("/html", "First paragraph."), + ("/html", "Hello world2"), + ("/html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), + ("/html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), ("/html/body/p[1]", "First paragraph."), ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("count(/html/body/p[1])", "2"), From dbf4e87b3174be8c3c76cf6aa0a1477520b65311 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Thu, 16 May 2024 14:33:13 +0900 Subject: [PATCH 26/37] requirements/chore: Change minimum version of elementpath --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3084e5ab20d..36017f4a985 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,7 +55,7 @@ beautifulsoup4 lxml >=4.8.0,<6 # XPath 2.0-3.1 support - 4.2.0 broke something? -elementpath==4.4.0 +elementpath>=4.2.1 selenium~=4.14.0 From 7cd764f101cb994acafa768f1547674672c805fc Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Fri, 17 May 2024 18:27:17 +0900 Subject: [PATCH 27/37] html_tools/fix: Improve speed for function calls --- changedetectionio/html_tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 8a7bbd929d6..3425dc0809c 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -9,6 +9,10 @@ from xml.sax.saxutils import escape as xml_escape import json import re +from itertools import chain +from elementpath import select as elementpath_select +# xpath 2.0-3.1 +from elementpath.xpath3 import XPath3Parser # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis @@ -119,7 +123,7 @@ def forest_transplanting(root): To make this function work, 'fragment=True' in elementpath.select is required. """ from lxml import etree - from itertools import chain + root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] root_siblings = [s for s in root.itersiblings()] @@ -133,9 +137,6 @@ def forest_transplanting(root): # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): from lxml import etree, html - import elementpath - # xpath 2.0-3.1 - from elementpath.xpath3 import XPath3Parser parser = etree.HTMLParser() if is_rss: @@ -146,7 +147,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False tree, is_fragment = forest_transplanting(tree) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment) + r = elementpath_select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment) #@note: //title/text() wont work where <title>CDATA.. if type(r) != list: From 361987796e8e98729be6e39eb88b406a91b14cee Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Sun, 26 May 2024 19:07:14 +0900 Subject: [PATCH 28/37] Revert "html_tools/docs: Fix old comment" This reverts commit 66a7dae381367ede985202598913b5e97019b5fb. --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 3425dc0809c..a4e283d6dad 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -117,7 +117,7 @@ def elementpath_tostring(obj): def forest_transplanting(root): """ libxml2 violates DOM rules. it means there can be multiple root element - nodes. So I choose just transplating them to a new root when the violation happens. + nodes. So I choose just transplating them to a new root by default. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, 'fragment=True' in elementpath.select is required. From 827f81a293e9f6083f25c537a8ffef5717c4cd1d Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Thu, 1 Aug 2024 18:32:28 +0900 Subject: [PATCH 29/37] Update html_tools.py description add precise description --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 74c5fef15e8..1ca26e2f786 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -114,7 +114,7 @@ def elementpath_tostring(obj): def forest_transplanting(root): """ - libxml2 violates DOM rules. it means there can be multiple root element + The html parser of libxml2 violates DOM rules. It means there can be multiple root element nodes. So I choose just transplating them to a new root by default. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate xpath1 of html of libxml2 like '/html[2]/*'. From e6ac28598a2d435656a24dc8657834933b079438 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 02:40:08 +0900 Subject: [PATCH 30/37] Revert "tests/test_xpath_selector_unit/test: Fix tests" This reverts commit ebf7fd4ef3e754520718b54527d207dd4641424e. --- .../tests/test_xpath_selector_unit.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py index db3d7f03dd4..e56ef823c36 100644 --- a/changedetectionio/tests/test_xpath_selector_unit.py +++ b/changedetectionio/tests/test_xpath_selector_unit.py @@ -68,7 +68,7 @@ ("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"), ("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"), ("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"), - ("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), + ("let $x := branch[@location = 'California'], $y := branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), ("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), ("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"), ("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"), @@ -99,45 +99,45 @@ def test_hotels(html_content, xpath, answer): </branches_to_visit>""" @pytest.mark.parametrize("html_content", [branches_to_visit]) @pytest.mark.parametrize("xpath, answer", [ - ("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"), + ("manager[@name = 'Godot']/branch union manager[@name = 'Freya']/branch", "Area 51"), ("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"), - ("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"), + ("manager[@name = 'Godot']/branch | manager[@name = 'Freya']/branch", "Stalsk12"), ("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"), - ("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"), + ("manager/branch intersect manager[@name = 'Godot']/branch", "A place with no name"), ("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"), - ("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""), - ("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"), - ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), + ("manager[@name = 'Godot']/branch intersect manager[@name = 'Freya']/branch", ""), + ("manager/branch except manager[@name = 'Godot']/branch", "Barcelona"), + ("manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), + ("manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), ("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), - ("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"), + ("manager[@name = 'Godot']/branch[2] eq manager[@name = 'Freya']/branch[2]", "false"), ("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"), - ("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"), + ("manager[1]/@room_no lt manager[2]/@room_no", "false"), ("//manager[1]/@room_no lt //manager[2]/@room_no", "false"), - ("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"), + ("manager[1]/@room_no gt manager[2]/@room_no", "true"), ("//manager[1]/@room_no gt //manager[2]/@room_no", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), + ("manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), + ("manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), ("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), - ("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"), + ("manager[@name = 'Godot']/branch = 'Area 51'", "true"), ("//manager[@name = 'Godot']/branch = 'Area 51'", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"), + ("manager[@name = 'Godot']/branch = 'Barcelona'", "false"), ("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"), - ("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"), + ("manager[1]/@room_no > manager[2]/@room_no", "true"), ("//manager[1]/@room_no > //manager[2]/@room_no", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"), + ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[1]", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"), - ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"), + ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[3]", "true"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"), + ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << manager[1]/branch[1]", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"), - ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"), + ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> manager[1]/branch[1]", "true"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"), - ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), + ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), - ("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"), + ("manager[1]/@name || manager[2]/@name", "GodotFreya"), ("//manager[1]/@name || //manager[2]/@name", "GodotFreya"), ]) def test_branches_to_visit(html_content, xpath, answer): @@ -170,10 +170,10 @@ def test_branches_to_visit(html_content, xpath, answer): ("(1 + 9 * 9 + 5) div 6", "14.5"), ("23 idiv 3", "7"), ("23 div 3", "7.66666666"), - ("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"), - ("for $i in ./trips/trip return $i/traveler/duration ", "4"), + ("for $i in ./trip return $i/traveler/duration * $i/traveler/price", "21002.04"), + ("for $i in ./trip return $i/traveler/duration ", "4"), ("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"), - ("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), + ("sum(for $i in ./trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), ("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), #("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"), #("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"), From 0a0f281d805651a07b81f24c7400bf5509f0925a Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 02:41:55 +0900 Subject: [PATCH 31/37] Revert "tests/test_xpath_selector_unit/feat: Do forest_transplanting by default" This reverts commit 4d266cac9f33d62ae1c662a3128d043d9a0579fd. --- changedetectionio/html_tools.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 1ca26e2f786..990e0af1902 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -125,12 +125,24 @@ def forest_transplanting(root): root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] root_siblings = [s for s in root.itersiblings()] - new_root = etree.Element("new_root") + Is_fragment=False + # If element node exsits in root element node's sibilings, it is fragment. + for node in chain(root_siblings_preceding, root_siblings): + if not hasattr(node.tag, '__name__'): + Is_fragment=True + # early exit. because the root is already root element. + # So, two root element nodes are detected. DOM violation. + break + + if Is_fragment: + new_root = etree.Element("new_root") + root_siblings_preceding.reverse() + for node in chain(root_siblings_preceding, [root], root_siblings): + new_root.append(node) + return new_root, True + + return root, False - root_siblings_preceding.reverse() - for node in chain(root_siblings_preceding, [root], root_siblings): - new_root.append(node) - return new_root, True # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): From 322382096c1b88abc1a03167c184cd7ab335ab37 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 02:50:59 +0900 Subject: [PATCH 32/37] Reapply "html_tools/docs: Fix old comment" This reverts commit 361987796e8e98729be6e39eb88b406a91b14cee. --- changedetectionio/html_tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 990e0af1902..6bbe2236064 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -114,11 +114,12 @@ def elementpath_tostring(obj): def forest_transplanting(root): """ - The html parser of libxml2 violates DOM rules. It means there can be multiple root element - nodes. So I choose just transplating them to a new root by default. - See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 - This will emulate xpath1 of html of libxml2 like '/html[2]/*'. - To make this function work, 'fragment=True' in elementpath.select is required. + The html parser of libxml2 violates DOM rules. It means there can be + multiple root element nodes. So I choose just transplating them to a new + root when the violation happens. See also, + https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate + xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, + 'fragment=True' in elementpath.select is required. """ from lxml import etree From 93950c0f3db1791d6cb702554ae013bd28e5946f Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 03:23:52 +0900 Subject: [PATCH 33/37] Update html_tools.py to trigger test just blanks --- changedetectionio/html_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 6bbe2236064..dc21d034477 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -123,14 +123,14 @@ def forest_transplanting(root): """ from lxml import etree - root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] + root_siblings_preceding = [s for s in root.itersiblings(preceding=True)] root_siblings = [s for s in root.itersiblings()] - Is_fragment=False + Is_fragment = False # If element node exsits in root element node's sibilings, it is fragment. for node in chain(root_siblings_preceding, root_siblings): if not hasattr(node.tag, '__name__'): - Is_fragment=True + Is_fragment = True # early exit. because the root is already root element. # So, two root element nodes are detected. DOM violation. break From 0e66cb072eaf36775a815bf331b1e9c8094abf1d Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 04:06:04 +0900 Subject: [PATCH 34/37] Update html_tools.py document for trigger test --- changedetectionio/html_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index dc21d034477..ae13e636b1d 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -73,7 +73,7 @@ def element_removal(selectors: List[str], html_content): def elementpath_tostring(obj): """ - change elementpath.select results to string type + change elementpath.select results(XDM) to string type # The MIT License (MIT), Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati) # https://github.com/sissaschool/elementpath/blob/dfcc2fd3d6011b16e02bf30459a7924f547b47d0/elementpath/xpath_tokens.py#L1038 """ @@ -116,9 +116,9 @@ def forest_transplanting(root): """ The html parser of libxml2 violates DOM rules. It means there can be multiple root element nodes. So I choose just transplating them to a new - root when the violation happens. See also, + root when the violation happens. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate - xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, + xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, 'fragment=True' in elementpath.select is required. """ from lxml import etree From 889fdbbcffe2b4b19b5287c51397a4a89cf3814d Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Wed, 11 Sep 2024 04:38:50 +0900 Subject: [PATCH 35/37] Update html_tools.py comment to trigger test --- changedetectionio/html_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index ae13e636b1d..6ff4c4e6f7d 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -116,7 +116,7 @@ def forest_transplanting(root): """ The html parser of libxml2 violates DOM rules. It means there can be multiple root element nodes. So I choose just transplating them to a new - root when the violation happens. See also, + root element when the violation happens. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, 'fragment=True' in elementpath.select is required. From 4043e9adb48eef8913ecf947cae00509d30331c2 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Sat, 14 Sep 2024 02:28:15 +0900 Subject: [PATCH 36/37] html_tools/feat: Add logger for forest transplanting. --- changedetectionio/html_tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 6ff4c4e6f7d..20fbc0356e1 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -11,6 +11,7 @@ from elementpath import select as elementpath_select # xpath 2.0-3.1 from elementpath.xpath3 import XPath3Parser +from loguru import logger # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis @@ -136,6 +137,7 @@ def forest_transplanting(root): break if Is_fragment: + logger.debug("forest_transplanting is triggered.") new_root = etree.Element("new_root") root_siblings_preceding.reverse() for node in chain(root_siblings_preceding, [root], root_siblings): From 912470fb0d3bb4ba674f60025f6df90c0a2f53d6 Mon Sep 17 00:00:00 2001 From: Constantin Hong <hongconstantin@gmail.com> Date: Sat, 14 Sep 2024 04:09:59 +0900 Subject: [PATCH 37/37] html_tools/docs: Add string to trigger test --- changedetectionio/html_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index d4d920461b5..83277d73be0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -116,7 +116,8 @@ def forest_transplanting(root): root element when the violation happens. See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716 This will emulate xpath1 of html of libxml2 like '/html[2]/*'. To make this function work, - 'fragment=True' in elementpath.select is required. + 'fragment=True' in elementpath.select is required. This part is where I + violates the spec. """ from lxml import etree