From 8259dd4088b7b01c482f6d4db84c133f041994a6 Mon Sep 17 00:00:00 2001 From: dream2333 Date: Thu, 13 Jun 2024 20:27:20 +0800 Subject: [PATCH 1/8] Fix drop html element from a text type Selector --- parsel/selector.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..ebdcaa2 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -423,6 +423,7 @@ class Selector: "_huge_tree", "root", "_text", + "_text_lazy_html_root", "body", "__weakref__", ] @@ -507,6 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text + self._text_lazy_html_root = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -606,7 +608,9 @@ def xpath( ) else: try: - xpathev = self._get_root(self._text or "", type="html").xpath + if self._text_lazy_html_root is None: + self._text_lazy_html_root = self._get_root(self.root or "", type="html") + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -722,8 +726,12 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type in ("text", "json"): + if self.type == "json": return self.root + elif self.type == "text": + if self._text_lazy_html_root is None: + return self.root + return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False)) try: return typing.cast( str, From 70aca9bc9d5461266eca1f1262426b5420272eeb Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:24:39 +0800 Subject: [PATCH 2/8] Add testcases for drop html node --- tests/test_selector.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..754822e 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,34 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_html_in_text(self) -> None: + html = "

hello world

" + expect_result = "

hello world

" + sel = self.sscls(text=html, type="text") + self.assertEqual(sel.type, "text") + li_sel_list = sel.css("style") + li_sel_list.drop() + self.assertEqual(sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(sel.type, "text") + + def test_remove_selector_from_html_in_json(self) -> None: + json_str = """{ + "title": "hello world", + "body": "

hello world

" + } + """ + expect_result = "

hello world

" + sel = self.sscls(text=json_str) + html_sel = sel.jmespath("body")[0] + self.assertEqual(html_sel.type, "text") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "text") + + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text="" From 0c2b57a20cf1d173d78593c64a9bfa57a7f7bc79 Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:52:42 +0800 Subject: [PATCH 3/8] Add type hint --- parsel/selector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index ebdcaa2..cf33aa7 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -508,7 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text - self._text_lazy_html_root = None + self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -610,7 +610,8 @@ def xpath( try: if self._text_lazy_html_root is None: self._text_lazy_html_root = self._get_root(self.root or "", type="html") - xpathev = self._text_lazy_html_root.xpath + if self._text_lazy_html_root is not None: + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) From 9c8869a11efeb0b1f7530e8f86f9f170408a87ae Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:24:39 +0800 Subject: [PATCH 4/8] Add testcases for drop html node --- tests/test_selector.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..902f82b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,35 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_html_in_text(self) -> None: + html = ( + "

hello world

" + ) + expect_result = "

hello world

" + sel = self.sscls(text=html, type="text") + self.assertEqual(sel.type, "text") + li_sel_list = sel.css("style") + li_sel_list.drop() + self.assertEqual(sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(sel.type, "text") + + def test_remove_selector_from_html_in_json(self) -> None: + json_str = """{ + "title": "hello world", + "body": "

hello world

" + } + """ + expect_result = "

hello world

" + sel = self.sscls(text=json_str) + html_sel = sel.jmespath("body")[0] + self.assertEqual(html_sel.type, "text") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "text") + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text="
  • 1
  • 2
  • 3
" From 955abd900fc30a612ade74b19c0b240858a901fa Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 02:34:17 +0800 Subject: [PATCH 5/8] Fix drop html element from a text type Selector --- parsel/selector.py | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index ebdcaa2..a5db974 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -508,7 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text - self._text_lazy_html_root = None + self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -609,8 +609,11 @@ def xpath( else: try: if self._text_lazy_html_root is None: - self._text_lazy_html_root = self._get_root(self.root or "", type="html") - xpathev = self._text_lazy_html_root.xpath + self._text_lazy_html_root = self._get_root( + self.root or "", type="html" + ) + if self._text_lazy_html_root is not None: + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -726,29 +729,30 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type == "json": + if self.type in ("json", "text"): + if self.type == "text" and self._text_lazy_html_root is not None: + return etree.tostring( + self._text_lazy_html_root, encoding="unicode", with_tail=False + ) return self.root - elif self.type == "text": - if self._text_lazy_html_root is None: - return self.root - return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False)) - try: - return typing.cast( - str, - etree.tostring( - self.root, - method=_ctgroup[self.type]["_tostring_method"], - encoding="unicode", - with_tail=False, - ), - ) - except (AttributeError, TypeError): - if self.root is True: - return "1" - elif self.root is False: - return "0" - else: - return str(self.root) + else: + try: + return typing.cast( + str, + etree.tostring( + self.root, + method=_ctgroup[self.type]["_tostring_method"], + encoding="unicode", + with_tail=False, + ), + ) + except (AttributeError, TypeError): + if self.root is True: + return "1" + elif self.root is False: + return "0" + else: + return str(self.root) extract = get From e7c5e975fe0e47fefb0251512468b9ad18452a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Jun 2024 20:52:08 +0200 Subject: [PATCH 6/8] Support forcing a selector type into a subselector --- parsel/selector.py | 77 +++++++++++++++++++----------------------- tests/test_selector.py | 12 +++---- 2 files changed, 41 insertions(+), 48 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index a5db974..0088703 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -1,6 +1,7 @@ """XPath and JMESPath selectors based on the lxml and jmespath Python packages.""" +import builtins import json import typing import warnings @@ -141,7 +142,7 @@ def __getitem__( def __getstate__(self) -> None: raise TypeError("can't pickle SelectorList objects") - def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": + def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]": """ Call the ``.jmespath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. @@ -153,12 +154,14 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]": selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) """ - return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) + return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self])) def xpath( self, xpath: str, namespaces: Optional[Mapping[str, str]] = None, + *, + type: Optional[str] = None, **kwargs: Any, ) -> "SelectorList[_SelectorType]": """ @@ -178,17 +181,17 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) + flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self]) ) - def css(self, query: str) -> "SelectorList[_SelectorType]": + def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]": """ Call the ``.css()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. ``query`` is the same argument as the one in :meth:`Selector.css` """ - return self.__class__(flatten([x.css(query) for x in self])) + return self.__class__(flatten([x.css(query, type=type) for x in self])) def re( self, regex: Union[str, Pattern[str]], replace_entities: bool = True @@ -423,7 +426,6 @@ class Selector: "_huge_tree", "root", "_text", - "_text_lazy_html_root", "body", "__weakref__", ] @@ -508,7 +510,6 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text - self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -534,6 +535,7 @@ def _get_root( def jmespath( self: _SelectorType, query: str, + type: Optional[str] = None, **kwargs: Any, ) -> SelectorList[_SelectorType]: """ @@ -567,9 +569,9 @@ def jmespath( def make_selector(x: Any) -> _SelectorType: # closure function if isinstance(x, str): - return self.__class__(text=x, _expr=query, type="text") + return self.__class__(text=x, _expr=query, type=type or "text") else: - return self.__class__(root=x, _expr=query) + return self.__class__(root=x, _expr=query, type=type) result = [make_selector(x) for x in result] return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) @@ -578,6 +580,7 @@ def xpath( self: _SelectorType, query: str, namespaces: Optional[Mapping[str, str]] = None, + type: Optional[str] = None, **kwargs: Any, ) -> SelectorList[_SelectorType]: """ @@ -608,12 +611,7 @@ def xpath( ) else: try: - if self._text_lazy_html_root is None: - self._text_lazy_html_root = self._get_root( - self.root or "", type="html" - ) - if self._text_lazy_html_root is not None: - xpathev = self._text_lazy_html_root.xpath + xpathev = self._get_root(self._text or "", type="html").xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -632,7 +630,7 @@ def xpath( except etree.XPathError as exc: raise ValueError(f"XPath error: {exc} in {query}") - if type(result) is not list: + if builtins.type(result) is not list: result = [result] result = [ @@ -640,13 +638,13 @@ def xpath( root=x, _expr=query, namespaces=self.namespaces, - type=_xml_or_html(self.type), + type=type or _xml_or_html(self.type), ) for x in result ] return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) - def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: + def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]: """ Apply the given CSS selector and return a :class:`SelectorList` instance. @@ -659,7 +657,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: """ if self.type not in ("html", "xml", "text"): raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") - return self.xpath(self._css2xpath(query)) + return self.xpath(self._css2xpath(query), type=type) def _css2xpath(self, query: str) -> str: type = _xml_or_html(self.type) @@ -729,30 +727,25 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type in ("json", "text"): - if self.type == "text" and self._text_lazy_html_root is not None: - return etree.tostring( - self._text_lazy_html_root, encoding="unicode", with_tail=False - ) + if self.type in ("text", "json"): return self.root - else: - try: - return typing.cast( - str, - etree.tostring( - self.root, - method=_ctgroup[self.type]["_tostring_method"], - encoding="unicode", - with_tail=False, - ), - ) - except (AttributeError, TypeError): - if self.root is True: - return "1" - elif self.root is False: - return "0" - else: - return str(self.root) + try: + return typing.cast( + str, + etree.tostring( + self.root, + method=_ctgroup[self.type]["_tostring_method"], + encoding="unicode", + with_tail=False, + ), + ) + except (AttributeError, TypeError): + if self.root is True: + return "1" + elif self.root is False: + return "0" + else: + return str(self.root) extract = get diff --git a/tests/test_selector.py b/tests/test_selector.py index 902f82b..8cb2a1b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1012,13 +1012,13 @@ def test_remove_selector_from_html_in_text(self) -> None: "

hello world

" ) expect_result = "

hello world

" - sel = self.sscls(text=html, type="text") - self.assertEqual(sel.type, "text") + sel = self.sscls(text=html, type="html") + self.assertEqual(sel.type, "html") li_sel_list = sel.css("style") li_sel_list.drop() self.assertEqual(sel.get(), expect_result) # The type of the parent selector should not change - self.assertEqual(sel.type, "text") + self.assertEqual(sel.type, "html") def test_remove_selector_from_html_in_json(self) -> None: json_str = """{ @@ -1028,13 +1028,13 @@ def test_remove_selector_from_html_in_json(self) -> None: """ expect_result = "

hello world

" sel = self.sscls(text=json_str) - html_sel = sel.jmespath("body")[0] - self.assertEqual(html_sel.type, "text") + html_sel = sel.jmespath("body", type="html")[0] + self.assertEqual(html_sel.type, "html") li_sel_list = html_sel.css("style") li_sel_list.drop() self.assertEqual(html_sel.get(), expect_result) # The type of the parent selector should not change - self.assertEqual(html_sel.type, "text") + self.assertEqual(html_sel.type, "html") def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( From 122c0f02577a5b2ad8714a4d742a58e28ef03a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Jun 2024 21:03:29 +0200 Subject: [PATCH 7/8] Remove unnecessary test case --- tests/test_selector.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/tests/test_selector.py b/tests/test_selector.py index 8cb2a1b..db8d099 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,20 +1007,7 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) - def test_remove_selector_from_html_in_text(self) -> None: - html = ( - "

hello world

" - ) - expect_result = "

hello world

" - sel = self.sscls(text=html, type="html") - self.assertEqual(sel.type, "html") - li_sel_list = sel.css("style") - li_sel_list.drop() - self.assertEqual(sel.get(), expect_result) - # The type of the parent selector should not change - self.assertEqual(sel.type, "html") - - def test_remove_selector_from_html_in_json(self) -> None: + def test_remove_selector_from_nested_html(self) -> None: json_str = """{ "title": "hello world", "body": "

hello world

" @@ -1028,6 +1015,8 @@ def test_remove_selector_from_html_in_json(self) -> None: """ expect_result = "

hello world

" sel = self.sscls(text=json_str) + # We need to force the selector type to HTML to make that functionality + # readily available. html_sel = sel.jmespath("body", type="html")[0] self.assertEqual(html_sel.type, "html") li_sel_list = html_sel.css("style") From fd49654f8531d32cd5ca4c37982056c8db622eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Jun 2024 21:08:48 +0200 Subject: [PATCH 8/8] Apply pre-commit --- parsel/selector.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 0088703..1092210 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -142,7 +142,9 @@ def __getitem__( def __getstate__(self) -> None: raise TypeError("can't pickle SelectorList objects") - def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]": + def jmespath( + self, query: str, *, type: Optional[str] = None, **kwargs: Any + ) -> "SelectorList[_SelectorType]": """ Call the ``.jmespath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. @@ -154,7 +156,9 @@ def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) """ - return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self])) + return self.__class__( + flatten([x.jmespath(query, type=type, **kwargs) for x in self]) + ) def xpath( self, @@ -181,10 +185,19 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self]) + flatten( + [ + x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) + for x in self + ] + ) ) - def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]": + def css( + self, + query: str, + type: Optional[str] = None, + ) -> "SelectorList[_SelectorType]": """ Call the ``.css()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. @@ -644,7 +657,9 @@ def xpath( ] return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result)) - def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]: + def css( + self: _SelectorType, query: str, type: Optional[str] = None + ) -> SelectorList[_SelectorType]: """ Apply the given CSS selector and return a :class:`SelectorList` instance.