From 8259dd4088b7b01c482f6d4db84c133f041994a6 Mon Sep 17 00:00:00 2001
From: dream2333 <vincentqng@gmail.com>
Date: Thu, 13 Jun 2024 20:27:20 +0800
Subject: [PATCH 1/8] Fix dropping HTML elements from a text-type Selector

---
 parsel/selector.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/parsel/selector.py b/parsel/selector.py
index 2027599..ebdcaa2 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -423,6 +423,7 @@ class Selector:
         "_huge_tree",
         "root",
         "_text",
+        "_text_lazy_html_root",
         "body",
         "__weakref__",
     ]
@@ -507,6 +508,7 @@ def __init__(
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text
+        self._text_lazy_html_root = None
 
     def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")
@@ -606,7 +608,9 @@ def xpath(
                 )
         else:
             try:
-                xpathev = self._get_root(self._text or "", type="html").xpath
+                if self._text_lazy_html_root is None:
+                    self._text_lazy_html_root = self._get_root(self.root or "", type="html")
+                xpathev = self._text_lazy_html_root.xpath
             except AttributeError:
                 return typing.cast(
                     SelectorList[_SelectorType], self.selectorlist_cls([])
@@ -722,8 +726,12 @@ def get(self) -> Any:
         For HTML and XML, the result is always a string, and percent-encoded
         content is unquoted.
         """
-        if self.type in ("text", "json"):
+        if self.type == "json":
             return self.root
+        elif self.type == "text":
+            if self._text_lazy_html_root is None:
+                return self.root
+            return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False))
         try:
             return typing.cast(
                 str,

From 70aca9bc9d5461266eca1f1262426b5420272eeb Mon Sep 17 00:00:00 2001
From: dream2333 <vincentqng@gmail.com>
Date: Fri, 14 Jun 2024 01:24:39 +0800
Subject: [PATCH 2/8] Add test cases for dropping an HTML node

---
 tests/test_selector.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_selector.py b/tests/test_selector.py
index 96713f9..754822e 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1007,6 +1007,34 @@ def test_remove_selector(self) -> None:
         self.assertIsSelectorList(sel.css("li"))
         self.assertEqual(sel.css("li::text").getall(), ["2", "3"])
 
+    def test_remove_selector_from_html_in_text(self) -> None:
+        html = "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
+        expect_result = "<html><body><p>hello world</p></body></html>"
+        sel = self.sscls(text=html, type="text")
+        self.assertEqual(sel.type, "text")
+        li_sel_list = sel.css("style")
+        li_sel_list.drop()
+        self.assertEqual(sel.get(), expect_result)
+        # The type of the parent selector should not change
+        self.assertEqual(sel.type, "text")
+
+    def test_remove_selector_from_html_in_json(self) -> None:
+        json_str = """{
+            "title": "hello world",
+            "body": "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
+        }
+        """
+        expect_result = "<html><body><p>hello world</p></body></html>"
+        sel = self.sscls(text=json_str)
+        html_sel = sel.jmespath("body")[0]
+        self.assertEqual(html_sel.type, "text")
+        li_sel_list = html_sel.css("style")
+        li_sel_list.drop()
+        self.assertEqual(html_sel.get(), expect_result)
+        # The type of the parent selector should not change
+        self.assertEqual(html_sel.type, "text")
+
+
     def test_remove_pseudo_element_selector_list(self) -> None:
         sel = self.sscls(
             text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"

From 0c2b57a20cf1d173d78593c64a9bfa57a7f7bc79 Mon Sep 17 00:00:00 2001
From: dream2333 <vincentqng@gmail.com>
Date: Fri, 14 Jun 2024 01:52:42 +0800
Subject: [PATCH 3/8] Add type hint

---
 parsel/selector.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/parsel/selector.py b/parsel/selector.py
index ebdcaa2..cf33aa7 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -508,7 +508,7 @@ def __init__(
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text
-        self._text_lazy_html_root = None
+        self._text_lazy_html_root: Optional[etree._Element] = None
 
     def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")
@@ -610,7 +610,8 @@ def xpath(
             try:
                 if self._text_lazy_html_root is None:
                     self._text_lazy_html_root = self._get_root(self.root or "", type="html")
-                xpathev = self._text_lazy_html_root.xpath
+                if self._text_lazy_html_root is not None:
+                    xpathev = self._text_lazy_html_root.xpath
             except AttributeError:
                 return typing.cast(
                     SelectorList[_SelectorType], self.selectorlist_cls([])

From 9c8869a11efeb0b1f7530e8f86f9f170408a87ae Mon Sep 17 00:00:00 2001
From: dream2333 <vincentqng@gmail.com>
Date: Fri, 14 Jun 2024 01:24:39 +0800
Subject: [PATCH 4/8] Add test cases for dropping an HTML node

---
 tests/test_selector.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/test_selector.py b/tests/test_selector.py
index 96713f9..902f82b 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1007,6 +1007,35 @@ def test_remove_selector(self) -> None:
         self.assertIsSelectorList(sel.css("li"))
         self.assertEqual(sel.css("li::text").getall(), ["2", "3"])
 
+    def test_remove_selector_from_html_in_text(self) -> None:
+        html = (
+            "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
+        )
+        expect_result = "<html><body><p>hello world</p></body></html>"
+        sel = self.sscls(text=html, type="text")
+        self.assertEqual(sel.type, "text")
+        li_sel_list = sel.css("style")
+        li_sel_list.drop()
+        self.assertEqual(sel.get(), expect_result)
+        # The type of the parent selector should not change
+        self.assertEqual(sel.type, "text")
+
+    def test_remove_selector_from_html_in_json(self) -> None:
+        json_str = """{
+            "title": "hello world",
+            "body": "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
+        }
+        """
+        expect_result = "<html><body><p>hello world</p></body></html>"
+        sel = self.sscls(text=json_str)
+        html_sel = sel.jmespath("body")[0]
+        self.assertEqual(html_sel.type, "text")
+        li_sel_list = html_sel.css("style")
+        li_sel_list.drop()
+        self.assertEqual(html_sel.get(), expect_result)
+        # The type of the parent selector should not change
+        self.assertEqual(html_sel.type, "text")
+
     def test_remove_pseudo_element_selector_list(self) -> None:
         sel = self.sscls(
             text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"

From 955abd900fc30a612ade74b19c0b240858a901fa Mon Sep 17 00:00:00 2001
From: dream2333 <vincentqng@gmail.com>
Date: Fri, 14 Jun 2024 02:34:17 +0800
Subject: [PATCH 5/8] Fix dropping HTML elements from a text-type Selector

---
 parsel/selector.py | 54 +++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/parsel/selector.py b/parsel/selector.py
index ebdcaa2..a5db974 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -508,7 +508,7 @@ def __init__(
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text
-        self._text_lazy_html_root = None
+        self._text_lazy_html_root: Optional[etree._Element] = None
 
     def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")
@@ -609,8 +609,11 @@ def xpath(
         else:
             try:
                 if self._text_lazy_html_root is None:
-                    self._text_lazy_html_root = self._get_root(self.root or "", type="html")
-                xpathev = self._text_lazy_html_root.xpath
+                    self._text_lazy_html_root = self._get_root(
+                        self.root or "", type="html"
+                    )
+                if self._text_lazy_html_root is not None:
+                    xpathev = self._text_lazy_html_root.xpath
             except AttributeError:
                 return typing.cast(
                     SelectorList[_SelectorType], self.selectorlist_cls([])
@@ -726,29 +729,30 @@ def get(self) -> Any:
         For HTML and XML, the result is always a string, and percent-encoded
         content is unquoted.
         """
-        if self.type == "json":
+        if self.type in ("json", "text"):
+            if self.type == "text" and self._text_lazy_html_root is not None:
+                return etree.tostring(
+                    self._text_lazy_html_root, encoding="unicode", with_tail=False
+                )
             return self.root
-        elif self.type == "text":
-            if self._text_lazy_html_root is None:
-                return self.root
-            return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False))
-        try:
-            return typing.cast(
-                str,
-                etree.tostring(
-                    self.root,
-                    method=_ctgroup[self.type]["_tostring_method"],
-                    encoding="unicode",
-                    with_tail=False,
-                ),
-            )
-        except (AttributeError, TypeError):
-            if self.root is True:
-                return "1"
-            elif self.root is False:
-                return "0"
-            else:
-                return str(self.root)
+        else:
+            try:
+                return typing.cast(
+                    str,
+                    etree.tostring(
+                        self.root,
+                        method=_ctgroup[self.type]["_tostring_method"],
+                        encoding="unicode",
+                        with_tail=False,
+                    ),
+                )
+            except (AttributeError, TypeError):
+                if self.root is True:
+                    return "1"
+                elif self.root is False:
+                    return "0"
+                else:
+                    return str(self.root)
 
     extract = get
 

From e7c5e975fe0e47fefb0251512468b9ad18452a8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Fri, 14 Jun 2024 20:52:08 +0200
Subject: [PATCH 6/8] Support forcing a selector type into a subselector

---
 parsel/selector.py     | 77 +++++++++++++++++++-----------------------
 tests/test_selector.py | 12 +++----
 2 files changed, 41 insertions(+), 48 deletions(-)

diff --git a/parsel/selector.py b/parsel/selector.py
index a5db974..0088703 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -1,6 +1,7 @@
 """XPath and JMESPath selectors based on the lxml and jmespath Python
 packages."""
 
+import builtins
 import json
 import typing
 import warnings
@@ -141,7 +142,7 @@ def __getitem__(
     def __getstate__(self) -> None:
         raise TypeError("can't pickle SelectorList objects")
 
-    def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
+    def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]":
         """
         Call the ``.jmespath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -153,12 +154,14 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
 
             selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
         """
-        return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
+        return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self]))
 
     def xpath(
         self,
         xpath: str,
         namespaces: Optional[Mapping[str, str]] = None,
+        *,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> "SelectorList[_SelectorType]":
         """
@@ -178,17 +181,17 @@ def xpath(
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
         return self.__class__(
-            flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
+            flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self])
         )
 
-    def css(self, query: str) -> "SelectorList[_SelectorType]":
+    def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]":
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
 
         ``query`` is the same argument as the one in :meth:`Selector.css`
         """
-        return self.__class__(flatten([x.css(query) for x in self]))
+        return self.__class__(flatten([x.css(query, type=type) for x in self]))
 
     def re(
         self, regex: Union[str, Pattern[str]], replace_entities: bool = True
@@ -423,7 +426,6 @@ class Selector:
         "_huge_tree",
         "root",
         "_text",
-        "_text_lazy_html_root",
         "body",
         "__weakref__",
     ]
@@ -508,7 +510,6 @@ def __init__(
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text
-        self._text_lazy_html_root: Optional[etree._Element] = None
 
     def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")
@@ -534,6 +535,7 @@ def _get_root(
     def jmespath(
         self: _SelectorType,
         query: str,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> SelectorList[_SelectorType]:
         """
@@ -567,9 +569,9 @@ def jmespath(
 
         def make_selector(x: Any) -> _SelectorType:  # closure function
             if isinstance(x, str):
-                return self.__class__(text=x, _expr=query, type="text")
+                return self.__class__(text=x, _expr=query, type=type or "text")
             else:
-                return self.__class__(root=x, _expr=query)
+                return self.__class__(root=x, _expr=query, type=type)
 
         result = [make_selector(x) for x in result]
         return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
@@ -578,6 +580,7 @@ def xpath(
         self: _SelectorType,
         query: str,
         namespaces: Optional[Mapping[str, str]] = None,
+        type: Optional[str] = None,
         **kwargs: Any,
     ) -> SelectorList[_SelectorType]:
         """
@@ -608,12 +611,7 @@ def xpath(
                 )
         else:
             try:
-                if self._text_lazy_html_root is None:
-                    self._text_lazy_html_root = self._get_root(
-                        self.root or "", type="html"
-                    )
-                if self._text_lazy_html_root is not None:
-                    xpathev = self._text_lazy_html_root.xpath
+                xpathev = self._get_root(self._text or "", type="html").xpath
             except AttributeError:
                 return typing.cast(
                     SelectorList[_SelectorType], self.selectorlist_cls([])
@@ -632,7 +630,7 @@ def xpath(
         except etree.XPathError as exc:
             raise ValueError(f"XPath error: {exc} in {query}")
 
-        if type(result) is not list:
+        if builtins.type(result) is not list:
             result = [result]
 
         result = [
@@ -640,13 +638,13 @@ def xpath(
                 root=x,
                 _expr=query,
                 namespaces=self.namespaces,
-                type=_xml_or_html(self.type),
+                type=type or _xml_or_html(self.type),
             )
             for x in result
         ]
         return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
 
-    def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
+    def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]:
         """
         Apply the given CSS selector and return a :class:`SelectorList` instance.
 
@@ -659,7 +657,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
         """
         if self.type not in ("html", "xml", "text"):
             raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
-        return self.xpath(self._css2xpath(query))
+        return self.xpath(self._css2xpath(query), type=type)
 
     def _css2xpath(self, query: str) -> str:
         type = _xml_or_html(self.type)
@@ -729,30 +727,25 @@ def get(self) -> Any:
         For HTML and XML, the result is always a string, and percent-encoded
         content is unquoted.
         """
-        if self.type in ("json", "text"):
-            if self.type == "text" and self._text_lazy_html_root is not None:
-                return etree.tostring(
-                    self._text_lazy_html_root, encoding="unicode", with_tail=False
-                )
+        if self.type in ("text", "json"):
             return self.root
-        else:
-            try:
-                return typing.cast(
-                    str,
-                    etree.tostring(
-                        self.root,
-                        method=_ctgroup[self.type]["_tostring_method"],
-                        encoding="unicode",
-                        with_tail=False,
-                    ),
-                )
-            except (AttributeError, TypeError):
-                if self.root is True:
-                    return "1"
-                elif self.root is False:
-                    return "0"
-                else:
-                    return str(self.root)
+        try:
+            return typing.cast(
+                str,
+                etree.tostring(
+                    self.root,
+                    method=_ctgroup[self.type]["_tostring_method"],
+                    encoding="unicode",
+                    with_tail=False,
+                ),
+            )
+        except (AttributeError, TypeError):
+            if self.root is True:
+                return "1"
+            elif self.root is False:
+                return "0"
+            else:
+                return str(self.root)
 
     extract = get
 
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 902f82b..8cb2a1b 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1012,13 +1012,13 @@ def test_remove_selector_from_html_in_text(self) -> None:
             "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
         )
         expect_result = "<html><body><p>hello world</p></body></html>"
-        sel = self.sscls(text=html, type="text")
-        self.assertEqual(sel.type, "text")
+        sel = self.sscls(text=html, type="html")
+        self.assertEqual(sel.type, "html")
         li_sel_list = sel.css("style")
         li_sel_list.drop()
         self.assertEqual(sel.get(), expect_result)
         # The type of the parent selector should not change
-        self.assertEqual(sel.type, "text")
+        self.assertEqual(sel.type, "html")
 
     def test_remove_selector_from_html_in_json(self) -> None:
         json_str = """{
@@ -1028,13 +1028,13 @@ def test_remove_selector_from_html_in_json(self) -> None:
         """
         expect_result = "<html><body><p>hello world</p></body></html>"
         sel = self.sscls(text=json_str)
-        html_sel = sel.jmespath("body")[0]
-        self.assertEqual(html_sel.type, "text")
+        html_sel = sel.jmespath("body", type="html")[0]
+        self.assertEqual(html_sel.type, "html")
         li_sel_list = html_sel.css("style")
         li_sel_list.drop()
         self.assertEqual(html_sel.get(), expect_result)
         # The type of the parent selector should not change
-        self.assertEqual(html_sel.type, "text")
+        self.assertEqual(html_sel.type, "html")
 
     def test_remove_pseudo_element_selector_list(self) -> None:
         sel = self.sscls(

From 122c0f02577a5b2ad8714a4d742a58e28ef03a07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Fri, 14 Jun 2024 21:03:29 +0200
Subject: [PATCH 7/8] Remove unnecessary test case

---
 tests/test_selector.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/tests/test_selector.py b/tests/test_selector.py
index 8cb2a1b..db8d099 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1007,20 +1007,7 @@ def test_remove_selector(self) -> None:
         self.assertIsSelectorList(sel.css("li"))
         self.assertEqual(sel.css("li::text").getall(), ["2", "3"])
 
-    def test_remove_selector_from_html_in_text(self) -> None:
-        html = (
-            "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
-        )
-        expect_result = "<html><body><p>hello world</p></body></html>"
-        sel = self.sscls(text=html, type="html")
-        self.assertEqual(sel.type, "html")
-        li_sel_list = sel.css("style")
-        li_sel_list.drop()
-        self.assertEqual(sel.get(), expect_result)
-        # The type of the parent selector should not change
-        self.assertEqual(sel.type, "html")
-
-    def test_remove_selector_from_html_in_json(self) -> None:
+    def test_remove_selector_from_nested_html(self) -> None:
         json_str = """{
             "title": "hello world",
             "body": "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
@@ -1028,6 +1015,8 @@ def test_remove_selector_from_html_in_json(self) -> None:
         """
         expect_result = "<html><body><p>hello world</p></body></html>"
         sel = self.sscls(text=json_str)
+        # JMESPath string results are "text" selectors by default; force the type
+        # to HTML so that dropping nodes is reflected in the selector output.
         html_sel = sel.jmespath("body", type="html")[0]
         self.assertEqual(html_sel.type, "html")
         li_sel_list = html_sel.css("style")

From fd49654f8531d32cd5ca4c37982056c8db622eb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Fri, 14 Jun 2024 21:08:48 +0200
Subject: [PATCH 8/8] Apply pre-commit

---
 parsel/selector.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/parsel/selector.py b/parsel/selector.py
index 0088703..1092210 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -142,7 +142,9 @@ def __getitem__(
     def __getstate__(self) -> None:
         raise TypeError("can't pickle SelectorList objects")
 
-    def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) -> "SelectorList[_SelectorType]":
+    def jmespath(
+        self, query: str, *, type: Optional[str] = None, **kwargs: Any
+    ) -> "SelectorList[_SelectorType]":
         """
         Call the ``.jmespath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -154,7 +156,9 @@ def jmespath(self, query: str, *, type: Optional[str] = None, **kwargs: Any) ->
 
             selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
         """
-        return self.__class__(flatten([x.jmespath(query, type=type, **kwargs) for x in self]))
+        return self.__class__(
+            flatten([x.jmespath(query, type=type, **kwargs) for x in self])
+        )
 
     def xpath(
         self,
@@ -181,10 +185,19 @@ def xpath(
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
         return self.__class__(
-            flatten([x.xpath(xpath, namespaces=namespaces, type=type, **kwargs) for x in self])
+            flatten(
+                [
+                    x.xpath(xpath, namespaces=namespaces, type=type, **kwargs)
+                    for x in self
+                ]
+            )
         )
 
-    def css(self, query: str, type: Optional[str] = None,) -> "SelectorList[_SelectorType]":
+    def css(
+        self,
+        query: str,
+        type: Optional[str] = None,
+    ) -> "SelectorList[_SelectorType]":
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -644,7 +657,9 @@ def xpath(
         ]
         return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
 
-    def css(self: _SelectorType, query: str, type: Optional[str] = None) -> SelectorList[_SelectorType]:
+    def css(
+        self: _SelectorType, query: str, type: Optional[str] = None
+    ) -> SelectorList[_SelectorType]:
         """
         Apply the given CSS selector and return a :class:`SelectorList` instance.