Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support forcing a selector type into a subselector #299

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
42 changes: 31 additions & 11 deletions parsel/selector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""XPath and JMESPath selectors based on the lxml and jmespath Python
packages."""

import builtins
import json
import typing
import warnings
Expand Down Expand Up @@ -141,7 +142,9 @@ def __getitem__(
def __getstate__(self) -> None:
raise TypeError("can't pickle SelectorList objects")

def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
def jmespath(
self, query: str, *, type: Optional[str] = None, **kwargs: Any
) -> "SelectorList[_SelectorType]":
"""
Call the ``.jmespath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
Expand All @@ -153,12 +156,16 @@ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":

selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
return self.__class__(
flatten([x.jmespath(query, type=type, **kwargs) for x in self])
)

def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
*,
type: Optional[str] = None,
**kwargs: Any,
) -> "SelectorList[_SelectorType]":
"""
Expand All @@ -178,17 +185,26 @@ def xpath(
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
return self.__class__(
flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
flatten(
[
x.xpath(xpath, namespaces=namespaces, type=type, **kwargs)
for x in self
]
)
)

def css(
    self,
    query: str,
    *,
    type: Optional[str] = None,
) -> "SelectorList[_SelectorType]":
    """
    Call the ``.css()`` method for each element in this list and return
    their results flattened as another :class:`SelectorList`.

    ``query`` is the same argument as the one in :meth:`Selector.css`

    ``type`` is forwarded unchanged to the ``.css()`` call of every element.
    It is keyword-only, like the ``type`` argument of :meth:`jmespath` and
    :meth:`xpath` on this class.
    """
    # ``type`` shadows the builtin inside this method; it is only passed
    # through, never called.
    return self.__class__(flatten([x.css(query, type=type) for x in self]))

def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
Expand Down Expand Up @@ -532,6 +548,7 @@ def _get_root(
def jmespath(
self: _SelectorType,
query: str,
type: Optional[str] = None,
**kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Expand Down Expand Up @@ -565,9 +582,9 @@ def jmespath(

def make_selector(x: Any) -> _SelectorType: # closure function
if isinstance(x, str):
return self.__class__(text=x, _expr=query, type="text")
return self.__class__(text=x, _expr=query, type=type or "text")
else:
return self.__class__(root=x, _expr=query)
return self.__class__(root=x, _expr=query, type=type)

result = [make_selector(x) for x in result]
return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
Expand All @@ -576,6 +593,7 @@ def xpath(
self: _SelectorType,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
type: Optional[str] = None,
**kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Expand Down Expand Up @@ -625,21 +643,23 @@ def xpath(
except etree.XPathError as exc:
raise ValueError(f"XPath error: {exc} in {query}")

if type(result) is not list:
if builtins.type(result) is not list:
result = [result]

result = [
self.__class__(
root=x,
_expr=query,
namespaces=self.namespaces,
type=_xml_or_html(self.type),
type=type or _xml_or_html(self.type),
)
for x in result
]
return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))

def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
def css(
self: _SelectorType, query: str, type: Optional[str] = None
) -> SelectorList[_SelectorType]:
"""
Apply the given CSS selector and return a :class:`SelectorList` instance.

Expand All @@ -652,7 +672,7 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
"""
if self.type not in ("html", "xml", "text"):
raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
return self.xpath(self._css2xpath(query))
return self.xpath(self._css2xpath(query), type=type)

def _css2xpath(self, query: str) -> str:
type = _xml_or_html(self.type)
Expand Down
18 changes: 18 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,24 @@ def test_remove_selector(self) -> None:
self.assertIsSelectorList(sel.css("li"))
self.assertEqual(sel.css("li::text").getall(), ["2", "3"])

def test_remove_selector_from_nested_html(self) -> None:
    """Drop a node from an HTML document stored inside a JSON string value.

    The ``body`` value is selected with ``type="html"``, its ``<style>``
    element is dropped, and the serialized result and the selector type are
    checked afterwards.
    """
    json_str = """{
"title": "hello world",
"body": "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
}
"""
    expected_html = "<html><body><p>hello world</p></body></html>"

    # Ask for an HTML selector explicitly so that ``css()`` and ``drop()``
    # can be used on the markup held in the JSON string.
    html_sel = self.sscls(text=json_str).jmespath("body", type="html")[0]
    self.assertEqual(html_sel.type, "html")

    # Remove every <style> element found under the nested document.
    html_sel.css("style").drop()
    self.assertEqual(html_sel.get(), expected_html)

    # Dropping children must leave the type of ``html_sel`` untouched.
    self.assertEqual(html_sel.type, "html")

def test_remove_pseudo_element_selector_list(self) -> None:
sel = self.sscls(
text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"
Expand Down