-
Notifications
You must be signed in to change notification settings - Fork 15
Home
Paul Tremberth edited this page Jun 25, 2013
·
16 revisions
Welcome to the parslepy wiki!
- Use parslepy with Scrapy
- Scrapy tutorial example using parslepy (DMOZ Spider)
- Implementing bucketed arrays (work in progress)
- Parsing XML with parslepy
- Instantiate a parslepy.Parselet with a Python dict containing extraction rules
- parslepy.Parselet works with lxml-processed documents:
  - either use .extract(document) if you've already parsed your document with lxml,
  - or use .parse(fp[, parser]) to let your Parselet instance do the parsing, using lxml's default HTML parser, or optionally pass another lxml parser.
    fp must be a file-like object, not a string. (To use a string, wrap it in a StringIO.)
Sample document for our examples and parsing it with lxml
>>> import lxml.etree
>>> import parslepy
>>> html = """
... <!DOCTYPE html>
... <html>
... <head>
... <title>Sample document to test parslepy</title>
... <meta http-equiv="content-type" content="text/html;charset=utf-8" />
... </head>
... <body>
... <h1 id="main">What’s new</h1>
... <ul>
... <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
... <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
... <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
... </ul>
... </body>
... </html>
... """
>>> html_parser = lxml.etree.HTMLParser()
>>> doc = lxml.etree.fromstring(html, parser=html_parser)
>>> doc
<Element html at 0x7f5fb1fce9b0>
>>> rules = {"title": "title"}
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'title': u'Sample document to test parslepy'}
>>> rules = {"heading": "h1"}
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'heading': u'What\u2019s new'}
>>> rules = {
... "headingcss": "#main",
... "headingxpath": "//h1[@id='main']"
... }
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'}
Nest your object rules inside a []
>>> rules = {
... "heading": "h1#main",
... "news(li.newsitem)": [{
... "title": ".",
... "url": "a/@href"
... }],
... }
>>> p = parslepy.Parselet(rules)
>>> import pprint
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
{'title': u'A second report on something',
'url': '/article-002.html'},
{'title': u'Python is great!', 'url': '/article-003.html'}]}
In default (non-strict) mode, keys whose rules match nothing are omitted from the output (leaving {} if no rule matches at all)
>>> rules = {
... "heading1": "h1#main",
... "heading2": "h2#main",
... }
>>> p = parslepy.Parselet(rules)
>>> pprint.pprint(p.extract(doc))
{'heading1': u'What\u2019s new'} # only 1 key in output
>>> rules = {
... "heading2": "h2#main"
... }
>>> p = parslepy.Parselet(rules)
>>> pprint.pprint(p.extract(doc))
{} # nothing in output, no selector rule matched anything
In strict mode, non-matching rules will raise a NonMatchingNonOptionalKey exception
>>> rules = {
... "heading1": "h1#main",
... "heading2": "h2#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 501, in extract
return self._extract(self.parselet_tree, document)
File "parslepy/base.py", line 582, in _extract
document.getroottree().getpath(document),v
parslepy.base.NonMatchingNonOptionalKey: key "heading2" is required but yield nothing
Current path: /html/(<Selector: inner=<CSSSelector 20a2758 for 'h2#main'>>)
Add ?
to the keys that may not match
>>> rules = {
... "heading1": "h1#main",
... "heading2?": "h2#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
{'heading1': u'What\u2019s new'}
In our sample document, one (and only one) LI
contains a SPAN
element with class fresh
In strict mode, adding a "fresh" key to our item extraction rules raises an exception
>>> rules = {
... "heading": "h1#main",
... "news(li.newsitem)": [{
... "title": ".",
... "url": "a/@href",
... "fresh": ".fresh"
... }],
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 501, in extract
return self._extract(self.parselet_tree, document)
File "parslepy/base.py", line 532, in _extract
parse_result = self._extract(v, elem, level=level+1)
File "parslepy/base.py", line 582, in _extract
document.getroottree().getpath(document),v
parslepy.base.NonMatchingNonOptionalKey: key "fresh" is required but yield nothing
Current path: /html/body/ul/li[1]/(<Selector: inner=<CSSSelector 2096e60 for '.fresh'>>)
In non-strict/default mode, this will simply omit the "fresh" key for the items that do not have this SPAN
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'news': [{'url': '/article-001.html', 'title': u'This is the first article'}, {'url': '/article-002.html', 'title': u'A second report on something'}, {'url': '/article-003.html', 'fresh': u'New!', 'title': u'Python is great! New!'}], 'heading': u'What\u2019s new'}
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
{'title': u'A second report on something',
'url': '/article-002.html'},
{'fresh': u'New!',
'title': u'Python is great! New!',
'url': '/article-003.html'}]}
>>>
Or in strict mode, you would need to set the "fresh" rule as optional to get the same output
>>> rules = {
... "heading": "h1#main",
... "news(li.newsitem)": [{
... "title": ".",
... "url": "a/@href",
... "fresh?": ".fresh"
... }],
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
{'title': u'A second report on something',
'url': '/article-002.html'},
{'fresh': u'New!',
'title': u'Python is great! New!',
'url': '/article-003.html'}]}
>>>
>>> rules = {
... "heading!": "h1#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 325, in __init__
self.compile()
File "parslepy/base.py", line 393, in compile
self.parselet_tree = self._compile(self.parselet)
File "parslepy/base.py", line 432, in _compile
raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading! is not valid
>>> p = parslepy.Parselet({"heading@": "#main"})
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 325, in __init__
self.compile()
File "parslepy/base.py", line 393, in compile
self.parselet_tree = self._compile(self.parselet)
File "parslepy/base.py", line 432, in _compile
raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading@ is not valid
>>> p = parslepy.Parselet({"heading{": "#main"})
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 325, in __init__
self.compile()
File "parslepy/base.py", line 393, in compile
self.parselet_tree = self._compile(self.parselet)
File "parslepy/base.py", line 432, in _compile
raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading{ is not valid
When not your keys but your selectors are invalid, you get an XPath syntax error exception
>>> p = parslepy.Parselet({"heading": "#main#"})
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "parslepy/base.py", line 325, in __init__
self.compile()
File "parslepy/base.py", line 393, in compile
self.parselet_tree = self._compile(self.parselet)
File "parslepy/base.py", line 471, in _compile
child_tree = self._compile(v, level=level+1)
File "parslepy/base.py", line 489, in _compile
return self.selector_handler.make(parselet_node)
File "parslepy/base.py", line 201, in make
extensions = cls.XPATH_EXTENSIONS)
File "xpath.pxi", line 438, in lxml.etree.XPath.__init__ (src/lxml/lxml.etree.c:134866)
File "xpath.pxi", line 215, in lxml.etree._XPathEvaluatorBase._raise_parse_error (src/lxml/lxml.etree.c:132490)
lxml.etree.XPathSyntaxError: Invalid expression