Skip to content

Commit

Permalink
V==0.8
Browse files Browse the repository at this point in the history
  • Loading branch information
yogendratamang48 committed Sep 28, 2019
1 parent 6727e40 commit cba99e1
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 9 deletions.
22 changes: 22 additions & 0 deletions parse_utils/page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,25 @@ def extract_dict(self, config, item=None, is_list=None):
continue

return _item

def extract_dict_from_json(self, config, item=None, is_list=None):
    """
    Extract values from the JSON-like data held in ``self._selector``.

    ``config`` maps each output key to either a single key path
    (``['address', 'city']``) or a list of alternative key paths
    (``[['address', 'city'], ['city']]``). Alternatives are tried in
    order and the first one that resolves to a truthy value wins.

    Args:
        config: mapping of output key -> key path or list of key paths.
        item: optional mapping of pre-filled values; it is copied, never
            mutated.
        is_list: unused here; accepted so the signature mirrors
            ``extract_dict``.

    Returns:
        A new dict holding the copied ``item`` entries plus every key whose
        path resolved to a truthy value. Keys whose paths are missing, or
        resolve to a falsy value (``0``, ``''``, ``[]`` ...), are omitted.
    """
    # NOTE(review): presumably self._selector is the parsed JSON dict passed
    # to the constructor -- confirm against PageParser.__init__.
    _item = dict(item) if item else {}
    for _k, _pathlist in config.items():
        if not _pathlist:
            # Nothing to look up; previously this raised IndexError.
            continue
        # A flat path such as ['a', 'b'] is shorthand for one alternative.
        if not isinstance(_pathlist[0], list):
            _pathlist = [_pathlist]
        for _paths in _pathlist:
            tmp = self._selector
            for _path in _paths:
                try:
                    tmp = tmp.get(_path)
                except AttributeError:
                    # Intermediate value (str, list, number ...) cannot be
                    # descended into: treat the path as missing, not a crash.
                    tmp = None
                if tmp is None:
                    break
            if tmp:
                _item[_k] = tmp
                break
    return _item
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="parse-utils",
version="0.7",
version="0.9",
author="Yogendra Tamang",
author_email="48yogen@gmail.com",
description="Page Parser Utils For scraping, List index update",
Expand Down
43 changes: 35 additions & 8 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,39 @@
</body>
</html>
'''
config = {
'header': ['//p[@id="header"]/text()'],
'content': ['//p[@class="content"]'],
# Nested sample record used as the input fixture for the JSON extraction test.
json_data = dict(
    name='Yogendra',
    address=dict(country='Nepal', city='Pokhara'),
)
pparser = PageParser(html_data)
item = pparser.extract_dict(config)
_item = pparser.extract_dict(config, is_list=True)
print(item)
print(_item)


def test_html_parser():
    '''
    Run an XPath config against ``html_data`` and print the results.

    ``extract_dict`` is called twice on the same parser: once with its
    defaults and once with ``is_list=True``. Both results are printed.
    '''
    xpath_rules = {
        'header': ['//p[@id="header"]/text()'],
        'content': ['//p[@class="content"]'],
    }
    html_parser = PageParser(html_data)
    default_result = html_parser.extract_dict(xpath_rules)
    list_result = html_parser.extract_dict(xpath_rules, is_list=True)
    for outcome in (default_result, list_result):
        print(outcome)

def test_json_parser():
    '''
    Run a key-path config against ``json_data`` and print the result.

    The parser is built with ``selector=True`` and queried through
    ``extract_dict_from_json``.
    '''
    key_paths = {
        'header': ['name'],
        'city': ['address', 'city'],
    }
    json_parser = PageParser(json_data, selector=True)
    print(json_parser.extract_dict_from_json(key_paths))

if __name__ == '__main__':
    # Script entry point: run the HTML demo first, then the JSON demo.
    for _demo in (test_html_parser, test_json_parser):
        _demo()

0 comments on commit cba99e1

Please sign in to comment.