Skip to content

Commit

Permalink
Add extension to expose DOM for embedded properties (#208)
Browse files Browse the repository at this point in the history
  • Loading branch information
angelogladding authored Nov 30, 2023
1 parent 70bbb6a commit c073fe1
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
- fix whitespace in plaintext conversion (#207)
- add srcset support (#209)
- add language support (#210)
- add extension to expose the DOM for embedded properties (#208)

## 1.1.3 - 2023-06-28
- reduce instances where photo is implied (#135)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ obj = mf2py.parse(url="http://tommorris.org/")

### Extensions

Use `expose_dom=True` to expose the DOM of embedded properties.

---

`parse` is a convenience method that actually delegates to
Expand Down
7 changes: 5 additions & 2 deletions mf2py/parse_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,13 @@ def datetime(el, default_date=None):
)


def embedded(el, root_lang, document_lang, base_url=""):
def embedded(el, base_url, root_lang, document_lang, expose_dom):
"""Process e-* properties"""
for tag in el.find_all():
for attr in ("href", "src", "cite", "data", "poster"):
if attr in tag.attrs:
tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr])
prop_value = {
"html": el.decode_contents().strip(), # secret bs4 method to get innerHTML
"value": get_textContent(el, replace_img=True, base_url=base_url),
}
if lang := el.attrs.get("lang"):
Expand All @@ -110,4 +109,8 @@ def embedded(el, root_lang, document_lang, base_url=""):
prop_value["lang"] = root_lang
elif document_lang:
prop_value["lang"] = document_lang
if expose_dom:
prop_value["dom"] = el
else:
prop_value["html"] = el.decode_contents().strip()
return prop_value
11 changes: 7 additions & 4 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .version import __version__


def parse(doc=None, url=None, html_parser=None):
def parse(doc=None, url=None, html_parser=None, expose_dom=False):
"""
Parse a microformats2 document or url and return a json dictionary.
Expand All @@ -25,10 +25,11 @@ def parse(doc=None, url=None, html_parser=None):
html_parser (string): optional, select a specific HTML parser. Valid
options from the BeautifulSoup documentation are:
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
expose_dom (boolean): optional, expose the DOM of embedded properties.
Return: a json dict representing the structured data in this document.
"""
return Parser(doc, url, html_parser).to_dict()
return Parser(doc, url, html_parser, expose_dom).to_dict()


class Parser(object):
Expand All @@ -45,6 +46,7 @@ class Parser(object):
options from the BeautifulSoup documentation are:
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
defaults to "html5lib"
expose_dom (boolean): optional, expose the DOM of embedded properties.
Attributes:
useragent (string): the User-Agent string for the Parser
Expand All @@ -54,7 +56,7 @@ class Parser(object):
ua_url = "https://github.com/microformats/mf2py"
useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)

def __init__(self, doc=None, url=None, html_parser=None):
def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
self.__url__ = None
self.__doc__ = None
self._preserve_doc = False
Expand All @@ -68,6 +70,7 @@ def __init__(self, doc=None, url=None, html_parser=None):
"version": __version__,
},
}
self.expose_dom = expose_dom
self.lang = None

# use default parser if none specified
Expand Down Expand Up @@ -372,7 +375,7 @@ def parse_props(el, root_lang):
embedded_el = copy.copy(embedded_el)
temp_fixes.rm_templates(embedded_el)
e_value = parse_property.embedded(
embedded_el, root_lang, self.lang, base_url=self.__url__
embedded_el, self.__url__, root_lang, self.lang, self.expose_dom
)

if root_class_names:
Expand Down
8 changes: 8 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from unittest import TestCase

import bs4
import mock
from bs4 import BeautifulSoup

Expand Down Expand Up @@ -191,6 +192,13 @@ def test_embedded_parsing():
)


def test_embedded_exposed_dom():
    """Check that ``expose_dom=True`` yields a ``dom`` Tag and no ``html`` key."""
    parsed = parse_fixture("embedded.html", expose_dom=True)
    first_item = parsed["items"][0]
    e_content = first_item["properties"]["content"][0]
    # The exposed DOM replaces the serialized markup rather than adding to it.
    assert "html" not in e_content
    assert isinstance(e_content["dom"], bs4.element.Tag)


def test_hoisting_nested_hcard():
result = parse_fixture("nested_hcards.html")
expected = [
Expand Down

0 comments on commit c073fe1

Please sign in to comment.