Skip to content

Commit

Permalink
Add extension to support metaformats (#213)
Browse files Browse the repository at this point in the history
  • Loading branch information
snarfed authored Dec 1, 2023
1 parent 043c2f6 commit 698f2bb
Show file tree
Hide file tree
Showing 7 changed files with 235 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
- add srcset support (#209)
- add language support (#210)
- add extension to expose the DOM for embedded properties (#208)
- add extension to support metaformats (#213)

## 1.1.3 - 2023-06-28
- reduce instances where photo is implied (#135)
Expand Down
89 changes: 89 additions & 0 deletions mf2py/metaformats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Metaformats parser.
https://microformats.org/wiki/metaformats
TODO:
* explicit mf2 classes on meta tags
https://microformats.org/wiki/metaformats#parsing_an_element_for_properties
"""
from .dom_helpers import try_urljoin
from .mf2_classes import filter_classes

# (meta attribute name, meta attribute value, mf2 property) triples, listed in
# descending priority order. parse() walks this list top to bottom and keeps
# only the first value found for each mf2 property, so earlier rows win.
METAFORMAT_TO_MF2 = [
    # OGP
    ("property", "article:author", "author"),
    ("property", "article:published_time", "published"),
    ("property", "article:modified_time", "updated"),
    ("property", "og:audio", "audio"),
    ("property", "og:description", "summary"),
    ("property", "og:image", "photo"),
    ("property", "og:title", "name"),
    ("property", "og:video", "video"),
    # Twitter
    ("name", "twitter:title", "name"),
    ("name", "twitter:description", "summary"),
    ("name", "twitter:image", "photo"),
    # HTML standard meta names
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name
    ("name", "description", "summary"),
]

# mf2 root type keyed by the first dot-separated segment of the og:type value.
OGP_TYPE_TO_MF2 = {
    "article": "h-entry",
    "movie": "h-cite",
    "music": "h-cite",
    "profile": "h-card",
}

# Meta names whose content is resolved against the document URL by parse().
URL_PROPERTIES = {
    "article:author",
    "og:audio",
    "og:image",
    "og:video",
    "twitter:image",
}


def parse(soup, url=None):
    """Extract and return a metaformats item from a BeautifulSoup parse tree.

    Args:
        soup (bs4.BeautifulSoup): parsed HTML
        url (str): URL of document; URL-valued properties are resolved
            against it

    Returns:
        dict: mf2 item, or None if the input is not eligible for metaformats
    """
    head = soup.head
    if not head:
        return None

    # Is there a microformats2 root class on the html element? The document
    # object itself carries no attributes, so the class list has to be read
    # from its <html> element. Fall back to ``soup`` when there is no nested
    # <html> (e.g. the caller handed us the <html> tag directly).
    html_el = soup.html
    if html_el is None:
        html_el = soup
    if filter_classes(html_el.get("class", []))["h"]:
        return None

    parsed = {"properties": {}}
    props = parsed["properties"]

    # Properties. METAFORMAT_TO_MF2 is in descending priority order and
    # setdefault() keeps the first value seen for each mf2 property.
    for attr, meta, mf2 in METAFORMAT_TO_MF2:
        tag = head.find("meta", attrs={attr: meta})
        content = tag.get("content") if tag else None
        if not content:
            continue
        if meta in URL_PROPERTIES:
            content = try_urljoin(url, content)
        props.setdefault(mf2, [content])

    # <title> is the lowest-priority source for the name property.
    if head.title and (text := head.title.text):
        props.setdefault("name", [text])

    if not props:
        # No OGP, Twitter, or standard HTML meta properties, and no title
        return None

    # type from OGP or default to h-entry
    parsed["type"] = ["h-entry"]
    ogp_type = head.find("meta", property="og:type")
    ogp_content = ogp_type.get("content") if ogp_type else None
    if ogp_content:
        # Only the first dot-separated segment selects the mf2 type.
        mf2_type = OGP_TYPE_TO_MF2.get(ogp_content.split(".")[0])
        if mf2_type:
            parsed["type"] = [mf2_type]

    return parsed
40 changes: 36 additions & 4 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,20 @@
from bs4 import BeautifulSoup, FeatureNotFound
from bs4.element import Tag

from . import backcompat, implied_properties, mf2_classes, parse_property, temp_fixes
from . import (
backcompat,
implied_properties,
metaformats,
mf2_classes,
parse_property,
temp_fixes,
)
from .dom_helpers import get_attr, get_children, get_descendents, try_urljoin
from .mf_helpers import unordered_list
from .version import __version__


def parse(doc=None, url=None, html_parser=None, expose_dom=False):
def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=False):
"""
Parse a microformats2 document or url and return a json dictionary.
Expand All @@ -26,10 +33,18 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False):
options from the BeautifulSoup documentation are:
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
expose_dom (boolean): optional, expose the DOM of embedded properties.
metaformats (boolean): whether to include metaformats extracted from OGP
and Twitter card data: https://microformats.org/wiki/metaformats
Return: a json dict representing the structured data in this document.
"""
return Parser(doc, url, html_parser, expose_dom).to_dict()
return Parser(
doc,
url,
html_parser,
expose_dom=expose_dom,
metaformats=metaformats,
).to_dict()


class Parser(object):
Expand All @@ -47,6 +62,8 @@ class Parser(object):
"html", "xml", "html5", "lxml", "html5lib", and "html.parser"
defaults to "html5lib"
expose_dom (boolean): optional, expose the DOM of embedded properties.
metaformats (boolean): whether to include metaformats extracted from OGP
and Twitter card data: https://microformats.org/wiki/metaformats
Attributes:
useragent (string): the User-Agent string for the Parser
Expand All @@ -56,7 +73,14 @@ class Parser(object):
ua_url = "https://github.com/microformats/mf2py"
useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)

def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
def __init__(
self,
doc=None,
url=None,
html_parser=None,
expose_dom=False,
metaformats=False,
):
self.__url__ = None
self.__doc__ = None
self._preserve_doc = False
Expand All @@ -70,6 +94,7 @@ def __init__(self, doc=None, url=None, html_parser=None, expose_dom=False):
"version": __version__,
},
}
self.__metaformats = metaformats
self.expose_dom = expose_dom
self.lang = None

Expand Down Expand Up @@ -487,9 +512,16 @@ def parse_el(el, ctx):
parse_el(child, ctx)

ctx = []

if self.__metaformats:
# extract out a metaformats item, if available
self.__metaformats_item = metaformats.parse(self.__doc__, url=self.__url__)

# start parsing at root element of the document
parse_el(self.__doc__, ctx)
self.__parsed__["items"] = ctx
if self.__metaformats and self.__metaformats_item:
self.__parsed__["items"].append(self.__metaformats_item)

# parse for rel values
for el in get_descendents(self.__doc__):
Expand Down
12 changes: 12 additions & 0 deletions test/examples/metaformats_html_meta.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Hello World</title>
<base href="http://tantek.com/" />
<meta name="description" content="Descrypshun bar" />
</head>
<body>
<p>Hello world!</p>
</body>
</html>
20 changes: 20 additions & 0 deletions test/examples/metaformats_ogp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Hello World</title>
<base href="http://tantek.com/" />
<meta property="og:type" content="article" />
<meta property="og:title" content="Titull foo" />
<meta property="og:description" content="Descrypshun bar" />
<meta property="og:image" content="http://example.com/baz.jpg" />
<meta property="og:audio" content="http://example.com/biff.mp3" />
<meta property="og:video" content="http://example.com/boff.mov" />
<meta property="article:author" content="/me" />
<meta property="article:published_time" content="2023-01-02T03:04Z" />
<meta property="article:modified_time" content="2023-01-02T05:06Z" />
</head>
<body>
<p>Hello world!</p>
</body>
</html>
14 changes: 14 additions & 0 deletions test/examples/metaformats_twitter.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Hello World</title>
<base href="http://tantek.com/" />
<meta name="twitter:title" content="Titull foo" />
<meta name="twitter:description" content="Descrypshun bar" />
<meta name="twitter:image" content="/baz.jpg" />
</head>
<body>
<p>Hello world!</p>
</body>
</html>
63 changes: 63 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,69 @@ def test_all_u_cases():
)


def test_metaformats_flag_false():
    """Without ``metaformats=True`` the OGP fixture yields no items."""
    parsed = parse_fixture("metaformats_ogp.html")
    items = parsed["items"]
    assert items == []


def test_metaformats_title_only():
    """With metaformats on, base.html yields an h-entry with only a name."""
    # NOTE(review): "Hello World" presumably comes from the fixture's <title>;
    # base.html is not visible from here.
    expected = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Hello World"],
        },
    }
    parsed = parse_fixture("base.html", metaformats=True)
    assert parsed["items"] == [expected]


def test_metaformats_ogp():
    """OGP meta tags map to mf2 properties; og:title wins over <title>."""
    expected = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Titull foo"],
            "summary": ["Descrypshun bar"],
            "photo": ["http://example.com/baz.jpg"],
            "audio": ["http://example.com/biff.mp3"],
            "video": ["http://example.com/boff.mov"],
            # relative article:author "/me" is resolved to an absolute URL
            "author": ["http://tantek.com/me"],
            "published": ["2023-01-02T03:04Z"],
            "updated": ["2023-01-02T05:06Z"],
        },
    }
    parsed = parse_fixture("metaformats_ogp.html", metaformats=True)
    assert parsed["items"] == [expected]


def test_metaformats_twitter():
    """Twitter card meta tags map to name, summary and photo."""
    expected = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Titull foo"],
            "summary": ["Descrypshun bar"],
            # relative twitter:image "/baz.jpg" is resolved to an absolute URL
            "photo": ["http://tantek.com/baz.jpg"],
        },
    }
    parsed = parse_fixture("metaformats_twitter.html", metaformats=True)
    assert parsed["items"] == [expected]


def test_metaformats_html_meta():
    """Standard ``<meta name="description">`` becomes summary; title is name."""
    expected = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Hello World"],
            "summary": ["Descrypshun bar"],
        },
    }
    parsed = parse_fixture("metaformats_html_meta.html", metaformats=True)
    assert parsed["items"] == [expected]


def test_language():
result = parse_fixture("language.html")
assert result["items"][0]["lang"] == "it"
Expand Down

0 comments on commit 698f2bb

Please sign in to comment.