Skip to content

Commit

Permalink
Add filter list for known conflicting root class names (#211)
Browse files Browse the repository at this point in the history
  • Loading branch information
angelogladding authored Dec 8, 2023
1 parent 1a3142d commit 34841b1
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 24 deletions.
4 changes: 2 additions & 2 deletions mf2py/backcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def root(classes):
return unordered_list([c for c in classes if c in _CLASSIC_MAP])


def apply_rules(el, html_parser):
def apply_rules(el, html_parser, filtered_roots):
"""add modern classnames for older mf1 classnames
returns a copy of el and does not modify the original
Expand All @@ -164,7 +164,7 @@ def apply_prop_rules_to_children(parent, rules):
rule(child)

# recurse if it's not a nested mf1 or mf2 root
if not (mf2_classes.root(classes) or root(classes)):
if not (mf2_classes.root(classes, filtered_roots) or root(classes)):
apply_prop_rules_to_children(child, rules)

# add mf2 root equivalent
Expand Down
22 changes: 11 additions & 11 deletions mf2py/implied_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .dom_helpers import get_attr, get_children, get_img, get_textContent, try_urljoin


def name(el, base_url=""):
def name(el, base_url, filtered_roots):
"""Find an implied name property
Args:
Expand Down Expand Up @@ -34,7 +34,7 @@ def non_empty(val):
poss_child = children[0]

# ignore if mf2 root
if mf2_classes.root(poss_child.get("class", [])):
if mf2_classes.root(poss_child.get("class", []), filtered_roots):
poss_child = None

# if it is not img, area, abbr then find grandchild
Expand All @@ -45,7 +45,7 @@ def non_empty(val):
poss_child = grandchildren[0]
# if it is not img, area, abbr or is mf2 root then no possible child
if poss_child.name not in ("img", "area", "abbr") or mf2_classes.root(
poss_child.get("class", [])
poss_child.get("class", []), filtered_roots
):
poss_child = None

Expand All @@ -67,7 +67,7 @@ def non_empty(val):
return get_textContent(el, replace_img=True, img_to_src=False, base_url=base_url)


def photo(el, base_url=""):
def photo(el, base_url, filtered_roots):
"""Find an implied photo property
Args:
Expand All @@ -86,15 +86,15 @@ def get_photo_child(children):
poss_imgs = [c for c in children if c.name == "img"]
if len(poss_imgs) == 1:
poss_img = poss_imgs[0]
if not mf2_classes.root(poss_img.get("class", [])):
if not mf2_classes.root(poss_img.get("class", []), filtered_roots):
return poss_img

# if element has one object child use data if exists and object is
# not root class
poss_objs = [c for c in children if c.name == "object"]
if len(poss_objs) == 1:
poss_obj = poss_objs[0]
if not mf2_classes.root(poss_obj.get("class", [])):
if not mf2_classes.root(poss_obj.get("class", []), filtered_roots):
return poss_obj

def resolve_relative_url(prop_value):
Expand Down Expand Up @@ -122,7 +122,7 @@ def resolve_relative_url(prop_value):
if (
poss_child is None
and len(children) == 1
and not mf2_classes.root(children[0].get("class", []))
and not mf2_classes.root(children[0].get("class", []), filtered_roots)
):
grandchildren = list(get_children(children[0]))
poss_child = get_photo_child(grandchildren)
Expand All @@ -138,7 +138,7 @@ def resolve_relative_url(prop_value):
return resolve_relative_url(prop_value)


def url(el, base_url=""):
def url(el, base_url, filtered_roots):
"""Find an implied url property
Args:
Expand All @@ -156,14 +156,14 @@ def get_url_child(children):
poss_as = [c for c in children if c.name == "a"]
if len(poss_as) == 1:
poss_a = poss_as[0]
if not mf2_classes.root(poss_a.get("class", [])):
if not mf2_classes.root(poss_a.get("class", []), filtered_roots):
return poss_a

# if element has one area child use if not root class
poss_areas = [c for c in children if c.name == "area"]
if len(poss_areas) == 1:
poss_area = poss_areas[0]
if not mf2_classes.root(poss_area.get("class", [])):
if not mf2_classes.root(poss_area.get("class", []), filtered_roots):
return poss_area

# if element is a <a> or area use its href if exists
Expand All @@ -181,7 +181,7 @@ def get_url_child(children):
if (
poss_child is None
and len(children) == 1
and not mf2_classes.root(children[0].get("class", []))
and not mf2_classes.root(children[0].get("class", []), filtered_roots)
):
grandchildren = list(get_children(children[0]))
poss_child = get_url_child(grandchildren)
Expand Down
8 changes: 6 additions & 2 deletions mf2py/mf2_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
_mf2_properties_re = re.compile("(p|e|u|dt)-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")
_mf2_e_properties_re = re.compile("e-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")

# Root names, written without the leading "h-" prefix, that are known to
# collide with Tailwind CSS utility class names (e.g. "h-full", "h-screen").
# Kept immutable because the parser hands this very object out as its default
# filter list; a mutable set could be altered through one parser instance and
# silently change the defaults for every later parse.
CONFLICTING_ROOTS_TAILWIND = frozenset(
    {"auto", "fit", "full", "max", "min", "px", "screen"}
)


def filter_classes(classes, regex=_mf2_classes_re):
"""detect classes that are valid names for mf2, sort in dictionary by prefix"""
Expand All @@ -20,8 +22,10 @@ def filter_classes(classes, regex=_mf2_classes_re):
return types


def root(classes):
return {c for c in classes if _mf2_roots_re.match(c)}
def root(classes, filtered_roots=()):
    """Return the mf2 root class names found in ``classes``.

    Args:
      classes: iterable of class-name strings taken from an element.
      filtered_roots: container of root names written *without* their
        two-character ``h-`` prefix (e.g. ``"full"`` for ``h-full``) that
        must not be reported as roots. Defaults to an empty tuple, i.e. no
        filtering, so existing ``root(classes)`` callers keep working.

    Returns:
      set of the class names that match the module's root pattern and whose
      un-prefixed name is not in ``filtered_roots``.
    """
    return {
        # c[2:] strips the "h-" prefix before consulting the filter list
        c
        for c in classes
        if _mf2_roots_re.match(c) and c[2:] not in filtered_roots
    }


def is_property_class(class_):
Expand Down
38 changes: 29 additions & 9 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@
from .version import __version__


def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=False):
def parse(
doc=None,
url=None,
html_parser=None,
expose_dom=False,
metaformats=False,
filter_roots=False,
):
"""
Parse a microformats2 document or url and return a json dictionary.
Expand All @@ -44,6 +51,7 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=Fa
html_parser,
expose_dom=expose_dom,
metaformats=metaformats,
filter_roots=filter_roots,
).to_dict()


Expand Down Expand Up @@ -80,6 +88,7 @@ def __init__(
html_parser=None,
expose_dom=False,
metaformats=False,
filter_roots=False,
):
self.__url__ = None
self.__doc__ = None
Expand All @@ -94,9 +103,16 @@ def __init__(
"version": __version__,
},
}
self.__metaformats = metaformats
self.expose_dom = expose_dom
self.lang = None
self.expose_dom = expose_dom
self.__metaformats = metaformats
try:
self.filtered_roots = set(filter_roots)
except TypeError:
if filter_roots:
self.filtered_roots = mf2_classes.CONFLICTING_ROOTS_TAILWIND
else:
self.filtered_roots = []

# use default parser if none specified
self.__html_parser__ = html_parser or "html5lib"
Expand Down Expand Up @@ -189,8 +205,12 @@ def handle_microformat(
parsed_types_aggregation = set()

if backcompat_mode:
el = backcompat.apply_rules(el, self.__html_parser__)
root_class_names = mf2_classes.root(el.get("class", []))
el = backcompat.apply_rules(
el, self.__html_parser__, self.filtered_roots
)
root_class_names = mf2_classes.root(
el.get("class", []), self.filtered_roots
)

root_lang = el.attrs.get("lang")

Expand Down Expand Up @@ -220,21 +240,21 @@ def handle_microformat(
"peh"
):
properties["name"] = [
implied_properties.name(el, base_url=self.__url__)
implied_properties.name(el, self.__url__, self.filtered_roots)
]

if "photo" not in properties and parsed_types_aggregation.isdisjoint(
"uh"
):
x = implied_properties.photo(el, base_url=self.__url__)
x = implied_properties.photo(el, self.__url__, self.filtered_roots)
if x is not None:
properties["photo"] = [x]

# stop implied url if any u-* or h-* is already found
if "url" not in properties and parsed_types_aggregation.isdisjoint(
"uh"
):
x = implied_properties.url(el, base_url=self.__url__)
x = implied_properties.url(el, self.__url__, self.filtered_roots)
if x is not None:
properties["url"] = [x]

Expand Down Expand Up @@ -492,7 +512,7 @@ def parse_el(el, ctx):
classes = el.get("class", [])

# find potential microformats in root classnames h-*
potential_microformats = mf2_classes.root(classes)
potential_microformats = mf2_classes.root(classes, self.filtered_roots)

# if potential microformats found parse them
if potential_microformats:
Expand Down
9 changes: 9 additions & 0 deletions test/examples/filter_roots.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<h2>Tailwind root filter</h2>
<div class=h-card>fnord</div>
<div class=h-auto>fnord</div>
<div class=h-fit>fnord</div>
<div class=h-full>fnord</div>
<div class=h-max>fnord</div>
<div class=h-min>fnord</div>
<div class=h-px>fnord</div>
<div class=h-screen>fnord</div>
6 changes: 6 additions & 0 deletions test/examples/filter_roots_custom.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<h2>Custom root filter</h2>
<div class=h-card>fnord</div>
<div class=h-foo>fnord</div>
<div class=h-bar>fnord</div>
<div class=h-bat>fnord</div>
<div class=h-baz>fnord</div>
13 changes: 13 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,19 @@ def test_all_u_cases():
)


def test_filtered_roots():
    """Root filtering drops conflicting ``h-*`` classes but keeps real roots."""
    # with filtering off, every h-* class in the fixture is parsed as a root
    result = parse_fixture("filter_roots.html")
    assert len(result["items"]) == 8

    # filter_roots=True selects the built-in Tailwind conflict list
    result = parse_fixture("filter_roots.html", filter_roots=True)
    assert len(result["items"]) == 1
    # the single survivor must be the genuine microformat, not a utility class
    assert result["items"][0]["type"] == ["h-card"]

    # an explicit collection of names is used as the filter list instead
    result = parse_fixture(
        "filter_roots_custom.html", filter_roots={"foo", "bar", "bat", "baz"}
    )
    assert len(result["items"]) == 1
    assert result["items"][0]["type"] == ["h-card"]


def test_metaformats_flag_false():
result = parse_fixture("metaformats_ogp.html")
assert result["items"] == []
Expand Down

0 comments on commit 34841b1

Please sign in to comment.