diff --git a/mf2py/backcompat.py b/mf2py/backcompat.py index a29004d..49a866e 100644 --- a/mf2py/backcompat.py +++ b/mf2py/backcompat.py @@ -143,7 +143,7 @@ def root(classes): return unordered_list([c for c in classes if c in _CLASSIC_MAP]) -def apply_rules(el, html_parser): +def apply_rules(el, html_parser, filtered_roots): """add modern classnames for older mf1 classnames returns a copy of el and does not modify the original @@ -164,7 +164,7 @@ def apply_prop_rules_to_children(parent, rules): rule(child) # recurse if it's not a nested mf1 or mf2 root - if not (mf2_classes.root(classes) or root(classes)): + if not (mf2_classes.root(classes, filtered_roots) or root(classes)): apply_prop_rules_to_children(child, rules) # add mf2 root equivalent diff --git a/mf2py/implied_properties.py b/mf2py/implied_properties.py index de422ab..4c538f3 100644 --- a/mf2py/implied_properties.py +++ b/mf2py/implied_properties.py @@ -2,7 +2,7 @@ from .dom_helpers import get_attr, get_children, get_img, get_textContent, try_urljoin -def name(el, base_url=""): +def name(el, base_url, filtered_roots): """Find an implied name property Args: @@ -34,7 +34,7 @@ def non_empty(val): poss_child = children[0] # ignore if mf2 root - if mf2_classes.root(poss_child.get("class", [])): + if mf2_classes.root(poss_child.get("class", []), filtered_roots): poss_child = None # if it is not img, area, abbr then find grandchild @@ -45,7 +45,7 @@ def non_empty(val): poss_child = grandchildren[0] # if it is not img, area, abbr or is mf2 root then no possible child if poss_child.name not in ("img", "area", "abbr") or mf2_classes.root( - poss_child.get("class", []) + poss_child.get("class", []), filtered_roots ): poss_child = None @@ -67,7 +67,7 @@ def non_empty(val): return get_textContent(el, replace_img=True, img_to_src=False, base_url=base_url) -def photo(el, base_url=""): +def photo(el, base_url, filtered_roots): """Find an implied photo property Args: @@ -86,7 +86,7 @@ def get_photo_child(children): poss_imgs = [c for c in children if c.name == "img"] if len(poss_imgs) == 1: poss_img = poss_imgs[0] - if not mf2_classes.root(poss_img.get("class", [])): + if not mf2_classes.root(poss_img.get("class", []), filtered_roots): return poss_img # if element has one object child use data if exists and object is @@ -94,7 +94,7 @@ def get_photo_child(children): poss_objs = [c for c in children if c.name == "object"] if len(poss_objs) == 1: poss_obj = poss_objs[0] - if not mf2_classes.root(poss_obj.get("class", [])): + if not mf2_classes.root(poss_obj.get("class", []), filtered_roots): return poss_obj def resolve_relative_url(prop_value): @@ -122,7 +122,7 @@ def resolve_relative_url(prop_value): if ( poss_child is None and len(children) == 1 - and not mf2_classes.root(children[0].get("class", [])) + and not mf2_classes.root(children[0].get("class", []), filtered_roots) ): grandchildren = list(get_children(children[0])) poss_child = get_photo_child(grandchildren) @@ -138,7 +138,7 @@ def resolve_relative_url(prop_value): return resolve_relative_url(prop_value) -def url(el, base_url=""): +def url(el, base_url, filtered_roots): """Find an implied url property Args: @@ -156,14 +156,14 @@ def get_url_child(children): poss_as = [c for c in children if c.name == "a"] if len(poss_as) == 1: poss_a = poss_as[0] - if not mf2_classes.root(poss_a.get("class", [])): + if not mf2_classes.root(poss_a.get("class", []), filtered_roots): return poss_a # if element has one area child use if not root class poss_areas = [c for c in children if c.name == "area"] if len(poss_areas) == 1: poss_area = poss_areas[0] - if not mf2_classes.root(poss_area.get("class", [])): + if not mf2_classes.root(poss_area.get("class", []), filtered_roots): return poss_area # if element is a or area use its href if exists @@ -181,7 +181,7 @@ def get_url_child(children): if ( poss_child is None and len(children) == 1 - and not mf2_classes.root(children[0].get("class", [])) + and not mf2_classes.root(children[0].get("class", []), filtered_roots) ): grandchildren = list(get_children(children[0])) poss_child = get_url_child(grandchildren) diff --git a/mf2py/mf2_classes.py b/mf2py/mf2_classes.py index 6565414..c48b9ad 100644 --- a/mf2py/mf2_classes.py +++ b/mf2py/mf2_classes.py @@ -5,6 +5,8 @@ _mf2_properties_re = re.compile("(p|e|u|dt)-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$") _mf2_e_properties_re = re.compile("e-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$") +CONFLICTING_ROOTS_TAILWIND = {"auto", "fit", "full", "max", "min", "px", "screen"} + def filter_classes(classes, regex=_mf2_classes_re): """detect classes that are valid names for mf2, sort in dictionary by prefix""" @@ -20,8 +22,10 @@ def filter_classes(classes, regex=_mf2_classes_re): return types -def root(classes): - return {c for c in classes if _mf2_roots_re.match(c)} +def root(classes, filtered_roots): + return { + c for c in classes if _mf2_roots_re.match(c) and c[2:] not in filtered_roots + } def is_property_class(class_): diff --git a/mf2py/parser.py b/mf2py/parser.py index 23accba..1ff8057 100644 --- a/mf2py/parser.py +++ b/mf2py/parser.py @@ -19,7 +19,14 @@ from .version import __version__ -def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=False): +def parse( + doc=None, + url=None, + html_parser=None, + expose_dom=False, + metaformats=False, + filter_roots=False, +): """ Parse a microformats2 document or url and return a json dictionary. @@ -44,6 +51,7 @@ def parse(doc=None, url=None, html_parser=None, expose_dom=False, metaformats=Fa html_parser, expose_dom=expose_dom, metaformats=metaformats, + filter_roots=filter_roots, ).to_dict() @@ -80,6 +88,7 @@ def __init__( html_parser=None, expose_dom=False, metaformats=False, + filter_roots=False, ): self.__url__ = None self.__doc__ = None @@ -94,9 +103,16 @@ def __init__( "version": __version__, }, } - self.__metaformats = metaformats - self.expose_dom = expose_dom self.lang = None + self.expose_dom = expose_dom + self.__metaformats = metaformats + try: + self.filtered_roots = set(filter_roots) + except TypeError: + if filter_roots: + self.filtered_roots = mf2_classes.CONFLICTING_ROOTS_TAILWIND + else: + self.filtered_roots = [] # use default parser if none specified self.__html_parser__ = html_parser or "html5lib" @@ -189,8 +205,12 @@ def handle_microformat( parsed_types_aggregation = set() if backcompat_mode: - el = backcompat.apply_rules(el, self.__html_parser__) - root_class_names = mf2_classes.root(el.get("class", [])) + el = backcompat.apply_rules( + el, self.__html_parser__, self.filtered_roots + ) + root_class_names = mf2_classes.root( + el.get("class", []), self.filtered_roots + ) root_lang = el.attrs.get("lang") @@ -220,13 +240,13 @@ def handle_microformat( "peh" ): properties["name"] = [ - implied_properties.name(el, base_url=self.__url__) + implied_properties.name(el, self.__url__, self.filtered_roots) ] if "photo" not in properties and parsed_types_aggregation.isdisjoint( "uh" ): - x = implied_properties.photo(el, base_url=self.__url__) + x = implied_properties.photo(el, self.__url__, self.filtered_roots) if x is not None: properties["photo"] = [x] @@ -234,7 +254,7 @@ def handle_microformat( if "url" not in properties and parsed_types_aggregation.isdisjoint( "uh" ): - x = implied_properties.url(el, base_url=self.__url__) + x = implied_properties.url(el, self.__url__, self.filtered_roots) if x is not None: properties["url"] = [x] @@ -492,7 +512,7 @@ def parse_el(el, ctx): classes = el.get("class", []) # find potential microformats in root classnames h-* - potential_microformats = mf2_classes.root(classes) + potential_microformats = mf2_classes.root(classes, self.filtered_roots) # if potential microformats found parse them if potential_microformats: diff --git a/test/examples/filter_roots.html b/test/examples/filter_roots.html new file mode 100644 index 0000000..ce72889 --- /dev/null +++ b/test/examples/filter_roots.html @@ -0,0 +1,9 @@ +

Tailwind root filter

+
fnord
+
fnord
+
fnord
+
fnord
+
fnord
+
fnord
+
fnord
+
fnord
diff --git a/test/examples/filter_roots_custom.html b/test/examples/filter_roots_custom.html new file mode 100644 index 0000000..4ff79de --- /dev/null +++ b/test/examples/filter_roots_custom.html @@ -0,0 +1,6 @@ +

Custom root filter

+
fnord
+
fnord
+
fnord
+
fnord
+
fnord
diff --git a/test/test_parser.py b/test/test_parser.py index 7a5cea5..a4ebdc3 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -1126,6 +1126,19 @@ def test_all_u_cases(): ) +def test_filtered_roots(): + result = parse_fixture("filter_roots.html") + assert len(result["items"]) == 8 + + result = parse_fixture("filter_roots.html", filter_roots=True) + assert len(result["items"]) == 1 + + result = parse_fixture( + "filter_roots_custom.html", filter_roots={"foo", "bar", "bat", "baz"} + ) + assert len(result["items"]) == 1 + + def test_metaformats_flag_false(): result = parse_fixture("metaformats_ogp.html") assert result["items"] == []