From ca840defda6d4b6e36528dc121e94707d88410cd Mon Sep 17 00:00:00 2001
From: Anthony Ciccarello <aciccarello@sitepen.com>
Date: Mon, 10 Jul 2023 01:15:13 -0700
Subject: [PATCH] implement metaformats parsing

Closes #224
---
 README.md                                     |   4 +
 demo/demo.js                                  |   6 +-
 demo/index.tpl.html                           |  10 +
 src/helpers/metaformats.ts                    | 246 ++++++++++++++++++
 src/helpers/nodeMatchers.ts                   |   5 +
 src/microformats/property.ts                  |   3 +
 src/parser.ts                                 |  13 +-
 src/types.ts                                  |   5 +-
 src/validator.ts                              |  12 +-
 test/scenarios.spec.ts                        |  17 +-
 .../metaformats-missing-head.html             |   8 +
 .../metaformats-missing-head.json             |   5 +
 .../experimental/metaformats-og-article.html  |  25 +-
 .../experimental/metaformats-og-article.json  |  29 ++-
 .../metaformats-og-audio-soundcloud.html      |  67 +++++
 .../metaformats-og-audio-soundcloud.json      |  22 ++
 .../metaformats-og-profile-linkedin.html      | 207 +++++++++++++++
 .../metaformats-og-profile-linkedin.json      |  31 +++
 .../metaformats-og-video-vimeo.html           |  68 +++++
 .../metaformats-og-video-vimeo.json           |  34 +++
 .../experimental/metaformats-prefer-mf.html   |  24 ++
 .../experimental/metaformats-prefer-mf.json   |  15 ++
 .../experimental/metaformats-standard.html    |  19 ++
 .../experimental/metaformats-standard.json    |  25 ++
 .../metaformats-twitter-article.html          |  19 ++
 .../metaformats-twitter-article.json          |  23 ++
 26 files changed, 914 insertions(+), 28 deletions(-)
 create mode 100644 src/helpers/metaformats.ts
 create mode 100644 test/suites/experimental/metaformats-missing-head.html
 create mode 100644 test/suites/experimental/metaformats-missing-head.json
 create mode 100644 test/suites/experimental/metaformats-og-audio-soundcloud.html
 create mode 100644 test/suites/experimental/metaformats-og-audio-soundcloud.json
 create mode 100644 test/suites/experimental/metaformats-og-profile-linkedin.html
 create mode 100644 test/suites/experimental/metaformats-og-profile-linkedin.json
 create mode 100644 test/suites/experimental/metaformats-og-video-vimeo.html
 create mode 100644 test/suites/experimental/metaformats-og-video-vimeo.json
 create mode 100644 test/suites/experimental/metaformats-prefer-mf.html
 create mode 100644 test/suites/experimental/metaformats-prefer-mf.json
 create mode 100644 test/suites/experimental/metaformats-standard.html
 create mode 100644 test/suites/experimental/metaformats-standard.json
 create mode 100644 test/suites/experimental/metaformats-twitter-article.html
 create mode 100644 test/suites/experimental/metaformats-twitter-article.json
diff --git a/README.md b/README.md
index f1b2e03b..5f702264 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,10 @@ These are sourced from the element themselves, a parent microformat, the HTML do
 
 When parsing microformats for text content, all the consecutive whitespace is collapsed into a single space. `<br/>` and `<p>` tags are treated as line breaks.
 
+#### `metaformats`
+
+Enables fallback to [metaformats](https://microformats.org/wiki/metaformats) parsing, which looks at `<meta>` tags to infer content when no microformats are found.
+
 ## Contributing
 
 See our [contributing guidelines](./CONTRIBUTING.md) for more information.
diff --git a/demo/demo.js b/demo/demo.js
index bd43e018..da407322 100644
--- a/demo/demo.js
+++ b/demo/demo.js
@@ -32,6 +32,10 @@ window.parseHtml = () => {
   const baseUrl = document.getElementById("base-url").value;
   const lang = document.getElementById("lang").checked;
   const textContent = document.getElementById("textContent").checked;
+  const metaformats = document.getElementById("metaformats").checked;
 
-  return parse(html, { baseUrl, experimental: { lang, textContent } });
+  return parse(html, {
+    baseUrl,
+    experimental: { lang, textContent, metaformats },
+  });
 };
diff --git a/demo/index.tpl.html b/demo/index.tpl.html
index e90b34d4..ce59a30f 100644
--- a/demo/index.tpl.html
+++ b/demo/index.tpl.html
@@ -72,6 +72,16 @@ <h3>Experimental options</h3>
             />
             <span>Better text content</span>
           </label>
+          <label>
+            <input
+              type="checkbox"
+              name="metaformats"
+              id="metaformats"
+              value="true"
+              checked
+            />
+            <span>Metaformats parsing</span>
+          </label>
         </p>
 
         <div class="submit">
diff --git a/src/helpers/metaformats.ts b/src/helpers/metaformats.ts
new file mode 100644
index 00000000..a8faf24e
--- /dev/null
+++ b/src/helpers/metaformats.ts
@@ -0,0 +1,246 @@
+import { Document, Element } from "parse5";
+
+import { MicroformatRoot, ParsingOptions } from "../types";
+import {
+  getAttributeIfTag,
+  getAttributeValue,
+  hasRelIntersect,
+} from "./attributes";
+import { isEnabled } from "./experimental";
+import { isElement, isTag } from "./nodeMatchers";
+
+/** Special key for title tag in meta collection */
+const TITLE_TAG_KEY = "<title>";
+const CANONICAL_URL_KEY = "<canonical>"; // Special key for the canonical link href
+const MEDIA_TYPES = ["image", "video", "audio"]; // og/twitter types given media handling
+
+interface ComplexMediaMeta {
+  value: string; // Original content of the media tag
+  alt: string; // Content of the matching ":alt" tag
+}
+type MetaTagContent = string | ComplexMediaMeta;
+
+/**
+ * Creates a normalized store for meta tags, exposing `set` and `get` helpers
+ */
+const initializeMetaContentCollection = (): MetaContentCollection => {
+  /**
+   * Collection of all relevant meta tag content (tag order isn't guaranteed, so collect first).
+   * Prototype-less so page-supplied names like "constructor" never resolve to Object.prototype
+   */
+  const metaContent = Object.create(null) as Record<string, MetaTagContent[]>;
+
+  /**
+   * Gets the values of the first property found
+   * @param properties Array of properties to look for, preferred item first
+   */
+  const get = (properties: string[]) => {
+    for (const key of properties) {
+      if (metaContent[key]) {
+        return metaContent[key];
+      }
+    }
+    return;
+  };
+
+  /**
+   * Stores meta tag values.
+   *
+   * Includes following normalization rules:
+   * - Duplicates are removed from repeated (array) tags
+   * - src, url, and secure_url media tags are treated same as base (e.g. og:image:url -> og:image)
+   * - Alt text is added as property on last image url
+   */
+  const set = (key: string, value: string) => {
+    // Split tag name to normalize values like "og:video:url"
+    const [domain, type, subtype] = key.split(":");
+
+    // Media tags specific parsing
+    if (
+      (domain === "og" || domain === "twitter") &&
+      MEDIA_TYPES.includes(type)
+    ) {
+      if (subtype === "alt") {
+        const existingMedia = metaContent[`${domain}:${type}`];
+
+        if (existingMedia?.length) {
+          const last = existingMedia.pop();
+
+          if (typeof last === "string") {
+            existingMedia.push({ value: last, alt: value });
+          } else if (last) {
+            // Found duplicate alt text tag so re-inserting existing
+            // last should always be object. if condition added for types
+            existingMedia.push(last);
+          }
+        }
+
+        return; // Stop as alt text is already added
+      } else if (["src", "url", "secure_url"].includes(subtype)) {
+        // Mutate key to normalize different url values
+        // Duplicates will be cleaned up on insertion
+        key = `${domain}:${type}`;
+      }
+    }
+    const existing = metaContent[key];
+
+    if (existing) {
+      const isDuplicate = existing
+        .map((existingValue) =>
+          typeof existingValue === "string"
+            ? existingValue
+            : existingValue.value
+        )
+        .some((existingValue) => value === existingValue);
+
+      if (!isDuplicate) {
+        existing.push(value);
+      } // Else ignore duplicates
+    } else {
+      metaContent[key] = [value];
+    }
+  };
+
+  return {
+    metaContent,
+    set,
+    get,
+  };
+};
+
+interface MetaContentCollection {
+  metaContent: Record<string, MetaTagContent[]>; // Stored values per tag key
+  set: (key: string, value: string) => void; // Normalizes and stores a value
+  get: (properties: string[]) => MetaTagContent[] | undefined; // First match
+}
+
+/** Collects meta, title and canonical link values from the direct children of `head` */
+const collectMetaTags = (head: Element): MetaContentCollection => {
+  const metaTags = initializeMetaContentCollection();
+
+  for (const child of head.childNodes) {
+    if (!isElement(child)) {
+      continue;
+    }
+
+    const content = getAttributeIfTag(child, ["meta"], "content");
+    if (content) {
+      // Tags keys usually use the "name" attribute but open graph uses "property"
+      // Consider them separately in case a meta tag uses both
+      // e.g. <meta property="og:title" name="author" content="Johnny Complex" >
+      const property = getAttributeValue(child, "property");
+      if (property) {
+        metaTags.set(property, content);
+      }
+
+      const name = getAttributeValue(child, "name");
+      if (name && name !== property) {
+        metaTags.set(name, content);
+      }
+    } else if (child.tagName === "title") {
+      // An empty <title></title> has no child nodes, so guard before using `in`
+      const [titleText] = child.childNodes;
+      if (titleText && "value" in titleText) {
+        metaTags.set(TITLE_TAG_KEY, titleText.value);
+      }
+    } else if (child.tagName === "link") {
+      const canonicalUrl = getAttributeValue(child, "href");
+      if (canonicalUrl && hasRelIntersect(child, ["canonical"])) {
+        metaTags.set(CANONICAL_URL_KEY, canonicalUrl);
+      }
+    }
+  }
+  return metaTags;
+};
+
+/**
+ * Builds the implied microformat root out of the collected meta content
+ * @param metaTags Previously parsed meta tag collection
+ * @param options Library parsing options
+ * @returns A single-item list, or an empty list when no property was found
+ */
+const combineRoot = (
+  metaTags: MetaContentCollection,
+  options: ParsingOptions
+): MicroformatRoot[] => {
+  const root: MicroformatRoot = { properties: {} };
+
+  if (isEnabled(options, "lang") && options.inherited.lang) {
+    root.lang = options.inherited.lang;
+  }
+
+  /**
+   * Picks the implied root class from og:type
+   * - "profile" -> h-card
+   * - anything containing "music" or "video" -> h-cite
+   * - everything else (including a missing og:type) -> h-entry
+   */
+  const impliedRootClass = (): string => {
+    const ogType = metaTags.get(["og:type"])?.[0];
+    if (typeof ogType !== "string" || !ogType) {
+      return "h-entry";
+    }
+    if (ogType === "profile") {
+      return "h-card";
+    }
+    const isMedia = ["music", "video"].some((type) => ogType.includes(type));
+    return isMedia ? "h-cite" : "h-entry";
+  };
+  const rootClass = impliedRootClass();
+  root.type = [rootClass];
+
+  /**
+   * Microformat property paired with the meta keys that may supply it.
+   * Keys are listed preferred item first; entry order is the insertion order.
+   */
+  const sources: [string, string[]][] = [
+    ["name", ["og:title", "twitter:title", TITLE_TAG_KEY]],
+    ["summary", ["og:description", "twitter:description", "description"]],
+    ["featured", ["og:image", "twitter:image"]],
+    ["video", ["og:video", "twitter:video"]],
+    ["audio", ["og:audio", "twitter:audio"]],
+    ["published", ["article:published_time", "date"]],
+    ["updated", ["article:modified_time"]],
+    ["author", ["article:author", "author"]],
+    ["url", ["og:url", CANONICAL_URL_KEY]],
+    // Publication properties useful for h-cite
+    ["publication", ["og:site_name", "publisher"]],
+  ];
+
+  if (rootClass === "h-card") {
+    sources.push(
+      ["given-name", ["profile:first_name"]],
+      ["family-name", ["profile:last_name"]]
+    );
+  }
+
+  for (const [property, keys] of sources) {
+    // Empty and undefined values are not added
+    const found = metaTags.get(keys);
+    const values = found ? found.filter(Boolean) : [];
+
+    if (values.length) {
+      root.properties[property] = values;
+    }
+  }
+
+  if (Object.keys(root.properties).length === 0) {
+    return [];
+  }
+
+  return [root];
+};
+
+/**
+ * Builds an implied microformat root from a document's <head> meta tags.
+ * Returns an empty list when there is no <head> or no usable meta content.
+ */
+export const parseMetaformats = (
+  doc: Document,
+  options: ParsingOptions
+): MicroformatRoot[] => {
+  // The parser is expected to always provide <html> and <head>; guard anyway
+  const html = doc.childNodes.find(isTag("html"));
+  const head = html?.childNodes.find(isTag("head"));
+  return head ? combineRoot(collectMetaTags(head), options) : [];
+};
diff --git a/src/helpers/nodeMatchers.ts b/src/helpers/nodeMatchers.ts
index ff7858d8..6aacaed7 100644
--- a/src/helpers/nodeMatchers.ts
+++ b/src/helpers/nodeMatchers.ts
@@ -20,6 +20,11 @@ const propClassRegex = classRegex("(p|e|u|dt)");
 export const isElement = (node: Node): node is Element =>
   "tagName" in node && "childNodes" in node;
 
+/** Returns a type guard that matches elements whose tag name is `tagName` */
+export const isTag = (tagName: string) =>
+  (node: Node): node is Element =>
+    isElement(node) ? node.tagName === tagName : false;
+
 export const isTextNode = (node: Node): node is TextNode => "value" in node;
 
 export const isMicroformatV2Root = (node: Element): boolean =>
diff --git a/src/microformats/property.ts b/src/microformats/property.ts
index ba4be1e4..5f271868 100644
--- a/src/microformats/property.ts
+++ b/src/microformats/property.ts
@@ -34,6 +34,7 @@ export const parseP = (node: Element, options: ParsingOptions): string =>
   getAttributeIfTag(node, ["abbr", "link"], "title") ??
   getAttributeIfTag(node, ["data"], "value") ??
   getAttributeIfTag(node, ["img", "area"], "alt") ??
+  getAttributeIfTag(node, ["meta"], "content") ??
   textContent(node, options);
 
 export const parseU = (
@@ -49,6 +50,7 @@ export const parseU = (
     valueClassPattern(node, options) ??
     getAttributeIfTag(node, ["abbr"], "title") ??
     getAttributeIfTag(node, ["data", "input"], "value") ??
+    getAttributeIfTag(node, ["meta"], "content") ??
     textContent(node, options);
 
   if (typeof url === "string" && isLocalLink(url)) {
@@ -63,6 +65,7 @@ const parseDt = (node: Element, options: ParsingOptions): string =>
   getAttributeIfTag(node, ["time", "ins", "del"], "datetime") ??
   getAttributeIfTag(node, ["abbr"], "title") ??
   getAttributeIfTag(node, ["data", "input"], "value") ??
+  getAttributeIfTag(node, ["meta"], "content") ??
   textContent(node, options);
 
 export const parseE = (node: Element, options: ParsingOptions): Html => {
diff --git a/src/parser.ts b/src/parser.ts
index 1b639617..e4bbed7b 100644
--- a/src/parser.ts
+++ b/src/parser.ts
@@ -6,6 +6,8 @@ import { isMicroformatRoot } from "./helpers/nodeMatchers";
 import { ParsedDocument, ParserOptions, ParsingOptions } from "./types";
 import { validateParsedHtml } from "./validator";
 import { documentSetup } from "./helpers/documentSetup";
+import { parseMetaformats } from "./helpers/metaformats";
+import { isEnabled } from "./helpers/experimental";
 
 export const parser = (
   html: string,
@@ -22,12 +24,17 @@ export const parser = (
     idRefs,
     inherited: { roots: [], lang },
   };
+  let items = findChildren(doc, isMicroformatRoot).map((mf) =>
+    parseMicroformat(mf, parsingOptions)
+  );
+
+  if (items.length === 0 && isEnabled(parsingOptions, "metaformats")) {
+    items = parseMetaformats(doc, parsingOptions);
+  }
 
   return {
     rels,
     "rel-urls": relUrls,
-    items: findChildren(doc, isMicroformatRoot).map((mf) =>
-      parseMicroformat(mf, parsingOptions)
-    ),
+    items,
   };
 };
diff --git a/src/types.ts b/src/types.ts
index a6110350..902bb558 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -2,16 +2,17 @@ import { Element } from "parse5";
 
 import { BackcompatRoot } from "./backcompat";
 
-export type ExperimentalName = "lang" | "textContent";
-
 export interface ParserOptions {
   baseUrl: string;
   experimental?: {
     lang?: boolean;
     textContent?: boolean;
+    metaformats?: boolean;
   };
 }
 
+export type ExperimentalName = keyof NonNullable<ParserOptions["experimental"]>;
+
 export interface ParsingOptions extends ParserOptions {
   implyProperties?: boolean;
   idRefs: IdRefs;
diff --git a/src/validator.ts b/src/validator.ts
index 344b73b0..9ccd74dc 100644
--- a/src/validator.ts
+++ b/src/validator.ts
@@ -1,6 +1,6 @@
-import { Document, Element } from "parse5";
+import { Document } from "parse5";
 
-import { isElement } from "./helpers/nodeMatchers";
+import { isElement, isTag } from "./helpers/nodeMatchers";
 
 const assertIsString = (str: unknown, name: string): string => {
   if (typeof str === "undefined") {
@@ -103,17 +103,13 @@ export const validator = (
 export const validateParsedHtml = (doc: Document): void => {
   // <html> and <body> are always defined (based on tests)
   // Provide error handling in the event they are ever not defined
-  const html = doc.childNodes.find(
-    (child): child is Element => isElement(child) && child.tagName === "html"
-  );
+  const html = doc.childNodes.find(isTag("html"));
 
   if (!html) {
     throw new Error("Microformats parser: No <html> element found");
   }
 
-  const body = html.childNodes.find(
-    (child): child is Element => isElement(child) && child.tagName === "body"
-  );
+  const body = html.childNodes.find(isTag("body"));
 
   if (!body) {
     throw new Error("Microformats parser: No <body> element found");
diff --git a/test/scenarios.spec.ts b/test/scenarios.spec.ts
index 2884a2be..ae75c1a5 100644
--- a/test/scenarios.spec.ts
+++ b/test/scenarios.spec.ts
@@ -1,4 +1,4 @@
-import { expect } from "chai";
+import { expect, assert } from "chai";
 import * as path from "path";
 
 import { mf2 } from "../src";
@@ -72,4 +72,19 @@ describe("mf2() // experimental scenarios", () => {
       expect(result).to.deep.equal(expected);
     });
   });
+
+  it("should respect the experimental flag", () => {
+    // Fetches a loaded experimental scenario by name; fails the test if absent
+    const findTestCase = (searchName: string) => {
+      const testCase = experimental.find(({ name }) => name === searchName);
+      return testCase ?? assert.fail(`Test case "${searchName}" not found`);
+    };
+    const metaformatsCase = findTestCase("metaformats-og-article");
+    const emptyCase = findTestCase("metaformats-missing-head");
+
+    // NOTE(review): assumes `options` leaves experimental.metaformats disabled,
+    // in which case meta tags alone must not produce any items
+    const result = mf2(metaformatsCase.input, { ...options });
+    expect(result).to.deep.equal(emptyCase.expected);
+  });
 });
diff --git a/test/suites/experimental/metaformats-missing-head.html b/test/suites/experimental/metaformats-missing-head.html
new file mode 100644
index 00000000..bf48cbdf
--- /dev/null
+++ b/test/suites/experimental/metaformats-missing-head.html
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html lang="en">
+  <body>
+    <h1>Missing Head</h1>
+    <p>Shouldn't return any items if properties are not found.</p>
+    <img src="http://example.com/img-image.png" />
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-missing-head.json b/test/suites/experimental/metaformats-missing-head.json
new file mode 100644
index 00000000..63780580
--- /dev/null
+++ b/test/suites/experimental/metaformats-missing-head.json
@@ -0,0 +1,5 @@
+{
+  "items": [],
+  "rels": {},
+  "rel-urls": {}
+}
diff --git a/test/suites/experimental/metaformats-og-article.html b/test/suites/experimental/metaformats-og-article.html
index de2dc257..6e0bb6b4 100644
--- a/test/suites/experimental/metaformats-og-article.html
+++ b/test/suites/experimental/metaformats-og-article.html
@@ -1,16 +1,35 @@
 <!DOCTYPE html>
 <html lang="en">
   <head>
+    <title>OGP | Title</title>
     <meta charset="utf-8" />
     <meta property="og:type" content="article" />
-    <meta property="og:title" content="Test title" />
-    <meta property="og:description" content="Test description" />
+    <meta name="twitter:title" content="Title 4 Twitter" />
+    <meta property="og:title" content="Open Graph Protocol" />
+    <meta property="og:url" content="https://ogp.me/" />
+    <meta
+      property="og:description"
+      content="OG is preferred over other meta tags"
+    />
+    <meta name="twitter:description" content="Twitter description" />
+    <meta name="description" content="Description tag" />
     <meta property="og:image" content="http://example.com/image.png" />
+    <meta property="og:image:alt" content="Alt text for image. " />
+    <meta
+      property="og:image:alt"
+      content="Ignored since alt text was already found"
+    />
+    <meta name="twitter:image" content="http://example.com/twitter-image.png" />
     <meta property="article:published_time" content="2015-05-29" />
     <meta property="article:modified_time" content="2016-02-14" />
     <meta property="article:author" content="Glenn Jones" />
+    <meta property="article:author" content="Ghost Writer" />
+    <meta property="article:tag" content="Programming" />
+    <meta property="article:tag" content="Testing" />
   </head>
   <body>
-    <h1>Test</h1>
+    <h1>OGP</h1>
+    <p>OGP tags are read if no microformats are found</p>
+    <img src="http://example.com/img-image.png" />
   </body>
 </html>
diff --git a/test/suites/experimental/metaformats-og-article.json b/test/suites/experimental/metaformats-og-article.json
index 7e475447..fd2feb84 100644
--- a/test/suites/experimental/metaformats-og-article.json
+++ b/test/suites/experimental/metaformats-og-article.json
@@ -1,15 +1,24 @@
 {
-  "items": {
-    "type": ["h-entry"],
-    "properties": {
-      "name": ["Test title"],
-      "summary": ["Test description"],
-      "photo": ["http://example.com/image.png"],
-      "published": "2015-05-29",
-      "updated": "2016-02-14",
-      "author": "Glenn Jones"
+  "items": [
+    {
+      "type": ["h-entry"],
+      "lang": "en",
+      "properties": {
+        "name": ["Open Graph Protocol"],
+        "url": ["https://ogp.me/"],
+        "summary": ["OG is preferred over other meta tags"],
+        "featured": [
+          {
+            "value": "http://example.com/image.png",
+            "alt": "Alt text for image. "
+          }
+        ],
+        "published": ["2015-05-29"],
+        "updated": ["2016-02-14"],
+        "author": ["Glenn Jones", "Ghost Writer"]
+      }
     }
-  },
+  ],
   "rels": {},
   "rel-urls": {}
 }
diff --git a/test/suites/experimental/metaformats-og-audio-soundcloud.html b/test/suites/experimental/metaformats-og-audio-soundcloud.html
new file mode 100644
index 00000000..5b3f9612
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-audio-soundcloud.html
@@ -0,0 +1,67 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="theme-color" content="#333" />
+
+    <title>
+      Stream Over The Moon by Surprise Chef | Listen online for free on
+      SoundCloud
+    </title>
+    <meta
+      content="record, sounds, share, sound, audio, tracks, music, soundcloud"
+      name="keywords"
+    />
+    <meta content="SoundCloud" property="og:site_name" />
+    <meta content="SoundCloud" property="twitter:site" />
+    <meta
+      name="description"
+      content="Stream Over The Moon by Surprise Chef on desktop and mobile. Play over 320 million tracks for free on SoundCloud."
+    />
+    <meta property="twitter:title" content="Over The Moon" />
+    <meta
+      property="twitter:description"
+      content="Surprise Chef’s music is based on evoking mood; their vivid arrangements utilize time and space to build soundscapes that invite the listener into their world. The quintet’s distinct sound pulls from "
+    />
+    <meta property="twitter:card" content="player" />
+    <meta
+      property="twitter:player"
+      content="https://w.soundcloud.com/player/?url=https%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F1455186289&amp;auto_play=false&amp;show_artwork=true&amp;visual=true&amp;origin=twitter"
+    />
+    <meta
+      property="twitter:url"
+      content="https://soundcloud.com/surprisechef/over-the-moon"
+    />
+    <meta property="twitter:player:height" content="400" />
+    <meta property="twitter:player:width" content="435" />
+    <meta
+      property="twitter:image"
+      content="https://i1.sndcdn.com/artworks-92VWfGsSB6dA-0-t500x500.jpg"
+    />
+    <meta property="og:type" content="music.song" />
+    <!-- Soundcloud doesn't actually use og:audio -->
+    <meta
+      property="og:audio"
+      content="https://soundcloud.com/surprisechef/over-the-moon.mp3"
+    />
+    <meta
+      property="og:url"
+      content="https://soundcloud.com/surprisechef/over-the-moon"
+    />
+    <meta property="og:title" content="Over The Moon" />
+    <meta
+      property="og:image"
+      content="https://i1.sndcdn.com/artworks-92VWfGsSB6dA-0-t500x500.jpg"
+    />
+    <meta property="og:image:width" content="500" />
+    <meta property="og:image:height" content="500" />
+    <meta
+      property="og:description"
+      content="Surprise Chef’s music is based on evoking mood; their vivid arrangements utilize time and space to build soundscapes that invite the listener into their world. The quintet’s distinct sound pulls from "
+    />
+  </head>
+  <body>
+    <h1>Over The Moon</h1>
+    <p></p>
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-og-audio-soundcloud.json b/test/suites/experimental/metaformats-og-audio-soundcloud.json
new file mode 100644
index 00000000..596ba4a3
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-audio-soundcloud.json
@@ -0,0 +1,22 @@
+{
+  "items": [
+    {
+      "lang": "en",
+      "properties": {
+        "name": ["Over The Moon"],
+        "featured": [
+          "https://i1.sndcdn.com/artworks-92VWfGsSB6dA-0-t500x500.jpg"
+        ],
+        "publication": ["SoundCloud"],
+        "summary": [
+          "Surprise Chef’s music is based on evoking mood; their vivid arrangements utilize time and space to build soundscapes that invite the listener into their world. The quintet’s distinct sound pulls from "
+        ],
+        "audio": ["https://soundcloud.com/surprisechef/over-the-moon.mp3"],
+        "url": ["https://soundcloud.com/surprisechef/over-the-moon"]
+      },
+      "type": ["h-cite"]
+    }
+  ],
+  "rels": {},
+  "rel-urls": {}
+}
diff --git a/test/suites/experimental/metaformats-og-profile-linkedin.html b/test/suites/experimental/metaformats-og-profile-linkedin.html
new file mode 100644
index 00000000..3d6a0ee2
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-profile-linkedin.html
@@ -0,0 +1,207 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta name="locale" content="en_US" />
+    <link rel="canonical" href="https://www.linkedin.com/in/tantek" />
+    <title>
+      Tantek Çelik - Web Standards Lead - Mozilla Corporation | LinkedIn
+    </title>
+    <meta charset="UTF-8" />
+
+    <meta
+      name="description"
+      content="Standard: View Tantek Çelik’s profile on LinkedIn, the world’s largest professional community. Tantek has 9 jobs listed on their profile. See the complete profile on LinkedIn and discover Tantek’s connections and jobs at similar companies."
+    />
+    <!-- Looks like LinkedIn didn't use property attribute here -->
+    <meta
+      name="og:description"
+      content="OG: View Tantek Çelik’s profile on LinkedIn, the world’s largest professional community. Tantek has 9 jobs listed on their profile. See the complete profile on LinkedIn and discover Tantek’s connections and jobs at similar companies."
+    />
+    <meta
+      name="twitter:description"
+      content="Twitter: View Tantek Çelik’s profile on LinkedIn, the world’s largest professional community. Tantek has 9 jobs listed on their profile. See the complete profile on LinkedIn and discover Tantek’s connections and jobs at similar companies."
+    />
+
+    <meta
+      property="og:title"
+      content="Tantek Çelik - Web Standards Lead - Mozilla Corporation | LinkedIn"
+    />
+    <meta
+      property="og:image"
+      content="https://media.licdn.com/dms/image/C4E03AQFCJlpMj8yLiA/profile-displayphoto-shrink_800_800/0/1516166857593?e=2147483647&amp;v=beta&amp;t=JKdFRqEQmtFMizqPGG-yegxmy0vCycdFZMDVS4elPSY"
+    />
+    <meta property="og:type" content="profile" />
+
+    <meta property="profile:first_name" content="Tantek" />
+    <meta property="profile:last_name" content="Çelik" />
+
+    <meta property="og:url" content="https://www.linkedin.com/in/tantek" />
+
+    <meta name="twitter:card" content="summary" />
+    <meta name="twitter:site" content="@Linkedin" />
+    <meta
+      name="twitter:title"
+      content="Tantek Çelik - Web Standards Lead - Mozilla Corporation | LinkedIn"
+    />
+    <meta
+      name="twitter:image"
+      content="https://media.licdn.com/dms/image/C4E03AQFCJlpMj8yLiA/profile-displayphoto-shrink_800_800/0/1516166857593?e=2147483647&amp;v=beta&amp;t=JKdFRqEQmtFMizqPGG-yegxmy0vCycdFZMDVS4elPSY"
+    />
+    <script type="application/ld+json">
+      {
+        "@context": "http://schema.org",
+        "@graph": [
+          {
+            "@type": "Person",
+            "address": {
+              "@type": "PostalAddress",
+              "addressLocality": "San Francisco, California, United States",
+              "addressCountry": "us"
+            },
+            "alumniOf": [
+              {
+                "@type": "Organization",
+                "name": "Revision3",
+                "url": "https://www.linkedin.com/company/revision3?trk=ppro_cprof",
+                "location": "San Francisco Bay Area",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "Implemented HTML5 video support for Revision3, specifically for the iPad with scaling/fallback for both other devices and various desktop browsers as well.",
+                  "startDate": "2010-05",
+                  "endDate": "2010-05"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "Technorati",
+                "url": "https://www.linkedin.com/company/technorati?trk=ppro_cprof",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "Create and drive the launch of new efforts like Technorati's Media Products services and partnerships, the Searchlet, Technorati This! favelet, blog and site widgets, and prototype microformats search. Investigate and define new standards and new technologies companywide, with partners, and throughout the industry.",
+                  "startDate": "2004-07",
+                  "endDate": "2007-07"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "Microsoft Corporation",
+                "url": "https://www.linkedin.com/company/microsoft?trk=ppro_cprof",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "startDate": "1997-03",
+                  "endDate": "2004-07"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "6prime",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "6prime was my first startup which I cofounded with Eric Soldan.",
+                  "startDate": "1996-01",
+                  "endDate": "1997-05"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "Apple Computer",
+                "url": "https://www.linkedin.com/company/apple?trk=ppro_cprof",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "I was one of two OpenDoc Technical Leads when I left Apple.",
+                  "startDate": "1992-01",
+                  "endDate": "1996-01"
+                }
+              }
+            ],
+            "awards": [],
+            "image": {
+              "@type": "ImageObject",
+              "contentUrl": "https://media.licdn.com/dms/image/C4E03AQFCJlpMj8yLiA/profile-displayphoto-shrink_800_800/0/1516166857593?e=2147483647&v=beta&t=JKdFRqEQmtFMizqPGG-yegxmy0vCycdFZMDVS4elPSY"
+            },
+            "jobTitle": [
+              "Web Standards Lead",
+              "co-founder, admin, community manager",
+              "Founder",
+              "Principal"
+            ],
+            "name": "Tantek Çelik",
+            "sameAs": "https://www.linkedin.com/in/tantek",
+            "url": "https://www.linkedin.com/in/tantek",
+            "memberOf": [
+              {
+                "@type": "Organization",
+                "name": "BarCamp",
+                "description": null
+              },
+              {
+                "@type": "Organization",
+                "name": "FOOCamp",
+                "description": null
+              }
+            ],
+            "worksFor": [
+              {
+                "@type": "Organization",
+                "name": "Mozilla Corporation",
+                "url": "https://www.linkedin.com/company/mozilla-corporation?trk=ppro_cprof",
+                "location": "San Francisco Bay Area",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "startDate": "2010-05"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "microformats.org",
+                "url": "https://www.linkedin.com/company/microformats.org?trk=ppro_cprof",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "startDate": "2005-06"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "GMPG",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "http://gmpg.org/\u003Cbr\u003E\u003Cbr\u003EThe Global Multimedia Protocols Group (GMPG) connects people through incremental simplicity.\u003Cbr\u003E\u003Cbr\u003EJoin the XHTML Friends Network (XFN).  You may already be a member.   \u003Cbr\u003E\u003Cbr\u003Ehttp://gmpg.org/xfn",
+                  "startDate": "2003-03"
+                }
+              },
+              {
+                "@type": "Organization",
+                "name": "TekaPIM",
+                "url": "https://www.linkedin.com/company/heritage-b?trk=ppro_cprof",
+                "member": {
+                  "@type": "OrganizationRole",
+                  "description": "http://tekapim.com",
+                  "startDate": "1987-09"
+                }
+              }
+            ],
+            "knowsLanguage": [],
+            "disambiguatingDescription": "",
+            "interactionStatistic": {
+              "@type": "InteractionCounter",
+              "interactionType": "https://schema.org/FollowAction",
+              "name": "Follows",
+              "userInteractionCount": 578
+            }
+          },
+          {
+            "@type": "WebPage",
+            "url": "https://www.linkedin.com/in/tantek",
+            "reviewedBy": {
+              "@type": "Person",
+              "name": "Tantek Çelik"
+            }
+          }
+        ]
+      }
+    </script>
+  </head>
+  <body>
+    <h1>Test</h1>
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-og-profile-linkedin.json b/test/suites/experimental/metaformats-og-profile-linkedin.json
new file mode 100644
index 00000000..45dfc528
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-profile-linkedin.json
@@ -0,0 +1,31 @@
+{
+  "items": [
+    {
+      "lang": "en",
+      "type": ["h-card"],
+      "properties": {
+        "name": [
+          "Tantek Çelik - Web Standards Lead - Mozilla Corporation | LinkedIn"
+        ],
+        "summary": [
+          "OG: View Tantek Çelik’s profile on LinkedIn, the world’s largest professional community. Tantek has 9 jobs listed on their profile. See the complete profile on LinkedIn and discover Tantek’s connections and jobs at similar companies."
+        ],
+        "featured": [
+          "https://media.licdn.com/dms/image/C4E03AQFCJlpMj8yLiA/profile-displayphoto-shrink_800_800/0/1516166857593?e=2147483647&v=beta&t=JKdFRqEQmtFMizqPGG-yegxmy0vCycdFZMDVS4elPSY"
+        ],
+        "given-name": ["Tantek"],
+        "family-name": ["Çelik"],
+        "url": ["https://www.linkedin.com/in/tantek"]
+      }
+    }
+  ],
+  "rels": {
+    "canonical": ["https://www.linkedin.com/in/tantek"]
+  },
+  "rel-urls": {
+    "https://www.linkedin.com/in/tantek": {
+      "rels": ["canonical"],
+      "text": ""
+    }
+  }
+}
diff --git a/test/suites/experimental/metaformats-og-video-vimeo.html b/test/suites/experimental/metaformats-og-video-vimeo.html
new file mode 100644
index 00000000..cd34b2e8
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-video-vimeo.html
@@ -0,0 +1,68 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta property="og:site_name" content="Vimeo" />
+    <meta property="og:url" content="https://vimeo.com/789006133" />
+    <meta property="og:type" content="video.other" />
+    <meta property="og:title" content="Ultromedia Please (Interactive)" />
+    <meta
+      property="og:description"
+      content="Ultromedia Please is a winner of the 2022 Best of the Year award. To explore the full list of winners, check out vimeo.com/bestoftheyear   What begins as a helpful…"
+    />
+    <!-- Vimeo appears to be using the wrong property name here -->
+    <meta property="og:updated_time" content="2023-07-10T02:20:47-04:00" />
+    <meta
+      property="og:image"
+      content="https://i.vimeocdn.com/video/1586931541-9f193de8dc4391b9676499e272f48c10669bc145876d549fb70c917c0cb1a7dd-d"
+    />
+    <meta property="og:image:alt" content="Image alt text." />
+    <meta
+      property="og:image:secure_url"
+      content="https://i.vimeocdn.com/video/1586931541-9f193de8dc4391b9676499e272f48c10669bc145876d549fb70c917c0cb1a7dd-d"
+    />
+    <meta property="og:image:type" content="image/jpeg" />
+    <meta property="og:image:width" content="1280" />
+    <meta property="og:image:height" content="720" />
+    <meta
+      property="og:video:url"
+      content="https://player.vimeo.com/video/789006133?autoplay=1&amp;h=82e9bae2d0"
+    />
+    <meta
+      property="og:video:secure_url"
+      content="https://player.vimeo.com/video/789006133?autoplay=1&amp;h=82e9bae2d0"
+    />
+    <meta property="og:video:type" content="text/html" />
+    <meta property="og:video:width" content="1280" />
+    <meta property="og:video:height" content="720" />
+    <meta
+      name="description"
+      content="Ultromedia Please is a winner of the 2022 Best of the Year award. To explore the full list of winners, check out vimeo.com/bestoftheyear   What begins as a helpful…"
+    />
+    <meta name="twitter:card" content="player" />
+    <meta name="twitter:site" content="@vimeo" />
+    <meta name="twitter:title" content="Ultromedia Please (Interactive)" />
+    <meta
+      name="twitter:description"
+      content="Ultromedia Please is a winner of the 2022 Best of the Year award. To explore the full list of winners, check out vimeo.com/bestoftheyear   What begins as a helpful…"
+    />
+    <meta
+      name="twitter:image"
+      content="https://i.vimeocdn.com/video/1586931541-9f193de8dc4391b9676499e272f48c10669bc145876d549fb70c917c0cb1a7dd-d"
+    />
+    <meta
+      name="twitter:player"
+      content="https://player.vimeo.com/video/789006133?h=82e9bae2d0"
+    />
+    <meta name="twitter:player:width" content="1280" />
+    <meta name="twitter:player:height" content="720" />
+    <meta name="twitter:site" content="@vimeo" />
+    <link rel="canonical" href="https://vimeo.com/789006133" />
+
+    <title>Ultromedia Please (Interactive) on Vimeo</title>
+  </head>
+  <body>
+    <h1>Test</h1>
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-og-video-vimeo.json b/test/suites/experimental/metaformats-og-video-vimeo.json
new file mode 100644
index 00000000..eeaaa98c
--- /dev/null
+++ b/test/suites/experimental/metaformats-og-video-vimeo.json
@@ -0,0 +1,34 @@
+{
+  "items": [
+    {
+      "lang": "en",
+      "type": ["h-cite"],
+      "properties": {
+        "name": ["Ultromedia Please (Interactive)"],
+        "url": ["https://vimeo.com/789006133"],
+        "summary": [
+          "Ultromedia Please is a winner of the 2022 Best of the Year award. To explore the full list of winners, check out vimeo.com/bestoftheyear   What begins as a helpful…"
+        ],
+        "featured": [
+          {
+            "value": "https://i.vimeocdn.com/video/1586931541-9f193de8dc4391b9676499e272f48c10669bc145876d549fb70c917c0cb1a7dd-d",
+            "alt": "Image alt text."
+          }
+        ],
+        "video": [
+          "https://player.vimeo.com/video/789006133?autoplay=1&h=82e9bae2d0"
+        ],
+        "publication": ["Vimeo"]
+      }
+    }
+  ],
+  "rels": {
+    "canonical": ["https://vimeo.com/789006133"]
+  },
+  "rel-urls": {
+    "https://vimeo.com/789006133": {
+      "rels": ["canonical"],
+      "text": ""
+    }
+  }
+}
diff --git a/test/suites/experimental/metaformats-prefer-mf.html b/test/suites/experimental/metaformats-prefer-mf.html
new file mode 100644
index 00000000..b689028d
--- /dev/null
+++ b/test/suites/experimental/metaformats-prefer-mf.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta property="og:type" content="article" />
+    <meta name="twitter:title" content="Title 4 Twitter" />
+    <meta property="og:title" content="Test title" />
+    <meta property="og:description" content="Test description" />
+    <meta name="twitter:description" content="OG is preferred over twitter" />
+    <meta property="og:image" content="http://example.com/image.png" />
+    <meta name="twitter:image" content="http://example.com/twitter-image.png" />
+    <meta property="article:published_time" content="2015-05-29" />
+    <meta property="article:modified_time" content="2016-02-14" />
+    <meta property="article:author" content="Glenn Jones" />
+    <meta property="article:author" content="Ghost Writer" />
+    <meta property="article:tag" content="Programming" />
+    <meta property="article:tag" content="Testing" />
+  </head>
+  <body class="h-entry">
+    <h1 class="p-name">Microformats</h1>
+    <p class="p-content">True microformats should prevent metaformats</p>
+    <img src="http://example.com/mf-image.png" class="u-photo" />
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-prefer-mf.json b/test/suites/experimental/metaformats-prefer-mf.json
new file mode 100644
index 00000000..c83eee07
--- /dev/null
+++ b/test/suites/experimental/metaformats-prefer-mf.json
@@ -0,0 +1,15 @@
+{
+  "items": [
+    {
+      "type": ["h-entry"],
+      "lang": "en",
+      "properties": {
+        "name": ["Microformats"],
+        "content": ["True microformats should prevent metaformats"],
+        "photo": ["http://example.com/mf-image.png"]
+      }
+    }
+  ],
+  "rels": {},
+  "rel-urls": {}
+}
diff --git a/test/suites/experimental/metaformats-standard.html b/test/suites/experimental/metaformats-standard.html
new file mode 100644
index 00000000..51bfde5f
--- /dev/null
+++ b/test/suites/experimental/metaformats-standard.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>Standard Meta Tags</title>
+    <meta name="description" content="Standard meta tags are also parsed" />
+    <meta name="author" content="Ted Lasso" />
+    <meta name="publisher" content="microformats-parser" />
+    <meta name="date" content="2023-08-02" />
+    <meta
+      name="og:image:alt"
+      content="Alt text without media will be ignored."
+    />
+    <link rel="canonical" href="https://microformats.org/wiki/rel-canonical" />
+  </head>
+  <body>
+    <h1>Test</h1>
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-standard.json b/test/suites/experimental/metaformats-standard.json
new file mode 100644
index 00000000..7fc3bdfd
--- /dev/null
+++ b/test/suites/experimental/metaformats-standard.json
@@ -0,0 +1,25 @@
+{
+  "items": [
+    {
+      "type": ["h-entry"],
+      "lang": "en",
+      "properties": {
+        "name": ["Standard Meta Tags"],
+        "summary": ["Standard meta tags are also parsed"],
+        "published": ["2023-08-02"],
+        "url": ["https://microformats.org/wiki/rel-canonical"],
+        "author": ["Ted Lasso"],
+        "publication": ["microformats-parser"]
+      }
+    }
+  ],
+  "rels": {
+    "canonical": ["https://microformats.org/wiki/rel-canonical"]
+  },
+  "rel-urls": {
+    "https://microformats.org/wiki/rel-canonical": {
+      "rels": ["canonical"],
+      "text": ""
+    }
+  }
+}
diff --git a/test/suites/experimental/metaformats-twitter-article.html b/test/suites/experimental/metaformats-twitter-article.html
new file mode 100644
index 00000000..4e347d57
--- /dev/null
+++ b/test/suites/experimental/metaformats-twitter-article.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="twitter:title" content="Title 4 Twitter" />
+    <meta
+      name="twitter:description"
+      content="Twitter tags are used if no OGP tags are found."
+    />
+    <meta name="twitter:image" content="http://example.com/twitter-image.png" />
+    <meta name="twitter:image:alt" content="This is alt text for an image. " />
+    <meta property="article:published_time" content="2015-05-29" />
+    <meta property="article:modified_time" content="2016-02-14" />
+    <meta property="article:author" content="Glenn Jones" />
+  </head>
+  <body>
+    <h1>Test</h1>
+  </body>
+</html>
diff --git a/test/suites/experimental/metaformats-twitter-article.json b/test/suites/experimental/metaformats-twitter-article.json
new file mode 100644
index 00000000..1871bd5e
--- /dev/null
+++ b/test/suites/experimental/metaformats-twitter-article.json
@@ -0,0 +1,23 @@
+{
+  "items": [
+    {
+      "type": ["h-entry"],
+      "lang": "en",
+      "properties": {
+        "name": ["Title 4 Twitter"],
+        "summary": ["Twitter tags are used if no OGP tags are found."],
+        "featured": [
+          {
+            "value": "http://example.com/twitter-image.png",
+            "alt": "This is alt text for an image. "
+          }
+        ],
+        "published": ["2015-05-29"],
+        "updated": ["2016-02-14"],
+        "author": ["Glenn Jones"]
+      }
+    }
+  ],
+  "rels": {},
+  "rel-urls": {}
+}