Skip to content

Commit

Permalink
feat(Experimental): add support for metaformats (#229)
Browse files Browse the repository at this point in the history
* feat(Experimental): add support for metaformats

* implement metaformats parsing

Closes #224

* chore(deps): update microformats/test (#1)

should fix test ordering issue

---------

Co-authored-by: aimee-gm <12508200+aimee-gm@users.noreply.github.com>
  • Loading branch information
aciccarello and aimee-gm authored Sep 4, 2023
1 parent e5b6070 commit 38e14bb
Show file tree
Hide file tree
Showing 33 changed files with 966 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-node@v1
with:
node-version: 16
node-version: 18
- name: Install dependencies
run: yarn
- name: Lint code
Expand Down
2 changes: 1 addition & 1 deletion .nvmrc
Original file line number Diff line number Diff line change
@@ -1 +1 @@
16
18
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ These are sourced from the element themselves, a parent microformat, the HTML do

When parsing microformats for text content, all the consecutive whitespace is collapsed into a single space. `<br/>` and `<p>` tags are treated as line breaks.

#### `metaformats`

Enables fallback to [metaformats](https://microformats.org/wiki/metaformats) parsing, which infers content from `<meta>` tags when no microformats are found in the document.

## Contributing

See our [contributing guidelines](./CONTRIBUTING.md) for more information.
6 changes: 5 additions & 1 deletion demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ window.parseHtml = () => {
const baseUrl = document.getElementById("base-url").value;
const lang = document.getElementById("lang").checked;
const textContent = document.getElementById("textContent").checked;
const metaformats = document.getElementById("metaformats").checked;

return parse(html, { baseUrl, experimental: { lang, textContent } });
return parse(html, {
baseUrl,
experimental: { lang, textContent, metaformats },
});
};
10 changes: 10 additions & 0 deletions demo/index.tpl.html
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ <h3>Experimental options</h3>
/>
<span>Better text content</span>
</label>
<label>
<input
type="checkbox"
name="metaformats"
id="metaformats"
value="true"
checked
/>
<span>Metaformats parsing</span>
</label>
</p>

<div class="submit">
Expand Down
246 changes: 246 additions & 0 deletions src/helpers/metaformats.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
import { Document, Element } from "parse5";

import { MicroformatRoot, ParsingOptions } from "../types";
import {
getAttributeIfTag,
getAttributeValue,
hasRelIntersect,
} from "./attributes";
import { isEnabled } from "./experimental";
import { isElement, isTag } from "./nodeMatchers";

/** Special key under which the document's <title> text is stored in the meta collection */
const TITLE_TAG_KEY = "<title>";
/** Special key under which the canonical link href is stored in the meta collection */
const CANONICAL_URL_KEY = "<canonical>";
/** Media types (second segment of an og:/twitter: key) that get url and alt normalization */
const MEDIA_TYPES = ["image", "video", "audio"];
/** Media subtypes treated as aliases of the base media tag (e.g. og:image:url -> og:image) */
const MEDIA_URL_SUBTYPES = ["src", "url", "secure_url"];

/** Media value that had alt text attached by a following `:alt` tag */
interface ComplexMediaMeta {
  value: string;
  alt: string;
}
/** Stored meta value: plain content, or media content paired with its alt text */
type MetaTagContent = string | ComplexMediaMeta;

/** Normalized store of meta tag values with its accessor functions */
interface MetaContentCollection {
  metaContent: Record<string, MetaTagContent[]>;
  set: (key: string, value: string) => void;
  get: (properties: string[]) => MetaTagContent[] | undefined;
}

/**
 * Creates a normalized store for meta tags
 * @returns The backing record plus `set`/`get` functions that operate on it
 */
const initializeMetaContentCollection = (): MetaContentCollection => {
  /**
   * Collection of all relevant meta tag content
   * Since tag order isn't guaranteed, need to collect all values before applying defaults
   *
   * Created without a prototype: keys come straight from document markup, so a
   * key such as "constructor" or "__proto__" must not resolve to an inherited
   * Object member (which would be truthy but not an array).
   */
  const metaContent: Record<string, MetaTagContent[]> = Object.create(null);

  /**
   * Gets the values of the first property found
   * @param properties Array of properties to look for, preferred item first
   * @returns Stored values of the first key that has any, otherwise undefined
   */
  const get = (properties: string[]) => {
    for (const key of properties) {
      if (metaContent[key]) {
        return metaContent[key];
      }
    }
    return;
  };

  /**
   * Stores meta tag values.
   *
   * Includes following normalization rules:
   * - Duplicates are removed from repeated (array) tags
   * - src, url, and secure_url media tags are treated same as base (e.g. og:image:url -> og:image)
   * - Alt text is added as property on last image url
   * @param key Meta tag name or property (e.g. "og:image:url")
   * @param value Content of the meta tag
   */
  const set = (key: string, value: string) => {
    // Split tag name to normalize values like "og:video:url"
    const [domain, type, subtype] = key.split(":");

    // Media tags specific parsing
    if (
      (domain === "og" || domain === "twitter") &&
      MEDIA_TYPES.includes(type)
    ) {
      if (subtype === "alt") {
        const existingMedia = metaContent[`${domain}:${type}`];

        if (existingMedia?.length) {
          const last = existingMedia.pop();

          if (typeof last === "string") {
            existingMedia.push({ value: last, alt: value });
          } else if (last) {
            // Found duplicate alt text tag so re-inserting existing
            // last should always be object. if condition added for types
            existingMedia.push(last);
          }
        }

        return; // Stop as alt text is already added
      } else if (MEDIA_URL_SUBTYPES.includes(subtype)) {
        // Mutate key to normalize different url values
        // Duplicates will be cleaned up on insertion
        key = `${domain}:${type}`;
      }
    }
    const existing = metaContent[key];

    if (existing) {
      // Compare against the raw url of entries that already carry alt text
      const isDuplicate = existing
        .map((existingValue) =>
          typeof existingValue === "string"
            ? existingValue
            : existingValue.value
        )
        .some((existingValue) => value === existingValue);

      if (!isDuplicate) {
        existing.push(value);
      } // Else ignore duplicates
    } else {
      metaContent[key] = [value];
    }
  };

  return {
    metaContent,
    set,
    get,
  };
};

/**
 * Collects metadata from the direct children of the document head
 *
 * Stores the content of meta tags (keyed by their `property` and/or `name`
 * attribute), the first child text of the title tag, and the href of a
 * canonical link.
 * @param head The head element whose child nodes are scanned
 * @returns Collection holding every value that was found
 */
const collectMetaTags = (head: Element): MetaContentCollection => {
  const metaTags = initializeMetaContentCollection();

  // childNodes is an array, so iterate its values rather than its keys
  for (const child of head.childNodes) {
    if (!isElement(child)) {
      continue;
    }

    const content = getAttributeIfTag(child, ["meta"], "content");
    if (content) {
      // Tags keys usually use the "name" attribute but open graph uses "property"
      // Consider them separately in case a meta tag uses both
      // e.g. <meta property="og:title" name="author" content="Johnny Complex" >
      const property = getAttributeValue(child, "property");
      if (property) {
        metaTags.set(property, content);
      }

      const name = getAttributeValue(child, "name");
      if (name && name !== property) {
        metaTags.set(name, content);
      }
    } else if (child.tagName === "title") {
      // An empty <title></title> has no child nodes; guard before using `in`,
      // which throws a TypeError when its right-hand operand is undefined
      const [titleText] = child.childNodes;
      if (titleText && "value" in titleText) {
        metaTags.set(TITLE_TAG_KEY, titleText.value);
      }
    } else if (
      child.tagName === "link" &&
      hasRelIntersect(child, ["canonical"])
    ) {
      const canonicalUrl = getAttributeValue(child, "href");
      if (canonicalUrl) {
        metaTags.set(CANONICAL_URL_KEY, canonicalUrl);
      }
    }
  }
  return metaTags;
};

/**
 * Builds the implied microformat root from collected meta tag content
 * @param metaTags Previously parsed meta tag collection
 * @param options Library parsing options
 * @returns Array holding the single implied item, or an empty array when no property was found
 */
const combineRoot = (
  metaTags: MetaContentCollection,
  options: ParsingOptions
): MicroformatRoot[] => {
  const root: MicroformatRoot = { properties: {} };

  if (isEnabled(options, "lang") && options.inherited.lang) {
    root.lang = options.inherited.lang;
  }

  // og:type selects the implied root class; anything unrecognised stays h-entry
  const [ogType] = metaTags.get(["og:type"]) ?? [];
  let rootClass = "h-entry";
  if (ogType && typeof ogType === "string") {
    if (ogType === "profile") {
      rootClass = "h-card";
    } else if (ogType.includes("music") || ogType.includes("video")) {
      rootClass = "h-cite";
    }
  }
  root.type = [rootClass];

  // Microformat property -> meta keys to read it from, preferred key first
  const propertySources: [string, string[]][] = [
    ["name", ["og:title", "twitter:title", TITLE_TAG_KEY]],
    ["summary", ["og:description", "twitter:description", "description"]],
    ["featured", ["og:image", "twitter:image"]],
    ["video", ["og:video", "twitter:video"]],
    ["audio", ["og:audio", "twitter:audio"]],
    ["published", ["article:published_time", "date"]],
    ["updated", ["article:modified_time"]],
    ["author", ["article:author", "author"]],
    ["url", ["og:url", CANONICAL_URL_KEY]],
    // Publication properties useful for h-cite
    ["publication", ["og:site_name", "publisher"]],
  ];

  if (rootClass === "h-card") {
    propertySources.push(
      ["given-name", ["profile:first_name"]],
      ["family-name", ["profile:last_name"]]
    );
  }

  for (const [property, keys] of propertySources) {
    // Empty and undefined values are not added
    const values = (metaTags.get(keys) ?? []).filter(Boolean);
    if (values.length) {
      root.properties[property] = values;
    }
  }

  const hasProperties = Object.keys(root.properties).length > 0;
  return hasProperties ? [root] : [];
};

/**
 * Parses an implied microformat item from the metadata in the document head
 * @param doc Parsed document to read the html and head elements from
 * @param options Library parsing options
 * @returns The implied items, or an empty array when there is no head element or no usable metadata
 */
export const parseMetaformats = (
  doc: Document,
  options: ParsingOptions
): MicroformatRoot[] => {
  const html = doc.childNodes.find(isTag("html"));
  const head = html?.childNodes.find(isTag("head"));

  // A missing head means there is no metadata to read, so there is nothing
  // to imply; guard at runtime rather than asserting it is always present
  if (!head) {
    return [];
  }

  return combineRoot(collectMetaTags(head), options);
};
5 changes: 5 additions & 0 deletions src/helpers/nodeMatchers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ const propClassRegex = classRegex("(p|e|u|dt)");
/** Type guard: a node is an element when it has both `tagName` and `childNodes` */
export const isElement = (node: Node): node is Element => {
  const hasTagName = "tagName" in node;
  return hasTagName && "childNodes" in node;
};

/**
 * Builds a type guard matching elements with the given tag name
 * @param tagName Tag name the element's `tagName` must equal
 * @returns Predicate that is true only for element nodes with that tag name
 */
export const isTag = (tagName: string) => {
  const matchesTag = (node: Node): node is Element =>
    isElement(node) && node.tagName === tagName;

  return matchesTag;
};

/** Type guard: a node is treated as a text node when it has a `value` property */
export const isTextNode = (node: Node): node is TextNode => {
  return Reflect.has(node, "value");
};

export const isMicroformatV2Root = (node: Element): boolean =>
Expand Down
1 change: 1 addition & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export interface Options {
experimental?: {
lang?: boolean;
textContent?: boolean;
metaformats?: boolean;
};
}

Expand Down
3 changes: 3 additions & 0 deletions src/microformats/property.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ export const parseP = (node: Element, options: ParsingOptions): string =>
getAttributeIfTag(node, ["abbr", "link"], "title") ??
getAttributeIfTag(node, ["data"], "value") ??
getAttributeIfTag(node, ["img", "area"], "alt") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

export const parseU = (
Expand All @@ -49,6 +50,7 @@ export const parseU = (
valueClassPattern(node, options) ??
getAttributeIfTag(node, ["abbr"], "title") ??
getAttributeIfTag(node, ["data", "input"], "value") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

if (typeof url === "string" && isLocalLink(url)) {
Expand All @@ -63,6 +65,7 @@ const parseDt = (node: Element, options: ParsingOptions): string =>
getAttributeIfTag(node, ["time", "ins", "del"], "datetime") ??
getAttributeIfTag(node, ["abbr"], "title") ??
getAttributeIfTag(node, ["data", "input"], "value") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

export const parseE = (node: Element, options: ParsingOptions): Html => {
Expand Down
13 changes: 10 additions & 3 deletions src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import { isMicroformatRoot } from "./helpers/nodeMatchers";
import { ParsedDocument, ParserOptions, ParsingOptions } from "./types";
import { validateParsedHtml } from "./validator";
import { documentSetup } from "./helpers/documentSetup";
import { parseMetaformats } from "./helpers/metaformats";
import { isEnabled } from "./helpers/experimental";

export const parser = (
html: string,
Expand All @@ -22,12 +24,17 @@ export const parser = (
idRefs,
inherited: { roots: [], lang },
};
let items = findChildren(doc, isMicroformatRoot).map((mf) =>
parseMicroformat(mf, parsingOptions)
);

if (items.length === 0 && isEnabled(parsingOptions, "metaformats")) {
items = parseMetaformats(doc, parsingOptions);
}

return {
rels,
"rel-urls": relUrls,
items: findChildren(doc, isMicroformatRoot).map((mf) =>
parseMicroformat(mf, parsingOptions)
),
items,
};
};
1 change: 1 addition & 0 deletions src/rels/rels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ export const parseRel = (
relUrls[href] = { rels: [rel], text };
} else if (!relUrls[href].rels.includes(rel)) {
relUrls[href].rels.push(rel);
relUrls[href].rels.sort();
}

if (text && !relUrls[href].text) {
Expand Down
Loading

0 comments on commit 38e14bb

Please sign in to comment.