Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Experimental): add support for metaformats #229

Merged
merged 4 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-node@v1
with:
node-version: 16
node-version: 18
- name: Install dependencies
run: yarn
- name: Lint code
Expand Down
2 changes: 1 addition & 1 deletion .nvmrc
Original file line number Diff line number Diff line change
@@ -1 +1 @@
16
18
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ These are sourced from the element themselves, a parent microformat, the HTML do

When parsing microformats for text content, all the consecutive whitespace is collapsed into a single space. `<br/>` and `<p>` tags are treated as line breaks.

#### `metaformats`

Enables fallback to [metaformats](https://microformats.org/wiki/metaformats) parsing, which infers content from `<meta>` tags in the document head. The fallback is only used when no microformats are found in the document.

## Contributing

See our [contributing guidelines](./CONTRIBUTING.md) for more information.
6 changes: 5 additions & 1 deletion demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ window.parseHtml = () => {
const baseUrl = document.getElementById("base-url").value;
const lang = document.getElementById("lang").checked;
const textContent = document.getElementById("textContent").checked;
const metaformats = document.getElementById("metaformats").checked;

return parse(html, { baseUrl, experimental: { lang, textContent } });
return parse(html, {
baseUrl,
experimental: { lang, textContent, metaformats },
});
};
10 changes: 10 additions & 0 deletions demo/index.tpl.html
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ <h3>Experimental options</h3>
/>
<span>Better text content</span>
</label>
<label>
<input
type="checkbox"
name="metaformats"
id="metaformats"
value="true"
checked
/>
<span>Metaformats parsing</span>
</label>
</p>

<div class="submit">
Expand Down
246 changes: 246 additions & 0 deletions src/helpers/metaformats.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
import { Document, Element } from "parse5";

import { MicroformatRoot, ParsingOptions } from "../types";
import {
getAttributeIfTag,
getAttributeValue,
hasRelIntersect,
} from "./attributes";
import { isEnabled } from "./experimental";
import { isElement, isTag } from "./nodeMatchers";

/** Special key under which the text of the title tag is stored in the meta collection */
const TITLE_TAG_KEY = "<title>";
/** Special key under which the canonical link href is stored in the meta collection */
const CANONICAL_URL_KEY = "<canonical>";
/** Media types whose og:/twitter: tags get url and alt-text normalization */
const MEDIA_TYPES = ["image", "video", "audio"];
/** Media sub-properties that are aliases of the base media tag (e.g. og:image:url -> og:image) */
const MEDIA_URL_SUBTYPES = ["src", "url", "secure_url"];

/** A media url together with its alt text */
interface ComplexMediaMeta {
  value: string;
  alt: string;
}
type MetaTagContent = string | ComplexMediaMeta;

/** Normalized store of meta tag values, keyed by tag name */
interface MetaContentCollection {
  metaContent: Record<string, MetaTagContent[]>;
  set: (key: string, value: string) => void;
  get: (properties: string[]) => MetaTagContent[] | undefined;
}

/**
 * Creates a normalized store for meta tags
 */
const initializeMetaContentCollection = (): MetaContentCollection => {
  /**
   * Collection of all relevant meta tag content
   * Since tag order isn't guaranteed, need to collect all values before applying defaults
   *
   * Keys come straight from document markup, so the store has no prototype:
   * a key such as "constructor" or "__proto__" must never resolve to an
   * inherited Object.prototype member.
   */
  const metaContent: Record<string, MetaTagContent[]> = Object.create(null);

  /**
   * Gets the values of the first property found
   * @param properties Array of properties to look for, preferred item first
   * @returns Stored values of the first matching property, or undefined if none match
   */
  const get = (properties: string[]): MetaTagContent[] | undefined => {
    for (const key of properties) {
      const values = metaContent[key];
      if (values) {
        return values;
      }
    }
    return undefined;
  };

  /**
   * Stores meta tag values.
   *
   * Includes following normalization rules:
   * - Duplicates are removed from repeated (array) tags
   * - src, url, and secure_url media tags are treated same as base (e.g. og:image:url -> og:image)
   * - Alt text is added as property on last image url
   *
   * @param key Meta tag name, e.g. "og:image:url"
   * @param value Content of the meta tag
   */
  const set = (key: string, value: string): void => {
    // Split tag name to normalize values like "og:video:url"
    const [domain, type, subtype] = key.split(":");
    let storageKey = key;

    // Media tags specific parsing
    if (
      (domain === "og" || domain === "twitter") &&
      MEDIA_TYPES.includes(type)
    ) {
      if (subtype === "alt") {
        const media = metaContent[`${domain}:${type}`];

        if (media?.length) {
          const lastIndex = media.length - 1;
          const last = media[lastIndex];

          // Only upgrade a plain url; a repeated alt tag keeps the first alt text
          if (typeof last === "string") {
            media[lastIndex] = { value: last, alt: value };
          }
        }

        return; // Alt text is never stored under its own key
      }

      if (MEDIA_URL_SUBTYPES.includes(subtype)) {
        // Normalize the different url spellings onto the base media key
        // Duplicates are cleaned up on insertion below
        storageKey = `${domain}:${type}`;
      }
    }

    const existing = metaContent[storageKey];

    if (!existing) {
      metaContent[storageKey] = [value];
      return;
    }

    const isDuplicate = existing.some(
      (stored) => (typeof stored === "string" ? stored : stored.value) === value
    );

    if (!isDuplicate) {
      existing.push(value);
    } // Else ignore duplicates
  };

  return {
    metaContent,
    set,
    get,
  };
};

/**
 * Gathers the values used for metaformats parsing from the direct children of
 * the head element: meta tags with content, the title text and the canonical link.
 * @param head The document's head element
 * @returns Collection of the values found
 */
const collectMetaTags = (head: Element): MetaContentCollection => {
  const metaTags = initializeMetaContentCollection();

  for (const child of head.childNodes) {
    if (!isElement(child)) {
      continue;
    }

    const content = getAttributeIfTag(child, ["meta"], "content");
    if (content) {
      // Tags keys usually use the "name" attribute but open graph uses "property"
      // Consider them separately in case a meta tag uses both
      // e.g. <meta property="og:title" name="author" content="Johnny Complex" >
      const property = getAttributeValue(child, "property");
      if (property) {
        metaTags.set(property, content);
      }

      const name = getAttributeValue(child, "name");
      if (name && name !== property) {
        metaTags.set(name, content);
      }
    } else if (child.tagName === "title") {
      // An empty <title></title> has no child nodes, so make sure the first
      // child exists before probing it with the "in" operator
      const firstChild = child.childNodes[0];
      if (firstChild && "value" in firstChild) {
        metaTags.set(TITLE_TAG_KEY, firstChild.value);
      }
    } else if (
      child.tagName === "link" &&
      hasRelIntersect(child, ["canonical"])
    ) {
      const canonicalUrl = getAttributeValue(child, "href");
      if (canonicalUrl) {
        metaTags.set(CANONICAL_URL_KEY, canonicalUrl);
      }
    }
  }

  return metaTags;
};

/**
 * Builds the implied microformat root from previously collected meta content
 * @param metaTags Previously parsed meta tag collection
 * @param options Library parsing options
 * @returns A single-item array with the implied root, or an empty array when
 * no property could be filled in
 */
const combineRoot = (
  metaTags: MetaContentCollection,
  options: ParsingOptions
): MicroformatRoot[] => {
  const item: MicroformatRoot = { properties: {} };

  if (isEnabled(options, "lang") && options.inherited.lang) {
    item.lang = options.inherited.lang;
  }

  // og:type picks the implied root class; anything unrecognized stays h-entry
  const ogType = metaTags.get(["og:type"])?.[0];
  let impliedRootClass = "h-entry";
  if (typeof ogType === "string" && ogType !== "") {
    if (ogType === "profile") {
      impliedRootClass = "h-card";
    } else if (ogType.includes("music") || ogType.includes("video")) {
      impliedRootClass = "h-cite";
    }
  }
  item.type = [impliedRootClass];

  // Microformat property -> meta keys to read it from, preferred key first.
  // Order of the rows is the order properties are defined on the root.
  const propertySources: Array<[string, string[]]> = [
    ["name", ["og:title", "twitter:title", TITLE_TAG_KEY]],
    ["summary", ["og:description", "twitter:description", "description"]],
    ["featured", ["og:image", "twitter:image"]],
    ["video", ["og:video", "twitter:video"]],
    ["audio", ["og:audio", "twitter:audio"]],
    ["published", ["article:published_time", "date"]],
    ["updated", ["article:modified_time"]],
    ["author", ["article:author", "author"]],
    ["url", ["og:url", CANONICAL_URL_KEY]],
    // Publication properties useful for h-cite
    ["publication", ["og:site_name", "publisher"]],
  ];

  if (impliedRootClass === "h-card") {
    propertySources.push(
      ["given-name", ["profile:first_name"]],
      ["family-name", ["profile:last_name"]]
    );
  }

  for (const [property, sources] of propertySources) {
    // Empty and undefined values are not added
    const found = (metaTags.get(sources) ?? []).filter(Boolean);
    if (found.length) {
      item.properties[property] = found;
    }
  }

  return Object.keys(item.properties).length === 0 ? [] : [item];
};

/**
 * Parses a document's head into an implied microformat root (metaformats)
 * @param doc Parsed HTML document
 * @param options Library parsing options
 * @returns The implied root wrapped in an array, or an empty array when the
 * document has no head element or no usable meta content
 */
export const parseMetaformats = (
  doc: Document,
  options: ParsingOptions
): MicroformatRoot[] => {
  const html = doc.childNodes.find(isTag("html"));
  const head = html?.childNodes.find(isTag("head"));

  // Validation and manual testing suggest html and head are always present,
  // but guard at runtime rather than asserting so a headless tree yields no
  // items instead of throwing
  if (!head) {
    return [];
  }

  return combineRoot(collectMetaTags(head), options);
};
5 changes: 5 additions & 0 deletions src/helpers/nodeMatchers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ const propClassRegex = classRegex("(p|e|u|dt)");
/** Matches nodes that have both a tag name and child nodes */
export const isElement = (node: Node): node is Element => {
  const hasTagName = "tagName" in node;
  return hasTagName && "childNodes" in node;
};

/**
 * Creates a matcher for elements with the given tag name
 * @param tagName Tag name compared against the node's tagName with strict equality
 */
export const isTag = (tagName: string) => {
  const matchesTag = (node: Node): node is Element => {
    if (!isElement(node)) {
      return false;
    }
    return node.tagName === tagName;
  };
  return matchesTag;
};

/** Matches nodes that have a value property */
export const isTextNode = (node: Node): node is TextNode => {
  return "value" in node;
};

export const isMicroformatV2Root = (node: Element): boolean =>
Expand Down
1 change: 1 addition & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export interface Options {
experimental?: {
lang?: boolean;
textContent?: boolean;
metaformats?: boolean;
};
}

Expand Down
3 changes: 3 additions & 0 deletions src/microformats/property.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ export const parseP = (node: Element, options: ParsingOptions): string =>
getAttributeIfTag(node, ["abbr", "link"], "title") ??
getAttributeIfTag(node, ["data"], "value") ??
getAttributeIfTag(node, ["img", "area"], "alt") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

export const parseU = (
Expand All @@ -49,6 +50,7 @@ export const parseU = (
valueClassPattern(node, options) ??
getAttributeIfTag(node, ["abbr"], "title") ??
getAttributeIfTag(node, ["data", "input"], "value") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

if (typeof url === "string" && isLocalLink(url)) {
Expand All @@ -63,6 +65,7 @@ const parseDt = (node: Element, options: ParsingOptions): string =>
getAttributeIfTag(node, ["time", "ins", "del"], "datetime") ??
getAttributeIfTag(node, ["abbr"], "title") ??
getAttributeIfTag(node, ["data", "input"], "value") ??
getAttributeIfTag(node, ["meta"], "content") ??
textContent(node, options);

export const parseE = (node: Element, options: ParsingOptions): Html => {
Expand Down
13 changes: 10 additions & 3 deletions src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import { isMicroformatRoot } from "./helpers/nodeMatchers";
import { ParsedDocument, ParserOptions, ParsingOptions } from "./types";
import { validateParsedHtml } from "./validator";
import { documentSetup } from "./helpers/documentSetup";
import { parseMetaformats } from "./helpers/metaformats";
import { isEnabled } from "./helpers/experimental";

export const parser = (
html: string,
Expand All @@ -22,12 +24,17 @@ export const parser = (
idRefs,
inherited: { roots: [], lang },
};
let items = findChildren(doc, isMicroformatRoot).map((mf) =>
parseMicroformat(mf, parsingOptions)
);

if (items.length === 0 && isEnabled(parsingOptions, "metaformats")) {
items = parseMetaformats(doc, parsingOptions);
}

return {
rels,
"rel-urls": relUrls,
items: findChildren(doc, isMicroformatRoot).map((mf) =>
parseMicroformat(mf, parsingOptions)
),
items,
};
};
1 change: 1 addition & 0 deletions src/rels/rels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ export const parseRel = (
relUrls[href] = { rels: [rel], text };
} else if (!relUrls[href].rels.includes(rel)) {
relUrls[href].rels.push(rel);
relUrls[href].rels.sort();
}

if (text && !relUrls[href].text) {
Expand Down
Loading