Skip to content

Commit

Permalink
Added option for getting accurate font metrics for native-text pdfs; …
Browse files Browse the repository at this point in the history
…added option for extracting fonts from PDF documents; fixed positioning of superscripts in pdf text imports
  • Loading branch information
Balearica committed Sep 9, 2024
1 parent 025456a commit 4b2b43e
Show file tree
Hide file tree
Showing 14 changed files with 355 additions and 111 deletions.
4 changes: 4 additions & 0 deletions js/containers/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ export class opt {

/** Generate debug visualizations when running OCR. */
static debugVis = false;

static extractPDFFonts = false;

static calcSuppFontInfo = false;
}

export class inputData {
Expand Down
13 changes: 13 additions & 0 deletions js/containers/fontContainer.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {

const fontFace = new FontFace(fontFamily, src1, { style: fontStyle, weight: fontWeight });

if (fontFace.status === 'error') throw new Error(`FontFace failed to load: ${fontFamily} ${fontStyle} ${fontWeight}`);

// Fonts are stored in `document.fonts` for the main thread and `WorkerGlobalScope.fonts` for workers
const fontSet = globalThis.document ? globalThis.document.fonts : globalThis.fonts;

Expand Down Expand Up @@ -157,6 +159,10 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
/** @type {("sans"|"serif")} */
this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
this.smallCapsMult = 0.75;
/**
* @type {boolean} - Disable font. This is used to prevent a flawed font extracted from a PDF from being used.
*/
this.disable = false;

if (typeof FontFace !== 'undefined') loadFontFace(this.fontFaceName, this.fontFaceStyle, this.fontFaceWeight, this.src);
}
Expand Down Expand Up @@ -228,6 +234,9 @@ export class FontCont {
/** @type {?FontContainer} */
static opt = null;

/** @type {?Object<string, FontContainerFamilyUpload>} */
static doc = null;

/** @type {?FontContainer} */
static export = null;

Expand Down Expand Up @@ -298,6 +307,10 @@ export class FontCont {
* @returns {FontContainerFont}
*/
static getFont = (family, style = 'normal', lang = 'eng') => {
if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
return FontCont.doc[family][style];
}

if (lang === 'chi_sim') {
if (!FontCont.supp.chi_sim) throw new Error('chi_sim font does not exist.');
return FontCont.supp.chi_sim;
Expand Down
69 changes: 44 additions & 25 deletions js/containers/imageContainer.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';

import { getImageBitmap } from '../utils/imageUtils.js';

import { setUploadFontsWorker } from '../fontContainerMain.js';
import { updateFontContWorkerMain } from '../fontContainerMain.js';
import { pageMetricsArr } from './dataContainer.js';
import {
FontCont,
Expand All @@ -16,7 +16,7 @@ import {

import { gs } from '../generalWorkerMain.js';
import { imageUtils } from '../objects/imageObjects.js';
import { determineSansSerif, range } from '../utils/miscUtils.js';
import { range } from '../utils/miscUtils.js';
import { opt } from './app.js';

let skipTextMode = false;
Expand Down Expand Up @@ -534,42 +534,61 @@ export class ImageCache {
});

// WIP: Extract fonts embedded in PDFs.
if (false) {
// This feature is disabled by default as the results are often bad.
// In addition to only working for certain font formats, fonts embedded in PDFs are often subsetted and/or corrupted.
// Therefore, before this is enabled by default, more sophisticated rules regarding when fonts should be used are needed.
if (opt.extractPDFFonts) {
muPDFScheduler.extractAllFonts().then(async (x) => {
globalImageCache.fontArr = [];
for (let i = 0; i < x.length; i++) {
const src = x[i].buffer;
const fontObj = await loadOpentype(src);
const fontNameEmbedded = fontObj.names.postScriptName.en;
const fontFamilyEmbedded = fontObj.names?.fontFamily?.en || fontNameEmbedded.replace(/-\w+$/, '');
let fontObj;
let fontData;
try {
fontObj = await loadOpentype(src);
// Raw fonts embedded in PDFs are commonly invalid and get rejected by the OpenType Sanitizer (OTS) used by browsers, but parsing and re-serializing them with opentype.js fixes them.
// This appears to be because of the way that fonts are subsetted in PDFs.
fontData = fontObj.toArrayBuffer();
} catch (error) {
console.error(`Error loading font ${i}.`);
console.error(error);
continue;
}

// Skip bold and bold-italic fonts for now.
if (fontNameEmbedded.match(/bold/i)) continue;
const fontNameEmbedded = fontObj.names.postScriptName.en;

let fontStyle = 'normal';
if (fontNameEmbedded.match(/italic/i)) {
fontStyle = 'italic';
} else if (fontNameEmbedded.match(/bold/i)) {
// Bold fonts should be enabled at some later point.
// While we previously found that we were unable to detect bold fonts reliably,
// when importing from PDFs, we do not need to guess.
// fontStyle = 'bold';
fontStyle = 'bold';
}
const type = determineSansSerif(fontFamilyEmbedded) === 'SansDefault' ? 'sans' : 'serif';

// mupdf replaces spaces with underscores in font names.
const fontName = fontFamilyEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');

if (!FontCont.raw[fontName]) {
FontCont.raw[fontName] = {};
}

if (!FontCont.raw[fontName][fontStyle]) {
FontCont.raw[fontName][fontStyle] = new FontContainerFont(fontName, fontStyle, src, false, fontObj);
// mupdf makes changes to font names, so we need to do the same.
// Font names in the form `MEDJCO+CenturySchoolbook` are changed to `CenturySchoolbook`.
// Spaces are replaced with underscores.
const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');

if (!FontCont.doc?.[fontName]?.[fontStyle]) {
try {
const fontContainer = new FontContainerFont(fontName, fontStyle, fontData, false, fontObj);

if (!FontCont.doc) {
FontCont.doc = {};
}

if (!FontCont.doc[fontName]) {
FontCont.doc[fontName] = {};
}

FontCont.doc[fontName][fontStyle] = fontContainer;
} catch (error) {
console.error(`Error loading font ${fontName} ${fontStyle}.`);
}
} else {
console.warn(`Font ${fontName} ${fontStyle} already exists.`);
}
}

await setUploadFontsWorker(gs.schedulerInner);
await updateFontContWorkerMain();
});
}
};
Expand Down
Loading

0 comments on commit 4b2b43e

Please sign in to comment.