diff --git a/js/fontSupp.js b/js/fontSupp.js index af8abec..d0947cc 100644 --- a/js/fontSupp.js +++ b/js/fontSupp.js @@ -70,6 +70,7 @@ const calcSuppFontInfoForWords = async (words) => { * and we need to determine how large to render the text. */ export const calcSuppFontInfo = async (ocrArr) => { + if (!ocrArr) return; await gs.initTesseract({ anyOk: true, langs: ['eng'] }); // console.time('calcSuppFontInfo'); const calcFonts = new Set(); diff --git a/tests/assets/coca-cola-business-and-sustainability-report-2022.pdf b/tests/assets/coca-cola-business-and-sustainability-report-2022.pdf new file mode 100644 index 0000000..c08426d Binary files /dev/null and b/tests/assets/coca-cola-business-and-sustainability-report-2022.pdf differ diff --git a/tests/module/importPdfText.spec.js b/tests/module/importPdfText.spec.js index 7734969..56a39c6 100644 --- a/tests/module/importPdfText.spec.js +++ b/tests/module/importPdfText.spec.js @@ -169,17 +169,44 @@ describe('Check superscripts are detected in PDF imports.', function () { }); }).timeout(120000); -// Note that these font sizes will not match the scribeocr.com interface, as `calcSuppFontInfo` is enabled in the interface but not the tests, -// and this setting scales the font sizes reported by the PDF parser. describe('Check font size is correctly parsed in PDF imports.', function () { this.timeout(10000); - // This word was problematic at one point due to the change in font size between the first and second word. + // Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option. it('Should correctly parse font sizes (1st doc)', async () => { await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + // This word was problematic at one point due to the change in font size between the first and second word. assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 32.5); assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent'); }).timeout(10000); + it('Should correctly parse font sizes and scale using calcSuppFontInfo option (1st doc)', async () => { + scribe.opt.calcSuppFontInfo = true; + await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 39); + assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent'); + }).timeout(10000); + + scribe.opt.calcSuppFontInfo = false; + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + +describe('Check that text-native PDFs with broken encoding dictionaries are detected and skipped.', function () { + this.timeout(10000); + // Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option. + it('Should correctly parse font sizes (1st doc)', async () => { + // Set `calcSuppFontInfo` to `true` as this option previously crashed the program with this type of PDFs. + scribe.opt.calcSuppFontInfo = true; + + await scribe.importFiles([`${ASSETS_PATH_KARMA}/coca-cola-business-and-sustainability-report-2022.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + + assert.strictEqual(scribe.data.ocr.active.length, 0); + }).timeout(10000); + + scribe.opt.calcSuppFontInfo = false; + after(async () => { await scribe.terminate(); });