Fixed crash when importing PDF with invalid dictionary with calcSuppFontInfo option enabled, added new tests
scribeocr · Oct 28, 2024 · 0588cfc · 0588cfc
1 parent 6383a00
commit 0588cfc
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 3 deletions.
diff --git a/js/fontSupp.js b/js/fontSupp.js
@@ -70,6 +70,7 @@ const calcSuppFontInfoForWords = async (words) => {
    * and we need to determine how large to render the text.
    */
 export const calcSuppFontInfo = async (ocrArr) => {
+  if (!ocrArr) return;
   await gs.initTesseract({ anyOk: true, langs: ['eng'] });
   // console.time('calcSuppFontInfo');
   const calcFonts = new Set();

diff --git a/tests/assets/coca-cola-business-and-sustainability-report-2022.pdf b/tests/assets/coca-cola-business-and-sustainability-report-2022.pdf
diff --git a/tests/module/importPdfText.spec.js b/tests/module/importPdfText.spec.js
@@ -169,17 +169,44 @@ describe('Check superscripts are detected in PDF imports.', function () {
   });
 }).timeout(120000);
 
-// Note that these font sizes will not match the scribeocr.com interface, as `calcSuppFontInfo` is enabled in the interface but not the tests,
-// and this setting scales the font sizes reported by the PDF parser.
 describe('Check font size is correctly parsed in PDF imports.', function () {
   this.timeout(10000);
-  // This word was problematic at one point due to the change in font size between the first and second word.
+  // Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option.
   it('Should correctly parse font sizes (1st doc)', async () => {
     await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
+    // This word was problematic at one point due to the change in font size between the first and second word.
     assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 32.5);
     assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
   }).timeout(10000);
 
+  it('Should correctly parse font sizes and scale using calcSuppFontInfo option (1st doc)', async () => {
+    scribe.opt.calcSuppFontInfo = true;
+    await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
+    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].size, 39);
+    assert.strictEqual(scribe.data.ocr.active[0].lines[249].words[1].text, 'Agent');
+  }).timeout(10000);
+
+  scribe.opt.calcSuppFontInfo = false;
+
+  after(async () => {
+    await scribe.terminate();
+  });
+}).timeout(120000);
+
+describe('Check that text-native PDFs with broken encoding dictionaries are detected and skipped.', function () {
+  this.timeout(10000);
+  // Note: the version which uses `calcSuppFontInfo` corresponds to the scribeocr.com interface, which enables this option.
+  it('Should correctly parse font sizes (1st doc)', async () => {
+    // Set `calcSuppFontInfo` to `true` as this option previously crashed the program with this type of PDF.
+    scribe.opt.calcSuppFontInfo = true;
+
+    await scribe.importFiles([`${ASSETS_PATH_KARMA}/coca-cola-business-and-sustainability-report-2022.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
+
+    assert.strictEqual(scribe.data.ocr.active.length, 0);
+  }).timeout(10000);
+
+  scribe.opt.calcSuppFontInfo = false;
+
   after(async () => {
     await scribe.terminate();
   });