diff --git a/js/import/importOCR.js b/js/import/importOCR.js index a3f0aac..b6292e6 100644 --- a/js/import/importOCR.js +++ b/js/import/importOCR.js @@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) { // In the case of 1 HOCR file const singleHOCRMode = ocrFilesAll.length === 1; - let hocrStrStart = ''; - let hocrStrEnd = ''; + let hocrStrStart = null; let abbyyMode = false; let stextMode = false; let scribeMode = false; - let hocrArrPages; let pageCountHOCR; let hocrRaw; /** @type {?Object.} */ @@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) { stextMode = !!node2 && !!//)[0]; - hocrStrEnd = hocrStrAll.match(/<\/body>[\s\S]*$/)[0]; - hocrArrPages = splitHOCRStr(hocrStrAll); + // `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API. + hocrStrStart = hocrStrAll.match(/[\s\S]*?/)?.[0]; + hocrRaw = splitHOCRStr(hocrStrAll); } - pageCountHOCR = hocrArrPages.length; - hocrRaw = Array(pageCountHOCR); - for (let i = 0; i < pageCountHOCR; i++) { - hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd; - } + pageCountHOCR = hocrRaw.length; } else { pageCountHOCR = ocrFilesAll.length; hocrRaw = Array(pageCountHOCR); @@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) { } } - if (!abbyyMode && !stextMode && hocrRaw[0]) { + if (!abbyyMode && !stextMode && hocrStrStart) { const getMeta = (name) => { const regex = new RegExp(`)/i)?.[1]; if (!contentStr) return null; diff --git a/tests/assets/bill.hocr b/tests/assets/bill.hocr new file mode 100644 index 0000000..81931df --- /dev/null +++ b/tests/assets/bill.hocr @@ -0,0 +1,151 @@ + + + + + + + + + + +
+
+

+ + FIRST + CHEQUING + + + Line + of + Credit + 100,000.00 + Rate + 4.2000 + +

+
+
+

+ + Date + Description + Number + Debits + Credits + Balance + +

+
+
+
+

+ + 01Aug2018 + Clearing + Cheque + 4987 + 36.07 + 99,914.15 + - + + + 01Aug2018 + Clearing + Cheque + 4986 + 60.93 + 99,975.08 + - + + + 01Aug2018 + Clearing + Cheque + 4982 + 800.04 + 100,775.12 + EX + + + 01Aug2018 + Clearing + Cheque + 4981 + 823.34 + 101,598.46 + EX + + + 01Aug2018 + Incoming + Interac + e-Transfer + 14.54 + 101,583.92 + EX + + + 01Aug2018 + Incoming + Interac + e-Transfer + 400.00 + 101,183.92 + EX + + + 01Aug2018 + Assisted + Deposit + 3241450 + 68,769.42 + - + + + 01Aug2018 + Transfer + out + to + loan + 7 + 1,500.00 + 70,269.42 + - + + + 02Aug2018 + Clearing + Cheque + 4984 + 48.08 + 70,317.50 + - + + + 02Aug2018 + Clearing + Cheque + 4985 + 7051 + 70,388.01 + - + +

+
+
+

+ + 02Aug2018 + Clearing + Cheque + 4992 + 500.00 + 70,888.01 + +

+
+
+ + diff --git a/tests/assets/bill.tesseractjs.hocr b/tests/assets/bill.tesseractjs.hocr new file mode 100644 index 0000000..8fa2506 --- /dev/null +++ b/tests/assets/bill.tesseractjs.hocr @@ -0,0 +1,139 @@ +
+
+

+ + FIRST + CHEQUING + +

+ +

+ + Line + of + Credit + 100,000.00 + Rate + 4.2000 + +

+ +

+ + Date + Description + Number + Debits + Credits + Balance + + + 31Jul2018 + Balance + Forward + 99,878.08 + - + + + 01Aug2018 + Clearing + Cheque + 4987 + 36.07 + 99,914.15 + - + + + 01Aug2018 + Clearing + Cheque + 4986 + 60.93 + 99,975.08 + - + + + 01Aug2018 + Clearing + Cheque + 4982 + 800.04 + 100,775.12 + EX + + + 01Aug2018 + Clearing + Cheque + 4981 + 823.34 + 101,598.46 + EX + + + 01Aug2018 + Incoming + Interac + e-Transfer + 1454 + 101,583.92 + EX + + + 01Aug2018 + Incoming + Interac + e-Transfer + 400.00 + 101,183.92 + EX + + + 01Aug2018 + Assisted + Deposit + 3241450 + 68,769.42 + - + + + 01Aug2018 + Transfer + out + to + loan + 7 + 1,500.00 + 70,269.42 + - + + + 02Aug2018 + Clearing + Cheque + 4984 + 48.08 + 70,317.50 + - + + + 02Aug2018 + Clearing + Cheque + 4985 + 7051 + 70,388.01 + - + + + 02Aug2018 + Clearing + Cheque + 4992 + 500.00 + 70,888.01 + - + +

+
+
diff --git a/tests/module/importAbbyy.spec.js b/tests/module/importAbbyy.spec.js new file mode 100644 index 0000000..60aabdf --- /dev/null +++ b/tests/module/importAbbyy.spec.js @@ -0,0 +1,33 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +// import mocha from '../../node_modules/mocha/mocha.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check Abbyy XML import function.', function () { + this.timeout(10000); + before(async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example_abbyy.xml`]); + }); + + it('Should correctly import smallcaps attribute', async () => { + const text1 = scribe.data.ocr.active[0].lines[4].words.map((x) => x.text).join(' '); + + const text2 = scribe.data.ocr.active[0].lines[23].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'Shubhdeep Deb'); + + assert.strictEqual(text2, 'Wage inequality in the United States has risen sharply since the 1980s. The skill'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); diff --git a/tests/module/import.spec.js b/tests/module/importTesseract.spec.js similarity index 57% rename from tests/module/import.spec.js rename to tests/module/importTesseract.spec.js index 69113d9..f8b22e6 100644 --- a/tests/module/import.spec.js +++ b/tests/module/importTesseract.spec.js @@ -11,34 +11,51 @@ config.truncateThreshold = 0; // Disable truncation for actual/expected values o /* eslint-disable prefer-arrow-callback */ /* eslint-disable func-names */ -describe('Check Tesseract import function.', function () { +describe('Check .hocr import function (basic)', function () { this.timeout(10000); before(async () => { await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example_tess.hocr`]); }); - it('Should correctly import small caps printed using font size adjustments', async () => { - const text1 = scribe.data.ocr.active[0].lines[4].words.map((x) => x.text).join(' '); + it('Should import HOCR created with Tesseract CLI', async () => { - const text2 = scribe.data.ocr.active[0].lines[23].words.map((x) => x.text).join(' '); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/bill.hocr`]); - assert.strictEqual(text1, 'Shubhdeep Deb'); + const page = scribe.data.ocr.active[0]; - assert.strictEqual(text2, 'Wage inequality in the United States has risen sharply since the 1980s. The skill'); + const text1 = page.lines[0].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'FIRST CHEQUING'); + }).timeout(10000); + + + // When using Tesseract.js or the Tesseract API to save individual pages as .hocr files, the output is different from the output of the Tesseract CLI, + // as they only include the div with the class 'ocr_page' and the text content of the page, not the entire HTML structure. + it('Should import HOCR pages created with Tesseract API/Tesseract.js', async () => { + + await scribe.importFiles([`${ASSETS_PATH_KARMA}/bill.tesseractjs.hocr`]); + + const page = scribe.data.ocr.active[0]; + + const text1 = page.lines[0].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'FIRST CHEQUING'); }).timeout(10000); + after(async () => { await scribe.terminate(); }); }).timeout(120000); -describe('Check Abbyy XML import function.', function () { + +describe('Check Tesseract .hocr import function (small caps).', function () { this.timeout(10000); before(async () => { - await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example_abbyy.xml`]); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example_tess.hocr`]); }); - it('Should correctly import smallcaps attribute', async () => { + it('Should correctly import small caps printed using font size adjustments', async () => { const text1 = scribe.data.ocr.active[0].lines[4].words.map((x) => x.text).join(' '); const text2 = scribe.data.ocr.active[0].lines[23].words.map((x) => x.text).join(' '); @@ -52,17 +69,3 @@ describe('Check Abbyy XML import function.', function () { await scribe.terminate(); }); }).timeout(120000); - -describe('Check cleanup functions allow for resetting module.', function () { - this.timeout(10000); - it('Check that cleanup functions work properly', async () => { - await scribe.importFiles([`${ASSETS_PATH_KARMA}/chi_eng_mixed_sample.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); - await scribe.terminate(); - await scribe.init(); - await scribe.importFiles([`${ASSETS_PATH_KARMA}/chi_eng_mixed_sample.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); - }).timeout(10000); - - after(async () => { - await scribe.terminate(); - }); -}).timeout(120000); diff --git a/tests/module/misc.spec.js b/tests/module/misc.spec.js new file mode 100644 index 0000000..ba68d3d --- /dev/null +++ b/tests/module/misc.spec.js @@ -0,0 +1,26 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +// import mocha from '../../node_modules/mocha/mocha.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check cleanup functions allow for resetting module.', function () { + this.timeout(10000); + it('Check that cleanup functions work properly', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/chi_eng_mixed_sample.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + await scribe.terminate(); + await scribe.init(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/chi_eng_mixed_sample.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true }); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000);