Skip to content

Commit

Permalink
Updated comparison rule
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 3, 2024
1 parent a5fea4f commit cdc7116
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 6 deletions.
32 changes: 29 additions & 3 deletions js/worker/compareOCRModule.js
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,7 @@ export async function compareOCRPageImp({

// The LSTM model is known to be more accurate on average.
// Therefore, if both metrics are terrible (indicating the word isn't lined up at all), the LSTM word is used.
if (hocrBError < hocrAError || (legacyLSTMComb && hocrAError > 0.7)) {
if (hocrBError < hocrAError || (legacyLSTMComb && hocrAError > 0.5)) {
const skip = ['eg', 'ie'].includes(wordA.text.replace(/\W/g, ''));

if (!skip) {
Expand Down Expand Up @@ -899,11 +899,37 @@ export async function compareOCRPageImp({
// Note: These metrics leave the door open to some fringe edge cases.
// For example,

const hocrBAll = {};
ocr.getPageWords(pageB).forEach((x) => {
hocrBAll[x.id] = 1;
});

const hocrAAll = {};
ocr.getPageWords(pageAInt).forEach((x) => {
hocrAAll[x.id] = 1;
});

// Delete any punctuation-only words from the stats if they are being ignored.
if (ignorePunct) {
const punctOnlyIDsA = ocr.getPageWords(pageA).filter((x) => !x.text.replace(/[\W_]/g, '')).map((x) => x.id);
punctOnlyIDsA.forEach((x) => {
delete hocrAAll[x];
delete hocrAOverlap[x];
delete hocrACorrect[x];
});
const punctOnlyIDsB = ocr.getPageWords(pageB).filter((x) => !x.text.replace(/[\W_]/g, '')).map((x) => x.id);
punctOnlyIDsB.forEach((x) => {
delete hocrBAll[x];
delete hocrBOverlap[x];
delete hocrBCorrect[x];
});
}

// Number of words in ground truth
const totalCountB = ocr.getPageWords(pageB).length;
const totalCountB = Object.keys(hocrBAll).length;

// Number of words in candidate OCR
const totalCountA = ocr.getPageWords(pageAInt).length;
const totalCountA = Object.keys(hocrAAll).length;

// Number of words in ground truth with any overlap with candidate OCR
const overlapCountB = Object.keys(hocrBOverlap).length;
Expand Down
6 changes: 3 additions & 3 deletions tests/cli/cli.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ describe('Check Node.js commands.', () => {
// originalConsoleLog(consoleOutput);

// Assert that the captured console output includes the expected summary text
expect(consoleOutput).to.include('388 of 404');
expect(consoleOutput).to.include('387 of 404');
}).timeout(30000);

it('Overlay .pdf and Abbyy .xml file.', async () => {
Expand Down Expand Up @@ -113,9 +113,9 @@ describe('Check Node.js commands.', () => {
// Call the function
await overlayCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), tmpDir, { robust: true, conf: true, vis: true });

if (!/388 of 404/.test(consoleOutput)) originalConsoleLog(consoleOutput);
if (!/387 of 404/.test(consoleOutput)) originalConsoleLog(consoleOutput);

expect(consoleOutput).to.include('388 of 404');
expect(consoleOutput).to.include('387 of 404');

const outputPath = `${tmpDir}/scribe_test_pdf1_vis.pdf`;

Expand Down

0 comments on commit cdc7116

Please sign in to comment.