Skip to content

Commit

Permalink
Updates to fix Vanilla Tesseract build
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 28, 2024
1 parent 0ab1db7 commit 53813ef
Show file tree
Hide file tree
Showing 7 changed files with 1,084 additions and 1,170 deletions.
48 changes: 29 additions & 19 deletions js/worker/generalWorker.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,35 +19,37 @@ import { optimizeFont } from './optimizeFontModule.js';

const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');

// TODO: Add back support for multiple PSM modes.
// There is already an advanced option in the UI that claims to switch this, but it currently does nothing.
// tessedit_pageseg_mode: Tesseract.PSM["SINGLE_COLUMN"],

/**
 * Tesseract parameters applied through `setParameters` when vanilla mode is active
 * (selected in place of `defaultConfigs` wherever `vanillaMode_` is truthy).
 * Unlike `defaultConfigs`, this object sets no character blacklist and no `textord_*` overrides.
 * All values other than the page segmentation mode are passed as strings.
 */
const defaultConfigsVanilla = {
// Page segmentation mode; `AUTO` is taken from the `PSM` enum exposed by the imported Tesseract module.
tessedit_pageseg_mode: Tesseract.PSM.AUTO,
// Presumably enables character-level boxes in the hOCR output — confirm against the Tesseract parameter list.
hocr_char_boxes: '1',
// NOTE(review): appears to be the page-gradient threshold above which Tesseract skips recognition — confirm.
max_page_gradient_recognize: '100',
// Presumably adds font information to the hOCR output — confirm against the Tesseract parameter list.
hocr_font_info: '1',
};

const defaultConfigs = {
// TODO: Add back support for multiple PSM modes.
// There is already an advanced option in the UI that claims to switch this, but it currently does nothing.
// tessedit_pageseg_mode: Tesseract.PSM["SINGLE_COLUMN"],
tessedit_pageseg_mode: Tesseract.PSM.AUTO,
hocr_char_boxes: '1',
// TODO: Rework how config options interact with options set by the user.
// Specifically, no non-default options (that impact recognition) should be set when the engine is "vanilla",
// and the filters for non-English characters should only be set for the English language.
// The Tesseract LSTM engine frequently identifies a bar character "|"
// This is virtually always a false positive (usually "I").
// tessedit_char_blacklist: '|éï',
tessedit_char_blacklist: '|',
max_page_gradient_recognize: '100',
hocr_font_info: '1',

// This is virtually always a false positive (usually "I").
tessedit_char_blacklist: '|',
// This option disables an undesirable behavior where Tesseract categorizes blobs *of any size* as noise,
// simply because they are too rectangular. This option should always be enabled outside of debugging purposes.
textord_noise_area_ratio: '1',
// textord_noise_area_ratio: '1',
// Table detection appears to interfere with the layout analysis of some documents with multi-column layouts,
// causing columns to be combined into a single line. This should be investigated in more detail,
// but disabling as it does not seem to improve results even when the input document is a table.
textord_tabfind_find_tables: '0',
// classify_enable_learning: '0',
// classify_enable_adaptive_matcher: '0',
// tessedit_enable_doc_dict: '0',
// chop_enable: '0'
};

const initConfigs = {
const defaultInitConfigsVanilla = {};

const defaultInitConfigs = {
// load_system_dawg: '0',
load_freq_dawg: '0',
// load_unambig_dawg: '0',
Expand Down Expand Up @@ -114,6 +116,8 @@ const reinitialize = async ({ langs, oem, vanillaMode }) => {
if (changeOEM) oemCurrent = oem;
if (changeVanilla) vanillaMode_ = vanillaMode;

const initConfigs = vanillaMode_ ? defaultInitConfigsVanilla : defaultInitConfigs;

// The worker only needs to be created from scratch if the build of Tesseract being used changes,
// or if it was never created in the first place.
if (changeVanilla || !worker) {
Expand All @@ -129,7 +133,9 @@ const reinitialize = async ({ langs, oem, vanillaMode }) => {
await worker.reinitialize(langArrCurrent, oemCurrent, initConfigs);
}

await worker.setParameters(defaultConfigs);
const config = vanillaMode_ ? defaultConfigsVanilla : defaultConfigs;

await worker.setParameters(config);
};

/**
Expand All @@ -153,6 +159,8 @@ const reinitialize2 = async ({ langs, vanillaMode }) => {
if (changeLang) langArrCurrent = langArr;
if (changeVanilla) vanillaMode_ = vanillaMode;

const initConfigs = vanillaMode_ ? defaultInitConfigsVanilla : defaultInitConfigs;

// The worker only needs to be created from scratch if the build of Tesseract being used changes,
// or if it was never created in the first place.
if (changeVanilla || !workerLegacy || !workerLSTM) {
Expand Down Expand Up @@ -180,8 +188,10 @@ const reinitialize2 = async ({ langs, vanillaMode }) => {
await workerLSTM.reinitialize(langArrCurrent, 1, initConfigs);
}

await workerLegacy.setParameters(defaultConfigs);
await workerLSTM.setParameters(defaultConfigs);
const config = vanillaMode_ ? defaultConfigsVanilla : defaultConfigs;

await workerLegacy.setParameters(config);
await workerLSTM.setParameters(config);
};

/**
Expand Down
16 changes: 8 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
"canvas": "^2.11.2",
"commander": "^11.1.0",
"puppeteer": "^22.13.0",
"@scribe.js/tesseract.js": "^5.0.6",
"@scribe.js/tesseract.js": "^5.0.7",
"web-worker": "~1.2.0"
}
}
548 changes: 262 additions & 286 deletions tess/core_vanilla/tesseract-core-lstm.wasm.js

Large diffs are not rendered by default.

548 changes: 262 additions & 286 deletions tess/core_vanilla/tesseract-core-simd-lstm.wasm.js

Large diffs are not rendered by default.

546 changes: 261 additions & 285 deletions tess/core_vanilla/tesseract-core-simd.wasm.js

Large diffs are not rendered by default.

546 changes: 261 additions & 285 deletions tess/core_vanilla/tesseract-core.wasm.js

Large diffs are not rendered by default.

0 comments on commit 53813ef

Please sign in to comment.