Skip to content

Commit

Permalink
Fixed support for HOCR from the Tesseract API/Tesseract.js; added test
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Aug 21, 2024
1 parent 5bd401a commit 47dac54
Show file tree
Hide file tree
Showing 6 changed files with 384 additions and 38 deletions.
24 changes: 9 additions & 15 deletions js/import/importOCR.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) {
// In the case of 1 HOCR file
const singleHOCRMode = ocrFilesAll.length === 1;

let hocrStrStart = '';
let hocrStrEnd = '';
let hocrStrStart = null;
let abbyyMode = false;
let stextMode = false;
let scribeMode = false;

let hocrArrPages;
let pageCountHOCR;
let hocrRaw;
/** @type {?Object.<string, FontMetricsFamily>} */
Expand All @@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) {
stextMode = !!node2 && !!/<document name/.test(node2);

if (abbyyMode) {
hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
} else if (stextMode) {
hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
} else {
hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)[0];
hocrStrEnd = hocrStrAll.match(/<\/body>[\s\S]*$/)[0];
hocrArrPages = splitHOCRStr(hocrStrAll);
// `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API.
hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)?.[0];
hocrRaw = splitHOCRStr(hocrStrAll);
}

pageCountHOCR = hocrArrPages.length;
hocrRaw = Array(pageCountHOCR);
for (let i = 0; i < pageCountHOCR; i++) {
hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd;
}
pageCountHOCR = hocrRaw.length;
} else {
pageCountHOCR = ocrFilesAll.length;
hocrRaw = Array(pageCountHOCR);
Expand All @@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) {
}
}

if (!abbyyMode && !stextMode && hocrRaw[0]) {
if (!abbyyMode && !stextMode && hocrStrStart) {
const getMeta = (name) => {
const regex = new RegExp(`<meta name=["']${name}["'][^<]+`, 'i');

const nodeStr = hocrRaw[0].match(regex)?.[0];
const nodeStr = hocrStrStart.match(regex)?.[0];
if (!nodeStr) return null;
const contentStr = nodeStr.match(/content=["']([\s\S]+?)(?=["']\s{0,5}\/?>)/i)?.[1];
if (!contentStr) return null;
Expand Down
151 changes: 151 additions & 0 deletions tests/assets/bill.hocr
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='tesseract 5.1.0-471-gbc490' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "_img/bill.png"; bbox 0 0 957 307; ppageno 0; scan_res 96 96'>
<div class='ocr_carea' id='block_1_1' title="bbox 32 18 324 50">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 32 18 324 50">
<span class='ocr_line' id='line_1_1' title="bbox 32 18 174 31; baseline 0 -1; x_size 16; x_descenders 4; x_ascenders 4">
<span class='ocrx_word' id='word_1_1' title='bbox 32 18 80 30; x_wconf 93'>FIRST</span>
<span class='ocrx_word' id='word_1_2' title='bbox 85 18 174 31; x_wconf 91'>CHEQUING</span>
</span>
<span class='ocr_line' id='line_1_2' title="bbox 32 35 324 50; baseline 0 -3; x_size 16; x_descenders 4; x_ascenders 4">
<span class='ocrx_word' id='word_1_3' title='bbox 32 35 66 47; x_wconf 96'>Line</span>
<span class='ocrx_word' id='word_1_4' title='bbox 71 35 87 47; x_wconf 96'>of</span>
<span class='ocrx_word' id='word_1_5' title='bbox 91 35 139 47; x_wconf 96'>Credit</span>
<span class='ocrx_word' id='word_1_6' title='bbox 145 35 227 50; x_wconf 96'>100,000.00</span>
<span class='ocrx_word' id='word_1_7' title='bbox 233 35 268 47; x_wconf 97'>Rate</span>
<span class='ocrx_word' id='word_1_8' title='bbox 273 35 324 47; x_wconf 96'>4.2000</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 32 63 905 91">
<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 32 63 905 91">
<span class='ocr_caption' id='line_1_3' title="bbox 32 63 905 91; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_9' title='bbox 32 63 68 91; x_wconf 94'>Date</span>
<span class='ocrx_word' id='word_1_10' title='bbox 119 63 204 91; x_wconf 96'>Description</span>
<span class='ocrx_word' id='word_1_11' title='bbox 549 63 606 91; x_wconf 90'>Number</span>
<span class='ocrx_word' id='word_1_12' title='bbox 656 63 705 91; x_wconf 96'>Debits</span>
<span class='ocrx_word' id='word_1_13' title='bbox 750 63 805 91; x_wconf 94'>Credits</span>
<span class='ocrx_word' id='word_1_14' title='bbox 848 71 905 82; x_wconf 95'>Balance</span>
</span>
</p>
</div>
<div class='ocr_photo' id='block_1_3' title="bbox 27 94 939 104"></div>
<div class='ocr_carea' id='block_1_4' title="bbox 32 114 932 290">
<p class='ocr_par' id='par_1_3' lang='eng' title="bbox 32 114 932 290">
<span class='ocr_line' id='line_1_4' title="bbox 32 114 917 142; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_15' title='bbox 32 122 108 136; x_wconf 75'>01Aug2018</span>
<span class='ocrx_word' id='word_1_16' title='bbox 122 122 177 136; x_wconf 53'>Clearing</span>
<span class='ocrx_word' id='word_1_17' title='bbox 182 122 235 136; x_wconf 95'>Cheque</span>
<span class='ocrx_word' id='word_1_18' title='bbox 568 122 602 133; x_wconf 95'>4987</span>
<span class='ocrx_word' id='word_1_19' title='bbox 662 114 703 142; x_wconf 96'>36.07</span>
<span class='ocrx_word' id='word_1_20' title='bbox 836 122 902 135; x_wconf 80'>99,914.15</span>
<span class='ocrx_word' id='word_1_21' title='bbox 912 128 917 130; x_wconf 57'>-</span>
</span>
<span class='ocr_line' id='line_1_5' title="bbox 32 131 917 159; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_22' title='bbox 32 139 108 153; x_wconf 75'>01Aug2018</span>
<span class='ocrx_word' id='word_1_23' title='bbox 122 139 177 153; x_wconf 54'>Clearing</span>
<span class='ocrx_word' id='word_1_24' title='bbox 182 139 235 153; x_wconf 94'>Cheque</span>
<span class='ocrx_word' id='word_1_25' title='bbox 568 139 602 150; x_wconf 95'>4986</span>
<span class='ocrx_word' id='word_1_26' title='bbox 662 131 703 159; x_wconf 95'>60.93</span>
<span class='ocrx_word' id='word_1_27' title='bbox 836 139 902 152; x_wconf 89'>99,975.08</span>
<span class='ocrx_word' id='word_1_28' title='bbox 912 145 917 147; x_wconf 82'>-</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 32 148 932 176; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_29' title='bbox 32 156 108 170; x_wconf 79'>01Aug2018</span>
<span class='ocrx_word' id='word_1_30' title='bbox 122 156 177 170; x_wconf 52'>Clearing</span>
<span class='ocrx_word' id='word_1_31' title='bbox 182 156 235 170; x_wconf 95'>Cheque</span>
<span class='ocrx_word' id='word_1_32' title='bbox 568 156 602 167; x_wconf 95'>4982</span>
<span class='ocrx_word' id='word_1_33' title='bbox 653 148 707 176; x_wconf 95'>800.04</span>
<span class='ocrx_word' id='word_1_34' title='bbox 828 156 902 169; x_wconf 93'>100,775.12</span>
<span class='ocrx_word' id='word_1_35' title='bbox 913 156 932 167; x_wconf 91'>EX</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 32 166 932 194; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_36' title='bbox 32 174 108 188; x_wconf 78'>01Aug2018</span>
<span class='ocrx_word' id='word_1_37' title='bbox 122 174 177 188; x_wconf 63'>Clearing</span>
<span class='ocrx_word' id='word_1_38' title='bbox 182 174 235 188; x_wconf 95'>Cheque</span>
<span class='ocrx_word' id='word_1_39' title='bbox 568 174 600 185; x_wconf 96'>4981</span>
<span class='ocrx_word' id='word_1_40' title='bbox 653 166 703 194; x_wconf 92'>823.34</span>
<span class='ocrx_word' id='word_1_41' title='bbox 828 174 902 187; x_wconf 94'>101,598.46</span>
<span class='ocrx_word' id='word_1_42' title='bbox 913 174 932 185; x_wconf 94'>EX</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 32 191 932 205; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_43' title='bbox 32 191 108 205; x_wconf 90'>01Aug2018</span>
<span class='ocrx_word' id='word_1_44' title='bbox 123 191 182 205; x_wconf 50'>Incoming</span>
<span class='ocrx_word' id='word_1_45' title='bbox 188 183 231 211; x_wconf 93'>Interac</span>
<span class='ocrx_word' id='word_1_46' title='bbox 238 183 307 211; x_wconf 92'>e-Transfer</span>
<span class='ocrx_word' id='word_1_47' title='bbox 767 191 803 202; x_wconf 75'>14.54</span>
<span class='ocrx_word' id='word_1_48' title='bbox 828 191 902 204; x_wconf 56'>101,583.92</span>
<span class='ocrx_word' id='word_1_49' title='bbox 913 191 932 202; x_wconf 96'>EX</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 32 208 932 222; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_50' title='bbox 32 208 108 222; x_wconf 91'>01Aug2018</span>
<span class='ocrx_word' id='word_1_51' title='bbox 123 208 182 222; x_wconf 56'>Incoming</span>
<span class='ocrx_word' id='word_1_52' title='bbox 188 200 231 228; x_wconf 93'>Interac</span>
<span class='ocrx_word' id='word_1_53' title='bbox 238 200 307 228; x_wconf 93'>e-Transfer</span>
<span class='ocrx_word' id='word_1_54' title='bbox 757 208 803 219; x_wconf 87'>400.00</span>
<span class='ocrx_word' id='word_1_55' title='bbox 828 208 902 221; x_wconf 74'>101,183.92</span>
<span class='ocrx_word' id='word_1_56' title='bbox 913 208 932 219; x_wconf 96'>EX</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 32 225 917 239; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_57' title='bbox 32 225 108 239; x_wconf 91'>01Aug2018</span>
<span class='ocrx_word' id='word_1_58' title='bbox 122 225 178 236; x_wconf 50'>Assisted</span>
<span class='ocrx_word' id='word_1_59' title='bbox 183 225 234 239; x_wconf 93'>Deposit</span>
<span class='ocrx_word' id='word_1_60' title='bbox 736 225 802 238; x_wconf 62'>3241450</span>
<span class='ocrx_word' id='word_1_61' title='bbox 836 225 902 238; x_wconf 44'>68,769.42</span>
<span class='ocrx_word' id='word_1_62' title='bbox 912 231 917 233; x_wconf 87'>-</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 32 234 917 262; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_63' title='bbox 32 242 108 256; x_wconf 87'>01Aug2018</span>
<span class='ocrx_word' id='word_1_64' title='bbox 122 242 178 253; x_wconf 84'>Transfer</span>
<span class='ocrx_word' id='word_1_65' title='bbox 182 243 203 253; x_wconf 92'>out</span>
<span class='ocrx_word' id='word_1_66' title='bbox 210 234 222 262; x_wconf 95'>to</span>
<span class='ocrx_word' id='word_1_67' title='bbox 228 242 268 253; x_wconf 95'>loan</span>
<span class='ocrx_word' id='word_1_68' title='bbox 263 234 271 262; x_wconf 95'>7</span>
<span class='ocrx_word' id='word_1_69' title='bbox 645 234 702 262; x_wconf 95'>1,500.00</span>
<span class='ocrx_word' id='word_1_70' title='bbox 836 242 902 255; x_wconf 86'>70,269.42</span>
<span class='ocrx_word' id='word_1_71' title='bbox 912 248 917 250; x_wconf 74'>-</span>
</span>
<span class='ocr_line' id='line_1_12' title="bbox 32 251 917 279; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_72' title='bbox 32 259 108 273; x_wconf 76'>02Aug2018</span>
<span class='ocrx_word' id='word_1_73' title='bbox 122 259 177 273; x_wconf 66'>Clearing</span>
<span class='ocrx_word' id='word_1_74' title='bbox 182 259 235 273; x_wconf 94'>Cheque</span>
<span class='ocrx_word' id='word_1_75' title='bbox 568 251 602 279; x_wconf 88'>4984</span>
<span class='ocrx_word' id='word_1_76' title='bbox 661 251 703 279; x_wconf 96'>48.08</span>
<span class='ocrx_word' id='word_1_77' title='bbox 836 259 902 272; x_wconf 62'>70,317.50</span>
<span class='ocrx_word' id='word_1_78' title='bbox 912 265 917 267; x_wconf 81'>-</span>
</span>
<span class='ocr_line' id='line_1_13' title="bbox 32 276 917 290; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_79' title='bbox 32 276 108 290; x_wconf 90'>02Aug2018</span>
<span class='ocrx_word' id='word_1_80' title='bbox 122 276 177 290; x_wconf 61'>Clearing</span>
<span class='ocrx_word' id='word_1_81' title='bbox 182 276 235 290; x_wconf 96'>Cheque</span>
<span class='ocrx_word' id='word_1_82' title='bbox 568 276 602 287; x_wconf 96'>4985</span>
<span class='ocrx_word' id='word_1_83' title='bbox 666 276 701 287; x_wconf 80'>7051</span>
<span class='ocrx_word' id='word_1_84' title='bbox 836 276 900 289; x_wconf 91'>70,388.01</span>
<span class='ocrx_word' id='word_1_85' title='bbox 912 282 917 284; x_wconf 87'>-</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_5' title="bbox 32 285 900 307">
<p class='ocr_par' id='par_1_4' lang='eng' title="bbox 32 285 900 307">
<span class='ocr_line' id='line_1_14' title="bbox 32 285 900 307; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_86' title='bbox 32 293 108 307; x_wconf 90'>02Aug2018</span>
<span class='ocrx_word' id='word_1_87' title='bbox 122 293 177 307; x_wconf 81'>Clearing</span>
<span class='ocrx_word' id='word_1_88' title='bbox 182 293 235 307; x_wconf 83'>Cheque</span>
<span class='ocrx_word' id='word_1_89' title='bbox 568 293 602 304; x_wconf 92'>4992</span>
<span class='ocrx_word' id='word_1_90' title='bbox 653 285 703 307; x_wconf 94'>500.00</span>
<span class='ocrx_word' id='word_1_91' title='bbox 836 293 900 306; x_wconf 54'>70,888.01</span>
</span>
</p>
</div>
</div>
</body>
</html>
Loading

0 comments on commit 47dac54

Please sign in to comment.