-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fixed support with hocr from Tesseract API/Tesseract.js; added test
- Loading branch information
Showing
6 changed files
with
384 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" | ||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
<head> | ||
<title></title> | ||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/> | ||
<meta name='ocr-system' content='tesseract 5.1.0-471-gbc490' /> | ||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/> | ||
</head> | ||
<body> | ||
<div class='ocr_page' id='page_1' title='image "_img/bill.png"; bbox 0 0 957 307; ppageno 0; scan_res 96 96'> | ||
<div class='ocr_carea' id='block_1_1' title="bbox 32 18 324 50"> | ||
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 32 18 324 50"> | ||
<span class='ocr_line' id='line_1_1' title="bbox 32 18 174 31; baseline 0 -1; x_size 16; x_descenders 4; x_ascenders 4"> | ||
<span class='ocrx_word' id='word_1_1' title='bbox 32 18 80 30; x_wconf 93'>FIRST</span> | ||
<span class='ocrx_word' id='word_1_2' title='bbox 85 18 174 31; x_wconf 91'>CHEQUING</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_2' title="bbox 32 35 324 50; baseline 0 -3; x_size 16; x_descenders 4; x_ascenders 4"> | ||
<span class='ocrx_word' id='word_1_3' title='bbox 32 35 66 47; x_wconf 96'>Line</span> | ||
<span class='ocrx_word' id='word_1_4' title='bbox 71 35 87 47; x_wconf 96'>of</span> | ||
<span class='ocrx_word' id='word_1_5' title='bbox 91 35 139 47; x_wconf 96'>Credit</span> | ||
<span class='ocrx_word' id='word_1_6' title='bbox 145 35 227 50; x_wconf 96'>100,000.00</span> | ||
<span class='ocrx_word' id='word_1_7' title='bbox 233 35 268 47; x_wconf 97'>Rate</span> | ||
<span class='ocrx_word' id='word_1_8' title='bbox 273 35 324 47; x_wconf 96'>4.2000</span> | ||
</span> | ||
</p> | ||
</div> | ||
<div class='ocr_carea' id='block_1_2' title="bbox 32 63 905 91"> | ||
<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 32 63 905 91"> | ||
<span class='ocr_caption' id='line_1_3' title="bbox 32 63 905 91; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_9' title='bbox 32 63 68 91; x_wconf 94'>Date</span> | ||
<span class='ocrx_word' id='word_1_10' title='bbox 119 63 204 91; x_wconf 96'>Description</span> | ||
<span class='ocrx_word' id='word_1_11' title='bbox 549 63 606 91; x_wconf 90'>Number</span> | ||
<span class='ocrx_word' id='word_1_12' title='bbox 656 63 705 91; x_wconf 96'>Debits</span> | ||
<span class='ocrx_word' id='word_1_13' title='bbox 750 63 805 91; x_wconf 94'>Credits</span> | ||
<span class='ocrx_word' id='word_1_14' title='bbox 848 71 905 82; x_wconf 95'>Balance</span> | ||
</span> | ||
</p> | ||
</div> | ||
<div class='ocr_photo' id='block_1_3' title="bbox 27 94 939 104"></div> | ||
<div class='ocr_carea' id='block_1_4' title="bbox 32 114 932 290"> | ||
<p class='ocr_par' id='par_1_3' lang='eng' title="bbox 32 114 932 290"> | ||
<span class='ocr_line' id='line_1_4' title="bbox 32 114 917 142; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_15' title='bbox 32 122 108 136; x_wconf 75'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_16' title='bbox 122 122 177 136; x_wconf 53'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_17' title='bbox 182 122 235 136; x_wconf 95'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_18' title='bbox 568 122 602 133; x_wconf 95'>4987</span> | ||
<span class='ocrx_word' id='word_1_19' title='bbox 662 114 703 142; x_wconf 96'>36.07</span> | ||
<span class='ocrx_word' id='word_1_20' title='bbox 836 122 902 135; x_wconf 80'>99,914.15</span> | ||
<span class='ocrx_word' id='word_1_21' title='bbox 912 128 917 130; x_wconf 57'>-</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_5' title="bbox 32 131 917 159; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_22' title='bbox 32 139 108 153; x_wconf 75'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_23' title='bbox 122 139 177 153; x_wconf 54'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_24' title='bbox 182 139 235 153; x_wconf 94'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_25' title='bbox 568 139 602 150; x_wconf 95'>4986</span> | ||
<span class='ocrx_word' id='word_1_26' title='bbox 662 131 703 159; x_wconf 95'>60.93</span> | ||
<span class='ocrx_word' id='word_1_27' title='bbox 836 139 902 152; x_wconf 89'>99,975.08</span> | ||
<span class='ocrx_word' id='word_1_28' title='bbox 912 145 917 147; x_wconf 82'>-</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_6' title="bbox 32 148 932 176; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_29' title='bbox 32 156 108 170; x_wconf 79'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_30' title='bbox 122 156 177 170; x_wconf 52'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_31' title='bbox 182 156 235 170; x_wconf 95'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_32' title='bbox 568 156 602 167; x_wconf 95'>4982</span> | ||
<span class='ocrx_word' id='word_1_33' title='bbox 653 148 707 176; x_wconf 95'>800.04</span> | ||
<span class='ocrx_word' id='word_1_34' title='bbox 828 156 902 169; x_wconf 93'>100,775.12</span> | ||
<span class='ocrx_word' id='word_1_35' title='bbox 913 156 932 167; x_wconf 91'>EX</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_7' title="bbox 32 166 932 194; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_36' title='bbox 32 174 108 188; x_wconf 78'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_37' title='bbox 122 174 177 188; x_wconf 63'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_38' title='bbox 182 174 235 188; x_wconf 95'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_39' title='bbox 568 174 600 185; x_wconf 96'>4981</span> | ||
<span class='ocrx_word' id='word_1_40' title='bbox 653 166 703 194; x_wconf 92'>823.34</span> | ||
<span class='ocrx_word' id='word_1_41' title='bbox 828 174 902 187; x_wconf 94'>101,598.46</span> | ||
<span class='ocrx_word' id='word_1_42' title='bbox 913 174 932 185; x_wconf 94'>EX</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_8' title="bbox 32 191 932 205; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_43' title='bbox 32 191 108 205; x_wconf 90'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_44' title='bbox 123 191 182 205; x_wconf 50'>Incoming</span> | ||
<span class='ocrx_word' id='word_1_45' title='bbox 188 183 231 211; x_wconf 93'>Interac</span> | ||
<span class='ocrx_word' id='word_1_46' title='bbox 238 183 307 211; x_wconf 92'>e-Transfer</span> | ||
<span class='ocrx_word' id='word_1_47' title='bbox 767 191 803 202; x_wconf 75'>14.54</span> | ||
<span class='ocrx_word' id='word_1_48' title='bbox 828 191 902 204; x_wconf 56'>101,583.92</span> | ||
<span class='ocrx_word' id='word_1_49' title='bbox 913 191 932 202; x_wconf 96'>EX</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_9' title="bbox 32 208 932 222; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_50' title='bbox 32 208 108 222; x_wconf 91'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_51' title='bbox 123 208 182 222; x_wconf 56'>Incoming</span> | ||
<span class='ocrx_word' id='word_1_52' title='bbox 188 200 231 228; x_wconf 93'>Interac</span> | ||
<span class='ocrx_word' id='word_1_53' title='bbox 238 200 307 228; x_wconf 93'>e-Transfer</span> | ||
<span class='ocrx_word' id='word_1_54' title='bbox 757 208 803 219; x_wconf 87'>400.00</span> | ||
<span class='ocrx_word' id='word_1_55' title='bbox 828 208 902 221; x_wconf 74'>101,183.92</span> | ||
<span class='ocrx_word' id='word_1_56' title='bbox 913 208 932 219; x_wconf 96'>EX</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_10' title="bbox 32 225 917 239; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_57' title='bbox 32 225 108 239; x_wconf 91'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_58' title='bbox 122 225 178 236; x_wconf 50'>Assisted</span> | ||
<span class='ocrx_word' id='word_1_59' title='bbox 183 225 234 239; x_wconf 93'>Deposit</span> | ||
<span class='ocrx_word' id='word_1_60' title='bbox 736 225 802 238; x_wconf 62'>3241450</span> | ||
<span class='ocrx_word' id='word_1_61' title='bbox 836 225 902 238; x_wconf 44'>68,769.42</span> | ||
<span class='ocrx_word' id='word_1_62' title='bbox 912 231 917 233; x_wconf 87'>-</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_11' title="bbox 32 234 917 262; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_63' title='bbox 32 242 108 256; x_wconf 87'>01Aug2018</span> | ||
<span class='ocrx_word' id='word_1_64' title='bbox 122 242 178 253; x_wconf 84'>Transfer</span> | ||
<span class='ocrx_word' id='word_1_65' title='bbox 182 243 203 253; x_wconf 92'>out</span> | ||
<span class='ocrx_word' id='word_1_66' title='bbox 210 234 222 262; x_wconf 95'>to</span> | ||
<span class='ocrx_word' id='word_1_67' title='bbox 228 242 268 253; x_wconf 95'>loan</span> | ||
<span class='ocrx_word' id='word_1_68' title='bbox 263 234 271 262; x_wconf 95'>7</span> | ||
<span class='ocrx_word' id='word_1_69' title='bbox 645 234 702 262; x_wconf 95'>1,500.00</span> | ||
<span class='ocrx_word' id='word_1_70' title='bbox 836 242 902 255; x_wconf 86'>70,269.42</span> | ||
<span class='ocrx_word' id='word_1_71' title='bbox 912 248 917 250; x_wconf 74'>-</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_12' title="bbox 32 251 917 279; baseline 0 -9; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_72' title='bbox 32 259 108 273; x_wconf 76'>02Aug2018</span> | ||
<span class='ocrx_word' id='word_1_73' title='bbox 122 259 177 273; x_wconf 66'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_74' title='bbox 182 259 235 273; x_wconf 94'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_75' title='bbox 568 251 602 279; x_wconf 88'>4984</span> | ||
<span class='ocrx_word' id='word_1_76' title='bbox 661 251 703 279; x_wconf 96'>48.08</span> | ||
<span class='ocrx_word' id='word_1_77' title='bbox 836 259 902 272; x_wconf 62'>70,317.50</span> | ||
<span class='ocrx_word' id='word_1_78' title='bbox 912 265 917 267; x_wconf 81'>-</span> | ||
</span> | ||
<span class='ocr_line' id='line_1_13' title="bbox 32 276 917 290; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_79' title='bbox 32 276 108 290; x_wconf 90'>02Aug2018</span> | ||
<span class='ocrx_word' id='word_1_80' title='bbox 122 276 177 290; x_wconf 61'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_81' title='bbox 182 276 235 290; x_wconf 96'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_82' title='bbox 568 276 602 287; x_wconf 96'>4985</span> | ||
<span class='ocrx_word' id='word_1_83' title='bbox 666 276 701 287; x_wconf 80'>7051</span> | ||
<span class='ocrx_word' id='word_1_84' title='bbox 836 276 900 289; x_wconf 91'>70,388.01</span> | ||
<span class='ocrx_word' id='word_1_85' title='bbox 912 282 917 284; x_wconf 87'>-</span> | ||
</span> | ||
</p> | ||
</div> | ||
<div class='ocr_carea' id='block_1_5' title="bbox 32 285 900 307"> | ||
<p class='ocr_par' id='par_1_4' lang='eng' title="bbox 32 285 900 307"> | ||
<span class='ocr_line' id='line_1_14' title="bbox 32 285 900 307; baseline 0 -3; x_size 20; x_descenders 5; x_ascenders 5"> | ||
<span class='ocrx_word' id='word_1_86' title='bbox 32 293 108 307; x_wconf 90'>02Aug2018</span> | ||
<span class='ocrx_word' id='word_1_87' title='bbox 122 293 177 307; x_wconf 81'>Clearing</span> | ||
<span class='ocrx_word' id='word_1_88' title='bbox 182 293 235 307; x_wconf 83'>Cheque</span> | ||
<span class='ocrx_word' id='word_1_89' title='bbox 568 293 602 304; x_wconf 92'>4992</span> | ||
<span class='ocrx_word' id='word_1_90' title='bbox 653 285 703 307; x_wconf 94'>500.00</span> | ||
<span class='ocrx_word' id='word_1_91' title='bbox 836 293 900 306; x_wconf 54'>70,888.01</span> | ||
</span> | ||
</p> | ||
</div> | ||
</div> | ||
</body> | ||
</html> |
Oops, something went wrong.