Skip to content

Commit

Permalink
Minor updates
Browse files: browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 9, 2024
1 parent 4b2b43e commit 5846586
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 19 deletions.
2 changes: 1 addition & 1 deletion js/fontSupp.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ export const calcSuppFontInfo = async (ocrArr) => {
}
if (fontSizeMultArr.length >= 3) {
const fontSizeMult = quantile(fontSizeMultArr, 0.5);
FontProps.sizeMult[key] = fontSizeMult;
if (fontSizeMult && fontSizeMult > 0.9 && fontSizeMult < 1.5) FontProps.sizeMult[key] = fontSizeMult;
}

if (serifVotes > sansVotes) {
Expand Down
7 changes: 1 addition & 6 deletions js/import/convertPageAbbyy.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import ocr from '../objects/ocrObjects.js';
import {
ascCharArr,
calcBboxUnion,
determineSansSerif,
mean50,
quantile,
round6, unescapeXml,
Expand Down Expand Up @@ -72,8 +71,6 @@ export async function convertPageAbbyy({ ocrStr, n }) {
const xmlLineFormatting = xmlLinePreChar?.match(/<formatting[^>]+/)?.[0];
const fontName = xmlLineFormatting?.match(/ff=['"]([^'"]*)/)?.[1];

const fontFamily = determineSansSerif(fontName);

let dropCap = false;
const dropCapMatch = xmlLine.match(abbyyDropCapRegex);
if (dropCapMatch != null && parseInt(dropCapMatch[1]) > 0) {
Expand Down Expand Up @@ -315,9 +312,7 @@ export async function convertPageAbbyy({ ocrStr, n }) {

wordObj.smallCaps = smallCapsArr[i];

if (fontFamily !== 'Default') {
wordObj.font = fontFamily;
}
if (fontName) wordObj.font = fontName;

if (styleArr[i] === 'sup') {
wordObj.sup = true;
Expand Down
18 changes: 7 additions & 11 deletions js/import/convertPageHocr.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import ocr from '../objects/ocrObjects.js';

import {
determineSansSerif, getTextScript,
getTextScript,
unescapeXml,
} from '../utils/miscUtils.js';

Expand Down Expand Up @@ -166,9 +166,8 @@ export async function convertPageHocr({

const wordLangRaw = match.match(/lang=['"]([^'"]*)['"]/i)?.[1];

const fontName = match.match(/^[^>]+?x_font\s*([\w-]+)/)?.[1];

const fontFamily = determineSansSerif(fontName);
let fontName = match.match(/^[^>]+?x_font\s*([^'";]+)/)?.[1];
if (fontName) fontName = fontName.trim();

const it = match.matchAll(charRegex);
const letterArr = [...it];
Expand Down Expand Up @@ -246,7 +245,7 @@ export async function convertPageHocr({
if (debugMode) wordObj.raw = match;

if (italic) wordObj.style = 'italic';
if (fontFamily !== 'Default') wordObj.font = fontFamily;
if (fontName) wordObj.font = fontName;

wordObj.conf = wordConf;

Expand Down Expand Up @@ -297,9 +296,8 @@ export async function convertPageHocr({
bottom: wordBox1[3],
};

const fontName = match.match(/^[^>]+?x_font\s*([\w-]+)/)?.[1];

const fontFamily = determineSansSerif(fontName);
let fontName = match.match(/^[^>]+?x_font\s*([^'";]+)/)?.[1];
if (fontName) fontName = fontName.trim();

const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];

Expand Down Expand Up @@ -333,9 +331,7 @@ export async function convertPageHocr({
}

wordObj.style = fontStyle;
if (fontFamily !== 'Default') {
wordObj.font = fontFamily;
}
if (fontName) wordObj.font = fontName;

wordObj.sup = wordSup;

Expand Down
2 changes: 1 addition & 1 deletion js/import/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ export async function importFiles(files, options = {}) {
await extractInternalPDFText({
setActive: true, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
});
if (opt.calcSuppFontInfo) calcSuppFontInfo(ocrAll.pdf);
if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
}
}

Expand Down

0 comments on commit 5846586

Please sign in to comment.