From 8b790b4d0e129649f923944362c909188547dcab Mon Sep 17 00:00:00 2001 From: Balearica Date: Fri, 20 Sep 2024 22:42:16 -0700 Subject: [PATCH] Updated render functions; added getNextLine helper function --- js/export/export.js | 6 +++--- js/export/exportPDF.js | 2 +- js/export/exportRenderText.js | 6 +++++- js/objects/ocrObjects.js | 14 ++++++++++++++ scribe.js | 10 ++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/js/export/export.js b/js/export/export.js index a6f4a1d..2c2429b 100644 --- a/js/export/export.js +++ b/js/export/export.js @@ -3,7 +3,7 @@ import { layoutRegions, ocrAll, pageMetricsArr } from '../containers/dataContain import { ImageCache } from '../containers/imageContainer.js'; import { reorderOcrPage } from '../modifyOCR.js'; import { saveAs } from '../utils/miscUtils.js'; -import { hocrToPDF } from './exportPDF.js'; +import { renderPDF } from './exportPDF.js'; import { renderHOCR } from './exportRenderHOCR.js'; import { renderText } from './exportRenderText.js'; @@ -60,7 +60,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) { // and assume that the overlay PDF is the same size as the input images. // The `maxpage` argument must be set manually to `inputData.pageCount-1`, as this avoids an error in the case where there is no OCR data (`hocrDownload` has length 0). // In all other cases, this should be equivalent to using the default argument of `-1` (which results in `hocrDownload.length` being used). - const pdfStr = await hocrToPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground, + const pdfStr = await renderPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground, { width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100); const enc = new TextEncoder(); @@ -142,7 +142,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) { }); } } else { - const pdfStr = await hocrToPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed, + const pdfStr = await renderPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100); // The PDF is still run through muPDF, even thought in eBook mode no background layer is added. diff --git a/js/export/exportPDF.js b/js/export/exportPDF.js index bbfa06e..cb246ff 100644 --- a/js/export/exportPDF.js +++ b/js/export/exportPDF.js @@ -31,7 +31,7 @@ import ocr from '../objects/ocrObjects.js'; * * A valid PDF will be created if an empty array is provided for `hocrArr`, as long as `maxpage` is set manually. */ -export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false, +export async function renderPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false, dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, proofOpacity = 0.8) { if (!FontCont.raw) throw new Error('No fonts loaded.'); diff --git a/js/export/exportRenderText.js b/js/export/exportRenderText.js index d26a2af..323fd48 100644 --- a/js/export/exportRenderText.js +++ b/js/export/exportRenderText.js @@ -11,8 +11,10 @@ import { assignParagraphs } from '../utils/reflowPars.js'; * @param {number} maxpage - The last page to include in the document. * @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph. * @param {boolean} docxMode - Create XML for a word document rather than plain text. + * @param {?Array} wordIds - An array of word IDs to include in the document. + * If omitted, all words are included. */ -export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false) { +export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) { let textStr = ''; if (maxpage === -1) maxpage = ocrCurrent.length - 1; @@ -48,6 +50,8 @@ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = f const wordObj = lineObj.words[i]; if (!wordObj) continue; + if (wordIds && !wordIds.includes(wordObj.id)) continue; + if (docxMode) { let fontStyle = ''; if (wordObj.style === 'italic') { diff --git a/js/objects/ocrObjects.js b/js/objects/ocrObjects.js index dd8fd83..3d3e591 100644 --- a/js/objects/ocrObjects.js +++ b/js/objects/ocrObjects.js @@ -226,6 +226,19 @@ export const getPrevLine = (lineObj) => { return lineObj.page.lines[lineIndex - 1]; }; +/** + * + * @param {OcrLine} lineObj + */ +export const getNextLine = (lineObj) => { + // While lines have no unique ID, word IDs are assumed unique. + // Therefore, lines are identified using the ID of the first word. + if (!lineObj.words[0]) throw new Error('All lines must contain >=1 word'); + const lineIndex = lineObj.page.lines.findIndex((elem) => elem.words?.[0]?.id === lineObj.words[0].id); + if (lineIndex + 1 >= lineObj.page.lines.length) return null; + return lineObj.page.lines[lineIndex + 1]; +}; + /** * @param {OcrPage} page * @param {string} id @@ -721,6 +734,7 @@ const ocr = { getParText, getLineText, getPrevLine, + getNextLine, getWordFillOpacity, clonePage, cloneLine, diff --git a/scribe.js b/scribe.js index 615fac7..80d26e5 100644 --- a/scribe.js +++ b/scribe.js @@ -12,6 +12,9 @@ import coords from './js/coordinates.js'; import { drawDebugImages, renderPageStatic } from './js/debug.js'; import { download, exportData } from './js/export/export.js'; import { convertToCSV, writeDebugCsv } from './js/export/exportDebugCsv.js'; +import { renderPDF } from './js/export/exportPDF.js'; +import { renderHOCR } from './js/export/exportRenderHOCR.js'; +import { renderText } from './js/export/exportRenderText.js'; import { extractInternalPDFText } from './js/extractPDFText.js'; import { extractSingleTableContent } from './js/extractTables.js'; import { enableFontOpt, loadBuiltInFontsRaw } from './js/fontContainerMain.js'; @@ -196,6 +199,13 @@ class utils { // Font utils static calcWordMetrics = calcWordMetrics; + // Export functions + static renderPDF = renderPDF; + + static renderHOCR = renderHOCR; + + static renderText = renderText; + // Misc utils static calcBoxOverlap = calcBoxOverlap;