Skip to content

Commit

Permalink
Updated render functions; added getNextLine helper function
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 21, 2024
1 parent a19d59d commit 8b790b4
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 5 deletions.
6 changes: 3 additions & 3 deletions js/export/export.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { layoutRegions, ocrAll, pageMetricsArr } from '../containers/dataContain
import { ImageCache } from '../containers/imageContainer.js';
import { reorderOcrPage } from '../modifyOCR.js';
import { saveAs } from '../utils/miscUtils.js';
import { hocrToPDF } from './exportPDF.js';
import { renderPDF } from './exportPDF.js';
import { renderHOCR } from './exportRenderHOCR.js';
import { renderText } from './exportRenderText.js';

Expand Down Expand Up @@ -60,7 +60,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
// and assume that the overlay PDF is the same size as the input images.
// The `maxpage` argument must be set manually to `inputData.pageCount-1`, as this avoids an error in the case where there is no OCR data (`hocrDownload` has length 0).
// In all other cases, this should be equivalent to using the default argument of `-1` (which results in `hocrDownload.length` being used).
const pdfStr = await hocrToPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
const pdfStr = await renderPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
{ width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);

const enc = new TextEncoder();
Expand Down Expand Up @@ -142,7 +142,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
});
}
} else {
const pdfStr = await hocrToPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
const pdfStr = await renderPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
opt.overlayOpacity / 100);

// The PDF is still run through muPDF, even though in eBook mode no background layer is added.
Expand Down
2 changes: 1 addition & 1 deletion js/export/exportPDF.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import ocr from '../objects/ocrObjects.js';
*
* A valid PDF will be created if an empty array is provided for `hocrArr`, as long as `maxpage` is set manually.
*/
export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
export async function renderPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, proofOpacity = 0.8) {
if (!FontCont.raw) throw new Error('No fonts loaded.');

Expand Down
6 changes: 5 additions & 1 deletion js/export/exportRenderText.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ import { assignParagraphs } from '../utils/reflowPars.js';
* @param {number} maxpage - The last page to include in the document.
* @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph.
* @param {boolean} docxMode - Create XML for a word document rather than plain text.
* @param {?Array<string>} wordIds - An array of word IDs to include in the document.
* If omitted, all words are included.
*/
export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false) {
export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) {
let textStr = '';

if (maxpage === -1) maxpage = ocrCurrent.length - 1;
Expand Down Expand Up @@ -48,6 +50,8 @@ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = f
const wordObj = lineObj.words[i];
if (!wordObj) continue;

if (wordIds && !wordIds.includes(wordObj.id)) continue;

if (docxMode) {
let fontStyle = '';
if (wordObj.style === 'italic') {
Expand Down
14 changes: 14 additions & 0 deletions js/objects/ocrObjects.js
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,19 @@ export const getPrevLine = (lineObj) => {
return lineObj.page.lines[lineIndex - 1];
};

/**
 * Returns the line that immediately follows `lineObj` in `lineObj.page.lines`.
 *
 * @param {OcrLine} lineObj - Line whose successor is requested. Must contain at least one word.
 * @returns {?OcrLine} The following line, or `null` if `lineObj` is the last line on the page
 *    or cannot be located within `lineObj.page.lines`.
 * @throws {Error} If `lineObj` contains no words.
 */
export const getNextLine = (lineObj) => {
  // While lines have no unique ID, word IDs are assumed unique.
  // Therefore, lines are identified using the ID of the first word.
  const firstWord = lineObj.words[0];
  if (!firstWord) throw new Error('All lines must contain >=1 word');
  const { lines } = lineObj.page;
  const lineIndex = lines.findIndex((elem) => elem.words?.[0]?.id === firstWord.id);
  // `findIndex` returns -1 when no line matches. Without this guard, `lineIndex + 1` would be 0
  // and the first line of the page would be incorrectly returned as the "next" line.
  if (lineIndex === -1) return null;
  if (lineIndex + 1 >= lines.length) return null;
  return lines[lineIndex + 1];
};

/**
* @param {OcrPage} page
* @param {string} id
Expand Down Expand Up @@ -721,6 +734,7 @@ const ocr = {
getParText,
getLineText,
getPrevLine,
getNextLine,
getWordFillOpacity,
clonePage,
cloneLine,
Expand Down
10 changes: 10 additions & 0 deletions scribe.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import coords from './js/coordinates.js';
import { drawDebugImages, renderPageStatic } from './js/debug.js';
import { download, exportData } from './js/export/export.js';
import { convertToCSV, writeDebugCsv } from './js/export/exportDebugCsv.js';
import { renderPDF } from './js/export/exportPDF.js';
import { renderHOCR } from './js/export/exportRenderHOCR.js';
import { renderText } from './js/export/exportRenderText.js';
import { extractInternalPDFText } from './js/extractPDFText.js';
import { extractSingleTableContent } from './js/extractTables.js';
import { enableFontOpt, loadBuiltInFontsRaw } from './js/fontContainerMain.js';
Expand Down Expand Up @@ -196,6 +199,13 @@ class utils {
// Font utils
static calcWordMetrics = calcWordMetrics;

// Export functions
static renderPDF = renderPDF;

static renderHOCR = renderHOCR;

static renderText = renderText;

// Misc utils
static calcBoxOverlap = calcBoxOverlap;

Expand Down

0 comments on commit 8b790b4

Please sign in to comment.