diff --git a/README.md b/README.md index 50f2606..500992b 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,7 @@ val df = spark.read.format("pdf") .option("resolution", "200") .option("pagePerPartition", "2") .option("reader", "pdfBox") + .option("ocrConfig", "psm=11") .load("path to the pdf file(s)") df.select("path", "document").show() @@ -164,6 +165,7 @@ df = spark.read.format("pdf") \ .option("resolution", "200") \ .option("pagePerPartition", "2") \ .option("reader", "pdfBox") \ + .option("ocrConfig", "psm=11") \ .load("path to the pdf file(s)") df.select("path", "document").show() diff --git a/src/test/scala/PdfDatasourceSuite.scala b/src/test/scala/PdfDatasourceSuite.scala index 5f4ea88..3ac737e 100644 --- a/src/test/scala/PdfDatasourceSuite.scala +++ b/src/test/scala/PdfDatasourceSuite.scala @@ -86,6 +86,7 @@ class PdfDatasourceSuite extends AnyFunSuite with BeforeAndAfterEach { .option("resolution", "200") .option("pagePerPartition", "2") .option("reader", reader) + .option("ocrConfig", "psm=11") .load(pdfPath) (filePath, fileName, pdfDF) }