Skip to content

Commit

Permalink
feat: read page by page (#7)
Browse files Browse the repository at this point in the history
* first working version to parse page-by-page

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the read page-by-page using bytesio

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM authored Aug 21, 2024
1 parent db951c8 commit 92e02ec
Show file tree
Hide file tree
Showing 13 changed files with 6,307 additions and 45 deletions.
17 changes: 15 additions & 2 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,24 @@ PYBIND11_MODULE(docling_parse, m) {

pybind11::class_<docling::docling_parser>(m, "pdf_parser")
.def(pybind11::init())

.def("set_loglevel", &docling::docling_parser::set_loglevel)

.def("unload_documents", &docling::docling_parser::unload_documents)

.def("find_cells",
pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
"parse pdf-document from path into json")
"parse pdf-document from path into json")

.def("find_cells_from_bytesio",
&docling::docling_parser::find_cells_from_bytesio,
"A function to read a BytesIO object");
"parse pdf-document from a BytesIO object")

.def("find_cells_on_page",
&docling::docling_parser::find_cells_on_page,
"parse specific page in pdf-document from path into json")

.def("find_cells_from_bytesio_on_page",
&docling::docling_parser::find_cells_from_bytesio_on_page,
"parse pdf-document from a BytesIO object for a specific page");
}
236 changes: 215 additions & 21 deletions src/proj_folders/pdf_interface/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ namespace pdf_lib
interface();
~interface();

void clear();

int query(std::string input_file);

//private:
Expand All @@ -36,12 +38,25 @@ namespace pdf_lib

void parse(container_lib::container& input);

bool parse_pdf_page(std::string id,
container_lib::container& raw_page);
bool load_document(std::string filename);
bool unload_document(std::string filename);

std::string load_document(const char* buffer, std::size_t size);

bool unload_documents();

bool parse_pdf_page(std::string filename,
container_lib::container& raw_page);

bool parse_pdf_page(std::string filename, int page,
container_lib::container& raw_page);

bool parse_pdf_page(const char* buffer, std::size_t size,
container_lib::container& raw_page);

bool parse_pdf_page(const char* buffer, std::size_t size, int page,
container_lib::container& raw_page);

bool clean_raw_page(container_lib::container& raw_page);

void clean_pages(container_lib::container& raw_doc);
Expand Down Expand Up @@ -77,6 +92,9 @@ namespace pdf_lib

std::vector<parser_task> tasks;
std::vector<ocr_merge_task> ocr_merge_tasks;

std::map<std::string, std::shared_ptr<pdf_lib::core::object<pdf_lib::core::DOCUMENT> > > loaded_documents;
std::map<std::string, std::shared_ptr<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> > > loaded_parsers;
};

interface<PARSER>::interface():
Expand All @@ -95,6 +113,12 @@ namespace pdf_lib
core::object<core::FONT>::finalize();
}

void interface<PARSER>::clear()
{
loaded_documents.clear();
loaded_parsers.clear();
}

int interface<PARSER>::query(std::string input_file)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
Expand Down Expand Up @@ -503,6 +527,104 @@ namespace pdf_lib
return true;
}

bool interface<PARSER>::load_document(std::string filename)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

if(loaded_documents.count(filename)==0 and
loaded_parsers.count(filename)==0)
{
auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
auto parser = std::make_shared<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> >(*doc);

//pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
//pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);

parser->load_document(filename);
doc->resize_pages(parser->number_of_pages());

loaded_documents[filename] = doc;
loaded_parsers[filename] = parser;
}

return true;
}

bool interface<PARSER>::unload_document(std::string filename)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

if(loaded_documents.count(filename))
{
loaded_documents.erase(filename);
}

if(loaded_parsers.count(filename))
{
loaded_parsers.erase(filename);
}

return true;
}

bool interface<PARSER>::unload_documents()
{
loaded_documents.clear();
loaded_parsers.clear();

return true;
}

bool interface<PARSER>::parse_pdf_page(std::string filename, int page,
container_lib::container &raw_page)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

raw_page.clear();

/*
pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
{
pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
parser.load_document(filename).process_page(page);
}
*/

// lazy loading
load_document(filename);

// clean the document
{
//auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
//loaded_documents[filename] = doc;
}

auto& doc = loaded_documents.at(filename);
auto& parser = loaded_parsers.at(filename);

//parser->set_object(*doc);

parser->process_page_from_document(page);

try
{
pdf_lib::core::writer writer;
writer.execute(*doc, raw_page);

// do clean-up
doc->delete_page(page);
}
catch (...)
{
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
return false;
}

return true;
}

bool interface<PARSER>::parse_pdf_page(const char* buffer, std::size_t size,
container_lib::container &raw_page)
{
Expand Down Expand Up @@ -541,15 +663,80 @@ namespace pdf_lib
return true;
}

std::string interface<PARSER>::load_document(const char* buffer, std::size_t size)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

// we are using the buffer as a way to disambiguate doc's. Is not completely fool-proof
std::string key = "document-"+std::to_string(size);

if(loaded_documents.count(key)==0 and
loaded_parsers.count(key)==0)
{
auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
auto parser = std::make_shared<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> >(*doc);

std::string desc = "parsing document buffer via BytesIO";
parser->load_buffer(desc.c_str(), buffer, size);

doc->resize_pages(parser->number_of_pages());

loaded_documents[key] = doc;
loaded_parsers[key] = parser;
}

return key;
}

bool interface<PARSER>::parse_pdf_page(const char* buffer, std::size_t size, int page,
container_lib::container &raw_page)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

raw_page.clear();

/*
pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
{
std::string desc = "parsing document buffer via BytesIO";
pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
parser.load_buffer(desc.c_str(), buffer, size).process_all();
}
*/

// lazy loading
std::string key = load_document(buffer, size);

auto& doc = loaded_documents.at(key);
auto& parser = loaded_parsers.at(key);

parser->process_page_from_document(page);

try
{
pdf_lib::core::writer writer;
writer.execute(*doc, raw_page);
}
catch (...)
{
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
<< "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
return false;
}

return true;
}

bool interface<PARSER>::clean_raw_page(container_lib::container& raw_page)
{
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

typedef float scalar_type;

logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
<< "#-cells: " << raw_page["pages"][0]["cells"].get_size();

//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
//<< "#-cells: " << raw_page["pages"][0]["cells"].get_size();

post_processor<BUILD_OVERVIEW, scalar_type> overview;
{
Expand Down Expand Up @@ -623,22 +810,30 @@ namespace pdf_lib
{
container_lib::container &page = pages[k];

{
post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}

{
post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}

{
post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}
if(page.has(core::keys<core::PAGE>::cells()))
{
{
post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}

{
post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}

{
post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
post_processor.execute(page);
}

}
else
{
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
<< "skipping page: no cells";
}
}

}

bool interface<PARSER>::is_acceptable(container_lib::container& raw_doc, container_lib::container& page_stats)
Expand All @@ -664,7 +859,6 @@ namespace pdf_lib

container_lib::container& v_paths = page[core::keys<core::PAGE>::vertical_lines()];
container_lib::container& h_paths = page[core::keys<core::PAGE>::horizontal_lines()];


logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << " raw-doc:\n"
<< "\t#-cells: " << cells.get_size() << "\n"
Expand Down
Loading

0 comments on commit 92e02ec

Please sign in to comment.