From 7690b8bacd120967487bbc974b2f710f2c9249b5 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 8 Jan 2024 19:00:11 +0100 Subject: [PATCH] PDF to HTML (#349) --- CMakeLists.txt | 3 +- src/odr/exceptions.cpp | 2 + src/odr/exceptions.hpp | 4 + src/odr/file.cpp | 40 ++++- src/odr/file.hpp | 20 +++ src/odr/html.cpp | 40 +++-- src/odr/html.hpp | 8 + src/odr/internal/html/pdf_file.cpp | 152 ++++++++++++++++++ src/odr/internal/html/pdf_file.hpp | 20 +++ src/odr/internal/magic.cpp | 14 +- src/odr/internal/magic.hpp | 6 +- src/odr/internal/open_strategy.cpp | 3 + src/odr/internal/pdf/pdf_document_parser.cpp | 8 +- src/odr/internal/pdf/pdf_file.cpp | 22 +++ src/odr/internal/pdf/pdf_file.hpp | 24 +++ .../internal/pdf/pdf_graphics_operator.hpp | 11 +- .../pdf/pdf_graphics_operator_parser.cpp | 11 +- src/odr/internal/pdf/pdf_graphics_state.cpp | 30 ++-- src/odr/internal/pdf/pdf_graphics_state.hpp | 21 +-- src/odr/internal/pdf/pdf_object_parser.cpp | 5 +- src/odr/open_document_reader.cpp | 50 ++++-- src/odr/open_document_reader.hpp | 26 ++- test/CMakeLists.txt | 2 +- test/data/reference-output/odr-public | 2 +- ...eference_test.cpp => html_output_test.cpp} | 51 +++--- 25 files changed, 476 insertions(+), 99 deletions(-) create mode 100644 src/odr/internal/html/pdf_file.cpp create mode 100644 src/odr/internal/html/pdf_file.hpp create mode 100644 src/odr/internal/pdf/pdf_file.cpp create mode 100644 src/odr/internal/pdf/pdf_file.hpp rename test/src/{output_reference_test.cpp => html_output_test.cpp} (74%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a8514b39..c6277ef4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,7 @@ add_library(odr "src/odr/internal/html/filesystem.cpp" "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" + "src/odr/internal/html/pdf_file.cpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -152,7 +153,7 @@ add_library(odr "src/odr/internal/pdf/pdf_document.cpp" "src/odr/internal/pdf/pdf_document_element.cpp" "src/odr/internal/pdf/pdf_document_parser.cpp" - "src/odr/internal/pdf/pdf_object.cpp" + "src/odr/internal/pdf/pdf_file.cpp" "src/odr/internal/pdf/pdf_file_parser.cpp" "src/odr/internal/pdf/pdf_graphics_operator.cpp" "src/odr/internal/pdf/pdf_graphics_operator_parser.cpp" diff --git a/src/odr/exceptions.cpp b/src/odr/exceptions.cpp index ad6c30f6..65d97585 100644 --- a/src/odr/exceptions.cpp +++ b/src/odr/exceptions.cpp @@ -44,6 +44,8 @@ NoOpenDocumentFile::NoOpenDocumentFile() NoOfficeOpenXmlFile::NoOfficeOpenXmlFile() : std::runtime_error("not an office open xml file") {} +NoPdfFile::NoPdfFile() : std::runtime_error("not a pdf file") {} + NoXml::NoXml() : std::runtime_error("not xml") {} UnsupportedCryptoAlgorithm::UnsupportedCryptoAlgorithm() diff --git a/src/odr/exceptions.hpp b/src/odr/exceptions.hpp index b47663d6..ac7ec02f 100644 --- a/src/odr/exceptions.hpp +++ b/src/odr/exceptions.hpp @@ -80,6 +80,10 @@ struct NoOfficeOpenXmlFile final : public std::runtime_error { NoOfficeOpenXmlFile(); }; +struct NoPdfFile final : public std::runtime_error { + NoPdfFile(); +}; + struct NoXml final : public std::runtime_error { NoXml(); }; diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 9e126085..565fe8a5 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -89,9 +90,7 @@ DecodedFile::DecodedFile(const std::string &path, FileType as) DecodedFile::operator bool() const { return m_impl.operator bool(); } -FileType DecodedFile::file_type() const noexcept { - return m_impl->file_meta().type; -} +FileType DecodedFile::file_type() const noexcept { return m_impl->file_type(); } FileCategory DecodedFile::file_category() const noexcept { return m_impl->file_category(); @@ -101,6 +100,30 @@ FileMeta DecodedFile::file_meta() const noexcept { return m_impl->file_meta(); } File DecodedFile::file() const { return File(m_impl->file()); } +bool DecodedFile::is_text_file() const { + return std::dynamic_pointer_cast(m_impl) != + nullptr; +} + +bool DecodedFile::is_image_file() const { + return std::dynamic_pointer_cast(m_impl) != + nullptr; +} + +bool DecodedFile::is_archive_file() const { + return std::dynamic_pointer_cast(m_impl) != + nullptr; +} + +bool DecodedFile::is_document_file() const { + return std::dynamic_pointer_cast(m_impl) != + nullptr; +} + +bool DecodedFile::is_pdf_file() const { + return std::dynamic_pointer_cast(m_impl) != nullptr; +} + TextFile DecodedFile::text_file() const { if (auto text_file = std::dynamic_pointer_cast(m_impl)) { @@ -133,6 +156,14 @@ DocumentFile DecodedFile::document_file() const { throw NoDocumentFile(); } +PdfFile DecodedFile::pdf_file() const { + if (auto pdf_file = + std::dynamic_pointer_cast(m_impl)) { + return PdfFile(pdf_file); + } + throw NoPdfFile(); +} + TextFile::TextFile(std::shared_ptr impl) : DecodedFile(impl), m_impl{std::move(impl)} {} @@ -198,4 +229,7 @@ DocumentMeta DocumentFile::document_meta() const { Document DocumentFile::document() const { return Document(m_impl->document()); } +PdfFile::PdfFile(std::shared_ptr impl) + : DecodedFile(impl), m_impl{std::move(impl)} {} + } // namespace odr diff --git a/src/odr/file.hpp b/src/odr/file.hpp index c849f36d..39ea4aa6 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -16,11 +16,16 @@ class ArchiveFile; class DocumentFile; } // namespace odr::internal::abstract +namespace odr::internal::pdf { +class PdfFile; +} + namespace odr { class TextFile; class ImageFile; class ArchiveFile; class DocumentFile; +class PdfFile; class Archive; class Document; @@ -166,10 +171,17 @@ class DecodedFile { [[nodiscard]] File file() const; + [[nodiscard]] bool is_text_file() const; + [[nodiscard]] bool is_image_file() const; + [[nodiscard]] bool is_archive_file() const; + [[nodiscard]] bool is_document_file() const; + [[nodiscard]] bool is_pdf_file() const; + [[nodiscard]] TextFile text_file() const; [[nodiscard]] ImageFile image_file() const; [[nodiscard]] ArchiveFile archive_file() const; [[nodiscard]] DocumentFile document_file() const; + [[nodiscard]] PdfFile pdf_file() const; protected: std::shared_ptr m_impl; @@ -229,6 +241,14 @@ class DocumentFile final : public DecodedFile { std::shared_ptr m_impl; }; +class PdfFile final : public DecodedFile { +public: + explicit PdfFile(std::shared_ptr); + +private: + std::shared_ptr m_impl; +}; + } // namespace odr #endif // ODR_FILE_HPP diff --git a/src/odr/html.cpp b/src/odr/html.cpp index 4ade286d..e1a50578 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -9,12 +9,13 @@ #include #include #include +#include #include -#include - #include +#include + using namespace odr::internal; namespace fs = std::filesystem; @@ -53,21 +54,32 @@ Html html::translate(const File &file, const std::string &output_path, const PasswordCallback &password_callback) { auto decoded_file = DecodedFile(file); - if (decoded_file.file_category() == FileCategory::text) { - return translate(decoded_file.text_file(), output_path, config); - } else if (decoded_file.file_category() == FileCategory::image) { - return translate(decoded_file.image_file(), output_path, config); - } else if (decoded_file.file_category() == FileCategory::archive) { - return translate(decoded_file.archive_file().archive(), output_path, - config); - } else if (decoded_file.file_category() == FileCategory::document) { + if (decoded_file.is_document_file()) { DocumentFile document_file = decoded_file.document_file(); if (document_file.password_encrypted()) { if (!document_file.decrypt(password_callback())) { throw WrongPassword(); } } - return translate(document_file.document(), output_path, config); + } + + return translate(decoded_file, output_path, config); +} + +Html html::translate(const DecodedFile &decoded_file, + const std::string &output_path, const HtmlConfig &config) { + if (decoded_file.is_text_file()) { + return translate(decoded_file.text_file(), output_path, config); + } else if (decoded_file.is_image_file()) { + return translate(decoded_file.image_file(), output_path, config); + } else if (decoded_file.is_archive_file()) { + return translate(decoded_file.archive_file().archive(), output_path, + config); + } else if (decoded_file.is_document_file()) { + return translate(decoded_file.document_file().document(), output_path, + config); + } else if (decoded_file.is_pdf_file()) { + return translate(decoded_file.pdf_file(), output_path, config); } throw UnsupportedFileType(decoded_file.file_type()); @@ -98,6 +110,12 @@ Html html::translate(const Document &document, const std::string &output_path, return internal::html::translate_document(document, output_path, config); } +Html html::translate(const PdfFile &pdf_file, const std::string &output_path, + const HtmlConfig &config) { + fs::create_directories(output_path); + return internal::html::translate_pdf_file(pdf_file, output_path, config); +} + void html::edit(const Document &document, const char *diff) { auto json = nlohmann::json::parse(diff); for (const auto &[key, value] : json["modifiedText"].items()) { diff --git a/src/odr/html.hpp b/src/odr/html.hpp index 6c3d6c50..f7369933 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -90,9 +90,13 @@ struct HtmlPage final { using PasswordCallback = std::function; namespace html { + Html translate(const File &file, const std::string &output_path, const HtmlConfig &config, const PasswordCallback &password_callback); +Html translate(const DecodedFile &file, const std::string &output_path, + const HtmlConfig &config); + Html translate(const TextFile &text_file, const std::string &output_path, const HtmlConfig &config); Html translate(const ImageFile &image_file, const std::string &output_path, @@ -101,7 +105,11 @@ Html translate(const Archive &archive, const std::string &output_path, const HtmlConfig &config); Html translate(const Document &document, const std::string &output_path, const HtmlConfig &config); +Html translate(const PdfFile &pdf_file, const std::string &output_path, + const HtmlConfig &config); + void edit(const Document &document, const char *diff); + } // namespace html } // namespace odr diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp new file mode 100644 index 00000000..f272c32b --- /dev/null +++ b/src/odr/internal/html/pdf_file.cpp @@ -0,0 +1,152 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace odr::internal { + +Html html::translate_pdf_file(const PdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + auto in = pdf_file.file().stream(); + pdf::DocumentParser parser(*in); + + std::unique_ptr document = parser.parse_document(); + + std::vector ordered_pages; + std::function recurse_pages = + [&](pdf::Pages *pages) { + for (pdf::Element *kid : pages->kids) { + if (auto p = dynamic_cast(kid); p != nullptr) { + recurse_pages(p); + } else if (auto page = dynamic_cast(kid); + page != nullptr) { + ordered_pages.push_back(page); + } else { + throw std::runtime_error("unhandled element"); + } + } + }; + + recurse_pages(document->catalog->pages); + + auto output_file_path = output_path + "/document.html"; + std::ofstream ostream(output_file_path); + if (!ostream.is_open()) { + throw FileWriteError(); + } + HtmlWriter out(ostream, config.format_html, config.html_indent); + + out.write_begin(); + out.write_header_begin(); + out.write_header_charset("UTF-8"); + out.write_header_target("_blank"); + out.write_header_title("odr"); + out.write_header_viewport( + "width=device-width,initial-scale=1.0,user-scalable=yes"); + out.write_header_end(); + + out.write_body_begin(); + + for (pdf::Page *page : ordered_pages) { + pdf::Array page_box = page->object.as_dictionary()["MediaBox"].as_array(); + + out.write_element_begin( + "div", HtmlElementOptions().set_style([&](std::ostream &o) { + o << "position:relative;"; + o << "width:" << page_box[2].as_real() / 72.0 << "in;"; + o << "height:" << page_box[3].as_real() / 72.0 << "in;"; + })); + + pdf::IndirectObject page_contents_object = + parser.read_object(page->contents_reference); + std::string stream = parser.read_object_stream(page_contents_object); + std::string page_content = crypto::util::zlib_inflate(stream); + + std::istringstream ss(page_content); + pdf::GraphicsOperatorParser parser2(ss); + pdf::GraphicsState state; + while (!ss.eof()) { + pdf::GraphicsOperator op = parser2.read_operator(); + state.execute(op); + + if (op.type == pdf::GraphicsOperatorType::text_next_line) { + double leading = state.current().text.leading; + double size = state.current().text.size; + + state.current().text.offset[1] -= size + leading; + } else if (op.type == pdf::GraphicsOperatorType::show_text) { + const std::string &font_ref = state.current().text.font; + double size = state.current().text.size; + + std::array offset = state.current().text.offset; + + pdf::Font *font = page->resources->font.at(font_ref); + + const std::string &glyphs = op.arguments[0].as_string(); + std::string unicode = font->cmap.translate_string(glyphs); + + if (unicode.find("Colored Line") != std::string::npos) { + std::cout << "hi" << std::endl; + } + + out.write_element_begin( + "span", HtmlElementOptions().set_style([&](std::ostream &o) { + o << "position:absolute;"; + o << "left:" << offset[0] / 72.0 << "in;"; + o << "bottom:" << offset[1] / 72.0 << "in;"; + o << "font-size:" << size << "pt;"; + })); + out.write_raw(unicode); + out.write_element_end("span"); + } else if (op.type == + pdf::GraphicsOperatorType::show_text_manual_spacing) { + const std::string &font_ref = state.current().text.font; + pdf::Font *font = page->resources->font.at(font_ref); + double size = state.current().text.size; + + std::cout << font->object << std::endl; + + for (const auto &element : op.arguments[0].as_array()) { + if (element.is_real()) { + std::cout << "spacing: " << element.as_real() << std::endl; + } else if (element.is_string()) { + const std::string &glyphs = element.as_string(); + std::string unicode = font->cmap.translate_string(glyphs); + std::cout << "show text manual spacing: font=" << font + << ", size=" << size << ", text=" << unicode << std::endl; + } + } + } else if (op.type == pdf::GraphicsOperatorType::show_text_next_line) { + std::cout << "TODO show_text_next_line" << std::endl; + } else if (op.type == + pdf::GraphicsOperatorType::show_text_next_line_set_spacing) { + std::cout << "TODO show_text_next_line_set_spacing" << std::endl; + } + } + + out.write_element_end("div"); + } + + out.write_body_end(); + out.write_end(); + + return {FileType::portable_document_format, + config, + {{"document", output_file_path}}}; +} + +} // namespace odr::internal diff --git a/src/odr/internal/html/pdf_file.hpp b/src/odr/internal/html/pdf_file.hpp new file mode 100644 index 00000000..6df0f55a --- /dev/null +++ b/src/odr/internal/html/pdf_file.hpp @@ -0,0 +1,20 @@ +#ifndef ODR_INTERNAL_PDF_FILE_HPP +#define ODR_INTERNAL_PDF_FILE_HPP + +#include + +namespace odr { +class PdfFile; + +struct HtmlConfig; +class Html; +} // namespace odr + +namespace odr::internal::html { + +Html translate_pdf_file(const PdfFile &pdf_file, const std::string &output_path, + const HtmlConfig &config); + +} + +#endif // ODR_INTERNAL_PDF_FILE_HPP diff --git a/src/odr/internal/magic.cpp b/src/odr/internal/magic.cpp index f0605012..3a3b0f3c 100644 --- a/src/odr/internal/magic.cpp +++ b/src/odr/internal/magic.cpp @@ -1,5 +1,7 @@ #include +#include + #include #include @@ -58,13 +60,21 @@ FileType magic::file_type(const std::string &head) { return FileType::unknown; } -FileType magic::file_type(const internal::abstract::File &file) { +FileType magic::file_type(std::istream &in) { static constexpr auto max_head_size = 12; char head[max_head_size]; - file.stream()->read(head, sizeof(head)); + in.read(head, sizeof(head)); return file_type(std::string(head, max_head_size)); } +FileType magic::file_type(const internal::abstract::File &file) { + return file_type(*file.stream()); +} + +FileType magic::file_type(const File &file) { + return file_type(*file.stream()); +} + } // namespace odr::internal diff --git a/src/odr/internal/magic.hpp b/src/odr/internal/magic.hpp index 2372db73..ac5a05f5 100644 --- a/src/odr/internal/magic.hpp +++ b/src/odr/internal/magic.hpp @@ -1,11 +1,13 @@ #ifndef ODR_MAGIC_HPP #define ODR_MAGIC_HPP +#include #include namespace odr { enum class FileType; -} +class File; +} // namespace odr namespace odr::internal::abstract { class File; @@ -13,7 +15,9 @@ class File; namespace odr::internal::magic { FileType file_type(const std::string &magic); +FileType file_type(std::istream &in); FileType file_type(const internal::abstract::File &file); +FileType file_type(const File &file); } // namespace odr::internal::magic #endif // ODR_MAGIC_HPP diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index ba2c8adc..31711a01 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -133,6 +134,8 @@ open_strategy::open_file(std::shared_ptr file) { } return cfb; + } else if (file_type == FileType::portable_document_format) { + return std::make_unique(file); } else if (file_type == FileType::portable_network_graphics || file_type == FileType::graphics_interchange_format || file_type == FileType::jpeg || diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index a08d0041..85cf9be4 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -94,9 +94,11 @@ pdf::Page *parse_page(DocumentParser &parser, const ObjectReference &reference, page->resources = parse_resources(parser, dictionary["Resources"].as_reference(), document); - for (Object annotation : dictionary["Annots"].as_array()) { - page->annotations.push_back( - parse_annotation(parser, annotation.as_reference(), document)); + if (dictionary.has_key("Annots")) { + for (Object annotation : dictionary["Annots"].as_array()) { + page->annotations.push_back( + parse_annotation(parser, annotation.as_reference(), document)); + } } return page; diff --git a/src/odr/internal/pdf/pdf_file.cpp b/src/odr/internal/pdf/pdf_file.cpp new file mode 100644 index 00000000..00260c0f --- /dev/null +++ b/src/odr/internal/pdf/pdf_file.cpp @@ -0,0 +1,22 @@ +#include + +namespace odr::internal::pdf { + +PdfFile::PdfFile(std::shared_ptr file) + : m_file{std::move(file)} {} + +FileCategory PdfFile::file_category() const noexcept { + return FileCategory::document; +} + +std::shared_ptr PdfFile::file() const noexcept { + return m_file; +} + +FileType PdfFile::file_type() const noexcept { + return FileType::portable_document_format; +} + +FileMeta PdfFile::file_meta() const noexcept { return {}; } + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_file.hpp b/src/odr/internal/pdf/pdf_file.hpp new file mode 100644 index 00000000..42b751be --- /dev/null +++ b/src/odr/internal/pdf/pdf_file.hpp @@ -0,0 +1,24 @@ +#ifndef ODR_INTERNAL_PDF_FILE_HPP +#define ODR_INTERNAL_PDF_FILE_HPP + +#include + +namespace odr::internal::pdf { + +class PdfFile : public abstract::DecodedFile { +public: + explicit PdfFile(std::shared_ptr file); + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileType file_type() const noexcept final; + [[nodiscard]] FileCategory file_category() const noexcept final; + [[nodiscard]] FileMeta file_meta() const noexcept final; + +private: + std::shared_ptr m_file; +}; + +} // namespace odr::internal::pdf + +#endif // ODR_INTERNAL_PDF_FILE_HPP diff --git a/src/odr/internal/pdf/pdf_graphics_operator.hpp b/src/odr/internal/pdf/pdf_graphics_operator.hpp index f67db696..77543cf4 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator.hpp +++ b/src/odr/internal/pdf/pdf_graphics_operator.hpp @@ -132,6 +132,11 @@ class GraphicsArgument { enum class GraphicsOperatorType { unknown, + save_state, + restore_state, + + set_matrix, + set_line_width, set_cap_style, set_join_style, @@ -141,10 +146,6 @@ enum class GraphicsOperatorType { set_flatness_tolerance, set_graphics_state_parameters, - save_state, - restore_state, - set_matrix, - draw_object, begin_inline_image, begin_inline_image_data, @@ -184,7 +185,7 @@ enum class GraphicsOperatorType { set_text_rise, text_next_line_relative, - text_next_line_text_leading, + text_next_line_relative_leading, set_text_matrix, text_next_line, diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp index 7e153a91..258578b0 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp +++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp @@ -11,6 +11,11 @@ namespace { GraphicsOperatorType operator_name_to_type(const std::string &name) { static std::unordered_map mapping = { + {"q", GraphicsOperatorType::save_state}, + {"Q", GraphicsOperatorType::restore_state}, + + {"cm", GraphicsOperatorType::set_matrix}, + {"w", GraphicsOperatorType::set_line_width}, {"J", GraphicsOperatorType::set_cap_style}, {"j", GraphicsOperatorType::set_join_style}, @@ -20,10 +25,6 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) { {"i", GraphicsOperatorType::set_flatness_tolerance}, {"gm", GraphicsOperatorType::set_graphics_state_parameters}, - {"q", GraphicsOperatorType::save_state}, - {"Q", GraphicsOperatorType::restore_state}, - {"cm", GraphicsOperatorType::set_matrix}, - {"Do", GraphicsOperatorType::draw_object}, {"BI", GraphicsOperatorType::begin_inline_image}, {"ID", GraphicsOperatorType::begin_inline_image_data}, @@ -64,7 +65,7 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) { {"Ts", GraphicsOperatorType::set_text_rise}, {"Td", GraphicsOperatorType::text_next_line_relative}, - {"TD", GraphicsOperatorType::text_next_line_text_leading}, + {"TD", GraphicsOperatorType::text_next_line_relative_leading}, {"Tm", GraphicsOperatorType::set_text_matrix}, {"T*", GraphicsOperatorType::text_next_line}, diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 36dd6392..01f94f16 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -39,6 +39,12 @@ void GraphicsState::execute(const GraphicsOperator &op) { stack.pop_back(); break; + case GraphicsOperatorType::set_matrix: + for (int i = 0; i < 6; ++i) { + current().general.transform_matrix.at(i) = op.arguments.at(i).as_real(); + } + break; + case GraphicsOperatorType::set_line_width: current().general.line_width = op.arguments.at(0).as_real(); break; @@ -64,11 +70,6 @@ void GraphicsState::execute(const GraphicsOperator &op) { current().general.graphics_state_parameters = op.arguments.at(0).as_string(); break; - case GraphicsOperatorType::set_matrix: - for (int i = 0; i < 6; ++i) { - current().general.transform_matrix.at(i) = op.arguments.at(i).as_real(); - } - break; case GraphicsOperatorType::path_move_to: for (int i = 0; i < 2; ++i) { @@ -106,22 +107,33 @@ void GraphicsState::execute(const GraphicsOperator &op) { current().text.horizontal_scaling = op.arguments.at(0).as_real(); break; case GraphicsOperatorType::set_text_leading: - current().text.text_leading = op.arguments.at(0).as_real(); + current().text.leading = op.arguments.at(0).as_real(); break; case GraphicsOperatorType::set_text_font_size: current().text.font = op.arguments.at(0).as_string(); current().text.size = op.arguments.at(1).as_real(); break; case GraphicsOperatorType::set_text_rendering_mode: - current().text.text_rendering_mode = op.arguments.at(0).as_integer(); + current().text.rendering_mode = op.arguments.at(0).as_integer(); break; case GraphicsOperatorType::set_text_rise: - current().text.text_rise = op.arguments.at(0).as_real(); + current().text.rise = op.arguments.at(0).as_real(); break; + case GraphicsOperatorType::text_next_line_relative: + for (int i = 0; i < 2; ++i) { + current().text.offset.at(i) += op.arguments.at(i).as_real(); + } + break; + case GraphicsOperatorType::text_next_line_relative_leading: + current().text.leading = -op.arguments.at(1).as_real(); + for (int i = 0; i < 2; ++i) { + current().text.offset.at(i) += op.arguments.at(i).as_real(); + } + break; case GraphicsOperatorType::set_text_matrix: for (int i = 0; i < 6; ++i) { - current().general.transform_matrix.at(i) = op.arguments.at(i).as_real(); + current().text.transform_matrix.at(i) = op.arguments.at(i).as_real(); } break; diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 8617a673..0bc2806c 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -27,24 +27,25 @@ struct GraphicsState { double color_rendering_intent{}; double flatness_tolerance{}; std::string graphics_state_parameters; - std::array transform_matrix{}; + std::array transform_matrix{1, 0, 0, 1, 0, 0}; }; struct Path { - std::array current_position{}; + std::array current_position{0, 0}; // TODO clipping }; struct Text { - double char_spacing{}; - double word_spacing{}; - double horizontal_scaling{}; - double text_leading{}; - std::string font{}; + double char_spacing{0}; + double word_spacing{0}; + double horizontal_scaling{1}; + double leading{0}; + std::string font; double size{}; - int text_rendering_mode{}; - double text_rise{}; - std::array transform_matrix{}; + int rendering_mode{0}; + double rise{0}; + std::array offset{0, 0}; + std::array transform_matrix{1, 0, 0, 1, 0, 0}; std::array glyph_width{}; std::array glyph_bounding_box{}; }; diff --git a/src/odr/internal/pdf/pdf_object_parser.cpp b/src/odr/internal/pdf/pdf_object_parser.cpp index e8d06f92..a1bf787d 100644 --- a/src/odr/internal/pdf/pdf_object_parser.cpp +++ b/src/odr/internal/pdf/pdf_object_parser.cpp @@ -11,6 +11,7 @@ namespace odr::internal::pdf { using char_type = std::streambuf::char_type; using int_type = std::streambuf::int_type; static constexpr int_type eof = std::streambuf::traits_type::eof(); +using pos_type = std::streambuf::pos_type; namespace { @@ -121,9 +122,9 @@ std::variant ObjectParser::read_integer_or_real() const { } sb().sbumpc(); - std::streamsize begin = in().gcount(); + pos_type begin = in().tellg(); UnsignedInteger i2 = read_unsigned_integer(); - std::streamsize end = in().gcount(); + pos_type end = in().tellg(); return i + i2 * std::pow(10.0, begin - end); } diff --git a/src/odr/open_document_reader.cpp b/src/odr/open_document_reader.cpp index 58d1a8db..e4121cd3 100644 --- a/src/odr/open_document_reader.cpp +++ b/src/odr/open_document_reader.cpp @@ -1,7 +1,5 @@ #include -#include -#include #include #include @@ -174,28 +172,24 @@ DecodedFile OpenDocumentReader::open(const std::string &path) { return DecodedFile(path); } -Html OpenDocumentReader::html(const std::string &path, const char *password, +Html OpenDocumentReader::html(const std::string &path, + const PasswordCallback &password_callback, const std::string &output_path, const HtmlConfig &config) { - return html(DecodedFile(path), password, output_path, config); + return html(File(path), password_callback, output_path, config); } -Html OpenDocumentReader::html(const DecodedFile &file, const char *password, +Html OpenDocumentReader::html(const File &file, + const PasswordCallback &password_callback, const std::string &output_path, const HtmlConfig &config) { - if (file.file_type() == FileType::text_file) { - return html(file.text_file(), output_path, config); - } else if (file.file_category() == FileCategory::document) { - auto document_file = file.document_file(); - if (document_file.password_encrypted()) { - if ((password == nullptr) || !document_file.decrypt(password)) { - throw WrongPassword(); - } - } - return html(document_file.document(), output_path, config); - } + return html::translate(file, output_path, config, password_callback); +} - throw UnknownFileType(); +Html OpenDocumentReader::html(const DecodedFile &file, + const std::string &output_path, + const HtmlConfig &config) { + return html::translate(file, output_path, config); } Html OpenDocumentReader::html(const TextFile &text_file, @@ -204,12 +198,34 @@ Html OpenDocumentReader::html(const TextFile &text_file, return html::translate(text_file, output_path, config); } +Html OpenDocumentReader::html(const ImageFile &image_file, + const std::string &output_path, + const HtmlConfig &config) { + return html::translate(image_file, output_path, config); +} + +Html OpenDocumentReader::html(const Archive &archive, + const std::string &output_path, + const HtmlConfig &config) { + return html::translate(archive, output_path, config); +} + Html OpenDocumentReader::html(const Document &document, const std::string &output_path, const HtmlConfig &config) { return html::translate(document, output_path, config); } +Html OpenDocumentReader::html(const PdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + return html::translate(pdf_file, output_path, config); +} + +void OpenDocumentReader::edit(const Document &document, const char *diff) { + html::edit(document, diff); +} + void OpenDocumentReader::copy_resources(const std::string &to_path) { auto resources = internal::Resources::instance(); diff --git a/src/odr/open_document_reader.hpp b/src/odr/open_document_reader.hpp index fc251fca..5055c75e 100644 --- a/src/odr/open_document_reader.hpp +++ b/src/odr/open_document_reader.hpp @@ -1,18 +1,25 @@ #ifndef ODR_OPEN_DOCUMENT_READER_HPP #define ODR_OPEN_DOCUMENT_READER_HPP +#include #include #include namespace odr { enum class FileType; enum class FileCategory; +class File; class DecodedFile; class TextFile; +class ImageFile; +class Archive; class Document; +class PdfFile; class Html; struct HtmlConfig; +using PasswordCallback = std::function; + class OpenDocumentReader final { public: [[nodiscard]] static std::string version() noexcept; @@ -27,18 +34,33 @@ class OpenDocumentReader final { [[nodiscard]] static DecodedFile open(const std::string &path); [[nodiscard]] static Html html(const std::string &input_path, - const char *password, + const PasswordCallback &password_callback, + const std::string &output_path, + const HtmlConfig &config); + [[nodiscard]] static Html html(const File &file, + const PasswordCallback &password_callback, const std::string &output_path, const HtmlConfig &config); - [[nodiscard]] static Html html(const DecodedFile &file, const char *password, + [[nodiscard]] static Html html(const DecodedFile &file, const std::string &output_path, const HtmlConfig &config); [[nodiscard]] static Html html(const TextFile &text_file, const std::string &output_path, const HtmlConfig &config); + [[nodiscard]] static Html html(const ImageFile &image_file, + const std::string &output_path, + const HtmlConfig &config); + [[nodiscard]] static Html html(const Archive &archive, + const std::string &output_path, + const HtmlConfig &config); [[nodiscard]] static Html html(const Document &document, const std::string &output_path, const HtmlConfig &config); + [[nodiscard]] static Html html(const PdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config); + + void edit(const Document &document, const char *diff); static void copy_resources(const std::string &to_path); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index db58ba9f..34cca9f6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,7 +14,7 @@ add_executable(odr_test "src/document_test.cpp" "src/file_test.cpp" "src/open_document_reader_test.cpp" - "src/output_reference_test.cpp" + "src/html_output_test.cpp" "src/quantity_test.cpp" "src/internal/magic_test.cpp" diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index 5d02b56b..471b3089 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit 5d02b56b870baaf6b147fa5711ed0e3ab998700b +Subproject commit 471b3089b520fc37b44e914ac9cfaf7603f48aa0 diff --git a/test/src/output_reference_test.cpp b/test/src/html_output_test.cpp similarity index 74% rename from test/src/output_reference_test.cpp rename to test/src/html_output_test.cpp index d85a46c8..bea05802 100644 --- a/test/src/output_reference_test.cpp +++ b/test/src/html_output_test.cpp @@ -21,16 +21,16 @@ using namespace odr::internal; using namespace odr::test; namespace fs = std::filesystem; -using OutputReferenceTests = ::testing::TestWithParam; +using HtmlOutputTests = ::testing::TestWithParam; -TEST_P(OutputReferenceTests, html_meta) { - const auto test_file_path = GetParam(); - const auto test_file = TestData::test_file(test_file_path); +TEST_P(HtmlOutputTests, html_meta) { + const std::string test_file_path = GetParam(); + const TestFile test_file = TestData::test_file(test_file_path); - const auto test_repo = *common::Path(test_file_path).begin(); - const auto output_path_prefix = + const std::string test_repo = *common::Path(test_file_path).begin(); + const std::string output_path_prefix = common::Path("output").join(test_repo).join("output").string(); - const auto output_path = + const std::string output_path = common::Path(output_path_prefix) .join(common::Path(test_file_path).rebase(test_repo)) .string(); @@ -41,7 +41,6 @@ TEST_P(OutputReferenceTests, html_meta) { // these files cannot be opened if (util::string::ends_with(test_file.path, ".sxw") || - (test_file.type == FileType::portable_document_format) || (test_file.type == FileType::legacy_word_document) || (test_file.type == FileType::legacy_powerpoint_presentation) || (test_file.type == FileType::legacy_excel_worksheets) || @@ -50,23 +49,30 @@ TEST_P(OutputReferenceTests, html_meta) { GTEST_SKIP(); } + // TODO fix + if ((test_file.type == FileType::portable_document_format) && + (test_repo != "odr-public")) { + GTEST_SKIP(); + } + const DecodedFile file{test_file.path}; - auto file_meta = file.file_meta(); + FileMeta file_meta = file.file_meta(); // encrypted ooxml type cannot be inspected if ((file.file_type() != FileType::office_open_xml_encrypted)) { EXPECT_EQ(test_file.type, file.file_type()); } + // TODO enable zip, csv, json if ((test_file.type == FileType::zip) || (test_file.type == FileType::comma_separated_values) || (test_file.type == FileType::javascript_object_notation)) { GTEST_SKIP(); } - if (file.file_category() == FileCategory::document) { - auto document_file = file.document_file(); + if (file.is_document_file()) { + DocumentFile document_file = file.document_file(); EXPECT_EQ(test_file.password_encrypted, document_file.password_encrypted()); if (document_file.password_encrypted()) { @@ -80,14 +86,15 @@ TEST_P(OutputReferenceTests, html_meta) { { const std::string meta_output = output_path + "/meta.json"; - const auto json = odr::internal::util::meta::meta_to_json(file_meta); + const nlohmann::json json = + odr::internal::util::meta::meta_to_json(file_meta); std::ofstream o(meta_output); o << std::setw(4) << json << std::endl; EXPECT_TRUE(fs::is_regular_file(meta_output)); EXPECT_LT(0, fs::file_size(meta_output)); } - const auto resource_path = + const std::string resource_path = common::Path(output_path_prefix).parent().join("resources").string(); OpenDocumentReader::copy_resources(resource_path); @@ -99,27 +106,19 @@ TEST_P(OutputReferenceTests, html_meta) { config.spreadsheet_limit = TableDimensions(4000, 500); config.format_html = true; config.html_indent = 2; - std::optional html; - - if (file.file_type() == FileType::text_file) { - html = OpenDocumentReader::html(file.text_file(), output_path, config); - } else if (file.file_category() == FileCategory::document) { - auto document_file = file.document_file(); - auto document = document_file.document(); - html = OpenDocumentReader::html(document, output_path, config); - } - EXPECT_TRUE(html); - for (auto &&html_page : html->pages()) { + Html html = OpenDocumentReader::html(file, output_path, config); + + for (const HtmlPage &html_page : html.pages()) { EXPECT_TRUE(fs::is_regular_file(html_page.path)); EXPECT_LT(0, fs::file_size(html_page.path)); } } -INSTANTIATE_TEST_SUITE_P(all_test_files, OutputReferenceTests, +INSTANTIATE_TEST_SUITE_P(all_test_files, HtmlOutputTests, testing::ValuesIn(TestData::test_file_paths()), [](const ::testing::TestParamInfo &info) { - auto path = info.param; + std::string path = info.param; internal::util::string::replace_all(path, "/", "_"); internal::util::string::replace_all(path, "-", "_"); internal::util::string::replace_all(path, "+", "_");