diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 2bc94e40..3208f2be 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -72,6 +72,8 @@ jobs: -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_INSTALL_PREFIX=install -DODR_TEST=ON + -DWITH_PDF2HTMLEX=ON + -DWITH_WVWARE=ON - name: cmake if: runner.os == 'Windows' @@ -82,6 +84,8 @@ jobs: -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DODR_TEST=ON + -DWITH_PDF2HTMLEX=OFF + -DWITH_WVWARE=OFF - name: build run: cmake --build build --config Release diff --git a/CMakeLists.txt b/CMakeLists.txt index d804a522..c869e538 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" "src/odr/internal/html/pdf_file.cpp" + "src/odr/internal/html/pdf2htmlex_wrapper.hpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -194,16 +195,40 @@ target_link_libraries(odr utf8::cpp ) -if(WITH_PDF2HTMLEX) - target_sources(odr PRIVATE "src/odr/internal/html/pdf2htmlEX_wrapper.cpp") +if (WITH_PDF2HTMLEX) find_package(pdf2htmlEX REQUIRED) - target_link_libraries(odr PRIVATE pdf2htmlex::pdf2htmlex) -endif(WITH_PDF2HTMLEX) -if(WITH_WVWARE) - target_sources(odr PRIVATE "src/odr/internal/html/wvWare_wrapper.cpp") + find_package(poppler REQUIRED) + target_sources(odr + PRIVATE + "src/odr/internal/html/pdf2htmlex_wrapper.cpp" + "src/odr/internal/pdf_poppler/poppler_pdf_file.cpp" + ) + target_link_libraries(odr + PRIVATE + pdf2htmlex::pdf2htmlex + poppler::poppler + ) + target_compile_definitions(odr + PRIVATE + ODR_WITH_PDF2HTMLEX + ) +endif () +if (WITH_WVWARE) find_package(wvware REQUIRED) - target_link_libraries(odr PRIVATE wvware::wvware) -endif(WITH_WVWARE) + target_sources(odr + PRIVATE + "src/odr/internal/html/wvware_wrapper.cpp" + "src/odr/internal/oldms_wvware/wvware_oldms_file.cpp" + ) + target_link_libraries(odr + PRIVATE + 
wvware::wvware + ) + target_compile_definitions(odr + PRIVATE + ODR_WITH_WVWARE + ) +endif () if (EXISTS "${PROJECT_SOURCE_DIR}/.git") add_dependencies(odr check_git) diff --git a/scripts/conan b/scripts/conan index c40f4d8a..6107326f 100755 --- a/scripts/conan +++ b/scripts/conan @@ -1,5 +1,5 @@ #!/usr/bin/env bash -conan install . --output-folder=cmake-build-relwithdebinfo --build=missing -s build_type=Release -s "&:build_type=RelWithDebInfo" -conan install . --output-folder=cmake-build-debug --build=missing -s build_type=Release -s "&:build_type=Debug" -conan install . --output-folder=cmake-build-release --build=never -s build_type=Release -s "&:build_type=Release" +conan install . --output-folder=cmake-build-relwithdebinfo --build=missing -s build_type=RelWithDebInfo -s "&:build_type=RelWithDebInfo" +conan install . --output-folder=cmake-build-debug --build=missing -s build_type=RelWithDebInfo -s "&:build_type=Debug" +conan install . --output-folder=cmake-build-release --build=missing -s build_type=RelWithDebInfo -s "&:build_type=Release" diff --git a/src/odr/exceptions.cpp b/src/odr/exceptions.cpp index 65d97585..7b81c7ef 100644 --- a/src/odr/exceptions.cpp +++ b/src/odr/exceptions.cpp @@ -14,6 +14,9 @@ UnknownFileType::UnknownFileType() : std::runtime_error("unknown file type") {} UnsupportedFileType::UnsupportedFileType(const FileType file_type) : std::runtime_error("unknown file type"), file_type{file_type} {} +UnknownDecoderEngine::UnknownDecoderEngine() + : std::runtime_error("unknown decoder engine") {} + FileReadError::FileReadError() : std::runtime_error("file read error") {} FileWriteError::FileWriteError() : std::runtime_error("file write error") {} diff --git a/src/odr/exceptions.hpp b/src/odr/exceptions.hpp index 356d773a..eae36daf 100644 --- a/src/odr/exceptions.hpp +++ b/src/odr/exceptions.hpp @@ -28,6 +28,11 @@ struct UnsupportedFileType final : public std::runtime_error { explicit UnsupportedFileType(FileType file_type); }; +/// @brief Unknown 
decoder engine exception +struct UnknownDecoderEngine final : public std::runtime_error { + UnknownDecoderEngine(); +}; + /// @brief File read error struct FileReadError final : public std::runtime_error { FileReadError(); diff --git a/src/odr/file.cpp b/src/odr/file.cpp index 565fe8a5..75b9f944 100644 --- a/src/odr/file.cpp +++ b/src/odr/file.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -59,6 +58,12 @@ std::vector DecodedFile::types(const std::string &path) { std::make_shared(path)); } +std::vector DecodedFile::engines(const std::string &path, + FileType as) { + return internal::open_strategy::engines( + std::make_shared(path), as); +} + FileType DecodedFile::type(const std::string &path) { return DecodedFile(path).file_type(); } @@ -69,7 +74,7 @@ FileMeta DecodedFile::meta(const std::string &path) { DecodedFile::DecodedFile(std::shared_ptr impl) : m_impl{std::move(impl)} { - if (!m_impl) { + if (m_impl == nullptr) { throw UnknownFileType(); } } @@ -88,6 +93,11 @@ DecodedFile::DecodedFile(const std::string &path, FileType as) : DecodedFile(internal::open_strategy::open_file( std::make_shared(path), as)) {} +DecodedFile::DecodedFile(const std::string &path, + const DecodePreference &preference) + : DecodedFile(internal::open_strategy::open_file( + std::make_shared(path), preference)) {} + DecodedFile::operator bool() const { return m_impl.operator bool(); } FileType DecodedFile::file_type() const noexcept { return m_impl->file_type(); } @@ -98,6 +108,10 @@ FileCategory DecodedFile::file_category() const noexcept { FileMeta DecodedFile::file_meta() const noexcept { return m_impl->file_meta(); } +DecoderEngine DecodedFile::decoder_engine() const noexcept { + return m_impl->decoder_engine(); +} + File DecodedFile::file() const { return File(m_impl->file()); } bool DecodedFile::is_text_file() const { @@ -121,7 +135,8 @@ bool DecodedFile::is_document_file() const { } bool DecodedFile::is_pdf_file() const { - return 
std::dynamic_pointer_cast(m_impl) != nullptr; + return std::dynamic_pointer_cast(m_impl) != + nullptr; } TextFile DecodedFile::text_file() const { @@ -158,7 +173,7 @@ DocumentFile DecodedFile::document_file() const { PdfFile DecodedFile::pdf_file() const { if (auto pdf_file = - std::dynamic_pointer_cast(m_impl)) { + std::dynamic_pointer_cast(m_impl)) { return PdfFile(pdf_file); } throw NoPdfFile(); @@ -229,7 +244,27 @@ DocumentMeta DocumentFile::document_meta() const { Document DocumentFile::document() const { return Document(m_impl->document()); } -PdfFile::PdfFile(std::shared_ptr impl) +std::shared_ptr DocumentFile::impl() const { + return m_impl; +} + +PdfFile::PdfFile(std::shared_ptr impl) : DecodedFile(impl), m_impl{std::move(impl)} {} +bool PdfFile::password_encrypted() const { + return m_impl->password_encrypted(); +} + +EncryptionState PdfFile::encryption_state() const { + return m_impl->encryption_state(); +} + +bool PdfFile::decrypt(const std::string &password) { + return m_impl->decrypt(password); +} + +std::shared_ptr PdfFile::impl() const { + return m_impl; +} + } // namespace odr diff --git a/src/odr/file.hpp b/src/odr/file.hpp index e3e28f4d..1506fb04 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -14,11 +14,8 @@ class TextFile; class ImageFile; class ArchiveFile; class DocumentFile; -} // namespace odr::internal::abstract - -namespace odr::internal::pdf { class PdfFile; -} +} // namespace odr::internal::abstract namespace odr { class TextFile; @@ -97,6 +94,22 @@ enum class FileLocation { disk, }; +/// @brief Collection of decoder engines. +enum class DecoderEngine { + odr, + poppler, + wvware, +}; + +/// @brief Preference for decoding files. +struct DecodePreference final { + std::optional as_file_type; + std::optional with_engine; + + std::vector file_type_priority; + std::vector engine_priority; +}; + /// @brief Collection of encryption states. 
enum class EncryptionState { unknown, @@ -163,20 +176,24 @@ class File final { class DecodedFile { public: static std::vector types(const std::string &path); + static std::vector engines(const std::string &path, + FileType as); static FileType type(const std::string &path); static FileMeta meta(const std::string &path); - explicit DecodedFile(std::shared_ptr); + explicit DecodedFile(std::shared_ptr impl); explicit DecodedFile(const File &file); DecodedFile(const File &file, FileType as); explicit DecodedFile(const std::string &path); DecodedFile(const std::string &path, FileType as); + DecodedFile(const std::string &path, const DecodePreference &preference); [[nodiscard]] explicit operator bool() const; [[nodiscard]] FileType file_type() const noexcept; [[nodiscard]] FileCategory file_category() const noexcept; [[nodiscard]] FileMeta file_meta() const noexcept; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept; [[nodiscard]] File file() const; @@ -250,6 +267,8 @@ class DocumentFile final : public DecodedFile { [[nodiscard]] Document document() const; + [[nodiscard]] std::shared_ptr impl() const; + private: std::shared_ptr m_impl; }; @@ -257,10 +276,16 @@ class DocumentFile final : public DecodedFile { /// @brief Represents a PDF file. 
class PdfFile final : public DecodedFile { public: - explicit PdfFile(std::shared_ptr); + explicit PdfFile(std::shared_ptr); + + [[nodiscard]] bool password_encrypted() const; + [[nodiscard]] EncryptionState encryption_state() const; + bool decrypt(const std::string &password); + + [[nodiscard]] std::shared_ptr impl() const; private: - std::shared_ptr m_impl; + std::shared_ptr m_impl; }; } // namespace odr diff --git a/src/odr/html.cpp b/src/odr/html.cpp index 9a7fd62b..c619d33a 100644 --- a/src/odr/html.cpp +++ b/src/odr/html.cpp @@ -9,8 +9,12 @@ #include #include #include +#include #include #include +#include +#include +#include #include @@ -73,11 +77,9 @@ Html html::translate(const DecodedFile &decoded_file, } else if (decoded_file.is_image_file()) { return translate(decoded_file.image_file(), output_path, config); } else if (decoded_file.is_archive_file()) { - return translate(decoded_file.archive_file().archive(), output_path, - config); + return translate(decoded_file.archive_file(), output_path, config); } else if (decoded_file.is_document_file()) { - return translate(decoded_file.document_file().document(), output_path, - config); + return translate(decoded_file.document_file(), output_path, config); } else if (decoded_file.is_pdf_file()) { return translate(decoded_file.pdf_file(), output_path, config); } @@ -97,6 +99,44 @@ Html html::translate(const ImageFile &image_file, return internal::html::translate_image_file(image_file, output_path, config); } +Html html::translate(const ArchiveFile &archive_file, + const std::string &output_path, const HtmlConfig &config) { + return translate(archive_file.archive(), output_path, config); +} + +Html html::translate(const DocumentFile &document_file, + const std::string &output_path, const HtmlConfig &config) { + auto document_file_impl = document_file.impl(); + +#ifdef ODR_WITH_WVWARE + if (auto wv_document_file = + std::dynamic_pointer_cast( + document_file_impl)) { + fs::create_directories(output_path); + return 
internal::html::translate_wvware_oldms_file(*wv_document_file, + output_path, config); + } +#endif + + return translate(document_file.document(), output_path, config); +} + +Html html::translate(const PdfFile &pdf_file, const std::string &output_path, + const HtmlConfig &config) { + auto pdf_file_impl = pdf_file.impl(); + +#ifdef ODR_WITH_PDF2HTMLEX + if (auto poppler_pdf_file = + std::dynamic_pointer_cast(pdf_file_impl)) { + fs::create_directories(output_path); + return internal::html::translate_poppler_pdf_file(*poppler_pdf_file, + output_path, config); + } +#endif + + return internal::html::translate_pdf_file(pdf_file, output_path, config); +} + Html html::translate(const Archive &archive, const std::string &output_path, const HtmlConfig &config) { fs::create_directories(output_path); @@ -110,12 +150,6 @@ Html html::translate(const Document &document, const std::string &output_path, return internal::html::translate_document(document, output_path, config); } -Html html::translate(const PdfFile &pdf_file, const std::string &output_path, - const HtmlConfig &config) { - fs::create_directories(output_path); - return internal::html::translate_pdf_file(pdf_file, output_path, config); -} - void html::edit(const Document &document, const char *diff) { auto json = nlohmann::json::parse(diff); for (const auto &[key, value] : json["modifiedText"].items()) { diff --git a/src/odr/html.hpp b/src/odr/html.hpp index ecf9199c..7996cb18 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -144,20 +144,20 @@ Html translate(const ImageFile &image_file, const std::string &output_path, const HtmlConfig &config); /// @brief Translates an archive to HTML. /// -/// @param archive Archive to translate. +/// @param archive_file Archive file to translate. /// @param output_path Path to save the HTML output. /// @param config Configuration for the HTML output. /// @return HTML output. 
-Html translate(const Archive &archive, const std::string &output_path, +Html translate(const ArchiveFile &archive_file, const std::string &output_path, const HtmlConfig &config); /// @brief Translates a document to HTML. /// -/// @param document Document to translate. +/// @param document_file Document file to translate. /// @param output_path Path to save the HTML output. /// @param config Configuration for the HTML output. /// @return HTML output. -Html translate(const Document &document, const std::string &output_path, - const HtmlConfig &config); +Html translate(const DocumentFile &document_file, + const std::string &output_path, const HtmlConfig &config); /// @brief Translates a PDF file to HTML. /// /// @param pdf_file PDF file to translate. @@ -167,6 +167,23 @@ Html translate(const Document &document, const std::string &output_path, Html translate(const PdfFile &pdf_file, const std::string &output_path, const HtmlConfig &config); +/// @brief Translates an archive to HTML. +/// +/// @param archive Archive to translate. +/// @param output_path Path to save the HTML output. +/// @param config Configuration for the HTML output. +/// @return HTML output. +Html translate(const Archive &archive, const std::string &output_path, + const HtmlConfig &config); +/// @brief Translates a document to HTML. +/// +/// @param document Document to translate. +/// @param output_path Path to save the HTML output. +/// @param config Configuration for the HTML output. +/// @return HTML output. +Html translate(const Document &document, const std::string &output_path, + const HtmlConfig &config); + /// @brief Edits a document with a diff. /// /// @note The diff is generated by our JavaScript code in the browser. 
diff --git a/src/odr/internal/abstract/file.hpp b/src/odr/internal/abstract/file.hpp index cd86ed54..728e9d30 100644 --- a/src/odr/internal/abstract/file.hpp +++ b/src/odr/internal/abstract/file.hpp @@ -38,6 +38,7 @@ class DecodedFile { [[nodiscard]] virtual FileType file_type() const noexcept = 0; [[nodiscard]] virtual FileCategory file_category() const noexcept = 0; [[nodiscard]] virtual FileMeta file_meta() const noexcept = 0; + [[nodiscard]] virtual DecoderEngine decoder_engine() const noexcept = 0; }; class TextFile : public DecodedFile { @@ -81,6 +82,20 @@ class DocumentFile : public DecodedFile { [[nodiscard]] virtual std::shared_ptr document() const = 0; }; +class PdfFile : public DecodedFile { +public: + [[nodiscard]] FileType file_type() const noexcept final { + return FileType::portable_document_format; + } + [[nodiscard]] FileCategory file_category() const noexcept final { + return FileCategory::document; + } + + [[nodiscard]] virtual bool password_encrypted() const noexcept = 0; + [[nodiscard]] virtual EncryptionState encryption_state() const noexcept = 0; + [[nodiscard]] virtual bool decrypt(const std::string &password) = 0; +}; + } // namespace odr::internal::abstract #endif // ODR_INTERNAL_ABSTRACT_FILE_HPP diff --git a/src/odr/internal/cfb/cfb_file.cpp b/src/odr/internal/cfb/cfb_file.cpp index 7facfc25..e1e818a2 100644 --- a/src/odr/internal/cfb/cfb_file.cpp +++ b/src/odr/internal/cfb/cfb_file.cpp @@ -22,6 +22,10 @@ FileMeta CfbFile::file_meta() const noexcept { return meta; } +DecoderEngine CfbFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr CfbFile::archive() const { return std::make_shared(m_cfb); } diff --git a/src/odr/internal/cfb/cfb_file.hpp b/src/odr/internal/cfb/cfb_file.hpp index 35a7ac12..f5696f0e 100644 --- a/src/odr/internal/cfb/cfb_file.hpp +++ b/src/odr/internal/cfb/cfb_file.hpp @@ -26,6 +26,7 @@ class CfbFile final : public abstract::ArchiveFile { [[nodiscard]] FileType file_type() const 
noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr archive() const final; diff --git a/src/odr/internal/common/image_file.cpp b/src/odr/internal/common/image_file.cpp index 2f655e9e..43f14acf 100644 --- a/src/odr/internal/common/image_file.cpp +++ b/src/odr/internal/common/image_file.cpp @@ -14,6 +14,10 @@ FileType ImageFile::file_type() const noexcept { return m_file_type; } FileMeta ImageFile::file_meta() const noexcept { return {}; } +DecoderEngine ImageFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr ImageFile::image() const { return {}; } } // namespace odr::internal::common diff --git a/src/odr/internal/common/image_file.hpp b/src/odr/internal/common/image_file.hpp index da61e735..81024754 100644 --- a/src/odr/internal/common/image_file.hpp +++ b/src/odr/internal/common/image_file.hpp @@ -13,6 +13,7 @@ class ImageFile : public abstract::ImageFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr image() const final; diff --git a/src/odr/internal/csv/csv_file.cpp b/src/odr/internal/csv/csv_file.cpp index 59936046..0beaa2f1 100644 --- a/src/odr/internal/csv/csv_file.cpp +++ b/src/odr/internal/csv/csv_file.cpp @@ -24,4 +24,8 @@ FileMeta CsvFile::file_meta() const noexcept { return {FileType::comma_separated_values, false, {}}; } +DecoderEngine CsvFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::csv diff --git a/src/odr/internal/csv/csv_file.hpp b/src/odr/internal/csv/csv_file.hpp index b0fadebf..00d67dd4 100644 --- a/src/odr/internal/csv/csv_file.hpp +++ b/src/odr/internal/csv/csv_file.hpp @@ -17,6 +17,7 @@ class CsvFile final : public abstract::TextFile { [[nodiscard]] FileType 
file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/html/document.cpp b/src/odr/internal/html/document.cpp index cfa14e9d..ec20726e 100644 --- a/src/odr/internal/html/document.cpp +++ b/src/odr/internal/html/document.cpp @@ -80,7 +80,7 @@ void front(const Document &document, HtmlWriter &out, const HtmlConfig &config, out.write_body_begin(HtmlElementOptions().set_class(body_clazz)); } -void back(const Document &document, internal::html::HtmlWriter &out, +void back(const Document &document, html::HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) { (void)document; @@ -104,8 +104,8 @@ void back(const Document &document, internal::html::HtmlWriter &out, std::string fill_path_variables(const std::string &path, std::optional index = {}) { std::string result = path; - internal::util::string::replace_all(result, "{index}", - index ? std::to_string(*index) : ""); + util::string::replace_all(result, "{index}", + index ? 
std::to_string(*index) : ""); return result; } @@ -219,7 +219,7 @@ class SlideHtmlFragment final : public HtmlFragmentBase { void write_html_fragment(HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) const final { - internal::html::translate_slide(m_slide, out, config, resourceLocator); + html::translate_slide(m_slide, out, config, resourceLocator); } private: @@ -253,7 +253,7 @@ class PageHtmlFragment final : public HtmlFragmentBase { void write_html_fragment(HtmlWriter &out, const HtmlConfig &config, const HtmlResourceLocator &resourceLocator) const final { - internal::html::translate_page(m_page, out, config, resourceLocator); + html::translate_page(m_page, out, config, resourceLocator); } private: @@ -304,12 +304,11 @@ Html html::translate_document(const odr::Document &document, std::uint32_t i = 0; for (const auto &fragment : service.fragments()) { std::string filled_path = get_output_path(document, i, output_path, config); - std::ofstream ostream(filled_path); + std::ofstream ostream(filled_path, std::ios::out); if (!ostream.is_open()) { throw FileWriteError(); } - internal::html::HtmlWriter out(ostream, config.format_html, - config.html_indent); + html::HtmlWriter out(ostream, config.format_html, config.html_indent); fragment.write_html_document(out.out(), config, resourceLocator); diff --git a/src/odr/internal/html/html_writer.cpp b/src/odr/internal/html/html_writer.cpp index 5fc3ee92..540bb1fd 100644 --- a/src/odr/internal/html/html_writer.cpp +++ b/src/odr/internal/html/html_writer.cpp @@ -121,123 +121,123 @@ HtmlElementOptions::set_extra(std::optional _extra) { HtmlWriter::HtmlWriter(std::ostream &out, bool format, std::uint8_t indent, std::uint32_t current_indent) - : m_out{out}, m_format{format}, m_indent(indent, ' '), + : m_out{&out}, m_format{format}, m_indent(indent, ' '), m_current_indent{current_indent} {} HtmlWriter::HtmlWriter(std::ostream &out, const HtmlConfig &config) : HtmlWriter{out, config.format_html, 
config.html_indent} {} void HtmlWriter::write_begin() { - m_out << "\n"; - m_out << ""; + out() << "\n"; + out() << ""; } void HtmlWriter::write_end() { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_header_end() { --m_current_indent; write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_title(const std::string &title) { write_new_line(); - m_out << "" << title << ""; + out() << "" << title << ""; } void HtmlWriter::write_header_viewport(const std::string &viewport) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_header_target(const std::string &target) { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_charset(const std::string &charset) { write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_header_style(const std::string &href) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_header_style_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_script(const std::string &src) { write_new_line(); - m_out << R"("; + out() << R"("; } void HtmlWriter::write_script_begin() { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_body_begin(const HtmlElementOptions &options) { write_new_line(); ++m_current_indent; - m_out << ""; + out() << ""; } void HtmlWriter::write_body_end() { --m_current_indent; write_new_line(); - m_out << ""; + out() << ""; } void HtmlWriter::write_element_begin(const std::string &name, @@ -248,12 +248,12 @@ void HtmlWriter::write_element_begin(const std::string &name, m_stack.push_back({name, options.inline_element}); } - m_out << "<" << name; - write_element_options(m_out, options); + out() << "<" << name; + write_element_options(out(), options); if (options.close_type == 
HtmlCloseType::trailing) { - m_out << "/>"; + out() << "/>"; } else { - m_out << ">"; + out() << ">"; } } @@ -269,7 +269,7 @@ void HtmlWriter::write_element_end(const std::string &name) { } m_stack.pop_back(); - m_out << ""; + out() << ""; } bool HtmlWriter::is_inline_mode() const { @@ -286,9 +286,9 @@ void HtmlWriter::write_new_line() { return; } - m_out << '\n'; + out() << '\n'; for (std::uint32_t i = 0; i < m_current_indent; ++i) { - m_out << m_indent; + out() << m_indent; } } @@ -297,9 +297,9 @@ void HtmlWriter::write_raw(const HtmlWritable &writable, bool new_line) { write_new_line(); } - write_writable(m_out, writable); + write_writable(out(), writable); } -std::ostream &HtmlWriter::out() { return m_out; } +std::ostream &HtmlWriter::out() { return *m_out; } } // namespace odr::internal::html diff --git a/src/odr/internal/html/html_writer.hpp b/src/odr/internal/html/html_writer.hpp index 2f8158ab..85776796 100644 --- a/src/odr/internal/html/html_writer.hpp +++ b/src/odr/internal/html/html_writer.hpp @@ -88,7 +88,7 @@ class HtmlWriter { bool inline_element{false}; }; - std::ostream &m_out; + std::ostream *m_out{nullptr}; bool m_format{false}; std::string m_indent; std::uint32_t m_current_indent{0}; diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp b/src/odr/internal/html/pdf2htmlEX_wrapper.cpp deleted file mode 100644 index 169821f8..00000000 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include -#include -#include - -#include - -#include - -#include -#include - -namespace odr::internal { - -Html html::pdf2htmlEX_wrapper(const std::string &input_path, - const std::string &output_path, - const HtmlConfig &config, - std::optional &password) { - static const char *fontconfig_path = getenv("FONTCONFIG_PATH"); - if (nullptr == fontconfig_path) { - // Storage is allocated and after successful putenv, it will never be freed. - // This is the way of putenv. 
- char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); - if (0 != putenv(storage)) { - free(storage); - } - fontconfig_path = getenv("FONTCONFIG_PATH"); - } - - pdf2htmlEX::pdf2htmlEX pdf2htmlEX; - pdf2htmlEX.setDataDir(PDF2HTMLEX_DATA_DIR); - pdf2htmlEX.setPopplerDataDir(POPPLER_DATA_DIR); - - pdf2htmlEX.setInputFilename(input_path); - pdf2htmlEX.setDestinationDir(output_path); - auto output_file_name = "document.html"; - pdf2htmlEX.setOutputFilename(output_file_name); - - pdf2htmlEX.setDRM(false); - pdf2htmlEX.setProcessOutline(false); - pdf2htmlEX.setProcessAnnotation(true); - - if (password.has_value()) { - pdf2htmlEX.setOwnerPassword(password.value()); - pdf2htmlEX.setUserPassword(password.value()); - } - - try { - pdf2htmlEX.convert(); - } catch (const pdf2htmlEX::EncryptionPasswordException &e) { - throw WrongPassword(); - } catch (const pdf2htmlEX::DocumentCopyProtectedException &e) { - throw std::runtime_error("document is copy protected"); - } catch (const pdf2htmlEX::ConversionFailedException &e) { - throw std::runtime_error(std::string("conversion error ") + e.what()); - } - - return {FileType::portable_document_format, - config, - {{"document", output_path + "/" + output_file_name}}}; -} - -} // namespace odr::internal diff --git a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp b/src/odr/internal/html/pdf2htmlEX_wrapper.hpp deleted file mode 100644 index ace0e5ce..00000000 --- a/src/odr/internal/html/pdf2htmlEX_wrapper.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP -#define ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP - -#include -#include - -namespace odr { -class PdfFile; - -struct HtmlConfig; -class Html; -} // namespace odr - -namespace odr::internal::html { - -Html pdf2htmlEX_wrapper(const std::string &input_path, - const std::string &output_path, - const HtmlConfig &config, - std::optional &password); - -} - -#endif // ODR_INTERNAL_PDF2HTMLEX_WRAPPER_HPP diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.cpp 
b/src/odr/internal/html/pdf2htmlex_wrapper.cpp new file mode 100644 index 00000000..e8856b15 --- /dev/null +++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp @@ -0,0 +1,133 @@ +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace odr::internal { + +Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + PDFDoc &pdf_doc = pdf_file.pdf_doc(); + + const char *fontconfig_path = std::getenv("FONTCONFIG_PATH"); + if (fontconfig_path == nullptr) { + // Storage is allocated and after successful putenv, it will never be freed. + // This is the way of putenv. + char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); + if (0 != putenv(storage)) { + free(storage); + } + fontconfig_path = std::getenv("FONTCONFIG_PATH"); + } + + pdf2htmlEX::Param param; + + // pages + param.first_page = 1; + param.last_page = pdf_doc.getNumPages(); + + // dimension + param.zoom = 0; + param.fit_width = 0; + param.fit_height = 0; + param.use_cropbox = 1; + param.desired_dpi = 144.0; + + // output + param.embed_css = 1; + param.embed_font = 1; + param.embed_image = 1; + param.embed_javascript = 1; + param.embed_outline = 1; + param.split_pages = 0; + param.dest_dir = output_path; + param.css_filename = ""; + param.page_filename = ""; + param.outline_filename = ""; + param.process_nontext = 1; + param.process_outline = 1; + param.process_annotation = 0; + param.process_form = 0; + param.printing = 1; + param.fallback = 0; + param.tmp_file_size_limit = -1; + + // font + param.embed_external_font = 0; // TODO 1 + param.font_format = "woff"; + param.decompose_ligature = 0; + param.turn_off_ligatures = 0; + param.auto_hint = 0; + param.external_hint_tool = ""; + param.stretch_narrow_glyph = 0; + param.squeeze_wide_glyph = 1; + param.override_fstype = 0; + param.process_type3 = 0; + + // text + param.h_eps = 1.0; + param.v_eps = 1.0; + 
param.space_threshold = 1.0 / 8; + param.font_size_multiplier = 4.0; + param.space_as_offset = 0; + param.tounicode = 0; + param.optimize_text = 0; + param.correct_text_visibility = 1; + param.text_dpi = 300; + + // background + param.bg_format = "png"; + param.svg_node_count_limit = -1; + param.svg_embed_bitmap = 1; + + // encryption + param.owner_password = ""; + param.user_password = ""; + param.no_drm = 0; + + // misc + param.clean_tmp = 1; + param.tmp_dir = "/tmp"; + param.data_dir = PDF2HTMLEX_DATA_DIR; + param.poppler_data_dir = POPPLER_DATA_DIR; + param.debug = 0; + param.proof = 0; + param.quiet = 1; + + // input, output + param.input_filename = ""; + param.output_filename = "document.html"; + + if (!pdf_doc.okToCopy()) { + if (param.no_drm == 0) { + throw DocumentCopyProtectedException(""); + } + } + + globalParams = std::make_unique( + !param.poppler_data_dir.empty() ? param.poppler_data_dir.c_str() + : nullptr); + + // TODO not sure what the `progPath` is used for. it cannot be `nullptr` + // TODO potentially just a cache dir? 
+ pdf2htmlEX::HTMLRenderer(fontconfig_path, param).process(&pdf_doc); + + globalParams.reset(); + + return {FileType::portable_document_format, + config, + {{"document", output_path + "/document.html"}}}; +} + +} // namespace odr::internal diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.hpp b/src/odr/internal/html/pdf2htmlex_wrapper.hpp new file mode 100644 index 00000000..6d8589f9 --- /dev/null +++ b/src/odr/internal/html/pdf2htmlex_wrapper.hpp @@ -0,0 +1,29 @@ +#ifndef ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP +#define ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP + +#include +#include + +namespace odr { +struct HtmlConfig; +class Html; +} // namespace odr + +namespace odr::internal { +class PopplerPdfFile; +} // namespace odr::internal + +namespace odr::internal::html { + +Html translate_poppler_pdf_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config); + +class DocumentCopyProtectedException : public std::runtime_error { +public: + using std::runtime_error::runtime_error; +}; + +} // namespace odr::internal::html + +#endif // ODR_INTERNAL_HTML_PDF2HTMLEX_WRAPPER_HPP diff --git a/src/odr/internal/html/pdf_file.hpp b/src/odr/internal/html/pdf_file.hpp index 6df0f55a..068b822a 100644 --- a/src/odr/internal/html/pdf_file.hpp +++ b/src/odr/internal/html/pdf_file.hpp @@ -1,5 +1,5 @@ -#ifndef ODR_INTERNAL_PDF_FILE_HPP -#define ODR_INTERNAL_PDF_FILE_HPP +#ifndef ODR_INTERNAL_HTML_PDF_FILE_HPP +#define ODR_INTERNAL_HTML_PDF_FILE_HPP #include @@ -17,4 +17,4 @@ Html translate_pdf_file(const PdfFile &pdf_file, const std::string &output_path, } -#endif // ODR_INTERNAL_PDF_FILE_HPP +#endif // ODR_INTERNAL_HTML_PDF_FILE_HPP diff --git a/src/odr/internal/html/wvWare_wrapper.cpp b/src/odr/internal/html/wvWare_wrapper.cpp deleted file mode 100644 index 1908836f..00000000 --- a/src/odr/internal/html/wvWare_wrapper.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - 
-namespace odr::internal::html { - -Html wvWare_wrapper(const std::string &input_path, - const std::string &output_path, const HtmlConfig &config, - std::optional &password) { - if (nullptr == g_wvDataDir) { - g_wvDataDir = WVDATADIR; - } - - auto output_file_path = output_path + "/document.html"; - - char *input_file_path = strdup(input_path.c_str()); - char *output_dir = strdup(output_path.c_str()); - - g_htmlOutputFileHandle = fopen(output_file_path.c_str(), "w"); - - std::string pw; - if (password.has_value()) { - pw = password.value(); - } - int retVal = wvHtml_convert(input_file_path, output_dir, pw.c_str()); - free(output_dir); - free(input_file_path); - fclose(g_htmlOutputFileHandle); - g_htmlOutputFileHandle = nullptr; - - if (0 != retVal) { - unlink(output_file_path.c_str()); - - switch (retVal) { - case 100: // PasswordRequired - case 101: // Wrong Password - throw WrongPassword(); - default: - throw std::runtime_error("Conversion error"); - } - } - - return { - FileType::legacy_word_document, config, {{"document", output_file_path}}}; -} - -} // namespace odr::internal::html diff --git a/src/odr/internal/html/wvWare_wrapper.hpp b/src/odr/internal/html/wvWare_wrapper.hpp deleted file mode 100644 index e7000901..00000000 --- a/src/odr/internal/html/wvWare_wrapper.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef ODR_INTERNAL_WVWARE_WRAPPER_HPP -#define ODR_INTERNAL_WVWARE_WRAPPER_HPP - -#include -#include - -namespace odr { -class File; - -struct HtmlConfig; -class Html; -} // namespace odr - -namespace odr::internal::html { - -Html wvWare_wrapper(const std::string &input_path, - const std::string &output_path, const HtmlConfig &config, - std::optional &password); - -} - -#endif // ODR_INTERNAL_WVWARE_WRAPPER_HPP diff --git a/src/odr/internal/html/wvware_wrapper.cpp b/src/odr/internal/html/wvware_wrapper.cpp new file mode 100644 index 00000000..4821c259 --- /dev/null +++ b/src/odr/internal/html/wvware_wrapper.cpp @@ -0,0 +1,850 @@ +#include + +#include 
+#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace odr::internal { + +/// A lot of this code is duplicated from wvWare, mostly from `wvWare.c` and +/// `wvHtml.c`. +/// +/// wvWare is writing to stdout, while we want to write to a file. Also, wvWare +/// is configurable to write not only HTML but also other formats. We only need +/// HTML. +/// +/// We decided to duplicate the code instead of changing upstream wvWare code +/// because it is rather an application not a library, it is quite outdated and +/// not actively developed, and written in C. Duplication allows for a clean +/// separation between wvWare and our code while also being able to write modern +/// C++ code. +/// +/// A copy of wvWare can be found here: +/// https://github.com/opendocument-app/wvWare +namespace { + +/// Extension of `expand_data` see +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wv.h#L2776-L2814 +/// to allow for more state variables. +struct TranslationState : public expand_data { + explicit TranslationState(html::HtmlWriter _out) + : expand_data{}, out(std::move(_out)) {} + + char *charset = nullptr; + PAP *ppap = nullptr; + + struct { + int message = 0; + } special_char_handler_state = {}; + + std::size_t figure_number = 0; + + html::HtmlWriter out; +}; + +/// Originally from `text.c` `wvConvertUnicodeToHtml` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L1999-L2154 +int convert_unicode_to_html(wvParseStruct *ps, std::uint16_t char16) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + switch (char16) { + case 11: + out.out() << "
"; + return 1; + case 31: /* non-required hyphen */ + out.out() << "­"; /*vladimir@lukianov.name HTML 4.01 spec*/ + return 1; + case 30: + case 45: + case 0x2013: + out.out() << "-"; /* en-dash */ + return 1; + case 12: + case 13: + case 14: + case 7: + return 1; + case 34: + out.out() << """; + return 1; + case 38: + out.out() << "&"; + return 1; + case 60: + out.out() << "<"; + return 1; + case 62: + out.out() << ">"; + return 1; + /* + german characters, im assured that this is the right way to handle them + by Markus Schulte + + As the output encoding for HTML was chosen as UTF-8, + we don't need Ä etc. etc. I removed all but sz + -- MV 6.4.2000 + */ + + case 0xdf: + out.out() << "ß"; + return 1; + /* end german characters */ + case 0x2026: +#if 0 +/* +this just looks awful in netscape 4.5, so im going to do a very foolish +thing and just put ... instead of this +*/ + printf ("…"); +/*is there a proper html name for ... &ellipse;? Yes, … -- MV */ +#endif + out.out() << "…"; + return 1; + case 0x2019: + out.out() << "'"; + return 1; + case 0x2215: + out.out() << "/"; + return 1; + case 0xF8E7: /* without this, things should work in theory, but not for me */ + out.out() << "_"; + return 1; + case 0x2018: + out.out() << "`"; + return 1; + + /* Windows specials (MV): */ + case 0x0160: + out.out() << "Š"; + return 1; + case 0x0161: + out.out() << "š"; + return 1; + case 0x2014: + out.out() << "—"; + return 1; + case 0x201c: + out.out() << "“"; /* inverted double quotation mark */ + return 1; + case 0x201d: + out.out() << "”"; /* double q.m. */ + return 1; + case 0x201e: + out.out() << "„"; /* below double q.m. 
*/ + return 1; + case 0x2020: + out.out() << "†"; + return 1; + case 0x2021: + out.out() << "‡"; + return 1; + case 0x2022: + out.out() << "•"; + return 1; + case 0x0152: + out.out() << "Œ"; + return 1; + case 0x0153: + out.out() << "œ"; + return 1; + case 0x0178: + out.out() << "Ÿ"; + return 1; + case 0x2030: + out.out() << "‰"; + return 1; + case 0x20ac: + out.out() << "€"; + return 1; + + /* Mac specials (MV): */ + case 0xf020: + out.out() << " "; + return 1; + case 0xf02c: + out.out() << ","; + return 1; + case 0xf028: + out.out() << "("; + return 1; + + case 0xf03e: + out.out() << ">"; + return 1; + case 0xf067: + out.out() << "γ"; + return 1; + case 0xf064: + out.out() << "δ"; + return 1; + case 0xf072: + out.out() << "ρ"; + return 1; + case 0xf073: + out.out() << "σ"; + return 1; + case 0xf0ae: + out.out() << "→"; /* right arrow */ + return 1; + case 0xf0b6: + out.out() << "∂"; /* partial deriv. */ + return 1; + case 0xf0b3: + out.out() << "≥"; + return 1; + default: + break; + } + /* Debugging aid: */ + /* if (char16 >= 0x100) printf("[%x]", char16); */ + return 0; +} + +/// Originally from `text.c` `wvOutputFromUnicode` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/text.c#L757-L840 +void output_from_unicode(wvParseStruct *ps, std::uint16_t eachchar, + char *outputtype) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + GIConv g_iconv_handle = (GIConv)-1; + int need_swapping; + gchar *ibuf, *obuf; + std::size_t ibuflen, obuflen, len, count, i; + std::uint8_t buffer[2], buffer2[5]; + + if (convert_unicode_to_html(ps, eachchar) != 0) { + return; + } + + { + g_iconv_handle = g_iconv_open(outputtype, "UCS-2"); + if (g_iconv_handle == (GIConv)-1) { + std::cerr << "g_iconv_open fail: " << errno + << ", cannot convert UCS-2 to " << outputtype << "\n"; + out.out() << "?"; + return; + } + + /* Determining if unicode biteorder is swapped (glibc < 2.2) */ + need_swapping = 1; + + buffer[0] = 
0x20; + buffer[1] = 0; + ibuf = reinterpret_cast(buffer); + obuf = reinterpret_cast(buffer2); + ibuflen = 2; + obuflen = 5; + + count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); + if (count != (std::size_t)-1) { + need_swapping = buffer2[0] != 0x20; + } + } + + if (need_swapping) { + buffer[0] = (eachchar >> 8) & 0x00ff; + buffer[1] = eachchar & 0x00ff; + } else { + buffer[0] = eachchar & 0x00ff; + buffer[1] = (eachchar >> 8) & 0x00ff; + } + + ibuf = reinterpret_cast(buffer); + obuf = reinterpret_cast(buffer2); + + ibuflen = 2; + len = obuflen = 5; + + count = g_iconv(g_iconv_handle, &ibuf, &ibuflen, &obuf, &obuflen); + if (count == (std::size_t)-1) { + std::cerr << "iconv failed, errno: " << errno << ", char: 0x" << std::hex + << eachchar << ", UCS-2 -> " << outputtype << "\n"; + + /* I'm torn here - do i just announce the failure, continue, or copy over to + * the other buffer? */ + + /* errno is usually 84 (illegal byte sequence) + should i reverse the bytes and try again? */ + out.out() << ibuf[1]; + } else { + len = len - obuflen; + + for (i = 0; i < len; i++) { + out.out() << buffer2[i]; + } + } + + // TODO iconv could be cached + { g_iconv_close(g_iconv_handle); } +} + +/// Originally from `wvWare.c` `wvStrangeNoGraphicData` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L661-L676 +/// simplified to HTML output +void strange_no_graphic_data(wvParseStruct *ps, int graphicstype) { + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + std::cerr << "Strange No Graphic Data in the 0x01/0x08 graphic\n"; + + // TODO + out.out() << R"()
)"; +} + +/// Originally from `wvWare.c` `wvPrintGraphics` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1239-L1287 +/// simplified to HTML output +void print_graphics(wvParseStruct *ps, int graphicstype, int width, int height, + const std::string &source) { + // upstream converts to PNG, we just use the original format as the browser + // should support them + + auto *data = (TranslationState *)ps->userData; + auto &out = data->out; + + // TODO export/embed image + + out.out() << R"()
)"; +} + +void handle_bitmap(wvParseStruct * /*ps*/, const std::string &name, + BitmapBlip *bitmap) { + wvStream *pwv = bitmap->m_pvBits; + FILE *fd = nullptr; + std::size_t size = 0, i; + + fd = fopen(name.c_str(), "wb"); + if (fd == nullptr) { + throw std::runtime_error("Cannot open " + name + " file for writing"); + } + size = wvStream_size(pwv); + wvStream_rewind(pwv); + + for (i = 0; i < size; i++) { + fputc(read_8ubit(pwv), fd); + } + fclose(fd); +} + +int handle_metafile(wvParseStruct * /*ps*/, const char *name, + MetaFileBlip *bitmap) { + wvStream *pwv = bitmap->m_pvBits; + FILE *fd = nullptr; + std::size_t size = 0, i; + std::uint8_t decompressf = 0; + + fd = fopen(name, "wb"); + if (fd == nullptr) { + fprintf(stderr, "\nCannot open %s for writing\n", name); + exit(1); + } + size = wvStream_size(pwv); + wvStream_rewind(pwv); + + if (bitmap->m_fCompression == msocompressionDeflate) { + decompressf = setdecom(); + } + + if (!decompressf) { + for (i = 0; i < size; i++) { + fputc(read_8ubit(pwv), fd); + } + } else /* decompress here */ + { + FILE *tmp = tmpfile(); + FILE *out = tmpfile(); + + for (i = 0; i < size; i++) { + fputc(read_8ubit(pwv), tmp); + } + + rewind(tmp); + decompress(tmp, out, bitmap->m_cbSave, bitmap->m_cb); + fclose(tmp); + + rewind(out); + + for (i = 0; i < bitmap->m_cb; i++) { + fputc(fgetc(out), fd); + } + + fclose(out); + } + + fclose(fd); + return 0; +} + +std::string figure_name(wvParseStruct *ps) { + auto *data = (TranslationState *)ps->userData; + + std::size_t number = data->figure_number++; + std::string name = "figure" + std::to_string(number); + + return name; +} + +std::string html_graphic(wvParseStruct *ps, Blip *blip) { + std::string name; + wvStream *fd; + char test[3]; + + name = figure_name(ps); + + /* + temp hack to test older included bmps in word 6 and 7, + should be wrapped in a modern escher strucure before getting + to here, and then handled as normal + */ + switch (blip->type) { + case msoblipJPEG: + case 
msoblipDIB: + case msoblipPNG: + fd = (blip->blip.bitmap.m_pvBits); + test[2] = '\0'; + test[0] = (char)read_8ubit(fd); + + test[1] = (char)read_8ubit(fd); + wvStream_rewind(fd); + if (!(strcmp(test, "BM"))) { + name += ".bmp"; + handle_bitmap(ps, name, &blip->blip.bitmap); + return name; + } + default: + break; + } + + switch (blip->type) { + case msoblipWMF: + name += ".wmf"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); + break; + case msoblipEMF: + name += ".emf"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); + break; + case msoblipPICT: + name += ".pict"; + handle_metafile(ps, name.c_str(), &blip->blip.metafile); + break; + case msoblipJPEG: + name += ".jpg"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); + break; + case msoblipDIB: + name += ".dib"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); + break; + case msoblipPNG: + name += ".png"; + handle_bitmap(ps, name.c_str(), &blip->blip.bitmap); + break; + } + return name; +} + +/// Originally from `wvWare.c` `myelehandler` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L503-L599 +int element_handler(wvParseStruct *ps, wvTag tag, void *props, int /*dirty*/) { + auto *data = (TranslationState *)ps->userData; + data->anSttbfAssoc = &ps->anSttbfAssoc; + data->lfo = &ps->lfo; + data->lfolvl = ps->lfolvl; + data->lvl = ps->lvl; + data->nolfo = &ps->nolfo; + data->nooflvl = &ps->nooflvl; + data->stsh = &ps->stsh; + data->lst = &ps->lst; + data->noofLST = &ps->noofLST; + data->liststartnos = &ps->liststartnos; + data->listnfcs = &ps->listnfcs; + data->finallvl = &ps->finallvl; + data->fib = &ps->fib; + data->dop = &ps->dop; + data->intable = &ps->intable; + data->cellbounds = &ps->cellbounds; + data->nocellbounds = &ps->nocellbounds; + data->endcell = &ps->endcell; + data->vmerges = &ps->vmerges; + data->norows = &ps->norows; + data->nextpap = &ps->nextpap; + if (data->charset == nullptr) { + data->charset = 
wvAutoCharset(ps); + } + data->props = props; + + switch (tag) { + case PARABEGIN: { + S16 tilfo = 0; + /* test begin */ + if (*(data->endcell) != 0) { + tilfo = ((PAP *)(data->props))->ilfo; + ((PAP *)(data->props))->ilfo = 0; + } + /* test end */ + data->ppap = (PAP *)data->props; + wvBeginPara(data); + if (tilfo != 0) { + ((PAP *)(data->props))->ilfo = tilfo; + } + } break; + case PARAEND: { + S16 tilfo = 0; + /* test begin */ + if (*(data->endcell) != 0) { + tilfo = ((PAP *)(data->props))->ilfo; + ((PAP *)(data->props))->ilfo = 0; + } + /* test end */ + wvEndCharProp(data); /* danger will break in the future */ + wvEndPara(data); + if (tilfo != 0) { + ((PAP *)(data->props))->ilfo = tilfo; + } + wvCopyPAP(&data->lastpap, (PAP *)(data->props)); + } break; + case CHARPROPBEGIN: + wvBeginCharProp(data, data->ppap); + break; + case CHARPROPEND: + wvEndCharProp(data); + break; + case SECTIONBEGIN: + wvBeginSection(data); + break; + case SECTIONEND: + wvEndSection(data); + break; + case COMMENTBEGIN: + wvBeginComment(data); + break; + case COMMENTEND: + wvEndComment(data); + break; + default: + break; + } + return 0; +} + +/// Originally from `wvWare.c` `mydochandler` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L601-L659 +int document_handler(wvParseStruct *ps, wvTag tag) { + auto *data = (TranslationState *)ps->userData; + data->anSttbfAssoc = &ps->anSttbfAssoc; + data->lfo = &ps->lfo; + data->lfolvl = ps->lfolvl; + data->lvl = ps->lvl; + data->nolfo = &ps->nolfo; + data->nooflvl = &ps->nooflvl; + data->stsh = &ps->stsh; + data->lst = &ps->lst; + data->noofLST = &ps->noofLST; + data->liststartnos = &ps->liststartnos; + data->listnfcs = &ps->listnfcs; + data->finallvl = &ps->finallvl; + data->fib = &ps->fib; + data->dop = &ps->dop; + data->intable = &ps->intable; + data->cellbounds = &ps->cellbounds; + data->nocellbounds = &ps->nocellbounds; + data->endcell = &ps->endcell; + data->vmerges = &ps->vmerges; + 
data->norows = &ps->norows; + + wvSetEntityConverter(data); + data->filename = ps->filename; + data->whichcell = 0; + data->whichrow = 0; + data->asep = nullptr; + wvInitPAP(&data->lastpap); + data->nextpap = nullptr; + data->ps = ps; + + if (data->charset == nullptr) { + data->charset = wvAutoCharset(ps); + } + + switch (tag) { + case DOCBEGIN: + wvBeginDocument(data); + break; + case DOCEND: + wvEndDocument(data); + break; + default: + break; + } + + return 0; +} + +/// Originally from `wvWare.c` `myCharProc` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1556-L1605 +int char_handler(wvParseStruct *ps, std::uint16_t eachchar, + std::uint8_t chartype, std::uint16_t lid) { + auto *data = (TranslationState *)ps->userData; + + switch (eachchar) { + case 19: + ps->fieldstate++; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, chartype, lid); /* temp */ + return 0; + case 20: + fieldCharProc(ps, eachchar, chartype, lid); + ps->fieldmiddle = 1; + return 0; + case 21: + ps->fieldmiddle = 0; + ps->fieldstate--; + fieldCharProc(ps, eachchar, chartype, lid); /* temp */ + return 0; + case 0x08: + std::cerr << "hmm did we loose the fSpec flag ?, this is possibly a bug\n"; + break; + default: + break; + } + + if (ps->fieldstate != 0 && fieldCharProc(ps, eachchar, chartype, lid) != 0) { + return 0; + } + + // from `wvOutputHtmlChar` + { + char *outputtype = + data->charset != nullptr ? 
data->charset : wvAutoCharset(ps); + if (chartype != 0) { + eachchar = wvHandleCodePage(eachchar, lid); + } + output_from_unicode(ps, eachchar, outputtype); + } + + return 0; +} + +/// Originally from `wvWare.c` `mySpecCharProc` +/// https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1289-L1553 +int special_char_handler(wvParseStruct *ps, std::uint16_t eachchar, CHP *achp) { + auto *data = (TranslationState *)ps->userData; + auto &state = data->special_char_handler_state; + auto &out = data->out; + + PICF picf; + FSPA *fspa = nullptr; + + switch (eachchar) { + case 19: + // field began + ps->fieldstate++; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + return 0; + case 20: + if (achp->fOle2 != 0) { + std::cerr << "this field has an associated embedded object of id " + << achp->fcPic_fcObj_lTagObj << "\n"; + } + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + ps->fieldmiddle = 1; + return 0; + case 21: + ps->fieldstate--; + ps->fieldmiddle = 0; + fieldCharProc(ps, eachchar, 0, 0x400); /* temp */ + return 0; + default: + break; + } + + if (ps->fieldstate) { + if (fieldCharProc(ps, eachchar, 0, 0x400)) + return 0; + } + + switch (eachchar) { + case 0x05: + /* this should be handled by the COMMENTBEGIN and COMMENTEND events */ + return 0; + case 0x01: { + wvStream *f; + Blip blip; + long p = wvStream_tell(ps->data); + + if (achp->fOle2 != 0) { + return 0; + } + + wvStream_goto(ps->data, achp->fcPic_fcObj_lTagObj); + wvGetPICF(wvQuerySupported(&ps->fib, nullptr), &picf, ps->data); + f = picf.rgb; + if (wv0x01(&blip, f, picf.lcb - picf.cbHeader) != 0) { + std::string name = html_graphic(ps, &blip); + print_graphics(ps, 0x01, (int)wvTwipsToHPixels(picf.dxaGoal), + (int)wvTwipsToVPixels(picf.dyaGoal), name); + } else { + strange_no_graphic_data(ps, 0x01); + } + + wvStream_goto(ps->data, p); + return 0; + } + case 0x08: { + Blip blip; + if (wvQuerySupported(&ps->fib, nullptr) == WORD8) { + if 
(ps->nooffspa > 0) { + fspa = + wvGetFSPAFromCP(ps->currentcp, ps->fspa, ps->fspapos, ps->nooffspa); + + if (fspa == nullptr) { + std::cerr << "No fspa! Insanity abounds!\n"; + return 0; + } + + data->props = fspa; + if (wv0x08(&blip, (int)fspa->spid, ps) != 0) { + std::string name = html_graphic(ps, &blip); + print_graphics( + ps, 0x08, + (int)wvTwipsToHPixels((short)(fspa->xaRight - fspa->xaLeft)), + (int)wvTwipsToVPixels((short)(fspa->yaBottom - fspa->yaTop)), + name); + } else { + strange_no_graphic_data(ps, 0x08); + } + } else { + std::cerr << "nooffspa was <=0! Ignoring.\n"; + } + } else { + std::cerr << "pre word8 0x08 graphic, unsupported at the moment\n"; + FDOA *fdoa = + wvGetFDOAFromCP(ps->currentcp, ps->fdoa, ps->fdoapos, ps->nooffdoa); + data->props = fdoa; + } + + // Potentially relevant disabled code section in `wvWare.c`? + // https://github.com/opendocument-app/wvWare/blob/c015326b001f1ad6dfb1f5e718461c16c56cca5f/wvWare.c#L1443-L1459 + + return 0; + } + case 0x28: { + std::uint16_t symbol[6] = {'S', 'y', 'm', 'b', 'o', 'l'}; + std::uint16_t wingdings[9] = {'W', 'i', 'n', 'g', 'd', 'i', 'n', 'g', 's'}; + std::uint16_t mtextra[8] = {'M', 'T', ' ', 'E', 'x', 't', 'r', 'a'}; + + if (0 == memcmp(symbol, ps->fonts.ffn[achp->ftcSym].xszFfn, 12)) { + if ((state.message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { + std::cerr + << "Symbol font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ +option to support correct symbol font conversion to a viewable format.\n"; + state.message++; + } + output_from_unicode(ps, wvConvertSymbolToUnicode(achp->xchSym - 61440), + data->charset); + return 0; + } else if (0 == memcmp(mtextra, ps->fonts.ffn[achp->ftcSym].xszFfn, 16)) { + if ((state.message == 0) && (strcasecmp("UTF-8", data->charset) != 0)) { + std::cerr + << "MT Extra font detected (too late sorry!), rerun wvHtml with option --charset utf-8\n\ +option to support correct symbol font conversion to a viewable format.\n"; + 
state.message++; + } + output_from_unicode(ps, wvConvertMTExtraToUnicode(achp->xchSym - 61440), + data->charset); + return 0; + } else if (0 == memcmp(wingdings, ps->fonts.ffn[achp->ftcSym].xszFfn, 18)) { + if (state.message == 0) { + std::cerr << "Wingdings font detected, i need a mapping table to " + "unicode for this\n"; + state.message++; + } + } else { + if (state.message == 0) { + char *fontname = wvWideStrToMB(ps->fonts.ffn[achp->ftcSym].xszFfn); + std::cerr << "Special font " << fontname + << ", I need a mapping table to unicode for this\n"; + wvFree(fontname); + out.out() << "*"; + state.message++; + } + return 0; + } + } + default: + break; + } + + return 0; +} + +} // namespace + +Html html::translate_wvware_oldms_file( + const WvWareLegacyMicrosoftFile &oldms_file, const std::string &output_path, + const HtmlConfig &config) { + HtmlResourceLocator resourceLocator = + local_resource_locator(output_path, config); + + std::string output_file_path = output_path + "/document.html"; + + std::ofstream ostream(output_file_path, std::ios::out); + if (!ostream.is_open()) { + throw FileWriteError(); + } + html::HtmlWriter out(ostream, config.format_html, config.html_indent); + + wvParseStruct &ps = oldms_file.parse_struct(); + + wvSetElementHandler(&ps, element_handler); + wvSetDocumentHandler(&ps, document_handler); + wvSetCharHandler(&ps, char_handler); + wvSetSpecialCharHandler(&ps, special_char_handler); + + state_data handle; + TranslationState translation_state(out); + + wvInitStateData(&handle); + + translation_state.sd = &handle; + ps.userData = &translation_state; + + out.write_begin(); + out.write_header_begin(); + out.write_header_charset("UTF-8"); + out.write_header_target("_blank"); + out.write_header_title("odr"); + out.write_header_viewport( + "width=device-width,initial-scale=1.0,user-scalable=yes"); + out.write_header_end(); + out.write_body_begin(); + + if (wvHtml(&ps) != 0) { + throw std::runtime_error("wvHtml failed"); + } + + 
out.write_body_end(); + out.write_end(); + + return { + FileType::legacy_word_document, config, {{"document", output_file_path}}}; +} + +} // namespace odr::internal diff --git a/src/odr/internal/html/wvware_wrapper.hpp b/src/odr/internal/html/wvware_wrapper.hpp new file mode 100644 index 00000000..b7061ea4 --- /dev/null +++ b/src/odr/internal/html/wvware_wrapper.hpp @@ -0,0 +1,23 @@ +#ifndef ODR_INTERNAL_WVWARE_WRAPPER_HPP +#define ODR_INTERNAL_WVWARE_WRAPPER_HPP + +#include + +namespace odr { +struct HtmlConfig; +class Html; +} // namespace odr + +namespace odr::internal { +class WvWareLegacyMicrosoftFile; +} // namespace odr::internal + +namespace odr::internal::html { + +Html translate_wvware_oldms_file(const WvWareLegacyMicrosoftFile &oldms_file, + const std::string &output_path, + const HtmlConfig &config); + +} + +#endif // ODR_INTERNAL_WVWARE_WRAPPER_HPP diff --git a/src/odr/internal/json/json_file.cpp b/src/odr/internal/json/json_file.cpp index f19ffca1..71b35024 100644 --- a/src/odr/internal/json/json_file.cpp +++ b/src/odr/internal/json/json_file.cpp @@ -24,4 +24,8 @@ FileMeta JsonFile::file_meta() const noexcept { return {FileType::javascript_object_notation, false, {}}; } +DecoderEngine JsonFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::json diff --git a/src/odr/internal/json/json_file.hpp b/src/odr/internal/json/json_file.hpp index 29872e23..6640d2bc 100644 --- a/src/odr/internal/json/json_file.hpp +++ b/src/odr/internal/json/json_file.hpp @@ -17,6 +17,7 @@ class JsonFile final : public abstract::TextFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/odf/odf_file.cpp b/src/odr/internal/odf/odf_file.cpp index 12cf46f9..5e85d90a 100644 --- a/src/odr/internal/odf/odf_file.cpp +++ 
b/src/odr/internal/odf/odf_file.cpp @@ -43,6 +43,10 @@ FileType OpenDocumentFile::file_type() const noexcept { FileMeta OpenDocumentFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine OpenDocumentFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType OpenDocumentFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/odf/odf_file.hpp b/src/odr/internal/odf/odf_file.hpp index 684ac403..8d50525e 100644 --- a/src/odr/internal/odf/odf_file.hpp +++ b/src/odr/internal/odf/odf_file.hpp @@ -28,6 +28,8 @@ class OpenDocumentFile final : public virtual abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/oldms/oldms_file.cpp b/src/odr/internal/oldms/oldms_file.cpp index 9b5a9f0f..a8433df0 100644 --- a/src/odr/internal/oldms/oldms_file.cpp +++ b/src/odr/internal/oldms/oldms_file.cpp @@ -8,11 +8,6 @@ #include #include -namespace odr::internal::abstract { -class Document; -class File; -} // namespace odr::internal::abstract - namespace odr::internal::oldms { namespace { @@ -62,6 +57,10 @@ FileType LegacyMicrosoftFile::file_type() const noexcept { FileMeta LegacyMicrosoftFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine LegacyMicrosoftFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType LegacyMicrosoftFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/oldms/oldms_file.hpp b/src/odr/internal/oldms/oldms_file.hpp index f3651c7e..7950b3e3 100644 --- a/src/odr/internal/oldms/oldms_file.hpp +++ b/src/odr/internal/oldms/oldms_file.hpp @@ -25,6 +25,8 @@ class 
LegacyMicrosoftFile final : public abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp new file mode 100644 index 00000000..a7bbdd55 --- /dev/null +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.cpp @@ -0,0 +1,141 @@ +#include + +#include + +#include +#include +#include + +namespace odr::internal { + +struct WvWareLegacyMicrosoftFile::ParserState { + GsfInput *gsf_input{}; + + wvParseStruct ps{}; + int encryption_flag{}; +}; + +WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( + std::shared_ptr file) + : m_file{std::move(file)} { + GError *error = nullptr; + + m_parser_state = std::make_shared(); + + m_parser_state->gsf_input = + gsf_input_stdio_new(m_file->disk_path()->string().c_str(), &error); + + if (m_parser_state->gsf_input == nullptr) { + throw std::runtime_error("gsf_input_stdio_new failed"); + } + + open(); +} + +WvWareLegacyMicrosoftFile::WvWareLegacyMicrosoftFile( + std::shared_ptr file) + : m_file{std::move(file)} { + m_parser_state = std::make_shared(); + + m_parser_state->gsf_input = gsf_input_memory_new( + reinterpret_cast(m_file->memory_data()), + static_cast(m_file->size()), false); + + open(); +} + +WvWareLegacyMicrosoftFile::~WvWareLegacyMicrosoftFile() { + wvOLEFree(&m_parser_state->ps); +} + +void WvWareLegacyMicrosoftFile::open() { + wvInit(); + + int ret = wvInitParser_gsf(&m_parser_state->ps, m_parser_state->gsf_input); + + // check if password is required + if ((ret & 0x8000) != 0) { + m_encryption_state = EncryptionState::encrypted; + m_parser_state->encryption_flag = ret & 0x7fff; + + if ((m_parser_state->encryption_flag == WORD8) || + 
(m_parser_state->encryption_flag == WORD7) || + (m_parser_state->encryption_flag == WORD6)) { + ret = 0; + } + } else { + m_encryption_state = EncryptionState::not_encrypted; + } + + if (ret != 0) { + wvOLEFree(&m_parser_state->ps); + throw std::runtime_error("wvInitParser failed"); + } +} + +std::shared_ptr +WvWareLegacyMicrosoftFile::file() const noexcept { + return m_file; +} + +FileType WvWareLegacyMicrosoftFile::file_type() const noexcept { + return FileType::legacy_word_document; +} + +FileMeta WvWareLegacyMicrosoftFile::file_meta() const noexcept { + return {file_type(), password_encrypted(), document_meta()}; +} + +DecoderEngine WvWareLegacyMicrosoftFile::decoder_engine() const noexcept { + return DecoderEngine::wvware; +} + +DocumentType WvWareLegacyMicrosoftFile::document_type() const { + return DocumentType::text; +} + +DocumentMeta WvWareLegacyMicrosoftFile::document_meta() const { return {}; } + +bool WvWareLegacyMicrosoftFile::password_encrypted() const noexcept { + return m_encryption_state == EncryptionState::encrypted || + m_encryption_state == EncryptionState::decrypted; +} + +EncryptionState WvWareLegacyMicrosoftFile::encryption_state() const noexcept { + return m_encryption_state; +} + +bool WvWareLegacyMicrosoftFile::decrypt(const std::string &password) { + if (m_encryption_state != EncryptionState::encrypted) { + return false; + } + + wvSetPassword(password.c_str(), &m_parser_state->ps); + + bool success = false; + + if (m_parser_state->encryption_flag == WORD8) { + success = wvDecrypt97(&m_parser_state->ps) == 0; + } else if (m_parser_state->encryption_flag == WORD7 || + m_parser_state->encryption_flag == WORD6) { + success = wvDecrypt95(&m_parser_state->ps) == 0; + } + + if (!success) { + return false; + } + + m_encryption_state = EncryptionState::decrypted; + return true; +} + +std::shared_ptr +WvWareLegacyMicrosoftFile::document() const { + return {}; // TODO throw +} + +wvParseStruct &WvWareLegacyMicrosoftFile::parse_struct() const { + 
return m_parser_state->ps; +} + +} // namespace odr::internal diff --git a/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp new file mode 100644 index 00000000..efc7a1fc --- /dev/null +++ b/src/odr/internal/oldms_wvware/wvware_oldms_file.hpp @@ -0,0 +1,57 @@ +#ifndef ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP +#define ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP + +#include + +#include +#include + +#include + +struct _wvParseStruct; +using wvParseStruct = struct _wvParseStruct; + +namespace odr::internal::common { +class DiskFile; +class MemoryFile; +} // namespace odr::internal::common + +namespace odr::internal { + +class WvWareLegacyMicrosoftFile final : public abstract::DocumentFile { +public: + explicit WvWareLegacyMicrosoftFile(std::shared_ptr file); + explicit WvWareLegacyMicrosoftFile(std::shared_ptr file); + ~WvWareLegacyMicrosoftFile() final; + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileType file_type() const noexcept final; + [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + [[nodiscard]] DocumentType document_type() const final; + [[nodiscard]] DocumentMeta document_meta() const final; + + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + bool decrypt(const std::string &password) final; + + [[nodiscard]] std::shared_ptr document() const final; + + [[nodiscard]] wvParseStruct &parse_struct() const; + +private: + struct ParserState; + + std::shared_ptr m_file; + std::shared_ptr m_parser_state; + + EncryptionState m_encryption_state{EncryptionState::unknown}; + + void open(); +}; + +} // namespace odr::internal + +#endif // ODR_INTERNAL_WVWARE_OLDMS_FILE_HPP diff --git a/src/odr/internal/ooxml/ooxml_file.cpp b/src/odr/internal/ooxml/ooxml_file.cpp index 2cf85092..6a1404be 100644 --- 
a/src/odr/internal/ooxml/ooxml_file.cpp +++ b/src/odr/internal/ooxml/ooxml_file.cpp @@ -37,6 +37,10 @@ FileType OfficeOpenXmlFile::file_type() const noexcept { FileMeta OfficeOpenXmlFile::file_meta() const noexcept { return m_file_meta; } +DecoderEngine OfficeOpenXmlFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + DocumentType OfficeOpenXmlFile::document_type() const { return m_file_meta.document_meta->document_type; } diff --git a/src/odr/internal/ooxml/ooxml_file.hpp b/src/odr/internal/ooxml/ooxml_file.hpp index b3ab97a1..e6b420fe 100644 --- a/src/odr/internal/ooxml/ooxml_file.hpp +++ b/src/odr/internal/ooxml/ooxml_file.hpp @@ -27,6 +27,8 @@ class OfficeOpenXmlFile final : public abstract::DocumentFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + [[nodiscard]] DocumentType document_type() const final; [[nodiscard]] DocumentMeta document_meta() const final; diff --git a/src/odr/internal/open_strategy.cpp b/src/odr/internal/open_strategy.cpp index 37df3282..335efd43 100644 --- a/src/odr/internal/open_strategy.cpp +++ b/src/odr/internal/open_strategy.cpp @@ -12,17 +12,19 @@ #include #include #include +#include #include #include +#include #include #include -#include +#include namespace odr::internal { std::vector -open_strategy::types(std::shared_ptr file) { +open_strategy::types(const std::shared_ptr &file) { std::vector result; auto file_type = magic::file_type(*file); @@ -93,6 +95,24 @@ open_strategy::types(std::shared_ptr file) { return result; } +std::vector +open_strategy::engines(const std::shared_ptr & /*file*/, + FileType as) { + std::vector result; + + result.push_back(DecoderEngine::odr); + + if (as == FileType::legacy_word_document) { + result.push_back(DecoderEngine::wvware); + } + + if (as == FileType::portable_document_format) { + result.push_back(DecoderEngine::poppler); + } + + return 
result; +} + std::unique_ptr open_strategy::open_file(std::shared_ptr file) { auto file_type = magic::file_type(*file); @@ -133,7 +153,7 @@ open_strategy::open_file(std::shared_ptr file) { return cfb_file; } else if (file_type == FileType::portable_document_format) { - return std::make_unique(file); + return std::make_unique(file); } else if (file_type == FileType::portable_network_graphics || file_type == FileType::graphics_interchange_format || file_type == FileType::jpeg || @@ -167,10 +187,227 @@ open_strategy::open_file(std::shared_ptr file) { } std::unique_ptr -open_strategy::open_file(std::shared_ptr /*file*/, - const FileType /*as*/) { - // TODO implement - throw UnknownFileType(); +open_strategy::open_file(std::shared_ptr file, FileType as) { + DecodePreference preference; + preference.as_file_type = as; + return open_file(file, preference); +} + +std::unique_ptr +open_strategy::open_file(std::shared_ptr file, FileType as, + DecoderEngine with) { + if (as == FileType::opendocument_text || + as == FileType::opendocument_presentation || + as == FileType::opendocument_spreadsheet || + as == FileType::opendocument_graphics) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto zip_file = std::make_unique(std::move(memory_file)); + auto filesystem = zip_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::office_open_xml_document || + as == FileType::office_open_xml_presentation || + as == FileType::office_open_xml_workbook || + as == FileType::office_open_xml_encrypted) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto zip_file = std::make_unique(std::move(memory_file)); + auto filesystem = zip_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) 
{ + } + try { + auto memory_file = std::make_shared(*file); + auto cfb_file = std::make_unique(std::move(memory_file)); + auto filesystem = cfb_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::legacy_word_document || + as == FileType::legacy_powerpoint_presentation || + as == FileType::legacy_excel_worksheets) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + auto cfb_file = std::make_unique(std::move(memory_file)); + auto filesystem = cfb_file->archive()->filesystem(); + return std::make_unique(filesystem); + } catch (...) { + } + return nullptr; + } +#ifdef ODR_WITH_WVWARE + if (with == DecoderEngine::wvware) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique( + std::move(memory_file)); + } catch (...) { + } + return nullptr; + } +#endif + return nullptr; + } + + if (as == FileType::portable_document_format) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file); + } catch (...) { + } + return nullptr; + } +#ifdef ODR_WITH_PDF2HTMLEX + if (with == DecoderEngine::poppler) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } +#endif + return nullptr; + } + + if (as == FileType::portable_network_graphics || + as == FileType::graphics_interchange_format || as == FileType::jpeg || + as == FileType::bitmap_image_file) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file, as); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::starview_metafile) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) 
{ + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::text_file) { + if (with == DecoderEngine::odr) { + try { + return std::make_unique(file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::comma_separated_values) { + if (with == DecoderEngine::odr) { + try { + auto text = std::make_shared(file); + return std::make_unique(text); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::javascript_object_notation) { + if (with == DecoderEngine::odr) { + try { + auto text = std::make_shared(file); + return std::make_unique(text); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::zip) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) { + } + return nullptr; + } + return nullptr; + } + + if (as == FileType::compound_file_binary_format) { + if (with == DecoderEngine::odr) { + try { + auto memory_file = std::make_shared(*file); + return std::make_unique(memory_file); + } catch (...) 
{ + } + return nullptr; + } + return nullptr; + } + + return nullptr; +} + +std::unique_ptr +open_strategy::open_file(std::shared_ptr file, + const DecodePreference &preference) { + std::vector probe_types; + if (preference.as_file_type.has_value()) { + probe_types.push_back(*preference.as_file_type); + } else { + std::vector detected_types = types(file); + probe_types.insert(probe_types.end(), detected_types.begin(), + detected_types.end()); + auto probe_types_end = std::unique(probe_types.begin(), probe_types.end()); + probe_types.erase(probe_types_end, probe_types.end()); + } + + for (FileType as : probe_types) { + std::vector probe_engines; + if (preference.with_engine.has_value()) { + probe_engines.push_back(*preference.with_engine); + } else { + std::vector detected_engines = engines(file, as); + probe_engines.insert(probe_engines.end(), detected_engines.begin(), + detected_engines.end()); + auto probe_engines_end = + std::unique(probe_engines.begin(), probe_engines.end()); + probe_engines.erase(probe_engines_end, probe_engines.end()); + } + + for (DecoderEngine with : probe_engines) { + auto decoded_file = open_file(file, as, with); + if (decoded_file != nullptr) { + return decoded_file; + } + } + } + + return nullptr; } std::unique_ptr diff --git a/src/odr/internal/open_strategy.hpp b/src/odr/internal/open_strategy.hpp index 1661cd0e..704c28e0 100644 --- a/src/odr/internal/open_strategy.hpp +++ b/src/odr/internal/open_strategy.hpp @@ -6,7 +6,9 @@ namespace odr { enum class FileType; -} +enum class DecoderEngine; +struct DecodePreference; +} // namespace odr namespace odr::internal::abstract { class File; @@ -19,13 +21,23 @@ class Path; } // namespace odr::internal::common namespace odr::internal::open_strategy { -std::vector types(std::shared_ptr file); +std::vector +types(const std::shared_ptr &file); +std::vector +engines(const std::shared_ptr &file, FileType as); std::unique_ptr open_file(std::shared_ptr file); std::unique_ptr open_file(std::shared_ptr 
file, FileType as); +std::unique_ptr +open_file(std::shared_ptr file, FileType as, + DecoderEngine with); +std::unique_ptr +open_file(std::shared_ptr file, + const DecodePreference &preference); + std::unique_ptr open_document_file(std::shared_ptr file); } // namespace odr::internal::open_strategy diff --git a/src/odr/internal/pdf/pdf_file.cpp b/src/odr/internal/pdf/pdf_file.cpp index 00260c0f..a1ba8e56 100644 --- a/src/odr/internal/pdf/pdf_file.cpp +++ b/src/odr/internal/pdf/pdf_file.cpp @@ -1,22 +1,26 @@ #include -namespace odr::internal::pdf { +namespace odr::internal { PdfFile::PdfFile(std::shared_ptr file) : m_file{std::move(file)} {} -FileCategory PdfFile::file_category() const noexcept { - return FileCategory::document; -} - std::shared_ptr PdfFile::file() const noexcept { return m_file; } -FileType PdfFile::file_type() const noexcept { - return FileType::portable_document_format; +FileMeta PdfFile::file_meta() const noexcept { return {}; } + +DecoderEngine PdfFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + +bool PdfFile::password_encrypted() const noexcept { return false; } + +EncryptionState PdfFile::encryption_state() const noexcept { + return EncryptionState::not_encrypted; } -FileMeta PdfFile::file_meta() const noexcept { return {}; } +bool PdfFile::decrypt(const std::string &) { return false; } -} // namespace odr::internal::pdf +} // namespace odr::internal diff --git a/src/odr/internal/pdf/pdf_file.hpp b/src/odr/internal/pdf/pdf_file.hpp index 42b751be..a19f89f1 100644 --- a/src/odr/internal/pdf/pdf_file.hpp +++ b/src/odr/internal/pdf/pdf_file.hpp @@ -3,22 +3,25 @@ #include -namespace odr::internal::pdf { +namespace odr::internal { -class PdfFile : public abstract::DecodedFile { +class PdfFile final : public abstract::PdfFile { public: explicit PdfFile(std::shared_ptr file); [[nodiscard]] std::shared_ptr file() const noexcept final; - [[nodiscard]] FileType file_type() const noexcept final; - [[nodiscard]] FileCategory 
file_category() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + [[nodiscard]] bool decrypt(const std::string &password) final; private: std::shared_ptr m_file; }; -} // namespace odr::internal::pdf +} // namespace odr::internal #endif // ODR_INTERNAL_PDF_FILE_HPP diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp new file mode 100644 index 00000000..fed671df --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp @@ -0,0 +1,84 @@ +#include + +#include +#include +#include + +namespace odr::internal { + +PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) + : m_file{std::move(file)} { + open(std::nullopt); +} + +PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) + : m_file{std::move(file)} { + open(std::nullopt); +} + +void PopplerPdfFile::open(const std::optional &password) { + std::optional password_goo; + if (password.has_value()) { + password_goo = GooString(password.value().c_str()); + } + + if (auto disk_file = std::dynamic_pointer_cast(m_file)) { + auto file_path_goo = + std::make_unique(disk_file->disk_path()->string().c_str()); + m_pdf_doc = std::make_shared(std::move(file_path_goo), password_goo, + password_goo); + } else if (auto memory_file = + std::dynamic_pointer_cast(m_file)) { + // `stream` is freed by `m_pdf_doc` + auto stream = new MemStream(memory_file->memory_data(), 0, + memory_file->size(), Object(objNull)); + m_pdf_doc = std::make_shared(stream, password_goo, password_goo); + } else { + throw std::runtime_error("Unsupported file type"); + } + + if (!m_pdf_doc->isOk()) { + if (m_pdf_doc->getErrorCode() == errEncrypted) { + m_encryption_state = EncryptionState::encrypted; + } else { + throw std::runtime_error("Failed to open PDF 
file"); + } + } else { + m_encryption_state = m_pdf_doc->isEncrypted() + ? EncryptionState::decrypted + : EncryptionState::not_encrypted; + } +} + +std::shared_ptr PopplerPdfFile::file() const noexcept { + return m_file; +} + +FileMeta PopplerPdfFile::file_meta() const noexcept { return {}; } + +DecoderEngine PopplerPdfFile::decoder_engine() const noexcept { + return DecoderEngine::poppler; +} + +bool PopplerPdfFile::password_encrypted() const noexcept { + return m_encryption_state == EncryptionState::encrypted || + m_encryption_state == EncryptionState::decrypted; +} + +EncryptionState PopplerPdfFile::encryption_state() const noexcept { + return m_encryption_state; +} + +bool PopplerPdfFile::decrypt(const std::string &password) { + if (encryption_state() != EncryptionState::encrypted) { + return false; + } + + open(password); + + return encryption_state() == EncryptionState::decrypted; +} + +PDFDoc &PopplerPdfFile::pdf_doc() const { return *m_pdf_doc; } + +} // namespace odr::internal diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp new file mode 100644 index 00000000..47f7866e --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp @@ -0,0 +1,39 @@ +#ifndef ODR_INTERNAL_POPPLER_PDF_FILE_HPP +#define ODR_INTERNAL_POPPLER_PDF_FILE_HPP + +#include + +#include + +class PDFDoc; + +namespace odr::internal { + +class PopplerPdfFile final : public abstract::PdfFile { +public: + explicit PopplerPdfFile(std::shared_ptr file); + explicit PopplerPdfFile(std::shared_ptr file); + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + [[nodiscard]] bool password_encrypted() const noexcept final; + [[nodiscard]] EncryptionState encryption_state() const noexcept final; + [[nodiscard]] bool decrypt(const std::string &password) final; + + [[nodiscard]] PDFDoc 
&pdf_doc() const; + +private: + std::shared_ptr m_file; + std::shared_ptr m_pdf_doc; + + EncryptionState m_encryption_state{EncryptionState::unknown}; + + void open(const std::optional &password); +}; + +} // namespace odr::internal + +#endif // ODR_INTERNAL_POPPLER_PDF_FILE_HPP diff --git a/src/odr/internal/svm/svm_file.cpp b/src/odr/internal/svm/svm_file.cpp index fefc604f..47db2716 100644 --- a/src/odr/internal/svm/svm_file.cpp +++ b/src/odr/internal/svm/svm_file.cpp @@ -31,6 +31,10 @@ FileMeta SvmFile::file_meta() const noexcept { return result; } +DecoderEngine SvmFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr SvmFile::image() const { return {}; } } // namespace odr::internal::svm diff --git a/src/odr/internal/svm/svm_file.hpp b/src/odr/internal/svm/svm_file.hpp index 60e1f806..dd28cd6d 100644 --- a/src/odr/internal/svm/svm_file.hpp +++ b/src/odr/internal/svm/svm_file.hpp @@ -20,6 +20,7 @@ class SvmFile final : public abstract::ImageFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr image() const final; diff --git a/src/odr/internal/text/text_file.cpp b/src/odr/internal/text/text_file.cpp index a6908f5d..7fc489c1 100644 --- a/src/odr/internal/text/text_file.cpp +++ b/src/odr/internal/text/text_file.cpp @@ -22,4 +22,8 @@ FileMeta TextFile::file_meta() const noexcept { return {FileType::text_file, false, {}}; } +DecoderEngine TextFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + } // namespace odr::internal::text diff --git a/src/odr/internal/text/text_file.hpp b/src/odr/internal/text/text_file.hpp index f0e84271..69bad36b 100644 --- a/src/odr/internal/text/text_file.hpp +++ b/src/odr/internal/text/text_file.hpp @@ -18,6 +18,7 @@ class TextFile final : public abstract::TextFile { [[nodiscard]] FileType file_type() const noexcept 
final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; private: std::shared_ptr m_file; diff --git a/src/odr/internal/zip/zip_file.cpp b/src/odr/internal/zip/zip_file.cpp index b0fb59a7..496a986a 100644 --- a/src/odr/internal/zip/zip_file.cpp +++ b/src/odr/internal/zip/zip_file.cpp @@ -23,6 +23,10 @@ FileMeta ZipFile::file_meta() const noexcept { return meta; } +DecoderEngine ZipFile::decoder_engine() const noexcept { + return DecoderEngine::odr; +} + std::shared_ptr ZipFile::archive() const { return std::make_shared(m_zip); } diff --git a/src/odr/internal/zip/zip_file.hpp b/src/odr/internal/zip/zip_file.hpp index 6d11c195..fde6f6db 100644 --- a/src/odr/internal/zip/zip_file.hpp +++ b/src/odr/internal/zip/zip_file.hpp @@ -27,6 +27,7 @@ class ZipFile final : public abstract::ArchiveFile { [[nodiscard]] FileType file_type() const noexcept final; [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; [[nodiscard]] std::shared_ptr archive() const final; diff --git a/src/odr/open_document_reader.cpp b/src/odr/open_document_reader.cpp index 04d8d865..d2da7fa6 100644 --- a/src/odr/open_document_reader.cpp +++ b/src/odr/open_document_reader.cpp @@ -164,15 +164,51 @@ std::string OpenDocumentReader::type_to_string(const FileType type) noexcept { } } +std::string OpenDocumentReader::engine_to_string(const DecoderEngine engine) { + if (engine == DecoderEngine::odr) { + return "odr"; + } else if (engine == DecoderEngine::poppler) { + return "poppler"; + } else if (engine == DecoderEngine::wvware) { + return "wvware"; + } + throw UnknownDecoderEngine(); +} + +DecoderEngine OpenDocumentReader::engine_by_name(const std::string &name) { + if (name == "odr") { + return DecoderEngine::odr; + } else if (name == "poppler") { + return DecoderEngine::poppler; + } else if (name == "wvware") { + return DecoderEngine::wvware; + } + throw 
UnknownDecoderEngine(); +} + std::vector OpenDocumentReader::types(const std::string &path) { - File file(path); - return internal::open_strategy::types(file.impl()); + return DecodedFile::types(path); +} + +std::vector OpenDocumentReader::engines(const std::string &path, + const FileType as) { + return DecodedFile::engines(path, as); } DecodedFile OpenDocumentReader::open(const std::string &path) { return DecodedFile(path); } +DecodedFile OpenDocumentReader::open(const std::string &path, + const FileType as) { + return DecodedFile(path, as); +} + +DecodedFile OpenDocumentReader::open(const std::string &path, + const DecodePreference &preference) { + return DecodedFile(path, preference); +} + Html OpenDocumentReader::html(const std::string &path, const PasswordCallback &password_callback, const std::string &output_path, diff --git a/src/odr/open_document_reader.hpp b/src/odr/open_document_reader.hpp index e73c993e..a6893f65 100644 --- a/src/odr/open_document_reader.hpp +++ b/src/odr/open_document_reader.hpp @@ -8,6 +8,8 @@ namespace odr { enum class FileType; enum class FileCategory; +enum class DecoderEngine; +struct DecodePreference; class File; class DecodedFile; class TextFile; @@ -45,14 +47,41 @@ class OpenDocumentReader final { /// @return The file type as a string. [[nodiscard]] static std::string type_to_string(FileType type) noexcept; + /// @brief Get the decoder engine as a string. + /// @param engine The decoder engine. + /// @return The decoder engine as a string. + [[nodiscard]] static std::string engine_to_string(DecoderEngine engine); + /// @brief Get the decoder engine by the name. + /// @param engine The name of the decoder engine. + /// @return The decoder engine. + [[nodiscard]] static DecoderEngine engine_by_name(const std::string &engine); + /// @brief Get the file types by the file path. /// @param path The file path. /// @return The file types. 
[[nodiscard]] static std::vector types(const std::string &path); + /// @brief Get the decoder engines for a file path and file type. + /// @param path The file path. + /// @param as The file type. + /// @return The decoder engines. + [[nodiscard]] static std::vector + engines(const std::string &path, FileType as); + /// @brief Open a file. /// @param path The file path. /// @return The decoded file. [[nodiscard]] static DecodedFile open(const std::string &path); + /// @brief Open a file. + /// @param path The file path. + /// @param as The file type. + /// @return The decoded file. + [[nodiscard]] static DecodedFile open(const std::string &path, FileType as); + /// @brief Open a file. + /// @param path The file path. + /// @param preference The decode preference. + /// @return The decoded file. + [[nodiscard]] static DecodedFile open(const std::string &path, + const DecodePreference &preference); /// @brief Translates a file to HTML. /// diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e8f623e9..16c988cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -63,13 +63,4 @@ target_link_libraries(odr_test odr ) -if(WITH_PDF2HTMLEX) - target_sources(odr_test PRIVATE "src/pdf2htmlEX_wrapper_test.cpp") - target_link_libraries(odr_test PRIVATE pdf2htmlex::pdf2htmlex) -endif(WITH_PDF2HTMLEX) -if(WITH_WVWARE) - target_sources(odr_test PRIVATE "src/wvWare_wrapper_test.cpp") - target_link_libraries(odr_test PRIVATE wvware::wvware) -endif(WITH_WVWARE) - gtest_add_tests(TARGET odr_test) diff --git a/test/data/input/odr-private b/test/data/input/odr-private index a997171b..2e0f2f9a 160000 --- a/test/data/input/odr-private +++ b/test/data/input/odr-private @@ -1 +1 @@ -Subproject commit a997171b727f230c4a81421d43e2ed62f37b94ca +Subproject commit 2e0f2f9ac0af7b3fd11a3f808e0ac2cf479c6b25 diff --git a/test/data/input/odr-public b/test/data/input/odr-public index c2cc81ba..99f85ddc 160000 --- a/test/data/input/odr-public +++ b/test/data/input/odr-public @@ -1 +1 
@@ -Subproject commit c2cc81ba91b6145ff51801644169f4f01878556b +Subproject commit 99f85ddc0ab26c83759ab6de544fac82b85e5cc8 diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index b1d06179..118b6afa 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit b1d061790ee59b5ded4c3b970dd0a5c453d65b96 +Subproject commit 118b6afae107a2326f5eb70e3536e209751eb079 diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index c3b3d0b1..76d0a13e 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit c3b3d0b160c4bb34ee3ca9b7e61cff504335cbc5 +Subproject commit 76d0a13e5d69081fc41cf2cfc296fb1bd85156f8 diff --git a/test/docker/README.md b/test/docker/README.md new file mode 100644 index 00000000..bc41a717 --- /dev/null +++ b/test/docker/README.md @@ -0,0 +1,5 @@ +# Manually build the image + +```bash +docker build --tag odr_core_test test/docker +``` diff --git a/test/docker/compare_output_server.sh b/test/docker/compare_output_server.sh index 6a4651ee..57b2f2b0 100755 --- a/test/docker/compare_output_server.sh +++ b/test/docker/compare_output_server.sh @@ -1,11 +1,17 @@ #!/usr/bin/env bash REF="test/data/reference-output/" -OBS="cmake-build-debug/test/output/" +OBS="cmake-build-relwithdebinfo/test/output/" DRIVER="firefox" -# manually build the image -#docker build --tag odr_core_test test/docker +if [ ! -d "$REF" ]; then + echo "Reference output directory does not exist: $REF" + exit 1 +fi +if [ ! 
-d "$OBS" ]; then + echo "Observed output directory does not exist: $OBS" + exit 1 +fi docker run -ti \ -v $(pwd):/repo \ diff --git a/test/scripts/html_render_diff.py b/test/scripts/html_render_diff.py index a945e78f..4879883f 100755 --- a/test/scripts/html_render_diff.py +++ b/test/scripts/html_render_diff.py @@ -29,7 +29,7 @@ def screenshot(browser, url): loaded_page_settling_time = 0 # Selenium doesn't like when we try to screenshot element of documents generated by pdf2htmlEX - if 'output-pdf2htmlEX' in url: + if 'poppler' in url: target_find_by = By.ID target = 'page-container' loaded_page_settling_time = 1 diff --git a/test/src/document_test.cpp b/test/src/document_test.cpp index 5f56abac..983ffb8a 100644 --- a/test/src/document_test.cpp +++ b/test/src/document_test.cpp @@ -107,7 +107,7 @@ TEST(Document, edit_ods_diff) { DocumentFile document_file( TestData::test_file_path("odr-public/ods/pages.ods")); document_file.decrypt( - TestData::test_file("odr-public/ods/pages.ods").password); + TestData::test_file("odr-public/ods/pages.ods").password.value()); Document document = document_file.document(); html::edit(document, diff); diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index bea05802..7fc2f3bf 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -21,27 +21,32 @@ using namespace odr::internal; using namespace odr::test; namespace fs = std::filesystem; -using HtmlOutputTests = ::testing::TestWithParam; +struct TestParams { + TestFile test_file; + std::string path; + DecoderEngine engine{DecoderEngine::odr}; + std::string test_repo; + std::string output_path; + std::string output_path_prefix; +}; -TEST_P(HtmlOutputTests, html_meta) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); +using HtmlOutputTests = ::testing::TestWithParam; - const std::string test_repo = *common::Path(test_file_path).begin(); +TEST_P(HtmlOutputTests, html_meta) { 
+ const TestParams ¶ms = GetParam(); + const TestFile &test_file = params.test_file; + const DecoderEngine engine = params.engine; + const std::string &test_repo = params.test_repo; + const std::string &output_path = params.output_path; const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); + common::Path(output_path).parent().string(); - std::cout << test_file.path << " to " << output_path << std::endl; + std::cout << test_file.short_path << " to " << output_path << std::endl; // TODO compare guessed file type VS actual file type // these files cannot be opened - if (util::string::ends_with(test_file.path, ".sxw") || - (test_file.type == FileType::legacy_word_document) || + if (util::string::ends_with(test_file.short_path, ".sxw") || (test_file.type == FileType::legacy_powerpoint_presentation) || (test_file.type == FileType::legacy_excel_worksheets) || (test_file.type == FileType::word_perfect) || @@ -50,12 +55,17 @@ TEST_P(HtmlOutputTests, html_meta) { } // TODO fix - if ((test_file.type == FileType::portable_document_format) && + if ((engine == DecoderEngine::odr) && + (test_file.type == FileType::portable_document_format) && (test_repo != "odr-public")) { GTEST_SKIP(); } - const DecodedFile file{test_file.path}; + DecodePreference decode_preference; + decode_preference.as_file_type = test_file.type; + decode_preference.with_engine = engine; + DecodedFile file = + OpenDocumentReader::open(test_file.absolute_path, decode_preference); FileMeta file_meta = file.file_meta(); @@ -71,16 +81,32 @@ TEST_P(HtmlOutputTests, html_meta) { GTEST_SKIP(); } + // TODO check wvware decryption + if ((test_file.type == FileType::legacy_word_document) && + (engine == DecoderEngine::wvware)) { + GTEST_SKIP(); + } + if (file.is_document_file()) { DocumentFile document_file = file.document_file(); - 
EXPECT_EQ(test_file.password_encrypted, document_file.password_encrypted()); - if (document_file.password_encrypted()) { - EXPECT_TRUE(document_file.decrypt(test_file.password)); + EXPECT_EQ(test_file.password.has_value(), + document_file.password_encrypted()); + if (test_file.password.has_value()) { + EXPECT_TRUE(document_file.decrypt(test_file.password.value())); } EXPECT_EQ(test_file.type, document_file.file_type()); } + if (file.is_pdf_file()) { + PdfFile pdf_file = file.pdf_file(); + + EXPECT_EQ(test_file.password.has_value(), pdf_file.password_encrypted()); + if (test_file.password.has_value()) { + EXPECT_TRUE(pdf_file.decrypt(test_file.password.value())); + } + } + fs::create_directories(output_path); file_meta = file.file_meta(); @@ -94,8 +120,11 @@ TEST_P(HtmlOutputTests, html_meta) { EXPECT_LT(0, fs::file_size(meta_output)); } - const std::string resource_path = - common::Path(output_path_prefix).parent().join("resources").string(); + const std::string resource_path = common::Path(output_path_prefix) + .parent() + .parent() + .join("resources") + .string(); OpenDocumentReader::copy_resources(resource_path); HtmlConfig config; @@ -115,15 +144,72 @@ TEST_P(HtmlOutputTests, html_meta) { } } +namespace { + +std::string engine_suffix(const DecoderEngine engine) { + return engine == DecoderEngine::odr + ? 
"" + : "-" + OpenDocumentReader::engine_to_string(engine); +} + +std::string test_params_to_name(const TestParams ¶ms) { + std::string path = params.path + engine_suffix(params.engine); + internal::util::string::replace_all(path, "/", "_"); + internal::util::string::replace_all(path, "-", "_"); + internal::util::string::replace_all(path, "+", "_"); + internal::util::string::replace_all(path, ".", "_"); + internal::util::string::replace_all(path, " ", "_"); + internal::util::string::replace_all(path, "$", ""); + return path; +} + +TestParams create_test_params(const TestFile &test_file, + const DecoderEngine engine) { + const std::string test_file_path = test_file.short_path; + + const std::string test_repo = *common::Path(test_file_path).begin(); + const std::string output_path_prefix = + common::Path("output").join(test_repo).join("output").string(); + const std::string output_path_suffix = engine_suffix(engine); + const std::string output_path = + common::Path(output_path_prefix) + .join(common::Path(test_file_path).rebase(test_repo)) + .string() + + output_path_suffix; + + return { + .test_file = test_file, + .path = test_file_path, + .engine = engine, + .test_repo = test_repo, + .output_path = output_path, + .output_path_prefix = output_path_prefix, + }; +} + +std::vector list_test_params() { + std::vector params; + for (const TestFile &test_file : TestData::test_files()) { + std::vector engines = {DecoderEngine::odr}; + if (test_file.type == FileType::portable_document_format) { + engines.push_back(DecoderEngine::poppler); + } + if (test_file.type == FileType::legacy_word_document) { + engines.clear(); + engines.push_back(DecoderEngine::wvware); + } + + for (const DecoderEngine engine : engines) { + params.push_back(create_test_params(test_file, engine)); + } + } + return params; +} + +} // namespace + INSTANTIATE_TEST_SUITE_P(all_test_files, HtmlOutputTests, - testing::ValuesIn(TestData::test_file_paths()), - [](const ::testing::TestParamInfo &info) { - 
std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", "_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; + testing::ValuesIn(list_test_params()), + [](const ::testing::TestParamInfo &info) { + return test_params_to_name(info.param); }); diff --git a/test/src/pdf2htmlEX_wrapper_test.cpp b/test/src/pdf2htmlEX_wrapper_test.cpp deleted file mode 100644 index e29c82c3..00000000 --- a/test/src/pdf2htmlEX_wrapper_test.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -using namespace odr; -using namespace odr::test; -using namespace odr::internal; -using namespace odr::test; -namespace fs = std::filesystem; - -using pdf2htmlEXWrapperTests = ::testing::TestWithParam; - -TEST_P(pdf2htmlEXWrapperTests, html) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); - - const std::string test_repo = *common::Path(test_file_path).begin(); - const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output-pdf2htmlEX").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); - - std::cout << test_file.path << " to " << output_path << std::endl; - - fs::create_directories(output_path); - HtmlConfig config; - std::optional password; - - if (test_file.password_encrypted) { - password = test_file.password; - } - // @TODO: why does test_file.password_encrypted == false for this file?? 
- else if (test_file.path.ends_with("encrypted_fontfile3_opentype.pdf")) { - password = "sample-user-password"; - } - - Html html = odr::internal::html::pdf2htmlEX_wrapper( - test_file.path, output_path, config, password); - for (const HtmlPage &html_page : html.pages()) { - EXPECT_TRUE(fs::is_regular_file(html_page.path)); - EXPECT_LT(0, fs::file_size(html_page.path)); - } -} - -INSTANTIATE_TEST_SUITE_P(pdf2htmlEX_test_files, pdf2htmlEXWrapperTests, - testing::ValuesIn(TestData::test_file_paths( - FileType::portable_document_format)), - [](const ::testing::TestParamInfo &info) { - std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", "_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; - }); diff --git a/test/src/test_util.cpp b/test/src/test_util.cpp index 764ad4ea..0863a093 100644 --- a/test/src/test_util.cpp +++ b/test/src/test_util.cpp @@ -18,23 +18,29 @@ namespace fs = std::filesystem; namespace odr::test { namespace { -TestFile get_test_file(std::string input) { - const FileType type = - OpenDocumentReader::type_by_extension(common::Path(input).extension()); - const std::string file_name = fs::path(input).filename().string(); - std::string password; - if (const auto left = file_name.find('$'), right = file_name.rfind('$'); + +TestFile get_test_file(const std::string &root_path, + std::string absolute_path) { + const FileType type = OpenDocumentReader::type_by_extension( + common::Path(absolute_path).extension()); + + std::string short_path = absolute_path.substr(root_path.size() + 1); + + std::optional password; + const std::string filename = fs::path(absolute_path).filename().string(); + if (const auto left = filename.find('$'), right = filename.rfind('$'); (left != std::string::npos) && (left 
!= right)) { - password = file_name.substr(left, right); + password = filename.substr(left, right); } - const bool encrypted = !password.empty(); - return {std::move(input), type, encrypted, std::move(password)}; + return {std::move(absolute_path), std::move(short_path), type, + std::move(password)}; } -std::vector get_test_files(const std::string &input_path) { +std::vector get_test_files(const std::string &root_path, + const std::string &input_path) { if (fs::is_regular_file(input_path)) { - return {get_test_file(input_path)}; + return {get_test_file(root_path, input_path)}; } if (!fs::is_directory(input_path)) { return {}; @@ -45,21 +51,22 @@ std::vector get_test_files(const std::string &input_path) { const std::string index_path = input_path + "/index.csv"; if (fs::is_regular_file(index_path)) { for (const auto &row : csv::CSVReader(index_path)) { - const std::string path = input_path + "/" + row["path"].get<>(); - const FileType type = + std::string absolute_path = input_path + "/" + row["path"].get<>(); + std::string short_path = absolute_path.substr(root_path.size() + 1); + FileType type = OpenDocumentReader::type_by_extension(row["type"].get<>()); - std::string password = row["password"].get<>(); - const bool encrypted = !password.empty(); - const std::string file_name = fs::path(path).filename().string(); + std::optional password = row["encrypted"].get<>() == "yes" + ? 
row["password"].get<>() + : std::optional(); if (type == FileType::unknown) { continue; } - result.emplace_back(path, type, encrypted, std::move(password)); + result.emplace_back(std::move(absolute_path), std::move(short_path), type, + std::move(password)); } } - // TODO this will also recurse `.git` for (auto &&p : fs::recursive_directory_iterator(input_path)) { if (!p.is_regular_file()) { continue; @@ -68,15 +75,17 @@ std::vector get_test_files(const std::string &input_path) { if (path == index_path) { continue; } - - if (const auto it = - std::find_if(std::begin(result), std::end(result), - [&](auto &&file) { return file.path == path; }); + if (p.path().filename().string().starts_with(".")) { + continue; + } + if (const auto it = std::find_if( + std::begin(result), std::end(result), + [&](auto &&file) { return file.absolute_path == path; }); it != std::end(result)) { continue; } - const auto file = get_test_file(path); + const auto file = get_test_file(root_path, path); if (file.type == FileType::unknown) { continue; @@ -87,26 +96,30 @@ std::vector get_test_files(const std::string &input_path) { return result; } -std::unordered_map get_test_files() { - std::unordered_map result; +std::vector get_test_files() { + std::vector result; - for (const auto &e : - fs::directory_iterator(test::TestData::data_input_directory())) { - const auto files = get_test_files(e.path().string()); - for (auto &&file : files) { - std::string testPath = - file.path.substr(TestData::data_input_directory().length() + 1); - result[testPath] = file; - } + std::string root = TestData::data_input_directory(); + + for (const auto &e : fs::directory_iterator(root)) { + const auto files = get_test_files(root, e.path().string()); + result.insert(std::end(result), std::begin(files), std::end(files)); } + std::sort(std::begin(result), std::end(result), + [](const auto &lhs, const auto &rhs) { + return lhs.short_path < rhs.short_path; + }); + return result; } + } // namespace 
-TestFile::TestFile(std::string path, const FileType type, - const bool password_encrypted, std::string password) - : path{std::move(path)}, type{type}, password_encrypted{password_encrypted}, +TestFile::TestFile(std::string absolute_path, std::string short_path, + const FileType type, std::optional password) + : absolute_path{std::move(absolute_path)}, + short_path{std::move(short_path)}, type{type}, password{std::move(password)} {} std::string TestData::data_input_directory() { @@ -118,46 +131,41 @@ TestData &TestData::instance_() { return instance; } -std::vector TestData::test_file_paths() { - return instance_().test_file_paths_(); +std::vector TestData::test_files() { + return instance_().m_test_files; } -std::vector TestData::test_file_paths(FileType fileType) { - return instance_().test_file_paths_(fileType); +std::vector TestData::test_files(FileType fileType) { + return instance_().test_files_(fileType); } -TestFile TestData::test_file(const std::string &path) { - return instance_().test_file_(path); +TestFile TestData::test_file(const std::string &short_path) { + const auto &files = instance_().m_test_files; + const auto it = + std::find_if(std::begin(files), std::end(files), [&](const auto &file) { + return file.short_path == short_path; + }); + if (it == std::end(files)) { + throw std::runtime_error("Test file not found: " + short_path); + } + return *it; } -std::string TestData::test_file_path(const std::string &path) { - return test_file(path).path; +std::string TestData::test_file_path(const std::string &short_path) { + return test_file(short_path).absolute_path; } TestData::TestData() : m_test_files{get_test_files()} {} -std::vector TestData::test_file_paths_() const { - std::vector result; - for (auto &&file : m_test_files) { - result.push_back(file.first); - } - std::sort(std::begin(result), std::end(result)); - return result; -} - -std::vector TestData::test_file_paths_(FileType fileType) const { - std::vector result; +std::vector 
TestData::test_files_(const FileType fileType) const { + std::vector result; + result.reserve(m_test_files.size()); for (auto &&file : m_test_files) { - if (file.second.type == fileType) { - result.push_back(file.first); + if (file.type == fileType) { + result.push_back(file); } } - std::sort(std::begin(result), std::end(result)); return result; } -TestFile TestData::test_file_(const std::string &path) const { - return m_test_files.at(path); -} - } // namespace odr::test diff --git a/test/src/test_util.hpp b/test/src/test_util.hpp index 324f76c4..c5a05e87 100644 --- a/test/src/test_util.hpp +++ b/test/src/test_util.hpp @@ -4,20 +4,19 @@ #include #include -#include #include namespace odr::test { struct TestFile { - std::string path; + std::string absolute_path; + std::string short_path; FileType type{FileType::unknown}; - bool password_encrypted{false}; - std::string password; + std::optional password; TestFile() = default; - TestFile(std::string path, FileType type, bool password_encrypted, - std::string password); + TestFile(std::string absolute_path, std::string short_path, FileType type, + std::optional password); }; class TestData { @@ -25,10 +24,11 @@ class TestData { static std::string data_directory(); static std::string data_input_directory(); - static std::vector test_file_paths(); - static std::vector test_file_paths(FileType); - static TestFile test_file(const std::string &path); - static std::string test_file_path(const std::string &path); + static std::vector test_files(); + static std::vector test_files(FileType); + + static TestFile test_file(const std::string &short_path); + static std::string test_file_path(const std::string &short_path); TestData(const TestData &) = delete; TestData &operator=(const TestData &) = delete; @@ -39,11 +39,9 @@ class TestData { TestData(); static TestData &instance_(); - std::vector test_file_paths_() const; - std::vector test_file_paths_(FileType) const; - TestFile test_file_(const std::string &path) const; + 
[[nodiscard]] std::vector test_files_(FileType) const; - std::unordered_map m_test_files; + std::vector m_test_files; }; } // namespace odr::test diff --git a/test/src/wvWare_wrapper_test.cpp b/test/src/wvWare_wrapper_test.cpp deleted file mode 100644 index d3d45252..00000000 --- a/test/src/wvWare_wrapper_test.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -using namespace odr; -using namespace odr::test; -using namespace odr::internal; -using namespace odr::test; -namespace fs = std::filesystem; - -using wvWareWrapperTests = ::testing::TestWithParam; - -TEST_P(wvWareWrapperTests, html) { - const std::string test_file_path = GetParam(); - const TestFile test_file = TestData::test_file(test_file_path); - - const std::string test_repo = *common::Path(test_file_path).begin(); - const std::string output_path_prefix = - common::Path("output").join(test_repo).join("output-wvWare").string(); - const std::string output_path = - common::Path(output_path_prefix) - .join(common::Path(test_file_path).rebase(test_repo)) - .string(); - - std::cout << test_file.path << " to " << output_path << std::endl; - - // Password protected files are problematic on wvWare - if (test_file.password_encrypted) { - GTEST_SKIP(); - } - - fs::create_directories(output_path); - HtmlConfig config; - std::optional password; - Html html = odr::internal::html::wvWare_wrapper(test_file.path, output_path, - config, password); - - for (const HtmlPage &html_page : html.pages()) { - EXPECT_TRUE(fs::is_regular_file(html_page.path)); - EXPECT_LT(0, fs::file_size(html_page.path)); - } -} - -INSTANTIATE_TEST_SUITE_P(wvWare_test_files, wvWareWrapperTests, - testing::ValuesIn(TestData::test_file_paths( - FileType::legacy_word_document)), - [](const ::testing::TestParamInfo &info) { - std::string path = info.param; - internal::util::string::replace_all(path, "/", "_"); - internal::util::string::replace_all(path, "-", 
"_"); - internal::util::string::replace_all(path, "+", "_"); - internal::util::string::replace_all(path, ".", "_"); - internal::util::string::replace_all(path, " ", "_"); - internal::util::string::replace_all(path, "$", ""); - return path; - });