Skip to content

Commit

Permalink
PDF to HTML (#349)
Browse files: browse the repository at this point in the history
  • Loading branch information
andiwand authored Jan 8, 2024
1 parent 60daf83 commit 7690b8b
Show file tree
Hide file tree
Showing 25 changed files with 476 additions and 99 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ add_library(odr
"src/odr/internal/html/filesystem.cpp"
"src/odr/internal/html/html_writer.cpp"
"src/odr/internal/html/image_file.cpp"
"src/odr/internal/html/pdf_file.cpp"
"src/odr/internal/html/text_file.cpp"

"src/odr/internal/json/json_file.cpp"
Expand Down Expand Up @@ -152,7 +153,7 @@ add_library(odr
"src/odr/internal/pdf/pdf_document.cpp"
"src/odr/internal/pdf/pdf_document_element.cpp"
"src/odr/internal/pdf/pdf_document_parser.cpp"
"src/odr/internal/pdf/pdf_object.cpp"
"src/odr/internal/pdf/pdf_file.cpp"
"src/odr/internal/pdf/pdf_file_parser.cpp"
"src/odr/internal/pdf/pdf_graphics_operator.cpp"
"src/odr/internal/pdf/pdf_graphics_operator_parser.cpp"
Expand Down
2 changes: 2 additions & 0 deletions src/odr/exceptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ NoOpenDocumentFile::NoOpenDocumentFile()
NoOfficeOpenXmlFile::NoOfficeOpenXmlFile()
: std::runtime_error("not an office open xml file") {}

NoPdfFile::NoPdfFile() : std::runtime_error("not a pdf file") {}

NoXml::NoXml() : std::runtime_error("not xml") {}

UnsupportedCryptoAlgorithm::UnsupportedCryptoAlgorithm()
Expand Down
4 changes: 4 additions & 0 deletions src/odr/exceptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ struct NoOfficeOpenXmlFile final : public std::runtime_error {
NoOfficeOpenXmlFile();
};

/// Exception signalling that a file is not a PDF file.
///
/// Derives publicly from std::runtime_error (struct inheritance is public by
/// default); the constructor is defined out of line.
struct NoPdfFile final : std::runtime_error {
  /// Default constructor; takes no arguments.
  NoPdfFile();
};

/// Exception type for input that is not XML.
///
/// Derives publicly from std::runtime_error (struct inheritance is public by
/// default); the constructor is defined out of line.
struct NoXml final : std::runtime_error {
  /// Default constructor; takes no arguments.
  NoXml();
};
Expand Down
40 changes: 37 additions & 3 deletions src/odr/file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <odr/internal/abstract/file.hpp>
#include <odr/internal/common/file.hpp>
#include <odr/internal/open_strategy.hpp>
#include <odr/internal/pdf/pdf_file.hpp>

#include <optional>
#include <utility>
Expand Down Expand Up @@ -89,9 +90,7 @@ DecodedFile::DecodedFile(const std::string &path, FileType as)

/// True when this handle holds an implementation object, false when the
/// implementation pointer is empty.
DecodedFile::operator bool() const {
  return static_cast<bool>(m_impl);
}

/// Returns the `type` field of the implementation's file meta data.
FileType DecodedFile::file_type() const noexcept {
  const auto &meta = m_impl->file_meta();
  return meta.type;
}
/// Returns the file type as reported by the underlying implementation.
FileType DecodedFile::file_type() const noexcept {
  const FileType type = m_impl->file_type();
  return type;
}

FileCategory DecodedFile::file_category() const noexcept {
return m_impl->file_category();
Expand All @@ -101,6 +100,30 @@ FileMeta DecodedFile::file_meta() const noexcept { return m_impl->file_meta(); }

/// Wraps whatever the implementation's file() returns in a public File handle.
File DecodedFile::file() const { return File(m_impl->file()); }

bool DecodedFile::is_text_file() const {
return std::dynamic_pointer_cast<internal::abstract::TextFile>(m_impl) !=
nullptr;
}

bool DecodedFile::is_image_file() const {
return std::dynamic_pointer_cast<internal::abstract::ImageFile>(m_impl) !=
nullptr;
}

bool DecodedFile::is_archive_file() const {
return std::dynamic_pointer_cast<internal::abstract::ArchiveFile>(m_impl) !=
nullptr;
}

bool DecodedFile::is_document_file() const {
return std::dynamic_pointer_cast<internal::abstract::DocumentFile>(m_impl) !=
nullptr;
}

bool DecodedFile::is_pdf_file() const {
return std::dynamic_pointer_cast<internal::pdf::PdfFile>(m_impl) != nullptr;
}

TextFile DecodedFile::text_file() const {
if (auto text_file =
std::dynamic_pointer_cast<internal::abstract::TextFile>(m_impl)) {
Expand Down Expand Up @@ -133,6 +156,14 @@ DocumentFile DecodedFile::document_file() const {
throw NoDocumentFile();
}

/// Returns this file as a PdfFile handle.
///
/// @throws NoPdfFile if the underlying implementation is not a PDF file.
PdfFile DecodedFile::pdf_file() const {
  auto impl = std::dynamic_pointer_cast<internal::pdf::PdfFile>(m_impl);
  if (!impl) {
    throw NoPdfFile();
  }
  return PdfFile(impl);
}

/// Constructs a TextFile handle from a text-file implementation.
///
/// The base class is initialized first with a copy of `impl`; only afterwards
/// is `impl` moved into this class's own `m_impl`, so the order of the two
/// initializers must not be swapped.
TextFile::TextFile(std::shared_ptr<internal::abstract::TextFile> impl)
: DecodedFile(impl), m_impl{std::move(impl)} {}

Expand Down Expand Up @@ -198,4 +229,7 @@ DocumentMeta DocumentFile::document_meta() const {

/// Wraps whatever the implementation's document() returns in a public
/// Document handle.
Document DocumentFile::document() const { return Document(m_impl->document()); }

/// Constructs a PdfFile handle from a PDF implementation.
///
/// The base class is initialized first with a copy of `impl`; only afterwards
/// is `impl` moved into this class's own `m_impl`, so the order of the two
/// initializers must not be swapped.
PdfFile::PdfFile(std::shared_ptr<internal::pdf::PdfFile> impl)
: DecodedFile(impl), m_impl{std::move(impl)} {}

} // namespace odr
20 changes: 20 additions & 0 deletions src/odr/file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,16 @@ class ArchiveFile;
class DocumentFile;
} // namespace odr::internal::abstract

// Forward declaration of the internal PDF implementation type, so this header
// can refer to it without including the internal PDF header.
namespace odr::internal::pdf {
class PdfFile;
} // namespace odr::internal::pdf

namespace odr {
class TextFile;
class ImageFile;
class ArchiveFile;
class DocumentFile;
class PdfFile;

class Archive;
class Document;
Expand Down Expand Up @@ -166,10 +171,17 @@ class DecodedFile {

[[nodiscard]] File file() const;

/// Type predicates. Each returns true when the underlying implementation can
/// be dynamically cast to the corresponding file kind, and false otherwise.
[[nodiscard]] bool is_text_file() const;
[[nodiscard]] bool is_image_file() const;
[[nodiscard]] bool is_archive_file() const;
[[nodiscard]] bool is_document_file() const;
[[nodiscard]] bool is_pdf_file() const;

/// Typed accessors returning a handle of the requested file kind.
/// pdf_file() throws NoPdfFile and document_file() throws NoDocumentFile when
/// the implementation is not of that kind.
/// NOTE(review): the other accessors presumably throw an analogous exception;
/// confirm against their definitions.
[[nodiscard]] TextFile text_file() const;
[[nodiscard]] ImageFile image_file() const;
[[nodiscard]] ArchiveFile archive_file() const;
[[nodiscard]] DocumentFile document_file() const;
[[nodiscard]] PdfFile pdf_file() const;

protected:
std::shared_ptr<internal::abstract::DecodedFile> m_impl;
Expand Down Expand Up @@ -229,6 +241,14 @@ class DocumentFile final : public DecodedFile {
std::shared_ptr<internal::abstract::DocumentFile> m_impl;
};

/// Public handle for a decoded PDF file.
///
/// Keeps a typed pointer to the PDF implementation in addition to the
/// generic implementation pointer held by the DecodedFile base class.
class PdfFile final : public DecodedFile {
public:
  /// @param impl PDF implementation to wrap.
  explicit PdfFile(std::shared_ptr<internal::pdf::PdfFile> impl);

private:
  /// Typed implementation pointer; distinct from the base class's member of
  /// the same name.
  std::shared_ptr<internal::pdf::PdfFile> m_impl;
};

} // namespace odr

#endif // ODR_FILE_HPP
40 changes: 29 additions & 11 deletions src/odr/html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
#include <odr/internal/html/document.hpp>
#include <odr/internal/html/filesystem.hpp>
#include <odr/internal/html/image_file.hpp>
#include <odr/internal/html/pdf_file.hpp>
#include <odr/internal/html/text_file.hpp>

#include <nlohmann/json.hpp>

#include <filesystem>

#include <nlohmann/json.hpp>

using namespace odr::internal;
namespace fs = std::filesystem;

Expand Down Expand Up @@ -53,21 +54,32 @@ Html html::translate(const File &file, const std::string &output_path,
const PasswordCallback &password_callback) {
auto decoded_file = DecodedFile(file);

if (decoded_file.file_category() == FileCategory::text) {
return translate(decoded_file.text_file(), output_path, config);
} else if (decoded_file.file_category() == FileCategory::image) {
return translate(decoded_file.image_file(), output_path, config);
} else if (decoded_file.file_category() == FileCategory::archive) {
return translate(decoded_file.archive_file().archive(), output_path,
config);
} else if (decoded_file.file_category() == FileCategory::document) {
if (decoded_file.is_document_file()) {
DocumentFile document_file = decoded_file.document_file();
if (document_file.password_encrypted()) {
if (!document_file.decrypt(password_callback())) {
throw WrongPassword();
}
}
return translate(document_file.document(), output_path, config);
}

return translate(decoded_file, output_path, config);
}

Html html::translate(const DecodedFile &decoded_file,
const std::string &output_path, const HtmlConfig &config) {
if (decoded_file.is_text_file()) {
return translate(decoded_file.text_file(), output_path, config);
} else if (decoded_file.is_image_file()) {
return translate(decoded_file.image_file(), output_path, config);
} else if (decoded_file.is_archive_file()) {
return translate(decoded_file.archive_file().archive(), output_path,
config);
} else if (decoded_file.is_document_file()) {
return translate(decoded_file.document_file().document(), output_path,
config);
} else if (decoded_file.is_pdf_file()) {
return translate(decoded_file.pdf_file(), output_path, config);
}

throw UnsupportedFileType(decoded_file.file_type());
Expand Down Expand Up @@ -98,6 +110,12 @@ Html html::translate(const Document &document, const std::string &output_path,
return internal::html::translate_document(document, output_path, config);
}

/// Translates a PDF file to HTML.
///
/// Creates `output_path` (including missing parent directories) and then
/// delegates to the internal PDF translator.
///
/// @param pdf_file    PDF file to translate.
/// @param output_path Directory passed to the translator for its output.
/// @param config      HTML configuration forwarded to the translator.
/// @return The result of internal::html::translate_pdf_file.
Html html::translate(const PdfFile &pdf_file, const std::string &output_path,
                     const HtmlConfig &config) {
  std::filesystem::create_directories(output_path);

  return internal::html::translate_pdf_file(pdf_file, output_path, config);
}

void html::edit(const Document &document, const char *diff) {
auto json = nlohmann::json::parse(diff);
for (const auto &[key, value] : json["modifiedText"].items()) {
Expand Down
8 changes: 8 additions & 0 deletions src/odr/html.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,13 @@ struct HtmlPage final {
/// Callback taking no arguments and returning a password string. The
/// File-based translate overload calls it when a document file reports that
/// it is password encrypted.
using PasswordCallback = std::function<std::string()>;

namespace html {

/// Decodes `file` and translates it to HTML under `output_path`.
/// For a password-encrypted document file, `password_callback` is called and
/// its result is used to decrypt; WrongPassword is thrown if decryption fails.
Html translate(const File &file, const std::string &output_path,
const HtmlConfig &config,
const PasswordCallback &password_callback);
/// Translates an already decoded file by dispatching on its kind (text,
/// image, archive, document, PDF). Throws UnsupportedFileType when the file
/// is none of these. This overload takes no password callback.
Html translate(const DecodedFile &file, const std::string &output_path,
const HtmlConfig &config);

/// Translates a text file to HTML.
/// NOTE(review): presumably writes its output under `output_path`; confirm
/// against the definition.
Html translate(const TextFile &text_file, const std::string &output_path,
const HtmlConfig &config);
Html translate(const ImageFile &image_file, const std::string &output_path,
Expand All @@ -101,7 +105,11 @@ Html translate(const Archive &archive, const std::string &output_path,
const HtmlConfig &config);
/// Translates a document to HTML by delegating to
/// internal::html::translate_document.
Html translate(const Document &document, const std::string &output_path,
const HtmlConfig &config);
/// Translates a PDF file to HTML. Creates the `output_path` directory and
/// delegates to internal::html::translate_pdf_file.
Html translate(const PdfFile &pdf_file, const std::string &output_path,
const HtmlConfig &config);

/// Applies an edit described by `diff` to `document`. `diff` is parsed as
/// JSON and its "modifiedText" entries are iterated.
/// NOTE(review): the exact effect of each entry is not visible here; confirm
/// against the definition.
void edit(const Document &document, const char *diff);

} // namespace html

} // namespace odr
Expand Down
Loading

0 comments on commit 7690b8b

Please sign in to comment.