diff --git a/CMakeLists.txt b/CMakeLists.txt index e05ac4f2..a8514b39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ find_package(cryptopp REQUIRED) find_package(nlohmann_json REQUIRED) find_package(vincentlaucsb-csv-parser REQUIRED) find_package(uchardet REQUIRED) +find_package(utf8cpp REQUIRED) configure_file("src/odr/internal/project_info.cpp.in" "src/odr/internal/project_info.cpp") @@ -146,6 +147,8 @@ add_library(odr "src/odr/internal/ooxml/ooxml_meta.cpp" "src/odr/internal/ooxml/ooxml_util.cpp" + "src/odr/internal/pdf/pdf_cmap.cpp" + "src/odr/internal/pdf/pdf_cmap_parser.cpp" "src/odr/internal/pdf/pdf_document.cpp" "src/odr/internal/pdf/pdf_document_element.cpp" "src/odr/internal/pdf/pdf_document_parser.cpp" @@ -164,6 +167,7 @@ add_library(odr "src/odr/internal/text/text_file.cpp" "src/odr/internal/text/text_util.cpp" + "src/odr/internal/util/byte_util.cpp" "src/odr/internal/util/file_util.cpp" "src/odr/internal/util/hash_util.cpp" "src/odr/internal/util/odr_meta_util.cpp" @@ -191,6 +195,7 @@ target_link_libraries(odr nlohmann_json::nlohmann_json vincentlaucsb-csv-parser::vincentlaucsb-csv-parser uchardet::uchardet + utf8cpp::utf8cpp ) add_subdirectory("cli") diff --git a/conanfile.py b/conanfile.py index a2fb7aea..892f51c3 100644 --- a/conanfile.py +++ b/conanfile.py @@ -22,7 +22,7 @@ class OpenDocumentCoreConan(ConanFile): exports_sources = ["cli/*", "cmake/*", "src/*", "CMakeLists.txt"] requires = ["pugixml/1.14", "cryptopp/8.8.0", "miniz/3.0.2", "nlohmann_json/3.11.3", - "vincentlaucsb-csv-parser/2.1.3", "uchardet/0.0.7"] + "vincentlaucsb-csv-parser/2.1.3", "uchardet/0.0.7", "utfcpp/4.0.4"] build_requires = ["gtest/1.14.0"] generators = "cmake_paths", "cmake_find_package" diff --git a/src/odr/internal/pdf/pdf_cmap.cpp b/src/odr/internal/pdf/pdf_cmap.cpp new file mode 100644 index 00000000..4056a341 --- /dev/null +++ b/src/odr/internal/pdf/pdf_cmap.cpp @@ -0,0 +1,29 @@ +#include + +#include + +#include + +namespace odr::internal::pdf { + 
+CMap::CMap() = default; + +void CMap::map_bfchar(char glyph, char16_t unicode) { + m_bfchar[glyph] = unicode; +} + +char16_t CMap::translate_glyph(char glyph) const { + return util::map::lookup_default(m_bfchar, glyph, glyph); +} + +std::string CMap::translate_string(const std::string &glyphs) const { + std::u16string result; + + for (char glyph : glyphs) { + result += translate_glyph(glyph); + } + + return utf8::utf16to8(result); +} + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_cmap.hpp b/src/odr/internal/pdf/pdf_cmap.hpp new file mode 100644 index 00000000..086f1299 --- /dev/null +++ b/src/odr/internal/pdf/pdf_cmap.hpp @@ -0,0 +1,24 @@ +#ifndef ODR_INTERNAL_PDF_CMAP_HPP +#define ODR_INTERNAL_PDF_CMAP_HPP + +#include +#include + +namespace odr::internal::pdf { + +class CMap { +public: + CMap(); + + void map_bfchar(char glyph, char16_t unicode); + + char16_t translate_glyph(char glyph) const; + std::string translate_string(const std::string &glyphs) const; + +private: + std::unordered_map m_bfchar; +}; + +} // namespace odr::internal::pdf + +#endif // ODR_INTERNAL_PDF_CMAP_HPP diff --git a/src/odr/internal/pdf/pdf_cmap_parser.cpp b/src/odr/internal/pdf/pdf_cmap_parser.cpp new file mode 100644 index 00000000..c60f6f65 --- /dev/null +++ b/src/odr/internal/pdf/pdf_cmap_parser.cpp @@ -0,0 +1,136 @@ +#include + +#include +#include + +#include + +namespace odr::internal::pdf { + +using char_type = std::streambuf::char_type; +using int_type = std::streambuf::int_type; +static constexpr int_type eof = std::streambuf::traits_type::eof(); + +CMapParser::CMapParser(std::istream &in) : m_parser(in) {} + +std::istream &CMapParser::in() const { return m_parser.in(); } + +std::streambuf &CMapParser::sb() const { return m_parser.sb(); } + +const ObjectParser &CMapParser::parser() const { return m_parser; } + +std::variant CMapParser::read_token() const { + if (m_parser.peek_number()) { + return std::visit([](auto n) { return Object(n); }, + 
m_parser.read_integer_or_real()); + } + if (m_parser.peek_string()) { + return std::visit([](auto s) { return Object(std::move(s)); }, + m_parser.read_string()); + } + if (m_parser.peek_name()) { + return m_parser.read_name(); + } + if (m_parser.peek_dictionary()) { + return m_parser.read_dictionary(); + } + + std::string token; + while (true) { + int_type c = sb().sgetc(); + if (c == eof) { + in().setstate(std::ios::eofbit); + return token; + } + if (ObjectParser::is_whitespace(c)) { + return token; + } + sb().sbumpc(); + token += (char_type)c; + } +} + +void CMapParser::read_codespacerange(std::uint32_t n, CMap &cmap) const { + m_parser.skip_whitespace(); + for (std::uint32_t i = 0; i < n; ++i) { + auto from_glyph = m_parser.read_object(); + m_parser.skip_whitespace(); + auto to_glyph = m_parser.read_object(); + m_parser.skip_whitespace(); + + // TODO + } +} + +void CMapParser::read_bfchar(std::uint32_t n, CMap &cmap) const { + m_parser.skip_whitespace(); + for (std::uint32_t i = 0; i < n; ++i) { + std::string glyph = m_parser.read_object().as_string(); + m_parser.skip_whitespace(); + std::string unicode = m_parser.read_object().as_string(); + m_parser.skip_whitespace(); + + util::reverse_bytes(reinterpret_cast(unicode.data()), + (std::size_t)unicode.size() / 2); + std::u16string_view unicode16( + reinterpret_cast(unicode.data()), unicode.size() / 2); + + if (glyph.length() != 1) { + throw std::runtime_error("unexpected glyph length"); + } + if (unicode16.length() != 1) { + throw std::runtime_error("unexpected unicode length"); + } + + cmap.map_bfchar(glyph[0], unicode16[0]); + } +} + +void CMapParser::read_bfrange(std::uint32_t n, CMap &cmap) const { + m_parser.skip_whitespace(); + for (std::uint32_t i = 0; i < n; ++i) { + auto from_glyph = m_parser.read_object(); + m_parser.skip_whitespace(); + auto to_glyph = m_parser.read_object(); + m_parser.skip_whitespace(); + auto unicode = m_parser.read_object(); + m_parser.skip_whitespace(); + + // TODO + } +} + +CMap 
CMapParser::parse_cmap() const { + CMap cmap; + + std::uint32_t last_int{}; + + m_parser.skip_whitespace(); + while (true) { + Token token = read_token(); + if (in().eof()) { + break; + } + m_parser.skip_whitespace(); + + if (std::holds_alternative(token)) { + const Object &object = std::get(token); + if (object.is_integer()) { + last_int = object.as_integer(); + } + } else if (std::holds_alternative(token)) { + const std::string &command = std::get(token); + if (command == "begincodespacerange") { + read_codespacerange(last_int, cmap); + } else if (command == "beginbfchar") { + read_bfchar(last_int, cmap); + } else if (command == "beginbfrange") { + read_bfrange(last_int, cmap); + } + } + } + + return cmap; +} + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_cmap_parser.hpp b/src/odr/internal/pdf/pdf_cmap_parser.hpp new file mode 100644 index 00000000..00808568 --- /dev/null +++ b/src/odr/internal/pdf/pdf_cmap_parser.hpp @@ -0,0 +1,38 @@ +#ifndef ODR_INTERNAL_PDF_CMAP_PARSER_HPP +#define ODR_INTERNAL_PDF_CMAP_PARSER_HPP + +#include +#include + +#include +#include + +namespace odr::internal::pdf { + +class CMap; + +class CMapParser { +public: + using Token = std::variant; + + explicit CMapParser(std::istream &); + + std::istream &in() const; + std::streambuf &sb() const; + const ObjectParser &parser() const; + + CMap parse_cmap() const; + +private: + ObjectParser m_parser; + + Token read_token() const; + + void read_codespacerange(std::uint32_t n, CMap &) const; + void read_bfchar(std::uint32_t n, CMap &) const; + void read_bfrange(std::uint32_t n, CMap &) const; +}; + +} // namespace odr::internal::pdf + +#endif // ODR_INTERNAL_PDF_CMAP_PARSER_HPP diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index dfa7e770..b6dc90e2 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -1,6 +1,7 @@ #ifndef 
ODR_INTERNAL_PDF_DOCUMENT_ELEMENT_HPP #define ODR_INTERNAL_PDF_DOCUMENT_ELEMENT_HPP +#include #include #include @@ -57,7 +58,9 @@ struct Resources : Element { std::unordered_map font; }; -struct Font : Element {}; +struct Font : Element { + CMap cmap; +}; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index 16de7ff9..a08d0041 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -1,9 +1,13 @@ #include +#include +#include #include #include #include +#include + namespace odr::internal::pdf { namespace { @@ -22,6 +26,16 @@ pdf::Font *parse_font(DocumentParser &parser, const ObjectReference &reference, font->object_reference = reference; font->object = dictionary; + if (dictionary.has_key("ToUnicode")) { + auto to_unicode_obj = + parser.read_object(dictionary["ToUnicode"].as_reference()); + std::string stream = parser.read_object_stream(to_unicode_obj); + std::string inflate = crypto::util::zlib_inflate(stream); + std::istringstream ss(inflate); + CMapParser cmap_parser(ss); + font->cmap = cmap_parser.parse_cmap(); + } + return font; } diff --git a/src/odr/internal/pdf/pdf_graphics_operator.hpp b/src/odr/internal/pdf/pdf_graphics_operator.hpp index 3cc5d767..f67db696 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator.hpp +++ b/src/odr/internal/pdf/pdf_graphics_operator.hpp @@ -188,10 +188,10 @@ enum class GraphicsOperatorType { set_text_matrix, text_next_line, - show_string, - next_line_show_string, - set_spacing_next_line_show, - show_string_manual_spacing, + show_text, + show_text_manual_spacing, + show_text_next_line, + show_text_next_line_set_spacing, set_stroke_color_space, set_stroke_color, diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp index c71c00e4..7e153a91 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp +++ 
b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -67,10 +68,10 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) { {"Tm", GraphicsOperatorType::set_text_matrix}, {"T*", GraphicsOperatorType::text_next_line}, - {"Tj", GraphicsOperatorType::show_string}, - {"'", GraphicsOperatorType::next_line_show_string}, - {"\"", GraphicsOperatorType::set_spacing_next_line_show}, - {"TJ", GraphicsOperatorType::show_string_manual_spacing}, + {"Tj", GraphicsOperatorType::show_text}, + {"TJ", GraphicsOperatorType::show_text_manual_spacing}, + {"'", GraphicsOperatorType::show_text_next_line}, + {"\"", GraphicsOperatorType::show_text_next_line_set_spacing}, {"CS", GraphicsOperatorType::set_stroke_color_space}, {"SC", GraphicsOperatorType::set_stroke_color}, @@ -99,11 +100,8 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) { {"EX", GraphicsOperatorType::end_compat_sec}, }; - if (auto it = mapping.find(name); it != std::end(mapping)) { - return it->second; - } - - return GraphicsOperatorType::unknown; + return util::map::lookup_default(mapping, name, + GraphicsOperatorType::unknown); } } // namespace diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 2bb78ccb..36dd6392 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -16,11 +17,7 @@ ColorSpace color_space_name_to_enum(const std::string &name) { {"cmyk", ColorSpace::device_cmyk}, }; - if (auto it = mapping.find(name); it != std::end(mapping)) { - return it->second; - } - - return ColorSpace::unknown; + return util::map::lookup_default(mapping, name, ColorSpace::unknown); } } // namespace diff --git a/src/odr/internal/pdf/pdf_object_parser.cpp b/src/odr/internal/pdf/pdf_object_parser.cpp index 142d5a70..e8d06f92 100644 --- 
a/src/odr/internal/pdf/pdf_object_parser.cpp +++ b/src/odr/internal/pdf/pdf_object_parser.cpp @@ -37,23 +37,27 @@ std::istream &ObjectParser::in() const { return *m_in; } std::streambuf &ObjectParser::sb() const { return *m_sb; } +bool ObjectParser::is_whitespace(char c) { + return c == '\0' || c == '\t' || c == '\n' || c == '\f' || c == '\r' || + c == ' '; +} + +bool ObjectParser::peek_whitespace() const { + int_type c = sb().sgetc(); + return c != eof && is_whitespace(c); +} + void ObjectParser::skip_whitespace() const { while (true) { int_type c = sb().sgetc(); - switch (c) { - case '\0': - case '\t': - case '\n': - case '\f': - case '\r': - case ' ': - sb().sbumpc(); - break; - case eof: + if (c == eof) { in().setstate(std::ios::eofbit); - default: return; } + if (!is_whitespace(c)) { + return; + } + sb().sbumpc(); } } diff --git a/src/odr/internal/pdf/pdf_object_parser.hpp b/src/odr/internal/pdf/pdf_object_parser.hpp index 58e4921c..3a692f0d 100644 --- a/src/odr/internal/pdf/pdf_object_parser.hpp +++ b/src/odr/internal/pdf/pdf_object_parser.hpp @@ -15,6 +15,8 @@ class ObjectParser { std::istream &in() const; std::streambuf &sb() const; + static bool is_whitespace(char c); + bool peek_whitespace() const; void skip_whitespace() const; void skip_line() const; std::string read_line(bool inclusive = false) const; diff --git a/src/odr/internal/util/byte_util.cpp b/src/odr/internal/util/byte_util.cpp new file mode 100644 index 00000000..75378d1b --- /dev/null +++ b/src/odr/internal/util/byte_util.cpp @@ -0,0 +1,29 @@ +#include + +namespace odr::internal { + +void util::reverse_bytes(char16_t *string, std::size_t length) { + for (std::size_t i = 0; i < length; ++i) { + util::reverse_bytes(string[i]); + } +} + +void util::reverse_bytes(char32_t *string, std::size_t length) { + for (std::size_t i = 0; i < length; ++i) { + util::reverse_bytes(string[i]); + } +} + +void util::reverse_bytes(std::u16string &string) { + for (char16_t &c : string) { + util::reverse_bytes(c); 
#ifndef ODR_INTERNAL_BYTE_UTIL_HPP
#define ODR_INTERNAL_BYTE_UTIL_HPP

#include <algorithm>
#include <cstddef>
#include <memory>
#include <string>
#include <type_traits>

namespace odr::internal::util {

// Reverses the order of the bytes that make up `x`, in place. For integer
// and character types this converts between little- and big-endian
// representation.
//
// Restricted to trivially copyable types: permuting the bytes of an object
// that manages resources (a std::string, for instance) would corrupt it.
template <typename T> void reverse_bytes(T &x) {
  static_assert(std::is_trivially_copyable_v<T>,
                "reverse_bytes requires a trivially copyable type");
  auto *const first = reinterpret_cast<unsigned char *>(std::addressof(x));
  std::reverse(first, first + sizeof(T));
}

// Byte-swap each of the `length` code units starting at `string`.
void reverse_bytes(char16_t *string, std::size_t length);
void reverse_bytes(char32_t *string, std::size_t length);
// Byte-swap every code unit of `string`.
void reverse_bytes(std::u16string &string);
void reverse_bytes(std::u32string &string);

} // namespace odr::internal::util

#endif // ODR_INTERNAL_BYTE_UTIL_HPP
parser.read_object_stream(to_unicode_obj); - std::cout << crypto::util::zlib_inflate(stream) << std::endl; - } - IndirectObject first_page_contents_object = parser.read_object(first_page->contents_reference); std::string stream = parser.read_object_stream(first_page_contents_object); std::string first_page_content = crypto::util::zlib_inflate(stream); - std::cout << first_page_content << std::endl; - - std::istringstream in2(first_page_content); - GraphicsOperatorParser parser2(in2); + std::istringstream ss(first_page_content); + GraphicsOperatorParser parser2(ss); GraphicsState state; - while (!in2.eof()) { + while (!ss.eof()) { GraphicsOperator op = parser2.read_operator(); state.execute(op); + + const std::string &font = state.current().text.font; + double size = state.current().text.size; + + if (op.type == GraphicsOperatorType::show_text) { + const std::string &glyphs = op.arguments[0].as_string(); + std::string unicode = + first_page->resources->font.at(font)->cmap.translate_string(glyphs); + std::cout << "show text: font=" << font << ", size=" << size + << ", text=" << unicode << std::endl; + } else if (op.type == GraphicsOperatorType::show_text_manual_spacing) { + for (const auto &element : op.arguments[0].as_array()) { + if (element.is_real()) { + std::cout << "spacing: " << element.as_real() << std::endl; + } else if (element.is_string()) { + const std::string &glyphs = element.as_string(); + std::string unicode = + first_page->resources->font.at(font)->cmap.translate_string( + glyphs); + std::cout << "show text manual spacing: font=" << font + << ", size=" << size << ", text=" << unicode << std::endl; + } + } + } else if (op.type == GraphicsOperatorType::show_text_next_line) { + std::cout << "TODO show_text_next_line" << std::endl; + } else if (op.type == + GraphicsOperatorType::show_text_next_line_set_spacing) { + std::cout << "TODO show_text_next_line_set_spacing" << std::endl; + } } }