Skip to content

Commit

Permalink
PDF CMap (#348)
Browse files Browse the repository at this point in the history
  • Loading branch information
andiwand authored Jan 8, 2024
1 parent 76bee27 commit 60daf83
Show file tree
Hide file tree
Showing 16 changed files with 366 additions and 55 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ find_package(cryptopp REQUIRED)
find_package(nlohmann_json REQUIRED)
find_package(vincentlaucsb-csv-parser REQUIRED)
find_package(uchardet REQUIRED)
find_package(utf8cpp REQUIRED)

configure_file("src/odr/internal/project_info.cpp.in" "src/odr/internal/project_info.cpp")

Expand Down Expand Up @@ -146,6 +147,8 @@ add_library(odr
"src/odr/internal/ooxml/ooxml_meta.cpp"
"src/odr/internal/ooxml/ooxml_util.cpp"

"src/odr/internal/pdf/pdf_cmap.cpp"
"src/odr/internal/pdf/pdf_cmap_parser.cpp"
"src/odr/internal/pdf/pdf_document.cpp"
"src/odr/internal/pdf/pdf_document_element.cpp"
"src/odr/internal/pdf/pdf_document_parser.cpp"
Expand All @@ -164,6 +167,7 @@ add_library(odr
"src/odr/internal/text/text_file.cpp"
"src/odr/internal/text/text_util.cpp"

"src/odr/internal/util/byte_util.cpp"
"src/odr/internal/util/file_util.cpp"
"src/odr/internal/util/hash_util.cpp"
"src/odr/internal/util/odr_meta_util.cpp"
Expand Down Expand Up @@ -191,6 +195,7 @@ target_link_libraries(odr
nlohmann_json::nlohmann_json
vincentlaucsb-csv-parser::vincentlaucsb-csv-parser
uchardet::uchardet
utf8cpp::utf8cpp
)

add_subdirectory("cli")
Expand Down
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class OpenDocumentCoreConan(ConanFile):
exports_sources = ["cli/*", "cmake/*", "src/*", "CMakeLists.txt"]

requires = ["pugixml/1.14", "cryptopp/8.8.0", "miniz/3.0.2", "nlohmann_json/3.11.3",
"vincentlaucsb-csv-parser/2.1.3", "uchardet/0.0.7"]
"vincentlaucsb-csv-parser/2.1.3", "uchardet/0.0.7", "utfcpp/4.0.4"]
build_requires = ["gtest/1.14.0"]
generators = "cmake_paths", "cmake_find_package"

Expand Down
29 changes: 29 additions & 0 deletions src/odr/internal/pdf/pdf_cmap.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include <odr/internal/pdf/pdf_cmap.hpp>

#include <odr/internal/util/map_util.hpp>

#include <utf8cpp/utf8/cpp17.h>

namespace odr::internal::pdf {

// Creates a character map without any glyph-to-unicode entries.
CMap::CMap() = default;

// Registers a single `bfchar` mapping from a one-byte glyph code to a UTF-16
// code unit. A later registration for the same glyph replaces the earlier one.
void CMap::map_bfchar(char glyph, char16_t unicode) {
  m_bfchar.insert_or_assign(glyph, unicode);
}

// Translates a one-byte glyph code into a UTF-16 code unit.
//
// Returns the registered `bfchar` mapping if there is one. Otherwise the glyph
// code itself is used as the code unit.
char16_t CMap::translate_glyph(char glyph) const {
  // Widen through `unsigned char`: where plain `char` is signed, converting an
  // unmapped byte in 0x80..0xFF straight to `char16_t` would sign-extend it to
  // U+FF80..U+FFFF instead of U+0080..U+00FF.
  const auto fallback =
      static_cast<char16_t>(static_cast<unsigned char>(glyph));
  return util::map::lookup_default(m_bfchar, glyph, fallback);
}

// Translates every byte of `glyphs` with `translate_glyph` and returns the
// resulting UTF-16 sequence re-encoded as UTF-8.
std::string CMap::translate_string(const std::string &glyphs) const {
  std::u16string utf16;
  utf16.reserve(glyphs.size());

  for (std::size_t index = 0; index < glyphs.size(); ++index) {
    utf16.push_back(translate_glyph(glyphs[index]));
  }

  return utf8::utf16to8(utf16);
}

} // namespace odr::internal::pdf
24 changes: 24 additions & 0 deletions src/odr/internal/pdf/pdf_cmap.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef ODR_INTERNAL_PDF_CMAP_HPP
#define ODR_INTERNAL_PDF_CMAP_HPP

#include <string>
#include <unordered_map>

namespace odr::internal::pdf {

// Character map translating one-byte glyph codes into UTF-16 code units.
class CMap {
public:
// Creates a map without any entries.
CMap();

// Adds (or overwrites) the mapping of a single glyph code.
void map_bfchar(char glyph, char16_t unicode);

// Looks up a single glyph; unmapped glyphs fall back to their own code.
char16_t translate_glyph(char glyph) const;
// Translates a whole byte string glyph by glyph and returns it as UTF-8.
std::string translate_string(const std::string &glyphs) const;

private:
// Entries registered through `map_bfchar`, keyed by glyph code.
std::unordered_map<char, char16_t> m_bfchar;
};

} // namespace odr::internal::pdf

#endif // ODR_INTERNAL_PDF_CMAP_HPP
136 changes: 136 additions & 0 deletions src/odr/internal/pdf/pdf_cmap_parser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include <odr/internal/pdf/pdf_cmap_parser.hpp>

#include <odr/internal/pdf/pdf_cmap.hpp>
#include <odr/internal/util/byte_util.hpp>

#include <iostream>

namespace odr::internal::pdf {

// Shorthands for the stream buffer character types used while tokenizing, and
// the end-of-file sentinel that `sgetc` is compared against.
using char_type = std::streambuf::char_type;
using int_type = std::streambuf::int_type;
static constexpr int_type eof = std::streambuf::traits_type::eof();

// Wraps `in` in an object parser that does the low-level reading.
CMapParser::CMapParser(std::istream &in) : m_parser(in) {}

// Returns the input stream held by the underlying object parser.
std::istream &CMapParser::in() const { return m_parser.in(); }

// Returns the stream buffer held by the underlying object parser.
std::streambuf &CMapParser::sb() const { return m_parser.sb(); }

// Gives read access to the underlying object parser.
const ObjectParser &CMapParser::parser() const { return m_parser; }

std::variant<Object, std::string> CMapParser::read_token() const {
if (m_parser.peek_number()) {
return std::visit([](auto n) { return Object(n); },
m_parser.read_integer_or_real());
}
if (m_parser.peek_string()) {
return std::visit([](auto s) { return Object(std::move(s)); },
m_parser.read_string());
}
if (m_parser.peek_name()) {
return m_parser.read_name();
}
if (m_parser.peek_dictionary()) {
return m_parser.read_dictionary();
}

std::string token;
while (true) {
int_type c = sb().sgetc();
if (c == eof) {
in().setstate(std::ios::eofbit);
return token;
}
if (ObjectParser::is_whitespace(c)) {
return token;
}
sb().sbumpc();
token += (char_type)c;
}
}

// Consumes the `n` <from> <to> pairs of a `begincodespacerange` section.
//
// The pairs are currently only read and discarded; nothing is stored in `cmap`
// yet. Only the `n` pairs are read here, nothing beyond them.
void CMapParser::read_codespacerange(std::uint32_t n, CMap &cmap) const {
  m_parser.skip_whitespace();
  for (std::uint32_t remaining = n; remaining > 0; --remaining) {
    [[maybe_unused]] auto range_begin = m_parser.read_object();
    m_parser.skip_whitespace();
    [[maybe_unused]] auto range_end = m_parser.read_object();
    m_parser.skip_whitespace();

    // TODO
  }
}

// Reads the `n` <glyph> <unicode> pairs of a `beginbfchar` section and
// registers each of them in `cmap`.
//
// Only one-byte glyph codes mapping to exactly one UTF-16 code unit (two
// bytes, most significant byte first) are supported.
//
// Throws std::runtime_error if a glyph is not exactly one byte long or if a
// destination is not exactly two bytes long.
void CMapParser::read_bfchar(std::uint32_t n, CMap &cmap) const {
  m_parser.skip_whitespace();
  for (std::uint32_t i = 0; i < n; ++i) {
    const std::string glyph = m_parser.read_object().as_string();
    m_parser.skip_whitespace();
    const std::string unicode = m_parser.read_object().as_string();
    m_parser.skip_whitespace();

    if (glyph.length() != 1) {
      throw std::runtime_error("unexpected glyph length");
    }
    // Require exactly two bytes. Halving the byte count would let a three
    // byte destination through and silently drop its last byte.
    if (unicode.length() != 2) {
      throw std::runtime_error("unexpected unicode length");
    }

    // Assemble the code unit from its bytes instead of reinterpreting the
    // string buffer as `char16_t`: that avoids aliasing/alignment problems and
    // gives the same result regardless of host byte order.
    const unsigned int high = static_cast<unsigned char>(unicode[0]);
    const unsigned int low = static_cast<unsigned char>(unicode[1]);
    const auto code_unit = static_cast<char16_t>((high << 8) | low);

    cmap.map_bfchar(glyph[0], code_unit);
  }
}

// Consumes the `n` <from> <to> <unicode> triples of a `beginbfrange` section.
//
// The triples are currently only read and discarded; nothing is stored in
// `cmap` yet.
void CMapParser::read_bfrange(std::uint32_t n, CMap &cmap) const {
  m_parser.skip_whitespace();
  for (std::uint32_t remaining = n; remaining > 0; --remaining) {
    [[maybe_unused]] auto range_begin = m_parser.read_object();
    m_parser.skip_whitespace();
    [[maybe_unused]] auto range_end = m_parser.read_object();
    m_parser.skip_whitespace();
    [[maybe_unused]] auto destination = m_parser.read_object();
    m_parser.skip_whitespace();

    // TODO
  }
}

// Parses the whole stream into a character map.
//
// Tokens are read until the stream reports end of file. The most recent
// integer token is remembered and used as the entry count when one of the
// `begincodespacerange`, `beginbfchar` or `beginbfrange` keywords follows.
// All other tokens are ignored.
CMap CMapParser::parse_cmap() const {
  CMap result;

  // Entry count announced in front of the next `begin...` keyword.
  std::uint32_t pending_count{};

  m_parser.skip_whitespace();
  for (;;) {
    Token token = read_token();
    if (in().eof()) {
      break;
    }
    m_parser.skip_whitespace();

    if (const Object *object = std::get_if<Object>(&token)) {
      if (object->is_integer()) {
        pending_count = object->as_integer();
      }
    } else if (const std::string *keyword = std::get_if<std::string>(&token)) {
      if (*keyword == "begincodespacerange") {
        read_codespacerange(pending_count, result);
      } else if (*keyword == "beginbfchar") {
        read_bfchar(pending_count, result);
      } else if (*keyword == "beginbfrange") {
        read_bfrange(pending_count, result);
      }
    }
  }

  return result;
}

} // namespace odr::internal::pdf
38 changes: 38 additions & 0 deletions src/odr/internal/pdf/pdf_cmap_parser.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef ODR_INTERNAL_PDF_CMAP_PARSER_HPP
#define ODR_INTERNAL_PDF_CMAP_PARSER_HPP

#include <odr/internal/pdf/pdf_object.hpp>
#include <odr/internal/pdf/pdf_object_parser.hpp>

#include <iosfwd>
#include <variant>

namespace odr::internal::pdf {

class CMap;

// Parser that builds a `CMap` from a character map stream, using an
// `ObjectParser` for the low-level object syntax.
class CMapParser {
public:
// A token is either a parsed object or a bare word such as a keyword.
using Token = std::variant<Object, std::string>;

// Creates a parser reading from the given stream.
explicit CMapParser(std::istream &);

// Access to the stream, its buffer and the underlying object parser.
std::istream &in() const;
std::streambuf &sb() const;
const ObjectParser &parser() const;

// Reads tokens until end of file and returns the resulting character map.
CMap parse_cmap() const;

private:
ObjectParser m_parser;

// Reads the next object or bare word from the stream.
Token read_token() const;

// Section readers; `n` is the number of entries announced before the
// corresponding `begin...` keyword.
void read_codespacerange(std::uint32_t n, CMap &) const;
void read_bfchar(std::uint32_t n, CMap &) const;
void read_bfrange(std::uint32_t n, CMap &) const;
};

} // namespace odr::internal::pdf

#endif // ODR_INTERNAL_PDF_CMAP_PARSER_HPP
5 changes: 4 additions & 1 deletion src/odr/internal/pdf/pdf_document_element.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef ODR_INTERNAL_PDF_DOCUMENT_ELEMENT_HPP
#define ODR_INTERNAL_PDF_DOCUMENT_ELEMENT_HPP

#include <odr/internal/pdf/pdf_cmap.hpp>
#include <odr/internal/pdf/pdf_object.hpp>

#include <unordered_map>
Expand Down Expand Up @@ -57,7 +58,9 @@ struct Resources : Element {
std::unordered_map<std::string, Font *> font;
};

struct Font : Element {};
// Font element. `cmap` holds the glyph-to-unicode map; it stays empty unless a
// map is parsed for the font (e.g. from a `ToUnicode` stream).
struct Font : Element {
CMap cmap;
};

} // namespace odr::internal::pdf

Expand Down
14 changes: 14 additions & 0 deletions src/odr/internal/pdf/pdf_document_parser.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#include <odr/internal/pdf/pdf_document_parser.hpp>

#include <odr/internal/crypto/crypto_util.hpp>
#include <odr/internal/pdf/pdf_cmap_parser.hpp>
#include <odr/internal/pdf/pdf_document.hpp>
#include <odr/internal/pdf/pdf_document_element.hpp>
#include <odr/internal/pdf/pdf_file_parser.hpp>

#include <sstream>

namespace odr::internal::pdf {
namespace {

Expand All @@ -22,6 +26,16 @@ pdf::Font *parse_font(DocumentParser &parser, const ObjectReference &reference,
font->object_reference = reference;
font->object = dictionary;

if (dictionary.has_key("ToUnicode")) {
auto to_unicode_obj =
parser.read_object(dictionary["ToUnicode"].as_reference());
std::string stream = parser.read_object_stream(to_unicode_obj);
std::string inflate = crypto::util::zlib_inflate(stream);
std::istringstream ss(inflate);
CMapParser cmap_parser(ss);
font->cmap = cmap_parser.parse_cmap();
}

return font;
}

Expand Down
8 changes: 4 additions & 4 deletions src/odr/internal/pdf/pdf_graphics_operator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,10 @@ enum class GraphicsOperatorType {
set_text_matrix,
text_next_line,

show_string,
next_line_show_string,
set_spacing_next_line_show,
show_string_manual_spacing,
show_text,
show_text_manual_spacing,
show_text_next_line,
show_text_next_line_set_spacing,

set_stroke_color_space,
set_stroke_color,
Expand Down
16 changes: 7 additions & 9 deletions src/odr/internal/pdf/pdf_graphics_operator_parser.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <odr/internal/pdf/pdf_graphics_operator_parser.hpp>

#include <odr/internal/pdf/pdf_graphics_operator.hpp>
#include <odr/internal/util/map_util.hpp>

#include <unordered_map>

Expand Down Expand Up @@ -67,10 +68,10 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) {
{"Tm", GraphicsOperatorType::set_text_matrix},
{"T*", GraphicsOperatorType::text_next_line},

{"Tj", GraphicsOperatorType::show_string},
{"'", GraphicsOperatorType::next_line_show_string},
{"\"", GraphicsOperatorType::set_spacing_next_line_show},
{"TJ", GraphicsOperatorType::show_string_manual_spacing},
{"Tj", GraphicsOperatorType::show_text},
{"TJ", GraphicsOperatorType::show_text_manual_spacing},
{"'", GraphicsOperatorType::show_text_next_line},
{"\"", GraphicsOperatorType::show_text_next_line_set_spacing},

{"CS", GraphicsOperatorType::set_stroke_color_space},
{"SC", GraphicsOperatorType::set_stroke_color},
Expand Down Expand Up @@ -99,11 +100,8 @@ GraphicsOperatorType operator_name_to_type(const std::string &name) {
{"EX", GraphicsOperatorType::end_compat_sec},
};

if (auto it = mapping.find(name); it != std::end(mapping)) {
return it->second;
}

return GraphicsOperatorType::unknown;
return util::map::lookup_default(mapping, name,
GraphicsOperatorType::unknown);
}

} // namespace
Expand Down
7 changes: 2 additions & 5 deletions src/odr/internal/pdf/pdf_graphics_state.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <odr/internal/pdf/pdf_graphics_state.hpp>

#include <odr/internal/pdf/pdf_graphics_operator.hpp>
#include <odr/internal/util/map_util.hpp>

#include <iostream>
#include <unordered_map>
Expand All @@ -16,11 +17,7 @@ ColorSpace color_space_name_to_enum(const std::string &name) {
{"cmyk", ColorSpace::device_cmyk},
};

if (auto it = mapping.find(name); it != std::end(mapping)) {
return it->second;
}

return ColorSpace::unknown;
return util::map::lookup_default(mapping, name, ColorSpace::unknown);
}

} // namespace
Expand Down
Loading

0 comments on commit 60daf83

Please sign in to comment.