Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core-clp: Refactor LibarchiveFileReader to properly handle empty data blocks (fixes #389). #455

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 32 additions & 31 deletions components/core/src/clp/LibarchiveFileReader.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "LibarchiveFileReader.hpp"

#include <cstring>
#include <span>

#include "spdlog_with_specializations.hpp"

Expand Down Expand Up @@ -38,7 +39,7 @@ LibarchiveFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_
while (true) {
// Read a data block if necessary
if (nullptr == m_data_block) {
auto error_code = read_next_data_block();
auto error_code = read_next_nonempty_data_block();
if (ErrorCode_Success != error_code) {
if (ErrorCode_EndOfFile == error_code && num_bytes_read > 0) {
return ErrorCode_Success;
Expand Down Expand Up @@ -111,7 +112,7 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter(
while (true) {
// Read a data block if necessary
if (nullptr == m_data_block) {
auto error_code = read_next_data_block();
auto error_code = read_next_nonempty_data_block();
if (ErrorCode_Success != error_code) {
if (ErrorCode_EndOfFile == error_code && str.length() > original_str_length) {
// NOTE: At this point, we haven't found delim, so return directly without
Expand Down Expand Up @@ -206,7 +207,7 @@ void LibarchiveFileReader::close() {
m_pos_in_file = 0;
}

ErrorCode LibarchiveFileReader::try_load_data_block() {
auto LibarchiveFileReader::try_load_nonempty_data_block() -> ErrorCode {
if (nullptr == m_archive) {
throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__);
}
Expand All @@ -217,49 +218,51 @@ ErrorCode LibarchiveFileReader::try_load_data_block() {
if (m_data_block != nullptr) {
return ErrorCode_Success;
}
return read_next_data_block();
return read_next_nonempty_data_block();
}

void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size) const {
auto LibarchiveFileReader::peek_buffered_data() const -> std::span<char const> {
if (nullptr == m_archive) {
throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__);
}
if (nullptr == m_archive_entry) {
throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__);
}
if (nullptr == m_data_block) {
buf_size = 0;
buf = nullptr;
} else if (m_pos_in_file < m_data_block_pos_in_file) {
return {};
}
if (m_pos_in_file < m_data_block_pos_in_file) {
// Position in the file is before the current data block, so we return nulls corresponding
// to the sparse bytes before the data block
// NOTE: We don't return ALL sparse bytes before the data block since that might require
// allocating more bytes, violating the const-ness of this method. Since peek is a
// best-effort method, this should be sufficient for most callers.
buf = m_nulls_for_peek.data();
buf_size = std::min(
m_nulls_for_peek.size(),
static_cast<size_t>(m_data_block_pos_in_file - m_pos_in_file)
);
} else {
buf_size = m_data_block_length - m_pos_in_data_block;
buf = static_cast<char const*>(m_data_block);
return {m_nulls_for_peek.data(),
std::min(
m_nulls_for_peek.size(),
static_cast<size_t>(m_data_block_pos_in_file - m_pos_in_file)
)};
}

return {static_cast<char const*>(m_data_block), m_data_block_length - m_pos_in_data_block};
}

ErrorCode LibarchiveFileReader::read_next_data_block() {
auto return_value = archive_read_data_block(
m_archive,
&m_data_block,
&m_data_block_length,
&m_data_block_pos_in_file
);
if (ARCHIVE_OK != return_value) {
m_data_block = nullptr;
if (ARCHIVE_EOF == return_value) {
m_reached_eof = true;
return ErrorCode_EndOfFile;
} else {
auto LibarchiveFileReader::read_next_nonempty_data_block() -> ErrorCode {
m_data_block_length = 0;
m_pos_in_data_block = 0;
haiqi96 marked this conversation as resolved.
Show resolved Hide resolved
while (0 == m_data_block_length) {
auto return_value = archive_read_data_block(
m_archive,
&m_data_block,
&m_data_block_length,
&m_data_block_pos_in_file
);
if (ARCHIVE_OK != return_value) {
m_data_block = nullptr;
if (ARCHIVE_EOF == return_value) {
m_reached_eof = true;
return ErrorCode_EndOfFile;
}
SPDLOG_DEBUG(
"Failed to read data block from libarchive - {}",
archive_error_string(m_archive)
Expand All @@ -268,8 +271,6 @@ ErrorCode LibarchiveFileReader::read_next_data_block() {
}
}

m_pos_in_data_block = 0;

return ErrorCode_Success;
}
} // namespace clp
38 changes: 16 additions & 22 deletions components/core/src/clp/LibarchiveFileReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define CLP_LIBARCHIVEFILEREADER_HPP

#include <array>
#include <span>
#include <string>

#include <archive.h>
Expand Down Expand Up @@ -30,13 +31,7 @@ class LibarchiveFileReader : public ReaderInterface {
};

// Constructors
LibarchiveFileReader()
: m_archive(nullptr),
m_archive_entry(nullptr),
m_data_block(nullptr),
m_reached_eof(false),
m_data_block_pos_in_file(0),
m_pos_in_file(0) {}
LibarchiveFileReader() = default;

// Methods implementing the ReaderInterface
/**
Expand Down Expand Up @@ -89,43 +84,42 @@ class LibarchiveFileReader : public ReaderInterface {
void close();

/**
* Tries to the load a data block from the file if none is loaded
* Tries to the load a nonempty data block from the file if none is loaded
* @return ErrorCode_EndOfFile on EOF
* @return ErrorCode_Failure on failure
* @return ErrorCode_Success on success
*/
[[nodiscard]] ErrorCode try_load_data_block();
[[nodiscard]] auto try_load_nonempty_data_block() -> ErrorCode;

/**
* Peeks the remaining buffered content without advancing the read head.
*
* NOTE: Any subsequent read or seek operations may invalidate the returned buffer.
* @param buf Returns a pointer to any buffered data
* @param buf_size Returns the number of bytes in the buffer
* @return The view of the buffered data
*/
void peek_buffered_data(char const*& buf, size_t& buf_size) const;
[[nodiscard]] auto peek_buffered_data() const -> std::span<char const>;

private:
// Methods
/**
* Reads next data block from the archive
* Reads next nonempty data block from the archive
* @return ErrorCode_EndOfFile on EOF
* @return ErrorCode_Failure on failure
* @return ErrorCode_Success on success
*/
ErrorCode read_next_data_block();
[[nodiscard]] auto read_next_nonempty_data_block() -> ErrorCode;

// Variables
struct archive* m_archive;
struct archive* m_archive{nullptr};

struct archive_entry* m_archive_entry;
la_int64_t m_data_block_pos_in_file;
void const* m_data_block;
size_t m_data_block_length;
la_int64_t m_pos_in_data_block;
bool m_reached_eof;
struct archive_entry* m_archive_entry{nullptr};
la_int64_t m_data_block_pos_in_file{0};
void const* m_data_block{nullptr};
size_t m_data_block_length{0};
la_int64_t m_pos_in_data_block{0};
bool m_reached_eof{false};

size_t m_pos_in_file;
size_t m_pos_in_file{0};

// Nulls for peek
std::array<char, 4096> m_nulls_for_peek{0};
Expand Down
16 changes: 9 additions & 7 deletions components/core/src/clp/clp/FileCompressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <algorithm>
#include <iostream>
#include <set>
#include <span>

#include <archive_entry.h>
#include <boost/algorithm/string.hpp>
Expand Down Expand Up @@ -343,7 +344,7 @@ bool FileCompressor::try_compressing_as_archive(
m_libarchive_reader.open_file_reader(m_libarchive_file_reader);

// Check that file is UTF-8 encoded
if (auto error_code = m_libarchive_file_reader.try_load_data_block();
if (auto error_code = m_libarchive_file_reader.try_load_nonempty_data_block();
ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code)
{
SPDLOG_ERROR(
Expand All @@ -355,12 +356,10 @@ bool FileCompressor::try_compressing_as_archive(
succeeded = false;
continue;
}
char const* utf8_validation_buf{nullptr};
size_t peek_size{0};
m_libarchive_file_reader.peek_buffered_data(utf8_validation_buf, peek_size);
auto const utf8_validation_buf{m_libarchive_file_reader.peek_buffered_data()};
string file_path{m_libarchive_reader.get_path()};
auto utf8_validation_buf_len = std::min(peek_size, cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) {
auto utf8_validation_buf_len = std::min(utf8_validation_buf.size(), cUtfMaxValidationLen);
if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf.data())) {
auto boost_path_for_compression = parent_boost_path / file_path;
if (use_heuristic) {
parse_and_encode_with_heuristic(
Expand All @@ -383,7 +382,10 @@ bool FileCompressor::try_compressing_as_archive(
m_libarchive_file_reader
);
}
} else if (has_ir_stream_magic_number({utf8_validation_buf, peek_size})) {
} else if (has_ir_stream_magic_number(
{utf8_validation_buf.data(), utf8_validation_buf.size()}
))
{
// Remove .clp suffix if found
static constexpr char cIrStreamExtension[] = ".clp";
if (boost::iends_with(file_path, cIrStreamExtension)) {
Expand Down
Loading