Skip to content

Commit

Permalink
Make sure that the ROOT writers enforce consistency for the Frame con…
Browse files Browse the repository at this point in the history
…tents they write (#513)

* Make sure the contents of Frames in one category are consistent

* Refactor RNTupleWriter internals

* Minimize number of internal maps in RNTuple writer

* Add possibility to check consistency of frames before writing
  • Loading branch information
tmadlener authored Nov 14, 2023
1 parent 0602f32 commit 8ebec76
Show file tree
Hide file tree
Showing 8 changed files with 310 additions and 33 deletions.
16 changes: 16 additions & 0 deletions include/podio/ROOTFrameWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ class ROOTFrameWriter {
*/
void finish();

/** Check whether the collsToWrite are consistent with the state of the passed
* category.
*
* Return two vectors of collection names. The first one contains all the
* names that were missing from the collsToWrite but were present in the
* category. The second one contains the names that are present in the
* collsToWrite only. If both vectors are empty the category and the passed
* collsToWrite are consistent.
*
* NOTE: This will only be a meaningful check if the first Frame of the passed
* category has already been written. Also, this check is rather expensive as
* it has to effectively do two set differences.
*/
std::tuple<std::vector<std::string>, std::vector<std::string>>
checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;

private:
using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;

Expand Down
25 changes: 20 additions & 5 deletions include/podio/ROOTNTupleWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,44 @@ class ROOTNTupleWriter {
void writeFrame(const podio::Frame& frame, const std::string& category, const std::vector<std::string>& collsToWrite);
void finish();

/** Check whether the collsToWrite are consistent with the state of the passed
* category.
*
* Return two vectors of collection names. The first one contains all the
* names that were missing from the collsToWrite but were present in the
* category. The second one contains the names that are present in the
* collsToWrite only. If both vectors are empty the category and the passed
* collsToWrite are consistent.
*
* NOTE: This will only be a meaningful check if the first Frame of the passed
* category has already been written. Also, this check is rather expensive as
* it has to effectively do two set differences.
*/
std::tuple<std::vector<std::string>, std::vector<std::string>>
checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;

private:
using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;
std::unique_ptr<ROOT::Experimental::RNTupleModel> createModels(const std::vector<StoreCollection>& collections);

std::unique_ptr<ROOT::Experimental::RNTupleModel> m_metadata{};
std::unordered_map<std::string, std::unique_ptr<ROOT::Experimental::RNTupleWriter>> m_writers{};
std::unique_ptr<ROOT::Experimental::RNTupleWriter> m_metadataWriter{};

std::unique_ptr<TFile> m_file{};

DatamodelDefinitionCollector m_datamodelCollector{};

struct CollectionInfo {
std::vector<unsigned int> id{};
std::vector<uint32_t> id{};
std::vector<std::string> name{};
std::vector<std::string> type{};
std::vector<short> isSubsetCollection{};
std::vector<SchemaVersionT> schemaVersion{};
std::unique_ptr<ROOT::Experimental::RNTupleWriter> writer{nullptr};
};
CollectionInfo& getCategoryInfo(const std::string& category);

std::unordered_map<std::string, CollectionInfo> m_collectionInfo{};

std::set<std::string> m_categories{};
std::unordered_map<std::string, CollectionInfo> m_categories{};

bool m_finished{false};

Expand Down
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ PODIO_ADD_LIB_AND_DICT(podioRootIO "${root_headers}" "${root_sources}" root_sele
target_link_libraries(podioRootIO PUBLIC podio::podio ROOT::Core ROOT::RIO ROOT::Tree)
if(ENABLE_RNTUPLE)
target_link_libraries(podioRootIO PUBLIC ROOT::ROOTNTuple)
target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=1)
else()
target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=0)
endif()


Expand Down
26 changes: 24 additions & 2 deletions src/ROOTFrameWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
collections.reserve(catInfo.collsToWrite.size());
for (const auto& name : catInfo.collsToWrite) {
auto* coll = frame.getCollectionForWrite(name);
if (!coll) {
// Make sure all collections that we want to write are actually available
// NOLINTNEXTLINE(performance-inefficient-string-concatenation)
throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
}
collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));

m_datamodelCollector.registerDatamodelDefinition(coll, name);
}

// We will at least have a parameters branch, even if there are no
Expand All @@ -52,6 +55,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
initBranches(catInfo, collections, const_cast<podio::GenericParameters&>(frame.getParameters()));

} else {
// Make sure that the category contents are consistent with the initial
// frame in the category
if (!root_utils::checkConsistentColls(catInfo.collsToWrite, collsToWrite)) {
throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
root_utils::getInconsistentCollsMsg(catInfo.collsToWrite, collsToWrite));
}
resetBranches(catInfo.branches, collections, &const_cast<podio::GenericParameters&>(frame.getParameters()));
}

Expand All @@ -73,6 +82,10 @@ void ROOTFrameWriter::initBranches(CategoryInfo& catInfo, const std::vector<Stor

// First collections
for (auto& [name, coll] : collections) {
// For the first entry in each category we also record the datamodel
// definition
m_datamodelCollector.registerDatamodelDefinition(coll, name);

root_utils::CollectionBranches branches;
const auto buffers = coll->getBuffers();
// For subset collections we only fill one references branch
Expand Down Expand Up @@ -153,4 +166,13 @@ void ROOTFrameWriter::finish() {
m_finished = true;
}

std::tuple<std::vector<std::string>, std::vector<std::string>>
ROOTFrameWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
if (const auto it = m_categories.find(category); it != m_categories.end()) {
return root_utils::getInconsistentColls(it->second.collsToWrite, collsToWrite);
}

return {std::vector<std::string>{}, collsToWrite};
}

} // namespace podio
89 changes: 64 additions & 25 deletions src/ROOTNTupleWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,50 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&

void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& category,
const std::vector<std::string>& collsToWrite) {
auto& catInfo = getCategoryInfo(category);

// Use the writer as proxy to check whether this category has been initialized
// already and do so if not
const bool new_category = (catInfo.writer == nullptr);
if (new_category) {
// This is the minimal information that we need for now
catInfo.name = root_utils::sortAlphabeticaly(collsToWrite);
}

std::vector<StoreCollection> collections;
collections.reserve(collsToWrite.size());
for (const auto& name : collsToWrite) {
collections.reserve(catInfo.name.size());
// Only loop over the collections that were requested in the first Frame of
// this category
for (const auto& name : catInfo.name) {
auto* coll = frame.getCollectionForWrite(name);
if (!coll) {
// Make sure all collections that we want to write are actually available
// NOLINTNEXTLINE(performance-inefficient-string-concatenation)
throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
}

collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
}

bool new_category = false;
if (m_writers.find(category) == m_writers.end()) {
new_category = true;
if (new_category) {
// Now we have enough info to populate the rest
auto model = createModels(collections);
m_writers[category] = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
catInfo.writer = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});

for (const auto& [name, coll] : collections) {
catInfo.id.emplace_back(coll->getID());
catInfo.type.emplace_back(coll->getTypeName());
catInfo.isSubsetCollection.emplace_back(coll->isSubsetCollection());
catInfo.schemaVersion.emplace_back(coll->getSchemaVersion());
}
} else {
if (!root_utils::checkConsistentColls(catInfo.name, collsToWrite)) {
throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
root_utils::getInconsistentCollsMsg(catInfo.name, collsToWrite));
}
}

auto entry = m_writers[category]->GetModel()->CreateBareEntry();
auto entry = m_categories[category].writer->GetModel()->CreateBareEntry();

ROOT::Experimental::RNTupleWriteOptions options;
options.SetCompression(ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose);
Expand Down Expand Up @@ -121,14 +149,6 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
// Not supported
// entry->CaptureValueUnsafe(root_utils::paramBranchName,
// &const_cast<podio::GenericParameters&>(frame.getParameters()));

if (new_category) {
m_collectionInfo[category].id.emplace_back(coll->getID());
m_collectionInfo[category].name.emplace_back(name);
m_collectionInfo[category].type.emplace_back(coll->getTypeName());
m_collectionInfo[category].isSubsetCollection.emplace_back(coll->isSubsetCollection());
m_collectionInfo[category].schemaVersion.emplace_back(coll->getSchemaVersion());
}
}

auto params = frame.getParameters();
Expand All @@ -137,8 +157,7 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
fillParams<double>(params, entry.get());
fillParams<std::string>(params, entry.get());

m_writers[category]->Fill(*entry);
m_categories.insert(category);
m_categories[category].writer->Fill(*entry);
}

std::unique_ptr<ROOT::Experimental::RNTupleModel>
Expand Down Expand Up @@ -215,6 +234,15 @@ ROOTNTupleWriter::createModels(const std::vector<StoreCollection>& collections)
return model;
}

ROOTNTupleWriter::CollectionInfo& ROOTNTupleWriter::getCategoryInfo(const std::string& category) {
if (auto it = m_categories.find(category); it != m_categories.end()) {
return it->second;
}

auto [it, _] = m_categories.try_emplace(category, CollectionInfo{});
return it->second;
}

void ROOTNTupleWriter::finish() {

auto podioVersion = podio::version::build_version;
Expand All @@ -227,21 +255,21 @@ void ROOTNTupleWriter::finish() {
*edmField = edmDefinitions;

auto availableCategoriesField = m_metadata->MakeField<std::vector<std::string>>(root_utils::availableCategories);
for (auto& [c, _] : m_collectionInfo) {
for (auto& [c, _] : m_categories) {
availableCategoriesField->push_back(c);
}

for (auto& category : m_categories) {
for (auto& [category, collInfo] : m_categories) {
auto idField = m_metadata->MakeField<std::vector<unsigned int>>({root_utils::idTableName(category)});
*idField = m_collectionInfo[category].id;
*idField = collInfo.id;
auto collectionNameField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collectionName(category)});
*collectionNameField = m_collectionInfo[category].name;
*collectionNameField = collInfo.name;
auto collectionTypeField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collInfoName(category)});
*collectionTypeField = m_collectionInfo[category].type;
*collectionTypeField = collInfo.type;
auto subsetCollectionField = m_metadata->MakeField<std::vector<short>>({root_utils::subsetCollection(category)});
*subsetCollectionField = m_collectionInfo[category].isSubsetCollection;
*subsetCollectionField = collInfo.isSubsetCollection;
auto schemaVersionField = m_metadata->MakeField<std::vector<SchemaVersionT>>({"schemaVersion_" + category});
*schemaVersionField = m_collectionInfo[category].schemaVersion;
*schemaVersionField = collInfo.schemaVersion;
}

m_metadata->Freeze();
Expand All @@ -254,10 +282,21 @@ void ROOTNTupleWriter::finish() {

// All the tuple writers must be deleted before the file so that they flush
// unwritten output
m_writers.clear();
for (auto& [_, catInfo] : m_categories) {
catInfo.writer.reset();
}
m_metadataWriter.reset();

m_finished = true;
}

std::tuple<std::vector<std::string>, std::vector<std::string>>
ROOTNTupleWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
if (const auto it = m_categories.find(category); it != m_categories.end()) {
return root_utils::getInconsistentColls(it->second.name, collsToWrite);
}

return {std::vector<std::string>{}, collsToWrite};
}

} // namespace podio
82 changes: 82 additions & 0 deletions src/rootUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <string>
#include <string_view>
#include <tuple>
Expand Down Expand Up @@ -272,6 +273,87 @@ inline std::vector<std::string> sortAlphabeticaly(std::vector<std::string> strin
return strings;
}

/**
* Check whether existingColls and candidateColls both contain the same
* collection names. Returns false if the two vectors differ in content. Inputs
* can have random order wrt each other, but the assumption is that each vector
* only contains unique names.
*/
inline bool checkConsistentColls(const std::vector<std::string>& existingColls,
const std::vector<std::string>& candidateColls) {
if (existingColls.size() != candidateColls.size()) {
return false;
}

// Since we are guaranteed to have unique names here, we can just look for
// collisions brute force, which seems to be quickest approach for vector
// sizes we typically have (few hundred). We can take advantage of the fact
// that the existingColls are ordered (alphabetically and case-insensitive),
// so we can do a binary_search
for (const auto& id : candidateColls) {
if (!std::binary_search(existingColls.begin(), existingColls.end(), id, [](const auto& lhs, const auto& rhs) {
return lhs.size() == rhs.size() &&
std::lexicographical_compare(
lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
[](const auto cl, const auto cr) { return std::tolower(cl) < std::tolower(cr); });
})) {
return false;
}
}

return true;
}

/**
* Get the differences in the existingColls and candidateColls collection names.
* Returns two vectors of collection names. The first one are the collections
* that only exist in the existingColls, the seconde one are the names that only
* exist in the candidateColls.
*/
inline std::tuple<std::vector<std::string>, std::vector<std::string>>
getInconsistentColls(std::vector<std::string> existingColls, std::vector<std::string> candidateColls) {
// Need sorted ranges for set_difference
std::sort(existingColls.begin(), existingColls.end());
std::sort(candidateColls.begin(), candidateColls.end());

std::vector<std::string> onlyInExisting{};
std::set_difference(existingColls.begin(), existingColls.end(), candidateColls.begin(), candidateColls.end(),
std::back_inserter(onlyInExisting));

std::vector<std::string> onlyInCands{};
std::set_difference(candidateColls.begin(), candidateColls.end(), existingColls.begin(), existingColls.end(),
std::back_inserter(onlyInCands));

return {std::move(onlyInExisting), std::move(onlyInCands)};
}

inline std::string getInconsistentCollsMsg(const std::vector<std::string>& existingColls,
const std::vector<std::string>& candidateColls) {
const auto& [onlyExisting, onlyCands] = getInconsistentColls(existingColls, candidateColls);

std::stringstream sstr;
std::string sep = "";
if (!onlyExisting.empty()) {
sstr << "missing: [";
for (const auto& name : onlyExisting) {
sstr << sep << name;
sep = ",";
}
sstr << "]";
}
if (!onlyCands.empty()) {
sstr << sep << " superfluous: [";
sep = "";
for (const auto& name : onlyCands) {
sstr << sep << name;
sep = ",";
}
sstr << "]";
}

return sstr.str();
}

} // namespace podio::root_utils

#endif
2 changes: 1 addition & 1 deletion tests/unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ else()
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
TEST_PREFIX "UT_" # make it possible to filter easily with -R ^UT
TEST_SPEC ${filter_tests} # discover only tests that are known to not fail
DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:${PROJECT_BINARY_DIR}/tests:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
PROPERTIES
ENVIRONMENT
PODIO_SIOBLOCK_PATH=${CMAKE_CURRENT_BINARY_DIR}
Expand Down
Loading

0 comments on commit 8ebec76

Please sign in to comment.