From 8ebec767ef0030f91e1cb563d7f91c45a05614c5 Mon Sep 17 00:00:00 2001 From: Thomas Madlener Date: Tue, 14 Nov 2023 12:34:53 +0100 Subject: [PATCH] Make sure that the ROOT writers enforce consistency for the Frame contents they write (#513) * Make sure the contents of Frames in one category are consistent * Refactor RNTupleWriter internals * Minimize number of internal maps in RNTuple writer * Add possibility to check consistency of frames before writing --- include/podio/ROOTFrameWriter.h | 16 +++++ include/podio/ROOTNTupleWriter.h | 25 ++++++-- src/CMakeLists.txt | 3 + src/ROOTFrameWriter.cc | 26 +++++++- src/ROOTNTupleWriter.cc | 89 +++++++++++++++++++-------- src/rootUtils.h | 82 +++++++++++++++++++++++++ tests/unittests/CMakeLists.txt | 2 +- tests/unittests/unittest.cpp | 100 +++++++++++++++++++++++++++++++ 8 files changed, 310 insertions(+), 33 deletions(-) diff --git a/include/podio/ROOTFrameWriter.h b/include/podio/ROOTFrameWriter.h index addc22692..d6b7ecc8c 100644 --- a/include/podio/ROOTFrameWriter.h +++ b/include/podio/ROOTFrameWriter.h @@ -49,6 +49,22 @@ class ROOTFrameWriter { */ void finish(); + /** Check whether the collsToWrite are consistent with the state of the passed + * category. + * + * Return two vectors of collection names. The first one contains all the + * names that were missing from the collsToWrite but were present in the + * category. The second one contains the names that are present in the + * collsToWrite only. If both vectors are empty the category and the passed + * collsToWrite are consistent. + * + * NOTE: This will only be a meaningful check if the first Frame of the passed + * category has already been written. Also, this check is rather expensive as + * it has to effectively do two set differences. + */ + std::tuple, std::vector> + checkConsistency(const std::vector& collsToWrite, const std::string& category) const; + private: using StoreCollection = std::pair; diff --git a/include/podio/ROOTNTupleWriter.h b/include/podio/ROOTNTupleWriter.h index 0f6f6d466..6a1b4cd71 100644 --- a/include/podio/ROOTNTupleWriter.h +++ b/include/podio/ROOTNTupleWriter.h @@ -32,12 +32,27 @@ class ROOTNTupleWriter { void writeFrame(const podio::Frame& frame, const std::string& category, const std::vector& collsToWrite); void finish(); + /** Check whether the collsToWrite are consistent with the state of the passed + * category. + * + * Return two vectors of collection names. The first one contains all the + * names that were missing from the collsToWrite but were present in the + * category. The second one contains the names that are present in the + * collsToWrite only. If both vectors are empty the category and the passed + * collsToWrite are consistent. + * + * NOTE: This will only be a meaningful check if the first Frame of the passed + * category has already been written. Also, this check is rather expensive as + * it has to effectively do two set differences. + */ + std::tuple, std::vector> + checkConsistency(const std::vector& collsToWrite, const std::string& category) const; + private: using StoreCollection = std::pair; std::unique_ptr createModels(const std::vector& collections); std::unique_ptr m_metadata{}; - std::unordered_map> m_writers{}; std::unique_ptr m_metadataWriter{}; std::unique_ptr m_file{}; @@ -45,16 +60,16 @@ class ROOTNTupleWriter { DatamodelDefinitionCollector m_datamodelCollector{}; struct CollectionInfo { - std::vector id{}; + std::vector id{}; std::vector name{}; std::vector type{}; std::vector isSubsetCollection{}; std::vector schemaVersion{}; + std::unique_ptr writer{nullptr}; }; + CollectionInfo& getCategoryInfo(const std::string& category); - std::unordered_map m_collectionInfo{}; - - std::set m_categories{}; + std::unordered_map m_categories{}; bool m_finished{false}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8cdd0d813..a68b94bfb 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -109,6 +109,9 @@ PODIO_ADD_LIB_AND_DICT(podioRootIO "${root_headers}" "${root_sources}" root_sele target_link_libraries(podioRootIO PUBLIC podio::podio ROOT::Core ROOT::RIO ROOT::Tree) if(ENABLE_RNTUPLE) target_link_libraries(podioRootIO PUBLIC ROOT::ROOTNTuple) + target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=1) +else() + target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=0) endif() diff --git a/src/ROOTFrameWriter.cc b/src/ROOTFrameWriter.cc index 48a47b482..20e6bee53 100644 --- a/src/ROOTFrameWriter.cc +++ b/src/ROOTFrameWriter.cc @@ -41,9 +41,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c collections.reserve(catInfo.collsToWrite.size()); for (const auto& name : catInfo.collsToWrite) { auto* coll = frame.getCollectionForWrite(name); + if (!coll) { + // Make sure all collections that we want to write are actually available + // NOLINTNEXTLINE(performance-inefficient-string-concatenation) + throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame"); + } collections.emplace_back(name, const_cast(coll)); - - m_datamodelCollector.registerDatamodelDefinition(coll, name); } // We will at least have a parameters branch, even if there are no @@ -52,6 +55,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c initBranches(catInfo, collections, const_cast(frame.getParameters())); } else { + // Make sure that the category contents are consistent with the initial + // frame in the category + if (!root_utils::checkConsistentColls(catInfo.collsToWrite, collsToWrite)) { + throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " + + root_utils::getInconsistentCollsMsg(catInfo.collsToWrite, collsToWrite)); + } resetBranches(catInfo.branches, collections, &const_cast(frame.getParameters())); } @@ -73,6 +82,10 @@ void ROOTFrameWriter::initBranches(CategoryInfo& catInfo, const std::vectorgetBuffers(); // For subset collections we only fill one references branch @@ -153,4 +166,13 @@ void ROOTFrameWriter::finish() { m_finished = true; } +std::tuple, std::vector> +ROOTFrameWriter::checkConsistency(const std::vector& collsToWrite, const std::string& category) const { + if (const auto it = m_categories.find(category); it != m_categories.end()) { + return root_utils::getInconsistentColls(it->second.collsToWrite, collsToWrite); + } + + return {std::vector{}, collsToWrite}; +} + } // namespace podio diff --git a/src/ROOTNTupleWriter.cc b/src/ROOTNTupleWriter.cc index 741af53b3..9cf3b9d8d 100644 --- a/src/ROOTNTupleWriter.cc +++ b/src/ROOTNTupleWriter.cc @@ -64,22 +64,50 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& category, const std::vector& collsToWrite) { + auto& catInfo = getCategoryInfo(category); + + // Use the writer as proxy to check whether this category has been initialized + // already and do so if not + const bool new_category = (catInfo.writer == nullptr); + if (new_category) { + // This is the minimal information that we need for now + catInfo.name = root_utils::sortAlphabeticaly(collsToWrite); + } std::vector collections; - collections.reserve(collsToWrite.size()); - for (const auto& name : collsToWrite) { + collections.reserve(catInfo.name.size()); + // Only loop over the collections that were requested in the first Frame of + // this category + for (const auto& name : catInfo.name) { auto* coll = frame.getCollectionForWrite(name); + if (!coll) { + // Make sure all collections that we want to write are actually available + // NOLINTNEXTLINE(performance-inefficient-string-concatenation) + throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame"); + } + collections.emplace_back(name, const_cast(coll)); } - bool new_category = false; - if (m_writers.find(category) == m_writers.end()) { - new_category = true; + if (new_category) { + // Now we have enough info to populate the rest auto model = createModels(collections); - m_writers[category] = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {}); + catInfo.writer = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {}); + + for (const auto& [name, coll] : collections) { + catInfo.id.emplace_back(coll->getID()); + catInfo.type.emplace_back(coll->getTypeName()); + catInfo.isSubsetCollection.emplace_back(coll->isSubsetCollection()); + catInfo.schemaVersion.emplace_back(coll->getSchemaVersion()); + } + } else { + if (!root_utils::checkConsistentColls(catInfo.name, collsToWrite)) { + throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " + + root_utils::getInconsistentCollsMsg(catInfo.name, collsToWrite)); + } } - auto entry = m_writers[category]->GetModel()->CreateBareEntry(); + auto entry = m_categories[category].writer->GetModel()->CreateBareEntry(); ROOT::Experimental::RNTupleWriteOptions options; options.SetCompression(ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose); @@ -121,14 +149,6 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& // Not supported // entry->CaptureValueUnsafe(root_utils::paramBranchName, // &const_cast(frame.getParameters())); - - if (new_category) { - m_collectionInfo[category].id.emplace_back(coll->getID()); - m_collectionInfo[category].name.emplace_back(name); - m_collectionInfo[category].type.emplace_back(coll->getTypeName()); - m_collectionInfo[category].isSubsetCollection.emplace_back(coll->isSubsetCollection()); - m_collectionInfo[category].schemaVersion.emplace_back(coll->getSchemaVersion()); - } } auto params = frame.getParameters(); @@ -137,8 +157,7 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& fillParams(params, entry.get()); fillParams(params, entry.get()); - m_writers[category]->Fill(*entry); - m_categories.insert(category); + m_categories[category].writer->Fill(*entry); } std::unique_ptr @@ -215,6 +234,15 @@ ROOTNTupleWriter::createModels(const std::vector& collections) return model; } +ROOTNTupleWriter::CollectionInfo& ROOTNTupleWriter::getCategoryInfo(const std::string& category) { + if (auto it = m_categories.find(category); it != m_categories.end()) { + return it->second; + } + + auto [it, _] = m_categories.try_emplace(category, CollectionInfo{}); + return it->second; +} + void ROOTNTupleWriter::finish() { auto podioVersion = podio::version::build_version; @@ -227,21 +255,21 @@ void ROOTNTupleWriter::finish() { *edmField = edmDefinitions; auto availableCategoriesField = m_metadata->MakeField>(root_utils::availableCategories); - for (auto& [c, _] : m_collectionInfo) { + for (auto& [c, _] : m_categories) { availableCategoriesField->push_back(c); } - for (auto& category : m_categories) { + for (auto& [category, collInfo] : m_categories) { auto idField = m_metadata->MakeField>({root_utils::idTableName(category)}); - *idField = m_collectionInfo[category].id; + *idField = collInfo.id; auto collectionNameField = m_metadata->MakeField>({root_utils::collectionName(category)}); - *collectionNameField = m_collectionInfo[category].name; + *collectionNameField = collInfo.name; auto collectionTypeField = m_metadata->MakeField>({root_utils::collInfoName(category)}); - *collectionTypeField = m_collectionInfo[category].type; + *collectionTypeField = collInfo.type; auto subsetCollectionField = m_metadata->MakeField>({root_utils::subsetCollection(category)}); - *subsetCollectionField = m_collectionInfo[category].isSubsetCollection; + *subsetCollectionField = collInfo.isSubsetCollection; auto schemaVersionField = m_metadata->MakeField>({"schemaVersion_" + category}); - *schemaVersionField = m_collectionInfo[category].schemaVersion; + *schemaVersionField = collInfo.schemaVersion; } m_metadata->Freeze(); @@ -254,10 +282,21 @@ void ROOTNTupleWriter::finish() { // All the tuple writers must be deleted before the file so that they flush // unwritten output - m_writers.clear(); + for (auto& [_, catInfo] : m_categories) { + catInfo.writer.reset(); + } m_metadataWriter.reset(); m_finished = true; } +std::tuple, std::vector> +ROOTNTupleWriter::checkConsistency(const std::vector& collsToWrite, const std::string& category) const { + if (const auto it = m_categories.find(category); it != m_categories.end()) { + return root_utils::getInconsistentColls(it->second.name, collsToWrite); + } + + return {std::vector{}, collsToWrite}; +} + } // namespace podio diff --git a/src/rootUtils.h b/src/rootUtils.h index 523d5c228..acc552f24 100644 --- a/src/rootUtils.h +++ b/src/rootUtils.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -272,6 +273,87 @@ inline std::vector sortAlphabeticaly(std::vector strin return strings; } +/** + * Check whether existingColls and candidateColls both contain the same + * collection names. Returns false if the two vectors differ in content. Inputs + * can have random order wrt each other, but the assumption is that each vector + * only contains unique names. + */ +inline bool checkConsistentColls(const std::vector& existingColls, + const std::vector& candidateColls) { + if (existingColls.size() != candidateColls.size()) { + return false; + } + + // Since we are guaranteed to have unique names here, we can just look for + // collisions brute force, which seems to be quickest approach for vector + // sizes we typically have (few hundred). We can take advantage of the fact + // that the existingColls are ordered (alphabetically and case-insensitive), + // so we can do a binary_search + for (const auto& id : candidateColls) { + if (!std::binary_search(existingColls.begin(), existingColls.end(), id, [](const auto& lhs, const auto& rhs) { + return lhs.size() == rhs.size() && + std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), + [](const auto cl, const auto cr) { return std::tolower(cl) < std::tolower(cr); }); + })) { + return false; + } + } + + return true; +} + +/** + * Get the differences in the existingColls and candidateColls collection names. + * Returns two vectors of collection names. The first one are the collections + * that only exist in the existingColls, the seconde one are the names that only + * exist in the candidateColls. + */ +inline std::tuple, std::vector> +getInconsistentColls(std::vector existingColls, std::vector candidateColls) { + // Need sorted ranges for set_difference + std::sort(existingColls.begin(), existingColls.end()); + std::sort(candidateColls.begin(), candidateColls.end()); + + std::vector onlyInExisting{}; + std::set_difference(existingColls.begin(), existingColls.end(), candidateColls.begin(), candidateColls.end(), + std::back_inserter(onlyInExisting)); + + std::vector onlyInCands{}; + std::set_difference(candidateColls.begin(), candidateColls.end(), existingColls.begin(), existingColls.end(), + std::back_inserter(onlyInCands)); + + return {std::move(onlyInExisting), std::move(onlyInCands)}; +} + +inline std::string getInconsistentCollsMsg(const std::vector& existingColls, + const std::vector& candidateColls) { + const auto& [onlyExisting, onlyCands] = getInconsistentColls(existingColls, candidateColls); + + std::stringstream sstr; + std::string sep = ""; + if (!onlyExisting.empty()) { + sstr << "missing: ["; + for (const auto& name : onlyExisting) { + sstr << sep << name; + sep = ","; + } + sstr << "]"; + } + if (!onlyCands.empty()) { + sstr << sep << " superfluous: ["; + sep = ""; + for (const auto& name : onlyCands) { + sstr << sep << name; + sep = ","; + } + sstr << "]"; + } + + return sstr.str(); +} + } // namespace podio::root_utils #endif diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt index 4988ab36d..c49157b3f 100644 --- a/tests/unittests/CMakeLists.txt +++ b/tests/unittests/CMakeLists.txt @@ -88,7 +88,7 @@ else() WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} TEST_PREFIX "UT_" # make it possible to filter easily with -R ^UT TEST_SPEC ${filter_tests} # discover only tests that are known to not fail - DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:$:$<$:$>:$ENV{LD_LIBRARY_PATH} + DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:${PROJECT_BINARY_DIR}/tests:$:$<$:$>:$ENV{LD_LIBRARY_PATH} PROPERTIES ENVIRONMENT PODIO_SIOBLOCK_PATH=${CMAKE_CURRENT_BINARY_DIR} diff --git a/tests/unittests/unittest.cpp b/tests/unittests/unittest.cpp index 8a313de28..c3f4eb6c7 100644 --- a/tests/unittests/unittest.cpp +++ b/tests/unittests/unittest.cpp @@ -8,11 +8,15 @@ #include #include "catch2/catch_test_macros.hpp" +#include "catch2/matchers/catch_matchers_string.hpp" +#include "catch2/matchers/catch_matchers_vector.hpp" // podio specific includes #include "podio/EventStore.h" +#include "podio/Frame.h" #include "podio/GenericParameters.h" #include "podio/ROOTFrameReader.h" +#include "podio/ROOTFrameWriter.h" #include "podio/ROOTLegacyReader.h" #include "podio/ROOTReader.h" #include "podio/podioVersion.h" @@ -26,6 +30,10 @@ #include "podio/SIOReader.h" #endif +#if PODIO_ENABLE_RNTUPLE + #include "podio/ROOTNTupleWriter.h" +#endif + // Test data types #include "datamodel/EventInfoCollection.h" #include "datamodel/ExampleClusterCollection.h" @@ -1134,3 +1142,95 @@ TEST_CASE("JSON", "[json]") { } #endif + +// Write a template function that can be used with different writers in order to +// be able to tag the unittests differently. This is necessary because the +// ROOTFrameWriter fails with ASan, but the ROOTNTuple writer doesn't +template +void runConsistentFrameTest(const std::string& filename) { + using Catch::Matchers::ContainsSubstring; + + podio::Frame frame; + + frame.put(ExampleClusterCollection(), "clusters"); + frame.put(ExampleClusterCollection(), "clusters2"); + frame.put(ExampleHitCollection(), "hits"); + + WriterT writer(filename); + writer.writeFrame(frame, "full"); + + // Write a frame with more collections + frame.put(ExampleHitCollection(), "hits2"); + REQUIRE_THROWS_WITH(writer.writeFrame(frame, "full"), + ContainsSubstring("Trying to write category") && + ContainsSubstring("inconsistent collection content") && + ContainsSubstring("superfluous: [hits2]")); + + // Write a frame with less collections + podio::Frame frame2; + frame2.put(ExampleClusterCollection(), "clusters"); + frame2.put(ExampleClusterCollection(), "clusters2"); + REQUIRE_THROWS_WITH(writer.writeFrame(frame2, "full"), + ContainsSubstring("Collection 'hits' in category") && + ContainsSubstring("not available in Frame")); + + // Write only a subset of collections + const std::vector collsToWrite = {"clusters", "hits"}; + writer.writeFrame(frame, "subset", collsToWrite); + + // Frame is missing a collection + REQUIRE_THROWS_AS(writer.writeFrame(frame2, "subset", collsToWrite), std::runtime_error); + + // Don't throw if frame contents are different, but the subset that is written + // is consistent + const std::vector otherCollsToWrite = {"clusters", "clusters2"}; + writer.writeFrame(frame, "subset2", otherCollsToWrite); + REQUIRE_NOTHROW(writer.writeFrame(frame2, "subset2", otherCollsToWrite)); + + // Make sure that restricting the second frame works. + // See https://github.com/AIDASoft/podio/issues/382 for the original issue + writer.writeFrame(frame2, "full_frame2"); + REQUIRE_NOTHROW(writer.writeFrame(frame, "full_frame2", frame2.getAvailableCollections())); +} + +template +void runCheckConsistencyTest(const std::string& filename) { + using Catch::Matchers::UnorderedEquals; + + WriterT writer(filename); + podio::Frame frame; + frame.put(ExampleClusterCollection(), "clusters"); + frame.put(ExampleClusterCollection(), "clusters2"); + frame.put(ExampleHitCollection(), "hits"); + writer.writeFrame(frame, "frame"); + + // Cumbersome way to get the collections that are used for this category + const auto& [categoryColls, emptyVec] = writer.checkConsistency({}, "frame"); + REQUIRE_THAT(categoryColls, UnorderedEquals({"clusters", "clusters2", "hits"})); + REQUIRE(emptyVec.empty()); + + const std::vector collsToWrite = {"clusters", "clusters2", "non-existant"}; + const auto& [missing, superfluous] = writer.checkConsistency(collsToWrite, "frame"); + REQUIRE_THAT(missing, UnorderedEquals({"hits"})); + REQUIRE_THAT(superfluous, UnorderedEquals({"non-existant"})); +} + +TEST_CASE("ROOTFrameWriter consistent frame contents", "[ASAN-FAIL][UBSAN-FAIL][THREAD-FAIL][basics][root]") { + // The UBSAN-FAIL and TSAN-FAIL only happens on clang12 in CI. + runConsistentFrameTest("unittests_frame_consistency.root"); +} + +TEST_CASE("ROOTFrameWriter check consistency", "[ASAN-FAIL][UBSAN-FAIL][basics][root]") { + runCheckConsistencyTest("unittests_frame_check_consistency.root"); +} + +#if PODIO_ENABLE_RNTUPLE +TEST_CASE("ROOTNTupleWriter consistent frame contents", "[basics][root]") { + runConsistentFrameTest("unittests_frame_consistency_rntuple.root"); +} + +TEST_CASE("ROOTNTupleWriter check consistency", "[basics][root]") { + runCheckConsistencyTest("unittests_frame_check_consistency_rntuple.root"); +} + +#endif