From c701f1f066136beb7612fc12782f62562d3d257e Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Tue, 4 Jun 2024 18:35:28 +0200 Subject: [PATCH] [df] Add initial RNTuple snapshot implementation This initial implementation only supports single-threaded snapshotting. When the original datasource is an RNTuple, the resulting snapshot will be an RNTuple by default. This can be changed to TTree through the RSnapshotOptions. Snapshotting from TTree to RNTuple is not yet supported in this version, but will be added in the future. Snapshotting from other data sources or dataframes created from scratch to RNTuple is supported. --- tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx | 191 +++++++- .../dataframe/inc/ROOT/RDF/InterfaceUtils.hxx | 45 +- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 133 ++++-- tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx | 4 + tree/dataframe/inc/ROOT/RSnapshotOptions.hxx | 9 +- tree/dataframe/src/RDFActionHelpers.cxx | 51 +++ tree/dataframe/src/RDFInterfaceUtils.cxx | 21 +- tree/dataframe/test/CMakeLists.txt | 1 + tree/dataframe/test/NTupleStruct.hxx | 6 + .../test/dataframe_snapshot_ntuple.cxx | 421 ++++++++++++++++++ 10 files changed, 826 insertions(+), 56 deletions(-) create mode 100644 tree/dataframe/test/dataframe_snapshot_ntuple.cxx diff --git a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx index 6ddc8c9525de72..3315c722f2e09b 100644 --- a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx @@ -35,7 +35,7 @@ #include "TClassRef.h" #include "TDirectory.h" #include "TError.h" // for R__ASSERT, Warning -#include "TFile.h" // for SnapshotHelper +#include "TFile.h" // for SnapshotHelper #include "TH1.h" #include "TGraph.h" #include "TGraphAsymmErrors.h" @@ -46,6 +46,12 @@ #include "TStatistic.h" #include "ROOT/RDF/RActionImpl.hxx" #include "ROOT/RDF/RMergeableValue.hxx" +#include "ROOT/RDF/RLoopManager.hxx" + +#ifdef R__HAS_ROOT7 +#include
"ROOT/RNTupleDS.hxx" +#include "ROOT/RNTupleWriter.hxx" // for SnapshotRNTupleHelper +#endif #include #include @@ -1379,8 +1385,8 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, const std::string &i branchAddress = nullptr; } -/// Helper function for SnapshotHelper and SnapshotHelperMT. It creates new branches for the output TTree of a Snapshot. -/// This overload is called for columns of type `RVec`. For RDF, these can represent: +/// Helper function for SnapshotHelper and SnapshotHelperMT. It creates new branches for the output TTree of a +/// Snapshot. This overload is called for columns of type `RVec`. For RDF, these can represent: /// 1. c-style arrays in ROOT files, so we are sure that there are input trees to which we can ask the correct branch /// title /// 2. RVecs coming from a custom column or the input file/data-source @@ -1483,7 +1489,7 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, const std::string &i void ValidateSnapshotOutput(const RSnapshotOptions &opts, const std::string &treeName, const std::string &fileName); -/// Helper object for a single-thread Snapshot action +/// Helper object for a single-thread TTree-based Snapshot action template class R__CLING_PTRCHECK(off) SnapshotHelper : public RActionImpl> { std::string fFileName; @@ -1507,9 +1513,15 @@ public: SnapshotHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine) - : fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fBranches(vbnames.size(), nullptr), - fBranchAddresses(vbnames.size(), nullptr), fIsDefine(std::move(isDefine)) + : fFileName(filename), + fDirName(dirname), + fTreeName(treename), + fOptions(options), + fInputBranchNames(vbnames), + fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), + 
fBranches(vbnames.size(), nullptr), + fBranchAddresses(vbnames.size(), nullptr), + fIsDefine(std::move(isDefine)) { ValidateSnapshotOutput(fOptions, fTreeName, fFileName); } @@ -1632,7 +1644,7 @@ public: } }; -/// Helper object for a multi-thread Snapshot action +/// Helper object for a multi-thread TTree-based Snapshot action template class R__CLING_PTRCHECK(off) SnapshotHelperMT : public RActionImpl> { unsigned int fNSlots; @@ -1659,11 +1671,20 @@ public: SnapshotHelperMT(const unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine) - : fNSlots(nSlots), fOutputFiles(fNSlots), fOutputTrees(fNSlots), fBranchAddressesNeedReset(fNSlots, 1), - fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fInputTrees(fNSlots), + : fNSlots(nSlots), + fOutputFiles(fNSlots), + fOutputTrees(fNSlots), + fBranchAddressesNeedReset(fNSlots, 1), + fFileName(filename), + fDirName(dirname), + fTreeName(treename), + fOptions(options), + fInputBranchNames(vbnames), + fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), + fInputTrees(fNSlots), fBranches(fNSlots, std::vector(vbnames.size(), nullptr)), - fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), fOutputBranches(fNSlots), + fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), + fOutputBranches(fNSlots), fIsDefine(std::move(isDefine)) { ValidateSnapshotOutput(fOptions, fTreeName, fFileName); @@ -1818,6 +1839,152 @@ public: } }; +#ifdef R__HAS_ROOT7 +void ValidateSnapshotRNTupleOutput(const RSnapshotOptions &opts, const std::string &ntupleName, + const std::string &fileName); + +/// Helper function to update the value of an RNTuple's field in the provided entry. 
+template +void SetFieldsHelper(T value, std::string_view fieldName, ROOT::Experimental::REntry *entry) +{ + entry->BindValue(fieldName, std::make_shared(value)); +} + +/// Helper object for a single-thread RNTuple-based Snapshot action +template +class R__CLING_PTRCHECK(off) SnapshotRNTupleHelper : public RActionImpl> { + std::string fFileName; + std::string fNTupleName; + + std::unique_ptr fOutFile = nullptr; + + RSnapshotOptions fOptions; + ROOT::Detail::RDF::RLoopManager *fLoopManager; + ColumnNames_t fInputFieldNames; // This contains the resolved aliases + ColumnNames_t fOutputFieldNames; + std::unique_ptr fWriter{nullptr}; + + ROOT::Experimental::REntry *fOutputEntry; + + std::vector fIsDefine; + +public: + using ColumnTypes_t = TypeList; + SnapshotRNTupleHelper(std::string_view filename, std::string_view ntuplename, const ColumnNames_t &vfnames, + const ColumnNames_t &fnames, const RSnapshotOptions &options, + ROOT::Detail::RDF::RLoopManager *lm, std::vector &&isDefine) + : fFileName(filename), + fNTupleName(ntuplename), + fOptions(options), + fLoopManager(lm), + fInputFieldNames(vfnames), + fOutputFieldNames(ReplaceDotWithUnderscore(fnames)), + fIsDefine(std::move(isDefine)) + { + ValidateSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName); + } + + SnapshotRNTupleHelper(const SnapshotRNTupleHelper &) = delete; + SnapshotRNTupleHelper(SnapshotRNTupleHelper &&) = default; + ~SnapshotRNTupleHelper() + { + if (!fNTupleName.empty() && !fLoopManager->GetDataSource() && fOptions.fLazy) + Warning("Snapshot", "A lazy Snapshot action was booked but never triggered."); + } + + void InitTask(TTreeReader *, unsigned int /* slot */) {} + + void Exec(unsigned int /* slot */, ColTypes &...values) + { + using ind_t = std::index_sequence_for; + + SetFields(values..., ind_t{}); + fWriter->Fill(); + } + + template + void SetFields(ColTypes &...values, std::index_sequence /*dummy*/) + { + int expander[] = {(SetFieldsHelper(values, fOutputFieldNames[S], fOutputEntry), 
0)..., 0}; + (void)expander; // avoid unused variable warnings for older compilers (gcc 14.1) + } + + void Initialize() + { + using ind_t = std::index_sequence_for; + + auto model = ROOT::Experimental::RNTupleModel::Create(); + MakeFields(*model, ind_t{}); + fOutputEntry = &model->GetDefaultEntry(); + + fOutFile = std::unique_ptr( + TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str() /* , cs */)); + if (!fOutFile) + throw std::runtime_error("Snapshot: could not create output file " + fFileName); + + ROOT::Experimental::RNTupleWriteOptions writeOptions; + writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); + + TString checkupdate = fOptions.fMode; + checkupdate.ToLower(); + + if (checkupdate == "update") { + fWriter = ROOT::Experimental::RNTupleWriter::Append(std::move(model), fNTupleName, *fOutFile, writeOptions); + } else { + // If the output file is not closed before the RNTupleWriter is created, no actual data will be written and + // the file will be empty. 
+ fOutFile->Close(); + fWriter = ROOT::Experimental::RNTupleWriter::Recreate(std::move(model), fNTupleName, fFileName, writeOptions); + } + } + + template + void MakeFields(ROOT::Experimental::RNTupleModel &model, std::index_sequence /*dummy*/) + { + int expander[] = {(model.MakeField(fOutputFieldNames[S]), 0)..., 0}; + (void)expander; // avoid unused variable warnings for older compilers (gcc 14.1) + } + + void Finalize() + { + { + fWriter.reset(); + } + fLoopManager->SetDataSource(std::make_unique(fNTupleName, fFileName)); + } + + std::string GetActionName() { return "Snapshot"; } + + ROOT::RDF::SampleCallback_t GetSampleCallback() final + { + return [](unsigned int, const RSampleInfo &) mutable {}; + } + + /** + * @brief Create a new SnapshotRNTupleHelper with a different output file name + * + * @param newName A type-erased string with the output file name + * @return SnapshotRNTupleHelper + * + * This MakeNew implementation is tied to the cloning feature of actions + * of the computation graph. In particular, cloning a Snapshot node usually + * also involves changing the name of the output file, otherwise the cloned + * Snapshot would overwrite the same file. 
+ */ + SnapshotRNTupleHelper MakeNew(void *newName) + { + const std::string finalName = *reinterpret_cast(newName); + return SnapshotRNTupleHelper{finalName, + fNTupleName, + fInputFieldNames, + fOutputFieldNames, + fOptions, + fLoopManager, + std::vector(fIsDefine)}; + } +}; +#endif + template ::value> class R__CLING_PTRCHECK(off) AggregateHelper diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index 7ab241761bc63a..d15c6f48e01799 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -250,6 +250,8 @@ struct SnapshotHelperArgs { std::string fTreeName; std::vector fOutputColNames; ROOT::RDF::RSnapshotOptions fOptions; + RDFDetail::RLoopManager *fLoopManager; + bool fToNTuple; }; // Snapshot action @@ -275,20 +277,39 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr isDefine = makeIsDefine(); std::unique_ptr actionPtr; - if (!ROOT::IsImplicitMTEnabled()) { - // single-thread snapshot - using Helper_t = SnapshotHelper; + if (snapHelperArgs->fToNTuple) { +#ifdef R__HAS_ROOT7 + // TODO(fdegeus) Add MT snapshotting + using Helper_t = SnapshotRNTupleHelper; using Action_t = RAction; + + auto loopManager = snapHelperArgs->fLoopManager; + actionPtr.reset( - new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + new Action_t(Helper_t(filename, treename, colNames, outputColNames, options, loopManager, std::move(isDefine)), colNames, prevNode, colRegister)); + + return actionPtr; +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been build with ROOT7 " + "components enabled."); +#endif } else { - // multi-thread snapshot - using Helper_t = SnapshotHelperMT; - using Action_t = RAction; - actionPtr.reset(new Action_t( - Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), - colNames, 
prevNode, colRegister)); + if (!ROOT::IsImplicitMTEnabled()) { + // single-thread snapshot + using Helper_t = SnapshotHelper; + using Action_t = RAction; + actionPtr.reset( + new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + colNames, prevNode, colRegister)); + } else { + // multi-thread snapshot + using Helper_t = SnapshotHelperMT; + using Action_t = RAction; + actionPtr.reset(new Action_t( + Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + colNames, prevNode, colRegister)); + } } return actionPtr; } @@ -781,6 +802,10 @@ AddSizeBranches(const std::vector &branches, TTree *tree, std::vect void RemoveDuplicates(ColumnNames_t &columnNames); +#ifdef R__HAS_ROOT7 +void RemoveRNTupleSubFields(ColumnNames_t &columnNames); +#endif + } // namespace RDF } // namespace Internal diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index e12c856be3048a..9d1286882bb529 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1161,29 +1161,62 @@ public: const auto &colListNoAliasesWithSizeBranches = pairOfColumnLists.first; const auto &colListWithAliasesAndSizeBranches = pairOfColumnLists.second; - const auto fullTreeName = treename; const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName); treename = parsedTreePath.fTreeName; const auto &dirname = parsedTreePath.fDirName; - auto snapHelperArgs = std::make_shared( - RDFInternal::SnapshotHelperArgs{std::string(filename), std::string(dirname), std::string(treename), - colListWithAliasesAndSizeBranches, options}); - ::TDirectory::TContext ctxt; - // The CreateLMFromTTree function by default opens the file passed as input - // to check for the presence of the TTree inside. But at this moment the - // filename we are using here corresponds to a file which does not exist yet, - // i.e. 
the output file of the Snapshot call. Thus, checkFile=false will - // prevent the function from trying to open a non-existent file. - auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( - fullTreeName, filename, colListNoAliasesWithSizeBranches, /*checkFile*/ false)); + RResultPtr> resPtr; - auto resPtr = CreateAction( - colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, - colListNoAliasesWithSizeBranches.size()); + bool isBasedOnRNTuple = fLoopManager->GetDataSource() && fLoopManager->GetDataSource()->GetLabel() == "RNTupleDS"; + + if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple || + (options.fOutputFormat == ESnapshotOutputFormat::kDefault && isBasedOnRNTuple)) { +#ifdef R__HAS_ROOT7 + if (fLoopManager->GetTree()) { + throw std::runtime_error( + "Snapshotting from TTree to RNTuple is not yet supported. The current recommended " + "way to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } + + auto newRDF = std::make_shared>(std::make_shared(0)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, newRDF->GetLoopManager(), true /* fToNTuple */}); + + // The Snapshot helper will use colListNoAliasesWithSizeBranches (with aliases resolved) as input columns, and + // colListWithAliasesAndSizeBranches (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction( + colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, + colListNoAliasesWithSizeBranches.size()); +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been build with ROOT7 " + "components enabled."); +#endif + } else { + // The CreateLMFromTTree function by default opens the file passed as input + // to check for the presence of the TTree inside. 
But at this moment the + // filename we are using here corresponds to a file which does not exist yet, + // i.e. the output file of the Snapshot call. Thus, checkFile=false will + // prevent the function from trying to open a non-existent file. + auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( + fullTreeName, filename, /*defaultColumns=*/colListNoPoundSizes, /*checkFile=*/false)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, nullptr, false /* fToNTuple */}); + + // The Snapshot helper will use colListNoAliasesWithSizeBranches (with aliases resolved) as input columns, and + // colListWithAliasesAndSizeBranches (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction( + colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, + colListNoAliasesWithSizeBranches.size()); + } if (!options.fLazy) *resPtr; @@ -1209,6 +1242,7 @@ public: { const auto definedColumns = fColRegister.GenerateColumnNames(); auto *tree = fLoopManager->GetTree(); + const auto treeBranchNames = tree != nullptr ? ROOT::Internal::TreeUtils::GetTopLevelBranchNames(*tree) : ColumnNames_t{}; const auto dsColumns = GetDataSource() ? GetDataSource()->GetColumnNames() : ColumnNames_t{}; // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those @@ -1225,7 +1259,12 @@ public: // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. 
RDFInternal::RemoveDuplicates(columnNames); - const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + + if (GetDataSource() && GetDataSource()->GetLabel() == "RNTupleDS") { + RDFInternal::RemoveRNTupleSubFields(selectedColumns); + } + return Snapshot(treename, filename, selectedColumns, options); } // clang-format on @@ -3002,23 +3041,55 @@ private: const auto &treename = parsedTreePath.fTreeName; const auto &dirname = parsedTreePath.fDirName; - auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ - std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options}); - ::TDirectory::TContext ctxt; - // The CreateLMFromTTree function by default opens the file passed as input - // to check for the presence of the TTree inside. But at this moment the - // filename we are using here corresponds to a file which does not exist yet, - // i.e. the output file of the Snapshot call. Thus, checkFile=false will - // prevent the function from trying to open a non-existent file. - auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( - fullTreeName, filename, /*defaultColumns=*/columnListWithoutSizeColumns, /*checkFile=*/false)); - - // The Snapshot helper will use validCols (with aliases resolved) as input columns, and - // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column names. 
- auto resPtr = CreateAction(validCols, newRDF, snapHelperArgs, - fProxiedPtr); + RResultPtr> resPtr; + + bool isBasedOnRNTuple = fLoopManager->GetDataSource() && fLoopManager->GetDataSource()->GetLabel() == "RNTupleDS"; + + if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple || + (options.fOutputFormat == ESnapshotOutputFormat::kDefault && isBasedOnRNTuple)) { +#ifdef R__HAS_ROOT7 + if (fLoopManager->GetTree()) { + throw std::runtime_error( + "Snapshotting from TTree to RNTuple is not yet supported. The current recommended " + "way to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } + + auto newRDF = std::make_shared>(std::make_shared(0)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options, + newRDF->GetLoopManager(), true /* fToRNTuple */}); + + // The Snapshot helper will use validCols (with aliases resolved) as input columns, and + // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction(validCols, newRDF, snapHelperArgs, + fProxiedPtr); +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been build with ROOT7 " + "components enabled."); +#endif + } else { + // The CreateLMFromTTree function by default opens the file passed as input + // to check for the presence of the TTree inside. But at this moment the + // filename we are using here corresponds to a file which does not exist yet, + // i.e. the output file of the Snapshot call. Thus, checkFile=false will + // prevent the function from trying to open a non-existent file. 
+ auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( + fullTreeName, filename, /*defaultColumns=*/columnListWithoutSizeColumns, /*checkFile=*/false)); + + auto snapHelperArgs = std::make_shared( + RDFInternal::SnapshotHelperArgs{std::string(filename), std::string(dirname), std::string(treename), + columnListWithoutSizeColumns, options, nullptr, false /* fToRNTuple */}); + + // The Snapshot helper will use validCols (with aliases resolved) as input columns, and + // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction(validCols, newRDF, snapHelperArgs, + fProxiedPtr); + } if (!options.fLazy) *resPtr; diff --git a/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx b/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx index 772703b47330cc..48af82c896fb21 100644 --- a/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx @@ -20,6 +20,10 @@ #include "ROOT/RDF/RSampleInfo.hxx" #include "ROOT/RDF/Utils.hxx" +#ifdef R__HAS_ROOT7 +#include "ROOT/RNTupleWriter.hxx" +#endif + #include #include #include diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index e921bf91502a93..fdf001c443df77 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -18,6 +18,8 @@ namespace ROOT { namespace RDF { +enum class ESnapshotOutputFormat { kDefault, kTTree, kRNTuple }; + /// A collection of options to steer the creation of the dataset on file struct RSnapshotOptions { using ECAlgo = ROOT::RCompressionSetting::EAlgorithm::EValues; @@ -25,14 +27,16 @@ struct RSnapshotOptions { RSnapshotOptions(const RSnapshotOptions &) = default; RSnapshotOptions(RSnapshotOptions &&) = default; RSnapshotOptions(std::string_view mode, ECAlgo comprAlgo, int comprLevel, int autoFlush, int splitLevel, bool lazy, - bool overwriteIfExists = false) + bool overwriteIfExists = 
false, + ESnapshotOutputFormat outputFormat = ESnapshotOutputFormat::kDefault) : fMode(mode), fCompressionAlgorithm(comprAlgo), fCompressionLevel{comprLevel}, fAutoFlush(autoFlush), fSplitLevel(splitLevel), fLazy(lazy), - fOverwriteIfExists(overwriteIfExists) + fOverwriteIfExists(overwriteIfExists), + fOutputFormat(outputFormat) { } std::string fMode = "RECREATE"; ///< Mode of creation of output file @@ -43,6 +47,7 @@ struct RSnapshotOptions { int fSplitLevel = 99; ///< Split level of output tree bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists + ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to }; } // namespace RDF } // namespace ROOT diff --git a/tree/dataframe/src/RDFActionHelpers.cxx b/tree/dataframe/src/RDFActionHelpers.cxx index 53f52cfedf44f0..085218f2bac4a3 100644 --- a/tree/dataframe/src/RDFActionHelpers.cxx +++ b/tree/dataframe/src/RDFActionHelpers.cxx @@ -10,6 +10,7 @@ #include "ROOT/RDF/ActionHelpers.hxx" #include "ROOT/RDF/Utils.hxx" // CacheLineStep +#include "ROOT/RNTuple.hxx" // ValidateSnapshotRNTupleOutput namespace ROOT { namespace Internal { @@ -242,6 +243,56 @@ void ValidateSnapshotOutput(const RSnapshotOptions &opts, const std::string &tre } } +#ifdef R__HAS_ROOT7 +void ValidateSnapshotRNTupleOutput(const RSnapshotOptions &opts, const std::string &ntupleName, + const std::string &fileName) +{ + TString fileMode = opts.fMode; + fileMode.ToLower(); + if (fileMode != "update") + return; + + // output file opened in "update" mode: must check whether output RNTuple is already present in file + std::unique_ptr outFile{TFile::Open(fileName.c_str(), "update")}; + if (!outFile || outFile->IsZombie()) + throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode"); + + auto *outNTuple = outFile->Get(ntupleName.c_str()); + + if 
(outNTuple) { + if (opts.fOverwriteIfExists) { + outFile->Delete(ntupleName.c_str()); + return; + } else { + const std::string msg = "Snapshot: RNTuple \"" + ntupleName + "\" already present in file \"" + fileName + + "\". If you want to delete the original ntuple and write another, please set " + "RSnapshotOptions::fOverwriteIfExists to true."; + throw std::invalid_argument(msg); + } + } + + // Also check if there is any object other than an RNTuple with the provided ntupleName. + TObject *outObj = outFile->Get(ntupleName.c_str()); + + if (!outObj) + return; + + // An object called ntupleName is already present in the file. + if (opts.fOverwriteIfExists) { + if (outObj->InheritsFrom("TTree")) { + static_cast(outObj)->Delete("all"); + } else { + outFile->Delete(ntupleName.c_str()); + } + } else { + const std::string msg = "Snapshot: object \"" + ntupleName + "\" already present in file \"" + fileName + + "\". If you want to delete the original object and write a new RNTuple, please set " + "RSnapshotOptions::fOverwriteIfExists to true."; + throw std::invalid_argument(msg); + } +} +#endif + } // end NS RDF } // end NS Internal } // end NS ROOT diff --git a/tree/dataframe/src/RDFInterfaceUtils.cxx b/tree/dataframe/src/RDFInterfaceUtils.cxx index 93c75199237ba3..52c55b34c85063 100644 --- a/tree/dataframe/src/RDFInterfaceUtils.cxx +++ b/tree/dataframe/src/RDFInterfaceUtils.cxx @@ -981,7 +981,7 @@ AddSizeBranches(const std::vector &branches, TTree *tree, std::vect assert(colsWithoutAliases.size() == colsWithAliases.size()); auto nCols = colsWithoutAliases.size(); - // Use index-iteration as we modify the vector during the iteration. + // Use index-iteration as we modify the vector during the iteration. 
for (std::size_t i = 0u; i < nCols; ++i) { const auto &colName = colsWithoutAliases[i]; if (!IsStrInVec(colName, branches)) @@ -1018,6 +1018,25 @@ void RemoveDuplicates(ColumnNames_t &columnNames) columnNames.end()); } +#ifdef R__HAS_ROOT7 +void RemoveRNTupleSubFields(ColumnNames_t &columnNames) +{ + ColumnNames_t parentFields; + + std::copy_if(columnNames.cbegin(), columnNames.cend(), std::back_inserter(parentFields), + [](const std::string &colName) { return colName.find('.') == std::string::npos; }); + + columnNames.erase(std::remove_if(columnNames.begin(), columnNames.end(), + [&parentFields](const std::string &colName) { + if (colName.find('.') == std::string::npos) + return false; + const auto parentFieldName = colName.substr(0, colName.find_first_of('.')); + return std::find(parentFields.cbegin(), parentFields.cend(), parentFieldName) != + parentFields.end(); + }), + columnNames.end()); +} +#endif } // namespace RDF } // namespace Internal } // namespace ROOT diff --git a/tree/dataframe/test/CMakeLists.txt b/tree/dataframe/test/CMakeLists.txt index fdb3876634721a..43004ebe2bc1eb 100644 --- a/tree/dataframe/test/CMakeLists.txt +++ b/tree/dataframe/test/CMakeLists.txt @@ -97,6 +97,7 @@ endif() if(root7) ROOT_ADD_GTEST(datasource_ntuple datasource_ntuple.cxx LIBRARIES ROOTDataFrame) + ROOT_ADD_GTEST(dataframe_snapshot_ntuple dataframe_snapshot_ntuple.cxx LIBRARIES ROOTDataFrame ROOTNTupleUtil) ROOT_STANDARD_LIBRARY_PACKAGE(NTupleStruct NO_INSTALL_HEADERS diff --git a/tree/dataframe/test/NTupleStruct.hxx b/tree/dataframe/test/NTupleStruct.hxx index 3842aa095ba21b..888dd087dbbf4d 100644 --- a/tree/dataframe/test/NTupleStruct.hxx +++ b/tree/dataframe/test/NTupleStruct.hxx @@ -1,6 +1,8 @@ #ifndef ROOT7_RDataFrame_Test_NTupleStruct #define ROOT7_RDataFrame_Test_NTupleStruct +#include + /** * Used to test serialization and deserialization of classes in RNTuple with TClass */ @@ -8,4 +10,8 @@ struct Electron { float pt; }; +struct Jet { + std::vector electrons; +}; + 
#endif diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx new file mode 100644 index 00000000000000..2bb088af1b063d --- /dev/null +++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx @@ -0,0 +1,421 @@ +#include "ROOT/TestSupport.hxx" +#include "ROOT/RDataFrame.hxx" + +#include "ROOT/RNTupleModel.hxx" +#include "ROOT/RNTupleWriter.hxx" +#include "ROOT/RNTupleReader.hxx" +#include "ROOT/RNTupleInspector.hxx" // For testing compression settings + +#include "TROOT.h" +#include "TSystem.h" + +#include "gtest/gtest.h" +#include "NTupleStruct.hxx" + +#include + +using ROOT::Experimental::RNTupleInspector; +using ROOT::Experimental::RNTupleModel; +using ROOT::Experimental::RNTupleReader; +using ROOT::Experimental::RNTupleWriter; + +using namespace ROOT::RDF; + +TEST(RDFSnapshotRNTuple, FromScratchTemplated) +{ + const auto filename = "RDFSnapshotRNTuple_from_scratch_templated.root"; + const std::vector columns = {"x"}; + + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + auto sdf = df.Snapshot("ntuple", filename, columns, opts); + + EXPECT_EQ(columns, sdf->GetColumnNames()); + + auto ntuple = RNTupleReader::Open("ntuple", filename); + EXPECT_EQ(25ull, ntuple->GetNEntries()); + + auto x = ntuple->GetView("x"); + for (const auto i : ntuple->GetEntryRange()) { + EXPECT_EQ(10, x(i)); + } +} + +TEST(RDFSnapshotRNTuple, FromScratchJITted) +{ + const auto filename = "RDFSnapshotRNTuple_from_scratch_jitted.root"; + const std::vector columns = {"x"}; + + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + auto sdf = df.Snapshot("ntuple", filename, "x", opts); + + EXPECT_EQ(columns, sdf->GetColumnNames()); + + auto ntuple = RNTupleReader::Open("ntuple", filename); + EXPECT_EQ(25ull, 
ntuple->GetNEntries()); + + auto x = ntuple->GetView("x"); + for (const auto i : ntuple->GetEntryRange()) { + EXPECT_EQ(10, x(i)); + } +} + +void BookLazySnapshot() +{ + auto d = ROOT::RDataFrame(1); + ROOT::RDF::RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + opts.fLazy = true; + d.Snapshot("t", "lazysnapshotnottriggered_shouldnotbecreated.root", {"rdfentry_"}, opts); +} + +TEST(RDFSnapshotRNTuple, LazyNotTriggered) +{ + ROOT_EXPECT_WARNING(BookLazySnapshot(), "Snapshot", "A lazy Snapshot action was booked but never triggered."); + EXPECT_FALSE(std::filesystem::exists("lazysnapshotnottriggered_shouldnotbecreated.root")); +} + +TEST(RDFSnapshotRNTuple, Compression) +{ + const auto filename = "RDFSnapshotRNTuple_compression.root"; + const std::vector columns = {"x"}; + + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + opts.fCompressionAlgorithm = ROOT::RCompressionSetting::EAlgorithm::kLZ4; + opts.fCompressionLevel = 4; + + auto sdf = df.Snapshot("ntuple", filename, "x", opts); + + EXPECT_EQ(columns, sdf->GetColumnNames()); + + auto inspector = RNTupleInspector::Create("ntuple", filename); + EXPECT_EQ(404, inspector->GetCompressionSettings()); +} + +class RDFSnapshotRNTupleTest : public ::testing::Test { +protected: + const std::string fFileName = "RDFSnapshotRNTuple.root"; + const std::string fNtplName = "ntuple"; + + void SetUp() override + { + auto model = RNTupleModel::Create(); + model->MakeField("pt", 42.0); + model->MakeField("tag", "xyz"); + auto fldNnlo = model->MakeField>>("nnlo"); + fldNnlo->push_back(std::vector()); + fldNnlo->push_back(std::vector{1.0}); + fldNnlo->push_back(std::vector{1.0, 2.0, 4.0, 8.0}); + model->MakeField("rvec", ROOT::RVecI{1, 2, 3}); + auto fldElectron = model->MakeField("electron"); + fldElectron->pt = 137.0; + auto fldElectrons = model->MakeField>("electrons"); + 
fldElectrons->push_back(*fldElectron); + fldElectrons->push_back(*fldElectron); + auto fldJets = model->MakeField>("jets"); + fldJets->push_back(Jet{*fldElectrons}); + { + auto ntuple = RNTupleWriter::Recreate(std::move(model), fNtplName, fFileName); + ntuple->Fill(); + } + } + + void TearDown() override { std::remove(fFileName.c_str()); } +}; + +TEST_F(RDFSnapshotRNTupleTest, DefaultToRNTupleTemplated) +{ + const auto filename = "RDFSnapshotRNTuple_snap_templated.root"; + + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"}); + + auto ntuple = RNTupleReader::Open("ntuple", filename); + EXPECT_EQ(1ull, ntuple->GetNEntries()); + + auto pt = ntuple->GetView("pt"); + auto x = ntuple->GetView("x"); + + EXPECT_FLOAT_EQ(42.0, pt(0)); + EXPECT_EQ(10, x(0)); +} + +TEST_F(RDFSnapshotRNTupleTest, DefaultToRNTupleJITted) +{ + const auto filename = "RDFSnapshotRNTuple_snap_jitted.root"; + + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"}); + + auto ntuple = RNTupleReader::Open("ntuple", filename); + EXPECT_EQ(1ull, ntuple->GetNEntries()); + + auto pt = ntuple->GetView("pt"); + auto x = ntuple->GetView("x"); + + EXPECT_FLOAT_EQ(42.0, pt(0)); + EXPECT_EQ(10, x(0)); +} + +TEST_F(RDFSnapshotRNTupleTest, ToTTreeTemplated) +{ + const auto filename = "RDFSnapshotRNTuple_to_ttree_templated.root"; + + auto df = ROOT::RDataFrame(fNtplName, fFileName); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kTTree; + + auto sdf = df.Define("x", [] { return 10; }).Snapshot("tree", filename, {"pt", "x"}, opts); + + TFile file(filename); + auto tree = file.Get("tree"); + EXPECT_EQ(1ull, tree->GetEntries()); + + float pt; + int x; + + tree->SetBranchAddress("pt", &pt); + tree->SetBranchAddress("x", &x); + + tree->GetEntry(0); + + EXPECT_FLOAT_EQ(42.0, pt); + EXPECT_EQ(10, x); +} + 
+TEST_F(RDFSnapshotRNTupleTest, ToTTreeJITted) +{ + const auto filename = "RDFSnapshotRNTuple_to_ttree_jitted.root"; + + auto df = ROOT::RDataFrame(fNtplName, fFileName); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kTTree; + + auto sdf = df.Define("x", [] { return 10; }).Snapshot("tree", filename, {"pt", "x"}, opts); + + TFile file(filename); + auto tree = file.Get("tree"); + EXPECT_EQ(1ull, tree->GetEntries()); + + float pt; + int x; + + tree->SetBranchAddress("pt", &pt); + tree->SetBranchAddress("x", &x); + + tree->GetEntry(0); + + EXPECT_FLOAT_EQ(42.0, pt); + EXPECT_EQ(10, x); +} + +TEST_F(RDFSnapshotRNTupleTest, ScalarFields) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_scalar_fields.root", "pt"); + + std::vector expected = {"pt"}; + EXPECT_EQ(expected, sdf->GetColumnNames()); +} + +TEST_F(RDFSnapshotRNTupleTest, VectorFields) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_all_fields.root", "nnlo"); + + std::vector expected = {"nnlo"}; + EXPECT_EQ(expected, sdf->GetColumnNames()); +} + +TEST_F(RDFSnapshotRNTupleTest, ComplexFields) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_complex_fields.root", "electron"); + + std::vector expected = {"electron", "electron.pt"}; + EXPECT_EQ(expected, sdf->GetColumnNames()); +} + +TEST_F(RDFSnapshotRNTupleTest, InnerFields) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + + auto sdf1 = df.Snapshot("ntuple", "RDFSnapshotRNTuple_inner_fields.root", "electron.pt"); + + std::vector expected = {"electron_pt"}; + EXPECT_EQ(expected, sdf1->GetColumnNames()); + + auto sdf2 = df.Snapshot("ntuple", "RDFSnapshotRNTuple_inner_fields.root", "jets.electrons"); + + expected = {"jets_electrons", "jets_electrons.pt"}; + EXPECT_EQ(expected, sdf2->GetColumnNames()); +} + +TEST_F(RDFSnapshotRNTupleTest, 
AllFields) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_all_fields.root"); + + EXPECT_EQ(df.GetColumnNames(), sdf->GetColumnNames()); +} + +TEST_F(RDFSnapshotRNTupleTest, WithDefines) +{ + auto df = ROOT::RDataFrame(fNtplName, fFileName); + auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", "RDFSnapshotRNTuple_with_defines.root"); + + std::vector expected = df.GetColumnNames(); + expected.push_back("x"); + EXPECT_EQ(expected, sdf->GetColumnNames()); +} + +TEST(RDFSnapshotRNTuple, WithFilters) +{ + const auto filename = "RDFSnapshotRNTuple_defines_and_filters.root"; + + { + auto df = ROOT::RDataFrame(10ull).DefineSlotEntry("x", [](unsigned int, std::uint64_t entry) { return entry; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + df.Snapshot("ntuple", filename, "x", opts); + } + + auto df = ROOT::RDataFrame("ntuple", filename).Filter("x % 2 == 0"); + auto sdf = df.Snapshot("ntuple", "snap_ntuple_filtered.root"); + auto ntuple = RNTupleReader::Open("ntuple", "snap_ntuple_filtered.root"); + EXPECT_EQ(5ull, ntuple->GetNEntries()); + + auto x = ntuple->GetView("x"); + for (const auto i : ntuple->GetEntryRange()) { + EXPECT_FLOAT_EQ(i * 2, x(i)); + } +} + +TEST(RDFSnapshotRNTuple, UpdateDifferentName) +{ + const auto filename = "RDFSnapshotRNTuple_update_different_name.root"; + + { + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + auto sdf = df.Snapshot("ntuple", filename, "x", opts); + } + + auto df = ROOT::RDataFrame("ntuple", filename); + + RSnapshotOptions opts; + opts.fMode = "UPDATE"; + + auto sdf = df.Define("y", [] { return 42; }).Snapshot("ntuple_snap", filename, "", opts); + + std::vector expected = {"x", "y"}; + EXPECT_EQ(expected, sdf->GetColumnNames()); + + auto ntupleOriginal = RNTupleReader::Open("ntuple", filename); + 
EXPECT_EQ(25ull, ntupleOriginal->GetNEntries()); + + auto ntupleSnap = RNTupleReader::Open("ntuple_snap", filename); + EXPECT_EQ(25ull, ntupleSnap->GetNEntries()); +} + +TEST(RDFSnapshotRNTuple, UpdateSameName) +{ + const auto filename = "RDFSnapshotRNTuple_update_same_name.root"; + + { + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + auto sdf = df.Snapshot("ntuple", filename, "x", opts); + } + + auto df = ROOT::RDataFrame("ntuple", filename); + + RSnapshotOptions opts; + opts.fMode = "UPDATE"; + + try { + auto sdf = df.Define("y", [] { return 42; }).Snapshot("ntuple", filename, {"x", "y"}, opts); + FAIL() << "snapshotting in \"UPDATE\" mode to the same ntuple name without `fOverwriteIfExists` is not allowed "; + } catch (const std::invalid_argument &err) { + EXPECT_STREQ(err.what(), "Snapshot: RNTuple \"ntuple\" already present in file " + "\"RDFSnapshotRNTuple_update_same_name.root\". 
If you want to delete the original " + "ntuple and write another, please set RSnapshotOptions::fOverwriteIfExists to true."); + } + + opts.fOverwriteIfExists = true; + auto sdf = df.Define("y", [] { return 42; }).Snapshot("ntuple", filename, "", opts); + + std::vector expected = {"x", "y"}; + EXPECT_EQ(expected, sdf->GetColumnNames()); +} + +void WriteTestTree(const std::string &tname, const std::string &fname) +{ + TFile file(fname.c_str(), "RECREATE"); + TTree t(tname.c_str(), tname.c_str()); + float pt; + t.Branch("pt", &pt); + + pt = 42.0; + t.Fill(); + + t.Write(); +} + +TEST(RDFSnapshotRNTuple, DisallowToTTreeTemplated) +{ + const auto treename = "tree"; + const auto filename = "RDFSnapshotRNTuple_disallow_to_ttree_templated.root"; + + WriteTestTree(treename, filename); + + auto df = ROOT::RDataFrame(treename, filename); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + try { + auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"}, opts); + FAIL() << "snapshotting from TTree to RNTuple is not (yet) possible"; + } catch (const std::runtime_error &err) { + EXPECT_STREQ(err.what(), "Snapshotting from TTree to RNTuple is not yet supported. 
The current recommended way " + "to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } +} + +TEST(RDFSnapshotRNTuple, DisallowToTTreeJITted) +{ + const auto treename = "tree"; + const auto filename = "RDFSnapshotRNTuple_disallow_to_ttree_jitted.root"; + + WriteTestTree(treename, filename); + + auto df = ROOT::RDataFrame(treename, filename); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + try { + auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"}, opts); + FAIL() << "snapshotting from TTree to RNTuple is not (yet) possible"; + } catch (const std::runtime_error &err) { + EXPECT_STREQ(err.what(), "Snapshotting from TTree to RNTuple is not yet supported. The current recommended way " + "to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } +}