diff --git a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx index 6ddc8c9525de72..3315c722f2e09b 100644 --- a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx @@ -35,7 +35,7 @@ #include "TClassRef.h" #include "TDirectory.h" #include "TError.h" // for R__ASSERT, Warning -#include "TFile.h" // for SnapshotHelper +#include "TFile.h" // for SnapshotHelper #include "TH1.h" #include "TGraph.h" #include "TGraphAsymmErrors.h" @@ -46,6 +46,12 @@ #include "TStatistic.h" #include "ROOT/RDF/RActionImpl.hxx" #include "ROOT/RDF/RMergeableValue.hxx" +#include "ROOT/RDF/RLoopManager.hxx" + +#ifdef R__HAS_ROOT7 +#include "ROOT/RNTupleDS.hxx" +#include "ROOT/RNTupleWriter.hxx" // for SnapshotRNTupleHelper +#endif #include #include @@ -1379,8 +1385,8 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, const std::string &i branchAddress = nullptr; } -/// Helper function for SnapshotHelper and SnapshotHelperMT. It creates new branches for the output TTree of a Snapshot. -/// This overload is called for columns of type `RVec`. For RDF, these can represent: +/// Helper function for SnapshotHelper and SnapshotHelperMT. It creates new branches for the output TTree of a +/// Snapshot. This overload is called for columns of type `RVec`. For RDF, these can represent: /// 1. c-style arrays in ROOT files, so we are sure that there are input trees to which we can ask the correct branch /// title /// 2. 
RVecs coming from a custom column or the input file/data-source @@ -1483,7 +1489,7 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, const std::string &i void ValidateSnapshotOutput(const RSnapshotOptions &opts, const std::string &treeName, const std::string &fileName); -/// Helper object for a single-thread Snapshot action +/// Helper object for a single-thread TTree-based Snapshot action template class R__CLING_PTRCHECK(off) SnapshotHelper : public RActionImpl> { std::string fFileName; @@ -1507,9 +1513,15 @@ public: SnapshotHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine) - : fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fBranches(vbnames.size(), nullptr), - fBranchAddresses(vbnames.size(), nullptr), fIsDefine(std::move(isDefine)) + : fFileName(filename), + fDirName(dirname), + fTreeName(treename), + fOptions(options), + fInputBranchNames(vbnames), + fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), + fBranches(vbnames.size(), nullptr), + fBranchAddresses(vbnames.size(), nullptr), + fIsDefine(std::move(isDefine)) { ValidateSnapshotOutput(fOptions, fTreeName, fFileName); } @@ -1632,7 +1644,7 @@ public: } }; -/// Helper object for a multi-thread Snapshot action +/// Helper object for a multi-thread TTree-based Snapshot action template class R__CLING_PTRCHECK(off) SnapshotHelperMT : public RActionImpl> { unsigned int fNSlots; @@ -1659,11 +1671,20 @@ public: SnapshotHelperMT(const unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine) - : fNSlots(nSlots), fOutputFiles(fNSlots), fOutputTrees(fNSlots), 
fBranchAddressesNeedReset(fNSlots, 1), - fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fInputTrees(fNSlots), + : fNSlots(nSlots), + fOutputFiles(fNSlots), + fOutputTrees(fNSlots), + fBranchAddressesNeedReset(fNSlots, 1), + fFileName(filename), + fDirName(dirname), + fTreeName(treename), + fOptions(options), + fInputBranchNames(vbnames), + fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), + fInputTrees(fNSlots), fBranches(fNSlots, std::vector(vbnames.size(), nullptr)), - fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), fOutputBranches(fNSlots), + fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), + fOutputBranches(fNSlots), fIsDefine(std::move(isDefine)) { ValidateSnapshotOutput(fOptions, fTreeName, fFileName); @@ -1818,6 +1839,152 @@ public: } }; +#ifdef R__HAS_ROOT7 +void ValidateSnapshotRNTupleOutput(const RSnapshotOptions &opts, const std::string &ntupleName, + const std::string &fileName); + +/// Helper function to update the value of an RNTuple's field in the provided entry. 
+template +void SetFieldsHelper(T value, std::string_view fieldName, ROOT::Experimental::REntry *entry) +{ + entry->BindValue(fieldName, std::make_shared(value)); +} + +/// Helper object for a single-thread RNTuple-based Snapshot action +template +class R__CLING_PTRCHECK(off) SnapshotRNTupleHelper : public RActionImpl> { + std::string fFileName; + std::string fNTupleName; + + std::unique_ptr fOutFile = nullptr; + + RSnapshotOptions fOptions; + ROOT::Detail::RDF::RLoopManager *fLoopManager; + ColumnNames_t fInputFieldNames; // This contains the resolved aliases + ColumnNames_t fOutputFieldNames; + std::unique_ptr fWriter{nullptr}; + + ROOT::Experimental::REntry *fOutputEntry; + + std::vector fIsDefine; + +public: + using ColumnTypes_t = TypeList; + SnapshotRNTupleHelper(std::string_view filename, std::string_view ntuplename, const ColumnNames_t &vfnames, + const ColumnNames_t &fnames, const RSnapshotOptions &options, + ROOT::Detail::RDF::RLoopManager *lm, std::vector &&isDefine) + : fFileName(filename), + fNTupleName(ntuplename), + fOptions(options), + fLoopManager(lm), + fInputFieldNames(vfnames), + fOutputFieldNames(ReplaceDotWithUnderscore(fnames)), + fIsDefine(std::move(isDefine)) + { + ValidateSnapshotRNTupleOutput(fOptions, fNTupleName, fFileName); + } + + SnapshotRNTupleHelper(const SnapshotRNTupleHelper &) = delete; + SnapshotRNTupleHelper(SnapshotRNTupleHelper &&) = default; + ~SnapshotRNTupleHelper() + { + if (!fNTupleName.empty() && !fLoopManager->GetDataSource() && fOptions.fLazy) + Warning("Snapshot", "A lazy Snapshot action was booked but never triggered."); + } + + void InitTask(TTreeReader *, unsigned int /* slot */) {} + + void Exec(unsigned int /* slot */, ColTypes &...values) + { + using ind_t = std::index_sequence_for; + + SetFields(values..., ind_t{}); + fWriter->Fill(); + } + + template + void SetFields(ColTypes &...values, std::index_sequence /*dummy*/) + { + int expander[] = {(SetFieldsHelper(values, fOutputFieldNames[S], fOutputEntry), 
0)..., 0}; + (void)expander; // avoid unused variable warnings for older compilers (gcc 14.1) + } + + void Initialize() + { + using ind_t = std::index_sequence_for; + + auto model = ROOT::Experimental::RNTupleModel::Create(); + MakeFields(*model, ind_t{}); + fOutputEntry = &model->GetDefaultEntry(); + + fOutFile = std::unique_ptr( + TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str() /* , cs */)); + if (!fOutFile) + throw std::runtime_error("Snapshot: could not create output file " + fFileName); + + ROOT::Experimental::RNTupleWriteOptions writeOptions; + writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); + + TString checkupdate = fOptions.fMode; + checkupdate.ToLower(); + + if (checkupdate == "update") { + fWriter = ROOT::Experimental::RNTupleWriter::Append(std::move(model), fNTupleName, *fOutFile, writeOptions); + } else { + // If the output file is not closed before the RNTupleWriter is created, no actual data will be written and + // the file will be empty. 
+ fOutFile->Close(); + fWriter = ROOT::Experimental::RNTupleWriter::Recreate(std::move(model), fNTupleName, fFileName, writeOptions); + } + } + + template + void MakeFields(ROOT::Experimental::RNTupleModel &model, std::index_sequence /*dummy*/) + { + int expander[] = {(model.MakeField(fOutputFieldNames[S]), 0)..., 0}; + (void)expander; // avoid unused variable warnings for older compilers (gcc 14.1) + } + + void Finalize() + { + { + fWriter.reset(); + } + fLoopManager->SetDataSource(std::make_unique(fNTupleName, fFileName)); + } + + std::string GetActionName() { return "Snapshot"; } + + ROOT::RDF::SampleCallback_t GetSampleCallback() final + { + return [](unsigned int, const RSampleInfo &) mutable {}; + } + + /** + * @brief Create a new SnapshotRNTupleHelper with a different output file name + * + * @param newName A type-erased string with the output file name + * @return SnapshotRNTupleHelper + * + * This MakeNew implementation is tied to the cloning feature of actions + * of the computation graph. In particular, cloning a Snapshot node usually + * also involves changing the name of the output file, otherwise the cloned + * Snapshot would overwrite the same file. 
+ */ + SnapshotRNTupleHelper MakeNew(void *newName) + { + const std::string finalName = *reinterpret_cast(newName); + return SnapshotRNTupleHelper{finalName, + fNTupleName, + fInputFieldNames, + fOutputFieldNames, + fOptions, + fLoopManager, + std::vector(fIsDefine)}; + } +}; +#endif + template ::value> class R__CLING_PTRCHECK(off) AggregateHelper diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index 7ab241761bc63a..d15c6f48e01799 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -250,6 +250,8 @@ struct SnapshotHelperArgs { std::string fTreeName; std::vector fOutputColNames; ROOT::RDF::RSnapshotOptions fOptions; + RDFDetail::RLoopManager *fLoopManager; + bool fToNTuple; }; // Snapshot action @@ -275,20 +277,39 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr isDefine = makeIsDefine(); std::unique_ptr actionPtr; - if (!ROOT::IsImplicitMTEnabled()) { - // single-thread snapshot - using Helper_t = SnapshotHelper; + if (snapHelperArgs->fToNTuple) { +#ifdef R__HAS_ROOT7 + // TODO(fdegeus) Add MT snapshotting + // SnapshotRNTupleHelper ignores the slot and fills a single writer/entry, so it cannot run on several slots. + if (ROOT::IsImplicitMTEnabled()) + throw std::runtime_error("Snapshot: Snapshotting to RNTuple with IMT enabled is not supported yet."); + using Helper_t = SnapshotRNTupleHelper; using Action_t = RAction; + auto loopManager = snapHelperArgs->fLoopManager; actionPtr.reset( - new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + new Action_t(Helper_t(filename, treename, colNames, outputColNames, options, loopManager, std::move(isDefine)), colNames, prevNode, colRegister)); + return actionPtr; +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been built with ROOT7 " + "components enabled."); +#endif } else { - // multi-thread snapshot - using Helper_t = SnapshotHelperMT; - using Action_t = RAction; - actionPtr.reset(new Action_t( - Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), - colNames, 
prevNode, colRegister)); + if (!ROOT::IsImplicitMTEnabled()) { + // single-thread snapshot + using Helper_t = SnapshotHelper; + using Action_t = RAction; + actionPtr.reset( + new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + colNames, prevNode, colRegister)); + } else { + // multi-thread snapshot + using Helper_t = SnapshotHelperMT; + using Action_t = RAction; + actionPtr.reset(new Action_t( + Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)), + colNames, prevNode, colRegister)); + } } return actionPtr; } @@ -781,6 +802,10 @@ AddSizeBranches(const std::vector &branches, TTree *tree, std::vect void RemoveDuplicates(ColumnNames_t &columnNames); +#ifdef R__HAS_ROOT7 +void RemoveRNTupleSubFields(ColumnNames_t &columnNames); +#endif + } // namespace RDF } // namespace Internal diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index e12c856be3048a..9d1286882bb529 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1161,29 +1161,62 @@ public: const auto &colListNoAliasesWithSizeBranches = pairOfColumnLists.first; const auto &colListWithAliasesAndSizeBranches = pairOfColumnLists.second; - const auto fullTreeName = treename; const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName); treename = parsedTreePath.fTreeName; const auto &dirname = parsedTreePath.fDirName; - auto snapHelperArgs = std::make_shared( - RDFInternal::SnapshotHelperArgs{std::string(filename), std::string(dirname), std::string(treename), - colListWithAliasesAndSizeBranches, options}); - ::TDirectory::TContext ctxt; - // The CreateLMFromTTree function by default opens the file passed as input - // to check for the presence of the TTree inside. But at this moment the - // filename we are using here corresponds to a file which does not exist yet, - // i.e. 
the output file of the Snapshot call. Thus, checkFile=false will - // prevent the function from trying to open a non-existent file. - auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( - fullTreeName, filename, colListNoAliasesWithSizeBranches, /*checkFile*/ false)); + RResultPtr> resPtr; - auto resPtr = CreateAction( - colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, - colListNoAliasesWithSizeBranches.size()); + bool isBasedOnRNTuple = fLoopManager->GetDataSource() && fLoopManager->GetDataSource()->GetLabel() == "RNTupleDS"; + + if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple || + (options.fOutputFormat == ESnapshotOutputFormat::kDefault && isBasedOnRNTuple)) { +#ifdef R__HAS_ROOT7 + if (fLoopManager->GetTree()) { + throw std::runtime_error( + "Snapshotting from TTree to RNTuple is not yet supported. The current recommended " + "way to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } + + auto newRDF = std::make_shared>(std::make_shared(0)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, newRDF->GetLoopManager(), true /* fToNTuple */}); + + // The Snapshot helper will use colListNoAliasesWithSizeBranches (with aliases resolved) as input columns, and + // colListWithAliasesAndSizeBranches (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction( + colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, + colListNoAliasesWithSizeBranches.size()); +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been built with ROOT7 " + "components enabled."); +#endif + } else { + // The CreateLMFromTTree function by default opens the file passed as input + // to check for the presence of the TTree inside. 
But at this moment the + // filename we are using here corresponds to a file which does not exist yet, + // i.e. the output file of the Snapshot call. Thus, checkFile=false will + // prevent the function from trying to open a non-existent file. + auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( + fullTreeName, filename, /*defaultColumns=*/colListNoPoundSizes, /*checkFile=*/false)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, nullptr, false /* fToNTuple */}); + + // The Snapshot helper will use colListNoAliasesWithSizeBranches (with aliases resolved) as input columns, and + // colListWithAliasesAndSizeBranches (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction( + colListNoAliasesWithSizeBranches, newRDF, snapHelperArgs, fProxiedPtr, + colListNoAliasesWithSizeBranches.size()); + } if (!options.fLazy) *resPtr; @@ -1209,6 +1242,7 @@ public: { const auto definedColumns = fColRegister.GenerateColumnNames(); auto *tree = fLoopManager->GetTree(); + const auto treeBranchNames = tree != nullptr ? ROOT::Internal::TreeUtils::GetTopLevelBranchNames(*tree) : ColumnNames_t{}; const auto dsColumns = GetDataSource() ? GetDataSource()->GetColumnNames() : ColumnNames_t{}; // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those @@ -1225,7 +1259,12 @@ public: // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. 
RDFInternal::RemoveDuplicates(columnNames); - const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + + if (GetDataSource() && GetDataSource()->GetLabel() == "RNTupleDS") { + RDFInternal::RemoveRNTupleSubFields(selectedColumns); + } + return Snapshot(treename, filename, selectedColumns, options); } // clang-format on @@ -3002,23 +3041,55 @@ private: const auto &treename = parsedTreePath.fTreeName; const auto &dirname = parsedTreePath.fDirName; - auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ - std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options}); - ::TDirectory::TContext ctxt; - // The CreateLMFromTTree function by default opens the file passed as input - // to check for the presence of the TTree inside. But at this moment the - // filename we are using here corresponds to a file which does not exist yet, - // i.e. the output file of the Snapshot call. Thus, checkFile=false will - // prevent the function from trying to open a non-existent file. - auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( - fullTreeName, filename, /*defaultColumns=*/columnListWithoutSizeColumns, /*checkFile=*/false)); - - // The Snapshot helper will use validCols (with aliases resolved) as input columns, and - // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column names. 
- auto resPtr = CreateAction(validCols, newRDF, snapHelperArgs, - fProxiedPtr); + RResultPtr> resPtr; + + bool isBasedOnRNTuple = fLoopManager->GetDataSource() && fLoopManager->GetDataSource()->GetLabel() == "RNTupleDS"; + + if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple || + (options.fOutputFormat == ESnapshotOutputFormat::kDefault && isBasedOnRNTuple)) { +#ifdef R__HAS_ROOT7 + if (fLoopManager->GetTree()) { + throw std::runtime_error( + "Snapshotting from TTree to RNTuple is not yet supported. The current recommended " + "way to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter."); + } + + auto newRDF = std::make_shared>(std::make_shared(0)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), columnListWithoutSizeColumns, options, + newRDF->GetLoopManager(), true /* fToNTuple */}); + + // The Snapshot helper will use validCols (with aliases resolved) as input columns, and + // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction(validCols, newRDF, snapHelperArgs, + fProxiedPtr); +#else + throw std::runtime_error("Cannot snapshot to RNTuple: this installation of ROOT has not been built with ROOT7 " + "components enabled."); +#endif + } else { + // The CreateLMFromTTree function by default opens the file passed as input + // to check for the presence of the TTree inside. But at this moment the + // filename we are using here corresponds to a file which does not exist yet, + // i.e. the output file of the Snapshot call. Thus, checkFile=false will + // prevent the function from trying to open a non-existent file. 
+ auto newRDF = std::make_shared>(ROOT::Detail::RDF::CreateLMFromTTree( + fullTreeName, filename, /*defaultColumns=*/columnListWithoutSizeColumns, /*checkFile=*/false)); + + auto snapHelperArgs = std::make_shared( + RDFInternal::SnapshotHelperArgs{std::string(filename), std::string(dirname), std::string(treename), + columnListWithoutSizeColumns, options, nullptr, false /* fToRNTuple */}); + + // The Snapshot helper will use validCols (with aliases resolved) as input columns, and + // columnListWithoutSizeColumns (still with aliases in it, passed through snapHelperArgs) as output column + // names. + resPtr = CreateAction(validCols, newRDF, snapHelperArgs, + fProxiedPtr); + } if (!options.fLazy) *resPtr; diff --git a/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx b/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx index 772703b47330cc..48af82c896fb21 100644 --- a/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx @@ -20,6 +20,10 @@ #include "ROOT/RDF/RSampleInfo.hxx" #include "ROOT/RDF/Utils.hxx" +#ifdef R__HAS_ROOT7 +#include "ROOT/RNTupleWriter.hxx" +#endif + #include #include #include diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index e921bf91502a93..fdf001c443df77 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -18,6 +18,8 @@ namespace ROOT { namespace RDF { +enum class ESnapshotOutputFormat { kDefault, kTTree, kRNTuple }; + /// A collection of options to steer the creation of the dataset on file struct RSnapshotOptions { using ECAlgo = ROOT::RCompressionSetting::EAlgorithm::EValues; @@ -25,14 +27,16 @@ struct RSnapshotOptions { RSnapshotOptions(const RSnapshotOptions &) = default; RSnapshotOptions(RSnapshotOptions &&) = default; RSnapshotOptions(std::string_view mode, ECAlgo comprAlgo, int comprLevel, int autoFlush, int splitLevel, bool lazy, - bool overwriteIfExists = false) + bool overwriteIfExists = 
false, + ESnapshotOutputFormat outputFormat = ESnapshotOutputFormat::kDefault) : fMode(mode), fCompressionAlgorithm(comprAlgo), fCompressionLevel{comprLevel}, fAutoFlush(autoFlush), fSplitLevel(splitLevel), fLazy(lazy), - fOverwriteIfExists(overwriteIfExists) + fOverwriteIfExists(overwriteIfExists), + fOutputFormat(outputFormat) { } std::string fMode = "RECREATE"; ///< Mode of creation of output file @@ -43,6 +47,7 @@ struct RSnapshotOptions { int fSplitLevel = 99; ///< Split level of output tree bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists + ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to }; } // namespace RDF } // namespace ROOT diff --git a/tree/dataframe/src/RDFActionHelpers.cxx b/tree/dataframe/src/RDFActionHelpers.cxx index 53f52cfedf44f0..085218f2bac4a3 100644 --- a/tree/dataframe/src/RDFActionHelpers.cxx +++ b/tree/dataframe/src/RDFActionHelpers.cxx @@ -10,6 +10,9 @@ #include "ROOT/RDF/ActionHelpers.hxx" #include "ROOT/RDF/Utils.hxx" // CacheLineStep +#ifdef R__HAS_ROOT7 +#include "ROOT/RNTuple.hxx" // ValidateSnapshotRNTupleOutput +#endif namespace ROOT { namespace Internal { @@ -242,6 +245,56 @@ void ValidateSnapshotOutput(const RSnapshotOptions &opts, const std::string &tre } } +#ifdef R__HAS_ROOT7 +void ValidateSnapshotRNTupleOutput(const RSnapshotOptions &opts, const std::string &ntupleName, + const std::string &fileName) +{ + TString fileMode = opts.fMode; + fileMode.ToLower(); + if (fileMode != "update") + return; + + // output file opened in "update" mode: must check whether output RNTuple is already present in file + std::unique_ptr outFile{TFile::Open(fileName.c_str(), "update")}; + if (!outFile || outFile->IsZombie()) + throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode"); + + auto *outNTuple = outFile->Get(ntupleName.c_str()); + + if 
(outNTuple) { + if (opts.fOverwriteIfExists) { + outFile->Delete(ntupleName.c_str()); + return; + } else { + const std::string msg = "Snapshot: RNTuple \"" + ntupleName + "\" already present in file \"" + fileName + + "\". If you want to delete the original ntuple and write another, please set " + "RSnapshotOptions::fOverwriteIfExists to true."; + throw std::invalid_argument(msg); + } + } + + // Also check if there is any object other than an RNTuple with the provided ntupleName. + TObject *outObj = outFile->Get(ntupleName.c_str()); + + if (!outObj) + return; + + // An object called ntupleName is already present in the file. + if (opts.fOverwriteIfExists) { + if (outObj->InheritsFrom("TTree")) { + static_cast(outObj)->Delete("all"); + } else { + outFile->Delete(ntupleName.c_str()); + } + } else { + const std::string msg = "Snapshot: object \"" + ntupleName + "\" already present in file \"" + fileName + + "\". If you want to delete the original object and write a new RNTuple, please set " + "RSnapshotOptions::fOverwriteIfExists to true."; + throw std::invalid_argument(msg); + } +} +#endif + } // end NS RDF } // end NS Internal } // end NS ROOT diff --git a/tree/dataframe/src/RDFInterfaceUtils.cxx b/tree/dataframe/src/RDFInterfaceUtils.cxx index 93c75199237ba3..52c55b34c85063 100644 --- a/tree/dataframe/src/RDFInterfaceUtils.cxx +++ b/tree/dataframe/src/RDFInterfaceUtils.cxx @@ -981,7 +981,7 @@ AddSizeBranches(const std::vector &branches, TTree *tree, std::vect assert(colsWithoutAliases.size() == colsWithAliases.size()); auto nCols = colsWithoutAliases.size(); - // Use index-iteration as we modify the vector during the iteration. + // Use index-iteration as we modify the vector during the iteration. 
for (std::size_t i = 0u; i < nCols; ++i) { const auto &colName = colsWithoutAliases[i]; if (!IsStrInVec(colName, branches))
@@ -1018,6 +1018,31 @@ void RemoveDuplicates(ColumnNames_t &columnNames)
       columnNames.end());
 }
 
+#ifdef R__HAS_ROOT7
+/// Drop RNTuple subfields whose top-level parent field is itself listed in `columnNames`.
+///
+/// A name containing '.' is treated as a subfield; its parent is the text before the first '.'. Names without
+/// a '.', and subfields whose top-level parent is not listed, are kept. The order of the kept names is preserved.
+void RemoveRNTupleSubFields(ColumnNames_t &columnNames)
+{
+   // Collect the top-level fields, i.e. the names that contain no '.' separator.
+   ColumnNames_t parentFields;
+   std::copy_if(columnNames.cbegin(), columnNames.cend(), std::back_inserter(parentFields),
+                [](const std::string &colName) { return colName.find('.') == std::string::npos; });
+
+   const auto isSubFieldOfListedParent = [&parentFields](const std::string &colName) {
+      // Locate the separator once and reuse its position for both the test and the split.
+      const auto dotPos = colName.find('.');
+      if (dotPos == std::string::npos)
+         return false;
+      const auto parentFieldName = colName.substr(0, dotPos);
+      return std::find(parentFields.cbegin(), parentFields.cend(), parentFieldName) != parentFields.cend();
+   };
+
+   columnNames.erase(std::remove_if(columnNames.begin(), columnNames.end(), isSubFieldOfListedParent),
+                     columnNames.end());
+}
+#endif
 } // namespace RDF
 } // namespace Internal
 } // namespace ROOT
diff --git a/tree/dataframe/test/CMakeLists.txt b/tree/dataframe/test/CMakeLists.txt
index fdb3876634721a..43004ebe2bc1eb 100644
--- a/tree/dataframe/test/CMakeLists.txt
+++ b/tree/dataframe/test/CMakeLists.txt
@@ -97,6 +97,7 @@ endif()
 
 if(root7)
   ROOT_ADD_GTEST(datasource_ntuple datasource_ntuple.cxx LIBRARIES ROOTDataFrame)
+  ROOT_ADD_GTEST(dataframe_snapshot_ntuple dataframe_snapshot_ntuple.cxx LIBRARIES ROOTDataFrame ROOTNTupleUtil)
 
   ROOT_STANDARD_LIBRARY_PACKAGE(NTupleStruct
                                 NO_INSTALL_HEADERS
diff --git a/tree/dataframe/test/NTupleStruct.hxx b/tree/dataframe/test/NTupleStruct.hxx
index 3842aa095ba21b..888dd087dbbf4d 100644
--- a/tree/dataframe/test/NTupleStruct.hxx
+++ b/tree/dataframe/test/NTupleStruct.hxx
@@ -1,6 +1,8 @@
 #ifndef ROOT7_RDataFrame_Test_NTupleStruct
 #define ROOT7_RDataFrame_Test_NTupleStruct
 
+#include <vector>
+
 /**
  * Used to test serialization and deserialization of classes in RNTuple with TClass
  */
@@ -8,4 +10,8 @@ struct Electron {
    float pt;
 };
 
+struct Jet {
+   std::vector<Electron> electrons;
+};
+
 #endif
diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx
new file mode 100644
index 00000000000000..2bb088af1b063d
--- /dev/null
+++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx
@@ -0,0 +1,421 @@
+#include "ROOT/TestSupport.hxx"
+#include "ROOT/RDataFrame.hxx"
+
+#include "ROOT/RNTupleModel.hxx"
+#include "ROOT/RNTupleWriter.hxx"
+#include "ROOT/RNTupleReader.hxx"
+#include "ROOT/RNTupleInspector.hxx" // For testing compression settings
+
+#include "TROOT.h"
+#include "TSystem.h"
+
+#include "gtest/gtest.h"
+#include "NTupleStruct.hxx"
+
+#include <filesystem>
+
+using ROOT::Experimental::RNTupleInspector;
+using ROOT::Experimental::RNTupleModel;
+using ROOT::Experimental::RNTupleReader;
+using ROOT::Experimental::RNTupleWriter;
+
+using namespace ROOT::RDF;
+
+TEST(RDFSnapshotRNTuple, FromScratchTemplated)
+{
+   const auto filename = "RDFSnapshotRNTuple_from_scratch_templated.root";
+   const std::vector<std::string> columns = {"x"};
+
+   auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+   auto sdf = df.Snapshot<int>("ntuple", filename, columns, opts); // column type given at compile time
+
+   EXPECT_EQ(columns, sdf->GetColumnNames());
+
+   auto ntuple = RNTupleReader::Open("ntuple", filename);
+   EXPECT_EQ(25ull, ntuple->GetNEntries());
+
+   auto x = ntuple->GetView<int>("x");
+   for (const auto i : ntuple->GetEntryRange()) {
+      EXPECT_EQ(10, x(i));
+   }
+}
+
+TEST(RDFSnapshotRNTuple, FromScratchJITted)
+{
+   const auto filename = "RDFSnapshotRNTuple_from_scratch_jitted.root";
+   const std::vector<std::string> columns = {"x"};
+
+   auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+   auto sdf = df.Snapshot("ntuple", filename, "x", opts); // no template arguments: column type is not spelled out
+
+   EXPECT_EQ(columns, sdf->GetColumnNames());
+
+   auto ntuple = RNTupleReader::Open("ntuple", filename);
+   EXPECT_EQ(25ull, ntuple->GetNEntries());
+
+   auto x = ntuple->GetView<int>("x");
+   for (const auto i : ntuple->GetEntryRange()) {
+      EXPECT_EQ(10, x(i));
+   }
+}
+
+void BookLazySnapshot()
+{
+   auto d = ROOT::RDataFrame(1);
+   ROOT::RDF::RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+   opts.fLazy = true;
+   d.Snapshot<ULong64_t>("t", "lazysnapshotnottriggered_shouldnotbecreated.root", {"rdfentry_"}, opts);
+}
+
+TEST(RDFSnapshotRNTuple, LazyNotTriggered)
+{
+   ROOT_EXPECT_WARNING(BookLazySnapshot(), "Snapshot", "A lazy Snapshot action was booked but never triggered.");
+   EXPECT_FALSE(std::filesystem::exists("lazysnapshotnottriggered_shouldnotbecreated.root"));
+}
+
+TEST(RDFSnapshotRNTuple, Compression)
+{
+   const auto filename = "RDFSnapshotRNTuple_compression.root";
+   const std::vector<std::string> columns = {"x"};
+
+   auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+   opts.fCompressionAlgorithm = ROOT::RCompressionSetting::EAlgorithm::kLZ4;
+   opts.fCompressionLevel = 4;
+
+   auto sdf = df.Snapshot("ntuple", filename, "x", opts);
+
+   EXPECT_EQ(columns, sdf->GetColumnNames());
+
+   auto inspector = RNTupleInspector::Create("ntuple", filename);
+   EXPECT_EQ(404, inspector->GetCompressionSettings());
+}
+
+class RDFSnapshotRNTupleTest : public ::testing::Test {
+protected:
+   const std::string fFileName = "RDFSnapshotRNTuple.root";
+   const std::string fNtplName = "ntuple";
+
+   void SetUp() override
+   {
+      auto model = RNTupleModel::Create();
+      model->MakeField<float>("pt", 42.0);
+      model->MakeField<std::string>("tag", "xyz");
+      auto fldNnlo = model->MakeField<std::vector<std::vector<float>>>("nnlo");
+      fldNnlo->push_back(std::vector<float>());
+      fldNnlo->push_back(std::vector<float>{1.0});
+      fldNnlo->push_back(std::vector<float>{1.0, 2.0, 4.0, 8.0});
+      model->MakeField<ROOT::RVecI>("rvec", ROOT::RVecI{1, 2, 3});
+      auto fldElectron = model->MakeField<Electron>("electron");
+      fldElectron->pt = 137.0;
+      auto fldElectrons = model->MakeField<std::vector<Electron>>("electrons");
+      fldElectrons->push_back(*fldElectron);
+      fldElectrons->push_back(*fldElectron);
+      auto fldJets = model->MakeField<std::vector<Jet>>("jets");
+      fldJets->push_back(Jet{*fldElectrons});
+      {
+         auto ntuple = RNTupleWriter::Recreate(std::move(model), fNtplName, fFileName);
+         ntuple->Fill();
+      }
+   }
+
+   void TearDown() override { std::remove(fFileName.c_str()); }
+};
+
+TEST_F(RDFSnapshotRNTupleTest, DefaultToRNTupleTemplated)
+{
+   const auto filename = "RDFSnapshotRNTuple_snap_templated.root";
+
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Define("x", [] { return 10; }).Snapshot<float, int>("ntuple", filename, {"pt", "x"});
+
+   auto ntuple = RNTupleReader::Open("ntuple", filename);
+   EXPECT_EQ(1ull, ntuple->GetNEntries());
+
+   auto pt = ntuple->GetView<float>("pt");
+   auto x = ntuple->GetView<int>("x");
+
+   EXPECT_FLOAT_EQ(42.0, pt(0));
+   EXPECT_EQ(10, x(0));
+}
+
+TEST_F(RDFSnapshotRNTupleTest, DefaultToRNTupleJITted)
+{
+   const auto filename = "RDFSnapshotRNTuple_snap_jitted.root";
+
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"});
+
+   auto ntuple = RNTupleReader::Open("ntuple", filename);
+   EXPECT_EQ(1ull, ntuple->GetNEntries());
+
+   auto pt = ntuple->GetView<float>("pt");
+   auto x = ntuple->GetView<int>("x");
+
+   EXPECT_FLOAT_EQ(42.0, pt(0));
+   EXPECT_EQ(10, x(0));
+}
+
+TEST_F(RDFSnapshotRNTupleTest, ToTTreeTemplated)
+{
+   const auto filename = "RDFSnapshotRNTuple_to_ttree_templated.root";
+
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kTTree;
+
+   auto sdf = df.Define("x", [] { return 10; }).Snapshot<float, int>("tree", filename, {"pt", "x"}, opts);
+
+   TFile file(filename);
+   auto tree = file.Get<TTree>("tree");
+   EXPECT_EQ(1ull, tree->GetEntries());
+
+   float pt;
+   int x;
+
+   tree->SetBranchAddress("pt", &pt);
+   tree->SetBranchAddress("x", &x);
+
+   tree->GetEntry(0);
+
+   EXPECT_FLOAT_EQ(42.0, pt);
+   EXPECT_EQ(10, x);
+}
+
+TEST_F(RDFSnapshotRNTupleTest, ToTTreeJITted)
+{
+   const auto filename = "RDFSnapshotRNTuple_to_ttree_jitted.root";
+
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kTTree;
+
+   auto sdf = df.Define("x", [] { return 10; }).Snapshot("tree", filename, {"pt", "x"}, opts);
+
+   TFile file(filename);
+   auto tree = file.Get<TTree>("tree");
+   EXPECT_EQ(1ull, tree->GetEntries());
+
+   float pt;
+   int x;
+
+   tree->SetBranchAddress("pt", &pt);
+   tree->SetBranchAddress("x", &x);
+
+   tree->GetEntry(0);
+
+   EXPECT_FLOAT_EQ(42.0, pt);
+   EXPECT_EQ(10, x);
+}
+
+TEST_F(RDFSnapshotRNTupleTest, ScalarFields)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_scalar_fields.root", "pt");
+
+   std::vector<std::string> expected = {"pt"};
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+}
+
+TEST_F(RDFSnapshotRNTupleTest, VectorFields)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_vector_fields.root", "nnlo"); // own file, not AllFields' one
+
+   std::vector<std::string> expected = {"nnlo"};
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+}
+
+TEST_F(RDFSnapshotRNTupleTest, ComplexFields)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_complex_fields.root", "electron");
+
+   std::vector<std::string> expected = {"electron", "electron.pt"};
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+}
+
+TEST_F(RDFSnapshotRNTupleTest, InnerFields)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+
+   auto sdf1 = df.Snapshot("ntuple", "RDFSnapshotRNTuple_inner_fields.root", "electron.pt");
+
+   std::vector<std::string> expected = {"electron_pt"};
+   EXPECT_EQ(expected, sdf1->GetColumnNames());
+
+   auto sdf2 = df.Snapshot("ntuple", "RDFSnapshotRNTuple_inner_fields.root", "jets.electrons");
+
+   expected = {"jets_electrons", "jets_electrons.pt"};
+   EXPECT_EQ(expected, sdf2->GetColumnNames());
+}
+
+TEST_F(RDFSnapshotRNTupleTest, AllFields)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Snapshot("ntuple", "RDFSnapshotRNTuple_all_fields.root");
+
+   EXPECT_EQ(df.GetColumnNames(), sdf->GetColumnNames());
+}
+
+TEST_F(RDFSnapshotRNTupleTest, WithDefines)
+{
+   auto df = ROOT::RDataFrame(fNtplName, fFileName);
+   auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", "RDFSnapshotRNTuple_with_defines.root");
+
+   std::vector<std::string> expected = df.GetColumnNames();
+   expected.push_back("x");
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+}
+
+TEST(RDFSnapshotRNTuple, WithFilters)
+{
+   const auto filename = "RDFSnapshotRNTuple_defines_and_filters.root";
+
+   {
+      auto df = ROOT::RDataFrame(10ull).DefineSlotEntry("x", [](unsigned int, std::uint64_t entry) { return entry; });
+
+      RSnapshotOptions opts;
+      opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+      df.Snapshot("ntuple", filename, "x", opts);
+   }
+
+   auto df = ROOT::RDataFrame("ntuple", filename).Filter("x % 2 == 0");
+   auto sdf = df.Snapshot("ntuple", "snap_ntuple_filtered.root");
+   auto ntuple = RNTupleReader::Open("ntuple", "snap_ntuple_filtered.root");
+   EXPECT_EQ(5ull, ntuple->GetNEntries());
+
+   auto x = ntuple->GetView<std::uint64_t>("x");
+   for (const auto i : ntuple->GetEntryRange()) {
+      EXPECT_EQ(i * 2, x(i)); // integer column: exact comparison, the i-th surviving entry holds 2 * i
+   }
+}
+
+TEST(RDFSnapshotRNTuple, UpdateDifferentName)
+{
+   const auto filename = "RDFSnapshotRNTuple_update_different_name.root";
+
+   {
+      auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+      RSnapshotOptions opts;
+      opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+      auto sdf = df.Snapshot("ntuple", filename, "x", opts);
+   }
+
+   auto df = ROOT::RDataFrame("ntuple", filename);
+
+   RSnapshotOptions opts;
+   opts.fMode = "UPDATE";
+
+   auto sdf = df.Define("y", [] { return 42; }).Snapshot("ntuple_snap", filename, "", opts);
+
+   std::vector<std::string> expected = {"x", "y"};
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+
+   auto ntupleOriginal = RNTupleReader::Open("ntuple", filename);
+   EXPECT_EQ(25ull, ntupleOriginal->GetNEntries());
+
+   auto ntupleSnap = RNTupleReader::Open("ntuple_snap", filename);
+   EXPECT_EQ(25ull, ntupleSnap->GetNEntries());
+}
+
+TEST(RDFSnapshotRNTuple, UpdateSameName)
+{
+   const auto filename = "RDFSnapshotRNTuple_update_same_name.root";
+
+   {
+      auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+      RSnapshotOptions opts;
+      opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+      auto sdf = df.Snapshot("ntuple", filename, "x", opts);
+   }
+
+   auto df = ROOT::RDataFrame("ntuple", filename);
+
+   RSnapshotOptions opts;
+   opts.fMode = "UPDATE";
+
+   try {
+      auto sdf = df.Define("y", [] { return 42; }).Snapshot<int, int>("ntuple", filename, {"x", "y"}, opts);
+      FAIL() << "snapshotting in \"UPDATE\" mode to the same ntuple name without `fOverwriteIfExists` is not allowed ";
+   } catch (const std::invalid_argument &err) {
+      EXPECT_STREQ(err.what(), "Snapshot: RNTuple \"ntuple\" already present in file "
+                               "\"RDFSnapshotRNTuple_update_same_name.root\". If you want to delete the original "
+                               "ntuple and write another, please set RSnapshotOptions::fOverwriteIfExists to true.");
+   }
+
+   opts.fOverwriteIfExists = true;
+   auto sdf = df.Define("y", [] { return 42; }).Snapshot("ntuple", filename, "", opts);
+
+   std::vector<std::string> expected = {"x", "y"};
+   EXPECT_EQ(expected, sdf->GetColumnNames());
+}
+
+void WriteTestTree(const std::string &tname, const std::string &fname)
+{
+   TFile file(fname.c_str(), "RECREATE");
+   TTree t(tname.c_str(), tname.c_str());
+   float pt;
+   t.Branch("pt", &pt);
+
+   pt = 42.0;
+   t.Fill();
+
+   t.Write();
+}
+
+TEST(RDFSnapshotRNTuple, DisallowToTTreeTemplated) // despite the name: TTree input -> RNTuple output is rejected
+{
+   const auto treename = "tree";
+   const auto filename = "RDFSnapshotRNTuple_disallow_to_ttree_templated.root";
+
+   WriteTestTree(treename, filename);
+
+   auto df = ROOT::RDataFrame(treename, filename);
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+   try {
+      auto sdf = df.Define("x", [] { return 10; }).Snapshot<float, int>("ntuple", filename, {"pt", "x"}, opts);
+      FAIL() << "snapshotting from TTree to RNTuple is not (yet) possible";
+   } catch (const std::runtime_error &err) {
+      EXPECT_STREQ(err.what(), "Snapshotting from TTree to RNTuple is not yet supported. The current recommended way "
+                               "to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter.");
+   }
+}
+
+TEST(RDFSnapshotRNTuple, DisallowToTTreeJITted) // despite the name: TTree input -> RNTuple output is rejected
+{
+   const auto treename = "tree";
+   const auto filename = "RDFSnapshotRNTuple_disallow_to_ttree_jitted.root";
+
+   WriteTestTree(treename, filename);
+
+   auto df = ROOT::RDataFrame(treename, filename);
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+   try {
+      auto sdf = df.Define("x", [] { return 10; }).Snapshot("ntuple", filename, {"pt", "x"}, opts);
+      FAIL() << "snapshotting from TTree to RNTuple is not (yet) possible";
+   } catch (const std::runtime_error &err) {
+      EXPECT_STREQ(err.what(), "Snapshotting from TTree to RNTuple is not yet supported. The current recommended way "
+                               "to convert TTrees to RNTuple is through ROOT::Experimental::RNTupleImporter.");
+   }
+}