diff --git a/include/podio/DataSource.h b/include/podio/DataSource.h index 33f6ab921..6ed137765 100644 --- a/include/podio/DataSource.h +++ b/include/podio/DataSource.h @@ -7,6 +7,7 @@ #include // ROOT +#include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include @@ -100,47 +102,59 @@ class DataSource : public ROOT::RDF::RDataSource { std::string GetLabel() override { return "PODIO Datasource"; - }; + } + + // Legacy API + std::vector GetColumnReadersImpl(std::string_view, const std::type_info&) override { + return {}; + } + + std::size_t GetNFiles() const override { + return m_filePathList.size(); + } -protected: /// - /// @brief Type-erased vector of pointers to pointers to column - /// values --- one per slot. + /// @brief Returns a column reader for the given slot and column. /// - std::vector GetColumnReadersImpl(std::string_view name, const std::type_info& typeInfo) override; + std::unique_ptr GetColumnReaders(unsigned int slot, std::string_view name, + const std::type_info& tid) override; +protected: std::string AsString() override { return "Podio data source"; } private: - /// Number of slots/threads - unsigned int m_nSlots = 1; - /// Input filename std::vector m_filePathList = {}; /// Total number of events ULong64_t m_nEvents = 0; - /// Ranges of events available to be processed - std::vector> m_rangesAvailable = {}; - - /// Ranges of events available ever created + /// All entry ranges, fixed after SetNSlots std::vector> m_rangesAll = {}; + /// Cursor into m_rangesAll for GetEntryRanges, reset each Initialize() + size_t m_rangesCursor = 0; + /// Column names std::vector m_columnNames{}; /// Column types std::vector m_columnTypes = {}; + /// Fast column name -> index lookup + std::unordered_map m_columnIndex{}; + /// Collections, m_Collections[columnIndex][slotIndex] std::vector> m_Collections = {}; /// Active collections std::vector m_activeCollections = {}; + /// Names of active collections, kept in sync with 
m_activeCollections + std::vector m_activeCollectionNames{}; + /// Root podio readers std::vector> m_podioReaders = {}; diff --git a/include/podio/RNTupleLazyFrameData.h b/include/podio/RNTupleLazyFrameData.h new file mode 100644 index 000000000..6ff0732ff --- /dev/null +++ b/include/podio/RNTupleLazyFrameData.h @@ -0,0 +1,63 @@ +#ifndef PODIO_RNTUPLELAZY_FRAMEDATA_H +#define PODIO_RNTUPLELAZY_FRAMEDATA_H + +#include "podio/CollectionBuffers.h" +#include "podio/CollectionIDTable.h" +#include "podio/GenericParameters.h" + +#include +#include +#include +#include +#include + +namespace podio { + +struct RNTupleCategoryState; + +/// FrameData implementation for lazy RNTuple reading. Instead of holding +/// pre-populated buffers, it holds a reference to the shared RNTupleCategoryState +/// and reads individual collections from disk on demand when +/// getCollectionBuffers() is called. Each lazy read creates (or reuses a cached) +/// partial RNTuple reader with a minimal model containing only the needed fields. +class RNTupleLazyFrameData { + using CollIDPtr = std::shared_ptr; + +public: + RNTupleLazyFrameData() = delete; + ~RNTupleLazyFrameData() = default; + RNTupleLazyFrameData(RNTupleLazyFrameData&&) = default; + RNTupleLazyFrameData& operator=(RNTupleLazyFrameData&&) = default; + RNTupleLazyFrameData(const RNTupleLazyFrameData&) = delete; + RNTupleLazyFrameData& operator=(const RNTupleLazyFrameData&) = delete; + + /// Construct from shared category state, entry number, map of available + /// collections (name -> index in RNTupleCategoryState::collectionInfo), + /// the shared ID table, and the eagerly-read parameters. + RNTupleLazyFrameData(std::shared_ptr state, unsigned entry, + std::unordered_map&& availableCollections, CollIDPtr idTable, + podio::GenericParameters&& params); + + /// Lazily read a single collection from RNTuple. Creates or reuses a cached + /// partial reader with only this collection's fields and calls LoadEntry(). 
+ std::optional getCollectionBuffers(const std::string& name); + + podio::CollectionIDTable getIDTable() const; + + std::unique_ptr getParameters(); + + std::vector getAvailableCollections() const; + +private: + std::shared_ptr m_state; + unsigned m_entry{0}; + /// Maps collection name to its index in RNTupleCategoryState::collectionInfo. + /// Collections are removed from this map once they have been read. + std::unordered_map m_availableCollections{}; + CollIDPtr m_idTable{nullptr}; + podio::GenericParameters m_parameters{}; +}; + +} // namespace podio + +#endif // PODIO_RNTUPLELAZY_FRAMEDATA_H diff --git a/include/podio/RNTupleLazyReader.h b/include/podio/RNTupleLazyReader.h new file mode 100644 index 000000000..fa652360e --- /dev/null +++ b/include/podio/RNTupleLazyReader.h @@ -0,0 +1,92 @@ +#ifndef PODIO_RNTUPLELAZY_READER_H +#define PODIO_RNTUPLELAZY_READER_H + +#include "podio/RNTupleLazyFrameData.h" +#include "podio/utilities/ReaderCommon.h" +#include "podio/utilities/RNTupleHelpers.h" + +#include +#include +#include +#include +#include +#include + +namespace podio { + +struct RNTupleCategoryState; + +/// The RNTupleLazyReader reads files written with the RNTuple backend lazily: +/// individual collections are only read from disk when they are first accessed +/// via Frame::get(). +/// +/// It provides data as RNTupleLazyFrameData from which a podio::Frame can be +/// constructed. Unlike RNTupleReader which reads all collections eagerly, this +/// reader defers the actual RNTuple I/O to the point of collection access, +/// using partial RNTuple readers with minimal field models so that LoadEntry() +/// only reads the requested collection's data. 
+class RNTupleLazyReader : public ReaderCommon, public RNTupleReaderCommon { + +public: + RNTupleLazyReader() = default; + ~RNTupleLazyReader() = default; + + RNTupleLazyReader(const RNTupleLazyReader&) = delete; + RNTupleLazyReader& operator=(const RNTupleLazyReader&) = delete; + RNTupleLazyReader(RNTupleLazyReader&&) = default; + RNTupleLazyReader& operator=(RNTupleLazyReader&&) = default; + + /// Open a single file for reading. + /// + /// @param filename The name of the input file + void openFile(const std::string& filename); + + /// Open multiple files for reading and treat them as if they are one file. + /// + /// @param filenames The filenames of all input files that should be read + void openFiles(const std::vector& filenames); + + /// Read the next data entry for a given category. + /// + /// @param name The category name for which to read the next entry + /// @param collsToRead (optional) the collection names that should be + /// available for lazy reading. If not provided (or empty) + /// all collections will be available. + /// + /// @returns FrameData from which a podio::Frame can be constructed if the + /// category exists and there are still entries left. Otherwise nullptr. + /// No collection data is read at this point. + /// + /// @throws std::invalid_argument if collsToRead contains collection names + /// that are not available + std::unique_ptr readNextEntry(std::string_view name, + const std::vector& collsToRead = {}); + + /// Read the desired data entry for a given category. + /// + /// @param name The category name for which to read the desired entry + /// @param entry The entry number to read + /// @param collsToRead (optional) the collection names that should be + /// available for lazy reading. If not provided (or empty) + /// all collections will be available. + /// + /// @returns FrameData from which a podio::Frame can be constructed if the + /// category and the desired entry exist. Otherwise nullptr.
+ /// No collection data is read at this point. + /// + /// @throws std::invalid_argument if collsToRead contains collection names + /// that are not available + std::unique_ptr readEntry(std::string_view name, unsigned entry, + const std::vector& collsToRead = {}); + + /// Get the number of entries for the given category. + unsigned getEntries(std::string_view name) const; + +private: + /// Per-category shared state (readers, entry offsets, collection info, etc.) + std::unordered_map> m_categoryStates{}; +}; + +} // namespace podio + +#endif // PODIO_RNTUPLELAZY_READER_H diff --git a/include/podio/RNTupleReader.h b/include/podio/RNTupleReader.h index 80b731b3e..fe53c9739 100644 --- a/include/podio/RNTupleReader.h +++ b/include/podio/RNTupleReader.h @@ -2,37 +2,22 @@ #define PODIO_RNTUPLEREADER_H #include "podio/ROOTFrameData.h" -#include "podio/podioVersion.h" -#include "podio/utilities/DatamodelRegistryIOHelpers.h" -#include "podio/utilities/RootHelpers.h" +#include "podio/utilities/ReaderCommon.h" +#include "podio/utilities/RNTupleHelpers.h" #include #include #include #include -#include -#include -#include - namespace podio { -/// Introduce a new namespace instead of potentially opening and polluting the -/// ROOT namespace -namespace root_compat { -#if ROOT_VERSION_CODE < ROOT_VERSION(6, 35, 0) - using RNTupleReader = ROOT::Experimental::RNTupleReader; -#else - using RNTupleReader = ROOT::RNTupleReader; -#endif -} // namespace root_compat - /// The RNTupleReader can be used to read files that have been written with the /// RNTuple backend. /// /// The RNTupleReader provides the data as ROOTFrameData from which a podio::Frame /// can be constructed. It can be used to read files written by the RNTupleWriter. 
-class RNTupleReader { +class RNTupleReader : public ReaderCommon, public RNTupleReaderCommon { public: RNTupleReader() = default; @@ -91,11 +76,6 @@ class RNTupleReader { std::unique_ptr readEntry(std::string_view name, const unsigned entry, const std::vector& collsToRead = {}); - /// Get the names of all the available Frame categories in the current file(s). - /// - /// @returns The names of the available categores from the file - std::vector getAvailableCategories() const; - /// Get the number of entries for the given name /// /// @param name The name of the category @@ -103,62 +83,8 @@ class RNTupleReader { /// @returns The number of entries that are available for the category unsigned getEntries(std::string_view name) const; - /// Get the build version of podio that has been used to write the current - /// file - /// - /// @returns The podio build version - podio::version::Version currentFileVersion() const { - return m_fileVersion; - } - - /// Get the (build) version of a datamodel that has been used to write the - /// current file - /// - /// @param name The name of the datamodel - /// - /// @returns The (build) version of the datamodel if available or an empty - /// optional - std::optional currentFileVersion(std::string_view name) const { - return m_datamodelHolder.getDatamodelVersion(name); - } - - /// Get the datamodel definition for the given name - /// - /// @param name The name of the datamodel - /// - /// @returns The high level definition of the datamodel in JSON format - const std::string_view getDatamodelDefinition(std::string_view name) const { - return m_datamodelHolder.getDatamodelDefinition(name); - } - - /// Get all names of the datamodels that are available from this reader - /// - /// @returns The names of the datamodels - std::vector getAvailableDatamodels() const { - return m_datamodelHolder.getAvailableDatamodels(); - } - private: - /** - * Initialize the given category by filling the maps with metadata information - * that will be used 
later - */ - bool initCategory(std::string_view category); - - /** - * Read and reconstruct the generic parameters of the Frame - */ - GenericParameters readEventMetaData(root_compat::RNTupleReader* reader, const unsigned localEntry); - - std::unique_ptr m_metadata{}; - - podio::version::Version m_fileVersion{}; - DatamodelDefinitionHolder m_datamodelHolder{}; - std::unordered_map>> m_readers{}; - std::unordered_map> m_metadata_readers{}; - std::vector m_filenames{}; - std::unordered_map m_entries{}; // Map category to a vector that contains at how many entries each reader starts // For example, if we have 3 readers and the first one has 10 entries, the second one 20 and the third one 30 @@ -170,9 +96,7 @@ class RNTupleReader { /// Map each category to the collections that have been written and are available std::unordered_map> m_collectionInfo{}; - std::vector m_availableCategories{}; - - std::unordered_map> m_idTables{}; + std::unordered_map> m_idTables{}; }; } // namespace podio diff --git a/include/podio/ROOTLazyFrameData.h b/include/podio/ROOTLazyFrameData.h new file mode 100644 index 000000000..e06315b3e --- /dev/null +++ b/include/podio/ROOTLazyFrameData.h @@ -0,0 +1,62 @@ +#ifndef PODIO_ROOTLAZYFRAMEDATA_H +#define PODIO_ROOTLAZYFRAMEDATA_H + +#include "podio/CollectionBuffers.h" +#include "podio/CollectionIDTable.h" +#include "podio/GenericParameters.h" + +#include +#include +#include +#include +#include + +namespace podio { + +struct CategoryState; + +/// FrameData implementation for lazy ROOT reading. Instead of holding +/// pre-populated buffers, it holds a reference to the shared CategoryState and +/// reads individual collections from disk on demand when getCollectionBuffers() +/// is called. 
+class ROOTLazyFrameData { + using CollIDPtr = std::shared_ptr; + +public: + ROOTLazyFrameData() = delete; + ~ROOTLazyFrameData() = default; + ROOTLazyFrameData(ROOTLazyFrameData&&) = default; + ROOTLazyFrameData& operator=(ROOTLazyFrameData&&) = default; + ROOTLazyFrameData(const ROOTLazyFrameData&) = delete; + ROOTLazyFrameData& operator=(const ROOTLazyFrameData&) = delete; + + /// Construct from shared state, entry number, available collections, ID table + /// and eagerly-read parameters + ROOTLazyFrameData(std::shared_ptr state, unsigned entry, + std::unordered_map&& availableCollections, CollIDPtr idTable, + podio::GenericParameters&& params); + + /// Lazily read a single collection from ROOT. Each call reads exactly one + /// collection from disk, positions the TChain, refreshes branch pointers, + /// and returns the populated buffers. + std::optional getCollectionBuffers(const std::string& name); + + podio::CollectionIDTable getIDTable() const; + + std::unique_ptr getParameters(); + + std::vector getAvailableCollections() const; + +private: + std::shared_ptr m_state; + unsigned m_entry{0}; + /// Maps collection name to its index in CategoryState::storedClasses. + /// Collections are removed from this map once extracted. 
+ std::unordered_map m_availableCollections{}; + CollIDPtr m_idTable{nullptr}; + podio::GenericParameters m_parameters{}; +}; + +} // namespace podio + +#endif // PODIO_ROOTLAZYFRAMEDATA_H diff --git a/include/podio/ROOTLazyReader.h b/include/podio/ROOTLazyReader.h new file mode 100644 index 000000000..2f287337e --- /dev/null +++ b/include/podio/ROOTLazyReader.h @@ -0,0 +1,117 @@ +#ifndef PODIO_ROOTLAZYREADER_H +#define PODIO_ROOTLAZYREADER_H + +#include "podio/ROOTLazyFrameData.h" +#include "podio/podioVersion.h" +#include "podio/utilities/DatamodelRegistryIOHelpers.h" +#include "podio/utilities/ReaderCommon.h" +#include "podio/utilities/RootHelpers.h" + +#include "TChain.h" + +#include +#include +#include +#include +#include +#include + +namespace podio { + +class CollectionIDTable; +class GenericParameters; + +/// This class reads data from ROOT TTree files lazily: individual collections +/// are only read from disk when they are first accessed via Frame::get(). +/// +/// The ROOTLazyReader provides data as ROOTLazyFrameData from which a +/// podio::Frame can be constructed. It can be used to read files written by the +/// ROOTWriter. Unlike the ROOTReader which reads all collections eagerly, this +/// reader defers the actual ROOT I/O to the point of collection access. +class ROOTLazyReader : public ReaderCommon, root_utils::TTreeReaderCommon { + +public: + ROOTLazyReader() = default; + ~ROOTLazyReader() = default; + + ROOTLazyReader(const ROOTLazyReader&) = delete; + ROOTLazyReader& operator=(const ROOTLazyReader&) = delete; + ROOTLazyReader(ROOTLazyReader&&) = default; + ROOTLazyReader& operator=(ROOTLazyReader&&) = default; + + /// Open a single file for reading. + /// + /// @param filename The name of the input file + void openFile(const std::string& filename); + + /// Open multiple files for reading and then treat them as if they are one file + /// + /// @note All of the files are assumed to have the same structure. 
Specifically + /// this means: + /// - The same categories are available from all files + /// - The collections that are contained in the individual categories are the + /// same across all files + /// + /// @param filenames The filenames of all input files that should be read + void openFiles(const std::vector& filenames); + + /// Read the next data entry for a given category. + /// + /// @param name The category name for which to read the next entry + /// @param collsToRead (optional) the collection names that should be available + /// for lazy reading. If not provided (or empty) all collections + /// will be available. + /// + /// @returns FrameData from which a podio::Frame can be constructed if the + /// category exists and if there are still entries left to read. + /// Otherwise a nullptr. No collection data is read at this point. + /// + /// @throws std::invalid_argument in case collsToRead contains collection + /// names that are not available + std::unique_ptr readNextEntry(std::string_view name, + const std::vector& collsToRead = {}); + + /// Read the desired data entry for a given category. + /// + /// @param name The category name for which to read the desired entry + /// @param entry The entry number to read + /// @param collsToRead (optional) the collection names that should be available + /// for lazy reading. If not provided (or empty) all collections + /// will be available. + /// + /// @returns FrameData from which a podio::Frame can be constructed if the + /// category and the desired entry exist. Otherwise a nullptr. No + /// collection data is read at this point.
+ /// + /// @throws std::invalid_argument in case collsToRead contains collection + /// names that are not available + std::unique_ptr readEntry(std::string_view name, const unsigned entry, + const std::vector& collsToRead = {}); + + /// Get the number of entries for the given name + /// + /// @param name The name of the category + /// + /// @returns The number of entries that are available for the category + unsigned getEntries(std::string_view name) const; + + /// Get the names of all the available Frame categories in the current file(s). + /// + /// @returns The names of the available categories from the file + std::vector getAvailableCategories() const; + +private: + /// Get the category state for the given name. Initializes on first access. + std::shared_ptr& getCategoryState(std::string_view name); + + /// Read the data entry with the given entry number from the given + /// category state. Returns nullptr if out of bounds. + std::unique_ptr readEntry(std::shared_ptr& catState, unsigned entry, + const std::vector& collsToRead); + + std::unordered_map> m_categoryStates{}; +}; + +} // namespace podio + +#endif // PODIO_ROOTLAZYREADER_H diff --git a/include/podio/ROOTReader.h b/include/podio/ROOTReader.h index f59c2941d..6970dded6 100644 --- a/include/podio/ROOTReader.h +++ b/include/podio/ROOTReader.h @@ -2,8 +2,7 @@ #define PODIO_ROOTREADER_H #include "podio/ROOTFrameData.h" -#include "podio/podioVersion.h" -#include "podio/utilities/DatamodelRegistryIOHelpers.h" +#include "podio/utilities/ReaderCommon.h" #include "podio/utilities/ReaderUtils.h" #include "podio/utilities/RootHelpers.h" @@ -13,7 +12,6 @@ #include #include #include -#include #include #include @@ -24,18 +22,6 @@ class TTree; namespace podio { -namespace detail { - // Information about the collection class type, whether it is a subset, the - // schema version on file and the index in the collection branches cache - // vector - using CollectionInfo = std::tuple; - - struct NamedCollInfo { -
std::string name{}; - CollectionInfo info{}; - }; -} // namespace detail - class CollectionBase; class CollectionIDTable; class GenericParameters; @@ -46,7 +32,7 @@ struct CollectionReadBuffers; /// /// The ROOTReader provides the data as ROOTFrameData from which a podio::Frame /// can be constructed. It can be used to read files written by the ROOTWriter. -class ROOTReader { +class ROOTReader : public ReaderCommon, root_utils::TTreeReaderCommon { public: ROOTReader() = default; @@ -112,45 +98,11 @@ class ROOTReader { /// @returns The number of entries that are available for the category unsigned getEntries(std::string_view name) const; - /// Get the build version of podio that has been used to write the current - /// file - /// - /// @returns The podio build version - podio::version::Version currentFileVersion() const { - return m_fileVersion; - } - - /// Get the (build) version of a datamodel that has been used to write the - /// current file - /// - /// @param name The name of the datamodel - /// - /// @returns The (build) version of the datamodel if available or an empty - /// optional - std::optional currentFileVersion(std::string_view name) const { - return m_datamodelHolder.getDatamodelVersion(name); - } - /// Get the names of all the available Frame categories in the current file(s). 
/// /// @returns The names of the available categories from the file std::vector getAvailableCategories() const; - /// Get the datamodel definition for the given name - /// - /// @param name The name of the datamodel - /// - /// @returns The high level definition of the datamodel in JSON format - const std::string_view getDatamodelDefinition(std::string_view name) const { - return m_datamodelHolder.getDatamodelDefinition(name); - } - - /// Get all names of the datamodels that are available from this reader - /// - /// @returns The names of the datamodels - std::vector getAvailableDatamodels() const { - return m_datamodelHolder.getAvailableDatamodels(); - } std::optional> getSizeStats(std::string_view category); private: @@ -163,31 +115,20 @@ class ROOTReader { /// constructor from chain for more convenient map insertion CategoryInfo(std::unique_ptr&& c) : chain(std::move(c)) { } - std::unique_ptr chain{nullptr}; ///< The TChain with the data - unsigned entry{0}; ///< The next entry to read - std::vector storedClasses{}; ///< The stored collections in this - ///< category - std::vector branches{}; ///< The branches for this category - std::shared_ptr table{nullptr}; ///< The collection ID table for this category + std::unique_ptr chain{nullptr}; ///< The TChain with the data + unsigned entry{0}; ///< The next entry to read + std::vector storedClasses{}; ///< The stored collections in this + ///< category + std::vector branches{}; ///< The (data) branches for this category + std::vector paramBranches{}; ///< The parameter branches for this category + std::shared_ptr table{nullptr}; ///< The collection ID table for this category }; - /// Initialize the passed CategoryInfo by setting up the necessary branches, - /// collection infos and all necessary meta data to be able to read entries - /// with this name - void initCategory(CategoryInfo& catInfo, std::string_view name); - /// Get the category information for the given name. 
In case there is no TTree /// with contents for the given name this will return a CategoryInfo with an /// uninitialized chain (nullptr) member CategoryInfo& getCategoryInfo(std::string_view name); - /// Read the parameters for the entry specified in the passed CategoryInfo - GenericParameters readEntryParameters(CategoryInfo& catInfo, bool reloadBranches, unsigned int localEntry); - - template - static void readParams(CategoryInfo& catInfo, podio::GenericParameters& params, bool reloadBranches, - unsigned int localEntry); - /// Read the data entry specified in the passed CategoryInfo, and increase the /// counter afterwards. In case the requested entry is larger than the /// available number of entries, return a nullptr. @@ -198,12 +139,7 @@ class ROOTReader { std::optional getCollectionBuffers(CategoryInfo& catInfo, size_t iColl, bool reloadBranches, unsigned int localEntry); - std::unique_ptr m_metaChain{nullptr}; ///< The metadata tree std::unordered_map m_categories{}; ///< All categories - std::vector m_availCategories{}; ///< All available categories from this file - - podio::version::Version m_fileVersion{0, 0, 0}; - DatamodelDefinitionHolder m_datamodelHolder{}; }; } // namespace podio diff --git a/include/podio/Reader.h b/include/podio/Reader.h index 7d1f45822..e64ba55e5 100644 --- a/include/podio/Reader.h +++ b/include/podio/Reader.h @@ -226,10 +226,12 @@ class Reader { /// /// @param filename The (path to the) file to read from. /// The file path can include glob patterns to match multiple files. 
+/// @param lazy Whether or not a lazy reader should be used internally (if +/// available) /// /// @returns A Reader that has been initialized and that can be used for reading /// data from the passed file -Reader makeReader(const std::string& filename); +Reader makeReader(const std::string& filename, bool lazy = false); /// Create a Reader that is able to read the files /// @@ -241,6 +243,8 @@ Reader makeReader(const std::string& filename); /// @note For SIO files this will only work with exactly one file! /// /// @param filenames The (paths to the) files to read from +/// @param lazy Whether or not a lazy reader should be used internally (if +/// available) /// /// @returns A Reader that has been initialized and that can be used for reading /// data from the passed files @@ -248,7 +252,7 @@ Reader makeReader(const std::string& filename); /// @throws std::runtime_error in case the file extensions differ or in case /// support for the necessary I/O backend has not been built or in case /// multiple files for the SIO backend are passed -Reader makeReader(const std::vector& filenames); +Reader makeReader(const std::vector& filenames, bool lazy = false); } // namespace podio diff --git a/include/podio/utilities/RNTupleHelpers.h b/include/podio/utilities/RNTupleHelpers.h new file mode 100644 index 000000000..2fd699b67 --- /dev/null +++ b/include/podio/utilities/RNTupleHelpers.h @@ -0,0 +1,66 @@ +#ifndef PODIO_UTILITIES_RNTUPLEHELPERS_H +#define PODIO_UTILITIES_RNTUPLEHELPERS_H + +#include "podio/CollectionIDTable.h" +#include "podio/GenericParameters.h" +#include "podio/podioVersion.h" +#include "podio/utilities/DatamodelRegistryIOHelpers.h" +#include "podio/utilities/RootHelpers.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace podio { + +/// Introduce a new namespace instead of potentially opening and polluting the +/// ROOT namespace +namespace root_compat { +#if ROOT_VERSION_CODE < ROOT_VERSION(6, 35, 0) + using 
RNTupleReader = ROOT::Experimental::RNTupleReader; +#else + using RNTupleReader = ROOT::RNTupleReader; +#endif +} // namespace root_compat + +/// Base struct that factors out the common metadata handling shared between +/// RNTupleReader and RNTupleLazyReader. Mirrors the role that +/// root_utils::TTreeReaderCommon plays for the TTree-based readers. +struct RNTupleReaderCommon { +public: + /// Get the names of all the available Frame categories in the current file(s). + std::vector getAvailableCategories() const; + +protected: + /// Open per-file metadata readers, read the podio version, EDM definitions + /// and the list of available categories. Writes into fileVersion and + /// datamodelHolder (which live in the ReaderCommon base class). + void openMetaData(const std::vector& filenames, podio::version::Version& fileVersion, + podio::DatamodelDefinitionHolder& datamodelHolder); + + /// Read and reconstruct the generic parameters of the Frame from the given + /// metadata reader at the given local entry index. + GenericParameters readEventMetaData(root_compat::RNTupleReader* reader, unsigned localEntry); + + /// Read the CollectionWriteInfo for the given category from the first file's + /// metadata reader and build the CollectionIDTable. Both output parameters are + /// written only on success (return value true). Returns false if the category + /// is not in m_availableCategories. 
+ bool initCategory(std::string_view category, std::vector& collInfo, + std::shared_ptr& idTable); + + std::unique_ptr m_metadata{}; + std::unordered_map> m_metadata_readers{}; + std::vector m_filenames{}; + std::vector m_availableCategories{}; +}; + +} // namespace podio + +#endif // PODIO_UTILITIES_RNTUPLEHELPERS_H diff --git a/include/podio/utilities/ReaderCommon.h b/include/podio/utilities/ReaderCommon.h new file mode 100644 index 000000000..3540e87ea --- /dev/null +++ b/include/podio/utilities/ReaderCommon.h @@ -0,0 +1,53 @@ +#ifndef PODIO_UTILITIES_READERCOMMON_H +#define PODIO_UTILITIES_READERCOMMON_H + +#include "podio/podioVersion.h" +#include "podio/utilities/DatamodelRegistryIOHelpers.h" + +namespace podio { + +class ReaderCommon { +public: + /// Get the build version of podio that has been used to write the current + /// file + /// + /// @returns The podio build version + podio::version::Version currentFileVersion() const { + return m_fileVersion; + } + + /// Get the (build) version of a datamodel that has been used to write the + /// current file + /// + /// @param name The name of the datamodel + /// + /// @returns The (build) version of the datamodel if available or an empty + /// optional + std::optional currentFileVersion(std::string_view name) const { + return m_datamodelHolder.getDatamodelVersion(name); + } + + /// Get the datamodel definition for the given name + /// + /// @param name The name of the datamodel + /// + /// @returns The high level definition of the datamodel in JSON format + const std::string_view getDatamodelDefinition(std::string_view name) const { + return m_datamodelHolder.getDatamodelDefinition(name); + } + + /// Get all names of the datamodels that are available from this reader + /// + /// @returns The names of the datamodels + std::vector getAvailableDatamodels() const { + return m_datamodelHolder.getAvailableDatamodels(); + } + +protected: + podio::version::Version m_fileVersion{0, 0, 0}; + DatamodelDefinitionHolder 
m_datamodelHolder{}; +}; + +} // namespace podio + +#endif // PODIO_UTILITIES_READERCOMMON_H diff --git a/include/podio/utilities/RootHelpers.h b/include/podio/utilities/RootHelpers.h index 41d17f927..62c4cf5af 100644 --- a/include/podio/utilities/RootHelpers.h +++ b/include/podio/utilities/RootHelpers.h @@ -2,16 +2,23 @@ #define PODIO_UTILITIES_ROOTHELPERS_H #include "podio/GenericParameters.h" +#include "podio/SchemaEvolution.h" +#include "podio/podioVersion.h" +#include "podio/utilities/DatamodelRegistryIOHelpers.h" #include "ROOT/RVec.hxx" #include "TBranch.h" +#include "TChain.h" +#include #include +#include #include #include namespace podio { class CollectionBase; +class CollectionIDTable; namespace root_utils { @@ -97,6 +104,34 @@ namespace root_utils { ROOT::VecOps::RVec doubleKeys, ROOT::VecOps::RVec> doubleValues, ROOT::VecOps::RVec stringKeys, ROOT::VecOps::RVec> stringValues); + struct TTreeReaderCommon { + // Information about the collection class type, whether it is a subset, the + // schema version on file and the index in the collection branches cache + // vector + using CollectionInfo = std::tuple; + + struct NamedCollInfo { + std::string name{}; + CollectionInfo info{}; + }; + + protected: + /// Open the metadata chain, read the file version and EDM definitions into + /// the provided references, and populate m_availCategories. + /// fileVersion and datamodelHolder are passed by ref because they live in + /// ReaderCommon (a separate base class). + void openMetaChain(const std::vector& filenames, podio::version::Version& fileVersion, + podio::DatamodelDefinitionHolder& datamodelHolder); + + /// Unified parameter reading. reloadBranches=true always for the lazy reader. 
+ static podio::GenericParameters + readEntryParameters(std::vector& paramBranches, TChain* chain, + const podio::version::Version& fileVersion, bool reloadBranches, unsigned int localEntry); + + std::unique_ptr m_metaChain{nullptr}; ///< The metadata tree + std::vector m_availCategories{}; ///< All available categories from this file + }; + } // namespace root_utils } // namespace podio diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e5fecaa38..3789aeab6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,28 +89,38 @@ SET(root_sources rootUtils.h ROOTWriter.cc ROOTReader.cc + ROOTLazyReader.cc ROOTLegacyReader.cc ROOTFrameData.cc + ROOTLazyFrameData.cc RootHelpers.cc ) if(ENABLE_RNTUPLE) list(APPEND root_sources + RNTupleReaderCommon.cc RNTupleReader.cc RNTupleWriter.cc + RNTupleLazyReader.cc + RNTupleLazyFrameData.cc ) endif() SET(root_headers ${PROJECT_SOURCE_DIR}/include/podio/ROOTReader.h + ${PROJECT_SOURCE_DIR}/include/podio/ROOTLazyReader.h ${PROJECT_SOURCE_DIR}/include/podio/ROOTLegacyReader.h ${PROJECT_SOURCE_DIR}/include/podio/ROOTWriter.h ${PROJECT_SOURCE_DIR}/include/podio/ROOTFrameData.h + ${PROJECT_SOURCE_DIR}/include/podio/ROOTLazyFrameData.h ${PROJECT_SOURCE_DIR}/include/podio/utilities/RootHelpers.h ) if(ENABLE_RNTUPLE) list(APPEND root_headers ${PROJECT_SOURCE_DIR}/include/podio/RNTupleReader.h ${PROJECT_SOURCE_DIR}/include/podio/RNTupleWriter.h + ${PROJECT_SOURCE_DIR}/include/podio/RNTupleLazyReader.h + ${PROJECT_SOURCE_DIR}/include/podio/RNTupleLazyFrameData.h + ${PROJECT_SOURCE_DIR}/include/podio/utilities/RNTupleHelpers.h ) endif() diff --git a/src/DataSource.cc b/src/DataSource.cc index 5e4f0779f..e8d0eecf7 100644 --- a/src/DataSource.cc +++ b/src/DataSource.cc @@ -6,21 +6,37 @@ #include // ROOT -#include +#include // STL #include -#include #include namespace podio { + +// Column reader that wraps a pointer to the per-slot CollectionBase* pointer +class PodioColumnReader : public ROOT::Detail::RDF::RColumnReaderBase { + 
const podio::CollectionBase** fPtr; + +public: + explicit PodioColumnReader(const podio::CollectionBase** ptr) : fPtr(ptr) { + } + PodioColumnReader(const PodioColumnReader&) = delete; + PodioColumnReader& operator=(const PodioColumnReader&) = delete; + void* GetImpl(Long64_t) override { + // Return the actual collection pointer (T*), not the address of the storage (T**) + // RColumnReaderBase::Get does *static_cast(GetImpl()), so we return the T* itself + return const_cast(static_cast(*fPtr)); + } +}; + DataSource::DataSource(const std::string& filePath, int nEvents, const std::vector& collNames) : DataSource(utils::expand_glob(filePath), nEvents, collNames) { } DataSource::DataSource(const std::vector& filePathList, int nEvents, const std::vector& collNames) : - m_nSlots{1}, m_filePathList{filePathList} { + m_filePathList{filePathList} { SetupInput(nEvents, collNames); } @@ -35,7 +51,7 @@ void DataSource::SetupInput(int nEvents, const std::vector& collsTo // Create probing frame podio::Frame frame; unsigned int nEventsInFiles = 0; - auto podioReader = podio::makeReader(m_filePathList); + auto podioReader = podio::makeReader(m_filePathList, true); nEventsInFiles = podioReader.getEntries(podio::Category::Event); frame = podioReader.readFrame(podio::Category::Event, 0, collsToRead); @@ -55,11 +71,12 @@ void DataSource::SetupInput(int nEvents, const std::vector& collsTo m_nEvents = nEventsInFiles; } - // Get collections stored in the files + // Get collections stored in the files and build fast lookup map std::vector collNames = frame.getAvailableCollections(); for (auto&& collName : collNames) { const podio::CollectionBase* coll = frame.get(collName); if (coll) { + m_columnIndex[collName] = m_columnNames.size(); m_columnNames.emplace_back(std::move(collName)); m_columnTypes.emplace_back(coll->getTypeName()); } @@ -67,62 +84,54 @@ void DataSource::SetupInput(int nEvents, const std::vector& collsTo } void DataSource::SetNSlots(unsigned int nSlots) { - m_nSlots = 
nSlots; - - if (m_nSlots > m_nEvents) { - throw std::runtime_error("podio::DataSource: Number of events too small!"); - } + RDataSource::SetNSlots(nSlots); - int eventsPerSlot = m_nEvents / m_nSlots; - for (size_t i = 0; i < (m_nSlots - 1); ++i) { + // Build one range per slot; if there are fewer events than slots, cap at m_nEvents ranges + const unsigned int effectiveSlots = std::min(fNSlots, static_cast(m_nEvents)); + const ULong64_t eventsPerSlot = m_nEvents / effectiveSlots; + for (size_t i = 0; i < (effectiveSlots - 1); ++i) { m_rangesAll.emplace_back(eventsPerSlot * i, eventsPerSlot * (i + 1)); } - m_rangesAll.emplace_back(eventsPerSlot * (m_nSlots - 1), m_nEvents); - m_rangesAvailable = m_rangesAll; + m_rangesAll.emplace_back(eventsPerSlot * (effectiveSlots - 1), m_nEvents); + m_rangesCursor = 0; - // Initialize set of addresses needed - m_Collections.resize(m_columnNames.size(), std::vector(m_nSlots, nullptr)); + // Collections indexed [column][slot] + m_Collections.resize(m_columnNames.size(), std::vector(fNSlots, nullptr)); // Initialize podio readers - for (size_t i = 0; i < m_nSlots; ++i) { - m_podioReaders.emplace_back(std::make_unique(podio::makeReader(m_filePathList))); + for (size_t i = 0; i < fNSlots; ++i) { + m_podioReaders.emplace_back(std::make_unique(podio::makeReader(m_filePathList, true))); } - for (size_t i = 0; i < m_nSlots; ++i) { + for (size_t i = 0; i < fNSlots; ++i) { m_frames.emplace_back(std::make_unique()); } } void DataSource::Initialize() { + m_rangesCursor = 0; } std::vector> DataSource::GetEntryRanges() { - std::vector> rangesToBeProcessed; - for (auto& range : m_rangesAvailable) { - rangesToBeProcessed.emplace_back(range.first, range.second); - if (rangesToBeProcessed.size() >= m_nSlots) { - break; - } + if (m_rangesCursor >= m_rangesAll.size()) { + return {}; } - - if (m_rangesAvailable.size() > m_nSlots) { - m_rangesAvailable.erase(m_rangesAvailable.begin(), m_rangesAvailable.begin() + m_nSlots); - } else { - 
m_rangesAvailable.erase(m_rangesAvailable.begin(), m_rangesAvailable.end()); - } - - return rangesToBeProcessed; + const size_t end = std::min(m_rangesCursor + fNSlots, m_rangesAll.size()); + std::vector> result(m_rangesAll.cbegin() + m_rangesCursor, + m_rangesAll.cbegin() + end); + m_rangesCursor = end; + return result; } void DataSource::InitSlot(unsigned int, ULong64_t) { } bool DataSource::SetEntry(unsigned int slot, ULong64_t entry) { - m_frames[slot] = - std::make_unique(m_podioReaders[slot]->readFrame(podio::Category::Event, entry, m_columnNames)); + m_frames[slot] = std::make_unique( + m_podioReaders[slot]->readFrame(podio::Category::Event, entry, m_activeCollectionNames)); - for (auto& collectionIndex : m_activeCollections) { - m_Collections[collectionIndex][slot] = m_frames[slot]->get(m_columnNames.at(collectionIndex)); + for (auto collectionIndex : m_activeCollections) { + m_Collections[collectionIndex][slot] = m_frames[slot]->get(m_columnNames[collectionIndex]); } return true; @@ -134,45 +143,43 @@ void DataSource::FinalizeSlot(unsigned int) { void DataSource::Finalize() { } -std::vector DataSource::GetColumnReadersImpl(std::string_view columnName, const std::type_info&) { - auto itr = std::find(m_columnNames.begin(), m_columnNames.end(), columnName); - if (itr == m_columnNames.end()) { - std::string errMsg = "podio::DataSource: Can't find requested column \""; - errMsg += columnName; - errMsg += "\"!"; - throw std::runtime_error(errMsg); - } - auto columnIndex = std::distance(m_columnNames.begin(), itr); - m_activeCollections.emplace_back(columnIndex); - - std::vector columnReaders(m_nSlots); - for (size_t slotIndex = 0; slotIndex < m_nSlots; ++slotIndex) { - columnReaders[slotIndex] = static_cast(&m_Collections[columnIndex][slotIndex]); - } - - return columnReaders; -} - const std::vector& DataSource::GetColumnNames() const { return m_columnNames; } bool DataSource::HasColumn(std::string_view columnName) const { - return 
std::find(m_columnNames.begin(), m_columnNames.end(), columnName) != m_columnNames.end(); + return m_columnIndex.count(std::string(columnName)) > 0; } std::string DataSource::GetTypeName(std::string_view columnName) const { - auto itr = std::find(m_columnNames.begin(), m_columnNames.end(), columnName); - if (itr == m_columnNames.end()) { + auto itr = m_columnIndex.find(std::string(columnName)); + if (itr == m_columnIndex.end()) { std::string errMsg = "podio::DataSource: Type name for \""; errMsg += columnName; errMsg += "\" not found!"; throw std::runtime_error(errMsg); } - auto typeIndex = std::distance(m_columnNames.begin(), itr); + return m_columnTypes.at(itr->second); +} + +std::unique_ptr +DataSource::GetColumnReaders(unsigned int slot, std::string_view columnName, const std::type_info&) { + auto itr = m_columnIndex.find(std::string(columnName)); + if (itr == m_columnIndex.end()) { + std::string errMsg = "podio::DataSource: Can't find requested column \""; + errMsg += columnName; + errMsg += "\"!"; + throw std::runtime_error(errMsg); + } + const auto columnIndex = itr->second; + + if (std::find(m_activeCollections.begin(), m_activeCollections.end(), columnIndex) == m_activeCollections.end()) { + m_activeCollections.emplace_back(columnIndex); + m_activeCollectionNames.emplace_back(m_columnNames[columnIndex]); + } - return m_columnTypes.at(typeIndex); + return std::make_unique(&m_Collections[columnIndex][slot]); } ROOT::RDataFrame CreateDataFrame(const std::vector& filePathList, diff --git a/src/RNTupleLazyCategoryState.h b/src/RNTupleLazyCategoryState.h new file mode 100644 index 000000000..c384d68d9 --- /dev/null +++ b/src/RNTupleLazyCategoryState.h @@ -0,0 +1,61 @@ +#ifndef PODIO_RNTUPLELAZYCATEGORYSTATE_H +#define PODIO_RNTUPLELAZYCATEGORYSTATE_H + +#include "podio/CollectionIDTable.h" +#include "podio/utilities/RNTupleHelpers.h" + +#include +#include +#include +#include +#include +#include + +namespace podio { + +/// Shared state between RNTupleLazyReader 
and RNTupleLazyFrameData instances +/// for the same category. Holds the full RNTuple readers, entry offset +/// information, collection metadata, and a cache of partial readers used for +/// per-collection lazy loading. The mutex serializes all RNTuple I/O operations +/// since ROOT is not thread-safe. +struct RNTupleCategoryState { + RNTupleCategoryState() = default; + ~RNTupleCategoryState() = default; + RNTupleCategoryState(const RNTupleCategoryState&) = delete; + RNTupleCategoryState& operator=(const RNTupleCategoryState&) = delete; + RNTupleCategoryState(RNTupleCategoryState&&) = delete; + RNTupleCategoryState& operator=(RNTupleCategoryState&&) = delete; + + /// The category name (needed when opening partial readers) + std::string category{}; + + /// Full RNTuple readers, one per file + std::vector> readers{}; + /// Filenames parallel to readers (needed to open partial readers for the same file) + std::vector filenames{}; + + /// Cumulative entry offsets across readers. Entry i belongs to the reader + /// whose index corresponds to the last value in this vector that is <= i. + std::vector readerEntries{}; + unsigned totalEntries{0}; + + /// Collection metadata read from the file metadata RNTuple + std::vector collectionInfo{}; + /// Shared collection ID table + std::shared_ptr idTable{nullptr}; + + /// Sequential entry counter used by readNextEntry() + unsigned entry{0}; + + /// Cache of partial readers, keyed by (readerIndex, collectionName). + /// Each partial reader has a minimal RNTuple model with only the fields + /// needed for one collection, so LoadEntry() only reads that collection's data. 
+ std::map, std::unique_ptr> partialReaders{}; + + /// Mutex serializing all RNTuple I/O operations and partial reader creation + std::mutex mutex{}; +}; + +} // namespace podio + +#endif // PODIO_RNTUPLELAZYCATEGORYSTATE_H diff --git a/src/RNTupleLazyFrameData.cc b/src/RNTupleLazyFrameData.cc new file mode 100644 index 000000000..ccc02752a --- /dev/null +++ b/src/RNTupleLazyFrameData.cc @@ -0,0 +1,145 @@ +#include "podio/RNTupleLazyFrameData.h" +#include "podio/CollectionBufferFactory.h" +#include "podio/CollectionBuffers.h" +#include "podio/CollectionIDTable.h" +#include "podio/DatamodelRegistry.h" +#include "podio/GenericParameters.h" + +#include "RNTupleLazyCategoryState.h" +#include "rntuple_utils.h" +#include "rootUtils.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace podio { + +RNTupleLazyFrameData::RNTupleLazyFrameData(std::shared_ptr state, unsigned entry, + std::unordered_map&& availableCollections, + CollIDPtr idTable, podio::GenericParameters&& params) : + m_state(std::move(state)), + m_entry(entry), + m_availableCollections(std::move(availableCollections)), + m_idTable(std::move(idTable)), + m_parameters(std::move(params)) { +} + +std::optional RNTupleLazyFrameData::getCollectionBuffers(const std::string& name) { + const auto it = m_availableCollections.find(name); + if (it == m_availableCollections.end()) { + return std::nullopt; + } + + const auto collIndex = it->second; + const auto& coll = m_state->collectionInfo[collIndex]; + const auto& collType = coll.dataType; + + const auto& bufferFactory = podio::CollectionBufferFactory::instance(); + auto maybeBuffers = bufferFactory.createBuffers(collType, coll.schemaVersion, coll.isSubset); + if (!maybeBuffers) { + std::cerr << "WARNING: Buffers couldn't be created for collection " << name << " of type " << collType + << " and schema version " << coll.schemaVersion << std::endl; + return std::nullopt; + } + auto& collBuffers = 
maybeBuffers.value(); + + { + std::lock_guard lock{m_state->mutex}; + + // Multi-file dispatch: find which reader contains this entry and what the + // local entry index is within that reader + const auto& readerEntries = m_state->readerEntries; + const auto upper = std::ranges::upper_bound(readerEntries, m_entry); + const auto localEntry = m_entry - *(upper - 1); + const auto readerIndex = static_cast(upper - 1 - readerEntries.begin()); + + // Get or create a partial reader for (readerIndex, collectionName). + // Each partial reader has a minimal model with only the fields for one + // collection, so LoadEntry() only reads that collection's data from disk. + const auto cacheKey = std::make_pair(readerIndex, name); + auto partialIt = m_state->partialReaders.find(cacheKey); + if (partialIt == m_state->partialReaders.end()) { + // Compute the RNTuple field names needed for this collection + std::vector neededFieldNames; + if (coll.isSubset) { + neededFieldNames.emplace_back(root_utils::subsetBranch(coll.name)); + } else { + neededFieldNames.emplace_back(coll.name); + const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType); + for (const auto& relName : relVecNames.relations) { + neededFieldNames.emplace_back(root_utils::refBranch(coll.name, relName)); + } + for (const auto& vecName : relVecNames.vectorMembers) { + neededFieldNames.emplace_back(root_utils::vecBranch(coll.name, vecName)); + } + } + + // Build a minimal RNTupleModel from the full reader's descriptor, + // containing only the fields needed for this collection + auto& fullReader = *m_state->readers[readerIndex]; + const auto& desc = fullReader.GetDescriptor(); + + ROOT::RCreateFieldOptions fieldOpts; + fieldOpts.SetEmulateUnknownTypes(true); + fieldOpts.SetReturnInvalidOnError(true); + + auto smallModel = ROOT::RNTupleModel::CreateBare(); + const auto& topFieldDesc = desc.GetFieldDescriptor(desc.GetFieldZeroId()); + for (const auto& fieldDesc : 
desc.GetFieldIterable(topFieldDesc)) { + const auto& fn = fieldDesc.GetFieldName(); + if (std::ranges::find(neededFieldNames, fn) != neededFieldNames.end()) { + auto field = fieldDesc.CreateField(desc, fieldOpts); + if (field) { + smallModel->AddField(std::move(field)); + } + } + } + smallModel->Freeze(); + + const auto& filename = m_state->filenames[readerIndex]; + auto partialReader = root_compat::RNTupleReader::Open(std::move(smallModel), m_state->category, filename); + auto [insertIt, _] = m_state->partialReaders.emplace(cacheKey, std::move(partialReader)); + partialIt = insertIt; + } + + auto& partialReader = *partialIt->second; + const auto dentry = partialReader.GetModel().CreateEntry(); + + if (!rntuple_utils::bindCollectionToEntry(dentry.get(), collBuffers, coll)) { + return std::nullopt; + } + + partialReader.LoadEntry(localEntry, *dentry); + } // mutex released here + + m_availableCollections.erase(it); + return {std::move(collBuffers)}; +} + +podio::CollectionIDTable RNTupleLazyFrameData::getIDTable() const { + return {m_idTable->ids(), m_idTable->names()}; +} + +std::unique_ptr RNTupleLazyFrameData::getParameters() { + return std::make_unique(std::move(m_parameters)); +} + +std::vector RNTupleLazyFrameData::getAvailableCollections() const { + std::vector collections; + collections.reserve(m_availableCollections.size()); + for (const auto& [name, _] : m_availableCollections) { + collections.push_back(name); + } + return collections; +} + +} // namespace podio diff --git a/src/RNTupleLazyReader.cc b/src/RNTupleLazyReader.cc new file mode 100644 index 000000000..36ca9d17e --- /dev/null +++ b/src/RNTupleLazyReader.cc @@ -0,0 +1,126 @@ +#include "podio/RNTupleLazyReader.h" +#include "podio/GenericParameters.h" +#include "podio/utilities/RootHelpers.h" + +#include "RNTupleLazyCategoryState.h" +#include "rntuple_utils.h" + +#include +#include +#include +#include +#include +#include + +namespace podio { + +void RNTupleLazyReader::openFile(const std::string& 
filename) { + openFiles({filename}); +} + +void RNTupleLazyReader::openFiles(const std::vector& filenames) { + openMetaData(filenames, m_fileVersion, m_datamodelHolder); + + // For each category, create a shared state and open one full reader per file + for (const auto& category : m_availableCategories) { + auto state = std::make_shared(); + state->category = category; + state->readerEntries.reserve(m_filenames.size() + 1); + state->readerEntries.push_back(0); + + for (const auto& filename : m_filenames) { + try { +#if ROOT_VERSION_CODE >= ROOT_VERSION(6, 36, 0) + ROOT::RNTupleDescriptor::RCreateModelOptions options; + options.SetEmulateUnknownTypes(true); + state->readers.emplace_back(root_compat::RNTupleReader::Open(options, category, filename)); +#else + state->readers.emplace_back(root_compat::RNTupleReader::Open(category, filename)); +#endif + state->filenames.emplace_back(filename); + state->readerEntries.push_back(state->readerEntries.back() + state->readers.back()->GetNEntries()); + } catch (const RException&) { + std::cout << "Category " << category << " not found in file " << filename << std::endl; + } + } + + state->totalEntries = state->readerEntries.back(); + // The last element is the total; drop it so the vector only contains start offsets + state->readerEntries.pop_back(); + + m_categoryStates.try_emplace(category, std::move(state)); + } +} + +unsigned RNTupleLazyReader::getEntries(std::string_view name) const { + if (const auto it = m_categoryStates.find(name); it != m_categoryStates.end()) { + return it->second->totalEntries; + } + return 0; +} + +std::unique_ptr RNTupleLazyReader::readNextEntry(std::string_view name, + const std::vector& collsToRead) { + if (const auto it = m_categoryStates.find(name); it != m_categoryStates.end()) { + return readEntry(name, it->second->entry, collsToRead); + } + return nullptr; +} + +std::unique_ptr RNTupleLazyReader::readEntry(std::string_view category, const unsigned entNum, + const std::vector& collsToRead) { 
+ // Lazy-initialize collection info on first access for this category + const auto stateIt = m_categoryStates.find(category); + if (stateIt == m_categoryStates.end()) { + return nullptr; + } + auto& state = stateIt->second; + + if (state->collectionInfo.empty()) { + if (!initCategory(category, state->collectionInfo, state->idTable)) { + return nullptr; + } + } + + if (entNum >= state->totalEntries) { + return nullptr; + } + + const auto& collInfo = state->collectionInfo; + if (!collsToRead.empty()) { + for (const auto& name : collsToRead) { + if (std::ranges::find(collInfo, name, &root_utils::CollectionWriteInfo::name) == collInfo.end()) { + throw std::invalid_argument(name + " is not available from Frame"); + } + } + } + + // Multi-file dispatch to find the right reader and local entry + const auto& readerEntries = state->readerEntries; + const auto upper = std::ranges::upper_bound(readerEntries, entNum); + const auto localEntry = entNum - *(upper - 1); + const auto readerIndex = static_cast(upper - 1 - readerEntries.begin()); + + // Read parameters eagerly (they are small and almost always needed) + GenericParameters parameters; + { + std::lock_guard lock{state->mutex}; + parameters = readEventMetaData(state->readers[readerIndex].get(), localEntry); + } + + // Build available collections map: name -> index in collectionInfo + std::unordered_map availableCollections; + for (size_t i = 0; i < collInfo.size(); ++i) { + if (!collsToRead.empty() && std::ranges::find(collsToRead, collInfo[i].name) == collsToRead.end()) { + continue; + } + availableCollections.emplace(collInfo[i].name, i); + } + + state->entry = entNum + 1; + + return std::make_unique(state, entNum, std::move(availableCollections), state->idTable, + std::move(parameters)); +} + +} // namespace podio diff --git a/src/RNTupleReader.cc b/src/RNTupleReader.cc index 68f892d91..945664e5e 100644 --- a/src/RNTupleReader.cc +++ b/src/RNTupleReader.cc @@ -1,99 +1,22 @@ #include "podio/RNTupleReader.h" #include 
"podio/CollectionBufferFactory.h" -#include "podio/CollectionBuffers.h" -#include "podio/DatamodelRegistry.h" -#include "podio/GenericParameters.h" #include "podio/utilities/RootHelpers.h" -#include "rootUtils.h" - -#include +#include "rntuple_utils.h" #include -#include #include #include #include -#include #include -// Adjust for the move of this out of ROOT v7 in -// https://github.com/root-project/root/pull/17281 -#if ROOT_VERSION_CODE >= ROOT_VERSION(6, 35, 0) -using ROOT::RException; -#else -using ROOT::Experimental::RException; -#endif - namespace podio { -template -void readParams(root_compat::RNTupleReader* reader, const unsigned localEntry, GenericParameters& params) { - auto keyView = reader->GetView>(root_utils::getGPKeyName()); - auto valueView = reader->GetView>>(root_utils::getGPValueName()); - - params.loadFrom(keyView(localEntry), valueView(localEntry)); -} - -GenericParameters RNTupleReader::readEventMetaData(root_compat::RNTupleReader* reader, const unsigned localEntry) { - GenericParameters params; - - readParams(reader, localEntry, params); - readParams(reader, localEntry, params); - readParams(reader, localEntry, params); - readParams(reader, localEntry, params); - - return params; -} - -bool RNTupleReader::initCategory(std::string_view category) { - if (std::ranges::find(m_availableCategories, category) == m_availableCategories.end()) { - return false; - } - // Assume that the metadata is the same in all files - const auto& filename = m_filenames[0]; - - auto collInfo = m_metadata_readers[filename]->GetView>( - {root_utils::collInfoName(category)}); - - m_collectionInfo[category] = collInfo(0); - m_idTables[category] = root_utils::makeCollIdTable(collInfo(0)); - - return true; -} - void RNTupleReader::openFile(const std::string& filename) { openFiles({filename}); } void RNTupleReader::openFiles(const std::vector& filenames) { - - m_filenames.insert(m_filenames.end(), filenames.begin(), filenames.end()); - for (const auto& filename : filenames) 
{ - m_metadata_readers.try_emplace(filename, root_compat::RNTupleReader::Open(root_utils::metaTreeName, filename)); - } - - m_metadata = root_compat::RNTupleReader::Open(root_utils::metaTreeName, filenames[0]); - - auto versionView = m_metadata->GetView>(root_utils::versionBranchName); - const auto version = versionView(0); - - m_fileVersion = podio::version::Version{version[0], version[1], version[2]}; - - auto edmView = m_metadata->GetView>>(root_utils::edmDefBranchName); - auto edm = edmView(0); - DatamodelDefinitionHolder::VersionList edmVersions{}; - for (const auto& [name, _] : edm) { - try { - auto edmVersionView = m_metadata->GetView>(root_utils::edmVersionBranchName(name)); - const auto edmVersion = edmVersionView(0); - edmVersions.emplace_back(name, podio::version::Version{edmVersion[0], edmVersion[1], edmVersion[2]}); - } catch (const RException&) { - } - } - m_datamodelHolder = DatamodelDefinitionHolder(std::move(edm), std::move(edmVersions)); - - auto availableCategoriesField = m_metadata->GetView>(root_utils::availableCategories); - m_availableCategories = availableCategoriesField(0); + openMetaData(filenames, m_fileVersion, m_datamodelHolder); // Pre-fill the entries map for (const auto& category : m_availableCategories) { @@ -128,15 +51,6 @@ unsigned RNTupleReader::getEntries(std::string_view name) const { return 0; } -std::vector RNTupleReader::getAvailableCategories() const { - std::vector cats; - cats.reserve(m_availableCategories.size()); - for (const auto& cat : m_availableCategories) { - cats.emplace_back(cat); - } - return cats; -} - std::unique_ptr RNTupleReader::readNextEntry(std::string_view category, const std::vector& collsToRead) { return readEntry(category, m_entries[category], collsToRead); @@ -145,7 +59,7 @@ std::unique_ptr RNTupleReader::readNextEntry(std::string_view cat std::unique_ptr RNTupleReader::readEntry(std::string_view category, const unsigned entNum, const std::vector& collsToRead) { if (m_collectionInfo.find(category) == 
m_collectionInfo.end()) { - if (!initCategory(category)) { + if (!initCategory(category, m_collectionInfo[category], m_idTables[category])) { return nullptr; } } @@ -197,36 +111,7 @@ std::unique_ptr RNTupleReader::readEntry(std::string_view categor } auto& collBuffers = maybeBuffers.value(); - try { - if (coll.isSubset) { - const auto brName = root_utils::subsetBranch(coll.name); - const auto vec = new std::vector; - dentry->BindRawPtr(brName, vec); - collBuffers.references->at(0) = std::unique_ptr>(vec); - } else { - dentry->BindRawPtr(coll.name, collBuffers.data); - - const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType); - for (size_t j = 0; j < relVecNames.relations.size(); ++j) { - const auto relName = relVecNames.relations[j]; - const auto vec = new std::vector; - const auto brName = root_utils::refBranch(coll.name, relName); - dentry->BindRawPtr(brName, vec); - collBuffers.references->at(j) = std::unique_ptr>(vec); - } - - for (size_t j = 0; j < relVecNames.vectorMembers.size(); ++j) { - const auto vecName = relVecNames.vectorMembers[j]; - const auto brName = root_utils::vecBranch(coll.name, vecName); - dentry->BindRawPtr(brName, collBuffers.vectorMembers->at(j).second); - } - } - } catch (const RException&) { - // We disable the automatic cleanup by clearing the delete function, - // because it seems ROOT still calls the destructor of the collection - // buffers when LoadEntry fails, which leads to a double deletion in case - // of a partial binding in BindRawPtr. 
- collBuffers.deleteBuffers = {}; + if (!rntuple_utils::bindCollectionToEntry(dentry.get(), collBuffers, coll)) { continue; } @@ -237,7 +122,8 @@ std::unique_ptr RNTupleReader::readEntry(std::string_view categor auto parameters = readEventMetaData(reader.get(), localEntry); - return std::make_unique(std::move(buffers), m_idTables[category], std::move(parameters)); + auto idTable = m_idTables[category]; + return std::make_unique(std::move(buffers), std::move(idTable), std::move(parameters)); } } // namespace podio diff --git a/src/RNTupleReaderCommon.cc b/src/RNTupleReaderCommon.cc new file mode 100644 index 000000000..00fffb9bd --- /dev/null +++ b/src/RNTupleReaderCommon.cc @@ -0,0 +1,80 @@ +#include "podio/utilities/RNTupleHelpers.h" +#include "rntuple_utils.h" +#include "rootUtils.h" + +#include +#include +#include +#include +#include + +namespace podio { + +std::vector RNTupleReaderCommon::getAvailableCategories() const { + std::vector cats; + cats.reserve(m_availableCategories.size()); + for (const auto& cat : m_availableCategories) { + cats.emplace_back(cat); + } + return cats; +} + +void RNTupleReaderCommon::openMetaData(const std::vector& filenames, + podio::version::Version& fileVersion, + podio::DatamodelDefinitionHolder& datamodelHolder) { + m_filenames.insert(m_filenames.end(), filenames.begin(), filenames.end()); + + for (const auto& filename : filenames) { + m_metadata_readers.try_emplace(filename, root_compat::RNTupleReader::Open(root_utils::metaTreeName, filename)); + } + + m_metadata = root_compat::RNTupleReader::Open(root_utils::metaTreeName, filenames[0]); + + auto versionView = m_metadata->GetView>(root_utils::versionBranchName); + const auto version = versionView(0); + fileVersion = podio::version::Version{version[0], version[1], version[2]}; + + auto edmView = m_metadata->GetView>>(root_utils::edmDefBranchName); + auto edm = edmView(0); + DatamodelDefinitionHolder::VersionList edmVersions{}; + for (const auto& [name, _] : edm) { + try { + auto 
edmVersionView = m_metadata->GetView>(root_utils::edmVersionBranchName(name)); + const auto edmVersion = edmVersionView(0); + edmVersions.emplace_back(name, podio::version::Version{edmVersion[0], edmVersion[1], edmVersion[2]}); + } catch (const RException&) { + } + } + datamodelHolder = DatamodelDefinitionHolder(std::move(edm), std::move(edmVersions)); + + auto availableCategoriesField = m_metadata->GetView>(root_utils::availableCategories); + m_availableCategories = availableCategoriesField(0); +} + +GenericParameters RNTupleReaderCommon::readEventMetaData(root_compat::RNTupleReader* reader, + const unsigned localEntry) { + GenericParameters params; + rntuple_utils::readParams(reader, localEntry, params); + rntuple_utils::readParams(reader, localEntry, params); + rntuple_utils::readParams(reader, localEntry, params); + rntuple_utils::readParams(reader, localEntry, params); + return params; +} + +bool RNTupleReaderCommon::initCategory(std::string_view category, + std::vector& collInfo, + std::shared_ptr& idTable) { + if (std::ranges::find(m_availableCategories, category) == m_availableCategories.end()) { + return false; + } + + const auto& filename = m_filenames[0]; + auto collInfoView = m_metadata_readers[filename]->GetView>( + {root_utils::collInfoName(category)}); + + collInfo = collInfoView(0); + idTable = root_utils::makeCollIdTable(collInfo); + return true; +} + +} // namespace podio diff --git a/src/ROOTFrameData.cc b/src/ROOTFrameData.cc index cbd690cc8..4afd49a47 100644 --- a/src/ROOTFrameData.cc +++ b/src/ROOTFrameData.cc @@ -11,7 +11,6 @@ std::optional ROOTFrameData::getCollectionBuffers( if (bufferHandle.empty()) { return std::nullopt; } - return {std::move(bufferHandle.mapped())}; } diff --git a/src/ROOTLazyCategoryState.h b/src/ROOTLazyCategoryState.h new file mode 100644 index 000000000..a86ba16f7 --- /dev/null +++ b/src/ROOTLazyCategoryState.h @@ -0,0 +1,38 @@ +#ifndef PODIO_ROOTLAZYCATEGORYSTATE_H +#define PODIO_ROOTLAZYCATEGORYSTATE_H + +#include 
"podio/CollectionIDTable.h" +#include "podio/ROOTReader.h" +#include "podio/utilities/RootHelpers.h" + +#include "TChain.h" + +#include +#include +#include + +namespace podio { + +/// Shared state between ROOTLazyReader and ROOTLazyFrameData instances for the +/// same category. Holds the TChain and branch information needed for lazy I/O. +/// The mutex serializes all ROOT I/O operations since ROOT is not thread-safe. +struct CategoryState { + CategoryState() = default; + ~CategoryState() = default; + CategoryState(const CategoryState&) = delete; + CategoryState& operator=(const CategoryState&) = delete; + CategoryState(CategoryState&&) = delete; + CategoryState& operator=(CategoryState&&) = delete; + + std::unique_ptr chain{nullptr}; + unsigned entry{0}; + std::vector storedClasses{}; + std::vector branches{}; + std::shared_ptr table{nullptr}; + std::vector paramBranches{}; + std::mutex mutex{}; +}; + +} // namespace podio + +#endif // PODIO_ROOTLAZYCATEGORYSTATE_H diff --git a/src/ROOTLazyFrameData.cc b/src/ROOTLazyFrameData.cc new file mode 100644 index 000000000..181127bc9 --- /dev/null +++ b/src/ROOTLazyFrameData.cc @@ -0,0 +1,86 @@ +#include "podio/ROOTLazyFrameData.h" +#include "podio/CollectionBufferFactory.h" +#include "podio/CollectionBuffers.h" +#include "podio/CollectionIDTable.h" +#include "podio/GenericParameters.h" + +#include "ROOTLazyCategoryState.h" +#include "rootUtils.h" + +#include "TChain.h" + +#include +#include +#include +#include +#include +#include + +namespace podio { + +ROOTLazyFrameData::ROOTLazyFrameData(std::shared_ptr state, unsigned entry, + std::unordered_map&& availableCollections, CollIDPtr idTable, + podio::GenericParameters&& params) : + m_state(std::move(state)), + m_entry(entry), + m_availableCollections(std::move(availableCollections)), + m_idTable(std::move(idTable)), + m_parameters(std::move(params)) { +} + +std::optional ROOTLazyFrameData::getCollectionBuffers(const std::string& name) { + const auto it = 
m_availableCollections.find(name); + if (it == m_availableCollections.end()) { + return std::nullopt; + } + + const auto collIndex = it->second; + const auto& [collType, isSubsetColl, schemaVersion, branchIndex] = m_state->storedClasses[collIndex].info; + auto& branches = m_state->branches[branchIndex]; + + const auto& bufferFactory = podio::CollectionBufferFactory::instance(); + auto maybeBuffers = bufferFactory.createBuffers(collType, schemaVersion, isSubsetColl); + if (!maybeBuffers) { + std::cerr << "WARNING: Buffers couldn't be created for collection " << name << " of type " << collType + << " and schema version " << schemaVersion << std::endl; + return std::nullopt; + } + auto& collBuffers = maybeBuffers.value(); + + { + std::lock_guard lock{m_state->mutex}; + + const auto localEntry = m_state->chain->LoadTree(m_entry); + + // Always refresh branch pointers for this collection since any intervening + // LoadTree call (from another ROOTLazyFrameData) may have invalidated them + root_utils::resetBranches(m_state->chain.get(), branches, m_state->storedClasses[collIndex].name); + + if (!root_utils::setCollectionAddressesReader(collBuffers, branches)) { + return std::nullopt; + } + root_utils::readBranchesData(branches, localEntry); + } + + m_availableCollections.erase(it); + return {std::move(collBuffers)}; +} + +podio::CollectionIDTable ROOTLazyFrameData::getIDTable() const { + return {m_idTable->ids(), m_idTable->names()}; +} + +std::unique_ptr ROOTLazyFrameData::getParameters() { + return std::make_unique(std::move(m_parameters)); +} + +std::vector ROOTLazyFrameData::getAvailableCollections() const { + std::vector collections; + collections.reserve(m_availableCollections.size()); + for (const auto& [name, _] : m_availableCollections) { + collections.push_back(name); + } + return collections; +} + +} // namespace podio diff --git a/src/ROOTLazyReader.cc b/src/ROOTLazyReader.cc new file mode 100644 index 000000000..8b512521a --- /dev/null +++ 
b/src/ROOTLazyReader.cc @@ -0,0 +1,134 @@ +#include "podio/ROOTLazyReader.h" +#include "podio/GenericParameters.h" +#include "podio/utilities/RootHelpers.h" + +#include "ROOTLazyCategoryState.h" +#include "rootUtils.h" + +// ROOT specific includes +#include "TChain.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace podio { + +std::unique_ptr ROOTLazyReader::readNextEntry(std::string_view name, + const std::vector& collsToRead) { + auto& catState = getCategoryState(name); + if (!catState) { + return nullptr; + } + auto result = readEntry(catState, catState->entry, collsToRead); + if (result) { + catState->entry++; + } + return result; +} + +std::unique_ptr ROOTLazyReader::readEntry(std::string_view name, const unsigned entry, + const std::vector& collsToRead) { + auto& catState = getCategoryState(name); + auto result = readEntry(catState, entry, collsToRead); + if (result) { + catState->entry = entry + 1; + } + return result; +} + +std::unique_ptr ROOTLazyReader::readEntry(std::shared_ptr& catState, unsigned entry, + const std::vector& collsToRead) { + if (!catState || !catState->chain) { + return nullptr; + } + if (entry >= catState->chain->GetEntries()) { + return nullptr; + } + + // Validate requested collections + if (!collsToRead.empty()) { + for (const auto& name : collsToRead) { + if (std::ranges::find(catState->storedClasses, name, &NamedCollInfo::name) == catState->storedClasses.end()) { + throw std::invalid_argument(name + " is not available from Frame"); + } + } + } + + podio::GenericParameters parameters; + { + std::lock_guard lock{catState->mutex}; + + const auto localEntry = catState->chain->LoadTree(entry); + + // Read parameters eagerly + parameters = readEntryParameters(catState->paramBranches, catState->chain.get(), m_fileVersion, + /*reloadBranches=*/true, localEntry); + } + + // Build available collections map (no lock needed, just reading metadata) + std::unordered_map availableCollections; + for (size_t i = 
0; i < catState->storedClasses.size(); ++i) { + if (!collsToRead.empty() && std::ranges::find(collsToRead, catState->storedClasses[i].name) == collsToRead.end()) { + continue; + } + availableCollections.emplace(catState->storedClasses[i].name, i); + } + + return std::make_unique(catState, entry, std::move(availableCollections), catState->table, + std::move(parameters)); +} + +std::shared_ptr& ROOTLazyReader::getCategoryState(std::string_view category) { + if (auto it = m_categoryStates.find(category); it != m_categoryStates.end()) { + // Use branches as proxy to check whether this category has been initialized + if (it->second->branches.empty()) { + root_utils::initCategory(*it->second, m_metaChain.get(), category, m_fileVersion); + } + return it->second; + } + + // Return a static nullptr for unknown categories + static auto invalidState = std::shared_ptr(nullptr); + return invalidState; +} + +void ROOTLazyReader::openFile(const std::string& filename) { + openFiles({filename}); +} + +void ROOTLazyReader::openFiles(const std::vector& filenames) { + openMetaChain(filenames, m_fileVersion, m_datamodelHolder); + + // Set up categories and their chains + for (const auto& cat : m_availCategories) { + auto state = std::make_shared(); + state->chain = std::make_unique(cat.c_str()); + for (const auto& fn : filenames) { + state->chain->Add(fn.c_str()); + } + m_categoryStates.try_emplace(cat, std::move(state)); + } +} + +unsigned ROOTLazyReader::getEntries(std::string_view name) const { + if (const auto it = m_categoryStates.find(name); it != m_categoryStates.end()) { + return it->second->chain->GetEntries(); + } + return 0; +} + +std::vector ROOTLazyReader::getAvailableCategories() const { + std::vector cats; + cats.reserve(m_categoryStates.size()); + for (const auto& [cat, _] : m_categoryStates) { + cats.emplace_back(cat); + } + return cats; +} + +} // namespace podio diff --git a/src/ROOTReader.cc b/src/ROOTReader.cc index 65aaa7429..19b5de307 100644 --- 
a/src/ROOTReader.cc +++ b/src/ROOTReader.cc @@ -1,17 +1,11 @@ #include "podio/ROOTReader.h" -#include "podio/CollectionBase.h" #include "podio/CollectionBufferFactory.h" #include "podio/CollectionBuffers.h" -#include "podio/CollectionIDTable.h" -#include "podio/DatamodelRegistry.h" -#include "podio/GenericParameters.h" -#include "podio/podioVersion.h" #include "podio/utilities/RootHelpers.h" #include "rootUtils.h" // ROOT specific includes #include "TChain.h" -#include "TClass.h" #include #include @@ -24,67 +18,6 @@ namespace podio { -std::tuple, std::vector> -createCollectionBranches(TChain* chain, const podio::CollectionIDTable& idTable, - const std::vector& collInfo); - -std::tuple, std::vector> -createCollectionBranchesIndexBased(TChain* chain, const podio::CollectionIDTable& idTable, - const std::vector& collInfo); - -template -void ROOTReader::readParams(ROOTReader::CategoryInfo& catInfo, podio::GenericParameters& params, bool reloadBranches, - unsigned int localEntry) { - const auto collBranchIdx = catInfo.branches.size() - root_utils::nParamBranches - 1; - constexpr auto brOffset = root_utils::getGPBranchOffsets(); - - if (reloadBranches) { - auto& keyBranch = catInfo.branches[collBranchIdx + brOffset.keys].data; - keyBranch = root_utils::getBranch(catInfo.chain.get(), root_utils::getGPKeyName()); - auto& valueBranch = catInfo.branches[collBranchIdx + brOffset.values].data; - valueBranch = root_utils::getBranch(catInfo.chain.get(), root_utils::getGPValueName()); - } - - auto keyBranch = catInfo.branches[collBranchIdx + brOffset.keys].data; - auto valueBranch = catInfo.branches[collBranchIdx + brOffset.values].data; - - root_utils::ParamStorage storage; - keyBranch->SetAddress(storage.keysPtr()); - keyBranch->GetEntry(localEntry); - valueBranch->SetAddress(storage.valuesPtr()); - valueBranch->GetEntry(localEntry); - - params.loadFrom(std::move(storage.keys), std::move(storage.values)); -} - -GenericParameters 
ROOTReader::readEntryParameters(ROOTReader::CategoryInfo& catInfo, bool reloadBranches, - unsigned int localEntry) { - GenericParameters params; - - if (m_fileVersion < podio::version::Version{0, 99, 99}) { - // Parameter branch is always the last one - auto& paramBranches = catInfo.branches.back(); - - // Make sure to have a valid branch pointer after switching trees in the chain - // as well as on the first event - if (reloadBranches) { - paramBranches.data = root_utils::getBranch(catInfo.chain.get(), root_utils::paramBranchName); - } - auto* branch = paramBranches.data; - - auto* emd = ¶ms; - branch->SetAddress(&emd); - branch->GetEntry(localEntry); - } else { - readParams(catInfo, params, reloadBranches, localEntry); - readParams(catInfo, params, reloadBranches, localEntry); - readParams(catInfo, params, reloadBranches, localEntry); - readParams(catInfo, params, reloadBranches, localEntry); - } - - return params; -} - std::unique_ptr ROOTReader::readNextEntry(std::string_view name, const std::vector& collsToRead) { auto& catInfo = getCategoryInfo(name); @@ -110,7 +43,7 @@ std::unique_ptr ROOTReader::readEntry(ROOTReader::CategoryInfo& c // Make sure to not silently ignore non-existant but requested collections if (!collsToRead.empty()) { for (const auto& name : collsToRead) { - if (std::ranges::find(catInfo.storedClasses, name, &detail::NamedCollInfo::name) == catInfo.storedClasses.end()) { + if (std::ranges::find(catInfo.storedClasses, name, &NamedCollInfo::name) == catInfo.storedClasses.end()) { throw std::invalid_argument(name + " is not available from Frame"); } } @@ -141,7 +74,8 @@ std::unique_ptr ROOTReader::readEntry(ROOTReader::CategoryInfo& c buffers.emplace(catInfo.storedClasses[i].name, std::move(collBuffers.value())); } - auto parameters = readEntryParameters(catInfo, reloadBranches, localEntry); + auto parameters = + readEntryParameters(catInfo.paramBranches, catInfo.chain.get(), m_fileVersion, reloadBranches, localEntry); catInfo.entry++; return 
std::make_unique(std::move(buffers), catInfo.table, std::move(parameters)); @@ -181,7 +115,7 @@ ROOTReader::CategoryInfo& ROOTReader::getCategoryInfo(std::string_view category) // Use the id table as proxy to check whether this category has been // initialized already if (it->second.branches.empty()) { - initCategory(it->second, category); + root_utils::initCategory(it->second, m_metaChain.get(), category, m_fileVersion); } return it->second; } @@ -193,145 +127,16 @@ ROOTReader::CategoryInfo& ROOTReader::getCategoryInfo(std::string_view category) return invalidCategory; } -void ROOTReader::initCategory(CategoryInfo& catInfo, std::string_view category) { - - auto* collInfoBranch = root_utils::getBranch(m_metaChain.get(), root_utils::collInfoName(category)); - - auto collInfo = std::vector(); - auto* collInfoPtr = &collInfo; - if (m_fileVersion >= podio::version::Version{1, 2, 999}) { - collInfoBranch->SetAddress(&collInfoPtr); - collInfoBranch->GetEntry(0); - } else { - auto collInfoOld = std::vector(); - if (m_fileVersion < podio::version::Version{0, 16, 4}) { - auto collInfoReallyOld = std::vector(); - auto* tmpPtr = &collInfoReallyOld; - collInfoBranch->SetAddress(&tmpPtr); - collInfoBranch->GetEntry(0); - collInfoOld.reserve(collInfoReallyOld.size()); - for (const auto& [collID, collType, isSubsetColl] : collInfoReallyOld) { - // Manually set the schema version to 1 - collInfo.emplace_back(collID, std::move(collType), isSubsetColl, 1u); - } - } else { - auto* tmpPtr = &collInfoOld; - collInfoBranch->SetAddress(&tmpPtr); - collInfoBranch->GetEntry(0); - } - // "Convert" to new style - collInfo.reserve(collInfoOld.size()); - for (const auto& [id, typeName, isSubsetColl, schemaVersion] : collInfoOld) { - collInfo.emplace_back(id, std::move(typeName), isSubsetColl, schemaVersion); - } - } - - // Recreate the idTable form the collection info if necessary, otherwise read - // it directly - if (m_fileVersion >= podio::version::Version{1, 2, 999}) { - catInfo.table = 
root_utils::makeCollIdTable(collInfo); - } else { - catInfo.table = std::make_shared(); - const auto* table = catInfo.table.get(); - auto* tableBranch = root_utils::getBranch(m_metaChain.get(), root_utils::idTableName(category)); - tableBranch->SetAddress(&table); - tableBranch->GetEntry(0); - } - - // For backwards compatibility make it possible to read the index based files - // from older versions - if (m_fileVersion < podio::version::Version{0, 16, 99}) { - std::tie(catInfo.branches, catInfo.storedClasses) = - createCollectionBranchesIndexBased(catInfo.chain.get(), *catInfo.table, collInfo); - } else { - std::tie(catInfo.branches, catInfo.storedClasses) = - createCollectionBranches(catInfo.chain.get(), *catInfo.table, collInfo); - } - - // Finally set up the branches for the parameters - if (m_fileVersion < podio::version::Version{0, 99, 99}) { - root_utils::CollectionBranches paramBranches{}; - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::paramBranchName)); - } else { - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::intKeyName)); - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::intValueName)); - - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::floatKeyName)); - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::floatValueName)); - - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::doubleKeyName)); - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::doubleValueName)); - - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::stringKeyName)); - catInfo.branches.emplace_back(root_utils::getBranch(catInfo.chain.get(), root_utils::stringValueName)); - } -} - -std::vector getAvailableCategories(TChain* metaChain) { - const auto* branches = metaChain->GetListOfBranches(); - 
std::vector brNames; - brNames.reserve(branches->GetEntries()); - - for (const auto branch : *branches) { - const std::string name = branch->GetName(); - const auto fUnder = name.find(root_utils::collInfoName("")); - if (fUnder != std::string::npos) { - brNames.emplace_back(name.substr(0, fUnder)); - } - } - - std::ranges::sort(brNames); - brNames.erase(std::unique(brNames.begin(), brNames.end()), brNames.end()); - return brNames; -} - void ROOTReader::openFile(const std::string& filename) { openFiles({filename}); } void ROOTReader::openFiles(const std::vector& filenames) { - m_metaChain = std::make_unique(root_utils::metaTreeName); - // NOTE: We simply assume that the meta data doesn't change throughout the - // chain! This essentially boils down to the assumption that all files that - // are read this way were written with the same settings. - // Reading all files is done to check that all file exists - for (const auto& filename : filenames) { - if (!m_metaChain->Add(filename.c_str(), -1)) { - throw std::runtime_error("File " + filename + " couldn't be found or the \"" + root_utils::metaTreeName + - "\" tree couldn't be read."); - } - } - - auto* versionPtr = &m_fileVersion; - if (auto* versionBranch = root_utils::getBranch(m_metaChain.get(), root_utils::versionBranchName)) { - versionBranch->SetAddress(&versionPtr); - versionBranch->GetEntry(0); - } - - if (auto* edmDefBranch = root_utils::getBranch(m_metaChain.get(), root_utils::edmDefBranchName)) { - auto datamodelDefs = DatamodelDefinitionHolder::MapType{}; - auto* datamodelDefsPtr = &datamodelDefs; - edmDefBranch->SetAddress(&datamodelDefsPtr); - edmDefBranch->GetEntry(0); - - DatamodelDefinitionHolder::VersionList edmVersions{}; - for (const auto& [name, _] : datamodelDefs) { - if (auto* edmVersionBranch = root_utils::getBranch(m_metaChain.get(), root_utils::edmVersionBranchName(name))) { - const auto edmVersion = podio::version::Version{}; - auto* tmpPtr = &edmVersion; - 
edmVersionBranch->SetAddress(&tmpPtr); - edmVersionBranch->GetEntry(0); - edmVersions.emplace_back(name, edmVersion); - } - } - - m_datamodelHolder = DatamodelDefinitionHolder(std::move(datamodelDefs), std::move(edmVersions)); - } + openMetaChain(filenames, m_fileVersion, m_datamodelHolder); // Do some work up front for setting up categories and setup all the chains // and record the available categories. The rest of the setup follows on // demand when the category is first read - m_availCategories = ::podio::getAvailableCategories(m_metaChain.get()); for (const auto& cat : m_availCategories) { const auto [it, _] = m_categories.try_emplace(cat, std::make_unique(cat.c_str())); for (const auto& fn : filenames) { @@ -357,104 +162,6 @@ std::vector ROOTReader::getAvailableCategories() const { return cats; } -std::tuple, std::vector> -createCollectionBranchesIndexBased(TChain* chain, const podio::CollectionIDTable& idTable, - const std::vector& collInfo) { - - size_t collectionIndex{0}; - std::vector collBranches; - collBranches.reserve(collInfo.size() + 1); - std::vector storedClasses; - storedClasses.reserve(collInfo.size()); - - for (const auto& [collID, collType, isSubsetColl, collSchemaVersion, _, __] : collInfo) { - // We only write collections that are in the collectionIDTable, so no need - // to check here - const auto name = idTable.name(collID).value(); - - const auto collectionClass = TClass::GetClass(collType.c_str()); - // Need the collection here to setup all the branches. 
Have to manage the - // temporary collection ourselves - const auto collection = - std::unique_ptr(static_cast(collectionClass->New())); - root_utils::CollectionBranches branches{}; - if (isSubsetColl) { - // Only one branch will exist and we can trivially get its name - const auto brName = root_utils::refBranch(name, 0); - branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.refNames.emplace_back(std::move(brName)); - } else { - // This branch is guaranteed to exist since only collections that are - // also written to file are in the info metadata that we work with here - branches.data = root_utils::getBranch(chain, name.c_str()); - - const auto buffers = collection->getBuffers(); - for (size_t i = 0; i < buffers.references->size(); ++i) { - const auto brName = root_utils::refBranch(name, i); - branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.refNames.emplace_back(std::move(brName)); - } - - for (size_t i = 0; i < buffers.vectorMembers->size(); ++i) { - const auto brName = root_utils::vecBranch(name, i); - branches.vecs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.vecNames.emplace_back(std::move(brName)); - } - } - - storedClasses.emplace_back(name, std::make_tuple(collType, isSubsetColl, collSchemaVersion, collectionIndex++)); - collBranches.emplace_back(std::move(branches)); - } - - return {std::move(collBranches), storedClasses}; -} - -std::tuple, std::vector> -createCollectionBranches(TChain* chain, const podio::CollectionIDTable& idTable, - const std::vector& collInfo) { - - size_t collectionIndex{0}; - std::vector collBranches; - collBranches.reserve(collInfo.size() + 1); - std::vector storedClasses; - storedClasses.reserve(collInfo.size()); - - for (const auto& [collID, collType, isSubsetColl, collSchemaVersion, _, __] : collInfo) { - // We only write collections that are in the collectionIDTable, so no need - // to check here - const auto name = 
idTable.name(collID).value(); - - root_utils::CollectionBranches branches{}; - if (isSubsetColl) { - // Only one branch will exist and we can trivially get its name - const auto brName = root_utils::subsetBranch(name); - branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.refNames.emplace_back(std::move(brName)); - } else { - // This branch is guaranteed to exist since only collections that are - // also written to file are in the info metadata that we work with here - branches.data = root_utils::getBranch(chain, name.c_str()); - - const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType); - for (const auto& relName : relVecNames.relations) { - const auto brName = root_utils::refBranch(name, relName); - branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.refNames.emplace_back(std::move(brName)); - } - for (const auto& vecName : relVecNames.vectorMembers) { - const auto brName = root_utils::refBranch(name, vecName); - branches.vecs.push_back(root_utils::getBranch(chain, brName.c_str())); - branches.vecNames.emplace_back(std::move(brName)); - } - } - - storedClasses.emplace_back(name, std::make_tuple(collType, isSubsetColl, collSchemaVersion, collectionIndex++)); - collBranches.emplace_back(std::move(branches)); - } - - return {std::move(collBranches), storedClasses}; -} - std::optional> ROOTReader::getSizeStats(std::string_view category) { std::map stats; getCategoryInfo(category); // Ensure category is initialized diff --git a/src/Reader.cc b/src/Reader.cc index 922d62eea..e8bae0487 100644 --- a/src/Reader.cc +++ b/src/Reader.cc @@ -1,5 +1,7 @@ #include "podio/Reader.h" +#include "podio/RNTupleLazyReader.h" +#include "podio/ROOTLazyReader.h" #include "podio/ROOTReader.h" #if PODIO_ENABLE_RNTUPLE #include "podio/RNTupleReader.h" @@ -21,11 +23,11 @@ template Reader::Reader(std::unique_ptr reader) : m_self(std::make_unique>(std::move(reader))) { } -Reader makeReader(const 
std::string& filename) { - return makeReader(utils::expand_glob(filename)); +Reader makeReader(const std::string& filename, bool lazy) { + return makeReader(utils::expand_glob(filename), lazy); } -Reader makeReader(const std::vector& filenames) { +Reader makeReader(const std::vector& filenames, bool lazy) { if (filenames.empty()) { throw std::runtime_error("No files given to create a Podio Reader"); @@ -59,18 +61,30 @@ Reader makeReader(const std::vector& filenames) { } if (hasRNTuple) { #if PODIO_ENABLE_RNTUPLE - auto actualReader = std::make_unique(); - actualReader->openFiles(filenames); - Reader reader{std::move(actualReader)}; - return reader; + if (lazy) { + auto actualReader = std::make_unique(); + actualReader->openFiles(filenames); + return actualReader; + } else { + auto actualReader = std::make_unique(); + actualReader->openFiles(filenames); + Reader reader{std::move(actualReader)}; + return reader; + } #else throw std::runtime_error("ROOT RNTuple reader not available. Please recompile with ROOT RNTuple support."); #endif } else { - auto actualReader = std::make_unique(); - actualReader->openFiles(filenames); - Reader reader{std::move(actualReader)}; - return reader; + if (lazy) { + auto actualReader = std::make_unique(); + actualReader->openFiles(filenames); + return actualReader; + } else { + auto actualReader = std::make_unique(); + actualReader->openFiles(filenames); + Reader reader{std::move(actualReader)}; + return reader; + } } } else if (suffix == "sio") { #if PODIO_ENABLE_SIO diff --git a/src/RootHelpers.cc b/src/RootHelpers.cc index a1a33484c..56ec59386 100644 --- a/src/RootHelpers.cc +++ b/src/RootHelpers.cc @@ -1,4 +1,12 @@ #include "podio/utilities/RootHelpers.h" +#include "podio/CollectionIDTable.h" +#include "podio/utilities/DatamodelRegistryIOHelpers.h" + +#include "rootUtils.h" + +#include "TChain.h" + +#include namespace podio::root_utils { GenericParameters @@ -14,4 +22,77 @@ loadParamsFrom(ROOT::VecOps::RVec intKeys, 
ROOT::VecOps::RVec& filenames, podio::version::Version& fileVersion, + podio::DatamodelDefinitionHolder& datamodelHolder) { + m_metaChain = std::make_unique(root_utils::metaTreeName); + // NOTE: We simply assume that the meta data doesn't change throughout the + // chain! This essentially boils down to the assumption that all files that + // are read this way were written with the same settings. + // Reading all files is done to check that all file exists + for (const auto& filename : filenames) { + if (!m_metaChain->Add(filename.c_str(), -1)) { + throw std::runtime_error("File " + filename + " couldn't be found or the \"" + + std::string(root_utils::metaTreeName) + "\" tree couldn't be read."); + } + } + + auto* versionPtr = &fileVersion; + if (auto* versionBranch = root_utils::getBranch(m_metaChain.get(), root_utils::versionBranchName)) { + versionBranch->SetAddress(&versionPtr); + versionBranch->GetEntry(0); + } + + if (auto* edmDefBranch = root_utils::getBranch(m_metaChain.get(), root_utils::edmDefBranchName)) { + auto datamodelDefs = DatamodelDefinitionHolder::MapType{}; + auto* datamodelDefsPtr = &datamodelDefs; + edmDefBranch->SetAddress(&datamodelDefsPtr); + edmDefBranch->GetEntry(0); + + DatamodelDefinitionHolder::VersionList edmVersions{}; + for (const auto& [name, _] : datamodelDefs) { + if (auto* edmVersionBranch = root_utils::getBranch(m_metaChain.get(), root_utils::edmVersionBranchName(name))) { + const auto edmVersion = podio::version::Version{}; + auto* tmpPtr = &edmVersion; + edmVersionBranch->SetAddress(&tmpPtr); + edmVersionBranch->GetEntry(0); + edmVersions.emplace_back(name, edmVersion); + } + } + + datamodelHolder = DatamodelDefinitionHolder(std::move(datamodelDefs), std::move(edmVersions)); + } + + m_availCategories = root_utils::getAvailableCategories(m_metaChain.get()); +} + +podio::GenericParameters +TTreeReaderCommon::readEntryParameters(std::vector& paramBranches, TChain* chain, + const podio::version::Version& fileVersion, bool 
reloadBranches, + unsigned int localEntry) { + GenericParameters params; + + if (fileVersion < podio::version::Version{0, 99, 99}) { + // Parameter branch is always the last one + auto& paramBranch = paramBranches[0]; + + // Make sure to have a valid branch pointer after switching trees in the chain + // as well as on the first event + if (reloadBranches) { + paramBranch.data = root_utils::getBranch(chain, root_utils::paramBranchName); + } + auto* branch = paramBranch.data; + + auto* emd = ¶ms; + branch->SetAddress(&emd); + branch->GetEntry(localEntry); + } else { + root_utils::readParams(paramBranches, chain, params, reloadBranches, localEntry); + root_utils::readParams(paramBranches, chain, params, reloadBranches, localEntry); + root_utils::readParams(paramBranches, chain, params, reloadBranches, localEntry); + root_utils::readParams(paramBranches, chain, params, reloadBranches, localEntry); + } + + return params; +} + } // namespace podio::root_utils diff --git a/src/rntuple_utils.h b/src/rntuple_utils.h new file mode 100644 index 000000000..d0ae98f3c --- /dev/null +++ b/src/rntuple_utils.h @@ -0,0 +1,76 @@ +#ifndef PODIO_RNTUPLE_UTILS_H // NOLINT(llvm-header-guard): internal headers confuse clang-tidy +#define PODIO_RNTUPLE_UTILS_H // NOLINT(llvm-header-guard): internal headers confuse clang-tidy + +#include "podio/CollectionBuffers.h" +#include "podio/DatamodelRegistry.h" +#include "podio/GenericParameters.h" +#include "podio/utilities/RNTupleHelpers.h" +#include "rootUtils.h" + +#include +#include + +#include +#include + +// Adjust for the move of this out of ROOT v7 in +// https://github.com/root-project/root/pull/17281 +#if ROOT_VERSION_CODE >= ROOT_VERSION(6, 35, 0) +using ROOT::RException; +#else +using ROOT::Experimental::RException; +#endif + +namespace podio::rntuple_utils { + +/// Read the generic parameters of one type from a metadata reader entry and +/// store them in params. 
+template +void readParams(root_compat::RNTupleReader* reader, const unsigned localEntry, GenericParameters& params) { + auto keyView = reader->GetView>(root_utils::getGPKeyName()); + auto valueView = reader->GetView>>(root_utils::getGPValueName()); + params.loadFrom(keyView(localEntry), valueView(localEntry)); +} + +/// Bind all fields of the given collection to dentry and populate collBuffers. +/// Returns false if a ROOT field is missing (RException was thrown), in which +/// case collBuffers.deleteBuffers has been cleared to prevent a double-free on +/// partial binding. +template +bool bindCollectionToEntry(EntryT* dentry, podio::CollectionReadBuffers& collBuffers, + const podio::root_utils::CollectionWriteInfo& coll) { + const auto& collType = coll.dataType; + try { + if (coll.isSubset) { + const auto brName = root_utils::subsetBranch(coll.name); + const auto vec = new std::vector; + dentry->BindRawPtr(brName, vec); + collBuffers.references->at(0) = std::unique_ptr>(vec); + } else { + dentry->BindRawPtr(coll.name, collBuffers.data); + + const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType); + for (size_t j = 0; j < relVecNames.relations.size(); ++j) { + const auto relName = relVecNames.relations[j]; + const auto vec = new std::vector; + const auto brName = root_utils::refBranch(coll.name, relName); + dentry->BindRawPtr(brName, vec); + collBuffers.references->at(j) = std::unique_ptr>(vec); + } + + for (size_t j = 0; j < relVecNames.vectorMembers.size(); ++j) { + const auto vecName = relVecNames.vectorMembers[j]; + const auto brName = root_utils::vecBranch(coll.name, vecName); + dentry->BindRawPtr(brName, collBuffers.vectorMembers->at(j).second); + } + } + } catch (const RException&) { + collBuffers.deleteBuffers = {}; + return false; + } + return true; +} + +} // namespace podio::rntuple_utils + +#endif // PODIO_RNTUPLE_UTILS_H diff --git a/src/rootUtils.h b/src/rootUtils.h index 12e31d929..107d8fd72 100644 --- 
a/src/rootUtils.h +++ b/src/rootUtils.h @@ -4,10 +4,12 @@ #include "podio/CollectionBase.h" #include "podio/CollectionBuffers.h" #include "podio/CollectionIDTable.h" +#include "podio/ROOTReader.h" // For NamedCollInfo #include "podio/utilities/RootHelpers.h" #include "podio/utilities/TypeHelpers.h" #include "TBranch.h" +#include "TChain.h" #include "TTree.h" #include @@ -407,6 +409,222 @@ inline std::shared_ptr makeCollIdTable(const std::vect return std::make_shared(std::move(ids), std::move(names)); } +inline std::vector getAvailableCategories(TChain* metaChain) { + const auto* branches = metaChain->GetListOfBranches(); + std::vector brNames; + brNames.reserve(branches->GetEntries()); + + for (const auto branch : *branches) { + const std::string name = branch->GetName(); + const auto fUnder = name.find(root_utils::collInfoName("")); + if (fUnder != std::string::npos) { + brNames.emplace_back(name.substr(0, fUnder)); + } + } + + std::ranges::sort(brNames); + brNames.erase(std::unique(brNames.begin(), brNames.end()), brNames.end()); + return brNames; +} + +template +void readParams(std::vector& branches, TChain* chain, podio::GenericParameters& params, + bool reloadBranches, unsigned int localEntry) { + constexpr auto brOffset = root_utils::getGPBranchOffsets(); + const auto keyIdx = brOffset.keys - 1; + const auto valIdx = brOffset.values - 1; + + if (reloadBranches) { + branches[keyIdx].data = getBranch(chain, getGPKeyName()); + branches[valIdx].data = getBranch(chain, getGPValueName()); + } + + auto keyBranch = branches[keyIdx].data; + auto valueBranch = branches[valIdx].data; + + root_utils::ParamStorage storage; + keyBranch->SetAddress(storage.keysPtr()); + keyBranch->GetEntry(localEntry); + valueBranch->SetAddress(storage.valuesPtr()); + valueBranch->GetEntry(localEntry); + + params.loadFrom(std::move(storage.keys), std::move(storage.values)); +} + +} // namespace podio::root_utils + +// Free function template for initializing a category container from the 
metadata chain. +// Lives here because it requires the helper functions defined above. +namespace podio::root_utils { + +// Forward declarations for helpers defined in RootHelpers.cc +std::tuple< + std::vector, + std::vector> inline createCollectionBranches(TChain* chain, + const podio::CollectionIDTable& + idTable, + const std::vector< + CollectionWriteInfo>& collInfo) { + size_t collectionIndex{0}; + std::vector collBranches; + collBranches.reserve(collInfo.size() + 1); + std::vector storedClasses; + storedClasses.reserve(collInfo.size()); + + for (const auto& [collID, collType, isSubsetColl, collSchemaVersion, _, __] : collInfo) { + // We only write collections that are in the collectionIDTable, so no need + // to check here + const auto name = idTable.name(collID).value(); + + root_utils::CollectionBranches branches{}; + if (isSubsetColl) { + // Only one branch will exist and we can trivially get its name + const auto brName = root_utils::subsetBranch(name); + branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.refNames.emplace_back(std::move(brName)); + } else { + // This branch is guaranteed to exist since only collections that are + // also written to file are in the info metadata that we work with here + branches.data = root_utils::getBranch(chain, name.c_str()); + + const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType); + for (const auto& relName : relVecNames.relations) { + const auto brName = root_utils::refBranch(name, relName); + branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.refNames.emplace_back(std::move(brName)); + } + for (const auto& vecName : relVecNames.vectorMembers) { + const auto brName = root_utils::refBranch(name, vecName); + branches.vecs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.vecNames.emplace_back(std::move(brName)); + } + } + + storedClasses.emplace_back(name, std::make_tuple(collType, isSubsetColl, 
collSchemaVersion, collectionIndex++)); + collBranches.emplace_back(std::move(branches)); + } + + return {std::move(collBranches), storedClasses}; +} + +std::tuple< + std::vector, + std::vector> inline createCollectionBranchesIndexBased(TChain* chain, + const podio::CollectionIDTable& idTable, + const std::vector& + collInfo) { + size_t collectionIndex{0}; + std::vector collBranches; + collBranches.reserve(collInfo.size() + 1); + std::vector storedClasses; + storedClasses.reserve(collInfo.size()); + + for (const auto& [collID, collType, isSubsetColl, collSchemaVersion, _, __] : collInfo) { + // We only write collections that are in the collectionIDTable, so no need + // to check here + const auto name = idTable.name(collID).value(); + + const auto collectionClass = TClass::GetClass(collType.c_str()); + // Need the collection here to setup all the branches. Have to manage the + // temporary collection ourselves + const auto collection = + std::unique_ptr(static_cast(collectionClass->New())); + root_utils::CollectionBranches branches{}; + if (isSubsetColl) { + // Only one branch will exist and we can trivially get its name + const auto brName = root_utils::refBranch(name, 0); + branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.refNames.emplace_back(std::move(brName)); + } else { + // This branch is guaranteed to exist since only collections that are + // also written to file are in the info metadata that we work with here + branches.data = root_utils::getBranch(chain, name.c_str()); + + const auto buffers = collection->getBuffers(); + for (size_t i = 0; i < buffers.references->size(); ++i) { + const auto brName = root_utils::refBranch(name, i); + branches.refs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.refNames.emplace_back(std::move(brName)); + } + + for (size_t i = 0; i < buffers.vectorMembers->size(); ++i) { + const auto brName = root_utils::vecBranch(name, i); + 
branches.vecs.push_back(root_utils::getBranch(chain, brName.c_str())); + branches.vecNames.emplace_back(std::move(brName)); + } + } + + storedClasses.emplace_back(name, std::make_tuple(collType, isSubsetColl, collSchemaVersion, collectionIndex++)); + collBranches.emplace_back(std::move(branches)); + } + + return {std::move(collBranches), storedClasses}; +} + +template +void initCategory(CategoryContainerT& container, TChain* metaChain, std::string_view category, + const podio::version::Version& fileVersion) { + auto* collInfoBranch = getBranch(metaChain, collInfoName(category)); + + auto collInfo = std::vector(); + auto* collInfoPtr = &collInfo; + if (fileVersion >= podio::version::Version{1, 2, 999}) { + collInfoBranch->SetAddress(&collInfoPtr); + collInfoBranch->GetEntry(0); + } else { + auto collInfoOld = std::vector(); + if (fileVersion < podio::version::Version{0, 16, 4}) { + auto collInfoReallyOld = std::vector(); + auto* tmpPtr = &collInfoReallyOld; + collInfoBranch->SetAddress(&tmpPtr); + collInfoBranch->GetEntry(0); + for (const auto& [collID, collType, isSubsetColl] : collInfoReallyOld) { + collInfo.emplace_back(collID, std::move(collType), isSubsetColl, 1u); + } + } else { + auto* tmpPtr = &collInfoOld; + collInfoBranch->SetAddress(&tmpPtr); + collInfoBranch->GetEntry(0); + } + collInfo.reserve(collInfoOld.size()); + for (const auto& [id, typeName, isSubsetColl, schemaVersion] : collInfoOld) { + collInfo.emplace_back(id, std::move(typeName), isSubsetColl, schemaVersion); + } + } + + if (fileVersion >= podio::version::Version{1, 2, 999}) { + container.table = makeCollIdTable(collInfo); + } else { + container.table = std::make_shared(); + const auto* table = container.table.get(); + auto* tableBranch = getBranch(metaChain, idTableName(category)); + tableBranch->SetAddress(&table); + tableBranch->GetEntry(0); + } + + if (fileVersion < podio::version::Version{0, 16, 99}) { + std::tie(container.branches, container.storedClasses) = + 
createCollectionBranchesIndexBased(container.chain.get(), *container.table, collInfo); + } else { + std::tie(container.branches, container.storedClasses) = + createCollectionBranches(container.chain.get(), *container.table, collInfo); + } + + if (fileVersion < podio::version::Version{0, 99, 99}) { + container.paramBranches.emplace_back(getBranch(container.chain.get(), paramBranchName)); + } else { + container.paramBranches.emplace_back(getBranch(container.chain.get(), intKeyName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), intValueName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), floatKeyName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), floatValueName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), doubleKeyName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), doubleValueName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), stringKeyName)); + container.paramBranches.emplace_back(getBranch(container.chain.get(), stringValueName)); + } +} + } // namespace podio::root_utils #endif diff --git a/tests/root_io/CMakeLists.txt b/tests/root_io/CMakeLists.txt index ebbcee703..4d3c56ca6 100644 --- a/tests/root_io/CMakeLists.txt +++ b/tests/root_io/CMakeLists.txt @@ -9,6 +9,7 @@ set(root_dependent_tests write_interface_root.cpp read_interface_root.cpp read_glob.cpp + read_frame_root_lazy.cpp selected_colls_roundtrip_root.cpp write_empty_collections_root.cpp ) @@ -17,6 +18,7 @@ if(ENABLE_RNTUPLE) ${root_dependent_tests} write_rntuple.cpp read_rntuple.cpp + read_rntuple_lazy.cpp read_python_frame_rntuple.cpp write_interface_rntuple.cpp read_interface_rntuple.cpp @@ -52,6 +54,7 @@ set_tests_properties( read_frame_root_multiple read_and_write_frame_root read_glob + read_frame_root_lazy selected_colls_roundtrip_root PROPERTIES @@ -69,6 +72,7 @@ set_tests_properties(write_python_empty_colls_root PROPERTIES 
FIXTURES_SETUP pod if(ENABLE_RNTUPLE) set_tests_properties( read_rntuple + read_rntuple_lazy selected_colls_roundtrip_rntuple PROPERTIES @@ -90,6 +94,42 @@ if(ENABLE_DATASOURCE) set_tests_properties(read_with_rdatasource_root PROPERTIES FIXTURES_REQUIRED podio_write_root_fixture) endif() +if(ENABLE_DATASOURCE) + add_test(NAME use_python_with_rdatasource_root + COMMAND python3 ${PROJECT_SOURCE_DIR}/tests/root_io/use_datasource.py) + PODIO_SET_TEST_ENV(use_python_with_rdatasource_root PYTHON) + set_tests_properties(use_python_with_rdatasource_root PROPERTIES + FIXTURES_REQUIRED podio_write_root_fixture + FIXTURES_SETUP podio_datasource_snapshot_fixture + ) + + add_executable(check_datasource_output check_datasource_output.cpp) + target_link_libraries(check_datasource_output PRIVATE ROOT::Core ROOT::RIO ROOT::Tree) + add_test(NAME check_datasource_output COMMAND check_datasource_output) + PODIO_SET_TEST_ENV(check_datasource_output) + set_tests_properties(check_datasource_output PROPERTIES + FIXTURES_REQUIRED podio_datasource_snapshot_fixture + ) +endif() + +if(ENABLE_DATASOURCE AND ENABLE_RNTUPLE) + add_test(NAME use_python_with_rdatasource_rntuple + COMMAND python3 ${PROJECT_SOURCE_DIR}/tests/root_io/use_datasource.py + example_rntuple.root datasource_snapshot_rntuple.root) + PODIO_SET_TEST_ENV(use_python_with_rdatasource_rntuple PYTHON) + set_tests_properties(use_python_with_rdatasource_rntuple PROPERTIES + FIXTURES_REQUIRED podio_write_rntuple_fixture + FIXTURES_SETUP podio_datasource_rntuple_snapshot_fixture + ) + + add_test(NAME check_datasource_output_rntuple + COMMAND check_datasource_output datasource_snapshot_rntuple.root) + PODIO_SET_TEST_ENV(check_datasource_output_rntuple) + set_tests_properties(check_datasource_output_rntuple PROPERTIES + FIXTURES_REQUIRED podio_datasource_rntuple_snapshot_fixture + ) +endif() + add_executable(read_frame_legacy_root read_frame_legacy_root.cpp) target_link_libraries(read_frame_legacy_root PRIVATE "${root_libs}") @@ -119,8 
+159,4 @@ add_test(NAME param_reading_rdataframe COMMAND python3 ${CMAKE_CURRENT_SOURCE_DI PODIO_SET_TEST_ENV(param_reading_rdataframe PYTHON) set_tests_properties(param_reading_rdataframe PROPERTIES FIXTURES_REQUIRED podio_write_root_fixture) -if(ENABLE_DATASOURCE) - add_test(NAME read_python_with_rdatasource_root COMMAND python3 ${PROJECT_SOURCE_DIR}/tests/root_io/read_datasource.py) - PODIO_SET_TEST_ENV(read_python_with_rdatasource_root PYTHON) - set_tests_properties(read_python_with_rdatasource_root PROPERTIES FIXTURES_REQUIRED podio_write_root_fixture) -endif() + diff --git a/tests/root_io/check_datasource_output.cpp b/tests/root_io/check_datasource_output.cpp new file mode 100644 index 000000000..3b3307fb0 --- /dev/null +++ b/tests/root_io/check_datasource_output.cpp @@ -0,0 +1,119 @@ +/** + * Checker for the datasource snapshot output produced by use_datasource.py. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#define CHECK(condition, msg) \ + if (!(condition)) { \ + throw std::runtime_error(std::string("check_datasource_output: ") + (msg)); \ + } + +#define CHECK_EQUAL(actual, expected, label) \ + if (std::abs((actual) - (expected)) >= std::numeric_limits::min()) { \ + throw std::runtime_error(std::string("check_datasource_output: ") + (label) + " value mismatch: got " + \ + std::to_string(actual) + ", expected " + std::to_string(expected)); \ + } + +int main(int argc, const char* argv[]) { + std::string inputFile = "datasource_snapshot.root"; + if (argc == 2) { + inputFile = argv[1]; + } else if (argc > 2) { + std::cout << "Usage: " << argv[0] << " [FILE]" << std::endl; + return 1; + } + + TFile f(inputFile.c_str(), "READ"); + if (f.IsZombie()) { + std::cerr << "Could not open " << inputFile << std::endl; + return EXIT_FAILURE; + } + + auto* tree = f.Get("events"); + CHECK(tree != nullptr, "TTree 'events' not found in " + inputFile) + CHECK(tree->GetEntries() == 10, "Expected 10 entries in snapshot, got " + 
std::to_string(tree->GetEntries())) + + int event_number = 0; + double hit_energy_0 = 0., hit_energy_1 = 0., hit_energy_sum = 0.; + double cluster_energy_0 = 0., cluster_energy_1 = 0., cluster_energy_2 = 0.; + + double composite_hit_energy_0 = 0., composite_hit_energy_1 = 0.; + + double sub_cluster_energy_0 = 0., sub_cluster_energy_1 = 0.; + + double link0_weight = 0., link0_from_energy = 0., link0_to_energy = 0.; + double link1_weight = 0., link1_from_energy = 0., link1_to_energy = 0.; + + tree->SetBranchAddress("event_number", &event_number); + tree->SetBranchAddress("hit_energy_0", &hit_energy_0); + tree->SetBranchAddress("hit_energy_1", &hit_energy_1); + tree->SetBranchAddress("hit_energy_sum", &hit_energy_sum); + tree->SetBranchAddress("cluster_energy_0", &cluster_energy_0); + tree->SetBranchAddress("cluster_energy_1", &cluster_energy_1); + tree->SetBranchAddress("cluster_energy_2", &cluster_energy_2); + tree->SetBranchAddress("composite_hit_energy_0", &composite_hit_energy_0); + tree->SetBranchAddress("composite_hit_energy_1", &composite_hit_energy_1); + tree->SetBranchAddress("sub_cluster_energy_0", &sub_cluster_energy_0); + tree->SetBranchAddress("sub_cluster_energy_1", &sub_cluster_energy_1); + tree->SetBranchAddress("link0_weight", &link0_weight); + tree->SetBranchAddress("link0_from_energy", &link0_from_energy); + tree->SetBranchAddress("link0_to_energy", &link0_to_energy); + tree->SetBranchAddress("link1_weight", &link1_weight); + tree->SetBranchAddress("link1_from_energy", &link1_from_energy); + tree->SetBranchAddress("link1_to_energy", &link1_to_energy); + + for (Long64_t entry = 0; entry < tree->GetEntries(); ++entry) { + tree->GetEntry(entry); + + const int i = event_number; // event_number == i by construction + CHECK(i >= 0 && i < 10, "event_number out of expected range [0,9]: " + std::to_string(i)) + const std::string ev = " (event " + std::to_string(i) + ")"; + + CHECK_EQUAL(hit_energy_0, 23. 
+ i, "hit_energy_0" + ev) + CHECK_EQUAL(hit_energy_1, 12. + i, "hit_energy_1" + ev) + CHECK_EQUAL(hit_energy_sum, 35. + 2. * i, "hit_energy_sum" + ev) + + CHECK_EQUAL(cluster_energy_0, 23. + i, "cluster_energy_0" + ev) + CHECK_EQUAL(cluster_energy_1, 12. + i, "cluster_energy_1" + ev) + CHECK_EQUAL(cluster_energy_2, 35. + 2. * i, "cluster_energy_2" + ev) + + CHECK_EQUAL(hit_energy_sum, cluster_energy_2, "hit_energy_sum vs cluster_energy_2" + ev) + CHECK_EQUAL(hit_energy_0 + hit_energy_1, cluster_energy_2, "hit0+hit1 vs cluster_energy_2" + ev) + CHECK_EQUAL(cluster_energy_0 + cluster_energy_1, cluster_energy_2, "sub-cluster sum vs cluster_energy_2" + ev) + + CHECK_EQUAL(composite_hit_energy_0, 23. + i, "composite_hit_energy_0 (cluster->hit relation)" + ev) + CHECK_EQUAL(composite_hit_energy_1, 12. + i, "composite_hit_energy_1 (cluster->hit relation)" + ev) + + CHECK_EQUAL(composite_hit_energy_0, hit_energy_0, "composite_hit_energy_0 vs hit_energy_0" + ev) + CHECK_EQUAL(composite_hit_energy_1, hit_energy_1, "composite_hit_energy_1 vs hit_energy_1" + ev) + + CHECK_EQUAL(sub_cluster_energy_0, 23. + i, "sub_cluster_energy_0 (cluster->cluster relation)" + ev) + CHECK_EQUAL(sub_cluster_energy_1, 12. + i, "sub_cluster_energy_1 (cluster->cluster relation)" + ev) + + CHECK_EQUAL(sub_cluster_energy_0, cluster_energy_0, "sub_cluster_energy_0 vs cluster_energy_0" + ev) + CHECK_EQUAL(sub_cluster_energy_1, cluster_energy_1, "sub_cluster_energy_1 vs cluster_energy_1" + ev) + + CHECK_EQUAL(link0_weight, 0.0, "link0_weight" + ev) + CHECK_EQUAL(link0_from_energy, 23. + i, "link0_from_energy (link->hit relation)" + ev) + CHECK_EQUAL(link0_to_energy, 12. + i, "link0_to_energy (link->cluster relation)" + ev) + CHECK_EQUAL(link0_from_energy, hit_energy_0, "link0_from_energy vs hit_energy_0" + ev) + CHECK_EQUAL(link0_to_energy, cluster_energy_1, "link0_to_energy vs cluster_energy_1" + ev) + + CHECK_EQUAL(link1_weight, 0.5, "link1_weight" + ev) + CHECK_EQUAL(link1_from_energy, 12. 
+ i, "link1_from_energy (link->hit relation)" + ev) + CHECK_EQUAL(link1_to_energy, 23. + i, "link1_to_energy (link->cluster relation)" + ev) + CHECK_EQUAL(link1_from_energy, hit_energy_1, "link1_from_energy vs hit_energy_1" + ev) + CHECK_EQUAL(link1_to_energy, cluster_energy_0, "link1_to_energy vs cluster_energy_0" + ev) + } + + std::cout << "check_datasource_output: all checks passed" << std::endl; + return EXIT_SUCCESS; +} diff --git a/tests/root_io/read_datasource.py b/tests/root_io/read_datasource.py deleted file mode 100644 index b8528568d..000000000 --- a/tests/root_io/read_datasource.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -"""Small test case for checking DataSource based creating RDataFrames is accessible from python""" - -import ROOT -from podio.data_source import CreateDataFrame # pylint: disable=import-error, no-name-in-module - -if ROOT.gSystem.Load("libTestDataModelDict") < 0: - raise RuntimeError("Could not load TestDataModel dictionary") - -rdf = CreateDataFrame("example_frame.root") - -assert rdf.Count().GetValue() == 10 - -rdf = CreateDataFrame("example_frame_?.root") - -assert rdf.Count().GetValue() == 20 diff --git a/tests/root_io/read_frame_root_lazy.cpp b/tests/root_io/read_frame_root_lazy.cpp new file mode 100644 index 000000000..ed637299e --- /dev/null +++ b/tests/root_io/read_frame_root_lazy.cpp @@ -0,0 +1,24 @@ +#include "read_frame.h" +#include "read_frame_auxiliary.h" + +#include "podio/ROOTLazyReader.h" + +#include +#include + +int main(int argc, char* argv[]) { + std::string inputFile = "example_frame.root"; + bool assertBuildVersion = true; + if (argc == 2) { + inputFile = argv[1]; + assertBuildVersion = false; + } else if (argc > 2) { + std::cout << "Wrong number of arguments" << std::endl; + std::cout << "Usage: " << argv[0] << " FILE" << std::endl; + return 1; + } + + return read_frames(inputFile, assertBuildVersion) + + test_frame_aux_info(inputFile) + + test_read_frame_limited(inputFile); +} diff --git 
a/tests/root_io/read_rntuple_lazy.cpp b/tests/root_io/read_rntuple_lazy.cpp new file mode 100644 index 000000000..8e78fe355 --- /dev/null +++ b/tests/root_io/read_rntuple_lazy.cpp @@ -0,0 +1,22 @@ +#include "read_frame.h" +#include "read_frame_auxiliary.h" + +#include "podio/RNTupleLazyReader.h" + +#include +#include + +int main(int argc, char* argv[]) { + std::string inputFile = "example_rntuple.root"; + + if (argc == 2) { + inputFile = argv[1]; + } else if (argc > 2) { + std::cerr << "Usage: " << argv[0] << " [input_file]" << std::endl; + return 1; + } + + return read_frames(inputFile) + + test_frame_aux_info(inputFile) + + test_read_frame_limited(inputFile); +} diff --git a/tests/root_io/use_datasource.py b/tests/root_io/use_datasource.py new file mode 100644 index 000000000..698834196 --- /dev/null +++ b/tests/root_io/use_datasource.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +"""Test to exercise DataSource thoroughly""" + +import sys +import ROOT +from podio.data_source import CreateDataFrame # pylint: disable=import-error, no-name-in-module + +input_file = sys.argv[1] if len(sys.argv) > 1 else "example_frame.root" +snapshot_file = sys.argv[2] if len(sys.argv) > 2 else "datasource_snapshot.root" + +if ROOT.gSystem.Load("libTestDataModelDict") < 0: + raise RuntimeError("Could not load TestDataModel dictionary") + +ROOT.gInterpreter.ProcessLine("using namespace ROOT::VecOps;") + +ROOT.gInterpreter.Declare( + """ +#include +#include +#include +#include +""" +) + +# Declare helpers that extract quantities from the test collections +ROOT.gInterpreter.Declare( + """ +RVec getHitEnergies(const ExampleHitCollection& hits) { + RVec v; + v.reserve(hits.size()); + for (const auto& h : hits) { + v.push_back(h.energy()); + } + return v; +} + +RVec getClusterEnergies(const ExampleClusterCollection& clusters) { + RVec v; + v.reserve(clusters.size()); + for (const auto& c : clusters) { + v.push_back(c.energy()); + } + return v; +} + +int getEventNumber(const 
EventInfoCollection& info) { + return info[0].Number(); +} + +RVec getCompositeClusterHitEnergies(const ExampleClusterCollection& clusters) { + RVec v; + for (const auto& hit : clusters[2].Hits()) { + v.push_back(hit.energy()); + } + return v; +} + +RVec getSubClusterEnergies(const ExampleClusterCollection& clusters) { + RVec v; + for (const auto& sub : clusters[2].Clusters()) { + v.push_back(sub.energy()); + } + return v; +} + +// Follow each link From (hit) and To (cluster) relations and return: +// [weight0, hit_energy0, cluster_energy0, weight1, hit_energy1, cluster_energy1] +RVec getLinkInfo(const TestLinkCollection& links) { + RVec v; + for (const auto& link : links) { + v.push_back(link.getWeight()); + v.push_back(link.getFrom().energy()); + v.push_back(link.getTo().energy()); + } + return v; +} +""" +) + +rdf = CreateDataFrame(input_file) + +assert rdf.Count().GetValue() == 10, f"Expected 10 events in {input_file}" + +rdf = ( + rdf.Define("hit_energies", "getHitEnergies(hits)") + .Define("hit_energy_0", "hit_energies[0]") + .Define("hit_energy_1", "hit_energies[1]") + .Define("hit_energy_sum", "Sum(hit_energies)") + .Define("cluster_energies", "getClusterEnergies(clusters)") + .Define("cluster_energy_0", "cluster_energies[0]") + .Define("cluster_energy_1", "cluster_energies[1]") + .Define("cluster_energy_2", "cluster_energies[2]") + .Define("event_number", "getEventNumber(info)") + .Define("composite_hit_energies", "getCompositeClusterHitEnergies(clusters)") + .Define("composite_hit_energy_0", "composite_hit_energies[0]") + .Define("composite_hit_energy_1", "composite_hit_energies[1]") + .Define("sub_cluster_energies", "getSubClusterEnergies(clusters)") + .Define("sub_cluster_energy_0", "sub_cluster_energies[0]") + .Define("sub_cluster_energy_1", "sub_cluster_energies[1]") + .Define("link_info", "getLinkInfo(links)") + .Define("link0_weight", "link_info[0]") + .Define("link0_from_energy", "link_info[1]") + .Define("link0_to_energy", "link_info[2]") + 
.Define("link1_weight", "link_info[3]") + .Define("link1_from_energy", "link_info[4]") + .Define("link1_to_energy", "link_info[5]") +) + +# print(rdf.Describe()) +# print(rdf.Display(["event_number", "hit_energy_0", "cluster_energy_2", "composite_hit_energy_0", "sub_cluster_energy_0"]).AsString()) + +# 10 events, i = 0..9 +# hit_energy_0 = 23+i +mean_hit0 = rdf.Mean("hit_energy_0").GetValue() +assert abs(mean_hit0 - 27.5) < 1e-9, f"Mean of hit_energy_0 should be 27.5, got {mean_hit0}" + +# cluster_energy_2 = 35+2*i +mean_clu2 = rdf.Mean("cluster_energy_2").GetValue() +assert abs(mean_clu2 - 44.0) < 1e-9, f"Mean of cluster_energy_2 should be 44.0, got {mean_clu2}" + +# event_number = i +mean_evtnum = rdf.Mean("event_number").GetValue() +assert abs(mean_evtnum - 4.5) < 1e-9, f"Mean of event_number should be 4.5, got {mean_evtnum}" + +# composite_hit_energy_0 follows clusters[2].Hits()[0] == hits[0], energy = 23+i -> Mean = 27.5 +mean_chit0 = rdf.Mean("composite_hit_energy_0").GetValue() +assert ( + abs(mean_chit0 - 27.5) < 1e-9 +), f"Mean of composite_hit_energy_0 (cluster->hit relation) should be 27.5, got {mean_chit0}" + +# sub_cluster_energy_0 follows clusters[2].Clusters()[0] == clusters[0], energy = 23+i -> Mean = 27.5 +mean_sub0 = rdf.Mean("sub_cluster_energy_0").GetValue() +assert ( + abs(mean_sub0 - 27.5) < 1e-9 +), f"Mean of sub_cluster_energy_0 (cluster->cluster relation) should be 27.5, got {mean_sub0}" + +# link0_weight is always 0.0, link1_weight is always 0.5 +mean_w0 = rdf.Mean("link0_weight").GetValue() +assert abs(mean_w0 - 0.0) < 1e-9, f"Mean of link0_weight should be 0.0, got {mean_w0}" +mean_w1 = rdf.Mean("link1_weight").GetValue() +assert abs(mean_w1 - 0.5) < 1e-9, f"Mean of link1_weight should be 0.5, got {mean_w1}" + +# link0 goes hits[0]->clusters[1]: from-energy = 23+i -> Mean = 27.5, +# to-energy = 12+i -> Mean = 16.5 +mean_l0from = rdf.Mean("link0_from_energy").GetValue() +assert ( + abs(mean_l0from - 27.5) < 1e-9 +), f"Mean of 
link0_from_energy (link->hit relation) should be 27.5, got {mean_l0from}" +mean_l0to = rdf.Mean("link0_to_energy").GetValue() +assert ( + abs(mean_l0to - 16.5) < 1e-9 +), f"Mean of link0_to_energy (link->cluster relation) should be 16.5, got {mean_l0to}" + +rdf_even = rdf.Filter("event_number % 2 == 0") +assert rdf_even.Count().GetValue() == 5, "Expected 5 even-numbered events after filter" + +columns = [ + "event_number", + "hit_energy_0", + "hit_energy_1", + "hit_energy_sum", + "cluster_energy_0", + "cluster_energy_1", + "cluster_energy_2", + "composite_hit_energy_0", + "composite_hit_energy_1", + "sub_cluster_energy_0", + "sub_cluster_energy_1", + "link0_weight", + "link0_from_energy", + "link0_to_energy", + "link1_weight", + "link1_from_energy", + "link1_to_energy", +] + +rdf.Snapshot("events", snapshot_file, columns) + +print(f"All assertions passed, snapshot written to {snapshot_file}")