Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 26 additions & 12 deletions include/podio/DataSource.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
#include <podio/Reader.h>

// ROOT
#include <ROOT/RDF/RColumnReaderBase.hxx>
#include <ROOT/RDataFrame.hxx>
#include <ROOT/RDataSource.hxx>

// STL
#include <memory>
#include <string>
#include <typeinfo>
#include <unordered_map>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -100,47 +102,59 @@ class DataSource : public ROOT::RDF::RDataSource {

/// Returns the fixed label string "PODIO Datasource" for this data source.
// NOTE(review): the `};` and `}` lines below look like the removed and added
// versions of the same closing line rendered together by the diff view; only
// one of them is expected in the actual header — confirm against the file.
std::string GetLabel() override {
return "PODIO Datasource";
};
}

// Legacy API
/// Inline stub for the legacy column-reader hook: both arguments are ignored
/// and an empty vector is handed back.
std::vector<void*> GetColumnReadersImpl(std::string_view /*name*/, const std::type_info& /*typeInfo*/) override {
  return std::vector<void*>{};
}

/// Number of input files, i.e. the number of entries in m_filePathList.
std::size_t GetNFiles() const override {
  const std::size_t fileCount = m_filePathList.size();
  return fileCount;
}

protected:
///
/// @brief Type-erased vector of pointers to pointers to column
/// values --- one per slot.
/// @brief Returns a column reader for the given slot and column.
///
std::vector<void*> GetColumnReadersImpl(std::string_view name, const std::type_info& typeInfo) override;
/// @param slot Index of the processing slot the reader is requested for
///        (presumably below m_nSlots — confirm in the implementation).
/// @param name Name of the column to read.
/// @param tid  Type information supplied for the column; how it is used is
///        not visible in this header — see the implementation.
/// @returns An owning pointer to a ROOT::Detail::RDF::RColumnReaderBase.
std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase> GetColumnReaders(unsigned int slot, std::string_view name,
const std::type_info& tid) override;

protected:
/// Short textual description of this data source.
std::string AsString() override {
  return std::string("Podio data source");
}

private:
/// Number of slots/threads
unsigned int m_nSlots = 1;

/// Input filename
std::vector<std::string> m_filePathList = {};

/// Total number of events
ULong64_t m_nEvents = 0;

/// Ranges of events available to be processed
std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAvailable = {};

/// Ranges of events available ever created
/// All entry ranges, fixed after SetNSlots
std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAll = {};

/// Cursor into m_rangesAll for GetEntryRanges, reset each Initialize()
size_t m_rangesCursor = 0;

/// Column names
std::vector<std::string> m_columnNames{};

/// Column types
std::vector<std::string> m_columnTypes = {};

/// Fast column name -> index lookup
std::unordered_map<std::string, size_t> m_columnIndex{};

/// Collections, m_Collections[columnIndex][slotIndex]
std::vector<std::vector<const podio::CollectionBase*>> m_Collections = {};

/// Active collections
std::vector<unsigned int> m_activeCollections = {};

/// Names of active collections, kept in sync with m_activeCollections
std::vector<std::string> m_activeCollectionNames{};

/// Root podio readers
std::vector<std::unique_ptr<podio::Reader>> m_podioReaders = {};

Expand Down
63 changes: 63 additions & 0 deletions include/podio/RNTupleLazyFrameData.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef PODIO_RNTUPLELAZY_FRAMEDATA_H
#define PODIO_RNTUPLELAZY_FRAMEDATA_H

#include "podio/CollectionBuffers.h"
#include "podio/CollectionIDTable.h"
#include "podio/GenericParameters.h"

#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

namespace podio {

struct RNTupleCategoryState;

/// FrameData implementation for lazy RNTuple reading. Instead of holding
/// pre-populated buffers, it holds a reference to the shared RNTupleCategoryState
/// and reads individual collections from disk on demand when
/// getCollectionBuffers() is called. Each lazy read creates (or reuses a cached)
/// partial RNTuple reader with a minimal model containing only the needed fields.
class RNTupleLazyFrameData {
  using CollIDPtr = std::shared_ptr<const podio::CollectionIDTable>;

public:
  /// Build the frame data for a single entry.
  ///
  /// @param state                The category state shared with the reader
  /// @param entry                The entry number this frame data refers to
  /// @param availableCollections Map from collection name to its index in
  ///                             RNTupleCategoryState::collectionInfo
  /// @param idTable              The shared collection ID table
  /// @param params               The parameters, which have already been read
  ///                             eagerly by the caller
  RNTupleLazyFrameData(std::shared_ptr<RNTupleCategoryState> state, unsigned entry,
                       std::unordered_map<std::string, size_t>&& availableCollections, CollIDPtr idTable,
                       podio::GenericParameters&& params);

  // Move-only type: no default construction and no copies.
  RNTupleLazyFrameData() = delete;
  RNTupleLazyFrameData(const RNTupleLazyFrameData&) = delete;
  RNTupleLazyFrameData& operator=(const RNTupleLazyFrameData&) = delete;
  RNTupleLazyFrameData(RNTupleLazyFrameData&&) = default;
  RNTupleLazyFrameData& operator=(RNTupleLazyFrameData&&) = default;
  ~RNTupleLazyFrameData() = default;

  /// Read the buffers of one collection from the RNTuple on demand. A partial
  /// reader that only knows this collection's fields is created (or taken from
  /// a cache) and LoadEntry() is called on it.
  ///
  /// @param name The name of the collection to read
  ///
  /// @returns The read buffers; presumably an empty optional if the collection
  ///          is not (or no longer) available — confirm in the implementation
  std::optional<podio::CollectionReadBuffers> getCollectionBuffers(const std::string& name);

  /// Get the collection ID table, returned by value.
  podio::CollectionIDTable getIDTable() const;

  /// Get the parameters as a uniquely owned object.
  // NOTE(review): presumably this hands over the contents of m_parameters;
  // verify in the implementation whether repeated calls are meaningful.
  std::unique_ptr<podio::GenericParameters> getParameters();

  /// Get the names of the collections that are available from this frame data.
  std::vector<std::string> getAvailableCollections() const;

private:
  /// Category state shared with the reader that created this object.
  std::shared_ptr<RNTupleCategoryState> m_state;
  /// Entry number passed at construction.
  unsigned m_entry{0};
  /// Collection name -> index in RNTupleCategoryState::collectionInfo. An
  /// entry is dropped from this map as soon as its collection has been read.
  std::unordered_map<std::string, size_t> m_availableCollections{};
  /// Shared collection ID table.
  CollIDPtr m_idTable{nullptr};
  /// Parameters that were read eagerly and passed in at construction.
  podio::GenericParameters m_parameters{};
};

} // namespace podio

#endif // PODIO_RNTUPLELAZY_FRAMEDATA_H
92 changes: 92 additions & 0 deletions include/podio/RNTupleLazyReader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#ifndef PODIO_RNTUPLELAZY_READER_H
#define PODIO_RNTUPLELAZY_READER_H

#include "podio/RNTupleLazyFrameData.h"
#include "podio/utilities/ReaderCommon.h"
#include "podio/utilities/RNTupleHelpers.h"

#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

namespace podio {

struct RNTupleCategoryState;

/// The RNTupleLazyReader reads files written with the RNTuple backend lazily:
/// individual collections are only read from disk when they are first accessed
/// via Frame::get().
///
/// It provides data as RNTupleLazyFrameData from which a podio::Frame can be
/// constructed. Unlike RNTupleReader which reads all collections eagerly, this
/// reader defers the actual RNTuple I/O to the point of collection access,
/// using partial RNTuple readers with minimal field models so that LoadEntry()
/// only reads the requested collection's data.
class RNTupleLazyReader : public ReaderCommon, public RNTupleReaderCommon {

public:
  RNTupleLazyReader() = default;
  ~RNTupleLazyReader() = default;

  // Move-only type: copying is disabled.
  RNTupleLazyReader(RNTupleLazyReader&&) = default;
  RNTupleLazyReader& operator=(RNTupleLazyReader&&) = default;
  RNTupleLazyReader(const RNTupleLazyReader&) = delete;
  RNTupleLazyReader& operator=(const RNTupleLazyReader&) = delete;

  /// Open a single input file.
  ///
  /// @param filename The name of the input file
  void openFile(const std::string& filename);

  /// Open several input files and present them as if they were a single file.
  ///
  /// @param filenames The names of all input files that should be read
  void openFiles(const std::vector<std::string>& filenames);

  /// Get the number of entries for the given category.
  ///
  /// @param name The name of the category
  ///
  /// @returns The number of entries of that category
  unsigned getEntries(std::string_view name) const;

  /// Hand out the frame data for the next entry of a category.
  ///
  /// @param name        The category name for which to read the next entry
  /// @param collsToRead (optional) the names of the collections that should
  ///                    be available for lazy reading. If this is left out
  ///                    (or empty) all collections will be available.
  ///
  /// @returns FrameData from which a podio::Frame can be constructed if the
  ///          category exists and there are still entries left, nullptr
  ///          otherwise. No collection data is read at this point.
  ///
  /// @throws std::invalid_argument if collsToRead contains collection names
  ///         that are not available
  std::unique_ptr<podio::RNTupleLazyFrameData> readNextEntry(std::string_view name,
                                                             const std::vector<std::string>& collsToRead = {});

  /// Hand out the frame data for a specific entry of a category.
  ///
  /// @param name        The category name for which to read the desired entry
  /// @param entry       The entry number to read
  /// @param collsToRead (optional) the names of the collections that should
  ///                    be available for lazy reading. If this is left out
  ///                    (or empty) all collections will be available.
  ///
  /// @returns FrameData from which a podio::Frame can be constructed if the
  ///          category and the desired entry exist, nullptr otherwise. No
  ///          collection data is read at this point.
  ///
  /// @throws std::invalid_argument if collsToRead contains collection names
  ///         that are not available
  std::unique_ptr<podio::RNTupleLazyFrameData> readEntry(std::string_view name, unsigned entry,
                                                         const std::vector<std::string>& collsToRead = {});

private:
  /// Shared state per category (readers, entry offsets, collection info, etc.)
  // NOTE(review): the keys are std::string_view, so the strings they refer to
  // must outlive this map — presumably they are owned by one of the base
  // classes; confirm.
  std::unordered_map<std::string_view, std::shared_ptr<RNTupleCategoryState>> m_categoryStates{};
};

} // namespace podio

#endif // PODIO_RNTUPLELAZY_READER_H
84 changes: 4 additions & 80 deletions include/podio/RNTupleReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,22 @@
#define PODIO_RNTUPLEREADER_H

#include "podio/ROOTFrameData.h"
#include "podio/podioVersion.h"
#include "podio/utilities/DatamodelRegistryIOHelpers.h"
#include "podio/utilities/RootHelpers.h"
#include "podio/utilities/ReaderCommon.h"
#include "podio/utilities/RNTupleHelpers.h"

#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <RVersion.h>

namespace podio {

/// Introduce a new namespace instead of potentially opening and polluting the
/// ROOT namespace
namespace root_compat {
// Select the RNTupleReader spelling by ROOT version: from 6.35 on the class is
// taken from namespace ROOT, for older versions from ROOT::Experimental.
#if ROOT_VERSION_CODE >= ROOT_VERSION(6, 35, 0)
using RNTupleReader = ROOT::RNTupleReader;
#else
using RNTupleReader = ROOT::Experimental::RNTupleReader;
#endif
} // namespace root_compat

/// The RNTupleReader can be used to read files that have been written with the
/// RNTuple backend.
///
/// The RNTupleReader provides the data as ROOTFrameData from which a podio::Frame
/// can be constructed. It can be used to read files written by the RNTupleWriter.
class RNTupleReader {
class RNTupleReader : public ReaderCommon, public RNTupleReaderCommon {

public:
RNTupleReader() = default;
Expand Down Expand Up @@ -91,74 +76,15 @@ class RNTupleReader {
std::unique_ptr<podio::ROOTFrameData> readEntry(std::string_view name, const unsigned entry,
const std::vector<std::string>& collsToRead = {});

/// Get the names of all the available Frame categories in the current file(s).
///
/// @returns The names of the available categories from the file
std::vector<std::string_view> getAvailableCategories() const;

/// Get the number of entries for the given name
///
/// @param name The name of the category
///
/// @returns The number of entries that are available for the category
unsigned getEntries(std::string_view name) const;

/// Get the build version of podio that has been used to write the current
/// file
///
/// @returns The podio build version
podio::version::Version currentFileVersion() const {
  // Hand out a copy of the version stored in m_fileVersion.
  const auto& storedVersion = m_fileVersion;
  return storedVersion;
}

/// Get the (build) version of a datamodel that has been used to write the
/// current file
///
/// @param name The name of the datamodel
///
/// @returns The (build) version of the datamodel if available or an empty
/// optional
std::optional<podio::version::Version> currentFileVersion(std::string_view name) const {
  // The lookup is forwarded to the datamodel holder.
  std::optional<podio::version::Version> datamodelVersion = m_datamodelHolder.getDatamodelVersion(name);
  return datamodelVersion;
}

/// Get the datamodel definition for the given name
///
/// @param name The name of the datamodel
///
/// @returns The high level definition of the datamodel in JSON format
const std::string_view getDatamodelDefinition(std::string_view name) const {
return m_datamodelHolder.getDatamodelDefinition(name);
}

/// Get all names of the datamodels that are available from this reader
///
/// @returns The names of the datamodels
std::vector<std::string> getAvailableDatamodels() const {
  // The list of names is obtained from the datamodel holder.
  std::vector<std::string> datamodelNames = m_datamodelHolder.getAvailableDatamodels();
  return datamodelNames;
}

private:
/**
* Initialize the given category by filling the maps with metadata information
* that will be used later
*/
bool initCategory(std::string_view category);

/**
* Read and reconstruct the generic parameters of the Frame
*/
GenericParameters readEventMetaData(root_compat::RNTupleReader* reader, const unsigned localEntry);

std::unique_ptr<root_compat::RNTupleReader> m_metadata{};

podio::version::Version m_fileVersion{};
DatamodelDefinitionHolder m_datamodelHolder{};

std::unordered_map<std::string_view, std::vector<std::unique_ptr<root_compat::RNTupleReader>>> m_readers{};
std::unordered_map<std::string, std::unique_ptr<root_compat::RNTupleReader>> m_metadata_readers{};
std::vector<std::string> m_filenames{};

std::unordered_map<std::string_view, unsigned> m_entries{};
// Map category to a vector that contains at how many entries each reader starts
// For example, if we have 3 readers and the first one has 10 entries, the second one 20 and the third one 30
Expand All @@ -170,9 +96,7 @@ class RNTupleReader {
/// Map each category to the collections that have been written and are available
std::unordered_map<std::string_view, std::vector<podio::root_utils::CollectionWriteInfo>> m_collectionInfo{};

std::vector<std::string> m_availableCategories{};

std::unordered_map<std::string_view, std::shared_ptr<podio::CollectionIDTable>> m_idTables{};
std::unordered_map<std::string_view, std::shared_ptr<const podio::CollectionIDTable>> m_idTables{};
};

} // namespace podio
Expand Down
Loading
Loading