File Index#

Interface#

class FileIndexer#

File index interface, used to read and write a file index.

Public Functions

virtual ~FileIndexer() = default#
virtual Result<std::shared_ptr<FileIndexReader>> CreateReader(::ArrowSchema *arrow_schema, int32_t start, int32_t length, const std::shared_ptr<InputStream> &input_stream, const std::shared_ptr<MemoryPool> &pool) const = 0#

Create FileIndexReader with input stream.

Parameters:
  • arrow_schema – ArrowSchema derived from an Arrow schema or struct type with the specified indexed field.

  • start – Start position of input stream.

  • length – Length of index bytes.

  • input_stream – Input stream from which the index is read.

  • pool – Memory pool for memory allocation.

Returns:

A FileIndexReader to read index.

virtual Result<std::shared_ptr<FileIndexWriter>> CreateWriter(::ArrowSchema *arrow_schema, const std::shared_ptr<MemoryPool> &pool) const = 0#

Create FileIndexWriter for arrow schema.

Parameters:
  • arrow_schema – ArrowSchema derived from an Arrow schema or struct type with the specified indexed field.

  • pool – Memory pool for memory allocation.

Returns:

A FileIndexWriter to write index.

class FileIndexerFactory : public paimon::Factory#

File index factory to construct FileIndexer.

Public Functions

~FileIndexerFactory() override#
virtual Result<std::unique_ptr<FileIndexer>> Create(const std::map<std::string, std::string> &options) const = 0#

Create a FileIndexer with specified options.

Public Static Functions

static Result<std::unique_ptr<FileIndexer>> Get(const std::string &identifier, const std::map<std::string, std::string> &options)#

Get a FileIndexer according to identifier and options.

class FileIndexWriter#

Interface for writing file-level index data from Arrow batches.

Public Functions

virtual ~FileIndexWriter() = default#
virtual Status AddBatch(::ArrowArray *batch) = 0#

Adds a batch of data to the index writer.

Parameters:

batch – Pointer to a C ArrowArray derived from an Arrow struct array that contains the specified indexed field.

Returns:

Status::OK() on success; otherwise, an error indicating failure (e.g., schema mismatch).

virtual Result<PAIMON_UNIQUE_PTR<Bytes>> SerializedBytes() const = 0#

Serializes the built index into a byte buffer.

Note

This method returns the complete serialized form of the index after all batches have been added. It can be called only once and typically assumes no further calls to AddBatch() will occur afterward.

Returns:

A unique pointer to a byte array containing the serialized index data, or an error if serialization fails.

class FileIndexReader : public paimon::FunctionVisitor<std::shared_ptr<FileIndexResult>>#

Evaluates filter predicates against a file-level index to determine file eligibility.

FileIndexReader implements the FunctionVisitor interface specialized to produce std::shared_ptr<FileIndexResult> objects. It reads pre-built file-level index data (e.g., bitmap, bsi or bloom filters) from an index file and evaluates whether a given data file may contain rows matching a specific predicate.

Public Functions

virtual Result<std::shared_ptr<FileIndexResult>> VisitIsNotNull() override#

Evaluates the IS NOT NULL predicate on the indexed column.

virtual Result<std::shared_ptr<FileIndexResult>> VisitIsNull() override#

Evaluates the IS NULL predicate on the indexed column.

virtual Result<std::shared_ptr<FileIndexResult>> VisitEqual(const Literal &literal) override#

Evaluates the equality (==) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitNotEqual(const Literal &literal) override#

Evaluates the inequality (!=) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitLessThan(const Literal &literal) override#

Evaluates the less-than (<) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitLessOrEqual(const Literal &literal) override#

Evaluates the less-than-or-equal (<=) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitGreaterThan(const Literal &literal) override#

Evaluates the greater-than (>) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitGreaterOrEqual(const Literal &literal) override#

Evaluates the greater-than-or-equal (>=) predicate against the given literal.

virtual Result<std::shared_ptr<FileIndexResult>> VisitIn(const std::vector<Literal> &literals) override#

Evaluates the IN predicate against a list of literals.

virtual Result<std::shared_ptr<FileIndexResult>> VisitNotIn(const std::vector<Literal> &literals) override#

Evaluates the NOT IN predicate against a list of literals.

virtual Result<std::shared_ptr<FileIndexResult>> VisitStartsWith(const Literal &prefix) override#

Evaluates whether string values start with the given prefix.

virtual Result<std::shared_ptr<FileIndexResult>> VisitEndsWith(const Literal &suffix) override#

Evaluates whether string values end with the given suffix.

virtual Result<std::shared_ptr<FileIndexResult>> VisitContains(const Literal &literal) override#

Evaluates whether string values contain the given substring.

class FileIndexFormat#

Defines the on-disk format and versioning for Paimon file-level indexes.

File index file format. Puts all columns and offsets in the header.

Public Static Functions

static Result<std::unique_ptr<Reader>> CreateReader(const std::shared_ptr<InputStream> &input_stream, const std::shared_ptr<MemoryPool> &pool)#

Creates a Reader to parse an index file (may contain multiple indexes) from the given input stream.

Parameters:
  • input_stream – Input stream containing serialized index data.

  • pool – Memory pool for temporary allocations during reading.

Returns:

A unique pointer to a Reader on success, or an error if the stream is invalid (e.g., wrong magic, unsupported version, or corrupted data).

Public Static Attributes

static const int64_t MAGIC#
static const int32_t EMPTY_INDEX_FLAG#
static const int32_t V_1#
class Reader#

Reader for file index file.

Public Functions

virtual ~Reader() = default#
virtual Result<std::vector<std::shared_ptr<FileIndexReader>>> ReadColumnIndex(const std::string &column_name, ::ArrowSchema *arrow_schema) const = 0#

Reads index data for a specific column from the index file.

Parameters:
  • column_name – Name of the column to retrieve index data for.

  • arrow_schema – Arrow schema that must contain a field corresponding to column_name.

Returns:

A vector of shared pointers to FileIndexReader objects, each corresponding to a different index type; or an error if the column is not indexed or the index is malformed.

class FileIndexResult : public std::enable_shared_from_this<FileIndexResult>#

File index result used to decide whether to filter out a file.

Subclassed by paimon::BitmapIndexResult, paimon::Remain, paimon::Skip

Public Functions

virtual ~FileIndexResult() = default#
virtual Result<bool> IsRemain() const = 0#
Returns:

Whether the file should be retained (i.e., not skipped).

virtual Result<std::shared_ptr<FileIndexResult>> And(const std::shared_ptr<FileIndexResult> &other)#

Compute the intersection of the current result with the provided result.

virtual Result<std::shared_ptr<FileIndexResult>> Or(const std::shared_ptr<FileIndexResult> &other)#

Compute the union of the current result with the provided result.

virtual std::string ToString() const = 0#

Public Static Functions

static std::shared_ptr<FileIndexResult> Remain()#

Note

This is a singleton-like utility; all calls return equivalent objects.

Returns:

A shared instance representing “retain the file”.

static std::shared_ptr<FileIndexResult> Skip()#

Note

This is a singleton-like utility; all calls return equivalent objects.

Returns:

A shared instance representing “skip the file”.

class BitmapIndexResult : public paimon::FileIndexResult#

The bitmap implementation of the file index result, which represents the result at row granularity.

Note

The inner bitmap in BitmapIndexResult is lazily initialized only when the result is about to be used.

Public Types

using BitmapSupplier = std::function<Result<RoaringBitmap32>()>#

Public Functions

explicit BitmapIndexResult(BitmapSupplier bitmap_supplier)#
~BitmapIndexResult() override#
virtual Result<bool> IsRemain() const override#
Returns:

Whether the file should be retained (i.e., not skipped).

virtual Result<std::shared_ptr<FileIndexResult>> And(const std::shared_ptr<FileIndexResult> &other) override#

Compute the intersection of the current result with the provided result.

virtual Result<std::shared_ptr<FileIndexResult>> Or(const std::shared_ptr<FileIndexResult> &other) override#

Compute the union of the current result with the provided result.

Result<const RoaringBitmap32*> GetBitmap() const#
Returns:

Inner RoaringBitmap32.

virtual std::string ToString() const override#