File Format#
Interface#
-
class FileFormat#
FileFormat is used to create ReaderBuilder and WriterBuilder.
Public Functions
-
virtual ~FileFormat() = default#
-
virtual const std::string &Identifier() const = 0#
- Returns:
The identifier of the file format, e.g., orc.
-
virtual Result<std::unique_ptr<ReaderBuilder>> CreateReaderBuilder(int32_t batch_size) const = 0#
- Returns:
A reader builder that creates readers with the specified batch size.
-
virtual Result<std::unique_ptr<WriterBuilder>> CreateWriterBuilder(::ArrowSchema *schema, int32_t batch_size) const = 0#
- Returns:
A WriterBuilder of the corresponding schema, or an error status when the schema is invalid.
-
virtual Result<std::unique_ptr<FormatStatsExtractor>> CreateStatsExtractor(::ArrowSchema *schema) const = 0#
- Returns:
A FormatStatsExtractor of the current file format.
-
virtual ~FileFormat() = default#
-
class FormatWriter#
File format writer, each writer corresponds to a data file.
Public Functions
-
virtual ~FormatWriter() = default#
-
virtual Status AddBatch(::ArrowArray *batch) = 0#
Add a batch of records to the format writer.
Note
The batch must conform to the schema expected by the writer.
Note
This method can be called multiple times to write data incrementally.
Note
After calling Finish(), this method should not be called again.
- Parameters:
batch – Pointer to an ArrowArray containing the batch data to write.
- Returns:
Status indicating success (OK) or failure with error information.
-
virtual Status Flush() = 0#
Flushes all intermediate buffered data to the format writer.
- Returns:
Error status returned if the encoder cannot be flushed, or if the output stream returns an error.
-
virtual Status Finish() = 0#
Finishes the writing.
This must flush all internal buffers, finish encoding, and write footers.
Note
The writer is not expected to handle any more records via AddBatch() after this method is called.
Warning
This method MUST NOT close the stream that the writer writes to. Closing the stream is expected to happen through the invoker of this method afterwards.
- Returns:
Error status returned if the finalization fails.
-
virtual Result<bool> ReachTargetSize(bool suggested_check, int64_t target_size) const = 0#
Check if the writer has reached the target_size.
- Parameters:
suggested_check – A hint indicating whether the check should be performed; subclasses may still decide for themselves whether to check.
target_size – The size of the target.
- Returns:
True if the target size was reached, otherwise false.
- Returns:
Error status returned if calculating the length fails.
-
virtual std::shared_ptr<Metrics> GetWriterMetrics() const = 0#
Get metrics of the writer.
- Returns:
The writer metrics accumulated up to the current state.
-
virtual ~FormatWriter() = default#
-
class WriterBuilder#
Create a file format writer based on the file output stream. Allows you to specify memory pool.
Subclassed by paimon::DirectWriterBuilder, paimon::SpecificFSWriterBuilder
Public Functions
-
virtual ~WriterBuilder() = default#
Set memory pool to use.
Build a file format writer based on the file output stream and file compression.
-
virtual ~WriterBuilder() = default#
-
class BatchReader#
A batch reader that supports reading batch data into an arrow array.
Subclassed by paimon::FileBatchReader
Public Types
-
using ReadBatch = std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>#
Public Functions
-
virtual ~BatchReader() = default#
-
virtual Result<ReadBatch> NextBatch() = 0#
Retrieves the next batch of data.
If EOF is reached, returns an OK status with a nullptr array. Returns an error status only for critical failures (e.g., IO errors). Once an error is returned, this method must not be retried, as it will repeatedly return the same error code.
- Returns:
A result containing a ReadBatch, which consists of a unique pointer to ArrowArray and a unique pointer to ArrowSchema. The returned array contains a _VALUE_KIND field (the first field) to indicate the row kind of each row. Deleted or index-filtered rows are removed.
-
virtual Result<ReadBatchWithBitmap> NextBatchWithBitmap()#
Retrieves the next batch of data.
If EOF is reached, returns an OK status with a nullptr array. Returns an error status only for critical failures (e.g., IO errors). Once an error is returned, this method must not be retried, as it will repeatedly return the same error code.
- Returns:
A result containing a ReadBatch and a valid bitmap. ReadBatch consists of a unique pointer to ArrowArray and a unique pointer to ArrowSchema. The returned array contains a _VALUE_KIND field (the first field) to indicate the row kind of each row. Deleted or index-filtered records may be retained in ReadBatch, while the bitmap indicates the valid row ids. If deletion vectors or indexes are enabled, this function is more efficient than NextBatch(). The default implementation calls NextBatch() and adds all rows to the valid bitmap. Note that the returned bitmap has at least one valid row id.
-
virtual std::shared_ptr<Metrics> GetReaderMetrics() const = 0#
Retrieves the reader’s metrics.
Note that calling this method frequently may incur significant performance overhead.
- Returns:
A shared pointer to the Metrics object.
-
virtual void Close() = 0#
Closes the BatchReader, releasing any associated resources.
After calling this method, the behavior of further calls to NextBatch() is undefined; such calls should be avoided.
Public Static Functions
-
static bool IsEofBatch(const ReadBatch &batch)#
Determine whether a ReadBatch or ReadBatchWithBitmap is an EOF batch. If this returns true, all the data has been returned.
-
static bool IsEofBatch(const ReadBatchWithBitmap &batch_with_bitmap)#
-
static ReadBatchWithBitmap MakeEofBatchWithBitmap()#
-
using ReadBatch = std::pair<std::unique_ptr<ArrowArray>, std::unique_ptr<ArrowSchema>>#
-
class FileBatchReader : public paimon::BatchReader#
The batch reader for a single file supports returning the line number of the last batch read for deletion vector judgment.
Public Functions
-
virtual Result<std::unique_ptr<::ArrowSchema>> GetFileSchema() const = 0#
- Returns:
The schema of the file.
Resets the read schema and predicate.
If SetReadSchema() is not called, NextBatch() will return data with the file schema. After resetting the read schema, NextBatch() will read data starting from the first row.
- Parameters:
read_schema – The schema to set for reading.
predicate – The predicate to apply for filtering data.
selection_bitmap – The bitmap to apply for filtering data.
- Returns:
The status of the operation.
-
virtual Status SeekToRow(uint64_t row_number) = 0#
Seeks to a specific row in the file.
- Parameters:
row_number – The row number to seek to.
- Returns:
The status of the operation.
-
virtual uint64_t GetPreviousBatchFirstRowNumber() const = 0#
Get the row number of the first row in the previously read batch.
-
virtual uint64_t GetNumberOfRows() const = 0#
Get the number of rows in the file.
-
virtual uint64_t GetNextRowToRead() const = 0#
Retrieves the row number of the next row to be read.
This method indicates the current read position within the file.
- Returns:
The row number of the next row to read.
-
virtual Result<std::vector<std::pair<uint64_t, uint64_t>>> GenReadRanges(bool *need_prefetch) const = 0#
Generates a list of row ranges to be read in batches.
Each range specifies the start and end row numbers for a batch, allowing for efficient batch processing.
The underlying format layer (e.g., parquet) is responsible for determining the most effective way to split the data. This could be by row groups, stripes, or other internal data structures. The key principle is to split the data into contiguous, seekable ranges to minimize read amplification.
For example:
A parquet format could split by RowGroup directly, ensuring each range aligns with a single RowGroup.
The smallest splittable unit must be seekable to its start position, and the splitting strategy should aim to avoid read amplification.
- Parameters:
need_prefetch – A pointer to a boolean. The format layer sets this to indicate whether prefetching is beneficial for the current scenario, to avoid performance regression in certain cases.
- Returns:
A vector of pairs, where each pair represents a range with a start and end row number.
-
virtual Status SetReadRanges(const std::vector<std::pair<uint64_t, uint64_t>> &read_ranges) = 0#
Sets the specific row ranges as a hint to be read from format file.
If the specific file format does not support explicit range-based reads, implementations may gracefully ignore this hint and provide an empty (no-op) implementation.
- Parameters:
read_ranges – A vector of pairs, where each pair defines a half-open interval [start_row, end_row). The start_row is inclusive, and the end_row is exclusive.
-
virtual bool SupportPreciseBitmapSelection() const = 0#
Returns whether precise reads are supported when a bitmap is pushed down.
-
virtual Result<ReadBatch> NextBatch() = 0#
Retrieves the next batch of data.
If EOF is reached, returns an OK status with a nullptr array. Returns an error status only for critical failures (e.g., IO errors). Once an error is returned, this method must not be retried, as it will repeatedly return the same error code.
- Returns:
A result containing a ReadBatch, which consists of a unique pointer to ArrowArray and a unique pointer to ArrowSchema. The returned array contains a _VALUE_KIND field (the first field) to indicate the row kind of each row. Deleted or index-filtered rows are removed.
-
virtual Result<ReadBatchWithBitmap> NextBatchWithBitmap()#
Retrieves the next batch of data.
If EOF is reached, returns an OK status with a nullptr array. Returns an error status only for critical failures (e.g., IO errors). Once an error is returned, this method must not be retried, as it will repeatedly return the same error code.
- Returns:
A result containing a ReadBatch and a valid bitmap. ReadBatch consists of a unique pointer to ArrowArray and a unique pointer to ArrowSchema. The returned array contains a _VALUE_KIND field (the first field) to indicate the row kind of each row. Deleted or index-filtered records may be retained in ReadBatch, while the bitmap indicates the valid row ids. If deletion vectors or indexes are enabled, this function is more efficient than NextBatch(). The default implementation calls NextBatch() and adds all rows to the valid bitmap. Note that the returned bitmap has at least one valid row id.
-
virtual Result<std::unique_ptr<::ArrowSchema>> GetFileSchema() const = 0#
-
class ReaderBuilder#
Create a file batch reader based on the file path. Allows you to specify memory pool.
Public Functions
-
virtual ~ReaderBuilder() = default#
Set memory pool to use.
Build a file batch reader based on the created InputStream.
-
virtual Result<std::unique_ptr<FileBatchReader>> Build(const std::string &path) const = 0#
Build a file batch reader based on the file path.
-
virtual ~ReaderBuilder() = default#
-
class FileFormatFactory : public paimon::Factory#
A factory for creating FileFormat instances.
Public Functions
-
~FileFormatFactory() override#
-
virtual Result<std::unique_ptr<FileFormat>> Create(const std::map<std::string, std::string> &options) const = 0#
Create a FileFormat with the corresponding options.
Public Static Functions
-
static Result<std::unique_ptr<FileFormat>> Get(const std::string &identifier, const std::map<std::string, std::string> &options)#
Get the FileFormat corresponding to the identifier.
- Pre:
Factory is already registered.
-
~FileFormatFactory() override#
-
class FormatStatsExtractor#
Extracts statistics directly from file.
Public Functions
-
virtual ~FormatStatsExtractor() = default#
Extracts statistics for each column of a data file based on the file path and file system.
Extracts statistics for each column and the FileInfo of a data file based on the file path and file system.
-
class FileInfo#
File info fetched from the physical file; currently it only includes the row count.
-
virtual ~FormatStatsExtractor() = default#