PyIceberg

0.11.1 · active · verified Sun Mar 29

PyIceberg is the official Python client for Apache Iceberg, an open table format designed for huge analytic datasets. It provides a pure Pythonic experience, enabling DML operations and queries on Iceberg tables without a JVM, and integrates seamlessly with popular Python data tools like Polars, Pandas, and DuckDB. Currently at version 0.11.1, the library maintains a regular release cadence with minor feature releases and necessary patch updates.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to initialize a local SQL catalog, define an Iceberg schema, create a new table, append PyArrow data to it, and then read the data back. It uses the local file system for both catalog metadata and data storage.

"""PyIceberg quickstart: local SQL catalog -> schema -> table -> append -> scan.

Everything (catalog database and table data) lives under a single local
directory, so the script needs no external services.
"""
import datetime
import os

import pyarrow as pa
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType, TimestampType

# Configure a local SQL catalog by passing its properties directly.
# For simplicity, both the SQLite catalog database and the warehouse live
# in one local directory.
warehouse_path = '/tmp/pyiceberg_warehouse'
os.makedirs(warehouse_path, exist_ok=True)

catalog = load_catalog(
    "default",
    type="sql",
    uri=f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
    warehouse=f"file://{warehouse_path}",
)

print(f"✓ Successfully loaded catalog: {catalog.name}")

# Define a simple schema. Iceberg types are concrete classes from
# `pyiceberg.types`; `TimestampType` is the timezone-naive timestamp.
schema = Schema(
    NestedField(1, "id", LongType(), required=True),
    NestedField(2, "name", StringType(), required=False),
    NestedField(3, "event_time", TimestampType(), required=False),
)

namespace = "default"
table_name = "my_sample_table"
identifier = f"{namespace}.{table_name}"

# Create a namespace if it doesn't exist
catalog.create_namespace_if_not_exists(namespace)
print(f"✓ Ensured namespace '{namespace}' exists.")

# Create the table, or load it when a previous run already created it.
# Any other failure (bad schema, I/O error, ...) propagates instead of being
# mistaken for "table already exists".
table = catalog.create_table_if_not_exists(identifier, schema)
print(f"✓ Table ready: {identifier}")

# Prepare some data using PyArrow. The Arrow schema is spelled out so that
# `id` is non-nullable: an inferred schema would mark it nullable, which does
# not match the required Iceberg field and makes the append fail.
arrow_schema = pa.schema(
    [
        pa.field("id", pa.int64(), nullable=False),
        pa.field("name", pa.string()),
        pa.field("event_time", pa.timestamp("us")),
    ]
)
# Take one timestamp so the rows are exactly one day apart.
now = datetime.datetime.now()
data = pa.table(
    {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "event_time": [now - datetime.timedelta(days=i) for i in range(3)],
    },
    schema=arrow_schema,
)

# Append data to the table
table.append(data)
print(f"✓ Appended {data.num_rows} rows to the table.")

# Read data from the table
scan_result = table.scan().to_arrow()
print(f"\nTotal rows read: {scan_result.num_rows}")
print("Sample data:")
# Print straight from Arrow so the quickstart does not require pandas.
for row in scan_result.to_pylist():
    print(row)

# Clean up (optional: uncomment to drop the table and namespace)
# catalog.drop_table(identifier)
# catalog.drop_namespace(namespace)
# print(f"Cleaned up table {table_name} and namespace {namespace}.")

view raw JSON →