PyIceberg-Core

0.9.0 · active · verified Fri Apr 10

pyiceberg-core is a foundational Python library that provides a Rust-powered core for PyIceberg, enabling efficient access to Apache Iceberg tables without a JVM. It is primarily intended as an internal dependency of the main PyIceberg library, where it provides performance optimizations for Iceberg data operations. The current version is 0.9.0; it is actively maintained as part of the broader Apache Iceberg Python project, with frequent releases aligned with PyIceberg.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to use PyIceberg (which leverages pyiceberg-core) to set up a local SQLite catalog, create a namespace and a table with a defined schema, append data using PyArrow, and then read the data back. At the end, the script deletes the temporary warehouse directory it created.

import os
import shutil
import pyarrow as pa
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, LongType, IntegerType

# Quickstart script: build a throwaway SQLite-backed Iceberg catalog, create a
# namespace and a table, append three rows from PyArrow, read them back, and
# remove everything that was written to disk.

# Scratch directory holding both the SQLite catalog file and the table data
WAREHOUSE_PATH = "/tmp/pyiceberg_warehouse"
CATALOG_DB_PATH = os.path.join(WAREHOUSE_PATH, "pyiceberg_catalog.db")

NAMESPACE = "my_namespace"
TABLE_NAME = "my_table"

# Start from an empty warehouse so leftovers from a previous run cannot
# collide with the namespace/table created below
if os.path.exists(WAREHOUSE_PATH):
    shutil.rmtree(WAREHOUSE_PATH)
os.makedirs(WAREHOUSE_PATH, exist_ok=True)

try:
    # Configure and load a local SQL catalog
    catalog = load_catalog(
        "default",
        type="sql",
        uri=f"sqlite:///{CATALOG_DB_PATH}",
        warehouse=f"file://{WAREHOUSE_PATH}",
    )

    # Create a namespace (database)
    catalog.create_namespace(NAMESPACE, properties={"comment": "My first Iceberg namespace"})
    print(f"Created namespace: {NAMESPACE}")

    # Define a schema for the Iceberg table. Nullability is spelled out for
    # every field so it visibly lines up with the Arrow schema further down.
    schema = Schema(
        NestedField(1, "id", LongType(), required=True),
        NestedField(2, "name", StringType(), required=False),
        NestedField(3, "age", IntegerType(), required=False),
    )

    # Create an Iceberg table
    table = catalog.create_table(f"{NAMESPACE}.{TABLE_NAME}", schema, properties={
        "format-version": "2",
        "write.parquet.compression-codec": "zstd",
    })
    # Table.name is a method returning the identifier tuple; it must be called,
    # otherwise the f-string renders a bound-method repr.
    print(f"Created table: {'.'.join(table.name())}")

    # Prepare data with PyArrow. The Arrow schema is given explicitly because
    # type inference would yield nullable int64 columns, which do not match a
    # required `id` or a 32-bit `age`; the append would then be rejected by
    # PyIceberg's schema check.
    arrow_schema = pa.schema([
        pa.field("id", pa.int64(), nullable=False),
        pa.field("name", pa.string(), nullable=True),
        pa.field("age", pa.int32(), nullable=True),
    ])
    data = pa.table(
        {
            "id": [1, 2, 3],
            "name": ["Alice", "Bob", "Charlie"],
            "age": [30, 24, 35],
        },
        schema=arrow_schema,
    )

    # Append data to the table
    table.append(data)
    print("Appended data to the table.")

    # Read data from the table (a PyArrow table; pandas is needed for printing)
    read_df = table.scan().to_arrow()
    print("\nData read from Iceberg table:")
    print(read_df.to_pandas())
finally:
    # Clean up even when a step above failed, so no files are left in /tmp
    shutil.rmtree(WAREHOUSE_PATH)
    print(f"Cleaned up warehouse at {WAREHOUSE_PATH}")

view raw JSON →