SeqIO

0.0.20 · active · verified Thu Apr 16

SeqIO is a Python library by Google for creating task-based datasets, preprocessing pipelines, and evaluation for sequence models. It integrates deeply with T5, Gin-config, and TensorFlow/JAX/PyTorch backends, providing a flexible framework for machine learning research, particularly in NLP. The current version is 0.0.20, and it's under active development with frequent minor releases.

Common errors

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to define a custom task in SeqIO, including a data source function, a preprocessor to convert data into integer IDs, and a mock vocabulary. It registers the task and then retrieves a processed `tf.data.Dataset` for inspection. This setup forms the basis for training sequence models.

import seqio
import tensorflow as tf
import functools

# 1. Define a minimal mock vocabulary (required for seqio.Feature)
class SimpleVocabulary(seqio.Vocabulary):
    """Toy character-level vocabulary: each character maps to its ord() code.

    NOTE(review): seqio.Vocabulary's abstract interface uses the lowercase
    `eos_id` / `unk_id` properties — the original uppercase `EOS_ID` would
    never be consulted by the framework — and also requires
    `_base_vocab_size` plus graph-mode `_encode_tf` / `_decode_tf`
    implementations. Confirm against the installed seqio version.
    """

    def _encode(self, s):
        # Simple char -> int (Unicode code point).
        return [ord(c) for c in s]

    def _decode(self, ids):
        # Simple int -> char; skip padding zeros so decoded text stays clean.
        return "".join(chr(i) for i in ids if i > 0)

    def _encode_tf(self, s):
        # Graph-mode encode, matching _encode above.
        return tf.dtypes.cast(tf.strings.unicode_decode(s, "UTF-8"), tf.int32)

    def _decode_tf(self, ids):
        # Graph-mode decode, matching _decode above.
        return tf.strings.unicode_encode(tf.dtypes.cast(ids, tf.int32), "UTF-8")

    @property
    def eos_id(self):
        return 1

    @property
    def unk_id(self):
        # No dedicated UNK id in this toy vocabulary.
        return None

    @property
    def _base_vocab_size(self):
        return 256  # ASCII range

    @property
    def vocab_size(self):
        # Kept for backward compatibility with the original class interface.
        return 256  # ASCII range

# 2. Define a data source function that returns a tf.data.Dataset
def my_data_source_fn(split, shuffle_files=False):
    """Return the toy reverse-string dataset for `split` ("train" only)."""
    if split != "train":
        raise ValueError(f"Unknown split: {split}")
    examples = {
        "inputs": ["hello world", "python is fun"],
        "targets": ["olleh dlrow", "nohtyp si nuf"],  # Simple reverse task
    }
    return tf.data.Dataset.from_tensor_slices(examples)

# 3. Define a simple preprocessor (converts string to integer IDs)
# NOTE(review): the original decorator name `seqio.map_over_dataset_fn` is
# not part of seqio's public API — the decorator is `seqio.map_over_dataset`.
# Also, `.numpy()` is unavailable inside a tf.data map (the function is
# traced into a TF graph), so the char -> code-point conversion must use
# graph ops instead of Python `ord()`.
@seqio.map_over_dataset
def tokenize_example(example):
    """Convert each string feature to an int32 vector of Unicode code points.

    Args:
      example: dict with scalar string tensors "inputs" and "targets".

    Returns:
      dict with the same keys, values replaced by 1-D int32 tensors.
    """
    return {
        "inputs": tf.dtypes.cast(
            tf.strings.unicode_decode(example["inputs"], "UTF-8"), tf.int32),
        "targets": tf.dtypes.cast(
            tf.strings.unicode_decode(example["targets"], "UTF-8"), tf.int32),
    }

# 4. Register the task with SeqIO.
# NOTE(review): `seqio.Task.make_task` is not a seqio API — tasks are
# registered via `seqio.TaskRegistry.add`. There is also no
# `seqio.preprocessors.trim_and_pad` task preprocessor (trimming/padding to
# `sequence_length` is handled by `get_dataset` / the feature converter),
# so that step is dropped. `append_eos_after_trim` receives
# `output_features` and `sequence_length` automatically from the framework.
seqio.TaskRegistry.add(
    "simple_reverse_task",
    source=seqio.FunctionDataSource(
        dataset_fn=my_data_source_fn,
        splits=["train"],
    ),
    preprocessors=[
        tokenize_example,
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features={
        "inputs": seqio.Feature(vocabulary=SimpleVocabulary(), add_eos=True),
        "targets": seqio.Feature(vocabulary=SimpleVocabulary(), add_eos=True),
    },
)

# 5. Retrieve the task and get its processed dataset
task = seqio.get_mixture_or_task("simple_reverse_task")
feature_lengths = {"inputs": 20, "targets": 20}  # Max sequence length for features
ds = task.get_dataset(
    sequence_length=feature_lengths,
    split="train",
    shuffle=False,
)

# 6. Iterate through an example to verify
for ex in ds.take(1):
    print("\n--- Processed Example ---")
    print("Raw features:", {name: tensor.numpy() for name, tensor in ex.items()})

    # Decode each feature back to text with its own vocabulary.
    features = task.output_features
    for key in ("inputs", "targets"):
        decoded = features[key].vocabulary.decode(ex[key].numpy())
        print(f"Decoded {key}: '{decoded}'")

view raw JSON →