xgrammar: Structured Generation

0.1.33 · active · verified Thu Apr 09

xgrammar is a Python library focused on efficient, flexible, and portable structured generation, primarily for Large Language Models (LLMs). It allows defining output constraints using grammars (e.g., JSON schema) to ensure LLM outputs conform to specific formats. It is under active development, with frequent minor releases providing new features and performance improvements.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to define a JSON schema, convert it into an xgrammar `TagDispatch` object, and use it with a tokenizer to generate valid token masks for structured LLM output. It includes a mock tokenizer for standalone execution, but for real usage, a proper LLM tokenizer (e.g., from HuggingFace or MLC LLM) should be used.

from xgrammar.dfa import JsonSchemaConverter, TagDispatch
from xgrammar.tokenizer import get_tokenizer_and_vocabulary
import json
import os

# For demonstration, use a placeholder tokenizer. In a real scenario, use a specific LLM tokenizer.
# Ensure your tokenizer path is correct or specify a HuggingFace model_id.
# Example: tokenizer_path = "mistralai/Mistral-7B-Instruct-v0.2"
# Placeholder for a simple tokenizer for local testing without large model dependencies.
# If running this with a real model, replace 'llama' with your model's family.
# For a full setup, you might need to install 'mlc-llm' and use its tokenizer.

def create_mock_tokenizer_and_vocabulary():
    """Build a character-level mock tokenizer and vocabulary for standalone demos.

    Each character encodes to its Unicode code point, so ``encode`` and
    ``decode`` round-trip any ASCII text without a real LLM tokenizer.
    In real usage, load a proper tokenizer (e.g. HuggingFace or MLC LLM)
    instead.

    Returns:
        tuple: ``(tokenizer, vocab)`` where ``tokenizer`` exposes
        ``encode(text) -> list[int]``, ``decode(tokens) -> str`` and
        ``vocab_size() -> int``, and ``vocab`` maps single characters
        (printable ASCII plus newline) to their token ids.
    """
    class MockTokenizer:
        def encode(self, text):
            # One token per character: the character's code point.
            return [ord(c) for c in text]

        def decode(self, tokens):
            # Generator is enough here; no need to materialize a list.
            return ''.join(chr(t) for t in tokens)

        def vocab_size(self):
            return 256  # full 8-bit ASCII range

    # Printable ASCII (32..126) plus newline. Note: chr(32) is ' ', so the
    # original's extra `vocab[' '] = 32` assignment was redundant and is dropped.
    vocab = {chr(i): i for i in range(32, 127)}
    vocab['\n'] = 10
    return MockTokenizer(), vocab

# Tokenizer selection: prefer a real tokenizer when MLC_LLM_MODEL_PATH points
# at one; fall back to the standalone mock otherwise or on any load failure.
MLC_LLM_MODEL_PATH = os.environ.get("MLC_LLM_MODEL_PATH", "")

def _load_tokenizer_pair():
    # Without the env var we never attempt a real load — go straight to mock.
    if not MLC_LLM_MODEL_PATH:
        return create_mock_tokenizer_and_vocabulary()
    try:
        return get_tokenizer_and_vocabulary(tokenizer_path=MLC_LLM_MODEL_PATH, vocab_type="hf_llama")
    except Exception as e:
        print(f"Could not load tokenizer from MLC_LLM_MODEL_PATH: {e}. Using mock tokenizer.")
        return create_mock_tokenizer_and_vocabulary()

tokenizer, vocabulary = _load_tokenizer_pair()

# JSON schema the LLM output must satisfy: an object whose "name" and "age"
# fields are mandatory and whose "isStudent" field is optional.
_properties = {
    "name": {"type": "string"},
    "age": {"type": "integer", "minimum": 0},
    "isStudent": {"type": "boolean"},
}
json_schema = {
    "type": "object",
    "properties": _properties,
    "required": ["name", "age"],
}

# Translate the schema into an xgrammar grammar, then compile it against the
# tokenizer/vocabulary so per-step token masks can be produced.
json_converter = JsonSchemaConverter(json_schema)
grammar = TagDispatch(json_converter.grammar, tokenizer, vocabulary)

# Simulate mid-generation: the model has emitted everything up to the value of
# "isStudent". The mask below tells the sampler which tokens may come next.
partial_input_text = '{"name": "Alice", "age": 30, "isStudent": '
print(f"Partial input: {partial_input_text}")

# In a real pipeline this mask would be applied to the LLM's logits.
next_token_mask = grammar.get_mask(partial_input_text, use_cache=True)

# At this position the grammar expects a boolean, so 't' (for "true") and
# 'f' (for "false") should be allowed...
for ch in ('t', 'f'):
    tid = tokenizer.encode(ch)[0]
    print(f"Is '{ch}' (id={tid}) allowed? {next_token_mask[tid]}")

# ...while a digit such as '5' should not be.
invalid_token_id = tokenizer.encode('5')[0]
print(f"Is '5' (id={invalid_token_id}) allowed? {next_token_mask[invalid_token_id]}")

# Note: mask entries are typically 0/1 (or -inf/0 when masking logits).
# The grammar spells booleans out as 'true'/'false', so after 't' is
# generated, a subsequent mask would expect 'r', then 'u', then 'e'.

view raw JSON →