TorchText

0.18.0 · maintenance · verified Sun Apr 12

TorchText is a Python library providing text utilities, models, transforms, and datasets for PyTorch. With version 0.18.0, released in April 2024, active development of new features stopped; it is considered the last stable release and focuses primarily on compatibility with PyTorch 2.3.0 and subsequent patch releases.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates the modern TorchText API for text classification. It covers accessing a raw dataset, building a vocabulary, defining text processing pipelines using `get_tokenizer` and `build_vocab_from_iterator`, and finally using `torch.utils.data.DataLoader` with a custom `collate_fn` for batching, padding, and numericalization.

import torch
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

def yield_tokens(data_iter, tokenizer):
    """Lazily yield the token list for each text in an iterable of (label, text) pairs."""
    for _label, raw_text in data_iter:
        yield tokenizer(raw_text)

def collate_batch(batch, vocab, tokenizer):
    """Collate (label, text) pairs into the tensors an EmbeddingBag-style model expects.

    Returns a tuple of:
      - labels: int64 tensor of zero-based class indices
      - flat_text: all token IDs of the batch concatenated into one 1-D tensor
      - starts: int64 tensor with the start index of each sample inside flat_text
    """
    labels = []
    token_tensors = []
    lengths = [0]
    for label, text in batch:
        # Dataset labels are 1-based; shift to 0-based class indices.
        labels.append(int(label) - 1)
        ids = torch.tensor(vocab(tokenizer(text)), dtype=torch.int64)
        token_tensors.append(ids)
        lengths.append(ids.size(0))
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Cumulative sum of lengths (excluding the last) gives each sample's start offset.
    starts = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(token_tensors)
    return label_tensor, flat_text, starts

# 1. Access the raw training dataset iterator.
# (The test iterator is created later — building the vocabulary only consumes
# the training split, so creating test_iter here would be discarded work.)
train_iter = AG_NEWS(split='train')

# 2. Prepare data processing pipelines
tokenizer = get_tokenizer('basic_english')

# Build the vocabulary from the training split; '<unk>' is the fallback
# index for out-of-vocabulary tokens at inference time.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter, tokenizer),
    min_freq=1,
    specials=['<unk>']
)
vocab.set_default_index(vocab['<unk>'])

# Re-create the training iterator (the one above was exhausted while building
# the vocabulary) and create the test iterator.
train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')

# Bind the vocabulary and tokenizer into the single-argument collate
# function that DataLoader expects.
def current_collate_batch(batch):
    return collate_batch(batch, vocab, tokenizer)

# 3. Generate data batch and iterator with DataLoader
BATCH_SIZE = 64

# Materializing the iterable datasets as lists gives map-style behavior,
# which enables shuffling and repeated epochs.
train_dataloader = DataLoader(
    list(train_iter),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=current_collate_batch,
)

test_dataloader = DataLoader(
    list(test_iter),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=current_collate_batch,
)

# Example usage: peek at the first training batch only.
for i, (labels, texts, offsets) in enumerate(train_dataloader):
    print(f"Batch {i+1}:")
    print(f"  Labels: {labels}")
    print(f"  Texts (concatenated token IDs): {texts}")
    print(f"  Offsets (start index of each text in 'texts'): {offsets}")
    break

view raw JSON →