DeepSpeed

0.18.9 · active · verified Sat Apr 11

DeepSpeed is a deep learning optimization library for PyTorch, developed by Microsoft, that significantly reduces computing resources required for training and inference of large-scale models. It provides techniques such as ZeRO (Zero Redundancy Optimizer) for memory optimization, DeepSpeed-MoE for Mixture of Experts, and high-performance inference. Currently at version 0.18.9, it maintains an active development pace with frequent patch releases addressing bug fixes, performance enhancements, and new feature integrations, often every few weeks.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates a basic DeepSpeed setup for a simple PyTorch model. It defines a model, creates a dummy DeepSpeed configuration (`ds_config.json`), initializes DeepSpeed, and performs a few training steps. To run this script correctly and leverage DeepSpeed's distributed features (especially multi-GPU), you should use the DeepSpeed launcher command, e.g., `deepspeed --num_gpus=1 --deepspeed_config ds_config.json quickstart_script.py`. The script includes `torch.distributed.init_process_group` for basic local testing but the launcher handles distributed setup automatically.

import torch
import torch.nn as nn
import deepspeed
import json
import os

# 1. Define a simple PyTorch model
class SimpleModel(nn.Module):
    """Minimal regression model: one affine layer mapping 10 features to 1 output."""

    def __init__(self):
        super().__init__()
        # Single fully-connected layer; no activation, matching the MSE
        # regression objective used in the quickstart training loop.
        self.linear = nn.Linear(10, 1)

    def forward(self, x):
        out = self.linear(x)
        return out

# 2. Create a dummy DeepSpeed config file
ds_config = {
    "train_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "betas": [0.8, 0.999],
            "eps": 1e-8,
            "weight_decay": 0.01
        }
    },
    "fp16": {
        # BUG FIX: the original had `"enabled": true` — JSON syntax inside a
        # Python dict literal, which raises NameError at import time. The
        # Python literal is True/False. fp16 training also requires CUDA, so
        # only enable it when a GPU is actually available; on a GPU machine
        # this matches the original intent exactly.
        "enabled": torch.cuda.is_available(),
        "initial_scale_power": 16
    },
    "zero_optimization": {
        "stage": 1
    },
    "logging": {
        "steps_per_print": 2
    }
}

# Save the config to a temporary file so the `deepspeed` launcher (which
# takes --deepspeed_config ds_config.json) can consume the same settings.
config_path = "ds_config.json"
with open(config_path, "w") as f:
    json.dump(ds_config, f, indent=4)

# 3. Initialize DeepSpeed
def main():
    """Run a few dummy DeepSpeed training steps on SimpleModel.

    When the script is started with the `deepspeed` launcher, the
    distributed process group is set up automatically; the fallback below
    only matters when running `python quickstart_script.py` directly.
    Removes ds_config.json when the steps complete.
    """
    if not torch.distributed.is_initialized():
        try:
            # Fall back to a single-process group using the standard
            # environment variables (RANK / WORLD_SIZE / MASTER_ADDR /
            # MASTER_PORT), with defaults suitable for a local run.
            rank = int(os.environ.get('RANK', '0'))
            world_size = int(os.environ.get('WORLD_SIZE', '1'))
            master_addr = os.environ.get('MASTER_ADDR', 'localhost')
            master_port = os.environ.get('MASTER_PORT', '29500')
            torch.distributed.init_process_group(
                backend="nccl" if torch.cuda.is_available() else "gloo",
                rank=rank, world_size=world_size,
                init_method=f"tcp://{master_addr}:{master_port}")
        except RuntimeError as e:
            print(f"Warning: Failed to initialize torch.distributed process group directly. Error: {e}")
            print("This is expected if not launched by DeepSpeed runner. Proceeding with potential issues.")
            # Without a process group DeepSpeed cannot function correctly;
            # we proceed anyway so the quickstart fails with a clear error.

    model = SimpleModel()
    # Move model to CUDA if available for DeepSpeed operations
    if torch.cuda.is_available():
        model.cuda()

    # BUG FIX: the original also built a client-side torch.optim.Adam and
    # passed it to deepspeed.initialize while ds_config *also* declares an
    # "optimizer" section — DeepSpeed rejects that combination (you may
    # supply a client optimizer OR a config-defined one, not both). Let
    # DeepSpeed construct AdamW from the config; model_parameters tells it
    # which parameters to optimize. `config=` is the current kwarg name
    # (the legacy alias was `config_params=`). The unused LR scheduler slot
    # is discarded.
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config=ds_config,
    )

    # 4. Dummy data and training step
    input_data = torch.randn(2, 10)
    labels = torch.randn(2, 1)
    if torch.cuda.is_available():
        input_data = input_data.cuda()
        labels = labels.cuda()

    for i in range(3):
        output = model_engine(input_data)
        loss = nn.MSELoss()(output, labels)
        # DeepSpeed owns backward/step so it can apply loss scaling,
        # gradient accumulation and ZeRO partitioning.
        model_engine.backward(loss)
        model_engine.step()
        if torch.distributed.is_initialized():
            print(f"Rank {torch.distributed.get_rank()} - Step {i}, Loss: {loss.item():.4f}")
        else:
            print(f"Step {i}, Loss: {loss.item():.4f}")

    # Clean up the config file
    os.remove(config_path)

if __name__ == '__main__':
    # A real (multi-GPU) run should use the DeepSpeed launcher, e.g.:
    #   deepspeed --num_gpus=1 --deepspeed_config ds_config.json quickstart_script.py
    # Running `python quickstart_script.py` directly only exercises the
    # single-process fallback inside main().
    try:
        main()
    except Exception as e:
        # Best-effort quickstart boundary: print the full traceback so the
        # actual failure (usually missing distributed setup) is debuggable
        # instead of being reduced to str(e), then hint at the launcher.
        import traceback
        traceback.print_exc()
        print(f"An error occurred: {e}")
        print("Hint: DeepSpeed usually requires the `deepspeed` launcher for proper distributed setup.")
        print("Try running with: `deepspeed --num_gpus=1 --deepspeed_config ds_config.json quickstart_script.py`")

view raw JSON →