{"id":22472,"library":"torchft-nightly","title":"TorchFT Nightly","description":"TorchFT (Fault Tolerance) is a PyTorch library providing fault tolerant distributed training with automatic recovery from node failures. The nightly version (2026.4.27) tracks the latest development on PyTorch main branch. Requires Python >=3.8. Released daily.","status":"active","version":"2026.4.27","language":"python","source_language":"en","source_url":"https://github.com/pytorch/torchft","tags":["pytorch","distributed-training","fault-tolerance","nightly","deep-learning"],"install":[{"cmd":"pip install torchft-nightly","lang":"bash","label":"Install nightly"}],"dependencies":[{"reason":"Core dependency; TorchFT wraps PyTorch distributed primitives.","package":"torch","optional":false}],"imports":[{"note":"TorchftManager is defined in torchft.manager module; direct import from torchft does not expose it.","wrong":"from torchft import TorchftManager","symbol":"TorchftManager","correct":"from torchft.manager import TorchftManager"},{"note":"The module is named 'elastic', not 'elastic_agent'. Note that torchft.elastic is itself deprecated since nightly 2026.3.15 in favor of torchft.manager (see warnings).","wrong":"from torchft.elastic_agent import TorchftElasticAgent","symbol":"TorchftElasticAgent","correct":"from torchft.elastic import TorchftElasticAgent"}],"quickstart":{"code":"import os\nimport torch\nimport torch.distributed as dist\nfrom torchft.manager import TorchftManager\n\n# Initialize the process group (example: NCCL backend)\ndist.init_process_group(backend='nccl')\n\n# Create a TorchFT manager with fault tolerance\nmanager = TorchftManager(\n    store_addr=os.environ.get('STORE_ADDR', 'localhost:1234'),\n    world_size=4,\n    rank=dist.get_rank(),\n    heartbeat_interval=1.0,\n)\n\n# Create the model and optimizer\nmodel = torch.nn.Linear(10, 10).cuda()\noptimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n\nfor step in range(100):\n    inputs = torch.randn(32, 10, device='cuda')\n    outputs = model(inputs)\n    loss = outputs.sum()\n    loss.backward()\n    optimizer.step()\n    optimizer.zero_grad()\n    manager.commit()  # checkpoint after each step\n\n# Cleanup\nmanager.shutdown()\ndist.destroy_process_group()","lang":"python","description":"Initialize a fault-tolerant distributed training loop using TorchFT manager."},"warnings":[{"fix":"Replace `from torchft.elastic import TorchftElasticAgent` with `from torchft.manager import TorchftManager`.","message":"The `torchft.elastic` module is deprecated in favor of `torchft.manager` since nightly build 2026.3.15. Use `TorchftManager` instead of `TorchftElasticAgent`.","severity":"deprecated","affected_versions":">=2026.3.15"},{"fix":"Change `TorchftManager(store=my_store, ...)` to `TorchftManager(store_addr='host:port', ...)` and use a TCPStore internally.","message":"In nightly builds after 2026.4.10, the `TorchftManager` constructor requires `store_addr` as a string; previously it accepted optional `Store` object. This may break code using `torch.distributed.Store`.","severity":"breaking","affected_versions":">=2026.4.10"},{"fix":"Use `torch.cuda.is_available()` to assert GPU availability before using TorchFT.","message":"TorchFT nightly does not support CPU-only training; it requires CUDA. Running on CPU may cause silent hangs during heartbeat.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-04-27T00:00:00.000Z","next_check":"2026-07-26T00:00:00.000Z","problems":[{"fix":"Run `pip install torchft-nightly`. Note: There is no stable `torchft` package on PyPI; only nightly is published.","cause":"Installed `torchft` instead of `torchft-nightly` or forgot to install.","error":"ModuleNotFoundError: No module named 'torchft'"},{"fix":"Use `from torchft.manager import TorchftManager` and ensure the subpackage is installed.","cause":"Direct import of `TorchftManager` from top-level package instead of its submodule.","error":"AttributeError: module 'torchft' has no attribute 'TorchftManager'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}