{"id":24025,"library":"megatron-energon","title":"Megatron-Energon","description":"NVIDIA Megatron-Energon is a multi-modal data loader library for large-scale deep learning, particularly for training large language models (LLMs) and vision-language models. It supports tar-based WebDataset, JSONL files, and polylithic datasets, with features like caching, AV decoding, and FUSE mount. Current version is 7.3.2, with active development and a release cadence of roughly monthly.","status":"active","version":"7.3.2","language":"python","source_language":"en","source_url":"https://github.com/NVIDIA/Megatron-Energon","tags":["deep-learning","data-loader","nvidia","megatron","multi-modal","webdataset"],"install":[{"cmd":"pip install megatron-energon","lang":"bash","label":"Core install"},{"cmd":"pip install megatron-energon[all]","lang":"bash","label":"All extras (AV, MSC, fuse)"}],"dependencies":[],"imports":[{"note":"Incorrect import path; module is under megatron.energon.","wrong":"from energon import get_train_dataset","symbol":"get_train_dataset","correct":"from megatron.energon import get_train_dataset"},{"note":"Incorrect import path; module is under megatron.energon.","wrong":"from energon import WorkerConfig","symbol":"WorkerConfig","correct":"from megatron.energon import WorkerConfig"},{"note":"Correct import for Webdataset class.","symbol":"Webdataset","correct":"from megatron.energon import Webdataset"}],"quickstart":{"code":"import os\nfrom megatron.energon import get_train_dataset, WorkerConfig\n\n# Create a simple dataset\nworker_config = WorkerConfig(\n    rank=0,\n    world_size=1,\n    num_workers=2,\n)\n\ndataset = get_train_dataset(\n    path=os.environ.get('DATASET_PATH', 'path/to/dataset'),\n    worker_config=worker_config,\n    batch_size=4,\n    shuffle_buffer_size=100,\n)\n\nfor batch in dataset:\n    # Each batch is a dict with keys like 'rgb', 'json'\n    print(batch.keys())\n    break","lang":"python","description":"Minimal example loading a 
training dataset with Megatron-Energon."},"warnings":[{"fix":"Run `energon prepare` again or use an older version (6.0.1) if you must keep the legacy format.","message":"Version 7.0.0 introduced polylithic datasets and a new AVDecoder, breaking datasets created with v6.x. Existing datasets may need re-preparation.","severity":"breaking","affected_versions":">=7.0.0"},{"fix":"Implement new save/restore logic as per updated docs; old checkpoints are not compatible.","message":"In version 6.0.0, the save/restore mechanism changed: the worker dimension is now the outer dimension. Checkpoint compatibility is broken.","severity":"breaking","affected_versions":">=6.0.0 <7.0.0"},{"fix":"Upgrade to >=7.3.2 or avoid concurrent tar readers.","message":"The `disable_cache` option (for ITarReader) is only available from 7.3.2 onward. Before that, caching may cause thread-safety issues.","severity":"gotcha","affected_versions":"<7.3.2"},{"fix":"Update to EPath paths as described in the migration guide.","message":"The use of fsspec was replaced by EPath in 6.0.1. 
Code relying on fsspec paths may break.","severity":"deprecated","affected_versions":"<6.0.1"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Use `from megatron.energon import ...`","cause":"Attempting to import directly from 'energon' instead of 'megatron.energon'.","error":"ModuleNotFoundError: No module named 'energon'"},{"fix":"Upgrade to >=7.2.2 which includes a fix for this locking issue.","cause":"Concurrent SQLite access in cache database without proper retry logic.","error":"sqlite3.OperationalError: locking protocol"},{"fix":"Ensure your dataset's `.nvs` file includes the correct subflavors (e.g., 'image', 'video', 'audio').","cause":"The dataset metadata does not match the sample type you are requesting.","error":"ValueError: Do not know how to load sample, given the available subflavors"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}