{"library":"megatron-core","title":"NVIDIA Megatron Core","description":"Megatron Core is a Python library developed by NVIDIA for building highly efficient and scalable transformer-based models, especially for large-scale distributed training. It provides fundamental building blocks for tensor and pipeline parallelism. The version verified here is 0.16.1; the project follows an active release cadence with frequent minor releases.","language":"python","status":"active","last_verified":"2026-05-17","install":{"commands":["pip install megatron-core","pip install 'megatron-core[cuda]' # For full CUDA/cuDNN integration if needed beyond base"],"cli":null},"imports":["from megatron.core.tensor_parallel.layers import ColumnParallelLinear","from megatron.core.tensor_parallel.layers import RowParallelLinear","from megatron.core.transformer.transformer_block import TransformerBlock","from megatron.core.transformer.transformer_layer import TransformerLayer","from megatron.core import dist_init"],"auth":{"required":false,"env_vars":[]},"quickstart":{"code":"import os\nimport torch\nimport torch.distributed as dist\nfrom megatron.core.tensor_parallel.layers import ColumnParallelLinear\nfrom megatron.core import dist_init\n\n# Minimal distributed setup for demonstration purposes.\n# In a real scenario, these env vars would be set by a launcher (e.g., torchrun)\n# and dist.init_process_group would be called globally.\nif not dist.is_initialized():\n    os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', 'localhost')\n    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', '29500')\n    os.environ['RANK'] = os.environ.get('RANK', '0')\n    # Set WORLD_SIZE to 1 for a single-GPU test without a full distributed setup\n    os.environ['WORLD_SIZE'] = os.environ.get('WORLD_SIZE', '1') \n    \n    if torch.cuda.is_available() and int(os.environ['WORLD_SIZE']) > 0:\n        try:\n            dist.init_process_group(backend='nccl', rank=int(os.environ['RANK']), 
world_size=int(os.environ['WORLD_SIZE']))\n            print(\"PyTorch distributed group initialized with NCCL.\")\n        except Exception as e:\n            print(f\"Warning: Could not initialize NCCL backend: {e}. Falling back to CPU/non-distributed.\")\n            os.environ['WORLD_SIZE'] = '1'\n            if dist.is_initialized(): # Destroy if partial init failed\n                dist.destroy_process_group()\n    else:\n        print(\"Warning: CUDA not available or WORLD_SIZE=0. Skipping torch.distributed init.\")\n        os.environ['WORLD_SIZE'] = '1'\n\n# Set Megatron-Core specific parallel configuration\n# This is crucial for Megatron-Core layers to correctly interpret parallel ranks.\nif dist.is_initialized():\n    dist_init.set_tensor_model_parallel_world_size(int(os.environ['WORLD_SIZE']))\n    dist_init.set_tensor_model_parallel_rank(int(os.environ['RANK']))\nelse:\n    # Fallback for CPU-only or non-distributed setup (effectively no parallelism)\n    dist_init.set_tensor_model_parallel_world_size(1)\n    dist_init.set_tensor_model_parallel_rank(0)\n\n# Define a simple parallel linear layer\nhidden_size = 128\noutput_size = 256\n\ntry:\n    # ColumnParallelLinear shards the weight matrix along its output (column) dimension across GPUs.\n    # If world_size > 1, each rank will only compute a part of the output.\n    # gather_output=True means the output is gathered on all ranks at the end.\n    linear_layer = ColumnParallelLinear(\n        input_size=hidden_size,\n        output_size=output_size,\n        gather_output=True\n    )\n    if torch.cuda.is_available():\n        linear_layer.cuda()\n\n    # Create a dummy input tensor\n    # Input size should match hidden_size. 
Batch and sequence length can vary.\n    input_tensor = torch.randn(2, 4, hidden_size)\n    if torch.cuda.is_available():\n        input_tensor = input_tensor.cuda()\n\n    # Perform a forward pass\n    output_tensor = linear_layer(input_tensor)\n\n    print(f\"\\nMegatron-Core ColumnParallelLinear initialized successfully.\")\n    print(f\"Input shape: {input_tensor.shape}\")\n    print(f\"Output shape (gathered): {output_tensor.shape}\")\n    print(f\"Output device: {output_tensor.device}\")\n\nexcept Exception as e:\n    print(f\"An error occurred during Megatron-Core layer execution: {e}\")\n\nfinally:\n    # Clean up distributed process group if initialized\n    if dist.is_initialized():\n        dist.destroy_process_group()\n","lang":"python","description":"This quickstart demonstrates how to initialize a basic distributed environment (required for Megatron-Core components) and instantiate a `ColumnParallelLinear` layer. It showcases the fundamental usage pattern of defining a parallelized model component. 
For actual distributed training, `torch.distributed.launch` or `torchrun` should be used to set up the environment variables.","tag":null,"tag_description":null,"last_tested":null,"results":[]},"compatibility":{"tag":null,"tag_description":null,"last_tested":"2026-05-17","installed_version":"0.16.1","pypi_latest":"0.17.0","is_stale":true,"summary":{"python_range":"3.9–3.13","success_rate":40,"avg_install_s":71.7,"avg_import_s":17.19,"wheel_type":"wheel"},"results":[{"runtime":"python:3.10-alpine","python_version":"3.10","os_libc":"alpine (musl)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.10-alpine","python_version":"3.10","os_libc":"alpine (musl)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":0.1,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.10-slim","python_version":"3.10","os_libc":"slim (glibc)","variant":"cuda","exit_code":0,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.10-slim","python_version":"3.10","os_libc":"slim (glibc)","variant":"megatron-core","exit_code":0,"wheel_type":"wheel","failure_reason":null,"import_side_effects":"noisy","install_time_s":75.1,"import_time_s":13.69,"mem_mb":167.1,"disk_size":"4.7G"},{"runtime":"python:3.11-alpine","python_version":"3.11","os_libc":"alpine (musl)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.11-alpine","python_version":"3.11","os_libc":"alpine 
(musl)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":0.1,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.11-slim","python_version":"3.11","os_libc":"slim (glibc)","variant":"cuda","exit_code":0,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.11-slim","python_version":"3.11","os_libc":"slim (glibc)","variant":"megatron-core","exit_code":0,"wheel_type":"wheel","failure_reason":null,"import_side_effects":"noisy","install_time_s":76.1,"import_time_s":19.51,"mem_mb":187.2,"disk_size":"4.8G"},{"runtime":"python:3.12-alpine","python_version":"3.12","os_libc":"alpine (musl)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.12-alpine","python_version":"3.12","os_libc":"alpine (musl)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":0.1,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.12-slim","python_version":"3.12","os_libc":"slim (glibc)","variant":"cuda","exit_code":0,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.12-slim","python_version":"3.12","os_libc":"slim (glibc)","variant":"megatron-core","exit_code":0,"wheel_type":"wheel","failure_reason":null,"import_side_effects":"noisy","install_time_s":67.8,"import_time_s":19.15,"mem_mb":180.5,"disk_size":"4.8G"},{"runtime":"python:3.13-alpine","python_version":"3.13","os_libc":"alpine 
(musl)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.13-alpine","python_version":"3.13","os_libc":"alpine (musl)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.13-slim","python_version":"3.13","os_libc":"slim (glibc)","variant":"cuda","exit_code":0,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.13-slim","python_version":"3.13","os_libc":"slim (glibc)","variant":"megatron-core","exit_code":0,"wheel_type":"wheel","failure_reason":null,"import_side_effects":"noisy","install_time_s":67.8,"import_time_s":16.39,"mem_mb":184.6,"disk_size":"4.8G"},{"runtime":"python:3.9-alpine","python_version":"3.9","os_libc":"alpine (musl)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.9-alpine","python_version":"3.9","os_libc":"alpine (musl)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"build_error","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.9-slim","python_version":"3.9","os_libc":"slim (glibc)","variant":"cuda","exit_code":1,"wheel_type":null,"failure_reason":"timeout","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.9-slim","python_version":"3.9","os_libc":"slim 
(glibc)","variant":"megatron-core","exit_code":1,"wheel_type":null,"failure_reason":"timeout","import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null}]}}