Data Version Control (DVC)

3.67.1 · active · verified Sat Apr 11

DVC (Data Version Control) extends Git to handle large files and machine learning pipelines: it provides version control for datasets and models and enables reproducible ML workflows. It stores data and model files in a cache outside of Git and supports various remote storage platforms (S3, Azure, Google Cloud, SSH, etc.). The current version is 3.67.1; new releases are frequent.

Warnings

Install

Imports

Quickstart

This quickstart first sets up a minimal DVC project by running the usual shell commands from Python via `subprocess`: it initializes DVC within a Git repository and tracks a `data.csv` file. It then demonstrates how to use the `dvc.api.read()` function in Python to programmatically access the content of the DVC-tracked file.

import os
import subprocess
import dvc.api

# --- CLI Setup (normally run in shell) ---
# This part performs the initial DVC project setup if it has not been done yet.
# In a real scenario, you'd run these commands in your terminal.

def setup_dvc_project(repo_dir='dvc_quickstart_repo'):
    """Create (or reuse) a Git + DVC repository that tracks a small ``data.csv``.

    Every command is run with ``cwd=repo_dir`` rather than by changing the
    process working directory, so a failing command cannot leave the caller
    stranded inside the repository. The function may be re-run against an
    existing repository: a commit is only created when something is staged.

    Args:
        repo_dir: Directory that holds the quickstart repository. It is
            created if it does not exist.

    Raises:
        subprocess.CalledProcessError: If a ``git`` or ``dvc`` command exits
            with a non-zero status.
        OSError: If the ``git`` or ``dvc`` executable cannot be started.
    """
    os.makedirs(repo_dir, exist_ok=True)

    def run(*cmd):
        # Run one command inside the repository and fail loudly on error.
        subprocess.run(list(cmd), cwd=repo_dir, check=True)

    def commit_staged(message):
        # `git diff --cached --quiet` exits non-zero when the index holds
        # changes. Committing with nothing staged would fail under check=True,
        # which is what used to break a second run of this function.
        probe = subprocess.run(
            ['git', 'diff', '--cached', '--quiet'], cwd=repo_dir
        )
        if probe.returncode != 0:
            run('git', 'commit', '-m', message)

    if not os.path.exists(os.path.join(repo_dir, '.git')):
        run('git', 'init', '-b', 'main')

    # Ensure DVC is initialized, then record its bookkeeping files in Git.
    if not os.path.exists(os.path.join(repo_dir, '.dvc')):
        run('dvc', 'init')
    run('git', 'add', '.dvcignore', '.dvc/config', '.dvc/.gitignore')
    commit_staged('Initialize DVC')

    # Create a dummy data file.
    with open(os.path.join(repo_dir, 'data.csv'), 'w') as f:
        f.write('col1,col2\n1,A\n2,B\n3,C\n')

    # Track the data with DVC and commit the resulting metadata to Git.
    run('dvc', 'add', 'data.csv')
    to_stage = ['data.csv.dvc']
    # `dvc add` also writes a .gitignore entry for the data file; commit it
    # too when present so Git never picks up the raw data.
    if os.path.exists(os.path.join(repo_dir, '.gitignore')):
        to_stage.append('.gitignore')
    run('git', 'add', *to_stage)
    commit_staged('Add data.csv')

    print(f"DVC project setup complete in '{repo_dir}'")

# Build the example repository before using the Python API.
setup_dvc_project()

# --- Python API Usage ---
# Read the DVC-tracked file back programmatically instead of via the CLI.
repo_path = 'dvc_quickstart_repo'
file_path = 'data.csv'

try:
    # rev may be any Git commit, tag, or branch; 'HEAD' is used here.
    content = dvc.api.read(path=file_path, rev='HEAD', repo=repo_path)
    report = f"\nContent of {file_path} from DVC repo '{repo_path}':\n{content}"
    print(report)

    # The same call could read a params.yaml file (none is created by this
    # quickstart); its text could then be parsed with yaml.safe_load().
except Exception as err:
    print(f"An error occurred while reading DVC-tracked file: {err}")

view raw JSON →