Internet Archive Python Library

5.8.0 · active · verified Thu Apr 16

internetarchive is a Python interface to archive.org, providing both a command-line interface (CLI) and a Python API. It allows programmatic access to search, download, upload, and interact with various Internet Archive services. The library is actively maintained, with version 5.8.0 being the latest stable release, and new versions released periodically to add features, improve performance, and address bugs or security vulnerabilities.

Common errors

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to search for items, download files, and upload new content to the Internet Archive using the `internetarchive` Python library. It highlights the use of `search_items` to find content, `get_item` and `File.download` for downloading, and the `upload` function for creating new archive items. In this script, the upload step runs only when the `IA_ACCESS_KEY` and `IA_SECRET_KEY` environment variables are set; otherwise it is skipped.

import os
from internetarchive import search_items, get_item, upload
import tempfile

# --- Authentication ---
# Uploads and metadata modification require IA S3 keys, which you can
# generate at https://archive.org/account/s3.php
# This script reads them from the IA_ACCESS_KEY and IA_SECRET_KEY
# environment variables.
# Example: export IA_ACCESS_KEY='YOUR_ACCESS_KEY'
#          export IA_SECRET_KEY='YOUR_SECRET_KEY'

# --- 1. Search for items ---
# Print the identifier and title of the first few items matching a query.
# The title field is requested explicitly: without a `fields` argument the
# search results are expected to carry only the identifier, which would make
# every printed title None.
print("Searching for items tagged 'NASA'...")
search_results = search_items('subject:NASA', fields=['identifier', 'title'])
for i, result in enumerate(search_results.iter_as_results()):
    if i >= 3:  # Limit to 3 results for brevity
        break
    # .get() is used for the title because an item may have no title metadata.
    print(f"  - Identifier: {result['identifier']}, Title: {result.get('title')}")

# --- 2. Download a file from an item ---
# Fetch one image file from a public item into a temporary directory.
# Using an example item expected to exist with publicly downloadable files
print("\nAttempting to download a file from 'nasa_images_1960s'...")
try:
    item_to_download = get_item('nasa_images_1960s')  # Use a stable public item
    # Item.get_files() returns a generator: it is always truthy and cannot be
    # indexed, so take the first matching image file with next() and fall back
    # to None when nothing matches.
    matching_files = item_to_download.get_files(formats=['JPEG', 'PNG', 'image/jpeg'])
    file_to_download = next(iter(matching_files), None)
    if file_to_download is not None:
        print(f"Downloading {file_to_download.name}...")
        # Use tempfile for a safe, temporary download location
        with tempfile.TemporaryDirectory() as tmpdir:
            # The first positional parameter of File.download() is the target
            # file path, so the directory has to be passed as `destdir`.
            file_to_download.download(destdir=tmpdir)
            # download() reports success rather than a path; the file is
            # written under destdir using the file's own name.
            downloaded_path = os.path.join(tmpdir, file_to_download.name)
            print(f"Downloaded to: {downloaded_path}")
    else:
        print("No suitable files found to download from 'nasa_images_1960s'.")
except Exception as e:
    # Broad catch is deliberate: this is a top-level demo step and a failure
    # here should not prevent the rest of the quickstart from running.
    print(f"Error during download: {e}")


# --- 3. Upload a dummy file ---
# Create a small text file and upload it as a new item.
# Requires IA_ACCESS_KEY and IA_SECRET_KEY to be set as environment variables;
# the upload is skipped when either one is missing or empty.
access_key = os.environ.get('IA_ACCESS_KEY')
secret_key = os.environ.get('IA_SECRET_KEY')

if not (access_key and secret_key):
    print("\nSkipping upload example: IA_ACCESS_KEY or IA_SECRET_KEY not set.")
    print("Please set environment variables or use 'ia configure' to enable uploads.")
else:
    print("\nAttempting to upload a dummy file...")
    temp_file_name = "my_dummy_file.txt"  # Name the file gets inside the item
    temp_file_content = "This is a test upload from the internetarchive Python library."
    identifier = "my_unique_test_item_12345"  # Replace with a truly unique identifier

    # delete=False keeps the file on disk after the `with` block closes it, so
    # upload() can read it; it is removed in the `finally` clause below.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
        f.write(temp_file_content)
        temp_file_path = f.name

    metadata = {
        'title': f'My Test Item {identifier}',
        'description': 'A dummy item uploaded via Python library quickstart.',
        'mediatype': 'data',  # Required
        'collection': 'test_collection',  # Replace with a collection you have write access to
    }

    try:
        print(f"Uploading {temp_file_path} to {identifier}...")
        r = upload(
            identifier,
            # Map remote name -> local path so the item stores the file under
            # a stable name instead of the random temporary basename.
            files={temp_file_name: temp_file_path},
            metadata=metadata,
            # Pass the keys explicitly: they were read from this script's own
            # environment variable names and are otherwise never used.
            access_key=access_key,
            secret_key=secret_key,
        )
        print(f"Upload successful! Status: {r[0].status_code}")
        print(f"View item at: https://archive.org/details/{identifier}")
    except Exception as e:
        # Broad catch is deliberate for a top-level demo step.
        print(f"Error during upload: {e}")
    finally:
        # Always remove the local temporary file, whether or not the upload worked.
        os.remove(temp_file_path)

view raw JSON →