OleFileIO_PL for Microsoft OLE2 Files

0.42.1 · active · verified Fri Apr 17

OleFileIO_PL is a Python package designed to parse, read, and write Microsoft OLE2 files, also known as Structured Storage or Compound Document files (e.g., older Microsoft Office formats like .doc, .xls, .ppt). It is an improved version of the original OleFileIO module from the Python Image Library (PIL). The current version is 0.42.1, and it maintains an active, though not rapid, release cadence.

Common errors

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to check whether a file is an OLE document, open it, list its contents, and extract common metadata. Note that for full functionality, you must set the `actual_ole_file_path` variable to the path of an existing, valid OLE file.

import olefile
import os
import tempfile

# Quickstart: check whether files are OLE2 containers, open one, list its
# entries, peek at a stream, and print standard metadata.
#
# For demonstration, create a dummy (non-OLE) file to show what isOleFile()
# reports for ordinary data. In a real scenario, you would point to an actual
# OLE file (e.g., an old .doc, .xls, .ppt document).
dummy_file_path = os.path.join(tempfile.gettempdir(), "dummy_not_ole.txt")
with open(dummy_file_path, "w") as f:
    f.write("This is not an OLE file.")

# <<< IMPORTANT: REPLACE THIS with the actual path to your OLE file >>>
actual_ole_file_path = "path/to/your/actual_ole_file.doc"

try:
    # isOleFile() opens the path it is given, so a missing file raises
    # FileNotFoundError. Both checks therefore live inside the try block so
    # the error is reported and the dummy file is still cleaned up.
    print(f"Checking if '{dummy_file_path}' is an OLE file: {olefile.isOleFile(dummy_file_path)}")
    print(f"Checking if '{actual_ole_file_path}' is an OLE file: {olefile.isOleFile(actual_ole_file_path)}\n")

    # Attempt to open a placeholder file - this will likely fail unless
    # you replace 'actual_ole_file_path' with a real OLE file.
    print(f"Attempting to open '{actual_ole_file_path}'...")
    ole = olefile.OleFileIO(actual_ole_file_path)
    try:
        print(f"Successfully opened '{actual_ole_file_path}'.")

        # listdir() yields each stream/storage as a list of path components;
        # join them so nested entries print as 'storage/stream'.
        print("\nEntries:")
        for entry in ole.listdir():
            print(f"  - {'/'.join(entry)}")

        # Example: Access a stream if it exists (e.g., 'WordDocument' for .doc files)
        if ole.exists('WordDocument'):
            with ole.openstream('WordDocument') as stream:
                # Only the preview is needed, so avoid reading the whole stream.
                content = stream.read(100)
                print(f"\nFirst 100 bytes of 'WordDocument' stream: {content}")
        else:
            print("\n'WordDocument' stream not found.")

        # Example: Read standard metadata properties (if available).
        # get_metadata() parses the SummaryInformation and
        # DocumentSummaryInformation property streams; attributes that are
        # absent from the file are left as None.
        print("\nMetadata (if present):")
        meta = ole.get_metadata()
        print(f"  Author: {meta.author}")
        print(f"  Title: {meta.title}")
        print(f"  Creation Time: {meta.create_time}")
        print(f"  Last Saved Time: {meta.last_saved_time}")
    finally:
        # Close explicitly rather than relying on `with`, which older
        # olefile releases do not support on OleFileIO.
        ole.close()

except FileNotFoundError:
    print(f"\nError: '{actual_ole_file_path}' not found. Please provide a valid path to an OLE file.")
except OSError:
    # olefile signals a malformed / non-OLE file with IOError (an alias of
    # OSError) or a subclass of it. FileNotFoundError is handled above because
    # it is also an OSError subclass.
    print(f"\nError: '{actual_ole_file_path}' is not a valid OLE file. Please provide a real OLE file.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
finally:
    os.remove(dummy_file_path) # Clean up dummy file
    print(f"\nCleaned up dummy file: {dummy_file_path}")

view raw JSON →