Autoevals

0.2.0 · active · verified Sat Apr 11

Autoevals is a universal library for quickly and easily evaluating AI model outputs. Developed by the team at Braintrust, it bundles together a variety of automatic evaluation methods including LLM-as-a-judge, heuristic (e.g., Levenshtein distance), and statistical (e.g., BLEU) evaluations. Currently at version 0.2.0, the library is actively maintained with frequent updates.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to use various autoevals scorers, including an LLM-as-a-judge evaluator (Factuality), a heuristic evaluator (Levenshtein), and a numeric evaluator (NumericDiff). It covers both synchronous and asynchronous evaluation patterns for LLM-based scorers and includes a placeholder for the OpenAI API key, which is essential for LLM evaluations.

import os
import asyncio
from autoevals.llm import Factuality
from autoevals import Levenshtein
from autoevals.number import NumericDiff

# LLM-based evaluators need an OpenAI-compatible API key.
# setdefault keeps any key already present in the environment and only
# falls back to the placeholder when none is configured.
os.environ.setdefault('OPENAI_API_KEY', 'YOUR_OPENAI_API_KEY_HERE')

async def main():
    """Run three autoevals demos: an LLM judge, a string heuristic, and a numeric diff."""
    # Example 1: LLM-as-a-judge evaluation (requires an LLM API key)
    print("--- LLM Factuality Evaluation ---")
    judge = Factuality()
    question = "Which country has the highest population?"
    answer = "People's Republic of China"
    reference = "China"

    # Only call out to the LLM when a real key was configured.
    key_configured = os.environ['OPENAI_API_KEY'] != 'YOUR_OPENAI_API_KEY_HERE'
    if not key_configured:
        print("Skipping LLM Factuality evaluation: OPENAI_API_KEY not set.")
    else:
        # Synchronous scoring call
        sync_result = judge(answer, reference, input=question)
        print(f"Factuality score (sync): {sync_result.score}")
        print(f"Factuality rationale (sync): {sync_result.metadata.get('rationale')}")

        # Asynchronous scoring call
        async_result = await judge.eval_async(answer, reference, input=question)
        print(f"Factuality score (async): {async_result.score}")
        print(f"Factuality rationale (async): {async_result.metadata.get('rationale')}")

    # Example 2: Heuristic evaluation (Levenshtein distance)
    print("\n--- Levenshtein Distance Evaluation ---")
    edit_distance_scorer = Levenshtein()
    candidate = "hello world"
    target = "hallo world"
    edit_result = edit_distance_scorer(candidate, target)
    print(f"Levenshtein score: {edit_result.score}")
    print(f"Levenshtein metadata: {edit_result.metadata}")

    # Example 3: Numeric difference evaluation
    print("\n--- Numeric Difference Evaluation ---")
    diff_scorer = NumericDiff()
    diff_result = diff_scorer(105, 100)
    print(f"NumericDiff score: {diff_result.score}")
    print(f"NumericDiff metadata: {diff_result.metadata}")

if __name__ == "__main__":
    # Drive the async demo with a fresh event loop; only runs when the file
    # is executed as a script, not when it is imported.
    asyncio.run(main())

view raw JSON →