Using the Python API
Run evaluations programmatically with the Python API so you can integrate them into your own applications and workflows.
Basic Usage
Instead of using YAML config files, you can run evaluations directly from Python:
from judge_llm import evaluate
report = evaluate(
    dataset={
        "loader": "local_file",
        "paths": ["./tests.json"]
    },
    providers=[
        {"type": "gemini", "agent_id": "my_agent"}
    ],
    evaluators=[
        {"type": "response_evaluator"}
    ],
    reporters=[
        {"type": "console"}
    ]
)
# Access results
print(f"Success Rate: {report.success_rate * 100:.1f}%")
print(f"Total Cost: ${report.total_cost:.4f}")
print(f"Tests Passed: {report.summary['successful_executions']}/{report.summary['total_executions']}")
Accessing Results
The evaluate() function returns an EvaluationReport object with all results:
report = evaluate(...)
# Summary statistics
print(f"Total Cost: ${report.total_cost:.4f}")
print(f"Total Time: {report.total_time:.2f}s")
print(f"Success Rate: {report.success_rate * 100:.1f}%")
print(f"Overall Success: {report.overall_success}")
# Detailed summary
print(f"Total Tests: {report.summary['total_executions']}")
print(f"Passed: {report.summary['successful_executions']}")
print(f"Failed: {report.summary['failed_executions']}")
# Individual test cases
for test_case in report.test_cases:
    print(f"\nTest: {test_case.eval_id}")
    print(f"  Status: {'✅ PASSED' if test_case.passed else '❌ FAILED'}")
    print(f"  Cost: ${test_case.cost:.4f}")
    print(f"  Time: {test_case.time_taken:.2f}s")

    # Evaluator results
    for eval_result in test_case.evaluation_results:
        status = "✅" if eval_result.passed else "❌"
        print(f"  {status} {eval_result.evaluator_type}: {eval_result.reason}")
Loading from Environment
Use environment variables for sensitive data:
import os
from dotenv import load_dotenv
from judge_llm import evaluate
# Load .env file
load_dotenv()
report = evaluate(
    dataset={
        "loader": "local_file",
        "paths": [os.getenv("TEST_FILE", "./tests.json")]
    },
    providers=[{
        "type": "gemini",
        "agent_id": os.getenv("AGENT_ID", "default"),
        "api_key": os.getenv("GEMINI_API_KEY")
    }],
    evaluators=[{"type": "response_evaluator"}]
)
Multiple Providers
Compare different models programmatically:
report = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests.json"]},
    providers=[
        {"type": "gemini", "agent_id": "gemini_flash"},
        {"type": "openai", "agent_id": "gpt4"},
        {"type": "anthropic", "agent_id": "claude"}
    ],
    evaluators=[{"type": "response_evaluator"}],
    reporters=[
        {"type": "console"},
        {"type": "html", "output_path": "./comparison.html"}
    ]
)
# Analyze by provider
import pandas as pd
data = [{
    "provider": tc.provider_type,
    "passed": tc.passed,
    "cost": tc.cost,
    "latency": tc.time_taken
} for tc in report.test_cases]

df = pd.DataFrame(data)

print("\nResults by Provider:")
print(df.groupby("provider").agg({
    "passed": "mean",
    "cost": ["mean", "sum"],
    "latency": "mean"
}))
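From the same DataFrame you can also rank providers. The sketch below orders them by pass rate and then by average cost, which is just one possible ranking:

summary = (
    df.groupby("provider")
    .agg(
        pass_rate=("passed", "mean"),
        avg_cost=("cost", "mean"),
        avg_latency=("latency", "mean"),
    )
    .sort_values(["pass_rate", "avg_cost"], ascending=[False, True])
)
print(summary)
print(f"\nTop provider by pass rate, then cost: {summary.index[0]}")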
Error Handling
Handle errors gracefully:
from judge_llm import evaluate
try:
    report = evaluate(
        dataset={"loader": "local_file", "paths": ["./tests.json"]},
        providers=[{"type": "gemini", "agent_id": "test"}],
        evaluators=[{"type": "response_evaluator"}]
    )

    if report.overall_success:
        print("✅ All tests passed!")
    else:
        print(f"❌ {report.summary['failed_executions']} tests failed")

        # Print failed tests
        for tc in report.test_cases:
            if not tc.passed:
                print(f"  - {tc.eval_id}")
                for eval_result in tc.evaluation_results:
                    if not eval_result.passed:
                        print(f"    ✗ {eval_result.evaluator_type}: {eval_result.reason}")

except ValueError as e:
    print(f"Configuration error: {e}")
except Exception as e:
    print(f"Evaluation failed: {e}")
Custom Reporters
Generate custom output formats:
report = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests.json"]},
    providers=[{"type": "gemini", "agent_id": "test"}],
    evaluators=[{"type": "response_evaluator"}]
)
# Save to CSV manually
import csv
with open("results.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["eval_id", "passed", "cost", "time"])
for tc in report.test_cases:
writer.writerow([
tc.eval_id,
tc.passed,
tc.cost,
tc.time_taken
])
print("Results saved to results.csv")
Conditional Evaluation
Run different evaluations based on conditions:
import os
from judge_llm import evaluate
environment = os.getenv("ENVIRONMENT", "development")
if environment == "production":
# Stricter evaluation for production
evaluators = [
{"type": "response_evaluator"},
{"type": "trajectory_evaluator"},
{"type": "cost_evaluator", "max_cost": 0.01},
{"type": "latency_evaluator", "max_latency": 3.0}
]
else:
# Lenient for development
evaluators = [
{"type": "response_evaluator"}
]
report = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests.json"]},
    providers=[{"type": "gemini", "agent_id": f"{environment}_agent"}],
    evaluators=evaluators
)
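Reporters can follow the same pattern: build the list conditionally and pass it as reporters=... in the call above. A sketch (the HTML output path is just an example):

reporters = [{"type": "console"}]
if environment == "production":
    # Keep an HTML artifact only for production runs
    reporters.append({"type": "html", "output_path": "./prod_report.html"})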
Integration with Testing Frameworks
pytest
# test_llm.py
import pytest
from judge_llm import evaluate
def test_llm_evaluation():
    """Test that LLM passes all evaluations"""
    report = evaluate(
        dataset={"loader": "local_file", "paths": ["./tests.json"]},
        providers=[{"type": "gemini", "agent_id": "test"}],
        evaluators=[{"type": "response_evaluator"}]
    )

    assert report.overall_success, f"{report.summary['failed_executions']} tests failed"
    assert report.success_rate >= 0.9, f"Success rate too low: {report.success_rate:.1%}"

def test_cost_within_budget():
    """Test that evaluation stays within budget"""
    report = evaluate(
        dataset={"loader": "local_file", "paths": ["./tests.json"]},
        providers=[{"type": "gemini", "agent_id": "test"}],
        evaluators=[{"type": "cost_evaluator", "max_cost": 0.01}]
    )

    assert report.total_cost < 1.0, f"Total cost too high: ${report.total_cost:.2f}"
Run with pytest:
pytest test_llm.py -v
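If several tests inspect the same report, running evaluate() once and sharing the result through a fixture avoids paying for the evaluation in every test. A sketch, assuming one evaluation run is representative for the whole module:

# test_llm.py
import pytest
from judge_llm import evaluate

@pytest.fixture(scope="module")
def report():
    """Run the evaluation once per module and share the report."""
    return evaluate(
        dataset={"loader": "local_file", "paths": ["./tests.json"]},
        providers=[{"type": "gemini", "agent_id": "test"}],
        evaluators=[{"type": "response_evaluator"}]
    )

def test_overall_success(report):
    assert report.overall_success

def test_success_rate(report):
    assert report.success_rate >= 0.9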
unittest
# test_llm.py
import unittest
from judge_llm import evaluate
class TestLLMEvaluation(unittest.TestCase):
    def test_all_pass(self):
        """Test that all evaluations pass"""
        report = evaluate(
            dataset={"loader": "local_file", "paths": ["./tests.json"]},
            providers=[{"type": "gemini", "agent_id": "test"}],
            evaluators=[{"type": "response_evaluator"}]
        )
        self.assertTrue(report.overall_success)
        self.assertGreaterEqual(report.success_rate, 0.9)

    def test_specific_case(self):
        """Test a specific test case"""
        report = evaluate(
            dataset={"loader": "local_file", "paths": ["./tests.json"]},
            providers=[{"type": "gemini", "agent_id": "test"}],
            evaluators=[{"type": "response_evaluator"}]
        )
        # Find the specific test
        test_001 = next(tc for tc in report.test_cases if tc.eval_id == "test_001")
        self.assertTrue(test_001.passed, "test_001 should pass")

if __name__ == '__main__':
    unittest.main()
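As with the pytest fixture above, you can avoid re-running the evaluation for every test method by caching the report in setUpClass; a brief sketch:

class TestLLMEvaluationCached(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Run the evaluation once and reuse the report in every test method
        cls.report = evaluate(
            dataset={"loader": "local_file", "paths": ["./tests.json"]},
            providers=[{"type": "gemini", "agent_id": "test"}],
            evaluators=[{"type": "response_evaluator"}]
        )

    def test_all_pass(self):
        self.assertTrue(self.report.overall_success)

    def test_success_rate(self):
        self.assertGreaterEqual(self.report.success_rate, 0.9)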
Registering Custom Components
Register custom components for reuse:
from judge_llm import evaluate, register_evaluator
from my_evaluators import SafetyEvaluator
# Register once
register_evaluator("safety", SafetyEvaluator)
# Use by name in multiple evaluations
report1 = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests1.json"]},
    providers=[{"type": "gemini", "agent_id": "test1"}],
    evaluators=[{"type": "safety"}]  # Use registered name
)

report2 = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests2.json"]},
    providers=[{"type": "gemini", "agent_id": "test2"}],
    evaluators=[{"type": "safety"}]  # Use registered name
)
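In a test suite, registration fits naturally into a shared setup hook such as a pytest conftest.py, so every test file can refer to the evaluator by name. A sketch reusing the placeholder SafetyEvaluator from the example above:

# conftest.py
from judge_llm import register_evaluator
from my_evaluators import SafetyEvaluator

# Runs once at collection time, before any test calls evaluate()
register_evaluator("safety", SafetyEvaluator)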
Next Steps
- Configuration Guide - Learn all configuration options
- Python API Reference - Complete API documentation
- Custom Evaluators - Build custom evaluators
- Examples - See more examples
Related Documentation
- CLI Reference - Command-line usage
- Basic Usage - Quick start guide
- Custom Reporters - Build custom reporters