
Python API Reference

Complete Python API reference for programmatic evaluation and custom integrations.

Core Functions

evaluate()

The main entry point for running evaluations programmatically.

from judge_llm import evaluate

report = evaluate(
dataset: dict,
providers: list[dict],
evaluators: list[dict],
reporters: list[dict] = None,
validate: bool = True
) -> EvaluationReport

Parameters:

| Parameter | Type | Description | Required |
|---|---|---|---|
| `dataset` | `dict` | Dataset loader configuration | Yes |
| `providers` | `list[dict]` | List of provider configurations | Yes |
| `evaluators` | `list[dict]` | List of evaluator configurations | Yes |
| `reporters` | `list[dict]` | List of reporter configurations | No |
| `validate` | `bool` | Whether to validate config (default: `True`) | No |

Returns: EvaluationReport object containing all results

Example:

from judge_llm import evaluate

report = evaluate(
dataset={
"loader": "local_file",
"paths": ["./tests.json"]
},
providers=[
{"type": "gemini", "agent_id": "test_agent"}
],
evaluators=[
{"type": "response_evaluator"}
],
reporters=[
{"type": "console"},
{"type": "json", "output_path": "./results.json"}
]
)

# Access results
print(f"Success rate: {report.success_rate * 100:.1f}%")
print(f"Total cost: ${report.total_cost:.4f}")
print(f"Passed: {report.summary['successful_executions']}/{report.summary['total_executions']}")

Registration Functions

register_provider()

Register a custom provider class.

from judge_llm import register_provider

register_provider(name: str, provider_class: Type[BaseProvider])

Example:

from judge_llm import register_provider
from judge_llm.providers.base import BaseProvider

class MyCustomProvider(BaseProvider):
def initialize(self):
# Setup
pass

def execute(self, conversation_history):
# Execute and return response
return {"content": "response"}

# Register
register_provider("my_provider", MyCustomProvider)

# Use
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "my_provider", "agent_id": "custom"}],
evaluators=[{"type": "response_evaluator"}]
)

register_evaluator()

Register a custom evaluator class.

from judge_llm import register_evaluator

register_evaluator(name: str, evaluator_class: Type[BaseEvaluator])

Example:

from judge_llm import register_evaluator
from judge_llm.evaluators.base import BaseEvaluator
from judge_llm.core.models import EvaluationResult

class SafetyEvaluator(BaseEvaluator):
def evaluate(self, test_case, response):
# Check for unsafe content
unsafe_terms = ["violence", "harmful"]
content = response.get("content", "").lower()

is_safe = not any(term in content for term in unsafe_terms)

return EvaluationResult(
evaluator_type="safety",
passed=is_safe,
score=1.0 if is_safe else 0.0,
reason="Safe content" if is_safe else "Unsafe content detected"
)

# Register
register_evaluator("safety", SafetyEvaluator)

# Use
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[
{"type": "response_evaluator"},
{"type": "safety"} # Your custom evaluator
]
)

register_reporter()

Register a custom reporter class.

from judge_llm import register_reporter

register_reporter(name: str, reporter_class: Type[BaseReporter])

Example:

from judge_llm import register_reporter
from judge_llm.reporters.base import BaseReporter
import csv

class CSVReporter(BaseReporter):
def __init__(self, config=None):
self.config = config or {}
self.output_path = self.config.get("output_path", "./results.csv")

def generate_report(self, report):
with open(self.output_path, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['eval_id', 'passed', 'cost', 'time'])
for tc in report.test_cases:
writer.writerow([tc.eval_id, tc.passed, tc.cost, tc.time_taken])

def cleanup(self):
pass

# Register
register_reporter("csv", CSVReporter)

# Use
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}],
reporters=[{"type": "csv", "output_path": "./results.csv"}]
)

Data Models

EvaluationReport

The complete evaluation report returned by evaluate().

@dataclass
class EvaluationReport:
total_cost: float # Total cost across all tests
total_time: float # Total execution time (seconds)
success_rate: float # Percentage of passed tests (0-1)
overall_success: bool # True if all tests passed
summary: Dict[str, Any] # Aggregated statistics
test_cases: List[TestCaseResult] # Individual test results

Example:

report = evaluate(...)

# Access summary stats
print(f"Total cost: ${report.total_cost:.4f}")
print(f"Success rate: {report.success_rate * 100:.1f}%")
print(f"Total tests: {report.summary['total_executions']}")
print(f"Passed: {report.summary['successful_executions']}")
print(f"Failed: {report.summary['failed_executions']}")

# Iterate test cases
for tc in report.test_cases:
if not tc.passed:
print(f"Failed: {tc.eval_id}")

TestCaseResult

Individual test case result.

@dataclass
class TestCaseResult:
eval_id: str # Test case ID
agent_id: str # Agent identifier
provider_type: str # Provider type
passed: bool # Overall pass/fail
cost: float # Cost for this test
time_taken: float # Execution time (seconds)
evaluation_results: List[EvaluationResult] # Evaluator results
conversation_history: List[Dict[str, Any]] # Full conversation

Example:

report = evaluate(...)

for tc in report.test_cases:
print(f"\nTest: {tc.eval_id}")
print(f" Status: {'✅ PASSED' if tc.passed else '❌ FAILED'}")
print(f" Cost: ${tc.cost:.4f}")
print(f" Time: {tc.time_taken:.2f}s")

# Check individual evaluators
for eval_result in tc.evaluation_results:
status = "✅" if eval_result.passed else "❌"
print(f" {status} {eval_result.evaluator_type}: {eval_result.reason}")

EvaluationResult

Result from a single evaluator.

@dataclass
class EvaluationResult:
evaluator_type: str # Evaluator name
passed: bool # Pass/fail
score: float # Numeric score (0-1)
reason: str # Explanation
metadata: dict # Additional info

Example:

report = evaluate(...)

for tc in report.test_cases:
for eval_result in tc.evaluation_results:
if not eval_result.passed:
print(f"Evaluator {eval_result.evaluator_type} failed:")
print(f" Reason: {eval_result.reason}")
print(f" Score: {eval_result.score}")
if eval_result.metadata:
print(f" Metadata: {eval_result.metadata}")

Configuration from Code

Dataset Configuration

# Local file
dataset = {
"loader": "local_file",
"paths": ["./tests.json", "./more_tests.json"]
}

# BrowserBase (for web data)
dataset = {
"loader": "browserbase",
"api_key": "your_api_key",
"project_id": "your_project_id"
}

Provider Configuration

# Gemini
providers = [{
"type": "gemini",
"agent_id": "gemini_agent",
"model": "gemini-2.0-flash-exp",
"temperature": 0.0,
"api_key": "your_api_key" # Or use env var
}]

# OpenAI
providers = [{
"type": "openai",
"agent_id": "openai_agent",
"model": "gpt-4",
"temperature": 0.7,
"api_key": "your_api_key"
}]

# Anthropic
providers = [{
"type": "anthropic",
"agent_id": "claude_agent",
"model": "claude-3-5-sonnet-20241022",
"temperature": 0.0,
"api_key": "your_api_key"
}]

# Multiple providers (A/B testing)
providers = [
{"type": "gemini", "agent_id": "gemini"},
{"type": "openai", "agent_id": "openai"},
{"type": "anthropic", "agent_id": "claude"}
]

Evaluator Configuration

# Response evaluator
evaluators = [{
"type": "response_evaluator",
"llm_provider": "gemini",
"llm_model": "gemini-2.0-flash-exp"
}]

# Trajectory evaluator
evaluators = [{
"type": "trajectory_evaluator",
"llm_provider": "gemini"
}]

# Cost evaluator
evaluators = [{
"type": "cost_evaluator",
"max_cost": 0.01 # Fail if cost > $0.01
}]

# Latency evaluator
evaluators = [{
"type": "latency_evaluator",
"max_latency": 5.0 # Fail if latency > 5s
}]

# Multiple evaluators
evaluators = [
{"type": "response_evaluator"},
{"type": "cost_evaluator", "max_cost": 0.01},
{"type": "latency_evaluator", "max_latency": 3.0}
]

Reporter Configuration

# Console reporter
reporters = [{"type": "console"}]

# JSON reporter
reporters = [{
"type": "json",
"output_path": "./results.json"
}]

# HTML reporter
reporters = [{
"type": "html",
"output_path": "./report.html"
}]

# Database reporter
reporters = [{
"type": "database",
"db_path": "./results.db"
}]

# Multiple reporters
reporters = [
{"type": "console"},
{"type": "json", "output_path": "./results.json"},
{"type": "html", "output_path": "./report.html"},
{"type": "database", "db_path": "./results.db"}
]

Advanced Usage

Environment Variables

import os
from dotenv import load_dotenv
from judge_llm import evaluate

# Load .env file
load_dotenv()

report = evaluate(
dataset={
"loader": "local_file",
"paths": [os.getenv("TEST_FILE_PATH")]
},
providers=[{
"type": "gemini",
"agent_id": os.getenv("AGENT_ID"),
"api_key": os.getenv("GEMINI_API_KEY")
}],
evaluators=[{"type": "response_evaluator"}]
)

Error Handling

from judge_llm import evaluate

try:
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}]
)

if report.overall_success:
print("✅ All tests passed!")
else:
print(f"❌ {report.summary['failed_executions']} tests failed")

# Print failed tests
for tc in report.test_cases:
if not tc.passed:
print(f" - {tc.eval_id}")

except ValueError as e:
print(f"Configuration error: {e}")
except Exception as e:
print(f"Evaluation failed: {e}")

Conditional Evaluation

from judge_llm import evaluate

def run_evaluation(config_name: str):
"""Run evaluation and return pass/fail"""
report = evaluate(
dataset={"loader": "local_file", "paths": [f"./{config_name}.json"]},
providers=[{"type": "gemini", "agent_id": config_name}],
evaluators=[{"type": "response_evaluator"}],
reporters=[{"type": "json", "output_path": f"./{config_name}_results.json"}]
)
return report.overall_success

# Run multiple evaluations
configs = ["dev", "staging", "prod"]
results = {config: run_evaluation(config) for config in configs}

# Check if all passed
if all(results.values()):
print("✅ All environments passed!")
else:
failed = [k for k, v in results.items() if not v]
print(f"❌ Failed environments: {', '.join(failed)}")

Custom Analysis

from judge_llm import evaluate
import pandas as pd

report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[
{"type": "gemini", "agent_id": "gemini"},
{"type": "openai", "agent_id": "openai"}
],
evaluators=[{"type": "response_evaluator"}]
)

# Convert to DataFrame for analysis
data = []
for tc in report.test_cases:
data.append({
'eval_id': tc.eval_id,
'provider': tc.provider_type,
'passed': tc.passed,
'cost': tc.cost,
'latency': tc.time_taken
})

df = pd.DataFrame(data)

# Analysis
print("\nCost by Provider:")
print(df.groupby('provider')['cost'].agg(['mean', 'sum', 'count']))

print("\nLatency by Provider:")
print(df.groupby('provider')['latency'].agg(['mean', 'min', 'max']))

print("\nSuccess Rate by Provider:")
print(df.groupby('provider')['passed'].mean() * 100)

Integration with Testing Frameworks

pytest

# test_llm_eval.py
import pytest
from judge_llm import evaluate

def test_llm_evaluation():
"""Test that LLM passes all evaluations"""
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}]
)

assert report.overall_success, f"Evaluation failed: {report.summary['failed_executions']} tests failed"
assert report.success_rate >= 0.9, f"Success rate too low: {report.success_rate * 100:.1f}%"
assert report.total_cost < 1.0, f"Cost too high: ${report.total_cost:.2f}"

def test_individual_test_cases():
"""Test specific test cases"""
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}]
)

# Check specific test
test_001 = next(tc for tc in report.test_cases if tc.eval_id == "test_001")
assert test_001.passed, "test_001 should pass"

unittest

# test_llm_eval.py
import unittest
from judge_llm import evaluate

class TestLLMEvaluation(unittest.TestCase):
def test_overall_success(self):
"""Test that all evaluations pass"""
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}]
)

self.assertTrue(report.overall_success)
self.assertGreaterEqual(report.success_rate, 0.9)

def test_cost_threshold(self):
"""Test that cost is within threshold"""
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "cost_evaluator", "max_cost": 0.01}]
)

self.assertLess(report.total_cost, 1.0)

if __name__ == '__main__':
unittest.main()