Custom Reporters
Create your own reporters to output evaluation results in any format you need - CSV, Slack, metrics systems, and more.
Overview
Custom reporters let you extend Judge LLM with your own output formats and integrations. Implement the simple BaseReporter interface to:
- Export to custom formats (CSV, Excel, Markdown)
- Send results to communication platforms (Slack, Teams, Discord)
- Push metrics to monitoring systems (Prometheus, DataDog, CloudWatch)
- Integrate with issue trackers (Jira, Linear, GitHub Issues)
- Store in custom databases (MongoDB, PostgreSQL, Redis)
Quick Start
1. Create a Custom Reporter
from judge_llm.reporters.base import BaseReporter
from judge_llm.core.models import EvaluationReport
from pathlib import Path
import csv
class CSVReporter(BaseReporter):
    """Export evaluation results as CSV.

    Config keys:
        output_path: Destination CSV file (default: "./results.csv").
    """

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.output_path = Path(self.config.get("output_path", "./results.csv"))

    def generate_report(self, report: EvaluationReport):
        """Write one header row plus one row per test case to the CSV file.

        Args:
            report: EvaluationReport object containing all results.
        """
        # Create parent directories first so a configured nested path
        # (e.g. ./out/reports/results.csv) cannot fail the write.
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        # newline='' is required by the csv module; explicit utf-8 keeps
        # non-ASCII eval/agent ids portable across platforms.
        with open(self.output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Write header
            writer.writerow([
                'eval_id', 'agent_id', 'provider_type',
                'passed', 'cost', 'time_taken'
            ])
            # Write test cases
            for tc in report.test_cases:
                writer.writerow([
                    tc.eval_id,
                    tc.agent_id,
                    tc.provider_type,
                    tc.passed,
                    tc.cost,
                    tc.time_taken
                ])
        print(f"CSV report saved to {self.output_path}")

    def cleanup(self):
        """Cleanup resources (nothing to release for plain files)."""
        pass
2. Register the Reporter
Option 1: Inline Registration (Python API)
from judge_llm import evaluate, register_reporter
from my_reporters import CSVReporter
# Register once
register_reporter("csv", CSVReporter)
# Use by name
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}],
reporters=[{"type": "csv", "output_path": "./results.csv"}]
)
Option 2: Config-Based Registration
# test.yaml
reporters:
- type: custom
module_path: ./reporters/csv_reporter.py
class_name: CSVReporter
output_path: ./results.csv
Option 3: Default Config Registration (Recommended)
Register once in .judge_llm.defaults.yaml:
reporters:
- type: custom
module_path: ./reporters/csv_reporter.py
class_name: CSVReporter
register_as: csv
Use everywhere by name:
# test.yaml
reporters:
- type: csv
output_path: ./results.csv
BaseReporter Interface
All custom reporters must implement the BaseReporter interface:
from abc import ABC, abstractmethod
from judge_llm.core.models import EvaluationReport
class BaseReporter(ABC):
    """Contract that every reporter implementation must fulfil."""

    @abstractmethod
    def generate_report(self, report: EvaluationReport):
        """Produce output for a completed evaluation run.

        Args:
            report: EvaluationReport object containing all results
        """
        ...

    @abstractmethod
    def cleanup(self):
        """Release any resources held once report generation is done."""
        ...
EvaluationReport Structure
@dataclass
class EvaluationReport:
    """Top-level result object handed to every reporter's generate_report()."""
    total_cost: float    # Aggregate cost across all test cases; reporters format it as $x.xxxx
    total_time: float    # Aggregate wall-clock time; reporters format it in seconds
    success_rate: float  # Fraction in [0, 1]; reporters multiply by 100 for percent display
    overall_success: bool  # True when the run as a whole is considered passing
    summary: Dict[str, Any]  # Counts and aggregates — includes 'total_executions', 'successful_executions', 'failed_executions'
    test_cases: List[TestCaseResult]  # One entry per executed test case
TestCaseResult Structure
@dataclass
class TestCaseResult:
    """Result of running a single test case against one provider."""
    eval_id: str        # Identifier of the evaluation/test case
    agent_id: str       # Agent the case was executed against
    provider_type: str  # Provider backend used (e.g. "gemini" in the examples above)
    passed: bool        # Overall pass/fail verdict for this case
    cost: float         # Cost of this case; reporters format it as $x.xxxx
    time_taken: float   # Execution time in seconds
    evaluation_results: List[EvaluationResult]  # Per-evaluator outcomes; each exposes .passed, .evaluator_type, .reason
    conversation_history: List[Dict[str, Any]]  # Raw conversation turns — dict schema not shown here; verify before relying on keys
Example Reporters
CSV Reporter (Production-Ready)
from judge_llm.reporters.base import BaseReporter
from judge_llm.core.models import EvaluationReport
from pathlib import Path
import csv
from datetime import datetime
class CSVReporter(BaseReporter):
    """Export evaluation results as CSV with full details.

    Config keys:
        output_path: Destination CSV file (default: "./results.csv").
        include_summary: Prepend an aggregate summary section (default: True).
    """

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.output_path = Path(self.config.get("output_path", "./results.csv"))
        self.include_summary = self.config.get("include_summary", True)

    def generate_report(self, report: EvaluationReport):
        """Write the optional summary section followed by per-test-case rows.

        Args:
            report: EvaluationReport object containing all results.
        """
        # Ensure directory exists before opening the file for writing
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        # newline='' per the csv module docs; explicit utf-8 keeps
        # non-ASCII eval/agent ids portable across platforms.
        with open(self.output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if self.include_summary:
                self._write_summary(writer, report)
            self._write_test_cases(writer, report)
        print(f"✓ CSV report saved to {self.output_path}")

    def _write_summary(self, writer, report: EvaluationReport):
        """Emit the aggregate metrics section followed by a blank spacer row."""
        writer.writerow(['=== SUMMARY ==='])
        writer.writerow(['Metric', 'Value'])
        writer.writerow(['Timestamp', datetime.now().isoformat()])
        writer.writerow(['Total Cost', f'${report.total_cost:.4f}'])
        writer.writerow(['Total Time', f'{report.total_time:.2f}s'])
        writer.writerow(['Success Rate', f'{report.success_rate * 100:.1f}%'])
        writer.writerow(['Total Tests', report.summary['total_executions']])
        writer.writerow(['Passed', report.summary['successful_executions']])
        writer.writerow(['Failed', report.summary['failed_executions']])
        writer.writerow([])  # Blank line

    def _write_test_cases(self, writer, report: EvaluationReport):
        """Emit one row per test case with evaluator pass/fail counts."""
        writer.writerow(['=== TEST CASES ==='])
        writer.writerow([
            'Eval ID', 'Agent ID', 'Provider', 'Status',
            'Cost', 'Time (s)', 'Evaluators Passed', 'Evaluators Failed'
        ])
        for tc in report.test_cases:
            passed_evals = sum(1 for e in tc.evaluation_results if e.passed)
            failed_evals = len(tc.evaluation_results) - passed_evals
            writer.writerow([
                tc.eval_id,
                tc.agent_id,
                tc.provider_type,
                'PASS' if tc.passed else 'FAIL',
                f'${tc.cost:.4f}',
                f'{tc.time_taken:.2f}',
                passed_evals,
                failed_evals
            ])

    def cleanup(self):
        """No cleanup needed for CSV files"""
        pass
Slack Reporter
import requests
from judge_llm.reporters.base import BaseReporter
from judge_llm.core.models import EvaluationReport
class SlackReporter(BaseReporter):
    """Send evaluation summary to Slack via an incoming webhook.

    Config keys:
        webhook_url: Incoming-webhook URL (required).
        channel: Target channel (default: "#evals").
        mention_on_failure: Add a @channel mention on failures (default: True).
        timeout: HTTP timeout in seconds for the webhook call (default: 10).
    """

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.webhook_url = self.config.get("webhook_url")
        self.channel = self.config.get("channel", "#evals")
        self.mention_on_failure = self.config.get("mention_on_failure", True)
        # Bounded timeout so a slow or unreachable Slack endpoint
        # cannot hang the evaluation run indefinitely.
        self.timeout = self.config.get("timeout", 10)
        if not self.webhook_url:
            raise ValueError("webhook_url is required for SlackReporter")

    def generate_report(self, report: EvaluationReport):
        """Post a Block Kit summary message to the configured webhook.

        Raises:
            requests.HTTPError: If Slack returns a non-2xx status.
        """
        # Build message
        status_emoji = "✅" if report.overall_success else "❌"
        status_text = "All tests passed!" if report.overall_success else "Some tests failed"
        message = {
            "channel": self.channel,
            "blocks": [
                {
                    "type": "header",
                    "text": {
                        "type": "plain_text",
                        "text": f"{status_emoji} Evaluation Results"
                    }
                },
                {
                    "type": "section",
                    "fields": [
                        {"type": "mrkdwn", "text": f"*Status:*\n{status_text}"},
                        {"type": "mrkdwn", "text": f"*Success Rate:*\n{report.success_rate * 100:.1f}%"},
                        {"type": "mrkdwn", "text": f"*Total Cost:*\n${report.total_cost:.4f}"},
                        {"type": "mrkdwn", "text": f"*Total Time:*\n{report.total_time:.1f}s"},
                        {"type": "mrkdwn", "text": f"*Tests Passed:*\n{report.summary['successful_executions']}/{report.summary['total_executions']}"},
                        {"type": "mrkdwn", "text": f"*Tests Failed:*\n{report.summary['failed_executions']}/{report.summary['total_executions']}"}
                    ]
                }
            ]
        }
        # Add mention if failures
        if not report.overall_success and self.mention_on_failure:
            message["text"] = "<!channel> Evaluation failures detected"
        # requests has no default timeout; without one this call can block forever
        response = requests.post(self.webhook_url, json=message, timeout=self.timeout)
        response.raise_for_status()
        print(f"✓ Results sent to Slack channel {self.channel}")

    def cleanup(self):
        """No cleanup needed"""
        pass
Prometheus Metrics Reporter
from judge_llm.reporters.base import BaseReporter
from judge_llm.core.models import EvaluationReport
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
class PrometheusReporter(BaseReporter):
    """Push evaluation metrics to a Prometheus Pushgateway.

    Config keys:
        pushgateway: Gateway address (default: "localhost:9091").
        job_name: Pushgateway job label (default: "judge_llm_evals").
    """

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.pushgateway = self.config.get("pushgateway", "localhost:9091")
        self.job_name = self.config.get("job_name", "judge_llm_evals")
        # Private registry: only the gauges below get pushed, never globals.
        self.registry = CollectorRegistry()
        self.success_rate = Gauge(
            'eval_success_rate', 'Evaluation success rate',
            registry=self.registry)
        self.total_cost = Gauge(
            'eval_total_cost', 'Total evaluation cost',
            registry=self.registry)
        self.total_time = Gauge(
            'eval_total_time_seconds', 'Total evaluation time',
            registry=self.registry)
        self.test_count = Gauge(
            'eval_test_count', 'Number of test cases', ['status'],
            registry=self.registry)

    def generate_report(self, report: EvaluationReport):
        """Record the current values on every gauge, then push them."""
        self.success_rate.set(report.success_rate)
        self.total_cost.set(report.total_cost)
        self.total_time.set(report.total_time)
        # Labelled pass/fail counts come from the report's summary dict.
        for status, summary_key in (
            ('passed', 'successful_executions'),
            ('failed', 'failed_executions'),
        ):
            self.test_count.labels(status=status).set(report.summary[summary_key])
        push_to_gateway(self.pushgateway, job=self.job_name, registry=self.registry)
        print(f"✓ Metrics pushed to Prometheus at {self.pushgateway}")

    def cleanup(self):
        """No cleanup needed"""
        pass
Markdown Reporter
from judge_llm.reporters.base import BaseReporter
from judge_llm.core.models import EvaluationReport
from pathlib import Path
from datetime import datetime
class MarkdownReporter(BaseReporter):
    """Generate Markdown report suitable for GitHub, documentation, etc.

    Config keys:
        output_path: Destination file (default: "./EVAL_REPORT.md").
        include_details: Append a per-test-case details section (default: True).
    """

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.output_path = Path(self.config.get("output_path", "./EVAL_REPORT.md"))
        self.include_details = self.config.get("include_details", True)

    def generate_report(self, report: EvaluationReport):
        """Assemble the Markdown document and write it to output_path.

        Args:
            report: EvaluationReport object containing all results.
        """
        lines = []
        # Header
        lines.append("# Evaluation Report")
        lines.append(f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append("")
        self._append_summary(lines, report)
        self._append_results_table(lines, report)
        if self.include_details:
            self._append_details(lines, report)
        # Write file
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the report contains emoji (✅/❌), which would
        # crash the write on platforms whose default encoding is not UTF-8.
        with open(self.output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        print(f"✓ Markdown report saved to {self.output_path}")

    def _append_summary(self, lines, report: EvaluationReport):
        """Add the aggregate status/cost/time metrics section."""
        lines.append("## Summary")
        lines.append("")
        status = "✅ PASSED" if report.overall_success else "❌ FAILED"
        lines.append(f"**Status:** {status}")
        lines.append(f"**Success Rate:** {report.success_rate * 100:.1f}%")
        lines.append(f"**Total Cost:** ${report.total_cost:.4f}")
        lines.append(f"**Total Time:** {report.total_time:.2f}s")
        lines.append(f"**Tests:** {report.summary['successful_executions']}/{report.summary['total_executions']} passed")
        lines.append("")

    def _append_results_table(self, lines, report: EvaluationReport):
        """Add the one-row-per-test-case results table."""
        lines.append("## Test Results")
        lines.append("")
        lines.append("| Eval ID | Agent | Provider | Status | Cost | Time |")
        lines.append("|---------|-------|----------|--------|------|------|")
        for tc in report.test_cases:
            status = "✅" if tc.passed else "❌"
            lines.append(
                f"| {tc.eval_id} | {tc.agent_id} | {tc.provider_type} | "
                f"{status} | ${tc.cost:.4f} | {tc.time_taken:.2f}s |"
            )
        lines.append("")

    def _append_details(self, lines, report: EvaluationReport):
        """Add a subsection per test case, including evaluator outcomes."""
        lines.append("## Detailed Results")
        lines.append("")
        for tc in report.test_cases:
            lines.append(f"### {tc.eval_id}")
            lines.append("")
            lines.append(f"- **Agent:** {tc.agent_id}")
            lines.append(f"- **Provider:** {tc.provider_type}")
            lines.append(f"- **Status:** {'✅ PASSED' if tc.passed else '❌ FAILED'}")
            lines.append(f"- **Cost:** ${tc.cost:.4f}")
            lines.append(f"- **Time:** {tc.time_taken:.2f}s")
            lines.append("")
            # Evaluator results
            lines.append("**Evaluator Results:**")
            for eval_result in tc.evaluation_results:
                status = "✅" if eval_result.passed else "❌"
                lines.append(f"- {status} {eval_result.evaluator_type}")
                if eval_result.reason:
                    lines.append(f"  - {eval_result.reason}")
            lines.append("")

    def cleanup(self):
        """No cleanup needed"""
        pass
Registration Methods
1. Inline Registration (Python API)
Best for: Scripts, notebooks, quick prototyping
from judge_llm import evaluate, register_reporter
from my_reporters import CSVReporter, SlackReporter
# Register custom reporters
register_reporter("csv", CSVReporter)
register_reporter("slack", SlackReporter)
# Use them
report = evaluate(
dataset={"loader": "local_file", "paths": ["./tests.json"]},
providers=[{"type": "gemini", "agent_id": "test"}],
evaluators=[{"type": "response_evaluator"}],
reporters=[
{"type": "csv", "output_path": "./results.csv"},
{"type": "slack", "webhook_url": "https://hooks.slack.com/..."}
]
)
2. Config-Based Registration
Best for: One-off custom reporters, testing
# test.yaml
reporters:
- type: custom
module_path: ./reporters/csv_reporter.py
class_name: CSVReporter
output_path: ./results.csv
- type: custom
module_path: ./reporters/slack_reporter.py
class_name: SlackReporter
webhook_url: ${SLACK_WEBHOOK_URL}
channel: "#evals"
3. Default Config Registration (Recommended)
Best for: Reusable reporters across multiple configs
Register once in .judge_llm.defaults.yaml:
reporters:
# Register CSV reporter
- type: custom
module_path: ./reporters/csv_reporter.py
class_name: CSVReporter
register_as: csv
# Register Slack reporter
- type: custom
module_path: ./reporters/slack_reporter.py
class_name: SlackReporter
register_as: slack
Use everywhere by name:
# test1.yaml
reporters:
- type: csv
output_path: ./test1.csv
# test2.yaml
reporters:
- type: csv
output_path: ./test2.csv
- type: slack
webhook_url: ${SLACK_WEBHOOK_URL}
Best Practices
1. Error Handling
class RobustReporter(BaseReporter):
    """Reporter that degrades gracefully instead of failing the run."""

    def generate_report(self, report: EvaluationReport):
        try:
            # A reporting failure must never abort the evaluation itself.
            self._send_to_external_service(report)
        except Exception as e:
            print(f"⚠️ Reporter failed: {e}")
            # Best-effort fallback: persist the results locally instead.
            self._save_to_fallback_file(report)

    def cleanup(self):
        try:
            # Cleanup is best-effort for the same reason as above.
            self._close_connections()
        except Exception as e:
            print(f"⚠️ Cleanup failed: {e}")
2. Configuration Validation
class ConfiguredReporter(BaseReporter):
    """Reporter that fails fast when mandatory settings are absent."""

    def __init__(self, config: dict = None):
        self.config = config or {}
        # Report every missing key at once instead of one at a time.
        missing = [key for key in ("api_key", "endpoint") if key not in self.config]
        if missing:
            raise ValueError(f"Missing required config: {', '.join(missing)}")
        self.api_key = self.config["api_key"]
        self.endpoint = self.config["endpoint"]
3. Environment Variables
import os
class SecureReporter(BaseReporter):
    """Reporter whose API key may come from config or the environment."""

    def __init__(self, config: dict = None):
        self.config = config or {}
        # The config value wins; REPORTER_API_KEY is the fallback.
        key = self.config.get("api_key")
        if not key:
            key = os.environ.get("REPORTER_API_KEY")
        if not key:
            raise ValueError("API key required (config or REPORTER_API_KEY env var)")
        self.api_key = key
4. Progress Feedback
class VerboseReporter(BaseReporter):
    """Reporter that narrates its progress on stdout."""

    def generate_report(self, report: EvaluationReport):
        case_count = len(report.test_cases)
        print(f"Generating report for {case_count} test cases...")
        # Delegate the actual work, then confirm completion.
        self._process_results(report)
        print(f"✓ Report generated successfully")
5. Resource Cleanup
class ResourceReporter(BaseReporter):
    """Reporter holding a connection that is released only in cleanup()."""

    def __init__(self, config: dict = None):
        self.config = config or {}
        self.connection = None  # Opened lazily in generate_report

    def generate_report(self, report: EvaluationReport):
        # Deliberately left open here; the framework calls cleanup() after us.
        self.connection = self._connect()
        self._send_data(report)

    def cleanup(self):
        """Always called after generate_report"""
        if self.connection:
            self.connection.close()
            self.connection = None
Testing Custom Reporters
Unit Test Example
import tempfile
from pathlib import Path

import pytest

from judge_llm.core.models import EvaluationReport, TestCaseResult
from my_reporters import CSVReporter
def test_csv_reporter():
    """CSVReporter writes a CSV containing every test case's identifiers."""
    # Create mock report
    report = EvaluationReport(
        total_cost=0.05,
        total_time=10.0,
        success_rate=0.8,
        overall_success=True,
        summary={
            "total_executions": 5,
            "successful_executions": 4,
            "failed_executions": 1
        },
        test_cases=[
            TestCaseResult(
                eval_id="test_1",
                agent_id="agent_1",
                provider_type="gemini",
                passed=True,
                cost=0.01,
                time_taken=2.0,
                evaluation_results=[],
                conversation_history=[]
            )
        ]
    )
    # TemporaryDirectory guarantees the file is removed even when an
    # assertion fails, and works on platforms without a /tmp directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_path = Path(tmp_dir) / "test_report.csv"
        reporter = CSVReporter({"output_path": str(output_path)})
        try:
            reporter.generate_report(report)
            # Verify output
            assert output_path.exists()
            content = output_path.read_text()
            assert "test_1" in content
            assert "agent_1" in content
        finally:
            # Always release reporter resources, pass or fail.
            reporter.cleanup()