Skip to main content

Custom Evaluators

Build domain-specific evaluators to validate any aspect of LLM responses beyond the built-in evaluators.

Overview

Custom evaluators allow you to:

  • Implement business-specific validation rules
  • Check safety and compliance requirements
  • Validate domain knowledge
  • Enforce style guidelines
  • Combine multiple evaluation criteria

Creating a Custom Evaluator

Step 1: Inherit from BaseEvaluator

from judge_llm.evaluators.base import BaseEvaluator
from judge_llm.core.models import EvalCase, ProviderResult, EvaluatorResult
from typing import Any, Dict, Optional

class MyCustomEvaluator(BaseEvaluator):
    """Skeleton for a domain-specific evaluator.

    Stores a pass/fail score threshold read from ``config`` (default 0.8).
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        # Annotation is Optional because None is the default (PEP 484:
        # implicit Optional is not conforming).
        super().__init__(config)
        # Initialize your evaluator; (config or {}) handles config=None
        # without a separate conditional.
        self.threshold = (config or {}).get("threshold", 0.8)

Step 2: Implement evaluate() Method

def evaluate(
    self,
    eval_case: EvalCase,
    agent_metadata: Dict[str, Any],
    provider_result: ProviderResult,
    eval_config: Optional[Dict[str, Any]] = None,
) -> EvaluatorResult:
    """Evaluate provider result.

    Args:
        eval_case: Original test case with expected outputs
        agent_metadata: Provider metadata
        provider_result: Actual execution results
        eval_config: Per-test-case configuration override

    Returns:
        EvaluatorResult with pass/fail and details
    """
    # Merge config: per-test-case overrides instance config
    config = self.get_config(eval_config)
    # Read the threshold from the merged config so a per-test-case
    # override actually takes effect (the original used self.threshold,
    # silently ignoring eval_config); fall back to the instance default.
    threshold = config.get("threshold", self.threshold)

    # Fail fast if the provider itself errored out.
    if not provider_result.success:
        return EvaluatorResult(
            evaluator_name=self.get_evaluator_name(),
            evaluator_type=self.get_evaluator_type(),
            success=False,
            passed=False,
            details={"error": "Provider execution failed"},
            error="Provider execution failed",
        )

    # Your evaluation logic here
    score = self._calculate_score(provider_result)
    passed = score >= threshold

    return EvaluatorResult(
        evaluator_name=self.get_evaluator_name(),
        evaluator_type=self.get_evaluator_type(),
        success=True,
        score=score,
        threshold=threshold,
        passed=passed,
        details={
            "score": score,
            "threshold": threshold,
            # Add your custom details
        },
    )

def _calculate_score(self, provider_result: ProviderResult) -> float:
    """Compute a score in the 0.0-1.0 range for the provider result.

    Placeholder implementation — replace with real scoring logic.
    """
    placeholder_score = 0.8  # Your implementation goes here.
    return placeholder_score

Registration Methods

Method 1: File-Based Registration

Create a Python file with your evaluator:

# my_evaluators/custom_eval.py
from judge_llm.evaluators.base import BaseEvaluator
from judge_llm.core.models import EvaluatorResult

class CustomEvaluator(BaseEvaluator):
    """Minimal file-based custom evaluator stub.

    Referenced from YAML config via ``module_path`` / ``class_name``.
    """

    def evaluate(self, eval_case, agent_metadata, provider_result, eval_config=None):
        # Implementation
        pass

Use in config:

evaluators:
  - type: custom
    module_path: ./my_evaluators/custom_eval.py
    class_name: CustomEvaluator
    enabled: true
    config:
      threshold: 0.8
      custom_param: value

Method 2: Programmatic Registration

from judge_llm import register_evaluator, evaluate
from my_evaluators.custom_eval import CustomEvaluator

# Register your evaluator under a custom type name.
register_evaluator("my_custom_eval", CustomEvaluator)

# Use in evaluation: the registered name can now be used as an evaluator "type".
report = evaluate(
    dataset={"loader": "local_file", "paths": ["./tests.json"]},
    providers=[{"type": "gemini", "agent_id": "test"}],
    evaluators=[{
        "type": "my_custom_eval",
        "config": {"threshold": 0.8}
    }],
    reporters=[{"type": "console"}]
)

Complete Examples

Example 1: Length Validator

Ensures responses are within specified length bounds:

# evaluators/length_validator.py
from judge_llm.evaluators.base import BaseEvaluator
from judge_llm.core.models import EvalCase, ProviderResult, EvaluatorResult
from typing import Any, Dict, Optional

class LengthValidator(BaseEvaluator):
    """Validate that every response length is within configured bounds.

    Config keys (instance config, overridable per test case):
        min_length: minimum allowed response length in characters (default 10)
        max_length: maximum allowed response length in characters (default 1000)
    """

    def evaluate(
        self,
        eval_case: EvalCase,
        agent_metadata: Dict[str, Any],
        provider_result: ProviderResult,
        eval_config: Optional[Dict[str, Any]] = None,
    ) -> EvaluatorResult:
        # Per-test-case config overrides instance config.
        config = self.get_config(eval_config)
        min_length = config.get("min_length", 10)
        max_length = config.get("max_length", 1000)

        if not provider_result.success:
            return EvaluatorResult(
                evaluator_name=self.get_evaluator_name(),
                evaluator_type=self.get_evaluator_type(),
                success=False,
                passed=False,
                details={"error": "Provider execution failed"},
            )

        # Check all responses (every turn of the conversation).
        issues = []
        total_length = 0

        for inv in provider_result.conversation_history:
            response_text = " ".join(
                part.text for part in inv.final_response.parts if part.text
            )
            length = len(response_text)
            total_length += length

            if length < min_length:
                issues.append({
                    "invocation": inv.invocation_id,
                    "issue": "too_short",
                    "actual": length,
                    "minimum": min_length,
                })
            elif length > max_length:
                issues.append({
                    "invocation": inv.invocation_id,
                    "issue": "too_long",
                    "actual": length,
                    "maximum": max_length,
                })

        passed = len(issues) == 0
        # Guard against an empty conversation history: the original
        # unconditional division raised ZeroDivisionError.
        invocation_count = len(provider_result.conversation_history)
        avg_length = total_length / invocation_count if invocation_count else 0.0

        return EvaluatorResult(
            evaluator_name=self.get_evaluator_name(),
            evaluator_type=self.get_evaluator_type(),
            success=True,
            score=1.0 if passed else 0.0,
            passed=passed,
            details={
                "min_length": min_length,
                "max_length": max_length,
                "average_length": avg_length,
                "total_invocations": invocation_count,
                "issues": issues,
            },
        )

Configuration:

evaluators:
  - type: custom
    module_path: ./evaluators/length_validator.py
    class_name: LengthValidator
    config:
      min_length: 50
      max_length: 500

Example 2: JSON Format Validator

Validates responses are valid JSON:

# evaluators/json_validator.py
import json
from judge_llm.evaluators.base import BaseEvaluator
from judge_llm.core.models import EvalCase, ProviderResult, EvaluatorResult
from typing import Any, Dict, Optional

class JSONValidator(BaseEvaluator):
    """Validate that every response parses as JSON and has required keys.

    Config keys (instance config, overridable per test case):
        require_schema: reserved flag; read but not acted on in this example
        expected_keys: top-level keys each response object must contain
    """

    def evaluate(
        self,
        eval_case: EvalCase,
        agent_metadata: Dict[str, Any],
        provider_result: ProviderResult,
        eval_config: Optional[Dict[str, Any]] = None,
    ) -> EvaluatorResult:
        config = self.get_config(eval_config)
        # NOTE: require_schema is read for forward compatibility but is not
        # used by the logic below.
        require_schema = config.get("require_schema", False)
        expected_keys = config.get("expected_keys", [])

        if not provider_result.success:
            return EvaluatorResult(
                evaluator_name=self.get_evaluator_name(),
                evaluator_type=self.get_evaluator_type(),
                success=False,
                passed=False,
                details={"error": "Provider execution failed"},
            )

        validation_results = []

        for inv in provider_result.conversation_history:
            response_text = " ".join(
                part.text for part in inv.final_response.parts if part.text
            )

            try:
                # Try to parse JSON
                parsed = json.loads(response_text)

                # Check for expected keys
                missing_keys = [
                    key for key in expected_keys
                    if key not in parsed
                ]

                validation_results.append({
                    "invocation": inv.invocation_id,
                    "valid_json": True,
                    "missing_keys": missing_keys,
                    "has_all_keys": len(missing_keys) == 0,
                })

            except json.JSONDecodeError as e:
                validation_results.append({
                    "invocation": inv.invocation_id,
                    "valid_json": False,
                    "error": str(e),
                })

        # All must be valid JSON with required keys.
        passed = all(
            r["valid_json"] and r.get("has_all_keys", True)
            for r in validation_results
        )

        valid_count = sum(1 for r in validation_results if r["valid_json"])
        # Guard against an empty conversation history: the original division
        # raised ZeroDivisionError. With zero responses, passed is vacuously
        # True (all() on an empty iterable), so report a consistent 1.0.
        total_count = len(validation_results)
        score = valid_count / total_count if total_count else 1.0

        return EvaluatorResult(
            evaluator_name=self.get_evaluator_name(),
            evaluator_type=self.get_evaluator_type(),
            success=True,
            score=score,
            passed=passed,
            details={
                "expected_keys": expected_keys,
                "validation_results": validation_results,
                "valid_count": valid_count,
                "total_count": total_count,
            },
        )

Configuration:

evaluators:
  - type: custom
    module_path: ./evaluators/json_validator.py
    class_name: JSONValidator
    config:
      expected_keys: ["status", "data", "message"]

Example 3: Safety Evaluator (Full Implementation)

See the complete safety evaluator in Example 04:

examples/04-safety-long-conversation/evaluators/safety_evaluator.py

Features:

  • PII detection (email, phone, SSN, credit cards)
  • Toxicity checking
  • Harmful instruction detection
  • Hate speech prevention
  • Severity thresholds
  • Context-aware filtering
  • LLM-as-judge pattern support

Best Practices

1. Error Handling

Always handle errors gracefully:

def evaluate(self, eval_case, agent_metadata, provider_result, eval_config=None):
    """Example: wrap evaluation so an evaluator bug is reported as a failed
    result instead of crashing the whole run."""
    try:
        # Your evaluation logic
        return EvaluatorResult(...)
    except Exception as e:
        # Broad catch is deliberate at this boundary: any evaluator error
        # becomes a failed-but-reported EvaluatorResult.
        return EvaluatorResult(
            evaluator_name=self.get_evaluator_name(),
            evaluator_type=self.get_evaluator_type(),
            success=False,
            passed=False,
            error=str(e),
            details={"exception": str(e)},
        )

2. Use Per-Case Config

Support overriding settings per test case:

def evaluate(self, eval_case, agent_metadata, provider_result, eval_config=None):
    # Merge configurations: per-test-case eval_config overrides the
    # instance-level config passed at construction time.
    config = self.get_config(eval_config)

    # Use merged config
    threshold = config.get("threshold", 0.8)

3. Provide Detailed Feedback

Include actionable details in results:

return EvaluatorResult(
# ...
details={
"score": score,
"threshold": threshold,
"reasons_for_failure": reasons, # What went wrong
"suggestions": suggestions, # How to fix
"examples": examples, # Specific examples
},
)

4. Analyze All Invocations

For multi-turn conversations, check all turns:

for inv in provider_result.conversation_history:
response_text = " ".join(
part.text for part in inv.final_response.parts if part.text
)
# Analyze this turn

5. Normalize Scores

Return scores in 0.0-1.0 range:

# Good
score = valid_count / total_count # 0.0 to 1.0

# Also good
score = max(0.0, 1.0 - (issues / max_expected_issues))

Common Patterns

Pattern 1: Keyword Checker

def _check_keywords(self, text: str, required_keywords: list) -> float:
"""Check presence of required keywords"""
text_lower = text.lower()
found = sum(1 for kw in required_keywords if kw in text_lower)
return found / len(required_keywords)

Pattern 2: Regex Validator

def _validate_format(self, text: str, pattern: str) -> bool:
"""Validate text matches regex pattern"""
import re
return bool(re.match(pattern, text))

Pattern 3: External API Call

def _call_external_validator(self, text: str) -> dict:
    """Send *text* to the external validation service and return its JSON verdict."""
    import requests

    # 10-second timeout so a hung service cannot stall the evaluation run.
    resp = requests.post(
        "https://api.validator.com/check",
        json={"text": text},
        timeout=10,
    )
    return resp.json()

Pattern 4: LLM-as-Judge

def _llm_judge(self, text: str, criteria: str) -> dict:
    """Use LLM to evaluate quality.

    Builds an evaluation prompt from *criteria* and *text* and delegates to
    an LLM call. ``your_llm_call`` is a placeholder you must supply.
    Expected to return a dict carrying a 0-1 score and reasoning.
    """
    # Call LLM API with evaluation prompt
    prompt = f"""Evaluate this response based on: {criteria}

Response: {text}

Return JSON with score (0-1) and reasoning."""

    # Call your LLM provider
    result = your_llm_call(prompt)
    return result

Testing Your Evaluator

Unit Tests

# tests/test_custom_evaluator.py
from my_evaluators.length_validator import LengthValidator
from judge_llm.core.models import EvalCase, ProviderResult

def test_length_validator():
    """Unit test skeleton: run LengthValidator against fixture data."""
    evaluator = LengthValidator({"min_length": 10, "max_length": 100})

    # Create test case (the create_test_* fixture helpers are yours to supply).
    eval_case = create_test_eval_case()
    provider_result = create_test_provider_result()

    # Run evaluation
    result = evaluator.evaluate(eval_case, {}, provider_result)

    assert result.success
    # NOTE(review): this assertion is a tautology (always true) — replace it
    # with the expectation that matches your fixture data.
    assert result.passed or not result.passed # Depends on test data

Integration Tests

from judge_llm import evaluate

def test_length_validator_integration():
    """Integration test: run the full pipeline with the file-based evaluator."""
    report = evaluate(
        dataset={"loader": "local_file", "paths": ["./test_cases.json"]},
        providers=[{"type": "mock", "agent_id": "test"}],
        evaluators=[{
            "type": "custom",
            "module_path": "./my_evaluators/length_validator.py",
            "class_name": "LengthValidator",
            "config": {"min_length": 10, "max_length": 100}
        }],
        reporters=[{"type": "console"}]
    )

    # At minimum the run should have executed some test cases.
    assert report.total_test_cases > 0

Troubleshooting

Evaluator Not Found

Error: Evaluator type 'my_custom' not found

Solution: Use type: custom with module_path and class_name

Module Import Error

Error: ModuleNotFoundError: No module named 'my_evaluators'

Solution: Check that module_path is a correct relative path from the execution directory

AttributeError

Error: 'CustomEvaluator' object has no attribute 'get_evaluator_name'

Solution: Ensure you inherit from BaseEvaluator

Find more custom evaluator examples in: