"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from pydantic import BaseModel
from langsmith._internal._beta_decorator import warn_beta
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
from langsmith.schemas import Example, Run
class CategoricalScoreConfig(BaseModel):
"""Configuration for a categorical score."""
key: str
choices: List[str]
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None
class ContinuousScoreConfig(BaseModel):
"""Configuration for a continuous score."""
key: str
min: float = 0
max: float = 1
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None
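

# Illustrative sketch (not part of the public API): how the two score
# configurations might be constructed. The keys, choices, and descriptions
# below are hypothetical examples, not values used by the library.
def _example_score_configs() -> Tuple[CategoricalScoreConfig, ContinuousScoreConfig]:
    # A categorical "correctness" grade with a required explanation.
    categorical = CategoricalScoreConfig(
        key="correctness",
        choices=["Y", "N"],
        description="Whether the answer is factually correct.",
        include_explanation=True,
    )
    # A continuous "conciseness" score in the default [0, 1] range.
    continuous = ContinuousScoreConfig(
        key="conciseness",
        description="How concise the answer is, from 0 (verbose) to 1 (terse).",
    )
    return categorical, continuous
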
def _create_score_json_schema(
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
properties: Dict[str, Any] = {}
if isinstance(score_config, CategoricalScoreConfig):
properties["score"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "
f"{', '.join(score_config.choices)}.",
}
elif isinstance(score_config, ContinuousScoreConfig):
properties["score"] = {
"type": "number",
"minimum": score_config.min,
"maximum": score_config.max,
"description": f"The score for the evaluation, between "
f"{score_config.min} and {score_config.max}, inclusive.",
}
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")
if score_config.include_explanation:
properties["explanation"] = {
"type": "string",
"description": (
"The explanation for the score."
if score_config.explanation_description is None
else score_config.explanation_description
),
}
return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
["score", "explanation"] if score_config.include_explanation else ["score"]
),
}
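

# Illustrative sketch (hypothetical values): the schema produced by
# _create_score_json_schema for a continuous config. The resulting dict is
# what the evaluator passes to the chat model's with_structured_output().
def _example_continuous_schema() -> dict:
    config = ContinuousScoreConfig(
        key="conciseness",
        min=0,
        max=1,
        description="How concise the answer is.",
    )
    # Produces an object schema titled "conciseness" whose "score" property is
    # a number bounded to [0, 1]; "required" is just ["score"] because
    # include_explanation defaults to False.
    return _create_score_json_schema(config)
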
class LLMEvaluator(RunEvaluator):
"""A class for building LLM-as-a-judge evaluators."""
def __init__(
self,
*,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
model_name: str = "gpt-4o",
model_provider: str = "openai",
**kwargs,
):
"""Initialize the LLMEvaluator.
Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
map_variables (Optional[Callable[[Run, Example], dict]], optional):
A function that maps the run and example to the variables in the
prompt. Defaults to None. If None, it is assumed that the prompt
only requires 'input', 'output', and 'expected'.
            model_name (str, optional): The model to use for the evaluation.
                Defaults to "gpt-4o".
            model_provider (str, optional): The model provider to use
                for the evaluation. Defaults to "openai".
            **kwargs: Additional keyword arguments passed to init_chat_model
                when constructing the chat model.
        """
try:
from langchain.chat_models import init_chat_model
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain to be installed. "
"Please install langchain by running `pip install langchain`."
) from e
chat_model = init_chat_model(
model=model_name, model_provider=model_provider, **kwargs
)
self._initialize(prompt_template, score_config, map_variables, chat_model)
@classmethod
def from_model(
cls,
model: Any,
*,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
):
"""Create an LLMEvaluator instance from a BaseChatModel instance.
Args:
model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Example], dict]], optional):
A function that maps the run and example to the variables in the
prompt. Defaults to None. If None, it is assumed that the prompt
only requires 'input', 'output', and 'expected'.
Returns:
LLMEvaluator: An instance of LLMEvaluator.
"""
instance = cls.__new__(cls)
instance._initialize(prompt_template, score_config, map_variables, model)
return instance
def _initialize(
self,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
chat_model: Any,
):
"""Shared initialization code for __init__ and from_model.
Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The score configuration.
            map_variables (Optional[Callable[[Run, Example], dict]]):
Function to map variables.
chat_model (BaseChatModel): The chat model instance.
"""
try:
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain-core to be installed. "
"Please install langchain-core by running `pip install langchain-core`."
) from e
if not (
isinstance(chat_model, BaseChatModel)
and hasattr(chat_model, "with_structured_output")
):
raise ValueError(
"chat_model must be an instance of "
"BaseLanguageModel and support structured output."
)
if isinstance(prompt_template, str):
self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
else:
self.prompt = ChatPromptTemplate.from_messages(prompt_template)
if set(self.prompt.input_variables) - {"input", "output", "expected"}:
if not map_variables:
raise ValueError(
"map_inputs must be provided if the prompt template contains "
"variables other than 'input', 'output', and 'expected'"
)
self.map_variables = map_variables
self.score_config = score_config
self.score_schema = _create_score_json_schema(self.score_config)
chat_model = chat_model.with_structured_output(self.score_schema)
self.runnable = self.prompt | chat_model
@warn_beta
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, self.runnable.invoke(variables))
return self._parse_output(output)
@warn_beta
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Asynchronously evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, await self.runnable.ainvoke(variables))
return self._parse_output(output)
def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
if self.map_variables:
return self.map_variables(run, example)
variables = {}
if "input" in self.prompt.input_variables:
if len(run.inputs) == 0:
raise ValueError(
"No input keys are present in run.inputs but the prompt "
"requires 'input'."
)
if len(run.inputs) != 1:
raise ValueError(
"Multiple input keys are present in run.inputs. Please provide "
"a map_variables function."
)
variables["input"] = list(run.inputs.values())[0]
if "output" in self.prompt.input_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
if len(run.outputs) != 1:
raise ValueError(
"Multiple output keys are present in run.outputs. Please "
"provide a map_variables function."
)
variables["output"] = list(run.outputs.values())[0]
if "expected" in self.prompt.input_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs are provided but the prompt "
                    "requires 'expected'."
                )
if len(example.outputs) != 1:
raise ValueError(
"Multiple output keys are present in example.outputs. Please "
"provide a map_variables function."
)
variables["expected"] = list(example.outputs.values())[0]
return variables
def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, value=value, comment=explanation
)
elif isinstance(self.score_config, ContinuousScoreConfig):
score = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, score=score, comment=explanation
)