"""
LLMEvaluator implementation.
This evaluator uses the given `LLM` to evaluate the chain's responses.
"""
from typing import List
from council.contexts import AgentContext, ChatMessage, ContextLogger, ScoredChatMessage
from council.evaluators import EvaluatorBase
from council.llm import LLMBase, LLMMessage, MonitoredLLM
from council.utils import Option
[docs]
class LLMEvaluator(EvaluatorBase):
"""Evaluator using an `LLM` to evaluate chain responses."""
[docs]
def __init__(self, llm: LLMBase):
"""
Build a new LLMEvaluator.
:param llm: model to use for the evaluation.
"""
super().__init__()
self._llm = self.register_monitor(MonitoredLLM("llm", llm))
@property
def llm(self) -> LLMBase:
"""
the LLM used by the evaluator
"""
return self._llm.inner
def _execute(self, context: AgentContext) -> List[ScoredChatMessage]:
query = context.chat_history.try_last_user_message.unwrap()
chain_results = [
chain_messages.try_last_message.unwrap()
for chain_messages in context.chains
if chain_messages.try_last_message.is_some()
]
# Build prompt to send to the inner LLM
llm_response = self._call_llm(context, query, chain_results)
# Parse LLM response with the score for each message we want to score
scores = [
self._parse_eval(line, context.logger)
for line in llm_response.split("\n")
if line.lower().startswith("grade")
]
scored_messages = []
for skill_message, score in filter(lambda tuple: tuple[1].is_some(), zip(chain_results, scores)):
scored_message = ScoredChatMessage(
ChatMessage.agent(message=skill_message.message, data=skill_message.data), score.unwrap()
)
scored_messages.append(scored_message)
return scored_messages
def _call_llm(self, context: AgentContext, query: ChatMessage, chain_results: list[ChatMessage]) -> str:
messages = self._build_llm_messages(query, chain_results)
if len(messages) <= 0:
return ""
result = self._llm.post_chat_request(context, messages=messages)
llm_response = result.first_choice
context.logger.debug(f"llm response: {llm_response}")
return llm_response
def _build_llm_messages(self, query: ChatMessage, skill_messages: list[ChatMessage]) -> List[LLMMessage]:
if len(skill_messages) <= 0:
return []
if len(skill_messages) == 1:
prompt = self._build_system_prompt_single_answer()
return [
LLMMessage.system_message(prompt),
LLMMessage.user_message(self._build_single_answer_message(query.message, skill_messages[0].message)),
]
responses = [skill_message.message for skill_message in skill_messages]
prompt = self._build_system_prompt_multiple_answers()
return [
LLMMessage.system_message(prompt),
LLMMessage.user_message(self._build_multiple_answers_message(query.message, responses)),
]
def _parse_eval(self, line: str, logger: ContextLogger) -> Option[float]:
"""Parse the evaluation response from the inner `LLM`."""
line = line.lower().removeprefix("answer").strip().replace("-", ":")
try:
score = line.split(":", 3)
return Option.some(float(score[1]))
except ValueError:
logger.exception(f'message="could not parse score" line="{line}"')
raise
except Exception:
logger.exception(f'message="could not parse evaluation response" line="{line}"')
raise
@staticmethod
def _build_multiple_answers_message(query: str, answers: list[str]) -> str:
prompt_answers = "\n".join(f"Answer #{index+1} is:\n{answer}" for index, answer in enumerate(answers))
lines = ["# The question to grade is:", query, "# The given answers are:", prompt_answers, "# Please grade."]
return "\n".join(lines)
@staticmethod
def _build_single_answer_message(query: str, answer: str) -> str:
lines = ["# The question to grade is:", query, "# The given answer is:", answer, "# Please grade."]
return "\n".join(lines)
@staticmethod
def _build_system_prompt_multiple_answers() -> str:
"""Build prompt that will be sent to the inner `LLM`."""
task_description = [
"# You are a grading expert, grading how accurate and relevant multiple answers are to a given question.",
"# Your grade will only be based on the given answer.",
"# The list of given answers is formatted precisely as:",
"Answer #{index} is:",
"{answer}",
"# INSTRUCTIONS: ",
"# Give a grade from 0.0 to 10.0",
"# Same answers must have the same grade.",
"# Irrelevant or empty answer must be graded 0.0",
"# For each given answer, your grade will be formatted precisely as:",
"grade #{index}: {grade as float} - short justification",
]
prompt = "\n".join(task_description)
return prompt
@staticmethod
def _build_system_prompt_single_answer() -> str:
"""Build prompt that will be sent to the inner `LLM`."""
task_description = [
"# You are a grading expert, grading how accurate and relevant an answer is to a given question.",
"# INSTRUCTIONS: ",
"# Give a grade from 0.0 to 10.0",
"# Irrelevant or empty answer must be graded 0.0",
"# Your grade will be formatted precisely as:",
"grade: {grade as float} - short justification",
]
prompt = "\n".join(task_description)
return prompt