Source code for council.evaluators.llm_evaluator

"""
LLMEvaluator implementation.

This evaluator uses the given `LLM` to evaluate the chain's responses.
"""
from typing import List

from council.contexts import AgentContext, ChatMessage, ContextLogger, ScoredChatMessage
from council.evaluators import EvaluatorBase
from council.llm import LLMBase, LLMMessage, MonitoredLLM
from council.utils import Option


class LLMEvaluator(EvaluatorBase):
    """Evaluator using an `LLM` to evaluate chain responses."""

    def __init__(self, llm: LLMBase):
        """
        Build a new LLMEvaluator.

        :param llm: model to use for the evaluation.
        """
        super().__init__()
        # The LLM is wrapped in a MonitoredLLM and registered as a monitor of this evaluator.
        self._llm = self.register_monitor(MonitoredLLM("llm", llm))

    @property
    def llm(self) -> LLMBase:
        """
        the LLM used by the evaluator
        """
        return self._llm.inner

    def _execute(self, context: AgentContext) -> List[ScoredChatMessage]:
        """
        Grade the last message of every chain against the last user message.

        :param context: agent context providing the chat history, the chains and the logger.
        :return: one scored agent message per (chain result, parsed grade) pair.
        """
        query = context.chat_history.try_last_user_message.unwrap()
        chain_results = [
            chain_messages.try_last_message.unwrap()
            for chain_messages in context.chains
            if chain_messages.try_last_message.is_some()
        ]

        # Build prompt to send to the inner LLM
        llm_response = self._call_llm(context, query, chain_results)

        # Parse LLM response with the score for each message we want to score.
        # Lines are stripped first so that an indented "grade ..." line is not
        # silently skipped, which would shift the positional pairing below.
        stripped_lines = (line.strip() for line in llm_response.split("\n"))
        scores = [
            self._parse_eval(line, context.logger)
            for line in stripped_lines
            if line.lower().startswith("grade")
        ]

        # Grades are paired with chain results by position; `zip` stops at the shorter sequence.
        return [
            ScoredChatMessage(
                ChatMessage.agent(message=skill_message.message, data=skill_message.data), score.unwrap()
            )
            for skill_message, score in zip(chain_results, scores)
            if score.is_some()
        ]

    def _call_llm(self, context: AgentContext, query: ChatMessage, chain_results: List[ChatMessage]) -> str:
        """
        Send the grading request to the inner `LLM`.

        :param context: agent context, used for the request and for debug logging.
        :param query: the user message the answers are graded against.
        :param chain_results: the answers to grade.
        :return: the first choice of the LLM result, or an empty string when there is nothing to grade.
        """
        messages = self._build_llm_messages(query, chain_results)
        if not messages:
            return ""

        result = self._llm.post_chat_request(context, messages=messages)
        llm_response = result.first_choice
        context.logger.debug(f"llm response: {llm_response}")
        return llm_response

    def _build_llm_messages(self, query: ChatMessage, skill_messages: List[ChatMessage]) -> List[LLMMessage]:
        """
        Build the system and user messages for the grading request.

        :param query: the user message the answers are graded against.
        :param skill_messages: the answers to grade.
        :return: an empty list when there is no answer, otherwise a system message followed by a user message.
        """
        if not skill_messages:
            return []

        # A single answer uses a dedicated prompt without answer indexes.
        if len(skill_messages) == 1:
            prompt = self._build_system_prompt_single_answer()
            return [
                LLMMessage.system_message(prompt),
                LLMMessage.user_message(self._build_single_answer_message(query.message, skill_messages[0].message)),
            ]

        responses = [skill_message.message for skill_message in skill_messages]
        prompt = self._build_system_prompt_multiple_answers()
        return [
            LLMMessage.system_message(prompt),
            LLMMessage.user_message(self._build_multiple_answers_message(query.message, responses)),
        ]

    def _parse_eval(self, line: str, logger: ContextLogger) -> Option[float]:
        """
        Parse the evaluation response from the inner `LLM`.

        :param line: a single grade line, e.g. ``grade #1: 7.5 - short justification``.
        :param logger: logger used to report parsing failures.
        :return: the parsed grade wrapped in an `Option`.
        :raises ValueError: when the grade field is not a valid float.
        :raises Exception: any other parsing failure (e.g. no separator in the line) is logged and re-raised.
        """
        # Turn the "-" before the justification into ":" so that the grade is
        # always the second ":"-separated field, with or without an index.
        line = line.lower().removeprefix("grade").strip().replace("-", ":")
        try:
            score = line.split(":", 3)
            return Option.some(float(score[1]))
        except ValueError:
            logger.exception(f'message="could not parse score" line="{line}"')
            raise
        except Exception:
            logger.exception(f'message="could not parse evaluation response" line="{line}"')
            raise

    @staticmethod
    def _build_multiple_answers_message(query: str, answers: List[str]) -> str:
        """Build the user message listing the question and every answer, numbered from 1."""
        prompt_answers = "\n".join(f"Answer #{index+1} is:\n{answer}" for index, answer in enumerate(answers))
        lines = ["# The question to grade is:", query, "# The given answers are:", prompt_answers, "# Please grade."]
        return "\n".join(lines)

    @staticmethod
    def _build_single_answer_message(query: str, answer: str) -> str:
        """Build the user message holding the question and its single answer."""
        lines = ["# The question to grade is:", query, "# The given answer is:", answer, "# Please grade."]
        return "\n".join(lines)

    @staticmethod
    def _build_system_prompt_multiple_answers() -> str:
        """Build prompt that will be sent to the inner `LLM`."""
        task_description = [
            "# You are a grading expert, grading how accurate and relevant multiple answers are to a given question.",
            "# Your grade will only be based on the given answer.",
            "# The list of given answers is formatted precisely as:",
            "Answer #{index} is:",
            "{answer}",
            "# INSTRUCTIONS: ",
            "# Give a grade from 0.0 to 10.0",
            "# Same answers must have the same grade.",
            "# Irrelevant or empty answer must be graded 0.0",
            "# For each given answer, your grade will be formatted precisely as:",
            "grade #{index}: {grade as float} - short justification",
        ]
        prompt = "\n".join(task_description)
        return prompt

    @staticmethod
    def _build_system_prompt_single_answer() -> str:
        """Build prompt that will be sent to the inner `LLM`."""
        task_description = [
            "# You are a grading expert, grading how accurate and relevant an answer is to a given question.",
            "# INSTRUCTIONS: ",
            "# Give a grade from 0.0 to 10.0",
            "# Irrelevant or empty answer must be graded 0.0",
            "# Your grade will be formatted precisely as:",
            "grade: {grade as float} - short justification",
        ]
        prompt = "\n".join(task_description)
        return prompt