# Source code for council.evaluators.llm_evaluator

"""
LLMEvaluator implementation.

This evaluator uses the given `LLM` to evaluate the chain's responses.
"""
import logging
from typing import List

from council.contexts import AgentContext, ScoredChatMessage, ChatMessage
from council.evaluators import EvaluatorBase
from council.llm import LLMBase, LLMMessage
from council.runners import Budget
from council.utils import Option


class LLMEvaluator(EvaluatorBase):
    """Evaluator using an `LLM` to evaluate chain responses."""

    def __init__(self, llm: LLMBase):
        """
        Build a new LLMEvaluator.

        :param llm: model to use for the evaluation.
        """
        super().__init__()
        self.llm = llm

    def execute(self, context: AgentContext, budget: Budget) -> List[ScoredChatMessage]:
        """
        Score the latest message of every chain against the last user message.

        :param context: agent context holding the chat history and the per-chain histories.
        :param budget: execution budget; it is not consulted by this implementation.
        :return: the chain responses that received a score, each paired with that score.
        """
        query = context.chatHistory.try_last_user_message.unwrap()
        # Only the most recent entry of each chain history is considered, and
        # only when that entry actually holds a message.
        chain_results = [
            chain_history[-1].try_last_message.unwrap()
            for chain_history in context.chainHistory.values()
            if chain_history[-1].try_last_message.is_some()
        ]
        return self.__score_responses(query=query, skill_messages=chain_results)

    def __score_responses(self, query: ChatMessage, skill_messages: List[ChatMessage]) -> List[ScoredChatMessage]:
        """
        Score agent response.

        :param query: Query used to build the responses.
        :param skill_messages: Responses generated by the chain.
        :return: list of scored messages.
        :raises ValueError: when a non-blank line of the LLM answer cannot be parsed.
        """
        # Build prompt to send to the inner LLM
        responses = [skill_message.message for skill_message in skill_messages]
        prompt = self.__build_prompt(query.message, responses=responses)

        # Send prompt to inner LLM
        messages = [LLMMessage.system_message(prompt)]
        llm_response = self.llm.post_chat_request(messages=messages)[0]

        # Parse LLM response with the score for each message we want to score.
        # Blank lines (typically a trailing newline) carry no evaluation, so they
        # are skipped instead of being handed to the parser, which would raise.
        scores = [self.__parse_eval(line) for line in llm_response.split("\n") if line.strip()]

        # Scores are matched to responses by position; `zip` stops at the
        # shorter of the two sequences.
        return [
            ScoredChatMessage(
                ChatMessage.agent(message=skill_message.message, data=skill_message.data), score.unwrap()
            )
            for skill_message, score in zip(skill_messages, scores)
            if score.is_some()
        ]

    @staticmethod
    def __parse_eval(line: str) -> Option[int]:
        """
        Parse the evaluation response from the inner `LLM`.

        :param line: one line of the answer, expected to look like `response {index}: {score}`.
        :return: the parsed score wrapped in an `Option`.
        :raises ValueError: when the line has no `:` separator or the score is not an integer.
        """
        line = line.removeprefix("response").strip()
        try:
            # Split on the first colon only, so the result always fits the
            # two-name unpacking when a separator is present.
            _response, score = line.split(":", 1)
            value = int(score)
        except ValueError:
            logging.exception('message="could not parse response evaluation"')
            raise
        return Option.some(value)

    @staticmethod
    def __build_prompt(query: str, responses: List[str]) -> str:
        """
        Build prompt that will be sent to the inner `LLM`.

        :param query: the question the responses are answering.
        :param responses: the candidate answers, numbered from 0 in the prompt.
        :return: the newline-joined grading prompt.
        """
        answers = "\n".join(f"response {index}:\n{response}\n------" for index, response in enumerate(responses))
        task_description = [
            "You are grading the following question:",
            query,
            "Answers are given as a response (response: {response})",
            "Answers are separated with `------`",
            "You are grading how informative each of the following response is:",
            answers,
            "# Instructions: ",
            "# What grade do you give from 0 to 10",
            "# For each answer, you will answer exactly with `response {index}: {score}`",
            "# You will not provide any justification",
        ]
        return "\n".join(task_description)