# Source code for mixedvoices.evaluation.eval_run

import os
import time
from typing import TYPE_CHECKING, List, Optional, Type
from uuid import uuid4

import mixedvoices.constants as constants
from mixedvoices.evaluation.eval_agent import EvalAgent
from mixedvoices.utils import load_json, save_json

if TYPE_CHECKING:
    from mixedvoices import BaseAgent  # pragma: no cover


def get_info_path(project_id, version_id, eval_id, run_id):
    """Return the path of the ``info.json`` file for one evaluation run.

    The layout is
    ``<PROJECTS_FOLDER>/<project_id>/evals/<eval_id>/versions/<version_id>/runs/<run_id>/info.json``.

    Args:
        project_id: Identifier of the project.
        version_id: Identifier of the version.
        eval_id: Identifier of the evaluator.
        run_id: Identifier of the run.

    Returns:
        str: Path to the run's ``info.json`` (the file is not created here).
    """
    # Note the on-disk order: the eval folder comes before the version folder,
    # even though version_id precedes eval_id in the argument list.
    segments = (
        project_id,
        "evals",
        eval_id,
        "versions",
        version_id,
        "runs",
        run_id,
    )
    run_folder = os.path.join(constants.PROJECTS_FOLDER, *segments)
    return os.path.join(run_folder, "info.json")


# TODO: add support for resuming an interrupted run.


class EvalRun:
    """Tracks a single run of Evaluator.

    A run owns one ``EvalAgent`` per test case and persists its own state
    (prompt, metrics, test cases, progress flags, error and last-update
    timestamp) to the ``info.json`` file returned by ``get_info_path``.
    """

    # Seconds without a save after which a started-but-unfinished run is
    # reported as "INTERRUPTED" instead of "IN PROGRESS".
    _STALE_AFTER_SECONDS = 300

    def __init__(
        self,
        run_id: str,
        project_id: str,
        version_id: str,
        eval_id: str,
        agent_prompt: str,
        metric_names: List[str],
        test_cases: List[str],
        verbose: bool = True,
        created_at: Optional[int] = None,
        eval_agents: Optional[List[EvalAgent]] = None,
        started: bool = False,
        ended: bool = False,
        error: Optional[str] = None,
        last_updated: Optional[int] = None,
    ):
        """Create (or rehydrate) a run and write its state to disk.

        Args:
            run_id: Identifier of this run.
            project_id: Identifier of the project the run belongs to.
            version_id: Identifier of the version the run belongs to.
            eval_id: Identifier of the evaluator the run belongs to.
            agent_prompt: Prompt forwarded to every ``EvalAgent``.
            metric_names: Metric names forwarded to every ``EvalAgent``.
            test_cases: Test cases; one ``EvalAgent`` is created per entry
                when ``eval_agents`` is not supplied.
            verbose: Whether progress is printed when the run starts; also
                forwarded to newly created ``EvalAgent`` objects.
            created_at: Unix timestamp of creation. Defaults to the current
                time when falsy.
            eval_agents: Already constructed agents (used when loading). A
                falsy value (``None`` or an empty list) causes fresh agents
                to be created from ``test_cases``.
            started: Whether the run has already been started.
            ended: Whether the run has already finished.
            error: Error message recorded by a previous failed run, if any.
            last_updated: Unix timestamp of the last save. When ``None`` the
                current time is used.
        """
        self._run_id = run_id
        self._project_id = project_id
        self._version_id = version_id
        self._eval_id = eval_id

        self._agent_prompt = agent_prompt
        self._metric_names = metric_names
        self._test_cases = test_cases
        self._verbose = verbose
        self._created_at = created_at or int(time.time())
        # NOTE: ``or`` means an empty list is treated like ``None`` and new
        # agents (with new ids) are generated from the test cases.
        self._eval_agents = eval_agents or [
            EvalAgent(
                uuid4().hex,
                project_id,
                version_id,
                eval_id,
                run_id,
                agent_prompt,
                test_case,
                metric_names,
                verbose,
            )
            for test_case in self._test_cases
        ]
        self._started = started
        self._ended = ended
        self._error = error
        self._last_updated = last_updated
        # Keep a timestamp restored from disk intact; otherwise every load
        # would make a stalled run look freshly updated.
        self._save(update_timestamp=last_updated is None)

    @property
    def id(self) -> str:
        """Get the id of the EvalRun"""
        return self._run_id

    @property
    def project_id(self) -> str:
        """Get the identifier of the Project this run belongs to"""
        return self._project_id

    @property
    def version_id(self) -> str:
        """Get the identifier of the Version this run belongs to"""
        return self._version_id

    @property
    def eval_id(self) -> str:
        """Get the id of the Evaluator"""
        return self._eval_id

    def run(
        self,
        agent_class: Type["BaseAgent"],
        agent_starts: Optional[bool],
        **kwargs,
    ):
        """Runs the evaluator and saves the results.

        Args:
            agent_class (Type[BaseAgent]): The agent class to evaluate
            agent_starts (Optional[bool]): Whether the agent starts the conversation or not.
                If True, the agent starts the conversation
                If False, the evaluator starts the conversation
                If None, random choice
            **kwargs: Keyword arguments to pass to the agent class

        Raises:
            ValueError: If this run was already started.
            RuntimeError: If evaluating any test case raises; the original
                exception is chained and the error message is persisted.
        """
        if self._started:
            raise ValueError(
                "This run was already started. Create a new run to test again."
            )

        if self._verbose:
            print(f"Starting Evaluation of {len(self._test_cases)} Test Cases")
        self._started = True
        # Persist the started flag right away so the state on disk does not
        # stay "PENDING" while the first test case is still being evaluated.
        self._save()
        for i, eval_agent in enumerate(self._eval_agents):
            try:
                eval_agent.evaluate(agent_class, agent_starts, i + 1, **kwargs)
            except Exception as e:
                self._error = f"Error Source: EvalRun Run \nError: {str(e)}"
                self._save()
                raise RuntimeError(f"Error evaluating agent: {str(e)}") from e
            # Saving after each test case refreshes the last-updated timestamp.
            self._save()
        self._ended = True
        # Persist completion; without this a reloaded run never reports
        # "COMPLETED".
        self._save()

    @property
    def status(self) -> str:
        """Returns the status of the run as a string

        One of ``"FAILED"``, ``"PENDING"``, ``"COMPLETED"``, ``"IN PROGRESS"``
        or ``"INTERRUPTED"``. A started, unfinished run counts as in progress
        only if it was saved less than 300 seconds ago.
        """
        if self._error:
            return "FAILED"
        if not self._started:
            return "PENDING"
        if self._ended:
            return "COMPLETED"
        last_updated = self._last_updated
        if last_updated is None:
            # No recorded save time: the run cannot be shown to be alive.
            return "INTERRUPTED"
        if int(time.time()) - last_updated < self._STALE_AFTER_SECONDS:
            return "IN PROGRESS"
        return "INTERRUPTED"

    @property
    def results(self) -> List[dict]:
        """Returns the results of the run as a list of dictionaries each representing a test case's results"""
        return [agent.results() for agent in self._eval_agents]

    @property
    def info(self) -> dict:
        """Get the info of the run as a dictionary"""
        return {
            "project_id": self.project_id,
            "version_id": self.version_id,
            "eval_id": self.eval_id,
            "run_id": self.id,
            "created_at": self._created_at,
        }

    @property
    def _path(self) -> str:
        """Path of this run's ``info.json`` file."""
        return get_info_path(self.project_id, self.version_id, self.eval_id, self.id)

    def _save(self, update_timestamp: bool = True):
        """Write the run's state to its ``info.json`` file.

        Args:
            update_timestamp: When True (default) the last-updated timestamp
                is set to the current time before writing.
        """
        if update_timestamp:
            self._last_updated = int(time.time())
        os.makedirs(os.path.dirname(self._path), exist_ok=True)
        # Every key written here except "eval_agent_ids" must be a valid
        # ``__init__`` keyword, because ``_load`` passes the dict to ``cls(**d)``.
        d = {
            "agent_prompt": self._agent_prompt,
            "metric_names": self._metric_names,
            "test_cases": self._test_cases,
            "created_at": self._created_at,
            "eval_agent_ids": [a.id for a in self._eval_agents],
            "started": self._started,
            "ended": self._ended,
            "error": self._error,
            "last_updated": self._last_updated,
        }
        save_json(d, self._path)

    @classmethod
    def _load(cls, project_id, version_id, eval_id, run_id) -> Optional["EvalRun"]:
        """Load a run from disk.

        Returns:
            The reconstructed ``EvalRun``, or ``None`` if its ``info.json``
            file does not exist.
        """
        load_path = get_info_path(project_id, version_id, eval_id, run_id)
        try:
            d = load_json(load_path)
        except FileNotFoundError:
            return None

        eval_agent_ids = d.pop("eval_agent_ids")
        eval_agents = [
            EvalAgent._load(project_id, version_id, eval_id, run_id, agent_id)
            for agent_id in eval_agent_ids
        ]
        # Drop agents for which ``EvalAgent._load`` returned a falsy value.
        eval_agents = [a for a in eval_agents if a]

        d.update(
            {
                "project_id": project_id,
                "version_id": version_id,
                "eval_id": eval_id,
                "run_id": run_id,
                "eval_agents": eval_agents,
            }
        )

        return cls(**d)