Source code for mixedvoices.evaluation.evaluator

import os
import time
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
from uuid import uuid4

import mixedvoices as mv
import mixedvoices.constants as constants
from mixedvoices.evaluation.eval_run import EvalRun
from mixedvoices.utils import load_json, save_json

if TYPE_CHECKING:
    from mixedvoices import BaseAgent  # pragma: no cover
    from mixedvoices.core.version import Version  # pragma: no cover


def get_info_path(project_id: str, eval_id: str):
    """Return the path of the ``info.json`` file for an evaluator.

    The file lives at ``<PROJECTS_FOLDER>/<project_id>/evals/<eval_id>/info.json``,
    where ``PROJECTS_FOLDER`` comes from ``mixedvoices.constants``.

    Args:
        project_id (str): Id of the project the evaluator belongs to
        eval_id (str): Id of the evaluator
    """
    eval_folder = os.path.join(
        constants.PROJECTS_FOLDER, project_id, "evals", eval_id
    )
    return os.path.join(eval_folder, "info.json")


class Evaluator:
    """Evaluator is a reusable collection of test cases and metrics to test
    model performance.

    It can be run multiple times across different versions to track
    performance. The evaluator's metadata is written to the ``info.json``
    file located by ``get_info_path`` whenever an instance is constructed
    and whenever a new eval run is registered.
    """

    def __init__(
        self,
        eval_id: str,
        project_id: str,
        metric_names: List[str],
        test_cases: List[str],
        created_at: Optional[int] = None,
        eval_runs: Optional[Dict[str, EvalRun]] = None,
    ):
        """Create the evaluator and immediately persist its metadata.

        Args:
            eval_id (str): Id of the evaluator
            project_id (str): Id of the project the evaluator belongs to
            metric_names (List[str]): Names of the metrics to evaluate
            test_cases (List[str]): Test cases to evaluate
            created_at (Optional[int]): Creation timestamp in seconds.
                Defaults to the current time when omitted
            eval_runs (Optional[Dict[str, EvalRun]]): Existing eval runs
                keyed by run id. Defaults to an empty mapping
        """
        self._eval_id = eval_id
        self._project_id = project_id
        self._metric_names = metric_names
        self._test_cases = test_cases
        self._created_at = created_at or int(time.time())
        self._eval_runs = eval_runs or {}
        # The project is loaded lazily on first access of ``_project``.
        self._cached_project = None
        self._save()

    @property
    def id(self) -> str:
        """Get the id of the Evaluator"""
        return self._eval_id

    @property
    def project_id(self) -> str:
        """Get the name of the Project"""
        return self._project_id

    @property
    def metric_names(self) -> List[str]:
        """List of metric names to be evaluated"""
        return self._metric_names

    @property
    def test_cases(self) -> List[str]:
        """List of test cases to be evaluated"""
        return self._test_cases

    @property
    def info(self) -> Dict[str, Any]:
        """Get the info of the evaluator as a dictionary"""
        return {
            "eval_id": self.id,
            "created_at": self._created_at,
            "num_prompts": len(self.test_cases),
            "num_eval_runs": len(self.list_eval_runs()),
            "metric_names": self.metric_names,
        }

    def list_eval_runs(self, version_id: Optional[str] = None) -> List[EvalRun]:
        """List of eval runs

        Args:
            version_id (Optional[str]): If given, only runs of this version
                are returned. Defaults to None, which returns every run

        Raises:
            KeyError: If ``version_id`` is given but is not a version of
                the evaluator's project
        """
        if version_id and version_id not in self._project.version_ids:
            raise KeyError(
                f"Version {version_id} not found in project {self.project_id}"
            )

        all_runs = list(self._eval_runs.values())
        if version_id:
            all_runs = [run for run in all_runs if run.version_id == version_id]
        return all_runs

    def load_eval_run(self, run_id: str) -> EvalRun:
        """Load an eval run from id

        Args:
            run_id (str): The id of the eval run

        Raises:
            KeyError: If no eval run with this id is registered
        """
        if run_id not in self._eval_runs:
            raise KeyError(f"Eval run {run_id} not found")
        return self._eval_runs[run_id]

    def run(
        self,
        version: "Version",
        agent_class: Type["BaseAgent"],
        agent_starts: Optional[bool],
        verbose: bool = True,
        **kwargs,
    ) -> EvalRun:
        """Runs the evaluator and saves the results.

        Args:
            version (Version): The version of the project to evaluate
            agent_class (Type[BaseAgent]): The agent class to evaluate
            agent_starts (Optional[bool]): Whether the agent starts the
                conversation or not.
                If True, the agent starts the conversation
                If False, the evaluator starts the conversation
                If None, random choice
            verbose (bool): Whether to print testing conversation and scores.
                Defaults to True
            **kwargs: Keyword arguments to pass to the agent class

        Returns:
            EvalRun: The eval run created for this invocation

        Raises:
            ValueError: If ``version`` does not belong to the evaluator's
                project
        """
        run_id = uuid4().hex
        project = self._project
        version_id = version.id
        if version_id not in project.version_ids:
            raise ValueError("Evaluator can only be run on a version of the project")

        prompt = version._prompt
        run = EvalRun(
            run_id,
            self.project_id,
            version_id,
            self.id,
            prompt,
            self._metric_names,
            self._test_cases,
            verbose,
        )
        # Register and persist the run before executing it, so the run id is
        # already recorded in info.json while the run is in progress.
        self._eval_runs[run_id] = run
        self._save()
        run.run(agent_class, agent_starts, **kwargs)
        return run

    @property
    def _project(self):
        """The project this evaluator belongs to, loaded once and cached."""
        if self._cached_project is None:
            self._cached_project = mv.load_project(self.project_id)
        return self._cached_project

    @property
    def _path(self):
        """Path of the info.json file holding this evaluator's metadata."""
        return get_info_path(self.project_id, self.id)

    def _save(self):
        """Write the evaluator's metadata to its info.json file."""
        os.makedirs(os.path.dirname(self._path), exist_ok=True)
        # Run ids and their version ids are stored as two parallel lists;
        # ``_load`` zips them back together to locate each run.
        d = {
            "metric_names": self._metric_names,
            "test_cases": self._test_cases,
            "created_at": self._created_at,
            "eval_run_ids": list(self._eval_runs.keys()),
            "eval_run_version_ids": [
                run.version_id for run in self._eval_runs.values()
            ],
        }
        save_json(d, self._path)

    @classmethod
    def _load(cls, project_id, eval_id):
        """Rebuild an evaluator from its info.json file.

        Returns:
            The loaded evaluator, or None when the info file does not exist.
        """
        load_path = get_info_path(project_id, eval_id)
        try:
            d = load_json(load_path)
        except FileNotFoundError:
            return None

        eval_run_ids = d.pop("eval_run_ids")
        eval_run_version_ids = d.pop("eval_run_version_ids")
        eval_runs = {
            run_id: EvalRun._load(project_id, version_id, eval_id, run_id)
            for run_id, version_id in zip(eval_run_ids, eval_run_version_ids)
        }

        # The remaining keys of ``d`` (metric_names, test_cases, created_at)
        # match constructor parameters, so the dict is passed as kwargs.
        d.update(
            {
                "project_id": project_id,
                "eval_id": eval_id,
                "eval_runs": eval_runs,
            }
        )
        return cls(**d)