# Source code for mixedvoices.evaluation.test_case_generator

import tempfile
from typing import TYPE_CHECKING, List, Literal, Optional

from tqdm import tqdm

from mixedvoices import models
from mixedvoices.core.utils import get_transcript_and_duration
from mixedvoices.utils import get_openai_client

if TYPE_CHECKING:
    from mixedvoices.core.project import Project  # pragma: no cover
    from mixedvoices.core.version import Version  # pragma: no cover

# Prompt templates used by generate_test_cases() to ask the model for
# TESTING-agent prompts. They are sent verbatim, so whitespace and wording
# inside the literals are part of the program's behavior.

# TODO: This style doesn't encapsulate transcription errors
# System message: defines the four fields every generated prompt must contain
# (Info, Personality, Call Objective, Call Path).
SYSTEM_PROMPT = """You're an expert at creating PROMPTS for TESTING agents to evaluate REAL agent.
    Prompt Structure (Each field should be inline, no bullets/numbers):-
    Info i.e name and age for eg. John Doe, 30
    Personality i.e. Talking style, quirks, 1-2 lines, don't use terms like Type A/B etc. Don't include speed, pauses, modulation, this is text only.
    Call Objective 1-3 lines, include who you are calling here as well
    Call Path, represent like A->B->C..->Farewell where A, B, C are steps, ALWAYS end with Farewell
    """  # noqa E501

# First part of the user message; `{agent_prompt}` is filled with the prompt of
# the agent under test.
START_PROMPT = """REAL agent prompt:
----
{agent_prompt}
----"""

# Optional extra user message, only sent when demographic info is supplied;
# `{user_demographic_info}` is filled with that caller-provided text.
DEMOGRAPHIC_PROMPT = """User Demographic (try to simulate such personalities and info)
----
{user_demographic_info}
----
"""

# Output-format instructions used when more than one prompt is requested.
# The "----" separator shown here is the delimiter generate_test_cases()
# splits the response on.
STRUCTURE_PROMPT_MULTIPLE = """Give distinct prompts.
Output structure below. Don't add blank lines b/w fields.
Prompts:-
----
Info: ..
Personality: ..
Call Objective: ..
Call Path: ..
----
Info: ..
Personality: ..
Call Objective: ..
Call Path: ..
----
and so on
"""

# Output-format instructions used when exactly one prompt is requested.
STRUCTURE_PROMPT_SINGLE = """Output structure below. Don't add blank lines b/w fields.
Prompts:-
----
Info: ..
Personality: ..
Call Objective: ..
Call Path: ..
----
"""

# Appended as a trailing assistant message so the conversation already ends
# with the start of the expected output format.
OUTPUT_PROMPT = "Prompts:-\n----"


def get_prompt_part(count):
    """Return the phrase describing how many TESTING agent prompts to request.

    Args:
        count: Number of prompts that will be asked for.

    Returns:
        A singular phrase when ``count`` is exactly 1, otherwise a plural
        phrase that embeds ``count``.
    """
    if count == 1:
        return "a single TESTING agent prompt"
    return f"{count} different TESTING agent prompts"


def generate_test_cases(
    agent_prompt: str,
    generation_instruction: str,
    count: int,
    user_demographic_info: Optional[str] = None,
):
    """Ask the model for ``count`` TESTING-agent prompts and return them.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        generation_instruction: Instruction describing what kind of test
            prompts to generate (e.g. follow a path, recreate a transcript).
        count: Number of prompts expected back from the model.
        user_demographic_info: Optional demographic description. When given,
            it is sent as an additional user message.

    Returns:
        A list of ``count`` stripped prompt strings.

    Raises:
        AssertionError: If the number of prompts parsed from the response is
            not exactly ``count``.
    """
    start_prompt = START_PROMPT.format(agent_prompt=agent_prompt)

    structure_prompt = (
        STRUCTURE_PROMPT_SINGLE if count == 1 else STRUCTURE_PROMPT_MULTIPLE
    )
    user_prompt = f"{start_prompt}\n{generation_instruction}\n{structure_prompt}"
    client = get_openai_client()
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    if user_demographic_info:
        demographic_prompt = DEMOGRAPHIC_PROMPT.format(
            user_demographic_info=user_demographic_info
        )
        messages.append({"role": "user", "content": demographic_prompt})

    # Trailing assistant message so the conversation already ends with the
    # beginning of the expected output format.
    messages.append({"role": "assistant", "content": OUTPUT_PROMPT})
    completion = client.chat.completions.create(
        model=models.TEST_CASE_GENERATOR_MODEL,
        messages=messages,
    )
    # The message content can be None (e.g. a refusal); treat it as empty so
    # the count check below reports the failure instead of an AttributeError.
    response_text = completion.choices[0].message.content or ""

    # Prompts are delimited by "----". Fragments of 50 characters or fewer
    # are dropped; presumably these are headers or empty pieces, not prompts.
    prompts = [
        fragment.strip()
        for fragment in response_text.split("----")
        if len(fragment.strip()) > 50
    ]

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O. The exception type is unchanged.
    # TODO: Add retries
    if len(prompts) != count:
        raise AssertionError(
            f"Expected {count} generated prompt(s) but parsed {len(prompts)}"
        )
    return prompts


# TODO: Use this in future
# def generate_test_cases_for_failure_reasons(
#     agent_prompt: str,
#     failure_reasons: List[str],
#     count: int = 2,
#     user_demographic_info: Optional[str] = None,
# ):
#     test_cases = []
#     for failure_reason in failure_reasons:
#         part = get_prompt_part(count)
#         instruction = (
#             f"Generate {part} that try to recreate this failure: {failure_reason}"
#         )
#         test_cases.extend(
#             generate_test_cases(agent_prompt, instruction, count, user_demographic_info)
#         )
#     return test_cases


def generate_test_cases_from_paths(
    agent_prompt: str,
    paths: List[str],
    count_per_path=2,
    user_demographic_info: Optional[str] = None,
    progress=None,
):
    """Generate test cases that follow each of the given conversation paths.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        paths: Conversation paths; ``count_per_path`` test cases are
            generated for each one.
        count_per_path: Number of test cases to generate per path.
        user_demographic_info: Optional demographic description forwarded to
            ``generate_test_cases``.
        progress: Optional progress bar exposing ``set_postfix`` and
            ``update``. When omitted, no progress is reported.

    Returns:
        A flat list with the generated test cases of all paths, in path order.
    """
    test_cases = []
    # `progress` is optional (defaults to None), so every use must be guarded.
    if progress:
        progress.set_postfix({"progress": f"0/{len(paths)} paths processed"})
    for idx, path in enumerate(paths, 1):
        part = get_prompt_part(count_per_path)
        instruction = f"Generate {part} that follow this path: {path}"
        test_cases.extend(
            generate_test_cases(
                agent_prompt, instruction, count_per_path, user_demographic_info
            )
        )
        if progress:
            progress.set_postfix({"progress": f"{idx}/{len(paths)} paths processed"})
            progress.update(count_per_path)
    return test_cases


def generate_test_cases_for_edge_cases(
    agent_prompt: str,
    count: int = 2,
    user_demographic_info: Optional[str] = None,
    progress=None,
):
    """Generate test cases that simulate tricky edge cases.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        count: Number of edge-case test cases to generate in a single request.
        user_demographic_info: Optional demographic description forwarded to
            ``generate_test_cases``.
        progress: Optional progress bar; when provided its description and
            postfix are updated and it is advanced by ``count``.

    Returns:
        The list of generated edge-case test cases.
    """
    if progress:
        progress.set_description("Generating Edge Cases")
        progress.set_postfix({"progress": f"0/{count} cases processed"})

    edge_cases = generate_test_cases(
        agent_prompt,
        f"Generate {get_prompt_part(count)} that simulate tricky edge cases.",
        count,
        user_demographic_info,
    )

    if not progress:
        return edge_cases

    # All cases come back from one request, so progress jumps straight to done.
    progress.set_postfix({"progress": f"{count}/{count} cases processed"})
    progress.update(count)
    return edge_cases


def generate_test_cases_from_transcripts(
    agent_prompt: str,
    transcripts: List[str],
    count: int = 1,
    user_demographic_info: Optional[str] = None,
    progress=None,
):
    """Generate test cases that try to recreate the given transcripts.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        transcripts: Transcripts to recreate; ``count`` test cases are
            generated for each one.
        count: Number of test cases to generate per transcript.
        user_demographic_info: Optional demographic description forwarded to
            ``generate_test_cases``.
        progress: Optional progress bar; when provided its description and
            postfix are updated and it is advanced by ``count`` per transcript.

    Returns:
        A flat list with the generated test cases, in transcript order.
    """
    if progress:
        progress.set_description("Generating from Transcripts")
        progress.set_postfix(
            {"progress": f"0/{len(transcripts)} transcripts processed"}
        )

    generated = []
    for done, transcript in enumerate(transcripts, start=1):
        instruction = (
            f"Generate {get_prompt_part(count)} that try to recreate this "
            f"transcript: {transcript}. You will simulate the USER."
        )
        generated += generate_test_cases(
            agent_prompt, instruction, count, user_demographic_info
        )
        if progress:
            progress.set_postfix(
                {"progress": f"{done}/{len(transcripts)} transcripts processed"}
            )
            progress.update(count)
    return generated


def generate_test_cases_from_recordings(
    agent_prompt: str,
    recording_paths: List[str],
    user_channels: List[str],
    user_demographic_info: Optional[str] = None,
    progress=None,
):
    """Transcribe recordings, then generate test cases from the transcripts.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        recording_paths: Paths of the recordings to transcribe.
        user_channels: User channel for each recording, paired positionally
            with ``recording_paths``.
        user_demographic_info: Optional demographic description forwarded to
            the transcript-based generation.
        progress: Optional progress bar; its description and postfix are
            updated while recordings are transcribed, and it is then handed
            on to the transcript-based generation.

    Returns:
        The test cases produced by ``generate_test_cases_from_transcripts``
        for the transcribed recordings.
    """
    if progress:
        progress.set_description("Processing Recordings")
        progress.set_postfix(
            {"progress": f"0/{len(recording_paths)} recordings processed"}
        )

    transcripts = []
    # Scratch directory handed to the transcription helper; it is removed
    # when the `with` block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pairs = zip(recording_paths, user_channels)
        for done, (recording, channel) in enumerate(pairs, start=1):
            # Only the first element of the helper's result (the transcript)
            # is needed here.
            transcripts.append(
                get_transcript_and_duration(recording, scratch_dir, channel)[0]
            )
            if progress:
                progress.set_postfix(
                    {"progress": f"{done}/{len(recording_paths)} recordings processed"}
                )

    return generate_test_cases_from_transcripts(
        agent_prompt,
        transcripts,
        user_demographic_info=user_demographic_info,
        progress=progress,
    )


def generate_test_cases_from_descriptions(
    agent_prompt: str,
    descriptions: List[str],
    user_demographic_info: Optional[str] = None,
    progress=None,
):
    """Generate one test case for each rough description.

    Args:
        agent_prompt: Prompt of the REAL agent that the test cases target.
        descriptions: Descriptions to turn into test cases, one each.
        user_demographic_info: Optional demographic description forwarded to
            ``generate_test_cases``.
        progress: Optional progress bar; when provided its description and
            postfix are updated and it is advanced by 1 per description.

    Returns:
        A flat list with the generated test cases, in description order.
    """
    if progress:
        progress.set_description("Generating from Descriptions")
        progress.set_postfix(
            {"progress": f"0/{len(descriptions)} descriptions processed"}
        )

    generated = []
    for done, description in enumerate(descriptions, start=1):
        instruction = (
            f"Generate {get_prompt_part(1)} according to this "
            f"description: {description}"
        )
        generated += generate_test_cases(
            agent_prompt, instruction, 1, user_demographic_info
        )
        if progress:
            progress.set_postfix(
                {"progress": f"{done}/{len(descriptions)} descriptions processed"}
            )
            progress.update(1)
    return generated


class TestCaseGenerator:
    """Generate test cases for evaluation based on the prompt and user demographic info

    Sources are registered with the chainable ``add_*`` methods and turned
    into test cases by a single call to :meth:`generate`. Once test cases
    have been generated, the instance rejects further ``add_*`` / ``generate``
    calls; use a new instance instead.

    Args:
        prompt (str): The prompt of the agent to generate test for
        user_demographic_info (Optional[str]): The user demographic info.
            Include things like age group, country, accents etc
    """

    def __init__(self, prompt: str, user_demographic_info: Optional[str] = None):
        self.prompt = prompt
        self.user_demographic_info = user_demographic_info
        self.transcripts: List[str] = []
        # `recordings` and `user_channels` are parallel lists (same length).
        self.recordings: List[str] = []
        self.user_channels: List[str] = []
        # For versions and projects, the three lists are parallel: the object,
        # its paths (captured when added) and the cases requested per path.
        self.versions: List["Version"] = []
        self.versions_paths: List[List[str]] = []
        self.version_cases_per_path: List[int] = []
        self.projects: List["Project"] = []
        self.projects_paths: List[List[str]] = []
        self.project_cases_per_path: List[int] = []
        self.descriptions: List[str] = []
        self.edge_cases_count = 0
        # Filled by generate(); a non-empty list marks the generator as used.
        self.test_cases: List[str] = []

    def add_from_transcripts(self, transcripts: List[str]) -> "TestCaseGenerator":
        """Add test cases from transcripts.
        1 test case will be generated for each transcript

        Args:
            transcripts (List[str]): List of transcripts.
                Transcript should have labels for each utterance.
                Use 'user:', 'bot:' labels

        Raises:
            ValueError: If test cases have already been generated.
        """
        # Consistent with the other add_* methods: anything added after
        # generation could never be used, so reject it up front.
        self._check_generation()
        self.transcripts.extend(transcripts)
        return self

    def add_from_recordings(
        self,
        recording_paths: List[str],
        user_channel: Literal["left", "right"] = "left",
    ) -> "TestCaseGenerator":
        """Add test cases from recordings.
        1 test case will be generated for each recording

        Args:
            recording_paths (List[str]): List of recording paths.
                Use stereo recordings with user and bot on different channels.
            user_channel (str, optional): Channel of the user in the recording.
                Can be "left" or "right". Defaults to "left".

        Raises:
            ValueError: If test cases have already been generated.
        """
        self._check_generation()
        self.recordings.extend(recording_paths)
        # Keep one channel entry per recording so both lists stay aligned.
        self.user_channels.extend([user_channel] * len(recording_paths))
        return self

    def add_from_version(
        self, version: "Version", cases_per_path: int = 1
    ) -> "TestCaseGenerator":
        """Add test cases from a version.
        `cases_per_path` test cases will be generated for each path in the version

        Args:
            version (Version): Version object
            cases_per_path (int, optional): Number of test cases to generate
                for each path. Defaults to 1.

        Raises:
            ValueError: If test cases have already been generated.
        """
        self._check_generation()
        self.versions.append(version)
        self.versions_paths.append(version._get_paths())
        self.version_cases_per_path.append(cases_per_path)
        return self

    def add_from_project(
        self, project: "Project", cases_per_path: int = 1
    ) -> "TestCaseGenerator":
        """Add test cases from a project.
        `cases_per_path` test cases will be generated for each path in the project

        Args:
            project (Project): Project object
            cases_per_path (int, optional): Number of test cases to generate
                for each path. Defaults to 1.

        Raises:
            ValueError: If test cases have already been generated.
        """
        self._check_generation()
        self.projects.append(project)
        self.projects_paths.append(project._get_paths())
        self.project_cases_per_path.append(cases_per_path)
        return self

    def add_from_descriptions(self, descriptions: List[str]) -> "TestCaseGenerator":
        """Add test cases from rough descriptions.
        1 test case will be generated for each description

        Args:
            descriptions (List[str]): List of descriptions

        Raises:
            ValueError: If test cases have already been generated.
        """
        self._check_generation()
        self.descriptions.extend(descriptions)
        return self

    def add_edge_cases(self, count: int) -> "TestCaseGenerator":
        """Create test cases for edge cases where bot might fail or behave unexpectedly

        Args:
            count (int): Number of test cases to add

        Raises:
            ValueError: If test cases have already been generated.
        """
        self._check_generation()
        self.edge_cases_count += count
        return self

    @property
    def num_cases(self) -> int:
        """Total number of test cases the registered inputs will produce."""
        project_cases = sum(
            len(paths) * c
            for paths, c in zip(self.projects_paths, self.project_cases_per_path)
        )
        version_cases = sum(
            len(paths) * c
            for paths, c in zip(self.versions_paths, self.version_cases_per_path)
        )
        return sum(
            [
                len(self.transcripts),
                len(self.recordings),
                project_cases,
                version_cases,
                self.edge_cases_count,
                len(self.descriptions),
            ]
        )

    def generate(self, show_progress=True):
        """Generate test cases from all the given inputs

        Args:
            show_progress (bool, optional): Show a tqdm progress bar while
                generating. Defaults to True.

        Returns:
            The list of generated test cases, also stored in ``self.test_cases``.

        Raises:
            ValueError: If test cases were already generated, or if no inputs
                were added before calling this method.
        """
        self._check_generation("generate")
        num_cases = self.num_cases
        if num_cases == 0:
            raise ValueError(
                "No test cases generated. "
                "Use one or more of these methods before calling generate: "
                "add_from_transcripts, add_from_recordings, add_from_version, "
                "add_from_project, add_from_descriptions, add_edge_cases"
            )

        # Created before the `try` so the `finally` clause never sees an
        # unbound name if constructing the progress bar itself fails.
        progress = tqdm(total=num_cases) if show_progress else None
        try:
            if progress is not None:
                progress.set_description("Generating test cases")

            test_cases = []
            if self.recordings:
                test_cases.extend(
                    generate_test_cases_from_recordings(
                        self.prompt,
                        self.recordings,
                        self.user_channels,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            if self.transcripts:
                test_cases.extend(
                    generate_test_cases_from_transcripts(
                        self.prompt,
                        self.transcripts,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            if self.descriptions:
                test_cases.extend(
                    generate_test_cases_from_descriptions(
                        self.prompt,
                        self.descriptions,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            if self.edge_cases_count:
                test_cases.extend(
                    generate_test_cases_for_edge_cases(
                        self.prompt,
                        self.edge_cases_count,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            for version, version_paths, cases_per_path in zip(
                self.versions, self.versions_paths, self.version_cases_per_path
            ):
                # `progress` is None when show_progress is False.
                if progress is not None:
                    progress.set_description(
                        f"Generating from {version.project_id}/{version.id}'s paths"
                    )
                test_cases.extend(
                    generate_test_cases_from_paths(
                        self.prompt,
                        version_paths,
                        cases_per_path,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            for project, project_paths, cases_per_path in zip(
                self.projects, self.projects_paths, self.project_cases_per_path
            ):
                if progress is not None:
                    progress.set_description(f"Generating from {project.id}'s paths")
                test_cases.extend(
                    generate_test_cases_from_paths(
                        self.prompt,
                        project_paths,
                        cases_per_path,
                        user_demographic_info=self.user_demographic_info,
                        progress=progress,
                    )
                )

            self.test_cases = test_cases
            return test_cases
        finally:
            if progress is not None:
                progress.close()

    def _check_generation(self, operation="add"):
        """Raise ValueError if test cases have already been generated.

        Args:
            operation: Verb inserted into the error message ("add" or
                "generate").
        """
        if self.test_cases:
            raise ValueError(
                f"Can not {operation}. Test cases have already been generated. "
                "You can access them using .test_cases. "
                "Use a new TestCaseGenerator object to generate more test cases."
            )