46 changes: 28 additions & 18 deletions coolprompt/assistant.py
@@ -7,7 +7,7 @@
from coolprompt.task_detector.detector import TaskDetector
from coolprompt.data_generator.generator import SyntheticDataGenerator
from coolprompt.language_model.llm import DefaultLLM
from coolprompt.optimizer.hype import hype_optimizer
from coolprompt.optimizer.hype import HyPEOptimizer, HyPEROptimizer
from coolprompt.optimizer.reflective_prompt import reflectiveprompt
from coolprompt.optimizer.distill_prompt.run import distillprompt
from coolprompt.utils.logging_config import logger, set_verbose, setup_logging
@@ -23,10 +23,6 @@
CLASSIFICATION_TASK_TEMPLATE,
GENERATION_TASK_TEMPLATE,
)
from coolprompt.utils.prompt_templates.hype_templates import (
CLASSIFICATION_TASK_TEMPLATE_HYPE,
GENERATION_TASK_TEMPLATE_HYPE,
)
from coolprompt.utils.correction.corrector import correct
from coolprompt.utils.correction.rule import LanguageRule
from coolprompt.prompt_assistant.prompt_assistant import PromptAssistant
@@ -36,12 +32,8 @@ class PromptTuner:
"""Prompt optimization tool supporting multiple methods."""

TEMPLATE_MAP = {
(Task.CLASSIFICATION, Method.HYPE): CLASSIFICATION_TASK_TEMPLATE_HYPE,
(Task.CLASSIFICATION, Method.REFLECTIVE): CLASSIFICATION_TASK_TEMPLATE,
(Task.CLASSIFICATION, Method.DISTILL): CLASSIFICATION_TASK_TEMPLATE,
(Task.GENERATION, Method.HYPE): GENERATION_TASK_TEMPLATE_HYPE,
(Task.GENERATION, Method.REFLECTIVE): GENERATION_TASK_TEMPLATE,
(Task.GENERATION, Method.DISTILL): GENERATION_TASK_TEMPLATE,
Task.CLASSIFICATION: CLASSIFICATION_TASK_TEMPLATE,
Task.GENERATION: GENERATION_TASK_TEMPLATE,
}

def __init__(
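With the HyPE-specific templates gone, template resolution collapses from a (task, method) key to a task key shared by every method. A minimal sketch of the new lookup, assuming the Task enum and template constants named in this diff (the template strings themselves are illustrative):

from enum import Enum

class Task(Enum):
    CLASSIFICATION = "classification"
    GENERATION = "generation"

# Illustrative stand-ins for the real template constants imported above.
CLASSIFICATION_TASK_TEMPLATE = "{prompt}\n\nClassify this input:\n{sample}"
GENERATION_TASK_TEMPLATE = "{prompt}\n\nInput:\n{sample}"

TEMPLATE_MAP = {
    Task.CLASSIFICATION: CLASSIFICATION_TASK_TEMPLATE,
    Task.GENERATION: GENERATION_TASK_TEMPLATE,
}

# 'hype', 'hyper', 'reflective', and 'distill' now share one template per
# task, so the optimization method no longer participates in the lookup:
template = TEMPLATE_MAP[Task.CLASSIFICATION]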
@@ -102,7 +94,7 @@ def get_task_prompt_template(self, task: str, method: str) -> str:
The type of task, either "classification" or "generation".
method (str):
Optimization method to use.
Available methods are: ['hype', 'reflective', 'distill']
Available methods are: ['hype', 'reflective', 'distill', 'hyper']

Returns:
str: The prompt template for the given task.
@@ -113,7 +105,7 @@ def get_task_prompt_template(self, task: str, method: str) -> str:
)
task = validate_task(task)
method = validate_method(method)
return self.TEMPLATE_MAP[(task, method)]
return self.TEMPLATE_MAP[task]

def _get_dataset_split(
self,
@@ -182,7 +174,7 @@ def run(
target (Iterable):
Target iterable object for autoprompting optimization.
method (str): Optimization method to use.
Available methods are: ['hype', 'reflective', 'distill']
Available methods are: ['hype', 'reflective', 'distill', 'hyper']
Defaults to hype.
metric (str): Metric to use for optimization.
problem_description (str): a string that contains
Expand Down Expand Up @@ -297,7 +289,7 @@ def run(
prompt=start_prompt,
task=task,
problem_description=problem_description,
num_samples=generate_num_samples
num_samples=generate_num_samples,
)
self.synthetic_dataset = dataset
self.synthetic_target = target
Expand Down Expand Up @@ -329,10 +321,28 @@ def run(
logger.debug(f"Additional kwargs: {kwargs}")

if method is Method.HYPE:
final_prompt = hype_optimizer(
hype_opt = HyPEOptimizer(
model=self._target_model,
config=kwargs.get("hype_config", None),
meta_prompt=kwargs.get("hype_meta_prompt", None),
)
meta_info = {"task_description": problem_description}
if kwargs.get("meta_info", None):
meta_info.update(kwargs["meta_info"])
final_prompt = hype_opt.optimize(
prompt=start_prompt,
problem_description=problem_description,
meta_info=meta_info,
)
elif method is Method.HYPER:
hyper_opt = HyPEROptimizer(
model=self._target_model,
evaluator=evaluator,
**kwargs,
)
final_prompt = hyper_opt.optimize(
prompt=start_prompt,
dataset_split=dataset_split,
meta_info={"task_description": problem_description},
)
elif method is Method.REFLECTIVE:
final_prompt = reflectiveprompt(
@@ -360,7 +370,7 @@ )
)

logger.debug(f"Final prompt:\n{final_prompt}")
template = self.TEMPLATE_MAP[(task, method)]
template = self.TEMPLATE_MAP[task]
logger.info(f"Evaluating on given dataset for {task} task...")
self.init_metric = evaluator.evaluate(
prompt=start_prompt,
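Taken together, the new branches mean method="hyper" constructs a HyPEROptimizer with the evaluator and forwards the dataset split, while method="hype" keeps the lighter config/meta-prompt path. A hedged usage sketch; the run() parameter names below are inferred from the docstring fragments in this diff and the PromptTuner constructor is elided, so treat the exact signature as an assumption:

from coolprompt.assistant import PromptTuner

tuner = PromptTuner()  # constructor arguments are not shown in this diff

# method="hyper" routes to HyPEROptimizer.optimize(prompt=..., dataset_split=...,
# meta_info={"task_description": problem_description}); extra kwargs are
# forwarded to the optimizer's constructor as-is.
final_prompt = tuner.run(
    start_prompt="Classify the sentiment of the review.",  # name assumed
    dataset=["great movie", "terrible plot"],              # name assumed
    target=["positive", "negative"],
    method="hyper",
    metric="accuracy",
    problem_description="Binary sentiment classification.",
)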
90 changes: 70 additions & 20 deletions coolprompt/evaluator/evaluator.py
@@ -1,7 +1,7 @@
import random
from langchain_core.language_models.base import BaseLanguageModel
from typing import Optional
from dataclasses import dataclass
from typing import List, Optional

from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.messages.ai import AIMessage
from coolprompt.evaluator.metrics import BaseMetric
from coolprompt.utils.logging_config import logger
@@ -12,6 +12,22 @@
)


@dataclass
class FailedExampleDetailed:
instance: str
assistant_answer: str
model_answer_parsed: Optional[str] = None
metric_value: float | int = 0.0
ground_truth: str | int = ""


@dataclass
class EvalResultDetailed:
aggregate_score: float
score_per_task: Optional[List[float | int]] = None
failed_examples: Optional[List[FailedExampleDetailed]] = None


class Evaluator:
"""Evaluator class to perform model evaluation using a specified metric.

Expand Down Expand Up @@ -64,7 +80,6 @@ def evaluate(
logger.info(
f"Evaluating prompt for {self.task} task on {len(dataset)} samples"
)
logger.debug(f"Prompt to evaluate:\n{prompt}")
if self.task == Task.CLASSIFICATION:
self.metric.extract_labels(targets)

@@ -80,28 +95,64 @@

return self.metric.compute(answers, targets, dataset)

def _get_full_prompt(
def evaluate_detailed(
self,
prompt: str,
sample: str,
dataset: list[str],
targets: list[str | int],
template: Optional[str] = None,
) -> str:
"""Inserts parts of the prompt into the task template.
) -> EvalResultDetailed:
"""Evaluate the model and return detailed results per sample."""
if template is None:
template = self._get_default_template()

Args:
prompt (str): the main instruction for the task
sample (str): the input sample
template (Optional[str]):
Prompt template for defined task type.
If None, uses default template.
logger.info(
f"Evaluating (detailed) prompt for {self.task} task on {len(dataset)} samples"
)
if self.task == Task.CLASSIFICATION:
self.metric.extract_labels(targets)

answers = self.model.batch(
[
self._get_full_prompt(prompt, sample, template)
for sample in dataset
]
)
answers = [
a.content if isinstance(a, AIMessage) else a for a in answers
]

Raises:
ValueError: if type of task is not supported
parsed_answers = [self.metric.parse_output(a) for a in answers]
aggregate_score, score_per_task = self.metric.compute_detailed(
answers, targets
)

Returns:
str: the full prompt to be passed to the model
"""
failed_examples = []
for i, score in enumerate(score_per_task):
if score == 0:
failed_examples.append(
FailedExampleDetailed(
instance=dataset[i],
assistant_answer=answers[i],
model_answer_parsed=parsed_answers[i],
metric_value=score,
ground_truth=targets[i],
)
)

return EvalResultDetailed(
aggregate_score=aggregate_score,
score_per_task=score_per_task,
failed_examples=failed_examples,
)

def _get_full_prompt(
self,
prompt: str,
sample: str,
template: Optional[str] = None,
) -> str:
"""Inserts parts of the prompt into the task template."""
if template is None:
template = self._get_default_template()

@@ -116,7 +167,6 @@ def _get_full_prompt(

def _get_default_template(self) -> str:
"""Returns the default template for the task type."""

match self.task:
case Task.CLASSIFICATION:
return CLASSIFICATION_TASK_TEMPLATE
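The detailed path returns an EvalResultDetailed holding the batch score, one score per sample, and the zero-score failures. A minimal sketch of consuming it; the Evaluator constructor is not part of this diff, so the setup is assumed:

# Hypothetical setup; the Evaluator constructor is not shown in this diff:
# evaluator = Evaluator(model=llm, task=Task.CLASSIFICATION, metric=metric)

result = evaluator.evaluate_detailed(
    prompt="Classify the sentiment of the review.",
    dataset=["great movie", "terrible plot"],
    targets=["positive", "negative"],
)

print(result.aggregate_score)          # same value evaluate() would return
print(result.score_per_task)           # one score per input sample
for failed in result.failed_examples:  # only samples that scored 0
    print(failed.instance, failed.assistant_answer,
          failed.model_answer_parsed, failed.ground_truth)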
84 changes: 68 additions & 16 deletions coolprompt/evaluator/metrics.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import List, Optional, Tuple

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
@@ -86,7 +80,7 @@ def _compute_raw(
self,
outputs: list[str | int],
targets: list[str | int],
dataset: Optional[list[str]] = None
dataset: Optional[list[str]] = None,
) -> float:
"""Compute metric value from preprocessed model answers.

Expand Down Expand Up @@ -120,7 +120,7 @@ def compute(
self,
outputs: list[str | int],
targets: list[str | int],
dataset: Optional[list[str]] = None
dataset: Optional[list[str]] = None,
) -> float:
"""Compute metric value from text model outputs

@@ -134,19 +134,46 @@ def compute(
"""
output_labels = list(
map(
lambda x: extract_answer(
x, self.ANS_TAGS, self.FORMAT_MISMATCH_LABEL
),
lambda x: extract_answer(x, self.ANS_TAGS, self.FORMAT_MISMATCH_LABEL),
outputs,
)
)
targets = list(map(str, targets))
encoded_output_labels, encoded_targets = self._encode_labels(
output_labels, targets
)
return self._compute_raw(
encoded_output_labels, encoded_targets, dataset
)
return self._compute_raw(encoded_output_labels, encoded_targets, dataset)

def parse_output(self, output: str) -> str:
"""Extract parsed answer from model output.

Args:
output: Raw model output string.

Returns:
Extracted answer from <ans> tags, or original output if not found.
"""
return extract_answer(output, self.ANS_TAGS, format_mismatch_label=output)

def compute_detailed(
self,
outputs: list[str | int],
targets: list[str | int],
dataset: Optional[list[str]] = None,
) -> Tuple[float, List[float | int]]:
"""Compute metric value per sample and aggregate.

Returns:
Tuple of (aggregate_score, score_per_task).
score_per_task[i] - score for i-th sample.
aggregate_score - same as compute().
"""
score_per_task = []
for o, t in zip(outputs, targets):
s = self._compute_raw([o], [t], dataset)
score_per_task.append(s)
aggregate = self.compute(outputs, targets, dataset)
return aggregate, score_per_task

def __str__(self) -> str:
return self._get_name()
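The default compute_detailed scores each (output, target) pair by running _compute_raw on one-element lists, then takes the aggregate from a normal compute() over the whole batch, so for mean-style metrics the two views agree. Roughly, for an accuracy-style classification metric (values illustrative, `metric` assumed already constructed):

aggregate, per_task = metric.compute_detailed(
    outputs=["<ans>pos</ans>", "<ans>neg</ans>"],
    targets=["pos", "pos"],
)
# per_task  -> [1.0, 0.0]   (one _compute_raw call per sample)
# aggregate -> 0.5          (identical to metric.compute(outputs, targets))

Metrics whose batch score is not a per-sample mean override this default, as the F1-based generation metric does further down.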
Expand Down Expand Up @@ -219,7 +246,7 @@ class GenerationMetric(BaseMetric):

FORMAT_MISMATCH_LABEL = ""

def __init__(self):
def __init__(self, name=None):
"""Initialize metric"""

super().__init__()
@@ -316,6 +343,15 @@ def _compute_raw(self, outputs, targets, dataset):
f1_list = super()._compute_raw(outputs, targets)
return sum(f1_list) / len(f1_list)

def compute_detailed(
self,
outputs: list[str | int],
targets: list[str | int],
dataset: Optional[list[str]] = None,
) -> Tuple[float, List[float]]:
f1_list = super()._compute_raw(outputs, targets, dataset)
return sum(f1_list) / len(f1_list), f1_list


class LLMAsJudge(GenerationMetric):
"""LLM-as-a-judge metric for generation tasks."""
@@ -462,15 +498,29 @@ def _compute_raw(self, outputs, targets, dataset):
outputs = [extract_number_from_text(item) for item in outputs]
return float(mean([o == t for o, t in zip(outputs, targets)]))

def compute_detailed(
self,
outputs: list[str | int],
targets: list[str | int],
dataset: Optional[list[str]] = None,
) -> Tuple[float, List[int]]:
targets = [extract_number_from_text(item) for item in targets]
outputs = [extract_number_from_text(item) for item in outputs]
score_per_task = [1 if o == t else 0 for o, t in zip(outputs, targets)]
return mean(score_per_task), score_per_task

def parse_output(self, output: str) -> str:
extracted = extract_answer(output, self.ANS_TAGS, format_mismatch_label=output)
return extract_number_from_text(extracted)
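Here parse_output first strips the <ans> tags (falling back to the raw output when no tags are present) and then pulls a number out of the remainder; a rough sketch of the expected behavior, with the exact return type of extract_number_from_text left as an assumption:

# `metric` is an instance of this numeric metric (assumed constructed).
parsed = metric.parse_output("<ans>The total is 42</ans>")
# extract_answer() yields "The total is 42"; extract_number_from_text()
# is then assumed to return the numeric value, e.g. 42.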


def define_lang(outputs, targets):
langs = [detect_language(target) for target in targets]
return max(set(langs), key=langs.count)


CLASSIFICATION_METRIC_NAME_MAPPING = {
metric._get_name(): metric
for metric in ClassificationMetric.__subclasses__()
metric._get_name(): metric for metric in ClassificationMetric.__subclasses__()
}

GENERATION_METRIC_NAME_MAPPING = {
@@ -509,8 +559,9 @@ def validate_and_create_metric(
return CLASSIFICATION_METRIC_NAME_MAPPING[metric]()
error_msg = (
f"Invalid metric for {task} task: {metric}. "
f"Available metrics: {', '.join(
CLASSIFICATION_METRIC_NAME_MAPPING.keys())}."
f"Available metrics: {
', '.join(CLASSIFICATION_METRIC_NAME_MAPPING.keys())
}."
)
logger.error(error_msg)
raise ValueError(error_msg)
@@ -544,8 +595,9 @@ def validate_and_create_metric(
return GENERATION_METRIC_NAME_MAPPING[metric]()
error_msg = (
f"Invalid metric for {task} task: {metric}. "
f"Available metrics: {', '.join(
GENERATION_METRIC_NAME_MAPPING.keys())}."
f"Available metrics: {
', '.join(GENERATION_METRIC_NAME_MAPPING.keys())
}."
)
logger.error(error_msg)
raise ValueError(error_msg)