|
1 | | -"""Simple dataset evaluation example with Gentrace.""" |
| 1 | +"""Simple dataset evaluation example with Gentrace using local test cases.""" |
2 | 2 |
|
3 | 3 | import os |
4 | 4 | import asyncio |
5 | | -from typing import Any, Dict |
| 5 | +from typing import Optional |
| 6 | +from typing_extensions import TypedDict |
6 | 7 |
|
7 | 8 | from dotenv import load_dotenv |
8 | 9 | from openai import AsyncOpenAI |
9 | 10 |
|
10 | | -from gentrace import TestInput, init, experiment, interaction, eval_dataset |
| 11 | +from gentrace import TestCase, TestInput, init, experiment, eval_dataset |
11 | 12 |
|
# Load environment variables from a local .env file (e.g. OPENAI_API_KEY, read below).
load_dotenv()
13 | 14 |
|
|
21 | 22 |
|
# Module-level async OpenAI client shared by every test-case interaction.
# The API key is taken from the environment (populated by load_dotenv() above).
openai_api_key = os.getenv("OPENAI_API_KEY")
openai = AsyncOpenAI(api_key=openai_api_key)
23 | 24 |
|
class PromptInputs(TypedDict):
    """Typed schema for each test case's inputs: a single prompt string."""

    prompt: str
24 | 27 |
|
async def process_ai_request(test_case: TestCase) -> Optional[str]:
    """Send a single test case's prompt to OpenAI and return the reply text.

    Args:
        test_case: Gentrace test case whose ``inputs`` dict carries a
            ``"prompt"`` key (see ``PromptInputs``).

    Returns:
        The assistant message content, or ``None`` if the model returned
        no content.

    Raises:
        ValueError: If the test case has no ``"prompt"`` input. Without this
            guard, ``str(None)`` would silently send the literal string
            ``"None"`` as the prompt.
    """
    print(f"Running test case: {test_case.name}")

    prompt = test_case.inputs.get("prompt")
    if prompt is None:
        raise ValueError(f"Test case {test_case.name!r} is missing a 'prompt' input")

    response = await openai.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{"role": "user", "content": str(prompt)}],
    )

    return response.choices[0].message.content
50 | 41 |
|
51 | 42 |
|
@experiment(pipeline_id=PIPELINE_ID)
async def dataset_evaluation() -> None:
    """Evaluate a small local dataset of prompts with type-safe TestInput objects."""
    # (name, prompt) pairs for the named test cases.
    named_prompts = [
        ("greeting", "Hello! How are you doing today?"),
        ("factual_question", "What is the capital of France?"),
        ("math_problem", "What is 25 * 4?"),
        ("creative_writing", "Write a haiku about artificial intelligence"),
    ]
    cases = [
        TestInput[PromptInputs](name=case_name, inputs={"prompt": case_prompt})
        for case_name, case_prompt in named_prompts
    ]
    # Name deliberately omitted here — presumably Gentrace assigns a label
    # for anonymous test inputs; confirm against the SDK docs.
    cases.append(TestInput[PromptInputs](inputs={"prompt": "Tell me a joke"}))

    await eval_dataset(
        data=cases,
        schema=PromptInputs,
        interaction=process_ai_request,
    )

    print("Dataset evaluation completed! Check your Gentrace dashboard for results.")
66 | 75 |
|
67 | 76 | print("Dataset evaluation completed! Check your Gentrace dashboard for results.") |
68 | 77 |
|
69 | 78 |
|
if __name__ == "__main__":
    # NOTE(review): the function is annotated -> None, but the @experiment
    # decorator appears to return an object with a .url — confirm in the SDK.
    experiment_result = asyncio.run(dataset_evaluation())

    if experiment_result:
        print(f"Experiment URL: {experiment_result.url}")
0 commit comments