from langchain_core.messages import AIMessage
import numpy as np
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langsmith.schemas import Run, Example
from ragas.metrics import Faithfulness, FactualCorrectness
from ragas import evaluate
from datasets import Dataset
from ragas.run_config import RunConfig
import os
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from ragas import SingleTurnSample, EvaluationDataset
from ragas.embeddings import LangchainEmbeddingsWrapper
def create_critic_llm() -> LangchainLLMWrapper:
    """Create a new instance of the critic LLM for each evaluation.

    Builds an ``AzureChatOpenAI`` chat model and wraps it in ragas'
    ``LangchainLLMWrapper`` so it can be handed to ragas metrics.
    """
    # NOTE(review): the values for azure_deployment, api_version, temperature
    # and max_retries were removed by the author before sharing this snippet;
    # they must be filled in before the module can be imported.
    return LangchainLLMWrapper(AzureChatOpenAI(
        azure_deployment=,
        api_version=,
        temperature=,
        max_tokens=None,
        timeout=None,
        max_retries=,
    ))
def create_embeddings() -> LangchainEmbeddingsWrapper:
    """Create an Azure OpenAI embeddings model wrapped for use with ragas."""
    # NOTE(review): the value for ``model`` was removed by the author before
    # sharing this snippet; it must be filled in before this can run.
    return LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(model=))
# Metric names accepted by ``result_evaluate``.
_SUPPORTED_METRICS = ("factual_correctness", "faithfulness")


def result_evaluate(data_sample, metric_name):
    """Score a single sample with one ragas metric.

    Args:
        data_sample: The sample to evaluate; it is wrapped in a one-element
            ``EvaluationDataset``.
        metric_name: Either ``'factual_correctness'`` or ``'faithfulness'``.

    Returns:
        The score of the sample for the requested metric, rounded to four
        decimal places, as a plain ``float``.

    Raises:
        ValueError: If ``metric_name`` is not supported, or if the evaluation
            produced no score for the sample.
    """
    # Validate first so an unknown name fails with a clear message instead of
    # an UnboundLocalError further down, and before any client is created.
    if metric_name not in _SUPPORTED_METRICS:
        raise ValueError(
            f"Unsupported metric_name {metric_name!r}; "
            f"expected one of {_SUPPORTED_METRICS}"
        )

    # A fresh critic LLM and embeddings wrapper are built for every call.
    critic_llm = create_critic_llm()
    ada_002_embeddings = create_embeddings()

    if metric_name == "factual_correctness":
        metric = FactualCorrectness(llm=critic_llm)
    else:
        metric = Faithfulness(llm=critic_llm)

    dataset = EvaluationDataset(samples=[data_sample])
    result = evaluate(
        embeddings=ada_002_embeddings,
        dataset=dataset,
        metrics=[metric],
        show_progress=False,
        run_config=RunConfig(max_retries=1, log_tenacity=True),
    )

    # Index the result by the metric's own reported name rather than the
    # caller's string, so the lookup still works if the metric decorates its
    # name (NOTE(review): confirm against the installed ragas version).
    scores = result[metric.name]
    if len(scores) == 0:
        raise ValueError(
            f"ragas returned no score for metric {metric.name!r}"
        )

    # Convert the NumPy scalar to a built-in float so the value can be
    # serialised as a plain number by whatever consumes it.
    return float(np.round(scores[0], 4))
def _response_text(output) -> str:
    """Return the text of a model output as a whitespace-stripped string.

    Accepts an object exposing ``.content`` (such as an ``AIMessage``), a
    dict carrying a ``"content"`` entry, or any other value, which is
    converted with ``str()``.
    """
    if isinstance(output, dict):
        content = output.get("content", output)
    else:
        content = getattr(output, "content", output)
    # Convert before stripping: the content is not guaranteed to be a str.
    return str(content).strip()


def q_correctness(run: Run, example: Example) -> dict:
    """Evaluate factual correctness of a run's output. Larger numbers are better.

    Args:
        run: The run being evaluated; its ``outputs["output"]`` holds the
            model response.
        example: The dataset row; ``inputs`` must provide ``"question"`` and
            ``"contexts"``, and ``outputs`` must provide the reference
            ``"answer"``.

    Returns:
        A dict with the evaluator ``"key"`` and the numeric ``"score"``.
    """
    # The Example object contains the inputs and reference labels from a
    # single row in the dataset.
    prompt_inputs = example.inputs
    reference_outputs = example.outputs  # aka labels

    # NOTE(review): the output may arrive as a message object or as its
    # serialised dict form depending on how the run was recorded — confirm;
    # _response_text handles both.
    predicted = run.outputs["output"]

    data_sample = SingleTurnSample(
        user_input=prompt_inputs["question"],
        retrieved_contexts=prompt_inputs["contexts"],
        response=_response_text(predicted),
        reference=reference_outputs["answer"],
    )
    score = result_evaluate(data_sample, "factual_correctness")

    return {
        # The evaluator key names the metric being measured.
        "key": "q_correctness",
        "score": score,
    }
I am defining a custom evaluator with ragas. It keeps showing a "list index out of range" error in the trace, but when I test the ragas `evaluate` function separately it works fine, and the data format and run output are correct. The code snippet and a picture of the error are included; I removed some parameter values. Is it because some callback or tracing setup is needed when submitting a run on the LangSmith platform?