Skip to content

index error with ragas library #12

@musitafa0032

Description

@musitafa0032

I am defining a custom evaluator with ragas, and it keeps raising a "list index out of range" error in the trace. When I test the ragas `evaluate` function separately it works fine, and the data format and run output are correct. Below are the code snippet and a screenshot of the error (I removed some parameter values). Is it because some callback tracing is needed when submitting a run on the LangSmith platform?

from langchain_core.messages import AIMessage
import numpy as np
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langsmith.schemas import Run, Example
from ragas.metrics import Faithfulness, FactualCorrectness
from ragas import evaluate
from datasets import Dataset
from ragas.run_config import RunConfig
import os
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from ragas import SingleTurnSample, EvaluationDataset
from ragas.embeddings import LangchainEmbeddingsWrapper

def create_critic_llm():
    """Create a new instance of the critic LLM for each evaluation.

    Builds an ``AzureChatOpenAI`` chat model and returns it wrapped in a
    ``LangchainLLMWrapper`` so it can be handed to ragas metrics.
    """
    # NOTE: the values of azure_deployment, api_version, temperature and
    # max_retries were removed by the author before posting; they must be
    # filled in before this snippet can run.
    return LangchainLLMWrapper(AzureChatOpenAI(
        azure_deployment=,
        api_version=,
        temperature=,
        max_tokens=None,
        timeout=None,
        max_retries=,
    ))

def create_embeddings():
    """Create a new embeddings instance for an evaluation.

    Builds an ``AzureOpenAIEmbeddings`` model and returns it wrapped in a
    ``LangchainEmbeddingsWrapper``.
    """
    # NOTE: the value of ``model`` was removed by the author before posting;
    # it must be filled in before this snippet can run.
    return LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(model=))


def result_evaluate(data_sample, metric_name):
    """Score a single sample with one ragas metric.

    Args:
        data_sample: The sample to evaluate; it becomes the only entry of
            the ``EvaluationDataset`` passed to ``evaluate``.
        metric_name: Which metric to run, either ``'factual_correctness'``
            or ``'faithfulness'``. The same string is used to look up the
            score in the evaluation result.

    Returns:
        The first entry of the score reported under ``metric_name``,
        rounded to 4 decimal places.

    Raises:
        ValueError: If ``metric_name`` is not one of the supported names.
    """
    # Resolve the metric before building any clients so an unsupported name
    # fails fast with a clear message instead of an UnboundLocalError on
    # ``metrics`` further down.
    if metric_name == 'factual_correctness':
        metric_cls = FactualCorrectness
    elif metric_name == 'faithfulness':
        metric_cls = Faithfulness
    else:
        raise ValueError(
            f"Unsupported metric_name {metric_name!r}; expected "
            "'factual_correctness' or 'faithfulness'."
        )

    critic_llm = create_critic_llm()
    ada_002_embeddings = create_embeddings()
    dataset = EvaluationDataset(samples=[data_sample])
    metrics = [metric_cls(llm=critic_llm)]

    score = evaluate(
        embeddings=ada_002_embeddings,
        dataset=dataset,
        metrics=metrics,
        show_progress=False,
        run_config=RunConfig(max_retries=1, log_tenacity=True),
    )
    # The dataset holds exactly one sample, so take the first score.
    return np.round(score[metric_name], 4)[0]

def q_correctness(run: Run, example: Example) -> dict:
    """Evaluator that rates a run's answer with the factual-correctness metric.

    Reads the question and contexts from the example's inputs, the reference
    answer from the example's outputs, and the model message stored under
    ``"output"`` in the run's outputs, then scores them via
    ``result_evaluate``.

    Returns:
        A dict with the evaluator ``"key"`` and the computed ``"score"``.
    """
    inputs = example.inputs
    labels = example.outputs  # reference labels for this dataset row
    model_message: AIMessage = run.outputs["output"]

    # Collect the sample fields first, then build the ragas sample from them.
    sample_fields = {
        "user_input": inputs['question'],
        "retrieved_contexts": inputs['contexts'],
        "response": str(model_message.content.strip()),
        "reference": labels['answer'],
    }
    sample = SingleTurnSample(**sample_fields)

    # "key" names the metric being reported; "score" carries its value.
    return {
        "key": "q_correctness",
        "score": result_evaluate(sample, 'factual_correctness'),
    }

image

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions