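"""Evaluate RAG retrieval and generation with ragas metrics.

Example usage (script name assumed; eval.csv must be in the working directory):
    python eval.py --compute_mode cloud --workflow graph --search_type hybrid
"""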
import argparse
import csv
import logging
import traceback

from langchain_openai import ChatOpenAI
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    AnswerAccuracy,
    ContextRelevance,
    ResponseGroundedness,
)

from main import RunChain, RunGraph
from retriever import BuildRetriever

# Suppress these messages:
# INFO:openai._base_client:Retrying request to /chat/completions in ___ seconds
# https://community.openai.com/t/suppress-http-request-post-message/583334/8
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.WARNING)

def load_queries_and_references(csv_path):
    """Read queries and references from a CSV file"""
    queries = []
    references = []
    with open(csv_path, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            queries.append(row["query"].strip('"'))
            references.append(row["reference"].strip('"'))
    return queries, references
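
# Assumed layout of eval.csv consumed above (the example row is hypothetical;
# adjust to the real file):
# query,reference
# "How do I sort a data frame by a column?","Use order(), e.g. df[order(df$x), ]"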

def build_eval_dataset(queries, references, compute_mode, workflow, search_type):
    """Build dataset for evaluation"""
    dataset = []
    for query, reference in zip(queries, references):
        try:
            if workflow == "chain":
                print("\n\n--- Query ---")
                print(query)
                response = RunChain(query, compute_mode, search_type)
                print("--- Response ---")
                print(response)
                # Retrieve context documents for the query
                retriever = BuildRetriever(compute_mode, search_type)
                docs = retriever.invoke(query)
                retrieved_contexts = [doc.page_content for doc in docs]
            elif workflow == "graph":
                result = RunGraph(query, compute_mode, search_type)
                retrieved_contexts = []
                if "retrieved_emails" in result:
                    # Remove the source file names (e.g. R-help/2022-September.txt),
                    # as they confuse the evaluator
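                    # Each retrieved email is assumed to start with its source file
                    # followed by "\n\nFrom ..."; splitting on "\n\nFrom" and
                    # re-prepending it keeps only the message text.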
                    retrieved_contexts = [
                        "\n\nFrom" + email.split("\n\nFrom")[1]
                        for email in result["retrieved_emails"]
                    ]
                response = result["answer"]
            dataset.append(
                {
                    "user_input": query,
                    "retrieved_contexts": retrieved_contexts,
                    "response": response,
                    "reference": reference,
                }
            )
        except Exception:
            # Don't let one failed generation abort the whole evaluation run
            print(f"--- Query omitted from evals due to failed generation: {query} ---")
            print(traceback.format_exc())
    return dataset

def main():
    parser = argparse.ArgumentParser(
        description="Evaluate RAG retrieval and generation."
    )
    parser.add_argument(
        "--compute_mode",
        choices=["cloud", "edge"],
        required=True,
        help="Compute mode: cloud or edge.",
    )
    parser.add_argument(
        "--workflow",
        choices=["chain", "graph"],
        required=True,
        help="Workflow: chain or graph.",
    )
    parser.add_argument(
        "--search_type",
        choices=["dense", "sparse", "hybrid"],
        required=True,
        help="Search type: dense, sparse, or hybrid.",
    )
    args = parser.parse_args()
    compute_mode = args.compute_mode
    workflow = args.workflow
    search_type = args.search_type

    queries, references = load_queries_and_references("eval.csv")
    dataset = build_eval_dataset(
        queries, references, compute_mode, workflow, search_type
    )
    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Set up LLM for evaluation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    evaluator_llm = LangchainLLMWrapper(llm)
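    # ChatOpenAI reads the OPENAI_API_KEY environment variable, which must be
    # set before running this script.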
    # Evaluate
    result = evaluate(
        dataset=evaluation_dataset,
        # NVIDIA metrics
        metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
        llm=evaluator_llm,
    )
    print("Evaluation Results:")
    print(result)
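    # `result` prints as a mapping of metric name to mean score; per the ragas
    # docs, result.to_pandas() exposes per-sample scores as well.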

if __name__ == "__main__":
    main()