import os import json import requests from typing import Dict, Any, List, Optional import gradio as gr import pandas as pd # Configuration OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key") class OpenRouterLLM: def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"): self.api_key = api_key self.model = model self.base_url = "https://openrouter.ai/api/v1/chat/completions" def __call__(self, prompt: str, max_tokens: int = 2000, temperature: float = 0.1) -> str: if not self.api_key or not self.api_key.startswith('sk-or-v1-'): return "Error: Invalid OpenRouter API key" headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", "HTTP-Referer": "https://huggingface.co/spaces/Mehedi2/Gaia-Test-Agent", "X-Title": "GAIA Test Agent" } payload = { "model": self.model, "messages": [ { "role": "system", "content": """You are an advanced AI assistant designed for the GAIA benchmark. You excel at: - Complex reasoning and multi-step problem solving - Mathematical calculations and logical analysis - Research and fact-finding - File analysis and data interpretation - Providing precise, unambiguous answers Always think step-by-step and provide clear reasoning for your answers.""" }, { "role": "user", "content": prompt } ], "temperature": temperature, "max_tokens": max_tokens, "top_p": 0.9 } try: response = requests.post(self.base_url, headers=headers, json=payload, timeout=60) if response.status_code == 200: result = response.json() return result["choices"][0]["message"]["content"].strip() else: return f"API Error: {response.status_code} - {response.text[:200]}" except Exception as e: return f"Error: {str(e)}" class GAIATestAgent: """GAIA Benchmark Test Agent with enhanced capabilities""" def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"): self.llm = OpenRouterLLM(api_key, model) self.api_key = api_key def solve_gaia_question(self, question: str, file_content: Optional[str] = None, level: int = 1) -> str: """Main method to solve GAIA benchmark questions""" # Build context context_parts = [f"Question: {question}"] if file_content: context_parts.append(f"File content provided: {file_content[:2000]}") context_parts.append(f"Question Level: {level} (1=Basic, 2=Intermediate, 3=Advanced)") # Create comprehensive solving prompt main_prompt = f""" You are solving a GAIA benchmark question. These questions test advanced AI capabilities and require careful reasoning. {chr(10).join(context_parts)} Approach this systematically: 1. **Understanding**: What exactly is the question asking for? 2. **Analysis**: What information do I have and what do I need to find? 3. **Strategy**: What approach should I take to solve this? 4. **Execution**: Work through the problem step by step 5. **Verification**: Does my answer make sense? 6. **Final Answer**: Provide a clear, precise answer For GAIA questions: - Be extremely precise and factual - Show your reasoning clearly - For numerical answers, provide exact numbers - For factual answers, be concise but complete - If you need to make calculations, show your work Think carefully and solve step by step: """ return self.llm(main_prompt, max_tokens=2000, temperature=0.1) # Sample GAIA-style questions for testing SAMPLE_QUESTIONS = [ { "task_id": "math_001", "Question": "What is the sum of all prime numbers less than 20?", "Level": 1, "Final answer": "77", "explanation": "Primes < 20: 2, 3, 5, 7, 11, 13, 17, 19. Sum = 77" }, { "task_id": "logic_001", "Question": "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?", "Level": 2, "Final answer": "No", "explanation": "This is a logical fallacy - we cannot conclude that some roses fade quickly from the given premises" }, { "task_id": "calc_001", "Question": "A rectangle has a perimeter of 24 cm and an area of 32 cm². What is the length of its longer side?", "Level": 2, "Final answer": "8", "explanation": "Solving the system: 2(l+w)=24 and l×w=32 gives l=8, w=4" }, { "task_id": "reasoning_001", "Question": "In a sequence where each term is the sum of the two preceding terms, if the 5th term is 13 and the 6th term is 21, what is the 4th term?", "Level": 2, "Final answer": "8", "explanation": "Working backwards: F(4) + F(5) = F(6), so F(4) = 21 - 13 = 8" }, { "task_id": "wordplay_001", "Question": "What common English word becomes shorter when you add two letters to it?", "Level": 2, "Final answer": "Short", "explanation": "The word 'short' becomes 'shorter' when you add 'er'" } ] def evaluate_answer(agent_answer: str, correct_answer: str, question: str) -> Dict[str, Any]: """Evaluate agent answer against correct answer""" # Normalize answers agent_norm = agent_answer.strip().lower() correct_norm = correct_answer.strip().lower() # Check for exact match exact_match = agent_norm == correct_norm # Check if correct answer is contained in agent response contains_answer = correct_norm in agent_norm # For numerical answers, try to extract numbers import re agent_numbers = re.findall(r'-?\d+\.?\d*', agent_answer) correct_numbers = re.findall(r'-?\d+\.?\d*', correct_answer) numerical_match = False if agent_numbers and correct_numbers: try: agent_num = float(agent_numbers[-1]) # Take last number found correct_num = float(correct_numbers[0]) numerical_match = abs(agent_num - correct_num) < 0.001 except: pass # Determine if answer is correct is_correct = exact_match or contains_answer or numerical_match return { "is_correct": is_correct, "exact_match": exact_match, "contains_answer": contains_answer, "numerical_match": numerical_match, "confidence": "high" if exact_match else "medium" if contains_answer or numerical_match else "low" } def test_single_question(question, expected_answer, level, api_key): """Test a single question""" if not api_key: return "❌ Please provide your OpenRouter API key", "", "❌ No API key" if not question.strip(): return "❌ Please provide a question", "", "❌ No question" agent = GAIATestAgent(api_key) response = agent.solve_gaia_question(question, level=level) if expected_answer.strip(): eval_result = evaluate_answer(response, expected_answer, question) status = f"✅ Correct ({eval_result['confidence']} confidence)" if eval_result['is_correct'] else "❌ Incorrect" else: status = "⚠️ No expected answer provided" return response, expected_answer, status def test_sample_questions(api_key): """Test on predefined sample questions""" if not api_key: return "❌ Please provide your OpenRouter API key", pd.DataFrame() agent = GAIATestAgent(api_key) results = [] total_questions = len(SAMPLE_QUESTIONS) correct_count = 0 for i, q in enumerate(SAMPLE_QUESTIONS): response = agent.solve_gaia_question(q["Question"], level=q["Level"]) eval_result = evaluate_answer(response, q["Final answer"], q["Question"]) if eval_result["is_correct"]: correct_count += 1 # Truncate long responses for display display_response = response[:150] + "..." if len(response) > 150 else response display_question = q["Question"][:80] + "..." if len(q["Question"]) > 80 else q["Question"] results.append({ "ID": q["task_id"], "Question": display_question, "Level": q["Level"], "Expected": q["Final answer"], "Agent Answer": display_response, "Status": "✅" if eval_result["is_correct"] else "❌", "Confidence": eval_result["confidence"] }) accuracy = (correct_count / total_questions) * 100 # Create summary summary = f""" ## 📊 Test Results Summary ### Overall Performance - **Total Questions Tested**: {total_questions} - **Correct Answers**: {correct_count} - **Accuracy**: {accuracy:.1f}% ### By Level - **Level 1**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 1)} questions - **Level 2**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 2)} questions - **Level 3**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 3)} questions ### Performance Analysis - **High Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'high')} - **Medium Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'medium')} - **Incorrect Answers**: {len(results) - correct_count} {('🎉 Excellent performance! Ready for GAIA submission.' if accuracy >= 80 else '👍 Good performance! Consider fine-tuning for better results.' if accuracy >= 60 else '⚠️ Performance needs improvement. Review failed cases.')} """ results_df = pd.DataFrame(results) return summary, results_df def generate_submission_template(api_key): """Generate GAIA submission template and instructions""" submission_instructions = """ # 🏆 GAIA Benchmark Submission Guide ## Step 1: Access the GAIA Dataset 1. Go to: https://huggingface.co/datasets/gaia-benchmark/GAIA 2. Accept the dataset conditions (required to prevent data leakage) 3. Load the dataset using: `datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")` ## Step 2: Generate Predictions Your submission file should be a JSON Lines (.jsonl) file: ```json {"task_id": "validation_001", "model_answer": "Your precise answer"} {"task_id": "validation_002", "model_answer": "42"} {"task_id": "validation_003", "model_answer": "The answer is Paris"} ``` ## Step 3: Submit to Leaderboard 1. Go to: https://huggingface.co/spaces/gaia-benchmark/leaderboard 2. Follow the submission instructions 3. Upload your .jsonl predictions file 4. Wait for evaluation results ## ⚠️ Important Notes: - Only submit predictions for TEST set (not validation) - Answers must be extracted cleanly (no reasoning text) - Review GAIA paper for detailed guidelines: https://arxiv.org/abs/2311.12983 """ # Create sample submission content sample_submission = [ {"task_id": "sample_001", "model_answer": "77"}, {"task_id": "sample_002", "model_answer": "No"}, {"task_id": "sample_003", "model_answer": "8"} ] submission_content = "\\n".join([json.dumps(item) for item in sample_submission]) return submission_instructions, submission_content # Create the Gradio interface def create_gaia_interface(): with gr.Blocks(title="🧪 GAIA Test Agent by Mehedi", theme=gr.themes.Soft()) as demo: gr.HTML("""
Advanced AI Testing for General AI Intelligence Assessment
Test your AI agent on complex reasoning tasks • Powered by DeepSeek V3.1 Terminus