import os
import json
import re
import requests
from typing import Dict, Any, Optional

import gradio as gr
import pandas as pd

# Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")


class OpenRouterLLM:
    """Minimal chat-completion client for the OpenRouter API."""

    def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"

    def __call__(self, prompt: str, max_tokens: int = 2000, temperature: float = 0.1) -> str:
        if not self.api_key or not self.api_key.startswith("sk-or-v1-"):
            return "Error: Invalid OpenRouter API key"

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/Mehedi2/Gaia-Test-Agent",
            "X-Title": "GAIA Test Agent",
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": """You are an advanced AI assistant designed for the GAIA benchmark. You excel at:
- Complex reasoning and multi-step problem solving
- Mathematical calculations and logical analysis
- Research and fact-finding
- File analysis and data interpretation
- Providing precise, unambiguous answers

Always think step-by-step and provide clear reasoning for your answers.""",
                },
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": 0.9,
        }

        try:
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            if response.status_code == 200:
                result = response.json()
                return result["choices"][0]["message"]["content"].strip()
            return f"API Error: {response.status_code} - {response.text[:200]}"
        except Exception as e:
            return f"Error: {str(e)}"


class GAIATestAgent:
    """GAIA benchmark test agent with enhanced capabilities."""

    def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
        self.llm = OpenRouterLLM(api_key, model)
        self.api_key = api_key

    def solve_gaia_question(self, question: str, file_content: Optional[str] = None, level: int = 1) -> str:
        """Main method to solve GAIA benchmark questions."""
        # Build context
        context_parts = [f"Question: {question}"]

        if file_content:
            context_parts.append(f"File content provided: {file_content[:2000]}")

        context_parts.append(f"Question Level: {level} (1=Basic, 2=Intermediate, 3=Advanced)")

        # Create comprehensive solving prompt
        main_prompt = f"""
You are solving a GAIA benchmark question. These questions test advanced AI capabilities and require careful reasoning.

{chr(10).join(context_parts)}

Approach this systematically:
1. **Understanding**: What exactly is the question asking for?
2. **Analysis**: What information do I have and what do I need to find?
3. **Strategy**: What approach should I take to solve this?
4. **Execution**: Work through the problem step by step
5. **Verification**: Does my answer make sense?
6. **Final Answer**: Provide a clear, precise answer

For GAIA questions:
- Be extremely precise and factual
- Show your reasoning clearly
- For numerical answers, provide exact numbers
- For factual answers, be concise but complete
- If you need to make calculations, show your work

Think carefully and solve step by step:
"""
        return self.llm(main_prompt, max_tokens=2000, temperature=0.1)
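
# Illustrative usage (a sketch, not part of the original app): the agent can be
# exercised directly from a Python shell before wiring it into the Gradio UI.
# Assumes a valid key is available in OPENROUTER_API_KEY.
#
#     agent = GAIATestAgent(OPENROUTER_API_KEY or "")
#     print(agent.solve_gaia_question("What is the sum of all prime numbers less than 20?"))
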
# Sample GAIA-style questions for testing
SAMPLE_QUESTIONS = [
    {
        "task_id": "math_001",
        "Question": "What is the sum of all prime numbers less than 20?",
        "Level": 1,
        "Final answer": "77",
        "explanation": "Primes < 20: 2, 3, 5, 7, 11, 13, 17, 19. Sum = 77",
    },
    {
        "task_id": "logic_001",
        "Question": "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
        "Level": 2,
        "Final answer": "No",
        "explanation": "This is a logical fallacy - the flowers that fade quickly may not include any roses, so the conclusion does not follow from the premises",
    },
    {
        "task_id": "calc_001",
        "Question": "A rectangle has a perimeter of 24 cm and an area of 32 cm². What is the length of its longer side?",
        "Level": 2,
        "Final answer": "8",
        "explanation": "Solving the system: 2(l+w)=24 and l×w=32 gives l=8, w=4",
    },
    {
        "task_id": "reasoning_001",
        "Question": "In a sequence where each term is the sum of the two preceding terms, if the 5th term is 13 and the 6th term is 21, what is the 4th term?",
        "Level": 2,
        "Final answer": "8",
        "explanation": "Working backwards: F(4) + F(5) = F(6), so F(4) = 21 - 13 = 8",
    },
    {
        "task_id": "wordplay_001",
        "Question": "What common English word becomes shorter when you add two letters to it?",
        "Level": 2,
        "Final answer": "Short",
        "explanation": "The word 'short' becomes 'shorter' when you add 'er'",
    },
]


def evaluate_answer(agent_answer: str, correct_answer: str, question: str) -> Dict[str, Any]:
    """Evaluate an agent answer against the correct answer."""
    # Normalize answers
    agent_norm = agent_answer.strip().lower()
    correct_norm = correct_answer.strip().lower()

    # Check for exact match
    exact_match = agent_norm == correct_norm

    # Check if the correct answer is contained in the agent response
    contains_answer = correct_norm in agent_norm

    # For numerical answers, try to extract and compare numbers
    agent_numbers = re.findall(r"-?\d+\.?\d*", agent_answer)
    correct_numbers = re.findall(r"-?\d+\.?\d*", correct_answer)

    numerical_match = False
    if agent_numbers and correct_numbers:
        try:
            agent_num = float(agent_numbers[-1])  # Take the last number found
            correct_num = float(correct_numbers[0])
            numerical_match = abs(agent_num - correct_num) < 0.001
        except ValueError:
            pass

    # Determine if the answer is correct
    is_correct = exact_match or contains_answer or numerical_match

    return {
        "is_correct": is_correct,
        "exact_match": exact_match,
        "contains_answer": contains_answer,
        "numerical_match": numerical_match,
        "confidence": "high" if exact_match else "medium" if contains_answer or numerical_match else "low",
    }
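
# Illustrative check (a sketch, not part of the original app): because the
# evaluator also does substring and numeric matching, a verbose agent response
# still counts as correct against a bare expected answer.
#
#     evaluate_answer("The sum is 77.", "77", "sum of primes < 20")
#     # -> is_correct=True via contains/numerical match, confidence "medium"
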
def test_single_question(question, expected_answer, level, api_key):
    """Test a single question."""
    if not api_key:
        return "❌ Please provide your OpenRouter API key", "", "❌ No API key"

    if not question.strip():
        return "❌ Please provide a question", "", "❌ No question"

    agent = GAIATestAgent(api_key)
    response = agent.solve_gaia_question(question, level=level)

    if expected_answer.strip():
        eval_result = evaluate_answer(response, expected_answer, question)
        status = (
            f"✅ Correct ({eval_result['confidence']} confidence)"
            if eval_result["is_correct"]
            else "❌ Incorrect"
        )
    else:
        status = "⚠️ No expected answer provided"

    return response, expected_answer, status


def test_sample_questions(api_key):
    """Test on the predefined sample questions."""
    if not api_key:
        return "❌ Please provide your OpenRouter API key", pd.DataFrame()

    agent = GAIATestAgent(api_key)
    results = []
    total_questions = len(SAMPLE_QUESTIONS)
    correct_count = 0

    for q in SAMPLE_QUESTIONS:
        response = agent.solve_gaia_question(q["Question"], level=q["Level"])
        eval_result = evaluate_answer(response, q["Final answer"], q["Question"])

        if eval_result["is_correct"]:
            correct_count += 1

        # Truncate long responses for display
        display_response = response[:150] + "..." if len(response) > 150 else response
        display_question = q["Question"][:80] + "..." if len(q["Question"]) > 80 else q["Question"]

        results.append({
            "ID": q["task_id"],
            "Question": display_question,
            "Level": q["Level"],
            "Expected": q["Final answer"],
            "Agent Answer": display_response,
            "Status": "✅" if eval_result["is_correct"] else "❌",
            "Confidence": eval_result["confidence"],
        })

    accuracy = (correct_count / total_questions) * 100

    # Create summary
    summary = f"""
## 📊 Test Results Summary

### Overall Performance
- **Total Questions Tested**: {total_questions}
- **Correct Answers**: {correct_count}
- **Accuracy**: {accuracy:.1f}%

### By Level
- **Level 1**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 1)} questions
- **Level 2**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 2)} questions
- **Level 3**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 3)} questions

### Performance Analysis
- **High Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'high')}
- **Medium Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'medium')}
- **Incorrect Answers**: {len(results) - correct_count}

{('🎉 Excellent performance! Ready for GAIA submission.' if accuracy >= 80 else '👍 Good performance! Consider fine-tuning for better results.' if accuracy >= 60 else '⚠️ Performance needs improvement. Review failed cases.')}
"""

    results_df = pd.DataFrame(results)
    return summary, results_df


def generate_submission_template(api_key):
    """Generate the GAIA submission template and instructions."""
    submission_instructions = """
# 🏆 GAIA Benchmark Submission Guide

## Step 1: Access the GAIA Dataset
1. Go to: https://huggingface.co/datasets/gaia-benchmark/GAIA
2. Accept the dataset conditions (required to prevent data leakage)
3. Load the dataset using: `datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")`

## Step 2: Generate Predictions
Your submission file should be a JSON Lines (.jsonl) file:

```json
{"task_id": "validation_001", "model_answer": "Your precise answer"}
{"task_id": "validation_002", "model_answer": "42"}
{"task_id": "validation_003", "model_answer": "The answer is Paris"}
```

## Step 3: Submit to Leaderboard
1. Go to: https://huggingface.co/spaces/gaia-benchmark/leaderboard
2. Follow the submission instructions
3. Upload your .jsonl predictions file
4. Wait for evaluation results

## ⚠️ Important Notes:
- Only submit predictions for the TEST set (not validation)
- Answers must be extracted cleanly (no reasoning text)
- Review the GAIA paper for detailed guidelines: https://arxiv.org/abs/2311.12983
"""

    # Create sample submission content, one JSON object per line
    sample_submission = [
        {"task_id": "sample_001", "model_answer": "77"},
        {"task_id": "sample_002", "model_answer": "No"},
        {"task_id": "sample_003", "model_answer": "8"},
    ]

    submission_content = "\n".join(json.dumps(item) for item in sample_submission)

    return submission_instructions, submission_content
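
# Illustrative follow-up (a sketch, not part of the original app): the returned
# content is already one JSON object per line, so it can be written straight
# to a .jsonl file for upload to the leaderboard.
#
#     _, content = generate_submission_template(api_key="")
#     with open("sample_submission.jsonl", "w") as f:
#         f.write(content + "\n")
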

# Create the Gradio interface
def create_gaia_interface():
    with gr.Blocks(title="🧪 GAIA Test Agent by Mehedi", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
        <div style="text-align: center;">
            <h1>🧪 GAIA Benchmark Test Agent</h1>
            <h3>Advanced AI Testing for General AI Assistants</h3>
            <p>Test your AI agent on complex reasoning tasks • Powered by DeepSeek V3.1 Terminus</p>
        </div>
        """)

        # API key input
        with gr.Row():
            api_key_input = gr.Textbox(
                label="🔑 OpenRouter API Key",
                placeholder="sk-or-v1-your-openrouter-api-key-here",
                type="password",
                value=OPENROUTER_API_KEY or "",
                info="Required for AI agent functionality. Get yours at openrouter.ai",
            )

        # Main tabs
        with gr.Tabs():
            # Single question testing
            with gr.Tab("🎯 Single Question Test"):
                gr.Markdown("### Test individual GAIA-style questions")

                with gr.Row():
                    with gr.Column():
                        question_input = gr.Textbox(
                            label="Question",
                            placeholder="Enter your GAIA-style question here...",
                            lines=4,
                            info="Enter complex reasoning questions similar to the GAIA benchmark",
                        )

                        with gr.Row():
                            expected_answer_input = gr.Textbox(
                                label="Expected Answer (Optional)",
                                placeholder="Expected answer for comparison...",
                                info="Provide the correct answer to evaluate performance",
                            )
                            level_input = gr.Slider(
                                minimum=1,
                                maximum=3,
                                value=1,
                                step=1,
                                label="Difficulty Level",
                                info="1=Basic, 2=Intermediate, 3=Advanced",
                            )

                        test_single_btn = gr.Button("🧪 Test Question", variant="primary")

                    with gr.Column():
                        agent_response_output = gr.Textbox(
                            label="🤖 Agent Response",
                            lines=12,
                            show_copy_button=True,
                            info="Complete reasoning and answer from the AI agent",
                        )

                        with gr.Row():
                            expected_display = gr.Textbox(label="Expected", interactive=False)
                            result_status_output = gr.Textbox(label="Evaluation", interactive=False)

            # Sample questions battery test
            with gr.Tab("📝 Sample Questions Test"):
                gr.Markdown("### Test on curated GAIA-style questions")

                with gr.Column():
                    gr.Markdown("""
                    **Sample Question Types:**
                    - 🔢 Mathematical reasoning and calculations
                    - 🧠 Logical reasoning and inference
                    - 🔍 Multi-step problem solving
                    - 📊 Data analysis and interpretation
                    - 🎯 Precision and accuracy testing
                    """)

                    test_samples_btn = gr.Button("🧪 Run Full Test Battery", variant="primary")

                    test_summary_output = gr.Markdown()
                    test_results_output = gr.Dataframe(
                        label="📊 Detailed Test Results",
                        wrap=True,
                    )

            # Submission guidelines
            with gr.Tab("📤 GAIA Submission"):
                gr.Markdown("### Official GAIA benchmark submission guide")

                generate_guide_btn = gr.Button("📋 Generate Submission Guide", variant="primary")
                submission_guide_output = gr.Markdown()

                with gr.Accordion("📄 Sample Submission File", open=False):
                    submission_sample_output = gr.Code(language="json", label="sample_submission.jsonl")

        # Information footer
        gr.Markdown("""
        ---
        ### 📖 About GAIA Benchmark

        **GAIA (General AI Assistants benchmark)** evaluates AI systems on complex, real-world questions requiring:
        - Multi-step reasoning and planning
        - Tool usage and external knowledge integration
        - Mathematical calculations and logical inference
        - File analysis and multi-modal understanding

        ### 🔗 Official Resources
        - **🏆 Leaderboard**: [GAIA Benchmark Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
        - **📚 Dataset**: [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA)
        - **📄 Research Paper**: [GAIA: A Benchmark for General AI Assistants](https://arxiv.org/abs/2311.12983)

        ---
        **⚡ Powered by**: DeepSeek V3.1 Terminus via OpenRouter | **🛠️ Built by**: Mehedi | **🎯 Purpose**: GAIA Benchmark Testing
        """)

        # Event handlers
        test_single_btn.click(
            fn=test_single_question,
            inputs=[question_input, expected_answer_input, level_input, api_key_input],
            outputs=[agent_response_output, expected_display, result_status_output],
        )

        test_samples_btn.click(
            fn=test_sample_questions,
            inputs=[api_key_input],
            outputs=[test_summary_output, test_results_output],
        )

        generate_guide_btn.click(
            fn=generate_submission_template,
            inputs=[api_key_input],
            outputs=[submission_guide_output, submission_sample_output],
        )

    return demo


# Launch the application
if __name__ == "__main__":
    demo = create_gaia_interface()

    # Check if running on Hugging Face Spaces
    if os.getenv("SPACE_ID"):
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_api=False,
            share=False,
        )
    else:
        # Local development
        demo.launch(share=True, show_api=False, debug=True)
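
# Usage note (an assumption, not in the original file): on Hugging Face Spaces
# this module is expected to be app.py and is launched automatically; locally,
# it can be started with
#     OPENROUTER_API_KEY=sk-or-v1-... python app.py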