import os
import json
import requests
from typing import Dict, Any, List, Optional
import gradio as gr
import pandas as pd

# Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")

class OpenRouterLLM:
    def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
    
    def __call__(self, prompt: str, max_tokens: int = 2000, temperature: float = 0.1) -> str:
        if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
            return "Error: Invalid OpenRouter API key"
        
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/Mehedi2/Gaia-Test-Agent",
            "X-Title": "GAIA Test Agent"
        }
        
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": """You are an advanced AI assistant designed for the GAIA benchmark. You excel at:
- Complex reasoning and multi-step problem solving
- Mathematical calculations and logical analysis
- Research and fact-finding
- File analysis and data interpretation
- Providing precise, unambiguous answers

Always think step-by-step and provide clear reasoning for your answers."""
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": 0.9
        }
        
        try:
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            
            if response.status_code == 200:
                result = response.json()
                return result["choices"][0]["message"]["content"].strip()
            else:
                return f"API Error: {response.status_code} - {response.text[:200]}"
                
        except Exception as e:
            return f"Error: {str(e)}"

class GAIATestAgent:
    """GAIA Benchmark Test Agent with enhanced capabilities"""
    
    def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
        self.llm = OpenRouterLLM(api_key, model)
        self.api_key = api_key
    
    def solve_gaia_question(self, question: str, file_content: Optional[str] = None, level: int = 1) -> str:
        """Main method to solve GAIA benchmark questions"""
        
        # Build context
        context_parts = [f"Question: {question}"]
        
        if file_content:
            context_parts.append(f"File content provided: {file_content[:2000]}")
        
        context_parts.append(f"Question Level: {level} (1=Basic, 2=Intermediate, 3=Advanced)")
        
        # Create comprehensive solving prompt
        main_prompt = f"""
You are solving a GAIA benchmark question. These questions test advanced AI capabilities and require careful reasoning.

{chr(10).join(context_parts)}

Approach this systematically:

1. **Understanding**: What exactly is the question asking for?
2. **Analysis**: What information do I have and what do I need to find?
3. **Strategy**: What approach should I take to solve this?
4. **Execution**: Work through the problem step by step
5. **Verification**: Does my answer make sense?
6. **Final Answer**: Provide a clear, precise answer

For GAIA questions:
- Be extremely precise and factual
- Show your reasoning clearly
- For numerical answers, provide exact numbers
- For factual answers, be concise but complete
- If you need to make calculations, show your work

Think carefully and solve step by step:
"""
        
        return self.llm(main_prompt, max_tokens=2000, temperature=0.1)

# Sample GAIA-style questions for testing
SAMPLE_QUESTIONS = [
    {
        "task_id": "math_001",
        "Question": "What is the sum of all prime numbers less than 20?",
        "Level": 1,
        "Final answer": "77",
        "explanation": "Primes < 20: 2, 3, 5, 7, 11, 13, 17, 19. Sum = 77"
    },
    {
        "task_id": "logic_001", 
        "Question": "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
        "Level": 2,
        "Final answer": "No",
        "explanation": "This is a logical fallacy - we cannot conclude that some roses fade quickly from the given premises"
    },
    {
        "task_id": "calc_001",
        "Question": "A rectangle has a perimeter of 24 cm and an area of 32 cm². What is the length of its longer side?",
        "Level": 2,
        "Final answer": "8",
        "explanation": "Solving the system: 2(l+w)=24 and l×w=32 gives l=8, w=4"
    },
    {
        "task_id": "reasoning_001",
        "Question": "In a sequence where each term is the sum of the two preceding terms, if the 5th term is 13 and the 6th term is 21, what is the 4th term?",
        "Level": 2,
        "Final answer": "8",
        "explanation": "Working backwards: F(4) + F(5) = F(6), so F(4) = 21 - 13 = 8"
    },
    {
        "task_id": "wordplay_001",
        "Question": "What common English word becomes shorter when you add two letters to it?",
        "Level": 2,
        "Final answer": "Short",
        "explanation": "The word 'short' becomes 'shorter' when you add 'er'"
    }
]

def evaluate_answer(agent_answer: str, correct_answer: str, question: str) -> Dict[str, Any]:
    """Evaluate agent answer against correct answer"""
    
    # Normalize answers
    agent_norm = agent_answer.strip().lower()
    correct_norm = correct_answer.strip().lower()
    
    # Check for exact match
    exact_match = agent_norm == correct_norm
    
    # Check if correct answer is contained in agent response
    contains_answer = correct_norm in agent_norm
    
    # For numerical answers, try to extract numbers
    import re
    agent_numbers = re.findall(r'-?\d+\.?\d*', agent_answer)
    correct_numbers = re.findall(r'-?\d+\.?\d*', correct_answer)
    
    numerical_match = False
    if agent_numbers and correct_numbers:
        try:
            agent_num = float(agent_numbers[-1])  # Take last number found
            correct_num = float(correct_numbers[0])
            numerical_match = abs(agent_num - correct_num) < 0.001
        except:
            pass
    
    # Determine if answer is correct
    is_correct = exact_match or contains_answer or numerical_match
    
    return {
        "is_correct": is_correct,
        "exact_match": exact_match,
        "contains_answer": contains_answer,
        "numerical_match": numerical_match,
        "confidence": "high" if exact_match else "medium" if contains_answer or numerical_match else "low"
    }

def test_single_question(question, expected_answer, level, api_key):
    """Test a single question"""
    if not api_key:
        return "❌ Please provide your OpenRouter API key", "", "❌ No API key"
    
    if not question.strip():
        return "❌ Please provide a question", "", "❌ No question"
    
    agent = GAIATestAgent(api_key)
    response = agent.solve_gaia_question(question, level=level)
    
    if expected_answer.strip():
        eval_result = evaluate_answer(response, expected_answer, question)
        status = f"✅ Correct ({eval_result['confidence']} confidence)" if eval_result['is_correct'] else "❌ Incorrect"
    else:
        status = "⚠️ No expected answer provided"
    
    return response, expected_answer, status

def test_sample_questions(api_key):
    """Test on predefined sample questions"""
    if not api_key:
        return "❌ Please provide your OpenRouter API key", pd.DataFrame()
    
    agent = GAIATestAgent(api_key)
    results = []
    
    total_questions = len(SAMPLE_QUESTIONS)
    correct_count = 0
    
    for i, q in enumerate(SAMPLE_QUESTIONS):
        response = agent.solve_gaia_question(q["Question"], level=q["Level"])
        eval_result = evaluate_answer(response, q["Final answer"], q["Question"])
        
        if eval_result["is_correct"]:
            correct_count += 1
        
        # Truncate long responses for display
        display_response = response[:150] + "..." if len(response) > 150 else response
        display_question = q["Question"][:80] + "..." if len(q["Question"]) > 80 else q["Question"]
        
        results.append({
            "ID": q["task_id"],
            "Question": display_question,
            "Level": q["Level"],
            "Expected": q["Final answer"],
            "Agent Answer": display_response,
            "Status": "✅" if eval_result["is_correct"] else "❌",
            "Confidence": eval_result["confidence"]
        })
    
    accuracy = (correct_count / total_questions) * 100
    
    # Create summary
    summary = f"""
## 📊 Test Results Summary

### Overall Performance
- **Total Questions Tested**: {total_questions}
- **Correct Answers**: {correct_count}
- **Accuracy**: {accuracy:.1f}%

### By Level
- **Level 1**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 1)} questions
- **Level 2**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 2)} questions  
- **Level 3**: {sum(1 for q in SAMPLE_QUESTIONS if q['Level'] == 3)} questions

### Performance Analysis
- **High Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'high')}
- **Medium Confidence Correct**: {sum(1 for r in results if r['Status'] == '✅' and r['Confidence'] == 'medium')}
- **Incorrect Answers**: {len(results) - correct_count}

{('🎉 Excellent performance! Ready for GAIA submission.' if accuracy >= 80 else 
  '👍 Good performance! Consider fine-tuning for better results.' if accuracy >= 60 else
  '⚠️ Performance needs improvement. Review failed cases.')}
"""
    
    results_df = pd.DataFrame(results)
    
    return summary, results_df

def generate_submission_template(api_key):
    """Generate GAIA submission template and instructions"""
    
    submission_instructions = """
# 🏆 GAIA Benchmark Submission Guide

## Step 1: Access the GAIA Dataset
1. Go to: https://huggingface.co/datasets/gaia-benchmark/GAIA
2. Accept the dataset conditions (required to prevent data leakage)
3. Load the dataset using: `datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")`

## Step 2: Generate Predictions
Your submission file should be a JSON Lines (.jsonl) file:

```json
{"task_id": "validation_001", "model_answer": "Your precise answer"}
{"task_id": "validation_002", "model_answer": "42"}
{"task_id": "validation_003", "model_answer": "The answer is Paris"}
```

## Step 3: Submit to Leaderboard
1. Go to: https://huggingface.co/spaces/gaia-benchmark/leaderboard
2. Follow the submission instructions
3. Upload your .jsonl predictions file
4. Wait for evaluation results

## ⚠️ Important Notes:
- Only submit predictions for TEST set (not validation)
- Answers must be extracted cleanly (no reasoning text)
- Review GAIA paper for detailed guidelines: https://arxiv.org/abs/2311.12983
"""
    
    # Create sample submission content
    sample_submission = [
        {"task_id": "sample_001", "model_answer": "77"},
        {"task_id": "sample_002", "model_answer": "No"},
        {"task_id": "sample_003", "model_answer": "8"}
    ]
    
    submission_content = "\\n".join([json.dumps(item) for item in sample_submission])
    
    return submission_instructions, submission_content

# Create the Gradio interface
def create_gaia_interface():
    
    with gr.Blocks(title="🧪 GAIA Test Agent by Mehedi", theme=gr.themes.Soft()) as demo:
        
        gr.HTML("""
        <div style="text-align: center; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 15px; margin-bottom: 25px; box-shadow: 0 4px 15px rgba(0,0,0,0.1);">
            <h1>🧪 GAIA Benchmark Test Agent</h1>
            <p><strong>Advanced AI Testing for General AI Intelligence Assessment</strong></p>
            <p>Test your AI agent on complex reasoning tasks • Powered by DeepSeek V3.1 Terminus</p>
        </div>
        """)
        
        # API Key input
        with gr.Row():
            api_key_input = gr.Textbox(
                label="🔑 OpenRouter API Key", 
                placeholder="sk-or-v1-your-openrouter-api-key-here",
                type="password",
                value=OPENROUTER_API_KEY or "",
                info="Required for AI agent functionality. Get yours at openrouter.ai"
            )
        
        # Main tabs
        with gr.Tabs():
            
            # Single Question Testing
            with gr.Tab("🎯 Single Question Test"):
                gr.Markdown("### Test individual GAIA-style questions")
                
                with gr.Row():
                    with gr.Column():
                        question_input = gr.Textbox(
                            label="Question",
                            placeholder="Enter your GAIA-style question here...",
                            lines=4,
                            info="Enter complex reasoning questions similar to GAIA benchmark"
                        )
                        
                        with gr.Row():
                            expected_answer_input = gr.Textbox(
                                label="Expected Answer (Optional)",
                                placeholder="Expected answer for comparison...",
                                info="Provide the correct answer to evaluate performance"
                            )
                            level_input = gr.Slider(
                                minimum=1, maximum=3, value=1, step=1,
                                label="Difficulty Level",
                                info="1=Basic, 2=Intermediate, 3=Advanced"
                            )
                        
                        test_single_btn = gr.Button("🧪 Test Question", variant="primary")
                    
                    with gr.Column():
                        agent_response_output = gr.Textbox(
                            label="🤖 Agent Response",
                            lines=12,
                            show_copy_button=True,
                            info="Complete reasoning and answer from the AI agent"
                        )
                        
                        with gr.Row():
                            expected_display = gr.Textbox(label="Expected", interactive=False)
                            result_status_output = gr.Textbox(label="Evaluation", interactive=False)
            
            # Sample Questions Battery Test  
            with gr.Tab("📝 Sample Questions Test"):
                gr.Markdown("### Test on curated GAIA-style questions")
                
                with gr.Column():
                    gr.Markdown("""
                    **Sample Question Types:**
                    - 🔢 Mathematical reasoning and calculations
                    - 🧠 Logical reasoning and inference  
                    - 🔍 Multi-step problem solving
                    - 📊 Data analysis and interpretation
                    - 🎯 Precision and accuracy testing
                    """)
                    
                    test_samples_btn = gr.Button("🧪 Run Full Test Battery", variant="primary")
                    
                    test_summary_output = gr.Markdown()
                    
                    test_results_output = gr.Dataframe(
                        label="📊 Detailed Test Results",
                        wrap=True
                    )
            
            # Submission Guidelines
            with gr.Tab("📤 GAIA Submission"):
                gr.Markdown("### Official GAIA benchmark submission guide")
                
                generate_guide_btn = gr.Button("📋 Generate Submission Guide", variant="primary")
                
                submission_guide_output = gr.Markdown()
                
                with gr.Accordion("📄 Sample Submission File", open=False):
                    submission_sample_output = gr.Code(language="json", label="sample_submission.jsonl")
        
        # Information footer
        gr.Markdown("""
        ---
        ### 📖 About GAIA Benchmark
        
        **GAIA (General AI Intelligence Assessment)** evaluates AI systems on complex, real-world questions requiring:
        - Multi-step reasoning and planning
        - Tool usage and external knowledge integration  
        - Mathematical calculations and logical inference
        - File analysis and multi-modal understanding
        
        ### 🔗 Official Resources
        - **🏆 Leaderboard**: [GAIA Benchmark Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
        - **📚 Dataset**: [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA)
        - **📄 Research Paper**: [GAIA: A Benchmark for General AI Assistants](https://arxiv.org/abs/2311.12983)
        
        ---
        **⚡ Powered by**: DeepSeek V3.1 Terminus via OpenRouter | **🛠️ Built by**: Mehedi | **🎯 Purpose**: GAIA Benchmark Testing
        """)
        
        # Event handlers
        test_single_btn.click(
            fn=test_single_question,
            inputs=[question_input, expected_answer_input, level_input, api_key_input],
            outputs=[agent_response_output, expected_display, result_status_output]
        )
        
        test_samples_btn.click(
            fn=test_sample_questions,
            inputs=[api_key_input],
            outputs=[test_summary_output, test_results_output]
        )
        
        generate_guide_btn.click(
            fn=generate_submission_template,
            inputs=[api_key_input],
            outputs=[submission_guide_output, submission_sample_output]
        )
    
    return demo

# Launch the application
if __name__ == "__main__":
    demo = create_gaia_interface()
    
    # Check if running on Hugging Face Spaces
    if os.getenv("SPACE_ID"):
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_api=False,
            share=False
        )
    else:
        # Local development
        demo.launch(share=True, show_api=False, debug=True)


import gradio as gr
from gaia_api import app as gaia_app
from app import run_agent    # or correct import

demo = gr.Interface(fn=run_agent, inputs="text", outputs="text")

demo.launch(server_name="0.0.0.0", server_port=7860)
demo.app.mount("/gaia", gaia_app)