"""
Tokenizer Manager
=================
Handles tokenizer loading, caching, and availability checking
"""

import os
from typing import Dict, List, Any
from transformers import AutoTokenizer, logging
from config import TOKENIZER_REGISTRY, TokenizerInfo

# Limit the transformers library's own logging to error-level messages.
logging.set_verbosity_error()

# HuggingFace authentication
# Log in to the Hugging Face Hub only when HF_TOKEN holds a non-blank value.
# The value is stripped BEFORE the truthiness test so that a whitespace-only
# variable is treated as "no token" instead of calling login(token="").
HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN is not None:
    HF_TOKEN = HF_TOKEN.strip()
if HF_TOKEN:
    # Imported here, so huggingface_hub is only imported when a token is set.
    from huggingface_hub import login
    login(token=HF_TOKEN)


class TokenizerManager:
    """Manages tokenizer loading and caching.

    On construction, every model id in ``TOKENIZER_REGISTRY`` is probed by
    loading its tokenizer once; ids that load successfully are recorded as
    "available". Tokenizers requested through :meth:`get_tokenizer` are
    loaded on first use and kept in an in-memory cache keyed by model id.

    NOTE: tokenizers are loaded with ``trust_remote_code=True``, which allows
    code shipped in the model repository to run locally. Only use model ids
    you trust.
    """

    # Substrings that assign a display label to a category in
    # get_tokenizers_by_type(). Matching is case-sensitive and is done
    # against the full "name (organization)" label.
    _ARABIC_BERT_MARKERS = ('AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Safaya')
    _ARABIC_TOKENIZER_MARKERS = ('Aranizer',)
    _ARABIC_LLM_MARKERS = ('Jais', 'AceGPT', 'SILMA', 'Fanar', 'StableLM', 'Yehia', 'Atlas')

    def __init__(self):
        """Create empty caches and probe the registry for loadable tokenizers."""
        # model id -> loaded tokenizer object (filled lazily by get_tokenizer)
        self._cache: Dict[str, Any] = {}
        # model id -> registry info, only for ids whose tokenizer loaded
        self._available: Dict[str, TokenizerInfo] = {}
        self._initialize_available_tokenizers()

    @staticmethod
    def _format_choice(info: TokenizerInfo) -> str:
        """Return the display label for *info*: ``"<name> (<organization>)"``.

        Single source of truth for the label, so the list produced by
        get_tokenizer_choices() and the reverse lookup in
        get_model_id_from_choice() cannot drift apart.
        """
        return f"{info.name} ({info.organization})"

    def _initialize_available_tokenizers(self):
        """Check which tokenizers are available and can be loaded.

        Tries to load each registry entry once. Successes are stored in
        ``self._available``; failures are reported on stdout (message
        truncated to 50 characters) and otherwise ignored, so one broken
        entry does not prevent the others from being registered.
        """
        print("🔄 Initializing tokenizer registry...")

        for model_id, info in TOKENIZER_REGISTRY.items():
            try:
                # Probe only: the loaded object is discarded here and is
                # loaded again by get_tokenizer() on first real use.
                AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            except Exception as e:
                # Best-effort: any load failure just marks the entry unavailable.
                print(f"  ✗ {info.name}: {str(e)[:50]}")
            else:
                self._available[model_id] = info
                print(f"  ✓ {info.name}")

        print(f"\n✅ Total available tokenizers: {len(self._available)}")

    def get_tokenizer(self, model_id: str):
        """Get tokenizer from cache or load it.

        Args:
            model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
                It is not required to be one of the available tokenizers.

        Returns:
            The tokenizer object for *model_id*; the same object is returned
            on subsequent calls.

        Raises:
            Whatever ``AutoTokenizer.from_pretrained`` raises when the
            tokenizer cannot be loaded (nothing is cached in that case).
        """
        if model_id not in self._cache:
            self._cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True,
            )
        return self._cache[model_id]

    def get_available_tokenizers(self) -> Dict[str, TokenizerInfo]:
        """Get all available tokenizers.

        Returns:
            The manager's own mapping of model id -> registry info (not a
            copy).
        """
        return self._available

    def get_tokenizer_choices(self) -> List[str]:
        """Get list of tokenizer display names for dropdown.

        Returns:
            One ``"<name> (<organization>)"`` label per available tokenizer,
            in the iteration order of the available-tokenizer mapping.
        """
        return [self._format_choice(info) for info in self._available.values()]

    def get_model_id_from_choice(self, choice: str) -> str:
        """Convert display choice back to model ID.

        Args:
            choice: A label as produced by get_tokenizer_choices().

        Returns:
            The model id whose label equals *choice*. If no label matches,
            the first available model id is returned, or ``""`` when no
            tokenizer is available.
        """
        for model_id, info in self._available.items():
            if self._format_choice(info) == choice:
                return model_id
        # Unknown label: fall back to the first available id (or "" if none).
        return next(iter(self._available), "")

    def get_tokenizers_by_type(self) -> Dict[str, List[str]]:
        """Group available tokenizers by type.

        A label belongs to a category when it contains any of that
        category's marker substrings; a label may therefore appear in more
        than one category. Labels matching no category are listed under
        "Multilingual Models".

        Returns:
            Mapping of category title -> list of display labels, each list
            in the same order as get_tokenizer_choices().
        """
        choices = self.get_tokenizer_choices()

        def matching(markers):
            # Labels containing at least one of the given marker substrings.
            return [label for label in choices if any(m in label for m in markers)]

        arabic_bert = matching(self._ARABIC_BERT_MARKERS)
        arabic_specific = matching(self._ARABIC_TOKENIZER_MARKERS)
        arabic_llms = matching(self._ARABIC_LLM_MARKERS)

        # Everything not claimed by a category above is "multilingual".
        categorized = {*arabic_bert, *arabic_specific, *arabic_llms}
        multilingual = [label for label in choices if label not in categorized]

        return {
            "Arabic BERT Models": arabic_bert,
            "Arabic Tokenizers": arabic_specific,
            "Arabic LLMs": arabic_llms,
            "Multilingual Models": multilingual,
        }


# Global tokenizer manager instance.
# Constructed at import time: TokenizerManager.__init__ probes every entry in
# TOKENIZER_REGISTRY, so importing this module attempts to load each
# registered tokenizer.
tokenizer_manager = TokenizerManager()