""" Tokenizer Manager ================= Handles tokenizer loading, caching, and availability checking """ import os from typing import Dict, List, Any from transformers import AutoTokenizer, logging from config import TOKENIZER_REGISTRY, TokenizerInfo logging.set_verbosity_error() # HuggingFace authentication HF_TOKEN = os.getenv('HF_TOKEN') if HF_TOKEN: HF_TOKEN = HF_TOKEN.strip() from huggingface_hub import login login(token=HF_TOKEN) class TokenizerManager: """Manages tokenizer loading and caching""" def __init__(self): self._cache: Dict[str, Any] = {} self._available: Dict[str, TokenizerInfo] = {} self._initialize_available_tokenizers() def _initialize_available_tokenizers(self): """Check which tokenizers are available and can be loaded""" print("šŸ”„ Initializing tokenizer registry...") for model_id, info in TOKENIZER_REGISTRY.items(): try: _ = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) self._available[model_id] = info print(f" āœ“ {info.name}") except Exception as e: print(f" āœ— {info.name}: {str(e)[:50]}") print(f"\nāœ… Total available tokenizers: {len(self._available)}") def get_tokenizer(self, model_id: str): """Get tokenizer from cache or load it""" if model_id not in self._cache: self._cache[model_id] = AutoTokenizer.from_pretrained( model_id, trust_remote_code=True ) return self._cache[model_id] def get_available_tokenizers(self) -> Dict[str, TokenizerInfo]: """Get all available tokenizers""" return self._available def get_tokenizer_choices(self) -> List[str]: """Get list of tokenizer display names for dropdown""" return [f"{info.name} ({info.organization})" for info in self._available.values()] def get_model_id_from_choice(self, choice: str) -> str: """Convert display choice back to model ID""" for model_id, info in self._available.items(): if f"{info.name} ({info.organization})" == choice: return model_id return list(self._available.keys())[0] if self._available else "" def get_tokenizers_by_type(self) -> Dict[str, List[str]]: """Group available tokenizers by type""" choices = self.get_tokenizer_choices() arabic_bert = [t for t in choices if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Safaya'])] arabic_specific = [t for t in choices if any(x in t for x in ['Aranizer'])] arabic_llms = [t for t in choices if any(x in t for x in ['Jais', 'AceGPT', 'SILMA', 'Fanar', 'StableLM', 'Yehia', 'Atlas'])] multilingual = [t for t in choices if t not in arabic_bert and t not in arabic_specific and t not in arabic_llms] return { "Arabic BERT Models": arabic_bert, "Arabic Tokenizers": arabic_specific, "Arabic LLMs": arabic_llms, "Multilingual Models": multilingual } # Global tokenizer manager instance tokenizer_manager = TokenizerManager()