# Arabic_Tokenizer / ui_components.py
# Author: HeshamHaroon
# Last commit: "Refactor: modularize codebase into separate modules" (f32d4c7)
"""
UI Components
=============
HTML generation functions for the Gradio interface
"""
from typing import List
from config import TokenizerInfo, TokenizationMetrics
from utils import is_arabic_char
def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
    """Render tokens as a row of colored HTML chips labelled with their IDs.

    Each token becomes a ``<span class="token">`` whose colors cycle through a
    fixed eight-entry palette. A chip is laid out right-to-left when any
    character of its token satisfies ``is_arabic_char``, left-to-right
    otherwise.

    Args:
        tokens: Token strings to display.
        token_ids: Vocabulary IDs paired positionally with ``tokens``. Pairing
            uses ``zip``, so extra items in the longer list are ignored.

    Returns:
        An HTML fragment: a ``<div class="token-container">`` wrapping one
        span per token/ID pair.
    """
    # Local import keeps the block self-contained (no file-level change needed).
    import html

    # (background, foreground) pairs, reused cyclically.
    palette = [
        ('#1a1a2e', '#eaeaea'),
        ('#16213e', '#f0f0f0'),
        ('#0f3460', '#ffffff'),
        ('#533483', '#f5f5f5'),
        ('#e94560', '#ffffff'),
        ('#0f4c75', '#f0f0f0'),
        ('#3282b8', '#ffffff'),
        ('#bbe1fa', '#1a1a2e'),
    ]

    chips = []
    for index, (token, tid) in enumerate(zip(tokens, token_ids)):
        bg, fg = palette[index % len(palette)]
        # Escape '&' as well as '<' and '>': otherwise a token such as "&lt;"
        # would be rendered by the browser as "<" instead of literally.
        # quote=False because the token only appears in element content.
        display_token = html.escape(token, quote=False)
        direction = 'rtl' if any(is_arabic_char(ch) for ch in token) else 'ltr'
        chips.append(f'''
<span class="token" style="
background: {bg};
color: {fg};
direction: {direction};
" title="ID: {tid}">
{display_token}
<span class="token-id">{tid}</span>
</span>
''')

    chips_html = ''.join(chips)
    return f'''
<div class="token-container">
{chips_html}
</div>
'''
def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
    """Build the HTML grid of metric cards for one tokenization result.

    Four cards carry a quality CSS class (``excellent`` / ``good`` / ``poor``):

    * fertility: ``< 1.5`` excellent, ``< 2.5`` good, otherwise poor
    * compression ratio: ``> 4`` excellent, ``> 2.5`` good, otherwise poor
    * single-token retention rate: ``> 0.5`` excellent, ``> 0.3`` good,
      otherwise poor
    * OOV percentage: ``== 0`` excellent, ``> 5`` poor, otherwise good

    Args:
        metrics: Object exposing ``total_tokens``, ``fertility``,
            ``compression_ratio``, ``single_token_retention_rate``,
            ``char_per_token``, ``oov_percentage``, ``arabic_fertility`` and
            ``tokenization_time_ms``.
        info: Not used by this function; kept so existing callers that pass
            it continue to work.

    Returns:
        An HTML fragment: a ``<div class="metrics-grid">`` holding eight cards.
    """
    if metrics.fertility < 1.5:
        fertility_quality = "excellent"
    elif metrics.fertility < 2.5:
        fertility_quality = "good"
    else:
        fertility_quality = "poor"

    if metrics.single_token_retention_rate > 0.5:
        strr_quality = "excellent"
    elif metrics.single_token_retention_rate > 0.3:
        strr_quality = "good"
    else:
        strr_quality = "poor"

    if metrics.compression_ratio > 4:
        compression_quality = "excellent"
    elif metrics.compression_ratio > 2.5:
        compression_quality = "good"
    else:
        compression_quality = "poor"

    if metrics.oov_percentage == 0:
        oov_quality = "excellent"
    elif metrics.oov_percentage > 5:
        oov_quality = "poor"
    else:
        oov_quality = "good"

    # Icon literals are real emoji; the previous text contained mis-decoded
    # byte sequences that rendered as garbage in the browser.
    return f'''
<div class="metrics-grid">
<div class="metric-card primary">
<div class="metric-icon">📊</div>
<div class="metric-value">{metrics.total_tokens}</div>
<div class="metric-label">Total Tokens</div>
</div>
<div class="metric-card {fertility_quality}">
<div class="metric-icon">🎯</div>
<div class="metric-value">{metrics.fertility:.3f}</div>
<div class="metric-label">Fertility (tokens/word)</div>
<div class="metric-hint">Lower is better (1.0 ideal)</div>
</div>
<div class="metric-card {compression_quality}">
<div class="metric-icon">📦</div>
<div class="metric-value">{metrics.compression_ratio:.2f}</div>
<div class="metric-label">Compression (bytes/token)</div>
<div class="metric-hint">Higher is better</div>
</div>
<div class="metric-card {strr_quality}">
<div class="metric-icon">✨</div>
<div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
<div class="metric-label">STRR (Single Token Retention)</div>
<div class="metric-hint">Higher is better</div>
</div>
<div class="metric-card">
<div class="metric-icon">🔤</div>
<div class="metric-value">{metrics.char_per_token:.2f}</div>
<div class="metric-label">Characters/Token</div>
</div>
<div class="metric-card {oov_quality}">
<div class="metric-icon">❓</div>
<div class="metric-value">{metrics.oov_percentage:.1f}%</div>
<div class="metric-label">OOV Rate</div>
<div class="metric-hint">Lower is better (0% ideal)</div>
</div>
<div class="metric-card">
<div class="metric-icon">🌍</div>
<div class="metric-value">{metrics.arabic_fertility:.3f}</div>
<div class="metric-label">Arabic Fertility</div>
</div>
<div class="metric-card">
<div class="metric-icon">⚡</div>
<div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
<div class="metric-label">Processing Time</div>
</div>
</div>
'''
def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
    """Build the HTML card describing a tokenizer.

    The card shows the tokenizer's name, organization, description, type,
    algorithm, vocabulary size (with thousands separators) and Arabic support
    level, followed by one badge per supported dialect and per special
    feature.

    Args:
        info: Object exposing ``name``, ``organization``, ``description``,
            ``type.value``, ``algorithm.value``, ``vocab_size``,
            ``arabic_support``, ``dialect_support`` and ``special_features``.

    Returns:
        An HTML fragment rooted at ``<div class="info-card">``.
    """
    dialect_badges = ''.join(
        f'<span class="badge dialect">{dialect}</span>'
        for dialect in info.dialect_support
    )
    feature_badges = ''.join(
        f'<span class="badge feature">{feature}</span>'
        for feature in info.special_features
    )

    # CSS suffix for the support level; anything unrecognised is "limited".
    if info.arabic_support == "Native":
        support_class = "native"
    elif info.arabic_support == "Supported":
        support_class = "supported"
    else:
        support_class = "limited"

    # The three pieces below concatenate into a single template.
    header_html = f'''
<div class="info-card">
<div class="info-header">
<h3>{info.name}</h3>
<span class="org-badge">{info.organization}</span>
</div>
<p class="description">{info.description}</p>
'''
    grid_html = f'''<div class="info-grid">
<div class="info-item">
<span class="info-label">Type:</span>
<span class="info-value">{info.type.value}</span>
</div>
<div class="info-item">
<span class="info-label">Algorithm:</span>
<span class="info-value">{info.algorithm.value}</span>
</div>
<div class="info-item">
<span class="info-label">Vocab Size:</span>
<span class="info-value">{info.vocab_size:,}</span>
</div>
<div class="info-item">
<span class="info-label">Arabic Support:</span>
<span class="info-value support-{support_class}">{info.arabic_support}</span>
</div>
</div>
'''
    badges_html = f'''<div class="badge-container">
<div class="badge-group">
<span class="badge-label">Dialects:</span>
{dialect_badges}
</div>
<div class="badge-group">
<span class="badge-label">Features:</span>
{feature_badges}
</div>
</div>
</div>
'''
    return header_html + grid_html + badges_html
def generate_decoded_section(metrics: TokenizationMetrics) -> str:
    """Build the HTML section showing the decoded text.

    Args:
        metrics: Object exposing ``decoded_text`` (the text to show) and
            ``diacritic_preservation`` (truthy when diacritics were kept).

    Returns:
        An HTML fragment rooted at ``<div class="decoded-section">`` with the
        decoded text and a "Diacritics preserved" yes/no line.
    """
    # Local import keeps the block self-contained (no file-level change needed).
    import html

    # The decoded text is inserted as element content, so '&', '<' and '>'
    # must be escaped or markup-like input would be interpreted by the
    # browser. str() keeps non-string values rendering as before.
    decoded_text = html.escape(str(metrics.decoded_text), quote=False)

    # Real emoji; the previous "yes" literal was a mis-decoded byte sequence.
    preserved_label = '✅ Yes' if metrics.diacritic_preservation else '❌ No'

    return f'''
<div class="decoded-section">
<h4>Decoded Output</h4>
<div class="decoded-text" dir="auto">{decoded_text}</div>
<div class="decoded-meta">
Diacritics preserved: {preserved_label}
</div>
</div>
'''
def generate_about_html(tokenizers_by_type: dict, total_count: int) -> str:
    """Build the HTML for the About page.

    Args:
        tokenizers_by_type: Mapping of category label to a sequence of
            tokenizer names. Categories with an empty sequence are skipped.
            At most the first 12 names of a category are listed; any
            remainder is summarised as "...and N more".
        total_count: Number displayed in the "Available Tokenizers" stat card.

    Returns:
        An HTML fragment rooted at ``<div class="about-container">``.
    """
    max_listed = 12  # names shown per category before summarising the rest

    sections = []
    for category, tokenizers in tokenizers_by_type.items():
        if not tokenizers:
            continue
        items = ''.join(f'<li>{name}</li>' for name in tokenizers[:max_listed])
        hidden = len(tokenizers) - max_listed
        if hidden > 0:
            items += f'<li><em>...and {hidden} more</em></li>'
        sections.append(f'''
<div class="about-category">
<h4>{category}</h4>
<ul>{items}</ul>
</div>
''')
    sections_html = ''.join(sections)

    # Emoji literals are real characters; the previous text contained
    # mis-decoded byte sequences that rendered as garbage in the browser.
    return f'''
<div class="about-container">
<div class="about-header">
<h2>🏟️ Arabic Tokenizer Arena Pro</h2>
<p class="about-subtitle">A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions</p>
</div>
<div class="about-stats">
<div class="stat-card">
<div class="stat-value">{total_count}</div>
<div class="stat-label">Available Tokenizers</div>
</div>
<div class="stat-card">
<div class="stat-value">8</div>
<div class="stat-label">Evaluation Datasets</div>
</div>
<div class="stat-card">
<div class="stat-value">8+</div>
<div class="stat-label">Metrics</div>
</div>
</div>
<div class="about-tokenizers">
<h3>📚 Available Tokenizers</h3>
<div class="tokenizer-grid">
{sections_html}
</div>
</div>
<div class="about-features">
<h3>✨ Features</h3>
<div class="feature-grid">
<div class="feature-item">
<span class="feature-icon">📊</span>
<span>Comprehensive efficiency metrics (fertility, compression, STRR)</span>
</div>
<div class="feature-item">
<span class="feature-icon">🌍</span>
<span>Arabic-specific analysis (dialect support, diacritic preservation)</span>
</div>
<div class="feature-item">
<span class="feature-icon">⚖️</span>
<span>Side-by-side tokenizer comparison</span>
</div>
<div class="feature-item">
<span class="feature-icon">🎨</span>
<span>Beautiful token visualization</span>
</div>
<div class="feature-item">
<span class="feature-icon">🏆</span>
<span>Leaderboard with real HuggingFace datasets</span>
</div>
<div class="feature-item">
<span class="feature-icon">📖</span>
<span>Support for MSA, dialectal, and Classical Arabic</span>
</div>
</div>
</div>
<div class="about-usecases">
<h3>🎯 Use Cases</h3>
<div class="usecase-grid">
<div class="usecase-card">
<h4>🔬 Research</h4>
<p>Compare tokenizers for Arabic NLP experiments</p>
</div>
<div class="usecase-card">
<h4>🚀 Production</h4>
<p>Select optimal tokenizer for deployment</p>
</div>
<div class="usecase-card">
<h4>📚 Education</h4>
<p>Understand how different algorithms handle Arabic</p>
</div>
<div class="usecase-card">
<h4>💰 Optimization</h4>
<p>Identify cost-efficient tokenizers for API usage</p>
</div>
</div>
</div>
<div class="about-footer">
<p>Built with ❤️ for the Arabic NLP community</p>
</div>
</div>
'''