Spaces:
Running
Running
| """ | |
| UI Components | |
| ============= | |
| HTML generation functions for the Gradio interface | |
| """ | |
| from typing import List | |
| from config import TokenizerInfo, TokenizationMetrics | |
| from utils import is_arabic_char | |
def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
    """Render tokens as a row of colored HTML chips.

    Each token becomes a ``<span class="token">`` containing the token text
    and a nested ``<span class="token-id">`` with its ID; the ID is also put
    in the chip's ``title`` attribute. Chip colors cycle through a fixed
    eight-entry palette, and a chip is laid out right-to-left when any of its
    characters is reported as Arabic by ``is_arabic_char``.

    Args:
        tokens: Token strings to display.
        token_ids: IDs paired positionally with ``tokens``. If the two
            sequences differ in length, the surplus items are ignored
            (``zip`` semantics).

    Returns:
        An HTML fragment: a ``<div class="token-container">`` wrapping one
        chip per token (an empty container when there are no tokens).
    """
    from html import escape

    # (background, foreground) pairs, reused cyclically.
    palette = [
        ('#1a1a2e', '#eaeaea'),
        ('#16213e', '#f0f0f0'),
        ('#0f3460', '#ffffff'),
        ('#533483', '#f5f5f5'),
        ('#e94560', '#ffffff'),
        ('#0f4c75', '#f0f0f0'),
        ('#3282b8', '#ffffff'),
        ('#bbe1fa', '#1a1a2e'),
    ]

    chips = []
    for index, (token, tid) in enumerate(zip(tokens, token_ids)):
        bg, fg = palette[index % len(palette)]

        # Token text is arbitrary and may look like markup (for example a
        # special token written as <s>), so '&', '<' and '>' must be turned
        # into entities or the browser would parse them as tags.
        display_token = escape(token, quote=False)

        # Direction is decided on the raw token, not the escaped text.
        is_arabic = any(is_arabic_char(c) for c in token)
        direction = 'rtl' if is_arabic else 'ltr'

        chips.append(f'''
        <span class="token" style="
            background: {bg};
            color: {fg};
            direction: {direction};
        " title="ID: {tid}">
            {display_token}
            <span class="token-id">{tid}</span>
        </span>
        ''')

    chips_html = ''.join(chips)
    return f'''
    <div class="token-container">
        {chips_html}
    </div>
    '''
def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
    """Build the HTML grid of metric cards for one tokenization result.

    The fertility, compression, STRR and OOV cards receive a quality CSS
    class ("excellent", "good" or "poor") chosen by fixed thresholds; the
    remaining cards carry no quality class.

    Args:
        metrics: Object supplying the displayed values (``total_tokens``,
            ``fertility``, ``compression_ratio``,
            ``single_token_retention_rate``, ``char_per_token``,
            ``oov_percentage``, ``arabic_fertility``,
            ``tokenization_time_ms``).
        info: Part of the signature but not read by this function.

    Returns:
        A ``<div class="metrics-grid">`` HTML fragment.
    """
    # Fertility: lower is better.
    if metrics.fertility < 1.5:
        fertility_grade = "excellent"
    elif metrics.fertility < 2.5:
        fertility_grade = "good"
    else:
        fertility_grade = "poor"

    # Single-token retention rate: higher is better.
    if metrics.single_token_retention_rate > 0.5:
        strr_grade = "excellent"
    elif metrics.single_token_retention_rate > 0.3:
        strr_grade = "good"
    else:
        strr_grade = "poor"

    # Compression ratio: higher is better.
    if metrics.compression_ratio > 4:
        compression_grade = "excellent"
    elif metrics.compression_ratio > 2.5:
        compression_grade = "good"
    else:
        compression_grade = "poor"

    # OOV rate: exactly zero is excellent, above 5 is poor, otherwise good.
    if metrics.oov_percentage == 0:
        oov_grade = 'excellent'
    elif metrics.oov_percentage > 5:
        oov_grade = 'poor'
    else:
        oov_grade = 'good'

    return f'''
    <div class="metrics-grid">
        <div class="metric-card primary">
            <div class="metric-icon">π</div>
            <div class="metric-value">{metrics.total_tokens}</div>
            <div class="metric-label">Total Tokens</div>
        </div>
        <div class="metric-card {fertility_grade}">
            <div class="metric-icon">π―</div>
            <div class="metric-value">{metrics.fertility:.3f}</div>
            <div class="metric-label">Fertility (tokens/word)</div>
            <div class="metric-hint">Lower is better (1.0 ideal)</div>
        </div>
        <div class="metric-card {compression_grade}">
            <div class="metric-icon">π¦</div>
            <div class="metric-value">{metrics.compression_ratio:.2f}</div>
            <div class="metric-label">Compression (bytes/token)</div>
            <div class="metric-hint">Higher is better</div>
        </div>
        <div class="metric-card {strr_grade}">
            <div class="metric-icon">β¨</div>
            <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
            <div class="metric-label">STRR (Single Token Retention)</div>
            <div class="metric-hint">Higher is better</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">π€</div>
            <div class="metric-value">{metrics.char_per_token:.2f}</div>
            <div class="metric-label">Characters/Token</div>
        </div>
        <div class="metric-card {oov_grade}">
            <div class="metric-icon">β</div>
            <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
            <div class="metric-label">OOV Rate</div>
            <div class="metric-hint">Lower is better (0% ideal)</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">π</div>
            <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
            <div class="metric-label">Arabic Fertility</div>
        </div>
        <div class="metric-card">
            <div class="metric-icon">β‘</div>
            <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
            <div class="metric-label">Processing Time</div>
        </div>
    </div>
    '''
def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
    """Build the HTML card describing a single tokenizer.

    The card shows the tokenizer's name, organization and description, a
    grid with its type, algorithm, vocabulary size and Arabic-support level,
    and two badge rows listing supported dialects and special features.
    Values are interpolated as-is; no HTML escaping is applied.

    Args:
        info: Object supplying ``name``, ``organization``, ``description``,
            ``type.value``, ``algorithm.value``, ``vocab_size``,
            ``arabic_support``, ``dialect_support`` and ``special_features``.

    Returns:
        A ``<div class="info-card">`` HTML fragment.
    """
    dialect_badges = ''.join(
        f'<span class="badge dialect">{dialect}</span>'
        for dialect in info.dialect_support
    )
    feature_badges = ''.join(
        f'<span class="badge feature">{feature}</span>'
        for feature in info.special_features
    )

    # Suffix of the "support-*" CSS class; any value other than the two
    # recognized labels is styled as "limited".
    if info.arabic_support == "Native":
        support_class = "native"
    elif info.arabic_support == "Supported":
        support_class = "supported"
    else:
        support_class = "limited"

    return f'''
    <div class="info-card">
        <div class="info-header">
            <h3>{info.name}</h3>
            <span class="org-badge">{info.organization}</span>
        </div>
        <p class="description">{info.description}</p>
        <div class="info-grid">
            <div class="info-item">
                <span class="info-label">Type:</span>
                <span class="info-value">{info.type.value}</span>
            </div>
            <div class="info-item">
                <span class="info-label">Algorithm:</span>
                <span class="info-value">{info.algorithm.value}</span>
            </div>
            <div class="info-item">
                <span class="info-label">Vocab Size:</span>
                <span class="info-value">{info.vocab_size:,}</span>
            </div>
            <div class="info-item">
                <span class="info-label">Arabic Support:</span>
                <span class="info-value support-{support_class}">{info.arabic_support}</span>
            </div>
        </div>
        <div class="badge-container">
            <div class="badge-group">
                <span class="badge-label">Dialects:</span>
                {dialect_badges}
            </div>
            <div class="badge-group">
                <span class="badge-label">Features:</span>
                {feature_badges}
            </div>
        </div>
    </div>
    '''
def generate_decoded_section(metrics: TokenizationMetrics) -> str:
    """Build the HTML section showing the decoded text.

    Displays ``metrics.decoded_text`` in a ``dir="auto"`` block, followed by
    a line stating whether diacritics were preserved.

    Args:
        metrics: Object supplying ``decoded_text`` and the truthy/falsy flag
            ``diacritic_preservation``.

    Returns:
        A ``<div class="decoded-section">`` HTML fragment with the decoded
        text HTML-escaped.
    """
    from html import escape

    # Decoded text is free-form and may contain markup-like sequences (for
    # example special tokens written as <s>); escape '&', '<' and '>' so it
    # is displayed literally instead of being parsed as HTML.
    decoded_text = escape(str(metrics.decoded_text), quote=False)
    diacritics_note = 'β Yes' if metrics.diacritic_preservation else 'β No'

    return f'''
    <div class="decoded-section">
        <h4>Decoded Output</h4>
        <div class="decoded-text" dir="auto">{decoded_text}</div>
        <div class="decoded-meta">
            Diacritics preserved: {diacritics_note}
        </div>
    </div>
    '''
def generate_about_html(tokenizers_by_type: dict, total_count: int) -> str:
    """Build the HTML for the About page.

    The page consists of a header, a statistics row, one list of tokenizers
    per non-empty category, and static feature / use-case / footer sections.
    Each category list shows at most 12 entries; when a category has more,
    a final "...and N more" item reports how many were left out.

    Args:
        tokenizers_by_type: Mapping of category title to a sequence of
            tokenizer names. Categories with an empty sequence are omitted.
        total_count: Number shown in the "Available Tokenizers" stat card.

    Returns:
        A ``<div class="about-container">`` HTML fragment.
    """
    category_blocks = []
    for category, tokenizers in tokenizers_by_type.items():
        if not tokenizers:
            continue

        list_items = ''.join(f'<li>{name}</li>' for name in tokenizers[:12])
        hidden_count = len(tokenizers) - 12
        if hidden_count > 0:
            list_items += f'<li><em>...and {hidden_count} more</em></li>'

        category_blocks.append(f'''
        <div class="about-category">
            <h4>{category}</h4>
            <ul>{list_items}</ul>
        </div>
        ''')

    tokenizer_sections = ''.join(category_blocks)

    return f'''
    <div class="about-container">
        <div class="about-header">
            <h2>ποΈ Arabic Tokenizer Arena Pro</h2>
            <p class="about-subtitle">A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions</p>
        </div>
        <div class="about-stats">
            <div class="stat-card">
                <div class="stat-value">{total_count}</div>
                <div class="stat-label">Available Tokenizers</div>
            </div>
            <div class="stat-card">
                <div class="stat-value">8</div>
                <div class="stat-label">Evaluation Datasets</div>
            </div>
            <div class="stat-card">
                <div class="stat-value">8+</div>
                <div class="stat-label">Metrics</div>
            </div>
        </div>
        <div class="about-tokenizers">
            <h3>π Available Tokenizers</h3>
            <div class="tokenizer-grid">
                {tokenizer_sections}
            </div>
        </div>
        <div class="about-features">
            <h3>β¨ Features</h3>
            <div class="feature-grid">
                <div class="feature-item">
                    <span class="feature-icon">π</span>
                    <span>Comprehensive efficiency metrics (fertility, compression, STRR)</span>
                </div>
                <div class="feature-item">
                    <span class="feature-icon">π</span>
                    <span>Arabic-specific analysis (dialect support, diacritic preservation)</span>
                </div>
                <div class="feature-item">
                    <span class="feature-icon">βοΈ</span>
                    <span>Side-by-side tokenizer comparison</span>
                </div>
                <div class="feature-item">
                    <span class="feature-icon">π¨</span>
                    <span>Beautiful token visualization</span>
                </div>
                <div class="feature-item">
                    <span class="feature-icon">π</span>
                    <span>Leaderboard with real HuggingFace datasets</span>
                </div>
                <div class="feature-item">
                    <span class="feature-icon">π</span>
                    <span>Support for MSA, dialectal, and Classical Arabic</span>
                </div>
            </div>
        </div>
        <div class="about-usecases">
            <h3>π― Use Cases</h3>
            <div class="usecase-grid">
                <div class="usecase-card">
                    <h4>π¬ Research</h4>
                    <p>Compare tokenizers for Arabic NLP experiments</p>
                </div>
                <div class="usecase-card">
                    <h4>π Production</h4>
                    <p>Select optimal tokenizer for deployment</p>
                </div>
                <div class="usecase-card">
                    <h4>π Education</h4>
                    <p>Understand how different algorithms handle Arabic</p>
                </div>
                <div class="usecase-card">
                    <h4>π° Optimization</h4>
                    <p>Identify cost-efficient tokenizers for API usage</p>
                </div>
            </div>
        </div>
        <div class="about-footer">
            <p>Built with β€οΈ for the Arabic NLP community</p>
        </div>
    </div>
    '''