"""
Arabic Tokenizer Arena Pro - Main Application
==============================================
Advanced research & production platform for Arabic tokenization analysis
Run with: python app.py
"""
import gradio as gr
# Import modules
from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
from styles import CUSTOM_CSS
from tokenizer_manager import tokenizer_manager
from analysis import analyze_single_tokenizer, compare_tokenizers
from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
from ui_components import generate_about_html


def create_interface():
    """Create the Gradio interface."""
    available_tokenizers = tokenizer_manager.get_tokenizer_choices()
    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()

    with gr.Blocks(
        css=CUSTOM_CSS,
        title="Arabic Tokenizer Arena Pro",
        theme=gr.themes.Base(
            primary_hue="green",
            secondary_hue="blue",
            neutral_hue="slate",
            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
        )
    ) as demo:
        # Header
        gr.HTML("""
        <div class="header-section">
            <h1>🏟️ Arabic Tokenizer Arena Pro</h1>
            <p>Advanced research & production platform for Arabic tokenization analysis</p>
        </div>
        """)

        with gr.Tabs():
            # ===== TAB 1: Single Tokenizer Analysis =====
            with gr.TabItem("🔬 Single Analysis", id="single"):
                with gr.Row():
                    with gr.Column(scale=1):
                        tokenizer_dropdown = gr.Dropdown(
                            choices=available_tokenizers,
                            value=available_tokenizers[0] if available_tokenizers else None,
                            label="Select Tokenizer",
                            info="Choose a tokenizer to analyze"
                        )
                        sample_dropdown = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                            info="Select a sample or enter custom text"
                        )
                        input_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                            label="Input Text",
                            rtl=True
                        )
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        info_output = gr.HTML(label="Tokenizer Information")
                        metrics_output = gr.HTML(label="Evaluation Metrics")
                        tokens_output = gr.HTML(label="Token Visualization")
                        decoded_output = gr.HTML(label="Decoded Output")

                sample_dropdown.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[sample_dropdown],
                    outputs=[input_text]
                )
                analyze_btn.click(
                    analyze_single_tokenizer,
                    inputs=[tokenizer_dropdown, input_text],
                    outputs=[info_output, metrics_output, tokens_output, decoded_output]
                )

            # ===== TAB 2: Comparison Mode =====
            with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
                with gr.Row():
                    with gr.Column(scale=1):
                        compare_tokenizers_select = gr.CheckboxGroup(
                            choices=available_tokenizers,
                            value=available_tokenizers[:5],  # slicing already copes with fewer than 5 choices
                            label="Select Tokenizers to Compare",
                            info="Choose 2 or more tokenizers"
                        )
                        compare_sample = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts"
                        )
                        compare_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...",
                            label="Input Text",
                            rtl=True
                        )
                        compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        comparison_output = gr.HTML(label="Comparison Results")

                compare_sample.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[compare_sample],
                    outputs=[compare_text]
                )
                compare_btn.click(
                    compare_tokenizers,
                    inputs=[compare_tokenizers_select, compare_text],
                    outputs=[comparison_output]
                )

            # ===== TAB 3: Leaderboard =====
            with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                gr.Markdown("""
                ## 🏆 Arabic Tokenizer Leaderboard

                All tokenizers are evaluated on **all 8 Arabic datasets** from the HuggingFace Hub (~36,000 samples in total).
                """)
                with gr.Row():
                    status_output = gr.Markdown("⏳ Loading cached results...")
                    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")

                gr.Markdown("### 📊 Leaderboard Results")
                leaderboard_output = gr.HTML()
                gr.Markdown("### 📈 Per-Dataset Breakdown")
                per_dataset_output = gr.HTML()

                re_evaluate_btn.click(
                    fn=run_leaderboard_evaluation,
                    inputs=[],
                    outputs=[leaderboard_output, per_dataset_output, status_output]
                )

                gr.Markdown("""
                ---
                ### 📖 Evaluation Datasets

                | Dataset | Category | Samples |
                |---------|----------|---------|
                | ArabicMMLU | MSA Benchmark | 5,000 |
                | ASTD | Egyptian Dialect | 5,000 |
                | ATHAR | Classical Arabic | 5,000 |
                | ARCD | QA Dataset | 1,395 |
                | Ashaar | Poetry | 5,000 |
                | Hadith | Religious | 5,000 |
                | Arabic Sentiment | Social Media | 5,000 |
                | SANAD | News | 5,000 |
                """)

            # ===== TAB 4: Metrics Reference =====
            with gr.TabItem("📖 Metrics Guide", id="guide"):
                gr.Markdown("""
                ## Tokenization Evaluation Metrics Guide

                ### Efficiency Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
                | **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
                | **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |
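
                For intuition, the efficiency metrics are simple ratios over a text. A minimal sketch, assuming a HuggingFace-style tokenizer and a plain whitespace word split (the app's real metric code lives in `analysis.py` and may differ):

                ```python
                def efficiency_metrics(tokenizer, text: str) -> dict:
                    # Illustrative only: real pipelines may normalize the text first.
                    words = text.split()
                    token_ids = tokenizer.encode(text, add_special_tokens=False)
                    n_tokens = max(len(token_ids), 1)
                    return {
                        "fertility": len(token_ids) / max(len(words), 1),           # tokens per word
                        "compression_ratio": len(text.encode("utf-8")) / n_tokens,  # bytes per token
                        "chars_per_token": len(text) / n_tokens,
                    }
                ```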

                ### Coverage Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
                | **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
                | **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |
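
                STRR is the fraction of words that survive tokenization as exactly one token; the Continued Words Ratio is its complement. A minimal sketch under the same assumptions as above (encoding words in isolation is an approximation, since BPE tokenizers can split a word differently mid-sentence):

                ```python
                def strr(tokenizer, text: str) -> float:
                    # Share of whitespace-separated words kept as a single token.
                    words = text.split()
                    if not words:
                        return 0.0
                    single = sum(
                        1 for w in words
                        if len(tokenizer.encode(w, add_special_tokens=False)) == 1
                    )
                    return single / len(words)
                ```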

                ### Arabic-Specific Metrics

                | Metric | Description | Why It Matters |
                |--------|-------------|----------------|
                | **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
                | **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

                ### Scoring Formula (Leaderboard)

                ```
                Score = [(Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20)] × 100
                ```

                Where each component score is clamped to the range 0-1:
                - **Fertility Score** = 2.0 / fertility (lower fertility means a higher score)
                - **Compression Score** = compression / 6
                - **UNK Score** = 1 - (unk_ratio × 20) (fewer unknown tokens mean a higher score)
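
                As a worked example with hypothetical numbers, a tokenizer with fertility 1.6, compression 4.2 bytes/token, and unk_ratio 0.001 would score:

                ```python
                fertility_score = min(2.0 / 1.6, 1.0)           # 1.25, capped to 1.0
                compression_score = min(4.2 / 6, 1.0)           # 0.70
                unk_score = min(max(1 - 0.001 * 20, 0.0), 1.0)  # 0.98
                score = (fertility_score * 0.45
                         + compression_score * 0.35
                         + unk_score * 0.20) * 100              # 89.1
                ```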

                ### Research Background

                These metrics are based on recent research, including:

                - *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
                - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
                - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
                - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
                """)

            # ===== TAB 5: Submit Tokenizer =====
            with gr.TabItem("🚀 Submit", id="submit"):
                gr.Markdown("""
                ## 🚀 Submit Your Tokenizer

                Evaluate any HuggingFace tokenizer on **all 8 Arabic datasets** and see how it compares.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model Information")
                        submit_model_id = gr.Textbox(
                            label="HuggingFace Model ID *",
                            placeholder="e.g., google/gemma-2-9b",
                            info="The model ID from HuggingFace Hub"
                        )
                        submit_model_name = gr.Textbox(
                            label="Display Name (optional)",
                            placeholder="e.g., My Custom Tokenizer",
                            info="Leave empty to use model name"
                        )
                        submit_organization = gr.Textbox(
                            label="Organization (optional)",
                            placeholder="e.g., My Organization",
                            info="Leave empty to auto-detect"
                        )
                        submit_model_type = gr.Dropdown(
                            choices=[
                                "Arabic LLM",
                                "Arabic BERT",
                                "Arabic Tokenizer",
                                "Multilingual LLM",
                                "Custom"
                            ],
                            value="Custom",
                            label="Model Type"
                        )
                        submit_btn = gr.Button("🚀 Evaluate Tokenizer", variant="primary", size="lg")
                        submit_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Evaluation Results")
                        submit_results = gr.HTML()

                submit_btn.click(
                    fn=evaluate_submitted_tokenizer,
                    inputs=[submit_model_id, submit_model_name, submit_organization, submit_model_type],
                    outputs=[submit_results, submit_status]
                )

                gr.Markdown("""
                ---
                ### 📋 Submission Guidelines

                - **Model ID**: Must be a valid HuggingFace model ID (e.g., `organization/model-name`)
                - **Tokenizer**: The model must have a tokenizer that can be loaded with `AutoTokenizer` (see the sketch below)
                - **Public Models**: Only public models on the HuggingFace Hub are supported
                - **Evaluation**: Your tokenizer will be evaluated on all 8 Arabic datasets (~36,000 samples)
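
                Before submitting, you can sanity-check that your tokenizer loads; a minimal sketch (replace the placeholder model ID with your own):

                ```python
                from transformers import AutoTokenizer

                # If this raises, the Arena will not be able to load the tokenizer either.
                tok = AutoTokenizer.from_pretrained("organization/model-name")
                print(tok.tokenize("اللغة العربية"))  # "the Arabic language"
                ```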

                ### 💡 Tips

                - Lower fertility scores indicate better Arabic tokenization efficiency
                - Compare your results with the leaderboard to see how your tokenizer ranks
                """)

            # ===== TAB 6: About =====
            with gr.TabItem("ℹ️ About", id="about"):
                about_html = generate_about_html(
                    tokenizers_by_type,
                    len(available_tokenizers)
                )
                gr.HTML(about_html)

        # Load cached leaderboard results on page load (fast)
        demo.load(
            fn=get_cached_leaderboard,
            inputs=[],
            outputs=[leaderboard_output, per_dataset_output, status_output]
        )

    return demo


# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()