Spaces:

casamN
/

NRF_LLM

Running

App Files Files Community

NRF_LLM / app.py

casamN

Update app.py

1aadc7b verified about 2 months ago

raw

history blame contribute delete

18.5 kB

	# =========================================================
	# KIEMBU ↔ ENGLISH — NRF KENYA TRANSLATION SUITE
	# =========================================================

	import os
	import gradio as gr
	import fitz
	import faiss
	import re
	import numpy as np
	from sentence_transformers import SentenceTransformer
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
	from reportlab.lib.pagesizes import A4
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
	from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
	from reportlab.lib.units import inch
	from reportlab.pdfbase.cidfonts import UnicodeCIDFont
	from reportlab.pdfbase import pdfmetrics
	from PyPDF2 import PdfReader

	# ============================================
	# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
	# ============================================
	# ===========================================
	# Kiembu ↔ English Dictionary (Case & Punctuation Insensitive)
	# ===========================================
	# Seed vocabulary: maps a Kiembu term (key) to its English gloss (value).
	# Several glosses pack alternatives into one string separated by "/"
	# (e.g. "my/mine", "cow/cattle").
	#
	# NOTE(review): "nthĩ" and "ĩria" each appear twice in this literal. In a
	# Python dict literal the later entry replaces the earlier one, so "nthĩ"
	# ends up as "floor" (the "earth" gloss is lost) and "ĩria" ends up as
	# "milk" (the "lake/sea" gloss is lost).
	# NOTE(review): several English glosses are shared by more than one Kiembu
	# term (e.g. "man", "woman", "come", "far", "there"), so any value -> key
	# inversion of this mapping can keep only one Kiembu term per gloss.
	kiembu_to_english = {
	# Existing entries
	"Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
	"Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
	"Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
	"Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
	"Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine",
	"Maaĩ": "water", "Mwaki": "fire", "Rĩũa": "sun", "Mweri": "moon", "Njata": "star",
	"ĩthiga": "stone", "Mũtĩ": "tree", "ĩthangũ": "leaf", "Mũri": "root", "ĩkoro": "bark",
	"MũndũMũrume": "man", "MũndũMũka": "woman", "fafa": "father", "Mami": "mother",
	"Gũtũ": "ear", "Ritho": "eye", "ĩniũrũ": "nose", "Kanyua": "mouth", "ĩgego": "tooth",
	"rũrĩmĩ": "tongue", "Njara": "hand", "Kũgũrũ": "foot", "Thakame": "blood", "ĩvĩndĩ": "bone",
	"Ngothi": "skin", "Nyama": "meat", "Nthamaki": "fish", "Giconi": "bird", "ĩtumbĩ": "egg",
	"Nvĩa": "horn", "Mũkia": "tail", "ĩvuta": "feather", "Njuĩrĩ": "hair", "Kĩongo": "head",
	"Ngingo": "neck", "Mũrukuthu": "back", "Ngoro": "heart", "Itema": "liver", "nyua": "drink",
	"ria": "eat", "mama": "sleep", "kua": "die", "ũka": "come", "ona": "see", "ĩgua": "hear",
	"menya": "know", "ĩciria": "think", "uga": "say", "ĩmwe": "one", "ĩgarĩ": "two",
	"ĩthatũ": "three", "ĩnya": "four", "ĩthano": "five", "ithathatu": "six", "mugwanja": "seven",
	"inyanya": "eight", "kenda": "nine", "ĩkumi": "ten", "nene": "big", "nini": "small",
	"ndaca": "long", "nguvi": "short", "mbega": "good", "njũku": "bad", "mbĩcuru": "full",
	"ĩtikĩndu": "empty", "nviũ": "hot", "nvoru": "cold", "ũtukũ": "night", "Mũthenya": "day",
	"Mbura": "rain", "rũkũngi": "wind", "nthĩ": "earth", "kĩrĩma": "mountain", "rũnjĩ": "river",
	"ĩria": "lake/sea", "cumbĩ": "salt", "mũthanga": "sand", "ndogo": "smoke", "nyaki": "grass",
	"njira": "path", "kivaro": "field", "kuraca": "far", "vakuvi": "near", "ava": "here",
	"varia": "there", "ũũ": "who", "ndwi": "what", "kũ": "where", "rĩ": "when", "atia": "how",
	"ka": "not", "onthe": "all", "engĩ": "many", "anini": "few", "jerũ": "new", "ngũrũ": "old",
	"kĩthũrũrũ": "round", "kaũgĩ": "sharp", "ritwa": "name", "tirama": "stand", "ĩkara": "sit",
	"thiĩ": "walk", "ngaria": "run", "va": "give", "oca": "take", "nyita": "hold",
	"tiniaa": "cut", "ringa": "hit", "ikia": "throw", "via": "burn", "ĩthambĩra": "swim",
	"ina": "sing", "katika": "dance", "theka": "laugh", "rĩra": "cry", "rũma": "bite",
	"mumunya": "suck", "nungira": "smell", "ĩtigĩra": "fear", "wina toro": "sleepy",
	"mũvũtu": "hungry", "mũnyondu": "thirsty", "ndune": "red", "njerũ": "white",
	"mbirũ": "black", "ngirini": "green", "yellũ": "yellow", "mbulu": "blue",
	"matu": "cloud", "maturĩ": "sky", "rũkũngũ": "dust", "mũu": "ashes", "mũkanda": "rope",
	"kamũti": "stick", "kaviũ": "knife", "ũta": "bow", "mũgwi": "arrow", "itumũ": "spear",
	"gitegithamaki": "fishhook", "neti": "net", "ĩtaru": "canoe", "mũrango": "door",
	"ĩtara": "roof", "nthĩ": "floor", "Mũgeka": "mat", "kĩtanda": "bed", "mũrengeti": "blanket",
	"nyũngũ": "pot", "kanya": "calabash", "gĩkapũ": "basket", "nduramu": "drum",
	"rwĩmbo": "song", "rũgano": "story", "thakania": "play", "mũrata": "friend",
	"nthũ": "enemy", "civũ": "chief", "mũkũrũ": "elder", "ĩria": "milk", "ngombe": "cow/cattle",
	"mbũri": "goat", "ngondu": "sheep", "ngũkũ": "chicken", "ngamĩra": "camel",
	"nvuda": "donkey", "ndegwa": "ox", "mbegũ": "seed", "ketha": "harvest", "ũma": "hoe",
	"ĩthanwa": "axe", "mũro": "digging stick", "Mũtumi ciodo": "weaver",
	"Mwaki nyũngũ": "potter", "mũturi": "blacksmith", "mũgwĩmi": "hunter",
	"mũrĩthi": "herdsman", "mũteginthamaki": "fisherman", "thoko": "market",
	"kwendia": "trade", "cenjania": "barter", "mathaa": "time", "mavinda": "season",
	"ĩvinda rĩa riũa": "dry season", "ivinda ria mbura": "rainy season", "rĩũra": "famine",
	"thayũ": "peace", "mbara": "war", "gũrana": "marriage", "mũviki": "bride",
	"mũvikania": "groom", "ĩrua": "initiation", "kũrua": "circumcision",
	"kĩkuũ": "death", "ngoma": "spirit", "ngomi": "ancestor", "mũgĩmbĩ": "finger millet",
	"mũkombi": "pearl millet", "mwere": "bulrush millet", "mũvia": "sorghum",
	"mbembe": "maize", "minji": "cowpea", "ndengũ": "green gram", "njavĩ": "pigeon pea",
	"ndũma": "arrowroot/taro", "mwanga": "cassava", "gĩkũa": "yam", "ngwacĩ": "sweet potato",
	"ĩrenge": "pumpkin", "sukuma": "kale", "terere": "amaranth", "Thageti": "spider plant",
	"kaũrũra": "pumpkin leaves", "kunde": "cowpea leaves", "Mabuyu": "baobab fruit",
	"nthithi": "tamarind", "mbera": "guava", "matimoko": "custard apple/soursop",
	"macuca": "loquat", "kĩgwa": "sugarcane", "njahĩ": "sesame", "Marũrũ": "sunflower",
	"mbiringanya": "eggplant", "nyanya": "tomato", "gĩtũngũrũ": "onion",
	"kĩtũngũrũ saumu": "garlic", "tangauthi": "ginger", "murende": "turmeric",
	"nduru": "chili", "mboga": "cabbage", "karati": "carrot", "njukĩ": "bee", "ukĩ": "honey",
	"mwatu": "beehive", "mabaki": "wax", "maguta": "butter", "kĩrimũ": "cream",
	"alenya": "ghee", "ĩria ra kũgandithua": "sour milk"
	}

	# --- Helper: Normalize user input ---
	def normalize(text):
	    """
	    Return a canonical lookup key for a dictionary term or user input.

	    The text is Unicode-normalized to NFC, lowercased, stripped of the
	    punctuation characters . , ? ! - and has its whitespace trimmed and
	    collapsed to single spaces.

	    Args:
	        text: The raw term. ``None`` is treated as an empty string; any
	            other non-string value is converted with ``str()``.

	    Returns:
	        The cleaned key as a ``str`` (possibly empty).
	    """
	    # Function-scope import keeps this block self-contained.
	    import unicodedata

	    if text is None:
	        return ""

	    # Letters such as "ĩ" / "ũ" can arrive either precomposed or as a base
	    # letter plus a combining tilde; NFC folds both spellings to one form so
	    # they compare equal during lookup.
	    text = unicodedata.normalize("NFC", str(text)).lower()
	    text = re.sub(r"[.,?!-]", "", text)  # remove punctuation

	    # Trim only after punctuation is gone: "Ri ?" must become "ri", not "ri ".
	    # split()/join also collapses repeated inner spaces in multi-word phrases.
	    return " ".join(text.split())

	# --- Prepare lookup tables (in lowercase) ---
	# Both directions are filled in a single pass over the seed dictionary.
	# Keys are normalized, so entries whose normalized forms collide overwrite
	# one another and the entry seen last is the one that is kept.
	kiembu_lower, english_lower = {}, {}
	for kiembu_term, english_term in kiembu_to_english.items():
	    kiembu_lower[normalize(kiembu_term)] = english_term
	    english_lower[normalize(english_term)] = kiembu_term

	# --- Translation Function ---
	def translate_word(word, direction):
	    """
	    Look up a word or short phrase in the dictionary tables.

	    The input is passed through ``normalize`` first, so casing and the
	    punctuation it removes do not affect the match.

	    Args:
	        word: Text to translate.
	        direction: Either "Kiembu → English" or "English → Kiembu".

	    Returns:
	        The translation, "Not found in dictionary" when there is no entry,
	        or an explanatory message when ``direction`` is not recognised.
	    """
	    lookup_key = normalize(word)
	    not_found = "Not found in dictionary"

	    if direction == "English → Kiembu":
	        return english_lower.get(lookup_key, not_found)
	    if direction == "Kiembu → English":
	        return kiembu_lower.get(lookup_key, not_found)
	    return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."


	# ============================================
	# SECTION 2 — PDF TRANSLATION (Transformer + PDF)
	# ============================================

	# Shared translation pipeline used by translate_text().
	# NOTE(review): the checkpoint is marked as a placeholder, and its "en-sw"
	# suffix suggests an English -> Swahili model rather than a Kiembu one —
	# confirm before relying on the output as Kiembu.
	translator = pipeline(
	    task="translation",
	    model="Helsinki-NLP/opus-mt-en-sw",  # placeholder
	)

	def extract_text_from_pdf(pdf_file):
	    """
	    Gather the extractable text of every page in a PDF.

	    Args:
	        pdf_file: Whatever ``PdfReader`` accepts as its source.

	    Returns:
	        The page texts joined by newlines, with leading and trailing
	        whitespace removed. Pages that yield no text are skipped, so the
	        result is an empty string when nothing could be extracted.
	    """
	    page_texts = (page.extract_text() for page in PdfReader(pdf_file).pages)
	    combined = "".join(content + "\n" for content in page_texts if content)
	    return combined.strip()

	def translate_text(text):
	    """
	    Translate text piece by piece with the module-level ``translator``.

	    The text is split on ". ", each non-blank piece is translated on its
	    own, and the results are re-joined with ". ".

	    Args:
	        text: Source text to translate.

	    Returns:
	        The re-joined translation. A piece whose translation raises is kept
	        untranslated (best effort) instead of aborting the whole document.
	    """
	    results = []
	    for sentence in text.split(". "):
	        stripped = sentence.strip()
	        if not stripped:
	            continue  # blank pieces contribute nothing to the output
	        try:
	            results.append(translator(stripped)[0]["translation_text"])
	        except Exception:
	            # Best-effort fallback: keep the original, untranslated piece.
	            results.append(sentence)
	    return ". ".join(results)

	def create_pdf(translated_text, output_path="translated_output.pdf"):
	    """
	    Render translated text into an A4 PDF with a centred title.

	    Each non-blank line of ``translated_text`` becomes one justified
	    paragraph followed by a small spacer.

	    Args:
	        translated_text: Text to lay out; lines are separated by "\\n".
	        output_path: Destination file path for the generated PDF.

	    Returns:
	        ``output_path``, after the document has been written.
	    """
	    # Function-scope import keeps this block self-contained (stdlib only).
	    from xml.sax.saxutils import escape

	    # NOTE(review): this is a CID font registered for every call; whether it
	    # covers letters such as "ĩ"/"ũ" is not verifiable from this file — confirm.
	    font_name = 'HeiseiKakuGo-W5'
	    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

	    # NOTE(review): with the default output_path every call writes the same
	    # file, so concurrent requests would overwrite each other — confirm.
	    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=60, leftMargin=60, topMargin=72, bottomMargin=72)
	    styles = getSampleStyleSheet()
	    title_style = ParagraphStyle(name='TitleStyle', parent=styles['Heading1'],
	                                 alignment=TA_CENTER, fontName=font_name,
	                                 fontSize=16, spaceAfter=20)
	    body_style = ParagraphStyle(name='BodyStyle', parent=styles['Normal'],
	                                alignment=TA_JUSTIFY, fontName=font_name,
	                                fontSize=12, leading=16)

	    story = [Paragraph("Translated Document — English → Kiembu", title_style),
	             Spacer(1, 0.3 * inch)]
	    for para in translated_text.split("\n"):
	        para = para.strip()
	        if not para:
	            continue
	        # Paragraph interprets its text as markup, so "&", "<" and ">" coming
	        # from an arbitrary document must be escaped to be rendered literally.
	        story.append(Paragraph(escape(para), body_style))
	        story.append(Spacer(1, 0.2 * inch))

	    doc.build(story)
	    return output_path

	def translate_pdf_to_kiembu(pdf_file):
	    """
	    Translate an uploaded PDF and produce a new PDF with the result.

	    Args:
	        pdf_file: The upload from the UI: either an object exposing the
	            file path as ``.name`` or the path itself. ``None`` means
	            nothing was uploaded.

	    Returns:
	        A ``(output_path, status_message)`` tuple. ``output_path`` is
	        ``None`` when there was no upload or no readable text.
	    """
	    # The button can be clicked before a file is chosen.
	    if pdf_file is None:
	        return None, "Please upload a PDF file first."

	    # Accept both upload objects (path in .name) and plain path strings.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    text = extract_text_from_pdf(pdf_path)
	    if not text:
	        return None, "No readable text found in the uploaded PDF."

	    translated_text = translate_text(text)
	    output_pdf_path = create_pdf(translated_text)
	    return output_pdf_path, "Translation complete! Download below."


	# ============================================
	# SECTION 3 — NRF LLM MODEL PDF CHAT
	# ============================================

	# Access token for the generation checkpoint, read from the environment
	# (None when the variable is not set).
	hf_token = os.getenv("NRF_LLM_TOKEN")
	model_name = "google/gemma-2b-it"

	# Sentence encoder used for both the PDF chunks and the user's question.
	embed_model = SentenceTransformer("all-MiniLM-L6-v2")

	# Causal LM that writes the answers, wrapped in a text-generation pipeline.
	# NOTE(review): `use_auth_token` may be deprecated in favour of `token` in
	# newer transformers releases — confirm against the pinned version.
	tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
	model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    device_map="auto",
	    torch_dtype="auto",
	    use_auth_token=hf_token,
	)
	generator = pipeline(
	    "text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    max_new_tokens=200,
	)

	# Module-level state for the single PDF currently loaded in the chat tab.
	chunks = []
	index = None
	pdf_loaded = False

	def extract_pdf_text(pdf_file):
	    """
	    Return the concatenated text of every page of a PDF.

	    Args:
	        pdf_file: An upload object exposing the file path as ``.name``, or
	            the path itself.

	    Returns:
	        The text of all pages joined together with no separator.
	    """
	    # Accept both upload objects (path in .name) and plain path strings.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    # The context manager closes the document even if a page fails to
	    # extract; previously the handle was never released.
	    with fitz.open(pdf_path) as doc:
	        return "".join(page.get_text() for page in doc)

	def chunk_text(text, chunk_size=500, overlap=100):
	    """
	    Split text into overlapping chunks of whitespace-separated words.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words shared by consecutive chunks; must satisfy
	            ``0 <= overlap < chunk_size``.

	    Returns:
	        A list of chunk strings (words re-joined with single spaces). Empty
	        or whitespace-only text yields an empty list.

	    Raises:
	        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
	            overlap of ``chunk_size`` or more would never advance through
	            the text.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

	    words = text.split()
	    step = chunk_size - overlap
	    chunk_list = []
	    for start in range(0, len(words), step):
	        end = start + chunk_size
	        chunk_list.append(" ".join(words[start:end]))
	        # Once a chunk reaches the last word, any further window would only
	        # repeat the tail of this chunk, so stop here.
	        if end >= len(words):
	            break
	    return chunk_list

	def embed_chunks(chunks_list):
	    """
	    Encode text chunks and load the vectors into a new FAISS L2 index.

	    Args:
	        chunks_list: The chunk strings to encode with ``embed_model``.

	    Returns:
	        A ``faiss.IndexFlatL2`` holding one vector per chunk, in the same
	        order as ``chunks_list``.
	    """
	    vectors = embed_model.encode(chunks_list)
	    # The second axis of the encoder output is used as the index dimension.
	    dimension = vectors.shape[1]
	    flat_index = faiss.IndexFlatL2(dimension)
	    flat_index.add(np.array(vectors))
	    return flat_index

	def load_pdf_and_prepare(pdf_file):
	    """
	    Extract, chunk and index an uploaded PDF for question answering.

	    On success the module-level ``chunks``, ``index`` and ``pdf_loaded``
	    are replaced together. On any failure they are left untouched, so a
	    previously loaded PDF remains usable and ``chunks`` never gets out of
	    step with ``index``.

	    Args:
	        pdf_file: The upload from the UI, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A status message for display in the UI.
	    """
	    global chunks, index, pdf_loaded

	    # The button can be clicked before a file is chosen.
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF first."

	    try:
	        text = extract_pdf_text(pdf_file)
	        new_chunks = chunk_text(text)
	        if not new_chunks:
	            # Without this check, indexing an empty chunk list fails with an
	            # unhelpful low-level error message.
	            return "❌ Error: No readable text found in the uploaded PDF."
	        new_index = embed_chunks(new_chunks)
	    except Exception as e:
	        return f"❌ Error: {str(e)}"

	    # Publish the new state only after every step has succeeded.
	    chunks, index, pdf_loaded = new_chunks, new_index, True
	    return "✅ PDF uploaded and processed successfully."

	def delete_pdf():
	    """
	    Forget the currently loaded PDF.

	    Resets the module-level ``chunks``, ``index`` and ``pdf_loaded`` to
	    their initial empty values.

	    Returns:
	        A confirmation message for display in the UI.
	    """
	    global chunks, index, pdf_loaded
	    pdf_loaded = False
	    index = None
	    chunks = []
	    return "🗑️ PDF cleared. Ready for new upload."

	def query_pdf(question, top_k=3):
	    """
	    Answer a question using the most similar chunks of the loaded PDF.

	    The question is embedded, the nearest chunks are retrieved from the
	    module-level ``index`` and placed into a prompt, and the text that the
	    generator produces after the final "Answer:" marker is returned.

	    Args:
	        question: The user's question.
	        top_k: Maximum number of chunks to use as context.

	    Returns:
	        The generated answer, or a warning message when no PDF is loaded
	        or the question is blank.
	    """
	    if not pdf_loaded:
	        return "⚠️ Please upload and process a PDF first."
	    if not question or not question.strip():
	        return "⚠️ Please enter a question."

	    # Never request more neighbours than there are chunks: the index pads
	    # missing results with -1, which chunks[-1] would silently turn into
	    # repeated copies of the last chunk.
	    k = max(1, min(top_k, len(chunks)))
	    question_embedding = embed_model.encode([question])
	    _, neighbours = index.search(np.array(question_embedding), k)
	    # Second line of defence against padded or otherwise invalid indices.
	    context = "\n".join(chunks[i] for i in neighbours[0] if 0 <= i < len(chunks))

	    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
	    response = generator(prompt)[0]["generated_text"]
	    # Keep only what follows the last "Answer:" marker in the generated text.
	    return response.split("Answer:")[-1].strip()


	# ============================================
	# SECTION 4 — ENHANCED GRADIO UI
	# ============================================

	def build_app():
	    """
	    Assemble the Gradio interface and return it without launching.

	    The interface has four tabs: a dictionary translator, an English PDF
	    translator, a PDF question-answering chat and an About page. Each
	    button is wired to the corresponding module-level function.

	    Returns:
	        The constructed ``gr.Blocks`` application.
	    """
	    custom_css = """
	    body {
	        background: #f5f5f5;
	        margin: 0;
	        padding: 0;
	        overflow: auto;
	    }

	    .gradio-container {
	        display: flex;
	        flex-direction: column;
	        align-items: center;
	        justify-content: flex-start;
	        min-height: 100vh;
	        padding: 30px 15px;
	        box-sizing: border-box;
	        border: 2px solid #ccc;
	        border-radius: 16px;
	        box-shadow: 0 4px 16px rgba(0,0,0,0.1);
	        background: white;
	        max-width: 900px;
	        margin: 20px auto;
	        overflow-y: auto;
	    }

	    ::-webkit-scrollbar {
	        width: 10px;
	    }
	    ::-webkit-scrollbar-track {
	        background: #eee;
	        border-radius: 10px;
	    }
	    ::-webkit-scrollbar-thumb {
	        background: #aaa;
	        border-radius: 10px;
	    }
	    ::-webkit-scrollbar-thumb:hover {
	        background: #777;
	    }

	    textarea, input[type="text"], .gr-textbox, .gr-input {
	        border: 2px solid #bbb !important;
	        border-radius: 10px !important;
	        padding: 8px !important;
	        box-shadow: inset 0 2px 4px rgba(0,0,0,0.05);
	        transition: border-color 0.2s ease, box-shadow 0.2s ease;
	    }
	    textarea:focus, input[type="text"]:focus {
	        border-color: #0078D7 !important;
	        box-shadow: 0 0 5px rgba(0,120,215,0.3) !important;
	        outline: none;
	    }

	    button, .gr-button {
	        border-radius: 10px !important;
	        padding: 10px 16px !important;
	        font-weight: 600 !important;
	    }
	    """

	    with gr.Blocks(
	        title="Kiembu ↔ English — NRF Kenya Project",
	        css=custom_css,
	        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
	    ) as app:

	        gr.Markdown("""
	        <div style='text-align:center'>
	        <h1 style='color:#003366;'>Kiembu ↔ English Translation Suite</h1>
	        <h3 style='color:#d4a017;'>Funded by NRF Kenya — Creating LLMs that Understand Native Languages</h3>
	        <hr style='border:1px solid #003366;width:80%;margin:auto'>
	        </div>
	        """)

	        with gr.Tabs():

	            # -----------------------------
	            # TAB 1: DICTIONARY TRANSLATOR
	            # -----------------------------
	            with gr.TabItem("Dictionary Translator"):
	                # Plain "|" delimiters: a backslash-escaped pipe is an invalid
	                # Python escape and is shown as literal text, not a table.
	                gr.Markdown("""
	                ### Quick Word Translation — Kiembu ↔ English
	                Enter a single word or short phrase and get its translation instantly.

	                Sample Words:

	                | Kiembu | English |
	                |:--|:--|
	                | Uvoro | how are you |
	                | Ri? | When? |
	                | Ku? | Where? |
	                | Uka | come |
	                """)
	                inp = gr.Textbox(label="Enter Word", placeholder="e.g. 'Uvoro' or 'how are you'", lines=1)
	                dir_sel = gr.Radio(
	                    ["Kiembu → English", "English → Kiembu"],
	                    value="Kiembu → English",
	                    label="Select Direction"
	                )
	                out = gr.Textbox(label="Translation Result")
	                gr.Button("Translate").click(translate_word, [inp, dir_sel], out)

	            # -----------------------------
	            # TAB 2: PDF TRANSLATION
	            # -----------------------------
	            with gr.TabItem("PDF Translation"):
	                gr.Markdown("""
	                ### English → Kiembu PDF Translator
	                Upload an English PDF document (e.g., ID form, hospital form, passport form)
	                and get a translated PDF in Kiembu for download.
	                """)
	                pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
	                translate_btn = gr.Button("Translate to Kiembu")
	                output_file = gr.File(label="Download Translated PDF")
	                translate_status = gr.Textbox(label="Status", interactive=False)
	                translate_btn.click(
	                    translate_pdf_to_kiembu,
	                    inputs=[pdf_input],
	                    outputs=[output_file, translate_status],
	                )

	            # -----------------------------
	            # TAB 3: NRF LLM MODEL Q&A
	            # -----------------------------
	            with gr.TabItem("PDF Chat (NRF LLM Model)"):
	                gr.Markdown("""
	                ### Interactive PDF Chat — NRF LLM Model
	                Upload any informative PDF (e.g., government report, history book, or manual)
	                and ask natural-language questions to understand its content better.

	                Examples:
	                - "What does this document say about birth registration?"
	                - "Summarize Chapter 2."
	                """)
	                # Same PDF-only filter as the translation tab; the status box
	                # has its own name so it no longer shadows the one in tab 2.
	                chat_pdf = gr.File(label="Upload PDF Document", file_types=[".pdf"])
	                chat_status = gr.Textbox(label="Status")
	                gr.Button("Process PDF").click(load_pdf_and_prepare, chat_pdf, chat_status)
	                gr.Button("Clear PDF").click(delete_pdf, None, chat_status)
	                q = gr.Textbox(lines=2, label="Ask a Question", placeholder="e.g. 'Summarize the introduction section.'")
	                ans = gr.Textbox(lines=6, label="Answer")
	                gr.Button("Query PDF").click(query_pdf, q, ans)

	            # -----------------------------
	            # TAB 4: ABOUT
	            # -----------------------------
	            with gr.TabItem("About"):
	                gr.Markdown("""
	                ### About the Project
	                The NRF Kenya Project on Creating LLMs that Understand Native Languages
	                aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.

	                - Languages Supported: Kiembu ↔ English
	                - Core Engine: NRF LLM Model under development
	                - Principal Investigator: Prof Lucy Kawira – Chuka University
	                - Developed by: Technical Team — Coordinator: Casam Njagi – Chuka University
	                - Funding Agency: National Research Fund (NRF), Kenya
	                - Objective: Foster inclusion of native languages in AI-driven communication.
	                """)

	        gr.Markdown("""
	        <hr style='border:0.5px solid #ccc'>
	        <div style='text-align:center;color:#003366;font-size:14px'>
	        © 2025 National Research Fund (NRF) Kenya — All Rights Reserved
	        </div>
	        """)

	    return app


	# Build the interface at import time so `demo` stays available as a
	# module-level name for anything that looks it up.
	demo = build_app()

	# Start the web server only when this file is executed as a script, so
	# importing the module does not block on a running server.
	if __name__ == "__main__":
	    demo.launch()