# NRF_LLM / app.py
# casamN's picture
# Update app.py
# 1aadc7b verified
# =========================================================
# KIEMBU ↔ ENGLISH — NRF KENYA TRANSLATION SUITE
# =========================================================
import os
import gradio as gr
import fitz
import faiss
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
from reportlab.lib.units import inch
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase import pdfmetrics
from PyPDF2 import PdfReader
# ============================================
# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
# ============================================
# ===========================================
# Kiembu ↔ English Dictionary (Case & Punctuation Insensitive)
# ===========================================
# Seed vocabulary mapping a Kiembu word or short phrase to its English gloss.
# Some glosses list alternatives separated by "/" (e.g. "my/mine").
# NOTE(review): the keys "nthĩ" ("earth" / "floor") and "ĩria" ("lake/sea" /
# "milk") appear twice below. If the spellings are byte-identical, Python keeps
# only the later value of a repeated key in a dict literal, so the earlier
# gloss is silently dropped — confirm which meaning is intended.
# NOTE(review): a few keys differ only by letter case ("Kuraca"/"kuraca",
# "Varia"/"varia"); they carry the same gloss, so a case-insensitive lookup is
# unaffected.
kiembu_to_english = {
# Existing entries
"Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
"Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
"Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
"Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
"Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine",
"Maaĩ": "water", "Mwaki": "fire", "Rĩũa": "sun", "Mweri": "moon", "Njata": "star",
"ĩthiga": "stone", "Mũtĩ": "tree", "ĩthangũ": "leaf", "Mũri": "root", "ĩkoro": "bark",
"MũndũMũrume": "man", "MũndũMũka": "woman", "fafa": "father", "Mami": "mother",
"Gũtũ": "ear", "Ritho": "eye", "ĩniũrũ": "nose", "Kanyua": "mouth", "ĩgego": "tooth",
"rũrĩmĩ": "tongue", "Njara": "hand", "Kũgũrũ": "foot", "Thakame": "blood", "ĩvĩndĩ": "bone",
"Ngothi": "skin", "Nyama": "meat", "Nthamaki": "fish", "Giconi": "bird", "ĩtumbĩ": "egg",
"Nvĩa": "horn", "Mũkia": "tail", "ĩvuta": "feather", "Njuĩrĩ": "hair", "Kĩongo": "head",
"Ngingo": "neck", "Mũrukuthu": "back", "Ngoro": "heart", "Itema": "liver", "nyua": "drink",
"ria": "eat", "mama": "sleep", "kua": "die", "ũka": "come", "ona": "see", "ĩgua": "hear",
"menya": "know", "ĩciria": "think", "uga": "say", "ĩmwe": "one", "ĩgarĩ": "two",
"ĩthatũ": "three", "ĩnya": "four", "ĩthano": "five", "ithathatu": "six", "mugwanja": "seven",
"inyanya": "eight", "kenda": "nine", "ĩkumi": "ten", "nene": "big", "nini": "small",
"ndaca": "long", "nguvi": "short", "mbega": "good", "njũku": "bad", "mbĩcuru": "full",
"ĩtikĩndu": "empty", "nviũ": "hot", "nvoru": "cold", "ũtukũ": "night", "Mũthenya": "day",
"Mbura": "rain", "rũkũngi": "wind", "nthĩ": "earth", "kĩrĩma": "mountain", "rũnjĩ": "river",
"ĩria": "lake/sea", "cumbĩ": "salt", "mũthanga": "sand", "ndogo": "smoke", "nyaki": "grass",
"njira": "path", "kivaro": "field", "kuraca": "far", "vakuvi": "near", "ava": "here",
"varia": "there", "ũũ": "who", "ndwi": "what", "kũ": "where", "rĩ": "when", "atia": "how",
"ka": "not", "onthe": "all", "engĩ": "many", "anini": "few", "jerũ": "new", "ngũrũ": "old",
"kĩthũrũrũ": "round", "kaũgĩ": "sharp", "ritwa": "name", "tirama": "stand", "ĩkara": "sit",
"thiĩ": "walk", "ngaria": "run", "va": "give", "oca": "take", "nyita": "hold",
"tiniaa": "cut", "ringa": "hit", "ikia": "throw", "via": "burn", "ĩthambĩra": "swim",
"ina": "sing", "katika": "dance", "theka": "laugh", "rĩra": "cry", "rũma": "bite",
"mumunya": "suck", "nungira": "smell", "ĩtigĩra": "fear", "wina toro": "sleepy",
"mũvũtu": "hungry", "mũnyondu": "thirsty", "ndune": "red", "njerũ": "white",
"mbirũ": "black", "ngirini": "green", "yellũ": "yellow", "mbulu": "blue",
"matu": "cloud", "maturĩ": "sky", "rũkũngũ": "dust", "mũu": "ashes", "mũkanda": "rope",
"kamũti": "stick", "kaviũ": "knife", "ũta": "bow", "mũgwi": "arrow", "itumũ": "spear",
"gitegithamaki": "fishhook", "neti": "net", "ĩtaru": "canoe", "mũrango": "door",
"ĩtara": "roof", "nthĩ": "floor", "Mũgeka": "mat", "kĩtanda": "bed", "mũrengeti": "blanket",
"nyũngũ": "pot", "kanya": "calabash", "gĩkapũ": "basket", "nduramu": "drum",
"rwĩmbo": "song", "rũgano": "story", "thakania": "play", "mũrata": "friend",
"nthũ": "enemy", "civũ": "chief", "mũkũrũ": "elder", "ĩria": "milk", "ngombe": "cow/cattle",
"mbũri": "goat", "ngondu": "sheep", "ngũkũ": "chicken", "ngamĩra": "camel",
"nvuda": "donkey", "ndegwa": "ox", "mbegũ": "seed", "ketha": "harvest", "ũma": "hoe",
"ĩthanwa": "axe", "mũro": "digging stick", "Mũtumi ciodo": "weaver",
"Mwaki nyũngũ": "potter", "mũturi": "blacksmith", "mũgwĩmi": "hunter",
"mũrĩthi": "herdsman", "mũteginthamaki": "fisherman", "thoko": "market",
"kwendia": "trade", "cenjania": "barter", "mathaa": "time", "mavinda": "season",
"ĩvinda rĩa riũa": "dry season", "ivinda ria mbura": "rainy season", "rĩũra": "famine",
"thayũ": "peace", "mbara": "war", "gũrana": "marriage", "mũviki": "bride",
"mũvikania": "groom", "ĩrua": "initiation", "kũrua": "circumcision",
"kĩkuũ": "death", "ngoma": "spirit", "ngomi": "ancestor", "mũgĩmbĩ": "finger millet",
"mũkombi": "pearl millet", "mwere": "bulrush millet", "mũvia": "sorghum",
"mbembe": "maize", "minji": "cowpea", "ndengũ": "green gram", "njavĩ": "pigeon pea",
"ndũma": "arrowroot/taro", "mwanga": "cassava", "gĩkũa": "yam", "ngwacĩ": "sweet potato",
"ĩrenge": "pumpkin", "sukuma": "kale", "terere": "amaranth", "Thageti": "spider plant",
"kaũrũra": "pumpkin leaves", "kunde": "cowpea leaves", "Mabuyu": "baobab fruit",
"nthithi": "tamarind", "mbera": "guava", "matimoko": "custard apple/soursop",
"macuca": "loquat", "kĩgwa": "sugarcane", "njahĩ": "sesame", "Marũrũ": "sunflower",
"mbiringanya": "eggplant", "nyanya": "tomato", "gĩtũngũrũ": "onion",
"kĩtũngũrũ saumu": "garlic", "tangauthi": "ginger", "murende": "turmeric",
"nduru": "chili", "mboga": "cabbage", "karati": "carrot", "njukĩ": "bee", "ukĩ": "honey",
"mwatu": "beehive", "mabaki": "wax", "maguta": "butter", "kĩrimũ": "cream",
"alenya": "ghee", "ĩria ra kũgandithua": "sour milk"
}
# --- Helper: Normalize user input ---
def normalize(text):
    """
    Clean text for case-, punctuation- and spacing-insensitive lookup.

    Steps, in order:
      1. compose Unicode to NFC so "ĩ" typed as "i" + combining tilde equals
         the precomposed letter;
      2. lowercase;
      3. remove the punctuation characters . , ? ! -
      4. collapse runs of whitespace and trim both ends.

    Whitespace is trimmed *after* punctuation removal so that input such as
    "Uvoro !" does not keep a trailing space and miss its dictionary entry.

    Args:
        text: The raw word or phrase. ``None`` and empty input are accepted.

    Returns:
        The normalized string, or "" when there is nothing to normalize.
    """
    # Local import keeps this block self-contained; the module is stdlib.
    import unicodedata

    if not text:
        return ""
    text = unicodedata.normalize("NFC", str(text)).lower()
    text = re.sub(r"[.,?!-]", "", text)  # remove punctuation
    # split()/join trims the ends and squeezes internal whitespace in one go.
    return " ".join(text.split())
# --- Prepare lookup tables (in lowercase) ---
def _index_english_glosses(entries):
    """
    Build the English → Kiembu lookup table from the Kiembu → English entries.

    Every full gloss is indexed under its normalized form (when several Kiembu
    words share a gloss, the last one wins, as a plain dict comprehension
    would). Glosses that list alternatives with "/" (e.g. "cow/cattle") are
    additionally indexed under each alternative, without overriding an entry
    that already exists, so that "cow" and "cattle" are both found.
    """
    table = {normalize(english): kiembu for kiembu, english in entries.items()}
    for kiembu, english in entries.items():
        for gloss in english.split("/"):
            key = normalize(gloss)
            if key:
                # setdefault: never displace an exact full-gloss match.
                table.setdefault(key, kiembu)
    return table


kiembu_lower = {normalize(k): v for k, v in kiembu_to_english.items()}
english_lower = _index_english_glosses(kiembu_to_english)
# --- Translation Function ---
def translate_word(word, direction):
    """
    Look up a word or short phrase in the dictionary tables.

    The input is normalized first, so case and the punctuation handled by
    ``normalize`` do not affect the match.

    Args:
        word: Text to translate.
        direction: "Kiembu → English" or "English → Kiembu".

    Returns:
        The translation, "Not found in dictionary" when there is no entry,
        or an explanatory message when ``direction`` is not recognised.
    """
    lookup_key = normalize(word)
    tables_by_direction = (
        ("Kiembu → English", kiembu_lower),
        ("English → Kiembu", english_lower),
    )
    for label, table in tables_by_direction:
        if direction == label:
            return table.get(lookup_key, "Not found in dictionary")
    return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."
# ============================================
# SECTION 2 — PDF TRANSLATION (Transformer + PDF)
# ============================================
# Module-level translation pipeline used by translate_text(); it is created
# once, when this module is loaded.
# NOTE(review): the checkpoint name suggests an English → Swahili model
# ("en-sw") and it is marked as a placeholder, so output is presumably not
# Kiembu yet — confirm before presenting results as Kiembu.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-sw") # placeholder
def extract_text_from_pdf(pdf_file):
    """
    Return the text of every page of a PDF, one page per line group.

    Pages for which ``extract_text()`` yields nothing are skipped; each
    remaining page is followed by a newline and the final result is stripped
    of leading and trailing whitespace.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source.

    Returns:
        The concatenated page text (possibly an empty string).
    """
    reader = PdfReader(pdf_file)
    extracted = (page.extract_text() for page in reader.pages)
    combined = "".join(page_text + "\n" for page_text in extracted if page_text)
    return combined.strip()
def translate_text(text):
    """
    Translate text sentence by sentence with the module-level ``translator``.

    The text is split on ". "; blank pieces are dropped. Each remaining piece
    is stripped and translated. If translating a piece raises, the original
    (unstripped) piece is kept instead, so one bad sentence does not abort the
    whole document.

    Args:
        text: Source text.

    Returns:
        The translated pieces joined back together with ". ".
    """
    pieces = []
    for sentence in text.split(". "):
        trimmed = sentence.strip()
        if not trimmed:
            continue
        try:
            pieces.append(translator(trimmed)[0]["translation_text"])
        except Exception:
            # Best-effort: fall back to the untranslated sentence.
            pieces.append(sentence)
    return ". ".join(pieces)
def create_pdf(translated_text, output_path="translated_output.pdf"):
    """
    Render translated text into an A4 PDF with a centred title.

    Each non-blank line of ``translated_text`` becomes one justified
    paragraph followed by a small spacer.

    Args:
        translated_text: Text to lay out; split into paragraphs on newlines.
        output_path: Destination file path for the generated PDF.

    Returns:
        ``output_path``, after the document has been written.
    """
    # Local import keeps this block self-contained; the module is stdlib.
    from xml.sax.saxutils import escape

    # NOTE(review): 'HeiseiKakuGo-W5' is a CID font; confirm it renders the
    # Kiembu diacritics (ĩ, ũ) as expected.
    pdfmetrics.registerFont(UnicodeCIDFont('HeiseiKakuGo-W5'))
    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=60, leftMargin=60,
                            topMargin=72, bottomMargin=72)
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(name='TitleStyle', parent=styles['Heading1'],
                                 alignment=TA_CENTER, fontName='HeiseiKakuGo-W5',
                                 fontSize=16, spaceAfter=20)
    body_style = ParagraphStyle(name='BodyStyle', parent=styles['Normal'],
                                alignment=TA_JUSTIFY, fontName='HeiseiKakuGo-W5',
                                fontSize=12, leading=16)
    story = [Paragraph("Translated Document — English → Kiembu", title_style),
             Spacer(1, 0.3 * inch)]
    for para in translated_text.split("\n"):
        para = para.strip()
        if not para:
            continue
        # Paragraph parses its text as XML-like markup, so text extracted from
        # an arbitrary PDF must have &, < and > escaped or the build can fail
        # (or interpret stray tags).
        story.append(Paragraph(escape(para), body_style))
        story.append(Spacer(1, 0.2 * inch))
    doc.build(story)
    return output_path
def translate_pdf_to_kiembu(pdf_file):
    """
    Translate an uploaded PDF and produce a new PDF with the result.

    Args:
        pdf_file: The upload from the UI. May be ``None`` (nothing uploaded),
            an object exposing the file path as ``.name``, or a plain path
            string.

    Returns:
        A ``(path, message)`` tuple: the generated PDF path (or ``None`` when
        nothing could be produced) and a status message for the user.
    """
    if pdf_file is None:
        # Button pressed without an upload: report it instead of crashing.
        return None, "Please upload a PDF file first."
    # Accept both file-like wrappers (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None, "No readable text found in the uploaded PDF."
    translated_text = translate_text(text)
    # NOTE: create_pdf's default output path is used, so every request writes
    # to the same file.
    output_pdf_path = create_pdf(translated_text)
    return output_pdf_path, "Translation complete! Download below."
# ============================================
# SECTION 3 — NRF LLM MODEL PDF CHAT
# ============================================
# Sentence-embedding model used to vectorise PDF chunks and questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Causal LM used to answer questions about the uploaded PDF.
model_name = "google/gemma-2b-it"
# Access token read from the environment; None when NRF_LLM_TOKEN is unset.
hf_token = os.getenv("NRF_LLM_TOKEN")
# NOTE(review): newer transformers releases reportedly deprecate
# `use_auth_token` in favour of `token` — confirm against the pinned version.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", use_auth_token=hf_token)
# Text-generation pipeline; each call generates at most 200 new tokens.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
# Module-level state for the PDF chat: the text chunks, their FAISS index and
# a flag telling whether a PDF has been processed. Shared by every caller of
# this module.
chunks, index, pdf_loaded = [], None, False
def extract_pdf_text(pdf_file):
    """
    Return the concatenated text of all pages of a document opened with fitz.

    Args:
        pdf_file: An object exposing the file path as ``.name``, or a plain
            path string.

    Returns:
        The text of every page, concatenated in page order.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        raise ValueError("No PDF file provided.")
    # Accept both file-like wrappers (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    # The context manager closes the document even if a page fails to parse,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """
    Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a chunk reaches the last word, so no trailing chunk that
    is entirely contained in the previous one is produced.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (empty when ``text`` has no words).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunk_list = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunk_list.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; a further chunk would only
            # repeat words it contains.
            break
    return chunk_list
def embed_chunks(chunks_list):
    """
    Encode the chunks with ``embed_model`` and store them in a FAISS index.

    Args:
        chunks_list: The text chunks to embed.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimensionality is taken from the second
        axis of the embedding array and which contains one vector per chunk.
    """
    vectors = embed_model.encode(chunks_list)
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(np.array(vectors))
    return flat_index
def load_pdf_and_prepare(pdf_file):
    """
    Extract, chunk and index an uploaded PDF for question answering.

    The module-level ``chunks``, ``index`` and ``pdf_loaded`` are replaced
    together, and only after every step has succeeded. On any failure the
    previously loaded state is left untouched, so chunks and index can never
    get out of sync.

    Args:
        pdf_file: The upload from the UI, or ``None`` when nothing was
            uploaded.

    Returns:
        A status message describing success or the reason for failure.
    """
    global chunks, index, pdf_loaded
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."
    try:
        text = extract_pdf_text(pdf_file)
        new_chunks = chunk_text(text)
        if not new_chunks:
            # e.g. a scanned PDF with no text layer; nothing to embed.
            return "❌ Error: No readable text found in the PDF."
        new_index = embed_chunks(new_chunks)
    except Exception as e:
        return f"❌ Error: {str(e)}"
    # Publish the new state in one step now that everything succeeded.
    chunks, index, pdf_loaded = new_chunks, new_index, True
    return "✅ PDF uploaded and processed successfully."
def delete_pdf():
    """
    Forget the currently loaded PDF.

    Resets the module-level ``chunks`` to an empty list, ``index`` to ``None``
    and ``pdf_loaded`` to ``False``.

    Returns:
        A confirmation message for the UI.
    """
    global chunks, index, pdf_loaded
    chunks = []
    index = None
    pdf_loaded = False
    return "🗑️ PDF cleared. Ready for new upload."
def query_pdf(question, top_k=3):
    """
    Answer a question using the most similar chunks of the loaded PDF.

    The question is embedded with ``embed_model``, the nearest chunks are
    fetched from ``index``, and ``generator`` is prompted with those chunks as
    context. The text after the last "Answer:" marker of the generated output
    is returned.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to use as context.

    Returns:
        The generated answer, or a warning message when no PDF is loaded or
        the question is empty.
    """
    if not pdf_loaded or index is None or not chunks:
        return "⚠️ Please upload and process a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Never ask for more neighbours than there are chunks: the index pads
    # missing results with -1, which would silently select the last chunk.
    k = max(1, min(int(top_k), len(chunks)))
    question_embedding = embed_model.encode([question])
    _, neighbours = index.search(np.array(question_embedding), k)
    context = "\n".join(chunks[i] for i in neighbours[0] if 0 <= i < len(chunks))
    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = generator(prompt)[0]["generated_text"]
    return response.split("Answer:")[-1].strip()
# ============================================
# SECTION 4 — ENHANCED GRADIO UI
# ============================================
def build_app():
    """
    Assemble the Gradio interface and return it without launching it.

    The app has four tabs: a dictionary translator (``translate_word``), a PDF
    translator (``translate_pdf_to_kiembu``), a PDF question-answering chat
    (``load_pdf_and_prepare``, ``delete_pdf``, ``query_pdf``) and an About
    page, followed by a footer.

    Returns:
        The constructed ``gr.Blocks`` application.
    """
    # Page-level styling: centred card layout, custom scrollbars, and rounded
    # inputs/buttons.
    custom_css = """
body {
background: #f5f5f5;
margin: 0;
padding: 0;
overflow: auto;
}
.gradio-container {
display: flex;
flex-direction: column;
align-items: center;
justify-content: flex-start;
min-height: 100vh;
padding: 30px 15px;
box-sizing: border-box;
border: 2px solid #ccc;
border-radius: 16px;
box-shadow: 0 4px 16px rgba(0,0,0,0.1);
background: white;
max-width: 900px;
margin: 20px auto;
overflow-y: auto;
}
::-webkit-scrollbar {
width: 10px;
}
::-webkit-scrollbar-track {
background: #eee;
border-radius: 10px;
}
::-webkit-scrollbar-thumb {
background: #aaa;
border-radius: 10px;
}
::-webkit-scrollbar-thumb:hover {
background: #777;
}
textarea, input[type="text"], .gr-textbox, .gr-input {
border: 2px solid #bbb !important;
border-radius: 10px !important;
padding: 8px !important;
box-shadow: inset 0 2px 4px rgba(0,0,0,0.05);
transition: border-color 0.2s ease, box-shadow 0.2s ease;
}
textarea:focus, input[type="text"]:focus {
border-color: #0078D7 !important;
box-shadow: 0 0 5px rgba(0,120,215,0.3) !important;
outline: none;
}
button, .gr-button {
border-radius: 10px !important;
padding: 10px 16px !important;
font-weight: 600 !important;
}
"""
    with gr.Blocks(
        title="Kiembu ↔ English — NRF Kenya Project",
        css=custom_css,
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
    ) as app:
        gr.Markdown("""
<div style='text-align:center'>
<h1 style='color:#003366;'>Kiembu ↔ English Translation Suite</h1>
<h3 style='color:#d4a017;'>Funded by NRF Kenya — Creating LLMs that Understand Native Languages</h3>
<hr style='border:1px solid #003366;width:80%;margin:auto'>
</div>
""")
        with gr.Tabs():
            # -----------------------------
            # TAB 1: DICTIONARY TRANSLATOR
            # -----------------------------
            with gr.TabItem("Dictionary Translator"):
                gr.Markdown("""
### Quick Word Translation — **Kiembu ↔ English**
Enter a single word or short phrase and get its translation instantly.
**Sample Words:**
| Kiembu | English |
|:--|:--|
| Uvoro | how are you |
| Ri? | When? |
| Ku? | Where? |
| Uka | come |
""")
                inp = gr.Textbox(label="Enter Word", placeholder="e.g. 'Uvoro' or 'how are you'", lines=1)
                dir_sel = gr.Radio(
                    ["Kiembu → English", "English → Kiembu"],
                    value="Kiembu → English",
                    label="Select Direction"
                )
                out = gr.Textbox(label="Translation Result")
                gr.Button("Translate").click(translate_word, [inp, dir_sel], out)
            # -----------------------------
            # TAB 2: PDF TRANSLATION
            # -----------------------------
            with gr.TabItem("PDF Translation"):
                gr.Markdown("""
### **English → Kiembu PDF Translator**
Upload an **English PDF document** (e.g., ID form, hospital form, passport form)
and get a **translated PDF in Kiembu** for download.
""")
                pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
                translate_btn = gr.Button("Translate to Kiembu")
                output_file = gr.File(label="Download Translated PDF")
                status = gr.Textbox(label="Status", interactive=False)
                translate_btn.click(translate_pdf_to_kiembu, inputs=[pdf_input], outputs=[output_file, status])
            # -----------------------------
            # TAB 3: NRF LLM MODEL Q&A
            # -----------------------------
            with gr.TabItem("PDF Chat (NRF LLM Model)"):
                gr.Markdown("""
### **Interactive PDF Chat — NRF LLM Model**
Upload any **informative PDF** (e.g., government report, history book, or manual)
and ask natural-language questions to understand its content better.
**Examples:**
- "What does this document say about birth registration?"
- "Summarize Chapter 2."
""")
                pdf = gr.File(label="Upload PDF Document")
                # Distinct name so this tab's status box does not rebind the
                # translation tab's `status` variable.
                chat_status = gr.Textbox(label="Status")
                gr.Button("Process PDF").click(load_pdf_and_prepare, pdf, chat_status)
                gr.Button("Clear PDF").click(delete_pdf, None, chat_status)
                q = gr.Textbox(lines=2, label="Ask a Question", placeholder="e.g. 'Summarize the introduction section.'")
                ans = gr.Textbox(lines=6, label="Answer")
                gr.Button("Query PDF").click(query_pdf, q, ans)
            # -----------------------------
            # TAB 4: ABOUT
            # -----------------------------
            with gr.TabItem("About"):
                # The two credit lines previously had broken bold markers
                # ("**:**" and an unclosed "**Developed by:"), which rendered
                # as stray asterisks; the labels are now properly closed.
                gr.Markdown("""
### About the Project
The **NRF Kenya Project** on *Creating LLMs that Understand Native Languages*
aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
- **Languages Supported:** Kiembu ↔ English
- **Core Engine:** NRF LLM Model under development
- **Principal Investigator:** Prof Lucy Kawira – Chuka University
- **Developed by:** Technical Team: Coordinator- Casam Njagi – Chuka University
- **Funding Agency:** National Research Fund (NRF), Kenya
- **Objective:** Foster inclusion of native languages in AI-driven communication.
""")
        # Footer shown below the tabs.
        gr.Markdown("""
<hr style='border:0.5px solid #ccc'>
<div style='text-align:center;color:#003366;font-size:14px'>
© 2025 National Research Fund (NRF) Kenya — All Rights Reserved
</div>
""")
    return app
# Build the interface at module load and start the Gradio server right away.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also launches the app — confirm that is intended.
demo = build_app()
demo.launch()