|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import gradio as gr |
|
|
import fitz |
|
|
import faiss |
|
|
import re |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline |
|
|
from reportlab.lib.pagesizes import A4 |
|
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer |
|
|
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER |
|
|
from reportlab.lib.units import inch |
|
|
from reportlab.pdfbase.cidfonts import UnicodeCIDFont |
|
|
from reportlab.pdfbase import pdfmetrics |
|
|
from PyPDF2 import PdfReader |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Kiembu → English word list used by the dictionary tab.
#
# A Python dict keeps only the LAST value for a repeated key, so homonyms must
# be written once with their senses joined by "/" (the convention this table
# already uses, e.g. "my/mine"). "nthĩ" (earth, floor) and "ĩria" (lake/sea,
# milk) used to appear twice, which silently dropped their first meaning.
kiembu_to_english = {
    "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
    "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
    "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
    "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
    "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine",
    "Maaĩ": "water", "Mwaki": "fire", "Rĩũa": "sun", "Mweri": "moon", "Njata": "star",
    "ĩthiga": "stone", "Mũtĩ": "tree", "ĩthangũ": "leaf", "Mũri": "root", "ĩkoro": "bark",
    "MũndũMũrume": "man", "MũndũMũka": "woman", "fafa": "father", "Mami": "mother",
    "Gũtũ": "ear", "Ritho": "eye", "ĩniũrũ": "nose", "Kanyua": "mouth", "ĩgego": "tooth",
    "rũrĩmĩ": "tongue", "Njara": "hand", "Kũgũrũ": "foot", "Thakame": "blood", "ĩvĩndĩ": "bone",
    "Ngothi": "skin", "Nyama": "meat", "Nthamaki": "fish", "Giconi": "bird", "ĩtumbĩ": "egg",
    "Nvĩa": "horn", "Mũkia": "tail", "ĩvuta": "feather", "Njuĩrĩ": "hair", "Kĩongo": "head",
    "Ngingo": "neck", "Mũrukuthu": "back", "Ngoro": "heart", "Itema": "liver", "nyua": "drink",
    "ria": "eat", "mama": "sleep", "kua": "die", "ũka": "come", "ona": "see", "ĩgua": "hear",
    "menya": "know", "ĩciria": "think", "uga": "say", "ĩmwe": "one", "ĩgarĩ": "two",
    "ĩthatũ": "three", "ĩnya": "four", "ĩthano": "five", "ithathatu": "six", "mugwanja": "seven",
    "inyanya": "eight", "kenda": "nine", "ĩkumi": "ten", "nene": "big", "nini": "small",
    "ndaca": "long", "nguvi": "short", "mbega": "good", "njũku": "bad", "mbĩcuru": "full",
    "ĩtikĩndu": "empty", "nviũ": "hot", "nvoru": "cold", "ũtukũ": "night", "Mũthenya": "day",
    # "nthĩ" carries both senses that were previously split over two entries.
    "Mbura": "rain", "rũkũngi": "wind", "nthĩ": "earth/floor", "kĩrĩma": "mountain", "rũnjĩ": "river",
    # "ĩria" carries both senses that were previously split over two entries.
    "ĩria": "lake/sea/milk", "cumbĩ": "salt", "mũthanga": "sand", "ndogo": "smoke", "nyaki": "grass",
    "njira": "path", "kivaro": "field", "kuraca": "far", "vakuvi": "near", "ava": "here",
    "varia": "there", "ũũ": "who", "ndwi": "what", "kũ": "where", "rĩ": "when", "atia": "how",
    "ka": "not", "onthe": "all", "engĩ": "many", "anini": "few", "jerũ": "new", "ngũrũ": "old",
    "kĩthũrũrũ": "round", "kaũgĩ": "sharp", "ritwa": "name", "tirama": "stand", "ĩkara": "sit",
    "thiĩ": "walk", "ngaria": "run", "va": "give", "oca": "take", "nyita": "hold",
    "tiniaa": "cut", "ringa": "hit", "ikia": "throw", "via": "burn", "ĩthambĩra": "swim",
    "ina": "sing", "katika": "dance", "theka": "laugh", "rĩra": "cry", "rũma": "bite",
    "mumunya": "suck", "nungira": "smell", "ĩtigĩra": "fear", "wina toro": "sleepy",
    "mũvũtu": "hungry", "mũnyondu": "thirsty", "ndune": "red", "njerũ": "white",
    "mbirũ": "black", "ngirini": "green", "yellũ": "yellow", "mbulu": "blue",
    "matu": "cloud", "maturĩ": "sky", "rũkũngũ": "dust", "mũu": "ashes", "mũkanda": "rope",
    "kamũti": "stick", "kaviũ": "knife", "ũta": "bow", "mũgwi": "arrow", "itumũ": "spear",
    "gitegithamaki": "fishhook", "neti": "net", "ĩtaru": "canoe", "mũrango": "door",
    "ĩtara": "roof", "Mũgeka": "mat", "kĩtanda": "bed", "mũrengeti": "blanket",
    "nyũngũ": "pot", "kanya": "calabash", "gĩkapũ": "basket", "nduramu": "drum",
    "rwĩmbo": "song", "rũgano": "story", "thakania": "play", "mũrata": "friend",
    "nthũ": "enemy", "civũ": "chief", "mũkũrũ": "elder", "ngombe": "cow/cattle",
    "mbũri": "goat", "ngondu": "sheep", "ngũkũ": "chicken", "ngamĩra": "camel",
    "nvuda": "donkey", "ndegwa": "ox", "mbegũ": "seed", "ketha": "harvest", "ũma": "hoe",
    "ĩthanwa": "axe", "mũro": "digging stick", "Mũtumi ciodo": "weaver",
    "Mwaki nyũngũ": "potter", "mũturi": "blacksmith", "mũgwĩmi": "hunter",
    "mũrĩthi": "herdsman", "mũteginthamaki": "fisherman", "thoko": "market",
    "kwendia": "trade", "cenjania": "barter", "mathaa": "time", "mavinda": "season",
    "ĩvinda rĩa riũa": "dry season", "ivinda ria mbura": "rainy season", "rĩũra": "famine",
    "thayũ": "peace", "mbara": "war", "gũrana": "marriage", "mũviki": "bride",
    "mũvikania": "groom", "ĩrua": "initiation", "kũrua": "circumcision",
    "kĩkuũ": "death", "ngoma": "spirit", "ngomi": "ancestor", "mũgĩmbĩ": "finger millet",
    "mũkombi": "pearl millet", "mwere": "bulrush millet", "mũvia": "sorghum",
    "mbembe": "maize", "minji": "cowpea", "ndengũ": "green gram", "njavĩ": "pigeon pea",
    "ndũma": "arrowroot/taro", "mwanga": "cassava", "gĩkũa": "yam", "ngwacĩ": "sweet potato",
    "ĩrenge": "pumpkin", "sukuma": "kale", "terere": "amaranth", "Thageti": "spider plant",
    "kaũrũra": "pumpkin leaves", "kunde": "cowpea leaves", "Mabuyu": "baobab fruit",
    "nthithi": "tamarind", "mbera": "guava", "matimoko": "custard apple/soursop",
    "macuca": "loquat", "kĩgwa": "sugarcane", "njahĩ": "sesame", "Marũrũ": "sunflower",
    "mbiringanya": "eggplant", "nyanya": "tomato", "gĩtũngũrũ": "onion",
    "kĩtũngũrũ saumu": "garlic", "tangauthi": "ginger", "murende": "turmeric",
    "nduru": "chili", "mboga": "cabbage", "karati": "carrot", "njukĩ": "bee", "ukĩ": "honey",
    "mwatu": "beehive", "mabaki": "wax", "maguta": "butter", "kĩrimũ": "cream",
    "alenya": "ghee", "ĩria ra kũgandithua": "sour milk",
}


def normalize(text):
    """Return the canonical lookup form of *text*.

    The result is lower-cased, has the punctuation characters ``.,?!-``
    removed, has runs of whitespace collapsed to a single space, and is
    trimmed. Text is also converted to Unicode NFC so that a letter such as
    "ĩ" matches whether it was typed as one code point or as "i" followed by
    a combining tilde.

    Args:
        text: The word or phrase to normalise (a ``str``).

    Returns:
        The normalised string; empty when *text* holds only punctuation
        and/or whitespace.
    """
    # Local import so this block does not depend on the file's import header.
    import unicodedata

    text = unicodedata.normalize("NFC", text.lower())
    text = re.sub(r"[.,?!-]", "", text)
    # Trim AFTER removing punctuation: "when ?" must become "when", not "when ".
    return " ".join(text.split())


# Normalised Kiembu → English lookup table.
kiembu_lower = {normalize(k): v for k, v in kiembu_to_english.items()}

# Normalised English → Kiembu lookup table. When several Kiembu words share an
# English gloss the last one listed wins (unchanged behaviour).
english_lower = {normalize(v): k for k, v in kiembu_to_english.items()}

# Also index every individual sense of a slash-separated gloss ("my/mine",
# "cow/cattle", ...) so each sense can be looked up on its own. setdefault()
# keeps any entry that already matches a complete gloss.
for _kiembu, _english in kiembu_to_english.items():
    if "/" in _english:
        for _sense in _english.split("/"):
            english_lower.setdefault(normalize(_sense), _kiembu)
|
|
|
|
|
|
|
|
def translate_word(word, direction):
    """Look up a single word or phrase in the built-in dictionary.

    The input is normalised first, so the lookup ignores letter case and the
    punctuation handled by ``normalize``.

    Args:
        word: Text entered by the user.
        direction: Either "Kiembu → English" or "English → Kiembu".

    Returns:
        The translation, "Not found in dictionary" when there is no entry,
        or an explanatory message when *direction* is not recognised.
    """
    lookup_key = normalize(word)

    # Pick the table for the requested direction; anything else is rejected.
    if direction == "Kiembu → English":
        table = kiembu_lower
    elif direction == "English → Kiembu":
        table = english_lower
    else:
        return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."

    return table.get(lookup_key, "Not found in dictionary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared translation pipeline, created once when the module is imported and
# used by translate_text().
# NOTE(review): the model id "opus-mt-en-sw" suggests an English → Swahili
# model, while the UI labels the output as Kiembu — confirm this is intended.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-sw")
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, read with PyPDF2.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts (the caller in this file
            passes a file path).

    Returns:
        The text of all pages that yielded any, separated by newlines, with
        leading and trailing whitespace removed. Empty when no page has
        extractable text.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_file).pages)
    # Pages without extractable text yield None or "" and are skipped.
    return "\n".join(content for content in page_texts if content).strip()
|
|
|
|
|
def translate_text(text):
    """Translate *text* sentence by sentence with the module's ``translator``.

    The text is split on ". "; blank pieces are dropped and each remaining
    piece is translated on its own. A piece whose translation raises is kept
    untranslated (best effort), so one bad sentence does not abort the
    document.

    Args:
        text: The source text.

    Returns:
        The translated pieces re-joined with ". ".
    """
    pieces = []
    for sentence in text.split(". "):
        trimmed = sentence.strip()
        if not trimmed:
            continue
        try:
            rendered = translator(trimmed)[0]["translation_text"]
        except Exception:
            # Best effort: fall back to the original, unstripped sentence.
            rendered = sentence
        pieces.append(rendered)
    return ". ".join(pieces)
|
|
|
|
|
def create_pdf(translated_text, output_path="translated_output.pdf"):
    """Render *translated_text* into an A4 PDF and return the file path.

    Each non-blank line of the input becomes one justified paragraph under a
    centred title.

    Args:
        translated_text: Text to lay out; lines are separated by "\\n".
        output_path: Destination file. The default is a fixed name in the
            working directory, so concurrent calls that rely on it write to
            the same file.

    Returns:
        *output_path*, after the document has been written.
    """
    # Local import so this block does not depend on the file's import header.
    from xml.sax.saxutils import escape

    # NOTE(review): HeiseiKakuGo-W5 is one of ReportLab's CID fonts for
    # Japanese; verify it renders Kiembu letters such as ĩ and ũ correctly.
    font_name = 'HeiseiKakuGo-W5'
    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=60, leftMargin=60,
                            topMargin=72, bottomMargin=72)
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(name='TitleStyle', parent=styles['Heading1'],
                                 alignment=TA_CENTER, fontName=font_name,
                                 fontSize=16, spaceAfter=20)
    body_style = ParagraphStyle(name='BodyStyle', parent=styles['Normal'],
                                alignment=TA_JUSTIFY, fontName=font_name,
                                fontSize=12, leading=16)

    story = [Paragraph("Translated Document — English → Kiembu", title_style),
             Spacer(1, 0.3 * inch)]
    for para in translated_text.split("\n"):
        line = para.strip()
        if not line:
            continue
        # Paragraph parses its text as XML-like markup, so "&", "<" and ">"
        # coming from the source document must be escaped to appear literally.
        story.append(Paragraph(escape(line), body_style))
        story.append(Spacer(1, 0.2 * inch))

    doc.build(story)
    return output_path
|
|
|
|
|
def translate_pdf_to_kiembu(pdf_file):
    """Translate an uploaded PDF and write the result to a new PDF.

    Args:
        pdf_file: The upload from the Gradio file component — an object with
            a ``.name`` path attribute or a plain path — or ``None`` when
            nothing has been uploaded.

    Returns:
        A ``(path, status)`` tuple. ``path`` is the generated PDF, or
        ``None`` when there was nothing to translate; ``status`` is a
        message for the user.
    """
    # Clicking the button without an upload passes None; report it instead
    # of failing on `None.name`.
    if pdf_file is None:
        return None, "Please upload a PDF file first."

    # Accept both a plain path and an object exposing the path as `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None, "No readable text found in the uploaded PDF."

    translated_text = translate_text(text)
    output_pdf_path = create_pdf(translated_text)
    return output_pdf_path, "Translation complete! Download below."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sentence-embedding model; used to embed both the PDF chunks and the user's
# question. Loaded once when the module is imported.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Causal language model that writes the answers in the PDF chat tab.
model_name = "google/gemma-2b-it"
# Access token read from the environment; None when NRF_LLM_TOKEN is unset.
hf_token = os.getenv("NRF_LLM_TOKEN")
# NOTE(review): recent transformers releases deprecate `use_auth_token` in
# favour of `token` — confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", use_auth_token=hf_token)
# Text-generation pipeline; each call produces at most 200 new tokens.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

# Module-level state for the PDF chat: the text chunks of the loaded PDF, the
# FAISS index over their embeddings, and a flag saying a PDF is ready. Being
# module globals, they are shared by every caller of this module.
chunks, index, pdf_loaded = [], None, False
|
|
|
|
|
def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of a PDF, read with PyMuPDF.

    Args:
        pdf_file: An object exposing the file path as ``.name`` (as the
            Gradio upload does) or a plain path.

    Returns:
        The text of all pages joined together, with no separator added.
    """
    # Accept both a plain path and an object exposing the path as `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # The context manager closes the document (and its file handle) even if
    # text extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
|
|
|
|
|
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a single-space-joined run of words. Empty
        when *text* contains no words.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range. Without
            this check ``overlap >= chunk_size`` would never advance and
            loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunk_list = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunk_list.append(" ".join(words[start:end]))
        # Stop once a chunk reaches the last word; a further chunk would only
        # repeat words the previous one already contains.
        if end >= len(words):
            break
    return chunk_list
|
|
|
|
|
def embed_chunks(chunks_list):
    """Embed text chunks and return a FAISS L2 index over the embeddings.

    Args:
        chunks_list: Non-empty list of text chunks.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk, in the same
        order as *chunks_list*.

    Raises:
        ValueError: If *chunks_list* is empty (for example a PDF with no
            extractable text). Previously this surfaced as an unexplained
            IndexError when reading the embedding dimension.
    """
    if not chunks_list:
        raise ValueError("No text chunks to index; the document contains no extractable text.")

    # FAISS expects a contiguous float32 matrix, one row per vector.
    embeddings = np.ascontiguousarray(embed_model.encode(chunks_list), dtype="float32")
    idx = faiss.IndexFlatL2(embeddings.shape[1])
    idx.add(embeddings)
    return idx
|
|
|
|
|
def load_pdf_and_prepare(pdf_file):
    """Extract, chunk and index an uploaded PDF for question answering.

    On success the module-level ``chunks``, ``index`` and ``pdf_loaded`` are
    replaced together. On any failure they are left untouched, so a
    previously loaded PDF stays usable and chunks never get paired with an
    index built from a different document.

    Args:
        pdf_file: The upload from the Gradio file component, or ``None``
            when nothing has been uploaded.

    Returns:
        A status message for the user.
    """
    global chunks, index, pdf_loaded

    if pdf_file is None:
        return "⚠️ Please upload a PDF first."

    try:
        text = extract_pdf_text(pdf_file)
        new_chunks = chunk_text(text)
        if not new_chunks:
            return "❌ Error: No readable text found in the uploaded PDF."
        new_index = embed_chunks(new_chunks)
    except Exception as e:
        return f"❌ Error: {str(e)}"

    # Publish the new state only after every step has succeeded.
    chunks, index, pdf_loaded = new_chunks, new_index, True
    return "✅ PDF uploaded and processed successfully."
|
|
|
|
|
def delete_pdf():
    """Forget the currently loaded PDF.

    Resets the module-level chat state (chunk list, index and loaded flag)
    to its initial values.

    Returns:
        A status message for the user.
    """
    global chunks, index, pdf_loaded

    pdf_loaded = False
    index = None
    chunks = []
    return "🗑️ PDF cleared. Ready for new upload."
|
|
|
|
|
def query_pdf(question, top_k=3):
    """Answer *question* using the most relevant chunks of the loaded PDF.

    The question is embedded, the nearest chunks are retrieved from the
    module-level index, and the module's text generator is prompted with
    those chunks as context.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to use as context.

    Returns:
        The generated answer, or a warning message when the question is
        empty or no PDF has been processed.
    """
    if not question or not question.strip():
        return "⚠️ Please enter a question."
    if not pdf_loaded or index is None or not chunks:
        return "⚠️ Please upload and process a PDF first."

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would silently select chunks[-1].
    k = max(1, min(int(top_k), len(chunks)))
    question_embedding = np.asarray(embed_model.encode([question]), dtype="float32")
    _, neighbours = index.search(question_embedding, k)
    context = "\n".join(chunks[i] for i in neighbours[0] if 0 <= i < len(chunks))

    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = generator(prompt)[0]["generated_text"]

    # The output normally echoes the prompt. Cut exactly that prefix so an
    # answer that itself contains "Answer:" is not truncated; keep the old
    # split as a fallback when the prompt is not echoed verbatim.
    if response.startswith(prompt):
        answer = response[len(prompt):]
    else:
        answer = response.split("Answer:")[-1]
    return answer.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_app():
    """Assemble the Gradio interface and return it without launching it.

    The interface has four tabs: a dictionary translator, an English PDF
    translator, a PDF question-answering chat, and an About page.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    custom_css = """
    body {
        background: #f5f5f5;
        margin: 0;
        padding: 0;
        overflow: auto;
    }

    .gradio-container {
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: flex-start;
        min-height: 100vh;
        padding: 30px 15px;
        box-sizing: border-box;
        border: 2px solid #ccc;
        border-radius: 16px;
        box-shadow: 0 4px 16px rgba(0,0,0,0.1);
        background: white;
        max-width: 900px;
        margin: 20px auto;
        overflow-y: auto;
    }

    ::-webkit-scrollbar {
        width: 10px;
    }
    ::-webkit-scrollbar-track {
        background: #eee;
        border-radius: 10px;
    }
    ::-webkit-scrollbar-thumb {
        background: #aaa;
        border-radius: 10px;
    }
    ::-webkit-scrollbar-thumb:hover {
        background: #777;
    }

    textarea, input[type="text"], .gr-textbox, .gr-input {
        border: 2px solid #bbb !important;
        border-radius: 10px !important;
        padding: 8px !important;
        box-shadow: inset 0 2px 4px rgba(0,0,0,0.05);
        transition: border-color 0.2s ease, box-shadow 0.2s ease;
    }
    textarea:focus, input[type="text"]:focus {
        border-color: #0078D7 !important;
        box-shadow: 0 0 5px rgba(0,120,215,0.3) !important;
        outline: none;
    }

    button, .gr-button {
        border-radius: 10px !important;
        padding: 10px 16px !important;
        font-weight: 600 !important;
    }
    """

    with gr.Blocks(
        title="Kiembu ↔ English — NRF Kenya Project",
        css=custom_css,
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
    ) as app:

        # Page header.
        gr.Markdown("""
        <div style='text-align:center'>
        <h1 style='color:#003366;'>Kiembu ↔ English Translation Suite</h1>
        <h3 style='color:#d4a017;'>Funded by NRF Kenya — Creating LLMs that Understand Native Languages</h3>
        <hr style='border:1px solid #003366;width:80%;margin:auto'>
        </div>
        """)

        with gr.Tabs():

            # Tab 1: single-word dictionary lookup.
            with gr.TabItem("Dictionary Translator"):
                gr.Markdown("""
                ### Quick Word Translation — **Kiembu ↔ English**
                Enter a single word or short phrase and get its translation instantly.

                **Sample Words:**
                | Kiembu | English |
                |:--|:--|
                | Uvoro | how are you |
                | Ri? | When? |
                | Ku? | Where? |
                | Uka | come |
                """)
                inp = gr.Textbox(label="Enter Word", placeholder="e.g. 'Uvoro' or 'how are you'", lines=1)
                dir_sel = gr.Radio(
                    ["Kiembu → English", "English → Kiembu"],
                    value="Kiembu → English",
                    label="Select Direction"
                )
                out = gr.Textbox(label="Translation Result")
                gr.Button("Translate").click(translate_word, [inp, dir_sel], out)

            # Tab 2: whole-document translation, PDF in and PDF out.
            with gr.TabItem("PDF Translation"):
                gr.Markdown("""
                ### **English → Kiembu PDF Translator**
                Upload an **English PDF document** (e.g., ID form, hospital form, passport form)
                and get a **translated PDF in Kiembu** for download.
                """)
                pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
                translate_btn = gr.Button("Translate to Kiembu")
                output_file = gr.File(label="Download Translated PDF")
                translate_status = gr.Textbox(label="Status", interactive=False)
                translate_btn.click(translate_pdf_to_kiembu, inputs=[pdf_input],
                                    outputs=[output_file, translate_status])

            # Tab 3: question answering over an uploaded PDF.
            with gr.TabItem("PDF Chat (NRF LLM Model)"):
                gr.Markdown("""
                ### **Interactive PDF Chat — NRF LLM Model**
                Upload any **informative PDF** (e.g., government report, history book, or manual)
                and ask natural-language questions to understand its content better.

                **Examples:**
                - "What does this document say about birth registration?"
                - "Summarize Chapter 2."
                """)
                pdf = gr.File(label="Upload PDF Document")
                # Named separately from the translation tab's status box so
                # the two components are not confused.
                chat_status = gr.Textbox(label="Status")
                gr.Button("Process PDF").click(load_pdf_and_prepare, pdf, chat_status)
                gr.Button("Clear PDF").click(delete_pdf, None, chat_status)
                q = gr.Textbox(lines=2, label="Ask a Question", placeholder="e.g. 'Summarize the introduction section.'")
                ans = gr.Textbox(lines=6, label="Answer")
                gr.Button("Query PDF").click(query_pdf, q, ans)

            # Tab 4: project information.
            with gr.TabItem("About"):
                gr.Markdown("""
                ### About the Project
                The **NRF Kenya Project** on *Creating LLMs that Understand Native Languages*
                aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.

                - **Languages Supported:** Kiembu ↔ English
                - **Core Engine:** NRF LLM Model under development
                - **Principal Investigator:** Prof Lucy Kawira – Chuka University
                - **Developed by:** Technical Team — Coordinator: Casam Njagi – Chuka University
                - **Funding Agency:** National Research Fund (NRF), Kenya
                - **Objective:** Foster inclusion of native languages in AI-driven communication.
                """)

        # Page footer.
        gr.Markdown("""
        <hr style='border:0.5px solid #ccc'>
        <div style='text-align:center;color:#003366;font-size:14px'>
        © 2025 National Research Fund (NRF) Kenya — All Rights Reserved
        </div>
        """)

    return app
|
|
|
|
|
|
|
|
# Build the interface and start serving it. There is no `__main__` guard, so
# both statements run whenever this module is executed or imported.
demo = build_app()
demo.launch()
|
|
|
|
|
|