Spaces:

casamN
/

NRF_LLM

Running

App Files Files Community

casamN committed on Oct 29

Commit

c096ba0

verified ·

1 Parent(s): cf4a73b

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -161

app.py CHANGED Viewed

@@ -1,89 +1,172 @@
 # =========================================================
-# Kiembu ↔ English — NRF Kenya Project
-# Funded by NRF Kenya — Creating LLMs that Understand Native Languages
 # =========================================================
 import gradio as gr
-from transformers import MarianMTModel, MarianTokenizer
-import fitz  # PyMuPDF for PDF reading
-# -----------------------------
-# MODEL LOADING
-# -----------------------------
-# Light, fast MarianMT models
-EN_TO_SW_MODEL = "Helsinki-NLP/opus-mt-en-sw"
-SW_TO_EN_MODEL = "Helsinki-NLP/opus-mt-sw-en"
-en_tokenizer = MarianTokenizer.from_pretrained(EN_TO_SW_MODEL)
-en_model = MarianMTModel.from_pretrained(EN_TO_SW_MODEL)
-sw_tokenizer = MarianTokenizer.from_pretrained(SW_TO_EN_MODEL)
-sw_model = MarianMTModel.from_pretrained(SW_TO_EN_MODEL)
-# -----------------------------
-# TRANSLATION FUNCTIONS
-# -----------------------------
-def translate_text(text, direction):
-    if not text.strip():
-        return "Please enter some text to translate."
-    if direction == "English → Kiembu":
-        tokenizer, model = en_tokenizer, en_model
     else:
-        tokenizer, model = sw_tokenizer, sw_model
-    inputs = tokenizer(text, return_tensors="pt", padding=True)
-    outputs = model.generate(**inputs, max_length=400)
-    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return translated
-# -----------------------------
-# PDF CHAT FUNCTION
-# -----------------------------
-def read_pdf(pdf_file):
-    if pdf_file is None:
-        return "Please upload a PDF document."
     doc = fitz.open(pdf_file.name)
     text = ""
     for page in doc:
-        text += page.get_text("text") + "\n"
-    doc.close()
-    return text if text.strip() else "No readable text found in the PDF."
-# -----------------------------
-# PROVERBS SECTION
-# -----------------------------
-proverbs_data = {
-    "Kiembu": [
-        ("Mûno mûno ndîgîa gûtî na mûcîrî", "Too much of anything is bad."),
-        ("Mûkûrû ti mûtî", "An elder is not a tree (meaning: respect wisdom)."),
-        ("Nîkenda kûigua atîa nîyo mûtumia aigua", "The woman listens in her own way."),
-    ],
-    "English": [
-        ("Wisdom is like a baobab tree; no one individual can embrace it.", "Uûgî nî mûtî mûkûrû ûtîkûthîrwa na mûtû ũmwe."),
-        ("A single bracelet does not jingle.", "Kîrîa kîmwe kîgûthîrîra ndîrî."),
-        ("Even the best cooking pot will not produce food.", "Ata ndîgîra yambîrî ndîkûgîa thîna ûthîrî."),
-    ],
-}
-def get_proverbs(language):
-    selected = proverbs_data[language]
-    formatted = "\n\n".join([f"**{p[0]}**\n→ {p[1]}" for p in selected])
-    return formatted
-# -----------------------------
-# BUILD GRADIO APP
-# -----------------------------
 def build_app():
     custom_css = """
     .gradio-container {
         font-family: 'Inter', 'Segoe UI', sans-serif;
-        background-color: #f9fafb;
-        color: #111827;
     }
     h1, h2, h3 {
         color: #003366 !important;
@@ -92,105 +175,72 @@ def build_app():
     .tab-nav button {
         font-size: 16px !important;
         font-weight: 500 !important;
     }
-    textarea, input {
         font-size: 15px !important;
     }
     """
-    with gr.Blocks(
-        title="Kiembu ↔ English — NRF Kenya Project",
-        css=custom_css,
-        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
-    ) as app:
-        gr.Markdown(
-            """
-            # Kiembu ↔ English — NRF Kenya Project
-            ### Funded by NRF Kenya — Creating LLMs that Understand Native Languages
-            ---
-            """
-        )
         with gr.Tabs():
-            # -----------------------------
-            # TAB 1: TRANSLATOR
-            # -----------------------------
-            with gr.TabItem("Translator"):
-                gr.Markdown("### Translate between English and Kiembu using the NRF LLM Model")
-                with gr.Row():
-                    direction = gr.Radio(
-                        ["English → Kiembu", "Kiembu → English"],
-                        label="Select Translation Direction",
-                        value="English → Kiembu",
-                    )
-                with gr.Row():
-                    input_text = gr.Textbox(
-                        label="Enter Text",
-                        placeholder="Type text to translate...",
-                        lines=5,
-                    )
-                    output_text = gr.Textbox(
-                        label="Translated Text",
-                        placeholder="Translation will appear here...",
-                        lines=5,
-                    )
-                translate_btn = gr.Button("Translate", variant="primary")
-                translate_btn.click(translate_text, inputs=[input_text, direction], outputs=output_text)
-            # -----------------------------
-            # TAB 2: PROVERBS
-            # -----------------------------
-            with gr.TabItem("Proverbs"):
-                gr.Markdown("### Explore Traditional Proverbs and Their Meanings")
-                lang_choice = gr.Radio(["Kiembu", "English"], label="Select Language", value="Kiembu")
-                show_btn = gr.Button("Show Proverbs", variant="primary")
-                proverb_box = gr.Markdown()
-                show_btn.click(get_proverbs, inputs=lang_choice, outputs=proverb_box)
-            # -----------------------------
-            # TAB 3: PDF CHAT
-            # -----------------------------
-            with gr.TabItem("PDF Chat"):
-                gr.Markdown("### Extract Text from PDF and Analyze Using NRF LLM Model")
-                pdf_input = gr.File(label="Upload PDF File")
-                pdf_output = gr.Textbox(label="Extracted Text", lines=15)
-                extract_btn = gr.Button("Extract Text", variant="primary")
-                extract_btn.click(read_pdf, inputs=pdf_input, outputs=pdf_output)
-            # -----------------------------
-            # TAB 4: ABOUT
-            # -----------------------------
-            with gr.TabItem("About"):
-                gr.Markdown(
-                    """
-                    ### About the Project
-                    The **NRF Kenya Project** on *Creating LLMs that Understand Native Languages*
-                    aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
-                    - **Languages Supported:** Kiembu ↔ English
-                    - **Core Engine:** NRF LLM Model (based on lightweight MarianMT)
-                    - **Developed by:** Casam Njagi Nyaga
-                    - **Funding Agency:** National Research Fund (NRF), Kenya
-                    - **Objective:** Foster inclusion of native languages in AI-driven communication.
-                    """
-                )
-        gr.Markdown("---\n© 2025 NRF Kenya — All Rights Reserved")
     return app
-# -----------------------------
-# LAUNCH APP
-# -----------------------------
-if __name__ == "__main__":
-    app = build_app()
-    app.launch()

 # =========================================================
+# KIEMBU ↔ ENGLISH — NRF KENYA TRANSLATION SUITE
 # =========================================================
+import os
 import gradio as gr
+import fitz
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
+from reportlab.lib.units import inch
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase import pdfmetrics
+from PyPDF2 import PdfReader
+# ============================================
+# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
+# ============================================
# Seed vocabulary: Kiembu headword -> English gloss.
kiembu_to_english = {
    "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
    "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
    "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
    "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
    "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine"
}
# Reverse table keyed by the lower-cased English gloss.
english_to_kiembu = {v.lower(): k for k, v in kiembu_to_english.items()}

_NOT_FOUND = "Not found in dictionary"
_TRAILING_PUNCTUATION = "?!.,"


def _lookup_variants(entry):
    """Yield the normalised spellings under which *entry* should be findable."""
    base = entry.strip().casefold()
    yield base
    # "Ri?" / "when?" should also match when typed without the punctuation.
    yield base.rstrip(_TRAILING_PUNCTUATION)
    # "my/mine" lists alternatives; index each alternative on its own.
    if "/" in base:
        for alternative in base.split("/"):
            yield alternative.strip()


def _build_lookup(table):
    """Return a case-insensitive lookup for *table* that tolerates variants."""
    # Exact (normalised) keys are inserted first so a derived variant can
    # never shadow a real dictionary entry.
    lookup = {key.strip().casefold(): value for key, value in table.items()}
    for key, value in table.items():
        for variant in _lookup_variants(key):
            if variant:
                lookup.setdefault(variant, value)
    return lookup


_kiembu_lookup = _build_lookup(kiembu_to_english)
_english_lookup = _build_lookup(english_to_kiembu)


def translate_word(word, direction):
    """Translate a single dictionary entry between Kiembu and English.

    Args:
        word: Text typed by the user. Surrounding whitespace, letter case and
            trailing punctuation are ignored when matching.
        direction: ``"Kiembu → English"`` selects the Kiembu table; any other
            value selects the English → Kiembu table.

    Returns:
        The translation, or ``"Not found in dictionary"`` when there is no
        matching entry (including empty or ``None`` input).
    """
    query = (word or "").strip().casefold()
    lookup = _kiembu_lookup if direction == "Kiembu → English" else _english_lookup
    if query in lookup:
        return lookup[query]
    return lookup.get(query.rstrip(_TRAILING_PUNCTUATION), _NOT_FOUND)
+# ============================================
+# SECTION 2 — PDF TRANSLATION (Transformer + PDF)
+# ============================================
# NOTE(review): this checkpoint is the Helsinki-NLP English → Swahili MarianMT
# model, used as a stand-in; the UI labels its output as Kiembu. Swap in a
# Kiembu checkpoint here once one is available.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-sw")  # placeholder
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in a PDF, one page per line group.

    Args:
        pdf_file: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        The concatenated page texts, each followed by a newline, with leading
        and trailing whitespace stripped. Pages for which ``extract_text()``
        yields nothing are skipped, so the result is ``""`` when no page has
        extractable text.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Build the result in one pass instead of repeated string concatenation.
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text).strip()
def translate_text(text):
    """Translate *text* sentence by sentence with the module-level translator.

    The text is split on ``". "``; each non-blank piece is stripped and sent
    to ``translator``. The results are re-joined with ``". "``.

    Args:
        text: Source text. ``None`` or an empty string yields ``""``.

    Returns:
        The translated text. A piece whose translation raises is kept
        untranslated (best effort), and the failure is logged rather than
        silently discarded.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    if not text:
        return ""

    log = logging.getLogger(__name__)
    translated = []
    for chunk in text.split(". "):
        sentence = chunk.strip()
        if not sentence:
            continue
        try:
            translated.append(translator(sentence)[0]["translation_text"])
        except Exception:
            # Best effort: keep the original piece so the document stays
            # complete, but leave a trace of what went wrong.
            log.exception("Translation failed for a chunk; keeping the source text")
            translated.append(chunk)
    return ". ".join(translated)
def create_pdf(translated_text, output_path="translated_output.pdf"):
    """Render *translated_text* into a simple A4 PDF and return its path.

    Each non-blank line of the input becomes one justified paragraph under a
    centred title.

    Args:
        translated_text: Text to lay out; split into paragraphs on ``"\\n"``.
        output_path: Destination file. The default is a fixed name in the
            working directory, so successive calls overwrite each other.

    Returns:
        ``output_path``, after the document has been written.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # NOTE(review): 'HeiseiKakuGo-W5' is a CID font name registered through
    # UnicodeCIDFont; confirm it is the intended face for this output.
    pdfmetrics.registerFont(UnicodeCIDFont('HeiseiKakuGo-W5'))
    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=60, leftMargin=60, topMargin=72, bottomMargin=72)
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(name='TitleStyle', parent=styles['Heading1'],
                                 alignment=TA_CENTER, fontName='HeiseiKakuGo-W5',
                                 fontSize=16, spaceAfter=20)
    body_style = ParagraphStyle(name='BodyStyle', parent=styles['Normal'],
                                alignment=TA_JUSTIFY, fontName='HeiseiKakuGo-W5',
                                fontSize=12, leading=16)
    story = [Paragraph("Translated Document — English → Kiembu", title_style),
             Spacer(1, 0.3 * inch)]
    for para in translated_text.split("\n"):
        line = para.strip()
        if not line:
            continue
        # Paragraph interprets its text as markup, so literal '&', '<' and '>'
        # coming from the document must be escaped before layout.
        story.append(Paragraph(escape(line), body_style))
        story.append(Spacer(1, 0.2 * inch))
    doc.build(story)
    return output_path
def translate_pdf_to_kiembu(pdf_file):
    """Translate an uploaded PDF and return the generated PDF plus a status.

    Args:
        pdf_file: Upload handed over by the Gradio ``File`` component, or
            ``None`` when nothing was uploaded. Objects exposing ``.name``
            and plain path strings are both accepted.

    Returns:
        ``(output_pdf_path, status_message)``. The path is ``None`` whenever
        no PDF was produced (no upload, no extractable text, or an error).
    """
    if pdf_file is None:
        return None, "Please upload a PDF file first."

    # Some upload objects carry the path in `.name`; a bare string is the path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        text = extract_text_from_pdf(pdf_path)
        if not text:
            return None, "No readable text found in the uploaded PDF."
        translated_text = translate_text(text)
        output_pdf_path = create_pdf(translated_text)
    except Exception as exc:
        # UI boundary: report the failure in the status box instead of
        # letting the click handler crash.
        return None, f"❌ Error: {exc}"
    return output_pdf_path, "Translation complete! Download below."
+# ============================================
+# SECTION 3 — NRF LLM MODEL PDF CHAT
+# ============================================
# Sentence encoder used to embed both PDF chunks and user questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
model_name = "google/gemma-2b-it"
# Access token is read from the environment; it is None when the variable is
# unset, in which case the hub calls below run unauthenticated.
hf_token = os.getenv("NRF_LLM_TOKEN")
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", token=hf_token)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
# Module-level state for the currently loaded PDF: its text chunks, the
# FAISS index built over them, and whether a PDF is ready for querying.
chunks, index, pdf_loaded = [], None, False
def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Upload object whose ``.name`` attribute is the file path.

    Returns:
        The page texts joined in page order, with no separator added.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never released.
    with fitz.open(pdf_file.name) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunk_list = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunk_list.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered by this chunk.
        if end >= len(words):
            break
    return chunk_list
def embed_chunks(chunks_list):
    """Embed text chunks and return a FAISS L2 index over the vectors.

    Args:
        chunks_list: Non-empty list of chunk strings.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk, in the same
        order as ``chunks_list``.

    Raises:
        ValueError: If ``chunks_list`` is empty, since there would be no
            vectors from which to take the index dimensionality.
    """
    if not chunks_list:
        raise ValueError("Cannot build an index from an empty chunk list.")
    # FAISS works on contiguous float32 matrices; make that explicit rather
    # than relying on the encoder's default output dtype.
    embeddings = np.ascontiguousarray(embed_model.encode(chunks_list), dtype="float32")
    idx = faiss.IndexFlatL2(embeddings.shape[1])
    idx.add(embeddings)
    return idx
def load_pdf_and_prepare(pdf_file):
    """Extract, chunk and index an uploaded PDF for question answering.

    On success the module-level ``chunks``, ``index`` and ``pdf_loaded`` are
    replaced together. On any failure they are left exactly as they were, so
    a previously loaded PDF stays queryable and the state is never
    half-updated.

    Args:
        pdf_file: Upload from the Gradio ``File`` component, or ``None``.

    Returns:
        A human-readable status message for the UI.
    """
    global chunks, index, pdf_loaded
    if pdf_file is None:
        return "⚠️ Please upload a PDF first."
    try:
        text = extract_pdf_text(pdf_file)
        new_chunks = chunk_text(text)
        if not new_chunks:
            # e.g. a PDF with no text layer: nothing to embed.
            return "❌ Error: No readable text found in the PDF."
        new_index = embed_chunks(new_chunks)
    except Exception as e:
        return f"❌ Error: {str(e)}"
    # Commit only after every step succeeded.
    chunks, index, pdf_loaded = new_chunks, new_index, True
    return "✅ PDF uploaded and processed successfully."
def delete_pdf():
    """Forget the currently loaded PDF.

    Resets the module-level ``chunks``, ``index`` and ``pdf_loaded`` to their
    initial empty values.

    Returns:
        A status message for the UI.
    """
    global chunks, index, pdf_loaded
    chunks = []
    index = None
    pdf_loaded = False
    return "🗑️ PDF cleared. Ready for new upload."
def query_pdf(question, top_k=3):
    """Answer *question* from the loaded PDF using retrieved context.

    The question is embedded, the nearest chunks are fetched from the FAISS
    index, and the text generator is prompted with those chunks as context.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to retrieve. It is capped at the
            number of available chunks.

    Returns:
        The generated answer, or a warning message when the question is
        blank or no PDF has been processed yet.
    """
    if not question or not question.strip():
        return "⚠️ Please enter a question."
    if not pdf_loaded or index is None or not chunks:
        return "⚠️ Please upload and process a PDF first."

    # Asking FAISS for more neighbours than it holds yields -1 placeholders,
    # which would silently index the last chunk; cap k and filter instead.
    k = max(1, min(int(top_k), len(chunks)))
    question_embedding = np.ascontiguousarray(embed_model.encode([question]), dtype="float32")
    _, neighbours = index.search(question_embedding, k)
    context = "\n".join(chunks[i] for i in neighbours[0] if 0 <= i < len(chunks))

    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = generator(prompt)[0]["generated_text"]
    if response.startswith(prompt):
        # Drop the echoed prompt exactly, so an "Answer:" occurring inside
        # the generated text cannot truncate the reply.
        return response[len(prompt):].strip()
    return response.split("Answer:")[-1].strip()
+# ============================================
+# SECTION 4 — ENHANCED GRADIO UI
+# ============================================
 def build_app():
     custom_css = """
     .gradio-container {
         font-family: 'Inter', 'Segoe UI', sans-serif;
+        background: #f9fafb;
+        color: #1f2937;
     }
     h1, h2, h3 {
         color: #003366 !important;
     .tab-nav button {
         font-size: 16px !important;
         font-weight: 500 !important;
+        border-radius: 8px !important;
     }
+    textarea, input, .gr-text-input {
         font-size: 15px !important;
+        border-radius: 10px !important;
+    }
+    .gr-button {
+        background-color: #003366 !important;
+        color: white !important;
+        border-radius: 10px !important;
+        font-weight: 500 !important;
+    }
+    .gr-button:hover {
+        background-color: #0055a4 !important;
     }
     """
+    with gr.Blocks(title="Kiembu ↔ English — NRF Kenya Project", css=custom_css,
+                   theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as app:
+        gr.Markdown("""
+        <div style='text-align:center'>
+        <h1 style='color:#003366;'>Kiembu ↔ English Translation Suite</h1>
+        <h3 style='color:#d4a017;'>Funded by NRF Kenya — Creating LLMs that Understand Native Languages</h3>
+        <hr style='border:1px solid #003366;width:80%;margin:auto'>
+        </div>
+        """)
         with gr.Tabs():
+            # Dictionary Tab
+            with gr.TabItem("Dictionary Translator"):
+                gr.Markdown("### Quick Word Translation — **Kiembu ↔ English**")
+                inp = gr.Textbox(label="Enter Word", placeholder="e.g. 'Uvoro' or 'how are you'", lines=1)
+                dir_sel = gr.Radio(["Kiembu → English", "English → Kiembu"], value="Kiembu → English", label="Select Direction")
+                out = gr.Textbox(label="Translation Result")
+                gr.Button("Translate").click(translate_word, [inp, dir_sel], out)
+            # PDF Translation Tab
+            with gr.TabItem("PDF Translation"):
+                gr.Markdown("### Upload English PDF → Get Kiembu Translated PDF")
+                pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
+                translate_btn = gr.Button("Translate to Kiembu")
+                output_file = gr.File(label="Download Translated PDF")
+                status = gr.Textbox(label="Status", interactive=False)
+                translate_btn.click(translate_pdf_to_kiembu, inputs=[pdf_input], outputs=[output_file, status])
+            # NRF LLM Model Q&A Tab
+            with gr.TabItem("PDF Chat (NRF LLM Model)"):
+                gr.Markdown("### Ask Questions from Any PDF using the NRF LLM Model")
+                pdf = gr.File(label="Upload PDF Document")
+                status = gr.Textbox(label="Status")
+                gr.Button("Process PDF").click(load_pdf_and_prepare, pdf, status)
+                gr.Button("Clear PDF").click(delete_pdf, None, status)
+                q = gr.Textbox(lines=2, label="Ask a Question", placeholder="e.g. 'Summarize the introduction section.'")
+                ans = gr.Textbox(lines=6, label="Answer")
+                gr.Button("Query PDF").click(query_pdf, q, ans)
+        gr.Markdown("""
+        <hr style='border:0.5px solid #ccc'>
+        <div style='text-align:center;color:#003366;font-size:14px'>
+        © 2025 National Research Fund (NRF) Kenya — All Rights Reserved
+        </div>
+        """)
     return app
# Build the UI at import time so `demo` is available as a module attribute,
# but only start the server when this file is executed as a script.
demo = build_app()

if __name__ == "__main__":
    demo.launch()