Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
| 6 |
import gradio as gr
|
| 7 |
import fitz
|
| 8 |
import faiss
|
|
|
|
| 9 |
import numpy as np
|
| 10 |
from sentence_transformers import SentenceTransformer
|
| 11 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
|
@@ -21,22 +22,98 @@ from PyPDF2 import PdfReader
|
|
| 21 |
# ============================================
|
| 22 |
# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
|
| 23 |
# ============================================
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
kiembu_to_english = {
|
|
|
|
| 26 |
"Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
|
| 27 |
"Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
|
| 28 |
"Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
|
| 29 |
"Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
|
| 30 |
-
"Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
}
|
| 32 |
-
english_to_kiembu = {v.lower(): k for k, v in kiembu_to_english.items()}
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
|
|
|
| 35 |
def translate_word(word, direction):
|
|
|
|
|
|
|
|
|
|
| 36 |
if direction == "Kiembu → English":
|
| 37 |
-
return
|
|
|
|
|
|
|
| 38 |
else:
|
| 39 |
-
return
|
| 40 |
|
| 41 |
|
| 42 |
# ============================================
|
|
@@ -309,8 +386,9 @@ def build_app():
|
|
| 309 |
aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
|
| 310 |
|
| 311 |
- **Languages Supported:** Kiembu ↔ English
|
| 312 |
-
- **Core Engine:** NRF LLM Model
|
| 313 |
-
-
|
|
|
|
| 314 |
- **Funding Agency:** National Research Fund (NRF), Kenya
|
| 315 |
- **Objective:** Foster inclusion of native languages in AI-driven communication.
|
| 316 |
""")
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
import fitz
|
| 8 |
import faiss
|
| 9 |
+
import re
|
| 10 |
import numpy as np
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
|
|
|
| 22 |
# ============================================
|
| 23 |
# SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
|
| 24 |
# ============================================
|
| 25 |
+
# ===========================================
|
| 26 |
+
# Kiembu ↔ English Dictionary (Case & Punctuation Insensitive)
|
| 27 |
+
# ===========================================
|
| 28 |
# Kiembu -> English glossary. A gloss may list several senses separated by
# "/" (e.g. "my/mine"). Every Kiembu key must appear only once: a repeated key
# in a dict literal silently discards the earlier value, so homonyms are merged
# into one slash-separated gloss instead ("nthĩ", "ĩria").
kiembu_to_english = {
    # Existing entries
    "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
    "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
    "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
    "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
    "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine",
    "Maaĩ": "water", "Mwaki": "fire", "Rĩũa": "sun", "Mweri": "moon", "Njata": "star",
    "ĩthiga": "stone", "Mũtĩ": "tree", "ĩthangũ": "leaf", "Mũri": "root", "ĩkoro": "bark",
    "MũndũMũrume": "man", "MũndũMũka": "woman", "fafa": "father", "Mami": "mother",
    "Gũtũ": "ear", "Ritho": "eye", "ĩniũrũ": "nose", "Kanyua": "mouth", "ĩgego": "tooth",
    "rũrĩmĩ": "tongue", "Njara": "hand", "Kũgũrũ": "foot", "Thakame": "blood", "ĩvĩndĩ": "bone",
    "Ngothi": "skin", "Nyama": "meat", "Nthamaki": "fish", "Giconi": "bird", "ĩtumbĩ": "egg",
    "Nvĩa": "horn", "Mũkia": "tail", "ĩvuta": "feather", "Njuĩrĩ": "hair", "Kĩongo": "head",
    "Ngingo": "neck", "Mũrukuthu": "back", "Ngoro": "heart", "Itema": "liver", "nyua": "drink",
    "ria": "eat", "mama": "sleep", "kua": "die", "ũka": "come", "ona": "see", "ĩgua": "hear",
    "menya": "know", "ĩciria": "think", "uga": "say", "ĩmwe": "one", "ĩgarĩ": "two",
    "ĩthatũ": "three", "ĩnya": "four", "ĩthano": "five", "ithathatu": "six", "mugwanja": "seven",
    "inyanya": "eight", "kenda": "nine", "ĩkumi": "ten", "nene": "big", "nini": "small",
    "ndaca": "long", "nguvi": "short", "mbega": "good", "njũku": "bad", "mbĩcuru": "full",
    "ĩtikĩndu": "empty", "nviũ": "hot", "nvoru": "cold", "ũtukũ": "night", "Mũthenya": "day",
    # "nthĩ" was listed twice ("earth" and "floor"); merged so neither sense is lost.
    "Mbura": "rain", "rũkũngi": "wind", "nthĩ": "earth/floor", "kĩrĩma": "mountain", "rũnjĩ": "river",
    # "ĩria" was listed twice ("lake/sea" and "milk"); merged so neither sense is lost.
    "ĩria": "lake/sea/milk", "cumbĩ": "salt", "mũthanga": "sand", "ndogo": "smoke", "nyaki": "grass",
    "njira": "path", "kivaro": "field", "kuraca": "far", "vakuvi": "near", "ava": "here",
    "varia": "there", "ũũ": "who", "ndwi": "what", "kũ": "where", "rĩ": "when", "atia": "how",
    "ka": "not", "onthe": "all", "engĩ": "many", "anini": "few", "jerũ": "new", "ngũrũ": "old",
    "kĩthũrũrũ": "round", "kaũgĩ": "sharp", "ritwa": "name", "tirama": "stand", "ĩkara": "sit",
    "thiĩ": "walk", "ngaria": "run", "va": "give", "oca": "take", "nyita": "hold",
    "tiniaa": "cut", "ringa": "hit", "ikia": "throw", "via": "burn", "ĩthambĩra": "swim",
    "ina": "sing", "katika": "dance", "theka": "laugh", "rĩra": "cry", "rũma": "bite",
    "mumunya": "suck", "nungira": "smell", "ĩtigĩra": "fear", "wina toro": "sleepy",
    "mũvũtu": "hungry", "mũnyondu": "thirsty", "ndune": "red", "njerũ": "white",
    "mbirũ": "black", "ngirini": "green", "yellũ": "yellow", "mbulu": "blue",
    "matu": "cloud", "maturĩ": "sky", "rũkũngũ": "dust", "mũu": "ashes", "mũkanda": "rope",
    "kamũti": "stick", "kaviũ": "knife", "ũta": "bow", "mũgwi": "arrow", "itumũ": "spear",
    "gitegithamaki": "fishhook", "neti": "net", "ĩtaru": "canoe", "mũrango": "door",
    "ĩtara": "roof", "Mũgeka": "mat", "kĩtanda": "bed", "mũrengeti": "blanket",
    "nyũngũ": "pot", "kanya": "calabash", "gĩkapũ": "basket", "nduramu": "drum",
    "rwĩmbo": "song", "rũgano": "story", "thakania": "play", "mũrata": "friend",
    "nthũ": "enemy", "civũ": "chief", "mũkũrũ": "elder", "ngombe": "cow/cattle",
    "mbũri": "goat", "ngondu": "sheep", "ngũkũ": "chicken", "ngamĩra": "camel",
    "nvuda": "donkey", "ndegwa": "ox", "mbegũ": "seed", "ketha": "harvest", "ũma": "hoe",
    "ĩthanwa": "axe", "mũro": "digging stick", "Mũtumi ciodo": "weaver",
    "Mwaki nyũngũ": "potter", "mũturi": "blacksmith", "mũgwĩmi": "hunter",
    "mũrĩthi": "herdsman", "mũteginthamaki": "fisherman", "thoko": "market",
    "kwendia": "trade", "cenjania": "barter", "mathaa": "time", "mavinda": "season",
    "ĩvinda rĩa riũa": "dry season", "ivinda ria mbura": "rainy season", "rĩũra": "famine",
    "thayũ": "peace", "mbara": "war", "gũrana": "marriage", "mũviki": "bride",
    "mũvikania": "groom", "ĩrua": "initiation", "kũrua": "circumcision",
    "kĩkuũ": "death", "ngoma": "spirit", "ngomi": "ancestor", "mũgĩmbĩ": "finger millet",
    "mũkombi": "pearl millet", "mwere": "bulrush millet", "mũvia": "sorghum",
    "mbembe": "maize", "minji": "cowpea", "ndengũ": "green gram", "njavĩ": "pigeon pea",
    "ndũma": "arrowroot/taro", "mwanga": "cassava", "gĩkũa": "yam", "ngwacĩ": "sweet potato",
    "ĩrenge": "pumpkin", "sukuma": "kale", "terere": "amaranth", "Thageti": "spider plant",
    "kaũrũra": "pumpkin leaves", "kunde": "cowpea leaves", "Mabuyu": "baobab fruit",
    "nthithi": "tamarind", "mbera": "guava", "matimoko": "custard apple/soursop",
    "macuca": "loquat", "kĩgwa": "sugarcane", "njahĩ": "sesame", "Marũrũ": "sunflower",
    "mbiringanya": "eggplant", "nyanya": "tomato", "gĩtũngũrũ": "onion",
    "kĩtũngũrũ saumu": "garlic", "tangauthi": "ginger", "murende": "turmeric",
    "nduru": "chili", "mboga": "cabbage", "karati": "carrot", "njukĩ": "bee", "ukĩ": "honey",
    "mwatu": "beehive", "mabaki": "wax", "maguta": "butter", "kĩrimũ": "cream",
    "alenya": "ghee", "ĩria ra kũgandithua": "sour milk",
}

# --- Helper: Normalize user input ---
# Punctuation that is ignored when matching dictionary entries.
_PUNCTUATION = re.compile(r"[.,?!-]")


def normalize(text):
    """Return *text* in the canonical form used as a lookup key.

    The text is Unicode-normalized (NFC), lowercased, stripped of the
    punctuation characters ``. , ? ! -``, trimmed, and has internal runs of
    whitespace collapsed to a single space.

    Args:
        text: The word or phrase to clean. Anything that is not a string
            (for example ``None`` from an empty input widget) yields ``""``.

    Returns:
        The cleaned string, possibly empty.
    """
    # Function-scope import so this block does not depend on the file header.
    import unicodedata

    if not isinstance(text, str):
        return ""
    # Kiembu uses letters such as ĩ and ũ that can be typed either as a single
    # code point or as a base letter plus a combining tilde; NFC makes both
    # spellings compare equal.
    text = unicodedata.normalize("NFC", text.lower())
    text = _PUNCTUATION.sub("", text)
    # Trim only after punctuation removal so "uka !" does not keep a trailing
    # space, and collapse repeated spaces inside multi-word entries.
    return " ".join(text.split())


def _build_english_index(dictionary):
    """Map each normalized English gloss (and each "/" alternative) to its Kiembu word."""
    # Whole glosses first; when two Kiembu words share a gloss the later one wins.
    index = {normalize(gloss): word for word, gloss in dictionary.items()}
    for word, gloss in dictionary.items():
        if "/" not in gloss:
            continue
        for alternative in gloss.split("/"):
            key = normalize(alternative)
            # setdefault: an alternative never overrides a whole-gloss entry.
            if key:
                index.setdefault(key, word)
    return index


# --- Prepare lookup tables (normalized keys) ---
kiembu_lower = {normalize(word): gloss for word, gloss in kiembu_to_english.items()}
english_lower = _build_english_index(kiembu_to_english)
|
| 105 |
|
| 106 |
+
# --- Translation Function ---
|
| 107 |
def translate_word(word, direction):
    """Translate a word between Kiembu and English, ignoring case & punctuation.

    Args:
        word: The word or phrase to look up. A missing, non-string, or blank
            value is reported as not found instead of raising.
        direction: Either "Kiembu → English" or "English → Kiembu".

    Returns:
        The translation, "Not found in dictionary" when there is no entry,
        or an explanatory message when *direction* is not recognised.
    """
    # Validate the direction first so a bad direction is always reported,
    # whatever was passed as the word.
    if direction not in ("Kiembu → English", "English → Kiembu"):
        return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."

    # A cleared input can arrive as None or blank; neither can match an entry.
    if not isinstance(word, str) or not word.strip():
        return "Not found in dictionary"

    table = kiembu_lower if direction == "Kiembu → English" else english_lower
    return table.get(normalize(word), "Not found in dictionary")
|
| 117 |
|
| 118 |
|
| 119 |
# ============================================
|
|
|
|
| 386 |
aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
|
| 387 |
|
| 388 |
- **Languages Supported:** Kiembu ↔ English
|
| 389 |
+
- **Core Engine:** NRF LLM Model under development
|
| 390 |
+
- **Principal Investigator:** Prof Lucy Kawira – Chuka University
|
| 391 |
+
- **Developed by:** Technical Team – Coordinator: Casam Njagi – Chuka University
|
| 392 |
- **Funding Agency:** National Research Fund (NRF), Kenya
|
| 393 |
- **Objective:** Foster inclusion of native languages in AI-driven communication.
|
| 394 |
""")
|