casamN committed on
Commit
1aadc7b
·
verified ·
1 Parent(s): c3483ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -7
app.py CHANGED
@@ -6,6 +6,7 @@ import os
6
  import gradio as gr
7
  import fitz
8
  import faiss
 
9
  import numpy as np
10
  from sentence_transformers import SentenceTransformer
11
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
@@ -21,22 +22,98 @@ from PyPDF2 import PdfReader
21
  # ============================================
22
  # SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
23
  # ============================================
24
-
 
 
25
  kiembu_to_english = {
 
26
  "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
27
  "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
28
  "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
29
  "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
30
- "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
- english_to_kiembu = {v.lower(): k for k, v in kiembu_to_english.items()}
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
35
  def translate_word(word, direction):
 
 
 
36
  if direction == "Kiembu → English":
37
- return kiembu_to_english.get(word, "Not found in dictionary")
 
 
38
  else:
39
- return english_to_kiembu.get(word.lower(), "Not found in dictionary")
40
 
41
 
42
  # ============================================
@@ -309,8 +386,9 @@ def build_app():
309
  aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
310
 
311
  - **Languages Supported:** Kiembu ↔ English
312
- - **Core Engine:** NRF LLM Model (based on lightweight MarianMT)
313
- - **Developed by:** Casam Njagi Nyaga
 
314
  - **Funding Agency:** National Research Fund (NRF), Kenya
315
  - **Objective:** Foster inclusion of native languages in AI-driven communication.
316
  """)
 
6
  import gradio as gr
7
  import fitz
8
  import faiss
9
+ import re
10
  import numpy as np
11
  from sentence_transformers import SentenceTransformer
12
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
22
  # ============================================
23
  # SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
24
  # ============================================
25
# ===========================================
# Kiembu ↔ English Dictionary (Case & Punctuation Insensitive)
# ===========================================
# Maps a Kiembu word or phrase to its English gloss. When one Kiembu word has
# several English meanings they are written in a single value separated by
# "/" (e.g. "my/mine"). A Python dict literal silently keeps only the LAST
# value of a repeated key, so every key must appear exactly once here.
kiembu_to_english = {
    # Existing entries
    "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
    "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
    "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
    "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
    "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine",
    "Maaĩ": "water", "Mwaki": "fire", "Rĩũa": "sun", "Mweri": "moon", "Njata": "star",
    "ĩthiga": "stone", "Mũtĩ": "tree", "ĩthangũ": "leaf", "Mũri": "root", "ĩkoro": "bark",
    "MũndũMũrume": "man", "MũndũMũka": "woman", "fafa": "father", "Mami": "mother",
    "Gũtũ": "ear", "Ritho": "eye", "ĩniũrũ": "nose", "Kanyua": "mouth", "ĩgego": "tooth",
    "rũrĩmĩ": "tongue", "Njara": "hand", "Kũgũrũ": "foot", "Thakame": "blood", "ĩvĩndĩ": "bone",
    "Ngothi": "skin", "Nyama": "meat", "Nthamaki": "fish", "Giconi": "bird", "ĩtumbĩ": "egg",
    "Nvĩa": "horn", "Mũkia": "tail", "ĩvuta": "feather", "Njuĩrĩ": "hair", "Kĩongo": "head",
    "Ngingo": "neck", "Mũrukuthu": "back", "Ngoro": "heart", "Itema": "liver", "nyua": "drink",
    "ria": "eat", "mama": "sleep", "kua": "die", "ũka": "come", "ona": "see", "ĩgua": "hear",
    "menya": "know", "ĩciria": "think", "uga": "say", "ĩmwe": "one", "ĩgarĩ": "two",
    "ĩthatũ": "three", "ĩnya": "four", "ĩthano": "five", "ithathatu": "six", "mugwanja": "seven",
    "inyanya": "eight", "kenda": "nine", "ĩkumi": "ten", "nene": "big", "nini": "small",
    "ndaca": "long", "nguvi": "short", "mbega": "good", "njũku": "bad", "mbĩcuru": "full",
    "ĩtikĩndu": "empty", "nviũ": "hot", "nvoru": "cold", "ũtukũ": "night", "Mũthenya": "day",
    # "nthĩ" was listed twice ("earth" and, further down, "floor"); merged so
    # neither meaning is silently dropped.
    "Mbura": "rain", "rũkũngi": "wind", "nthĩ": "earth/floor", "kĩrĩma": "mountain", "rũnjĩ": "river",
    # "ĩria" was listed twice ("lake/sea" and, further down, "milk"); merged.
    "ĩria": "lake/sea/milk", "cumbĩ": "salt", "mũthanga": "sand", "ndogo": "smoke", "nyaki": "grass",
    "njira": "path", "kivaro": "field", "kuraca": "far", "vakuvi": "near", "ava": "here",
    "varia": "there", "ũũ": "who", "ndwi": "what", "kũ": "where", "rĩ": "when", "atia": "how",
    "ka": "not", "onthe": "all", "engĩ": "many", "anini": "few", "jerũ": "new", "ngũrũ": "old",
    "kĩthũrũrũ": "round", "kaũgĩ": "sharp", "ritwa": "name", "tirama": "stand", "ĩkara": "sit",
    "thiĩ": "walk", "ngaria": "run", "va": "give", "oca": "take", "nyita": "hold",
    "tiniaa": "cut", "ringa": "hit", "ikia": "throw", "via": "burn", "ĩthambĩra": "swim",
    "ina": "sing", "katika": "dance", "theka": "laugh", "rĩra": "cry", "rũma": "bite",
    "mumunya": "suck", "nungira": "smell", "ĩtigĩra": "fear", "wina toro": "sleepy",
    "mũvũtu": "hungry", "mũnyondu": "thirsty", "ndune": "red", "njerũ": "white",
    "mbirũ": "black", "ngirini": "green", "yellũ": "yellow", "mbulu": "blue",
    "matu": "cloud", "maturĩ": "sky", "rũkũngũ": "dust", "mũu": "ashes", "mũkanda": "rope",
    "kamũti": "stick", "kaviũ": "knife", "ũta": "bow", "mũgwi": "arrow", "itumũ": "spear",
    "gitegithamaki": "fishhook", "neti": "net", "ĩtaru": "canoe", "mũrango": "door",
    "ĩtara": "roof", "Mũgeka": "mat", "kĩtanda": "bed", "mũrengeti": "blanket",
    "nyũngũ": "pot", "kanya": "calabash", "gĩkapũ": "basket", "nduramu": "drum",
    "rwĩmbo": "song", "rũgano": "story", "thakania": "play", "mũrata": "friend",
    "nthũ": "enemy", "civũ": "chief", "mũkũrũ": "elder", "ngombe": "cow/cattle",
    "mbũri": "goat", "ngondu": "sheep", "ngũkũ": "chicken", "ngamĩra": "camel",
    "nvuda": "donkey", "ndegwa": "ox", "mbegũ": "seed", "ketha": "harvest", "ũma": "hoe",
    "ĩthanwa": "axe", "mũro": "digging stick", "Mũtumi ciodo": "weaver",
    "Mwaki nyũngũ": "potter", "mũturi": "blacksmith", "mũgwĩmi": "hunter",
    "mũrĩthi": "herdsman", "mũteginthamaki": "fisherman", "thoko": "market",
    "kwendia": "trade", "cenjania": "barter", "mathaa": "time", "mavinda": "season",
    "ĩvinda rĩa riũa": "dry season", "ivinda ria mbura": "rainy season", "rĩũra": "famine",
    "thayũ": "peace", "mbara": "war", "gũrana": "marriage", "mũviki": "bride",
    "mũvikania": "groom", "ĩrua": "initiation", "kũrua": "circumcision",
    "kĩkuũ": "death", "ngoma": "spirit", "ngomi": "ancestor", "mũgĩmbĩ": "finger millet",
    "mũkombi": "pearl millet", "mwere": "bulrush millet", "mũvia": "sorghum",
    "mbembe": "maize", "minji": "cowpea", "ndengũ": "green gram", "njavĩ": "pigeon pea",
    "ndũma": "arrowroot/taro", "mwanga": "cassava", "gĩkũa": "yam", "ngwacĩ": "sweet potato",
    "ĩrenge": "pumpkin", "sukuma": "kale", "terere": "amaranth", "Thageti": "spider plant",
    "kaũrũra": "pumpkin leaves", "kunde": "cowpea leaves", "Mabuyu": "baobab fruit",
    "nthithi": "tamarind", "mbera": "guava", "matimoko": "custard apple/soursop",
    "macuca": "loquat", "kĩgwa": "sugarcane", "njahĩ": "sesame", "Marũrũ": "sunflower",
    "mbiringanya": "eggplant", "nyanya": "tomato", "gĩtũngũrũ": "onion",
    "kĩtũngũrũ saumu": "garlic", "tangauthi": "ginger", "murende": "turmeric",
    "nduru": "chili", "mboga": "cabbage", "karati": "carrot", "njukĩ": "bee", "ukĩ": "honey",
    "mwatu": "beehive", "mabaki": "wax", "maguta": "butter", "kĩrimũ": "cream",
    "alenya": "ghee", "ĩria ra kũgandithua": "sour milk",
}


# --- Helper: Normalize user input ---
def normalize(text):
    """Return *text* in the canonical form used as a lookup key.

    The result is NFC-normalized, lowercased, stripped of the punctuation
    characters ``. , ? ! -`` and has every run of whitespace collapsed to a
    single space with no leading or trailing space.

    Args:
        text: The word or phrase to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string; ``""`` when *text* is ``None`` or empty.
    """
    # Imported locally so this helper does not depend on an edit to the
    # file-level import list.
    import unicodedata

    if not text:
        return ""
    # NFC makes "i" + combining tilde compare equal to the precomposed "ĩ",
    # so lookups do not depend on how the keyboard produced the letter.
    text = unicodedata.normalize("NFC", text).lower()
    text = re.sub(r"[.,?!-]", "", text)  # remove punctuation
    # Collapse whitespace AFTER removing punctuation: "Ri ?" would otherwise
    # keep the space that preceded the "?" and fail to match "ri".
    return " ".join(text.split())


# --- Prepare lookup tables (normalized keys) ---
# Kiembu -> English; keys that normalize to the same text keep the last entry.
kiembu_lower = {normalize(k): v for k, v in kiembu_to_english.items()}

# English -> Kiembu. First index each full gloss (last entry wins when two
# Kiembu words share a gloss), then index every "/"-separated alternative so
# that e.g. "cow" finds the entry glossed "cow/cattle". setdefault keeps a
# full-gloss match from being overridden by an alternative.
english_lower = {normalize(v): k for k, v in kiembu_to_english.items()}
for _kiembu, _gloss in kiembu_to_english.items():
    if "/" in _gloss:
        for _alternative in _gloss.split("/"):
            english_lower.setdefault(normalize(_alternative), _kiembu)
105
 
106
# --- Translation Function ---
def translate_word(word, direction):
    """Translate a word between Kiembu and English, ignoring case & punctuation.

    Args:
        word: The word or phrase to look up. ``None`` is treated as empty
            input instead of raising inside ``normalize``.
        direction: Either ``"Kiembu → English"`` or ``"English → Kiembu"``.

    Returns:
        The dictionary entry for the cleaned word, the string
        ``"Not found in dictionary"`` when there is no entry, or an
        explanatory message when *direction* is not one of the two
        supported values.
    """
    cleaned = normalize(word if word is not None else "")

    if direction == "Kiembu → English":
        return kiembu_lower.get(cleaned, "Not found in dictionary")
    if direction == "English → Kiembu":
        return english_lower.get(cleaned, "Not found in dictionary")
    # The message must quote the exact accepted values, arrow included, so a
    # caller can copy them verbatim.
    return "Invalid translation direction. Use 'Kiembu → English' or 'English → Kiembu'."
117
 
118
 
119
  # ============================================
 
386
  aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
387
 
388
  - **Languages Supported:** Kiembu ↔ English
389
+ - **Core Engine:** NRF LLM Model under development
390
+ - **Principal Investigator:** Prof Lucy Kawira – Chuka University
391
+ - **Developed by:** Technical Team – Coordinator: Casam Njagi – Chuka University
392
  - **Funding Agency:** National Research Fund (NRF), Kenya
393
  - **Objective:** Foster inclusion of native languages in AI-driven communication.
394
  """)