casamN committed on
Commit
c096ba0
·
verified ·
1 Parent(s): cf4a73b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -161
app.py CHANGED
@@ -1,89 +1,172 @@
1
  # =========================================================
2
- # KiembuEnglish — NRF Kenya Project
3
- # Funded by NRF Kenya — Creating LLMs that Understand Native Languages
4
  # =========================================================
5
 
 
6
  import gradio as gr
7
- from transformers import MarianMTModel, MarianTokenizer
8
- import fitz # PyMuPDF for PDF reading
9
-
10
- # -----------------------------
11
- # MODEL LOADING
12
- # -----------------------------
13
- # Light, fast MarianMT models
14
# Checkpoint identifiers for the two translation directions (the OPUS-MT
# `en-sw` / `sw-en` pair).
EN_TO_SW_MODEL = "Helsinki-NLP/opus-mt-en-sw"
SW_TO_EN_MODEL = "Helsinki-NLP/opus-mt-sw-en"

# Each direction gets its own tokenizer/model pair, loaded once at import time
# so that requests do not pay the loading cost.
en_tokenizer, en_model = (
    MarianTokenizer.from_pretrained(EN_TO_SW_MODEL),
    MarianMTModel.from_pretrained(EN_TO_SW_MODEL),
)
sw_tokenizer, sw_model = (
    MarianTokenizer.from_pretrained(SW_TO_EN_MODEL),
    MarianMTModel.from_pretrained(SW_TO_EN_MODEL),
)
22
-
23
- # -----------------------------
24
- # TRANSLATION FUNCTIONS
25
- # -----------------------------
26
def translate_text(text, direction):
    """Translate *text* with the MarianMT pair selected by *direction*.

    Args:
        text: The text to translate. ``None``, empty, or whitespace-only
            input yields a prompt message instead of a translation.
        direction: The label chosen in the direction selector. Any label
            that starts with "English" selects the ``en_*`` pair; every
            other label selects the ``sw_*`` pair.

    Returns:
        The decoded translation, or a prompt message when no text was given.
    """
    if not text or not text.strip():
        return "Please enter some text to translate."

    # The direction labels are not spelled consistently (with and without an
    # arrow), so match on the leading source-language name rather than on one
    # exact literal.
    if str(direction or "").strip().lower().startswith("english"):
        tokenizer, model = en_tokenizer, en_model
    else:
        tokenizer, model = sw_tokenizer, sw_model

    # truncation=True keeps over-long input within the model's maximum length
    # instead of failing inside generate().
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=400)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
39
 
 
40
 
41
- # -----------------------------
42
- # PDF CHAT FUNCTION
43
- # -----------------------------
44
def read_pdf(pdf_file):
    """Return the plain text of an uploaded PDF.

    Args:
        pdf_file: The uploaded file object (its ``.name`` is the path that is
            opened), or ``None`` when nothing was uploaded.

    Returns:
        The text of all pages, each followed by a newline; or a prompt message
        when no file was given; or a notice when the PDF holds no readable
        text.
    """
    if pdf_file is None:
        return "Please upload a PDF document."

    doc = fitz.open(pdf_file.name)
    try:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text("text") + "\n" for page in doc)
    finally:
        # Release the document even if text extraction raises.
        doc.close()
    return text if text.strip() else "No readable text found in the PDF."
53
-
54
-
55
- # -----------------------------
56
- # PROVERBS SECTION
57
- # -----------------------------
58
# Proverbs grouped by the language of the headline text. Each entry is a
# (proverb, rendering-or-meaning) pair.
proverbs_data = {
    "Kiembu": [
        ("Mûno mûno ndîgîa gûtî na mûcîrî", "Too much of anything is bad."),
        ("Mûkûrû ti mûtî", "An elder is not a tree (meaning: respect wisdom)."),
        ("Nîkenda kûigua atîa nîyo mûtumia aigua", "The woman listens in her own way."),
    ],
    "English": [
        ("Wisdom is like a baobab tree; no one individual can embrace it.", "Uûgî nî mûtî mûkûrû ûtîkûthîrwa na mûtû ũmwe."),
        ("A single bracelet does not jingle.", "Kîrîa kîmwe kîgûthîrîra ndîrî."),
        ("Even the best cooking pot will not produce food.", "Ata ndîgîra yambîrî ndîkûgîa thîna ûthîrî."),
    ],
}


def get_proverbs(language):
    """Format the proverbs stored for *language* as Markdown.

    Args:
        language: A key of ``proverbs_data`` ("Kiembu" or "English").

    Returns:
        One block per proverb (bold proverb, then an arrow line with its
        counterpart), separated by blank lines. An unknown or missing language
        yields a short notice instead of raising ``KeyError``.
    """
    selected = proverbs_data.get(language)
    if not selected:
        return "No proverbs available for the selected language."
    return "\n\n".join(f"**{saying}**\n→ {meaning}" for saying, meaning in selected)
76
-
77
-
78
- # -----------------------------
79
- # BUILD GRADIO APP
80
- # -----------------------------
81
  def build_app():
82
  custom_css = """
83
  .gradio-container {
84
  font-family: 'Inter', 'Segoe UI', sans-serif;
85
- background-color: #f9fafb;
86
- color: #111827;
87
  }
88
  h1, h2, h3 {
89
  color: #003366 !important;
@@ -92,105 +175,72 @@ def build_app():
92
  .tab-nav button {
93
  font-size: 16px !important;
94
  font-weight: 500 !important;
 
95
  }
96
- textarea, input {
97
  font-size: 15px !important;
 
 
 
 
 
 
 
 
 
 
98
  }
99
  """
100
 
101
- with gr.Blocks(
102
- title="Kiembu English — NRF Kenya Project",
103
- css=custom_css,
104
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
105
- ) as app:
106
- gr.Markdown(
107
- """
108
- # Kiembu English — NRF Kenya Project
109
- ### Funded by NRF Kenya — Creating LLMs that Understand Native Languages
110
- ---
111
- """
112
- )
113
 
114
  with gr.Tabs():
115
- # -----------------------------
116
- # TAB 1: TRANSLATOR
117
- # -----------------------------
118
- with gr.TabItem("Translator"):
119
- gr.Markdown("### Translate between English and Kiembu using the NRF LLM Model")
120
-
121
- with gr.Row():
122
- direction = gr.Radio(
123
- ["English Kiembu", "Kiembu → English"],
124
- label="Select Translation Direction",
125
- value="English → Kiembu",
126
- )
127
-
128
- with gr.Row():
129
- input_text = gr.Textbox(
130
- label="Enter Text",
131
- placeholder="Type text to translate...",
132
- lines=5,
133
- )
134
- output_text = gr.Textbox(
135
- label="Translated Text",
136
- placeholder="Translation will appear here...",
137
- lines=5,
138
- )
139
-
140
- translate_btn = gr.Button("Translate", variant="primary")
141
- translate_btn.click(translate_text, inputs=[input_text, direction], outputs=output_text)
142
-
143
- # -----------------------------
144
- # TAB 2: PROVERBS
145
- # -----------------------------
146
- with gr.TabItem("Proverbs"):
147
- gr.Markdown("### Explore Traditional Proverbs and Their Meanings")
148
-
149
- lang_choice = gr.Radio(["Kiembu", "English"], label="Select Language", value="Kiembu")
150
- show_btn = gr.Button("Show Proverbs", variant="primary")
151
- proverb_box = gr.Markdown()
152
-
153
- show_btn.click(get_proverbs, inputs=lang_choice, outputs=proverb_box)
154
-
155
- # -----------------------------
156
- # TAB 3: PDF CHAT
157
- # -----------------------------
158
- with gr.TabItem("PDF Chat"):
159
- gr.Markdown("### Extract Text from PDF and Analyze Using NRF LLM Model")
160
-
161
- pdf_input = gr.File(label="Upload PDF File")
162
- pdf_output = gr.Textbox(label="Extracted Text", lines=15)
163
-
164
- extract_btn = gr.Button("Extract Text", variant="primary")
165
- extract_btn.click(read_pdf, inputs=pdf_input, outputs=pdf_output)
166
-
167
- # -----------------------------
168
- # TAB 4: ABOUT
169
- # -----------------------------
170
- with gr.TabItem("About"):
171
- gr.Markdown(
172
- """
173
- ### About the Project
174
- The **NRF Kenya Project** on *Creating LLMs that Understand Native Languages*
175
- aims to preserve and promote indigenous linguistic heritage through advanced AI translation tools.
176
-
177
- - **Languages Supported:** Kiembu ↔ English
178
- - **Core Engine:** NRF LLM Model (based on lightweight MarianMT)
179
- - **Developed by:** Casam Njagi Nyaga
180
- - **Funding Agency:** National Research Fund (NRF), Kenya
181
- - **Objective:** Foster inclusion of native languages in AI-driven communication.
182
- """
183
- )
184
-
185
- gr.Markdown("---\n© 2025 NRF Kenya — All Rights Reserved")
186
 
187
  return app
188
 
189
 
190
- # -----------------------------
191
- # LAUNCH APP
192
- # -----------------------------
193
if __name__ == "__main__":
    # Build and serve the interface only when run as a script, so that
    # importing this module does not start a server.
    app = build_app()
    app.launch()
196
-
 
1
  # =========================================================
2
+ # KIEMBUENGLISH — NRF KENYA TRANSLATION SUITE
 
3
  # =========================================================
4
 
5
+ import os
6
  import gradio as gr
7
+ import fitz
8
+ import faiss
9
+ import numpy as np
10
+ from sentence_transformers import SentenceTransformer
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
12
+ from reportlab.lib.pagesizes import A4
13
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
14
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
15
+ from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
16
+ from reportlab.lib.units import inch
17
+ from reportlab.pdfbase.cidfonts import UnicodeCIDFont
18
+ from reportlab.pdfbase import pdfmetrics
19
+ from PyPDF2 import PdfReader
20
+
21
+ # ============================================
22
+ # SECTION 1 — SIMPLE DICTIONARY TRANSLATOR
23
+ # ============================================
24
+
25
# Seed vocabulary: Kiembu headword -> English gloss.
kiembu_to_english = {
    "Uvoro": "how are you", "Ri?": "When?", "Ku?": "Where?", "Uka": "come",
    "Hava": "here", "Varia": "there", "Vakuve": "close", "Kuraca": "far",
    "Ciakwa": "my/mine", "Cucu": "grandmother", "Mundu": "person",
    "Andu": "people", "Mwana": "child", "Mutumia": "woman", "Muthuri": "man",
    "Ngai": "God", "Wendo": "love", "Ngui": "dog", "Nyomba": "house", "Ndawa": "medicine"
}
# Reverse table, keyed by the lower-cased English gloss.
english_to_kiembu = {v.lower(): k for k, v in kiembu_to_english.items()}

# Lower-cased view of the Kiembu headwords so both directions match without
# regard to case.
_kiembu_lookup = {k.lower(): v for k, v in kiembu_to_english.items()}

_NOT_FOUND = "Not found in dictionary"


def translate_word(word, direction):
    """Look up a single word or phrase in the built-in dictionary.

    Args:
        word: The term to translate. Surrounding whitespace and letter case
            are ignored; ``None`` is treated as an empty query.
        direction: The label chosen in the direction selector. A label that
            starts with "Kiembu" means Kiembu -> English; anything else means
            English -> Kiembu.

    Returns:
        The dictionary entry, or "Not found in dictionary".
    """
    query = (word or "").strip().lower()
    # The direction labels are not always spelled with the same arrow, so
    # match on the source-language prefix instead of one exact literal.
    if str(direction or "").strip().lower().startswith("kiembu"):
        return _kiembu_lookup.get(query, _NOT_FOUND)
    return english_to_kiembu.get(query, _NOT_FOUND)
40
+
41
 
42
+ # ============================================
43
+ # SECTION 2 — PDF TRANSLATION (Transformer + PDF)
44
+ # ============================================
 
45
 
46
# Checkpoint behind the PDF translation tab. It is the OPUS-MT `en-sw` model,
# marked as a placeholder by the original author.
_PDF_TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-sw"
translator = pipeline("translation", model=_PDF_TRANSLATION_MODEL)  # placeholder
47
 
48
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, read with PyPDF2.

    Pages that yield no text are skipped. Each remaining page is followed by a
    newline, and the combined result is stripped of surrounding whitespace.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_file).pages)
    return "".join(f"{content}\n" for content in page_texts if content).strip()
56
+
57
def translate_text(text):
    """Translate *text* sentence by sentence with the module-level pipeline.

    The text is split on ". ", each non-blank piece is translated on its own,
    and the results are joined back with ". ". A piece whose translation fails
    is kept untranslated so one bad sentence does not abort the document. The
    failure is logged rather than silently discarded.

    Args:
        text: Source text. ``None`` is treated as an empty string.

    Returns:
        The translated text, or "" when there was nothing to translate.
    """
    import logging

    log = logging.getLogger(__name__)
    translated = []
    for chunk in (text or "").split(". "):
        sentence = chunk.strip()
        if not sentence:
            continue
        try:
            translated.append(translator(sentence)[0]["translation_text"])
        except Exception:
            # Best effort: keep the original piece, but leave a trace.
            log.exception("Translation failed for a segment; keeping source text")
            translated.append(chunk)
    return ". ".join(translated)
68
+
69
def create_pdf(translated_text, output_path="translated_output.pdf"):
    """Render *translated_text* into an A4 PDF and return its path.

    Args:
        translated_text: Text to typeset. Every non-blank line becomes its own
            justified paragraph followed by a small vertical gap.
        output_path: Where the PDF is written.

    Returns:
        ``output_path``.
    """
    from xml.sax.saxutils import escape

    # NOTE(review): 'HeiseiKakuGo-W5' is one of ReportLab's built-in CID fonts.
    # Confirm it covers the accented Latin letters used in Kiembu text.
    pdfmetrics.registerFont(UnicodeCIDFont('HeiseiKakuGo-W5'))
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=60,
        leftMargin=60,
        topMargin=72,
        bottomMargin=72,
    )
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        name='TitleStyle',
        parent=styles['Heading1'],
        alignment=TA_CENTER,
        fontName='HeiseiKakuGo-W5',
        fontSize=16,
        spaceAfter=20,
    )
    body_style = ParagraphStyle(
        name='BodyStyle',
        parent=styles['Normal'],
        alignment=TA_JUSTIFY,
        fontName='HeiseiKakuGo-W5',
        fontSize=12,
        leading=16,
    )
    story = [
        Paragraph("Translated Document — English → Kiembu", title_style),
        Spacer(1, 0.3 * inch),
    ]
    for para in translated_text.split("\n"):
        content = para.strip()
        if content:
            # Paragraph interprets its text as markup, so '&', '<' and '>'
            # from the source document must be escaped to be typeset
            # literally.
            story.append(Paragraph(escape(content), body_style))
            story.append(Spacer(1, 0.2 * inch))
    doc.build(story)
    return output_path
87
+
88
def translate_pdf_to_kiembu(pdf_file):
    """Translate an uploaded PDF and produce a new PDF with the result.

    Args:
        pdf_file: The upload from the file component: an object with a
            ``.name`` path, a plain path string, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(path, status)`` pair. ``path`` is the generated PDF, or ``None``
        when there was nothing to translate; ``status`` is a message for the
        user.
    """
    if pdf_file is None:
        return None, "Please upload a PDF document."

    # Accept both file-like uploads (with .name) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None, "No readable text found in the uploaded PDF."

    translated_text = translate_text(text)
    output_pdf_path = create_pdf(translated_text)
    return output_pdf_path, "Translation complete! Download below."
95
+
96
+
97
+ # ============================================
98
+ # SECTION 3 — NRF LLM MODEL PDF CHAT
99
+ # ============================================
100
+
101
# Sentence encoder used to embed both PDF chunks and user questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generator model. The access token is read from the environment so it is not
# stored in the source.
model_name = "google/gemma-2b-it"
hf_token = os.environ.get("NRF_LLM_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=hf_token,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
)

# State of the currently loaded PDF: its text chunks, the FAISS index built
# over them, and a flag saying whether both are ready.
chunks = []
index = None
pdf_loaded = False
109
+
110
def extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of *pdf_file* (via PyMuPDF).

    Args:
        pdf_file: An upload object whose ``.name`` is the file path, or a
            plain path string.

    Returns:
        The text of all pages joined together, with no separator added.
    """
    doc = fitz.open(getattr(pdf_file, "name", pdf_file))
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document so the file handle is not left open after each
        # upload.
        doc.close()
116
+
117
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be smaller
            than ``chunk_size``, otherwise the window could never advance.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or
            ``overlap >= chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunk_list = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunk_list.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reaches the last word. A further window
            # would only repeat a tail that is fully contained in this chunk.
            break
    return chunk_list
126
+
127
def embed_chunks(chunks_list):
    """Embed text chunks and return a FAISS L2 index over the vectors.

    Args:
        chunks_list: Non-empty sequence of chunk strings.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk, in input order.

    Raises:
        ValueError: If ``chunks_list`` is empty, since there is nothing to
            index and the embedding dimension cannot be determined.
    """
    if len(chunks_list) == 0:
        raise ValueError("No text chunks to index.")
    # FAISS works on float32 matrices, so pin the dtype explicitly.
    embeddings = np.asarray(embed_model.encode(chunks_list), dtype="float32")
    idx = faiss.IndexFlatL2(embeddings.shape[1])
    idx.add(embeddings)
    return idx
132
+
133
def load_pdf_and_prepare(pdf_file):
    """Extract, chunk and index an uploaded PDF for question answering.

    On success the module-level ``chunks``, ``index`` and ``pdf_loaded`` are
    replaced together. On any failure they are left untouched, so a previously
    loaded PDF stays usable and ``chunks`` never disagrees with ``index``.

    Args:
        pdf_file: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A status message for the user.
    """
    global chunks, index, pdf_loaded
    if pdf_file is None:
        return "⚠️ Please upload a PDF document first."
    try:
        new_chunks = chunk_text(extract_pdf_text(pdf_file))
        if not new_chunks:
            return "❌ Error: No readable text found in the PDF."
        new_index = embed_chunks(new_chunks)
    except Exception as e:
        return f"❌ Error: {str(e)}"
    # Publish the new state only after every step has succeeded.
    chunks, index, pdf_loaded = new_chunks, new_index, True
    return "✅ PDF uploaded and processed successfully."
143
+
144
def delete_pdf():
    """Forget the currently loaded PDF and return a confirmation message."""
    global chunks, index, pdf_loaded
    # Reset each piece of module-level state to its initial value.
    chunks = []
    index = None
    pdf_loaded = False
    return "🗑️ PDF cleared. Ready for new upload."
148
+
149
def query_pdf(question, top_k=3):
    """Answer *question* from the most relevant chunks of the loaded PDF.

    Args:
        question: The user's question.
        top_k: How many nearest chunks to use as context. Values larger than
            the number of available chunks are clamped.

    Returns:
        The generated answer, or a warning message when no PDF is loaded or
        the question is blank.
    """
    if not pdf_loaded:
        return "⚠️ Please upload and process a PDF first."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Never ask the index for more neighbours than it holds.
    k = max(1, min(int(top_k), len(chunks)))
    question_embedding = np.asarray(embed_model.encode([question]), dtype="float32")
    _, neighbours = index.search(question_embedding, k)
    # Skip -1 placeholders: used as a list index, -1 would silently pick the
    # last chunk.
    context = "\n".join(chunks[i] for i in neighbours[0] if 0 <= i < len(chunks))

    prompt = f"Answer the question using the context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = generator(prompt)[0]["generated_text"]
    # When the output echoes the prompt, strip exactly that prefix, so an
    # "Answer:" inside the generated text does not cut the answer short.
    if response.startswith(prompt):
        return response[len(prompt):].strip()
    return response.split("Answer:")[-1].strip()
158
+
159
+
160
+ # ============================================
161
+ # SECTION 4 — ENHANCED GRADIO UI
162
+ # ============================================
163
 
 
 
 
 
 
 
 
 
 
164
  def build_app():
165
  custom_css = """
166
  .gradio-container {
167
  font-family: 'Inter', 'Segoe UI', sans-serif;
168
+ background: #f9fafb;
169
+ color: #1f2937;
170
  }
171
  h1, h2, h3 {
172
  color: #003366 !important;
 
175
  .tab-nav button {
176
  font-size: 16px !important;
177
  font-weight: 500 !important;
178
+ border-radius: 8px !important;
179
  }
180
+ textarea, input, .gr-text-input {
181
  font-size: 15px !important;
182
+ border-radius: 10px !important;
183
+ }
184
+ .gr-button {
185
+ background-color: #003366 !important;
186
+ color: white !important;
187
+ border-radius: 10px !important;
188
+ font-weight: 500 !important;
189
+ }
190
+ .gr-button:hover {
191
+ background-color: #0055a4 !important;
192
  }
193
  """
194
 
195
+ with gr.Blocks(title="Kiembu ↔ English — NRF Kenya Project", css=custom_css,
196
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as app:
197
+
198
+ gr.Markdown("""
199
+ <div style='text-align:center'>
200
+ <h1 style='color:#003366;'>Kiembu ↔ English Translation Suite</h1>
201
+ <h3 style='color:#d4a017;'>Funded by NRF Kenya — Creating LLMs that Understand Native Languages</h3>
202
+ <hr style='border:1px solid #003366;width:80%;margin:auto'>
203
+ </div>
204
+ """)
 
 
205
 
206
  with gr.Tabs():
207
+ # Dictionary Tab
208
+ with gr.TabItem("Dictionary Translator"):
209
+ gr.Markdown("### Quick Word Translation — **Kiembu ↔ English**")
210
+ inp = gr.Textbox(label="Enter Word", placeholder="e.g. 'Uvoro' or 'how are you'", lines=1)
211
+ dir_sel = gr.Radio(["Kiembu English", "English Kiembu"], value="Kiembu English", label="Select Direction")
212
+ out = gr.Textbox(label="Translation Result")
213
+ gr.Button("Translate").click(translate_word, [inp, dir_sel], out)
214
+
215
+ # PDF Translation Tab
216
+ with gr.TabItem("PDF Translation"):
217
+ gr.Markdown("### Upload English PDF Get Kiembu Translated PDF")
218
+ pdf_input = gr.File(label="Upload English PDF", file_types=[".pdf"])
219
+ translate_btn = gr.Button("Translate to Kiembu")
220
+ output_file = gr.File(label="Download Translated PDF")
221
+ status = gr.Textbox(label="Status", interactive=False)
222
+ translate_btn.click(translate_pdf_to_kiembu, inputs=[pdf_input], outputs=[output_file, status])
223
+
224
+ # NRF LLM Model Q&A Tab
225
+ with gr.TabItem("PDF Chat (NRF LLM Model)"):
226
+ gr.Markdown("### Ask Questions from Any PDF using the NRF LLM Model")
227
+ pdf = gr.File(label="Upload PDF Document")
228
+ status = gr.Textbox(label="Status")
229
+ gr.Button("Process PDF").click(load_pdf_and_prepare, pdf, status)
230
+ gr.Button("Clear PDF").click(delete_pdf, None, status)
231
+ q = gr.Textbox(lines=2, label="Ask a Question", placeholder="e.g. 'Summarize the introduction section.'")
232
+ ans = gr.Textbox(lines=6, label="Answer")
233
+ gr.Button("Query PDF").click(query_pdf, q, ans)
234
+
235
+ gr.Markdown("""
236
+ <hr style='border:0.5px solid #ccc'>
237
+ <div style='text-align:center;color:#003366;font-size:14px'>
238
+ © 2025 National Research Fund (NRF) Kenya — All Rights Reserved
239
+ </div>
240
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  return app
243
 
244
 
245
# Keep the interface available as a module-level `demo` object.
demo = build_app()

if __name__ == "__main__":
    # Start the server only when run as a script, so importing this module
    # (e.g. from tests or tooling) does not launch it.
    demo.launch()