acecalisto3 committed
Commit 8a867c6 · verified · 1 Parent(s): 2fd860e

Update app.py

Files changed (1):
  1. app.py +288 -236

app.py CHANGED
@@ -1,5 +1,3 @@
- # app.py
-
  import os
  import re
  import requests
@@ -16,45 +14,103 @@ from urllib.parse import urljoin, urlparse
 
  # Third-party libraries
  import gradio as gr
- from huggingface_hub import InferenceClient
  from pypdf import PdfReader
  from bs4 import BeautifulSoup
  import nltk
 
- # Local imports from the enhanced prompt library
- from agent_prompt import PromptLibrary, SystemAuditor
-
  # --- CONFIGURATION ---
  class Config:
      HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
      HF_TOKEN = os.getenv("HF_TOKEN")
      VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
-     MAX_NEW_TOKENS_REPORT = 8192
-     MAX_NEW_TOKENS_CHAT = 2048
-     REQUESTS_TIMEOUT = 15
      USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
-
  # --- UTILITIES ---
  def log(message: str) -> None:
-     if Config.VERBOSE:
-         print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
-
- class SessionManager:
-     # ... (No changes from previous version)
-     def __init__(self, session_id: str):
-         self.session_id = session_id
-         self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-     def __enter__(self) -> str:
-         os.makedirs(self.temp_dir, exist_ok=True)
-         return self.temp_dir
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         shutil.rmtree(self.temp_dir, ignore_errors=True)
 
  # --- CORE APPLICATION ENGINE ---
  class MaestroEngine:
-     """Handles all data processing and LLM interaction logic."""
      def __init__(self):
          self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
          try:
              nltk.data.find("tokenizers/punkt")
          except LookupError:
@@ -62,165 +118,177 @@ class MaestroEngine:
              nltk.download('punkt', quiet=True)
          log("MaestroEngine initialized.")
 
-     # --- File Processors ---
-     def _read_pdf(self, file_path: str) -> str:
          try:
-             reader = PdfReader(file_path)
-             return "\n".join(page.extract_text() or "" for page in reader.pages)
-         except Exception as e: return f"Error reading PDF {os.path.basename(file_path)}: {e}"
-
-     def _process_zip(self, zip_path: str, temp_dir: str) -> str:
-         # ... (Identical to previous correct version)
-         extracted_texts = []
-         extract_path = os.path.join(temp_dir, "zip_extract")
-         os.makedirs(extract_path, exist_ok=True)
-         try:
-             with zipfile.ZipFile(zip_path, 'r') as zf:
-                 for member in zf.infolist():
-                     if member.is_dir() or member.filename.startswith('__MACOSX'): continue
-                     member_path = zf.extract(member, path=extract_path)
-                     if member.filename.endswith('.pdf'): extracted_texts.append(self._read_pdf(member_path))
-                     elif member.filename.endswith('.txt'):
-                         with open(member_path, 'r', encoding='utf-8', errors='ignore') as f: extracted_texts.append(f.read())
-             return "\n\n".join(extracted_texts)
-         except Exception as e: return f"Error processing ZIP {os.path.basename(zip_path)}: {e}"
-
-     # --- Google Workspace Integration ---
-     def _get_google_drive_id(self, url: str) -> str | None:
-         match = re.search(r"/file/d/([a-zA-Z0-9_-]+)", url)
-         return match.group(1) if match else None
 
-     def _download_google_drive_file(self, file_id: str, temp_dir: str) -> Tuple[str | None, str | None]:
-         URL = "https://docs.google.com/uc?export=download&id="
          try:
-             response = requests.get(URL + file_id, stream=True, timeout=30)
              response.raise_for_status()
-             disposition = response.headers.get('content-disposition', '')
-             fname = re.findall('filename="(.+)"', disposition)
-             filename = fname[0] if fname else f"{file_id}_download"
-             file_path = os.path.join(temp_dir, filename)
-             with open(file_path, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-             return file_path, None
-         except Exception as e:
-             return None, f"Failed to download Google Drive file ({file_id}): {e}"
 
-     def _fetch_google_doc(self, url: str) -> Tuple[str | None, str | None]:
          try:
-             doc_id = re.search(r"/document/d/([a-zA-Z0-9_-]+)", url).group(1)
-             export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
-             response = requests.get(export_url, timeout=Config.REQUESTS_TIMEOUT)
              response.raise_for_status()
-             return response.text, None
-         except Exception as e: return None, f"Failed to fetch Google Doc: {e}"
-
-     # --- Web Crawler ---
-     def _crawl_website(self, start_url: str, max_depth: int) -> Tuple[str, List[str]]:
-         if not start_url: return "", []
-         visited: Set[str] = set()
-         to_visit: List[Tuple[str, int]] = [(start_url, 0)]
-         all_text, errors = [], []
-         base_netloc = urlparse(start_url).netloc
-
-         while to_visit and len(visited) < 50: # Safety break
-             current_url, depth = to_visit.pop(0)
-             if current_url in visited or depth > max_depth: continue
-
-             log(f"Crawling (depth {depth}): {current_url}")
-             visited.add(current_url)
-
-             try:
-                 response = requests.get(current_url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
-                 response.raise_for_status()
-                 soup = BeautifulSoup(response.content, 'html.parser')
-                 all_text.append(soup.get_text(separator="\n", strip=True))
-
-                 if depth < max_depth:
-                     for link in soup.find_all('a', href=True):
-                         abs_url = urljoin(current_url, link['href'])
-                         if urlparse(abs_url).netloc == base_netloc and abs_url not in visited:
-                             to_visit.append((abs_url, depth + 1))
-             except Exception as e:
-                 errors.append(f"Crawl Error on {current_url}: {e}")
-         return "\n\n".join(all_text), errors
-
-     # --- Main Orchestrator ---
-     def process_data_sources(self, session_id: str, url: str, crawl_depth: int, text: str, file_paths: List[str]) -> Tuple[str, List[str]]:
          all_content, errors = [], []
-         with SessionManager(session_id) as temp_dir:
-             # 1. Direct Text
-             if text: all_content.append(text)
-
-             # 2. Uploaded Files
-             if file_paths:
-                 for path in file_paths:
-                     if not path: continue
-                     filename = os.path.basename(path)
-                     ext = os.path.splitext(filename)[1].lower()
-                     if ext == '.pdf': all_content.append(self._read_pdf(path))
-                     elif ext == '.txt':
-                         with open(path, 'r', encoding='utf-8', errors='ignore') as f: all_content.append(f.read())
-                     elif ext == '.zip': all_content.append(self._process_zip(path, temp_dir))
-                     else: errors.append(f"Unsupported file type: {filename}")
-
-             # 3. URL Processing (Crawler, Google Drive, Google Docs)
-             if url:
-                 if "drive.google.com/file/d/" in url:
-                     file_id = self._get_google_drive_id(url)
-                     if file_id:
-                         path, err = self._download_google_drive_file(file_id, temp_dir)
-                         if err: errors.append(err)
-                         elif path: all_content.append(self._read_pdf(path)) # Assuming PDF for simplicity
-                     else: errors.append(f"Invalid Google Drive URL: {url}")
-                 elif "docs.google.com/document/d/" in url:
-                     content, err = self._fetch_google_doc(url)
-                     if err: errors.append(err)
-                     else: all_content.append(content)
-                 else: # Standard web crawling
-                     content, crawl_errors = self._crawl_website(url, crawl_depth)
-                     all_content.append(content)
-                     errors.extend(crawl_errors)
-
          return "\n\n---\n\n".join(all_content), errors
 
      # --- LLM Interaction ---
-     def _query_llm_stream(self, system_prompt: str, user_prompt: str, max_tokens: int) -> Generator[str, None, None]:
-         full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
-         try:
-             for token in self.client.text_generation(full_prompt, max_new_tokens=max_tokens, stream=True, temperature=0.7, top_p=0.95):
-                 yield token
-         except Exception as e:
-             log(f"LLM stream query failed: {e}")
-             yield f"Error communicating with the model: {e}"
-
-     def run_rag_query(self, query: str, context: str, purpose: str) -> Generator[str, None, None]:
          system_prompt = PromptLibrary.AGENT_PREFIX.format(
              dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
-             user_purpose=purpose or "Direct Question & Answer"
          )
-         user_prompt = f"Based *only* on the context provided below, answer the user's question.\n\nCONTEXT:\n---\n{context}\n---\n\nQUESTION: {query}"
-         yield from self._query_llm_stream(system_prompt, user_prompt, Config.MAX_NEW_TOKENS_CHAT)
 
-     def generate_report(self, report_type: str, context: str, objective: str) -> str:
-         system_prompt = PromptLibrary.AGENT_PREFIX.format(
-             dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
-             user_purpose=objective
-         )
-         if report_type == "Narrative Prose Report":
-             user_prompt = PromptLibrary.NARRATIVE_PROSE_REPORT.format(task_objective=objective, knowledge_base=context)
-         else: # Technical JSON Report
-             user_prompt = PromptLibrary.TECHNICAL_JSON_REPORT.format(task_objective=objective, knowledge_base=context)
 
-         # Reports are not streamed
-         full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
-         response = self.client.text_generation(full_prompt, max_new_tokens=Config.MAX_NEW_TOKENS_REPORT)
 
-         if report_type == "Technical JSON Report":
-             clean_json_str = re.sub(r'```json\s*|\s*```', '', response).strip()
-             try: return json.dumps(json.loads(clean_json_str), indent=2)
-             except json.JSONDecodeError: return json.dumps({"error": "Model returned invalid JSON", "raw_response": response})
-         return response
 
  # --- GRADIO APPLICATION ---
  class GradioApp:
@@ -229,91 +297,75 @@ class GradioApp:
          self.app = self._build_ui()
 
      def _build_ui(self) -> gr.Blocks:
-         with gr.Blocks(theme=gr.themes.Soft(primary_hue="emerald", secondary_hue="green"), title="Maestro AI Engine") as app:
              session_id = gr.State(lambda: secrets.token_hex(16))
-             processed_data = gr.State("")
-
              gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
 
              with gr.Tabs():
-                 with gr.TabItem("① Data Ingestion"):
-                     user_purpose = gr.Textbox(label="High-Level Goal / Purpose", placeholder="e.g., 'Research AI impact on agriculture for a market report'")
                      with gr.Row():
                          with gr.Column(scale=3):
-                             url_input = gr.Textbox(label="Ingest from URL", placeholder="Enter a standard URL, Google Drive, or Google Docs link")
-                             text_input = gr.Textbox(label="Paste Text", lines=8)
-                         with gr.Column(scale=2):
-                             crawl_depth = gr.Slider(label="Web Crawl Depth", minimum=0, maximum=5, value=1, step=1, info="For standard URLs only. 0=current page, 1=page + links, etc.")
-                             file_upload = gr.File(label="Upload Files (.pdf, .txt, .zip)", file_count="multiple", type="filepath")
-                     process_button = gr.Button("🚀 Process All Sources", variant="primary")
-                     ingestion_summary = gr.Textbox(label="Ingestion Summary", interactive=False, lines=2)
-                     error_log = gr.Textbox(label="Errors & Warnings", interactive=False, lines=3)
-
-                 with gr.TabItem("② Reporting & Synthesis"):
                      with gr.Row():
-                         report_type = gr.Dropdown(label="Select Report Type", choices=["Narrative Prose Report", "Technical JSON Report"], value="Narrative Prose Report")
-                         generate_button = gr.Button("Generate Report", variant="primary")
-                     with gr.Tabs():
-                         with gr.TabItem("Narrative Output"): report_output_md = gr.Markdown()
-                         with gr.TabItem("JSON Output"): report_output_json = gr.JSON()
 
-                 with gr.TabItem("③ Direct Chat Q&A"):
-                     chatbot = gr.Chatbot(label="Chat Interface", height=550, bubble_full_width=False)
-                     msg_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the processed data...", scale=4)
-                     msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data, user_purpose], [msg_input, chatbot])
 
-             process_button.click(self._ingest_workflow, [session_id, url_input, crawl_depth, text_input, file_upload, process_button], [processed_data, ingestion_summary, error_log, process_button])
-             generate_button.click(self._reporting_workflow, [session_id, report_type, processed_data, user_purpose, generate_button], [report_output_md, report_output_json, generate_button])
 
-         return app
 
-     def _ingest_workflow(self, s_id, url, depth, text, files, btn):
-         yield gr.update(value="⚙️ Processing...", interactive=False)
-         log(f"Starting ingestion for session {s_id}...")
-         data, errors = self.engine.process_data_sources(s_id, url, depth, text, files)
-         summary = f"Processing complete. {len(data):,} characters ingested from all sources. {len(errors)} errors encountered."
-         yield data, summary, "\n".join(errors), gr.update(value="🚀 Process All Sources", interactive=True)
-
-     def _chat_workflow(self, message, history, context, purpose):
-         if not context:
-             history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
-             return "", history
 
-         history.append((message, ""))
-         log(f"Starting RAG query. Purpose: {purpose}")
-         full_response = ""
-         for token in self.engine.run_rag_query(message, context, purpose):
-             full_response += token
-             history[-1] = (message, full_response)
-             yield "", history
-
-     def _reporting_workflow(self, s_id, r_type, context, objective, btn):
-         yield gr.update(value="Generating...", interactive=False), None, None
-         auditor = SystemAuditor(session_id=s_id)
-         if not context:
-             md_out = "### Error: No data ingested. Please process sources in Tab 1."
-             yield gr.update(value="Generate Report", interactive=True), md_out, None
-             return
-
-         start_time = time.time()
-         log(auditor.format_prompt_log(f"Generating report: '{r_type}'"))
-         response = self.engine.generate_report(r_type, context, objective)
-         latency = (time.time() - start_time) * 1000
-         log(auditor.format_response_log(response, latency, 1, 0.95))
-
-         if r_type == "Narrative Prose Report":
-             yield gr.update(interactive=True), response, None
-         else: # Technical JSON Report
-             try: yield gr.update(interactive=True), None, json.loads(response)
-             except json.JSONDecodeError: yield gr.update(interactive=True), None, {"error": "Could not parse JSON from model."}
 
      def launch(self): self.app.launch(debug=Config.VERBOSE, share=False)
 
  if __name__ == "__main__":
      if not Config.HF_TOKEN:
-         print("FATAL: Hugging Face token (HF_TOKEN) not found in environment variables.")
      else:
-         log("Instantiating Maestro Engine and launching Gradio App...")
-         maestro_engine = MaestroEngine()
-         gradio_app = GradioApp(engine=maestro_engine)
-         gradio_app.launch()
 
  import os
  import re
  import requests
 
  # Third-party libraries
  import gradio as gr
+ from huggingface_hub import InferenceClient, HfApi, hf_hub_download
+ from huggingface_hub.utils import HfHubHTTPError
  from pypdf import PdfReader
  from bs4 import BeautifulSoup
  import nltk
 
  # --- CONFIGURATION ---
  class Config:
+     """Centralized configuration for the Maestro application."""
      HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
      HF_TOKEN = os.getenv("HF_TOKEN")
+     HF_DATASET_REPO = "Omnibus/tmp" # As specified in the user's script
+     MEMORY_MAIN_PATH = "mem-test2/main.json"
+     MEMORY_INDEX_PATH = "mem-test2/index.json"
+     MEMORY_DATA_PATH = "mem-test2"
      VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
+     MAX_TOKENS_SYNTHESIS = 4096
+     MAX_TOKENS_REPORT = 8192
+     MAX_TOKENS_CHAT = 2048
+     MAX_DATA_CHUNK = 20000 # For processing large text bodies
+     REQUESTS_TIMEOUT = 20
      USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+
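+ # NOTE: the memory system below writes to Config.HF_DATASET_REPO via HfApi.upload_file,
+ # so HF_TOKEN must have write access to that dataset repo for saves to succeed.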
+ # --- PROMPT LIBRARY (Integrated for simplicity) ---
+ class PromptLibrary:
+     """A centralized library of meticulously crafted prompt templates."""
+     AGENT_PREFIX = """
+ You are Maestro, an Expert Information Retrieval and Synthesis Agent. Your operation is governed by these directives:
+ 1. Ethical Safeguard [v2.4]: Refuse to process harmful, illegal, or unethical requests.
+ 2. Temporal Awareness: Use the timestamp {dynamic_timestamp_utc} to evaluate data relevance.
+ 3. Contextual Prioritization: Analyze the user's purpose '{user_purpose}' to weigh data relevance.
+ """
+     COMPRESS_JSON = """
+ Task: {task}
+ Based on the AGENT_PREFIX context and the following data, generate a structured and concise JSON summary.
+
+ Input Data Chunk:
+ ---
+ {history}
+ ---
+
+ Existing Knowledge (for context):
+ ---
+ {knowledge}
+ ---
+
+ Instructions:
+ Compile and categorize the data above into a JSON dictionary string. Extract key information, group related entities, and ensure the output is a single, valid JSON object.
+ """
+     COMPRESS_REPORT = """
+ Task: {task}
+ Based on the AGENT_PREFIX context and the summarized knowledge you have, compile a detailed, exhaustive report (~8000 words).
+
+ Summarized Knowledge:
+ ---
+ {knowledge}
+ ---
+
+ Last Chunk of Raw Data (for final context):
+ ---
+ {history}
+ ---
+
+ Instructions:
+ Synthesize all provided information into a single, comprehensive narrative. Be thorough, detailed, and structure the report with clear headings and sections.
+ """
+     SAVE_MEMORY = """
+ Task: {task}
+ Data:
+ ---
+ {history}
+ ---
+ Instructions:
+ Compile and categorize the data above into a JSON dictionary string. Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format.
+ Required keys: "keywords", "title", "description", "content", "url". The "keywords" list should be comprehensive.
+ """
+     RECALL_MEMORY = """
+ The user will give you a query and a list of keywords from a database index.
+ Your duty is to choose the words from the list that are most closely related to the search query.
+ If no keywords are relevant, return an empty list: [].
+ Respond only with a single, valid JSON list of strings.
+
+ USER QUERY: {prompt}
+ KEYWORD LIST: {keywords}
+ """
+
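+ # NOTE: these templates are rendered with str.format in MaestroEngine._run_gpt, so each
+ # template's {placeholders} must match the kwargs passed there: task/history/knowledge for
+ # the COMPRESS_* and SAVE_MEMORY templates, prompt/keywords for RECALL_MEMORY.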
  # --- UTILITIES ---
  def log(message: str) -> None:
+     if Config.VERBOSE: print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
 
  # --- CORE APPLICATION ENGINE ---
108
  class MaestroEngine:
109
+ """Handles all data processing, memory management, and LLM interaction."""
110
  def __init__(self):
111
+ if not Config.HF_TOKEN: raise ValueError("HF_TOKEN environment variable not set!")
112
  self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
113
+ self.api = HfApi(token=Config.HF_TOKEN)
114
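+         # NOTE: with a plain model id, InferenceClient targets the hosted Inference API;
+         # no weights are loaded locally, so the app depends on that endpoint serving HF_MODEL.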
          try:
              nltk.data.find("tokenizers/punkt")
          except LookupError:
              nltk.download('punkt', quiet=True)
          log("MaestroEngine initialized.")
 
+     # --- Data Ingestion ---
+     def _read_pdf_from_path(self, path: str) -> str:
          try:
+             return "\n".join(page.extract_text() or "" for page in PdfReader(path).pages)
+         except Exception as e: return f"Error reading PDF {os.path.basename(path)}: {e}"
 
+     def _read_pdf_from_url(self, url: str) -> str:
          try:
+             response = requests.get(url, stream=True, timeout=Config.REQUESTS_TIMEOUT)
              response.raise_for_status()
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                 tmp_file.write(response.content)
+             return self._read_pdf_from_path(tmp_file.name)
+         except Exception as e: return f"Failed to download or read PDF from {url}: {e}"
+         finally:
+             if 'tmp_file' in locals() and os.path.exists(tmp_file.name): os.remove(tmp_file.name)
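+     # NOTE: delete=False above keeps the temp file on disk after the handle closes, so
+     # _read_pdf_from_path can reopen it (a NamedTemporaryFile cannot be reopened while
+     # still open on Windows); the finally block then removes it once extraction is done.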
 
+     def _get_web_text(self, url: str) -> str:
          try:
+             response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
              response.raise_for_status()
+             return BeautifulSoup(response.content, 'lxml').get_text(separator="\n", strip=True)
+         except Exception as e: return f"Failed to fetch URL {url}: {e}"
+
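+     # NOTE: the 'lxml' parser requires the lxml package at runtime; BeautifulSoup's
+     # built-in 'html.parser' is a drop-in replacement if lxml is not installed.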
+     def process_data_sources(self, text: str, files: List[str], url: str, pdf_url: str, pdf_batch: str) -> Tuple[str, List[str]]:
+         """Orchestrates data ingestion from all provided sources."""
          all_content, errors = [], []
+         if text: all_content.append(text)
+         if url: all_content.append(self._get_web_text(url))
+         if pdf_url: all_content.append(self._read_pdf_from_url(pdf_url))
+         if pdf_batch:
+             urls = [u.strip() for u in pdf_batch.split(',') if u.strip()]
+             for u in urls:
+                 content = self._read_pdf_from_url(u)
+                 if content.startswith(("Error", "Failed")): errors.append(content) # helper errors start with "Error" or "Failed"
+                 else: all_content.append(content)
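+         # NOTE: the download helpers signal failure via returned strings; only this batch
+         # path routes them into `errors`, while a failing `url`/`pdf_url` above still lands
+         # its error message in the ingested text.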
+         if files:
+             for path in files:
+                 if not path: continue
+                 filename, ext = os.path.basename(path), os.path.splitext(path)[1].lower()
+                 if ext == '.pdf': all_content.append(self._read_pdf_from_path(path))
+                 elif ext == '.txt':
+                     with open(path, 'r', encoding='utf-8', errors='ignore') as f: all_content.append(f.read())
+                 else: errors.append(f"Unsupported file type: {filename}")
          return "\n\n---\n\n".join(all_content), errors
 
      # --- LLM Interaction ---
+     def _run_gpt(self, prompt_template: str, max_tokens: int, **kwargs) -> str:
+         """Core LLM call function."""
          system_prompt = PromptLibrary.AGENT_PREFIX.format(
              dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
+             user_purpose=kwargs.get('task', 'completing a system task.')
          )
+         full_prompt = f"<s>[INST] {system_prompt}\n\n{prompt_template.format(**kwargs)} [/INST]"
+         log(f"Running GPT. Template: {prompt_template[:50]}...")
+         try:
+             return self.client.text_generation(full_prompt, max_new_tokens=max_tokens, temperature=0.8, top_p=0.95).strip()
+         except Exception as e:
+             log(f"LLM Error: {e}")
+             return f'{{"error": "LLM call failed", "details": "{e}"}}'
 
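+     # NOTE: the <s>[INST] ... [/INST] wrapper in _run_gpt is the instruction format expected
+     # by the default Mixtral-8x7B-Instruct model; a different HF_MODEL may need another template.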
+     def _chunk_and_process(self, text: str, prompt_template: str, task: str, max_tokens: int) -> List[str]:
+         """Chunks large text and processes each chunk with an LLM."""
+         text_len = len(text)
+         if text_len == 0: return []
+         num_chunks = (text_len + Config.MAX_DATA_CHUNK - 1) // Config.MAX_DATA_CHUNK
+         chunk_size = (text_len + num_chunks - 1) // num_chunks
+
+         results, knowledge = [], ""
+         for i in range(num_chunks):
+             chunk = text[i*chunk_size : (i+1)*chunk_size]
+             log(f"Processing chunk {i+1}/{num_chunks}...")
+             resp = self._run_gpt(prompt_template, max_tokens, task=task, knowledge=knowledge, history=chunk)
+             knowledge = resp if len(resp) < 2000 else resp[:2000] # Use response as context for next chunk
+             results.append(resp)
+         return results
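+     # NOTE: both divisions above are ceiling divisions, which keeps the chunks balanced:
+     # e.g. 45,000 chars with MAX_DATA_CHUNK = 20,000 yields num_chunks = 3 and
+     # chunk_size = 15,000, instead of 20,000 + 20,000 + 5,000.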
+
+     # --- Synthesis & Reporting Workflow ---
+     def synthesis_workflow(self, text: str, task: str, do_summarize: bool, do_report: bool) -> Tuple[str, List[Dict]]:
+         """Handles the multi-stage summarization and reporting process."""
+         if not text: return "No data to process.", []
+         json_summary_objects, final_report = [], ""
+
+         if do_summarize or do_report: # Summarization is a prerequisite for reporting
+             log("Starting summarization stage...")
+             summaries = self._chunk_and_process(text, PromptLibrary.COMPRESS_JSON, task, Config.MAX_TOKENS_SYNTHESIS)
+             for s in summaries:
+                 try: json_summary_objects.append(json.loads(s))
+                 except json.JSONDecodeError: json_summary_objects.append({"error": "Failed to parse summary JSON", "raw": s})
+             log("Summarization stage complete.")
+
+         if do_report:
+             log("Starting report generation stage...")
+             # Use the collected JSON summaries as knowledge for the final report
+             knowledge_for_report = json.dumps(json_summary_objects, indent=2)
+             final_report = self._run_gpt(PromptLibrary.COMPRESS_REPORT, Config.MAX_TOKENS_REPORT, task=task, knowledge=knowledge_for_report, history="All data chunks have been summarized.")
+             log("Report generation complete.")
+             return final_report, json_summary_objects
+
+         return "Summarization complete.", json_summary_objects
+
+     # --- Persistent Memory System ---
+     def _hf_download_json(self, repo_path: str, default: Any = None) -> Any:
+         # A mutable default ([]) would be shared across calls and mutated by callers,
+         # so the empty-list default is created per call instead.
+         if default is None: default = []
+         try:
+             path = hf_hub_download(repo_id=Config.HF_DATASET_REPO, filename=repo_path, repo_type="dataset", token=Config.HF_TOKEN)
+             with open(path, 'r') as f: return json.load(f)
+         except HfHubHTTPError: return default # File doesn't exist, return default
+         except (json.JSONDecodeError, IOError): return default
+
+     def _hf_upload_json(self, data: Any, repo_path: str):
+         with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".json") as tmp_file:
+             json.dump(data, tmp_file, indent=4)
+             tmp_path = tmp_file.name
+         self.api.upload_file(path_or_fileobj=tmp_path, path_in_repo=repo_path, repo_id=Config.HF_DATASET_REPO, repo_type="dataset")
+         os.remove(tmp_path)
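+     # NOTE: each _hf_upload_json call creates a separate commit on the dataset repo, so one
+     # save_to_memory run produces a commit per chunk plus one each for main.json and index.json.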
+
+     def save_to_memory(self, text: str, task: str) -> List[Dict]:
+         """Saves processed text to the Hugging Face Dataset repo."""
+         log("Starting memory save process...")
+         json_chunks = self._chunk_and_process(text, PromptLibrary.SAVE_MEMORY, task, Config.MAX_TOKENS_SYNTHESIS)
+         parsed_chunks, main_file = [], self._hf_download_json(Config.MEMORY_MAIN_PATH)
+
+         for i, chunk_str in enumerate(json_chunks):
+             try:
+                 data = json.loads(chunk_str)
+                 ts = datetime.datetime.now(datetime.timezone.utc)
+                 filename = f"{ts.strftime('%Y-%m-%d-%H-%M-%S')}-{uuid.uuid4().hex[:8]}.json"
+                 self._hf_upload_json(data, f"{Config.MEMORY_DATA_PATH}/{filename}")
+                 main_file.append({"file_name": filename, "keywords": data.get("keywords", []), "description": data.get("description", "")})
+                 parsed_chunks.append(data)
+             except json.JSONDecodeError: log(f"Could not parse memory chunk {i} into JSON.")
+
+         self._hf_upload_json(main_file, Config.MEMORY_MAIN_PATH)
+         self.update_keyword_index(main_file)
+         log("Memory save complete.")
+         return parsed_chunks
+
+     def update_keyword_index(self, main_file_content: List[Dict]):
+         log("Updating keyword index...")
+         keyword_index = {}
+         for entry in main_file_content:
+             for keyword in entry.get("keywords", []):
+                 k = keyword.strip().lower()
+                 if k not in keyword_index: keyword_index[k] = []
+                 if entry["file_name"] not in keyword_index[k]: keyword_index[k].append(entry["file_name"])
+         self._hf_upload_json(keyword_index, Config.MEMORY_INDEX_PATH)
+         log("Keyword index updated.")
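+     # The resulting index maps keyword -> list of chunk file names, e.g.
+     # {"solar": ["2024-06-01-12-00-00-ab12cd34.json"], ...}; recall_from_memory below uses
+     # it to translate the LLM-selected keywords into files to fetch.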
+
+     def recall_from_memory(self, query: str) -> str:
+         log("Recalling from memory...")
+         index = self._hf_download_json(Config.MEMORY_INDEX_PATH, default={})
+         if not index: return "Memory index is empty or could not be loaded."
+
+         relevant_keywords_str = self._run_gpt(PromptLibrary.RECALL_MEMORY, 256, prompt=query, keywords=list(index.keys()))
+         try:
+             relevant_keywords = json.loads(relevant_keywords_str)
+         except json.JSONDecodeError: return "Could not determine relevant keywords from memory."
+
+         if not relevant_keywords: return "Found no relevant information in memory for that query."
+
+         # Fetch data from relevant files
+         matched_files, fetched_data = set(), []
+         for k in relevant_keywords:
+             for fname in index.get(k.lower().strip(), []): matched_files.add(fname)
+
+         for fname in list(matched_files)[:5]: # Limit fetches
+             data = self._hf_download_json(f"{Config.MEMORY_DATA_PATH}/{fname}", default={})
+             fetched_data.append(data)
+
+         return f"Recalled {len(fetched_data)} entries from memory:\n\n{json.dumps(fetched_data, indent=2)}"
 
  # --- GRADIO APPLICATION ---
  class GradioApp:
          self.app = self._build_ui()
 
      def _build_ui(self) -> gr.Blocks:
+         with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Maestro AI Engine") as app:
              session_id = gr.State(lambda: secrets.token_hex(16))
+
              gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
 
              with gr.Tabs():
+                 with gr.TabItem("⚙️ Ingestion & Synthesis"):
                      with gr.Row():
                          with gr.Column(scale=3):
+                             task_instructions = gr.Textbox(label="Primary Task / Instructions", placeholder="e.g., 'Summarize the key findings regarding renewable energy adoption'")
+                             with gr.Tabs():
+                                 with gr.TabItem("Text Input"): text_input = gr.Textbox(lines=10)
+                                 with gr.TabItem("File Upload"): file_upload = gr.File(label="Upload Files (.pdf, .txt)", file_count="multiple", type="filepath")
+                                 with gr.TabItem("Web URL"): url_input = gr.Textbox(label="URL")
+                                 with gr.TabItem("PDF URL"): pdf_url_input = gr.Textbox(label="Single PDF URL")
+                                 with gr.TabItem("Batch PDF URLs"): pdf_batch_input = gr.Textbox(label="Comma-separated PDF URLs", lines=3)
+                         with gr.Column(scale=1):
+                             gr.Markdown("### Processing Options")
+                             summarize_check = gr.Checkbox(label="Create JSON Summary", value=True)
+                             report_check = gr.Checkbox(label="Generate Full Report (requires summary)", value=False)
+                             memory_check = gr.Checkbox(label="Save to Persistent Memory", value=False)
+                             process_button = gr.Button("🚀 Process & Synthesize", variant="primary", scale=2)
+
+                     gr.Markdown("### Results")
                      with gr.Row():
+                         final_report_output = gr.Markdown(label="Final Report")
+                         json_summary_output = gr.JSON(label="JSON Summaries")
+
+                 with gr.TabItem("🔎 Memory Recall"):
+                     memory_query = gr.Textbox(label="Query Persistent Memory", placeholder="e.g., 'What do we know about market trends in 2024?'")
+                     recall_button = gr.Button("Recall", variant="primary")
+                     memory_output = gr.Textbox(label="Recalled Information", lines=20, interactive=False)
 
+             process_button.click(self._synthesis_workflow, [task_instructions, text_input, file_upload, url_input, pdf_url_input, pdf_batch_input, summarize_check, report_check, memory_check], [final_report_output, json_summary_output])
+             recall_button.click(self.engine.recall_from_memory, [memory_query], [memory_output])
+         return app
 
+     def _synthesis_workflow(self, task, text, files, url, pdf_url, pdf_batch, do_sum, do_rep, do_mem):
+         log("Starting synthesis workflow...")
+         # 1. Ingest Data
+         ingested_text, errors = self.engine.process_data_sources(text, files, url, pdf_url, pdf_batch)
+         if errors:
+             log(f"Ingestion errors: {errors}")
+             # For simplicity, errors are only surfaced in the log. A real app might have a dedicated error box.
+
+         if not ingested_text:
+             return "No data was successfully ingested. Please check your inputs and logs.", None
+
+         # 2. Save to Memory (if requested)
+         if do_mem:
+             self.engine.save_to_memory(ingested_text, task)
+             # Note: this call runs synchronously and blocks the UI until the save finishes;
+             # it is not a true background task.
+
+         # 3. Summarize and Report
+         if do_sum or do_rep:
+             report, summaries = self.engine.synthesis_workflow(ingested_text, task, do_sum, do_rep)
+             return report, summaries
+
+         return "Processing complete. No synthesis option was selected.", None
 
      def launch(self): self.app.launch(debug=Config.VERBOSE, share=False)
 
  if __name__ == "__main__":
      if not Config.HF_TOKEN:
+         print("FATAL: HF_TOKEN environment variable not set.")
      else:
+         log("Instantiating Maestro Engine...")
+         engine = MaestroEngine()
+         app = GradioApp(engine)
+         log("Launching Gradio App...")
+         app.launch()