acecalisto3 commited on
Commit
b115530
·
verified ·
1 Parent(s): 9f57626

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -128
app.py CHANGED
@@ -1,6 +1,7 @@
1
  # app.py
2
 
3
  import os
 
4
  import requests
5
  import uuid
6
  import datetime
@@ -10,7 +11,8 @@ import shutil
10
  import secrets
11
  import time
12
  import json
13
- from typing import List, Tuple, Any, Dict
 
14
 
15
  # Third-party libraries
16
  import gradio as gr
@@ -24,34 +26,29 @@ from agent_prompt import PromptLibrary, SystemAuditor
24
 
25
  # --- CONFIGURATION ---
26
  class Config:
27
- """Centralized configuration for the Maestro application."""
28
  HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
29
  HF_TOKEN = os.getenv("HF_TOKEN")
30
  VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
31
- MAX_NEW_TOKENS_REPORT = 4096
32
- MAX_NEW_TOKENS_CHAT = 1024
33
  REQUESTS_TIMEOUT = 15
34
  USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
35
-
36
  # --- UTILITIES ---
37
  def log(message: str) -> None:
38
  if Config.VERBOSE:
39
- print(f"[LOG] {datetime.datetime.now(datetime.timezone.utc).isoformat()} - {message}")
40
 
41
  class SessionManager:
42
- """A context manager for creating and cleaning up session-specific temporary directories."""
43
  def __init__(self, session_id: str):
44
  self.session_id = session_id
45
  self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
46
-
47
  def __enter__(self) -> str:
48
  os.makedirs(self.temp_dir, exist_ok=True)
49
- log(f"Session '{self.session_id}' started. Temp dir: {self.temp_dir}")
50
  return self.temp_dir
51
-
52
  def __exit__(self, exc_type, exc_val, exc_tb):
53
  shutil.rmtree(self.temp_dir, ignore_errors=True)
54
- log(f"Session '{self.session_id}' ended. Temp dir cleaned up.")
55
 
56
  # --- CORE APPLICATION ENGINE ---
57
  class MaestroEngine:
@@ -65,104 +62,174 @@ class MaestroEngine:
65
  nltk.download('punkt', quiet=True)
66
  log("MaestroEngine initialized.")
67
 
 
68
  def _read_pdf(self, file_path: str) -> str:
69
  try:
70
  reader = PdfReader(file_path)
71
  return "\n".join(page.extract_text() or "" for page in reader.pages)
72
- except Exception as e:
73
- log(f"Error reading PDF {os.path.basename(file_path)}: {e}")
74
- return f"Error reading PDF: {e}"
75
 
76
  def _process_zip(self, zip_path: str, temp_dir: str) -> str:
 
77
  extracted_texts = []
78
  extract_path = os.path.join(temp_dir, "zip_extract")
79
  os.makedirs(extract_path, exist_ok=True)
80
  try:
81
  with zipfile.ZipFile(zip_path, 'r') as zf:
82
  for member in zf.infolist():
83
- if member.filename.endswith('.pdf'):
84
- zf.extract(member, extract_path)
85
- extracted_texts.append(self._read_pdf(os.path.join(extract_path, member.filename)))
86
  elif member.filename.endswith('.txt'):
87
- extracted_texts.append(zf.read(member).decode('utf-8', errors='ignore'))
88
  return "\n\n".join(extracted_texts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
- log(f"Error processing ZIP {os.path.basename(zip_path)}: {e}")
91
- return f"Error processing ZIP: {e}"
92
 
93
- def process_data_sources(self, session_id: str, url: str, text: str, files: List[Any]) -> Tuple[str, List[str]]:
94
- """Orchestrates data ingestion from all provided sources."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  all_content, errors = [], []
96
  with SessionManager(session_id) as temp_dir:
97
- if url:
98
- try:
99
- response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
100
- response.raise_for_status()
101
- all_content.append(BeautifulSoup(response.content, 'html.parser').get_text(separator="\n", strip=True))
102
- except Exception as e:
103
- errors.append(f"URL Fetch Error: {e}")
104
- if text:
105
- all_content.append(text)
106
- if files:
107
- for file_obj in files:
108
- file_path = os.path.join(temp_dir, os.path.basename(file_obj.name))
109
- with open(file_path, "wb") as f:
110
- shutil.copyfileobj(file_obj, f)
111
- ext = os.path.splitext(file_obj.name)[1].lower()
112
- if ext == '.pdf':
113
- all_content.append(self._read_pdf(file_path))
114
  elif ext == '.txt':
115
- all_content.append(open(file_path, 'r', encoding='utf-8').read())
116
- elif ext == '.zip':
117
- all_content.append(self._process_zip(file_path, temp_dir))
118
- else:
119
- errors.append(f"Unsupported file type: {file_obj.name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  return "\n\n---\n\n".join(all_content), errors
121
 
122
- def _query_llm(self, prompt: str, max_tokens: int) -> str:
 
 
123
  try:
124
- response = self.client.text_generation(prompt, max_new_tokens=max_tokens, temperature=0.7, top_p=0.95)
125
- return response.strip()
126
  except Exception as e:
127
- log(f"LLM query failed: {e}")
128
- return f"Error communicating with the model: {e}"
129
 
130
- def run_rag_query(self, query: str, context: str) -> str:
131
- prompt = f"Context:\n---\n{context}\n---\nBased only on the context provided, answer the following question:\nQuestion: {query}"
132
- return self._query_llm(prompt, Config.MAX_NEW_TOKENS_CHAT)
 
 
 
 
133
 
134
  def generate_report(self, report_type: str, context: str, objective: str) -> str:
 
 
 
 
135
  if report_type == "Narrative Prose Report":
136
- prompt = PromptLibrary.NARRATIVE_PROSE_REPORT.format(
137
- task_objective=objective,
138
- knowledge_base=context
139
- )
140
- return self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
141
- elif report_type == "Technical JSON Report":
142
- prompt = PromptLibrary.TECHNICAL_JSON_REPORT.format(
143
- task_objective=objective,
144
- baseline_knowledge="Previously established facts",
145
- new_information=context
146
- )
147
- raw_response = self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
148
- clean_json_str = raw_response.replace("```json", "").replace("```", "").strip()
149
- try:
150
- return json.dumps(json.loads(clean_json_str), indent=2)
151
- except json.JSONDecodeError:
152
- log(f"Failed to parse LLM response as JSON. Raw response: {raw_response}")
153
- return '{"error": "The model did not return valid JSON.", "raw_response": ' + json.dumps(raw_response) + '}'
154
- return "Invalid report type selected."
155
 
156
  # --- GRADIO APPLICATION ---
157
  class GradioApp:
158
- """Manages the Gradio UI and application workflow."""
159
  def __init__(self, engine: MaestroEngine):
160
  self.engine = engine
161
  self.app = self._build_ui()
162
 
163
  def _build_ui(self) -> gr.Blocks:
164
- with gr.Blocks(theme=gr.themes.Monochrome(), title="Maestro AI Engine") as app:
165
- # State management
166
  session_id = gr.State(lambda: secrets.token_hex(16))
167
  processed_data = gr.State("")
168
 
@@ -170,88 +237,81 @@ class GradioApp:
170
 
171
  with gr.Tabs():
172
  with gr.TabItem("① Data Ingestion"):
 
173
  with gr.Row():
174
- with gr.Column():
175
- url_input = gr.Textbox(label="Scrape from URL")
176
- text_input = gr.Textbox(label="Paste Text", lines=10)
177
- file_upload = gr.Files(label="Upload Files (.pdf, .txt, .zip)", type="file")
 
 
178
  process_button = gr.Button("🚀 Process All Sources", variant="primary")
179
- ingestion_summary = gr.Textbox(label="Ingestion Summary", interactive=False)
180
- error_log = gr.Textbox(label="Errors", interactive=False)
181
 
182
  with gr.TabItem("② Reporting & Synthesis"):
183
- report_objective = gr.Textbox(label="Report Objective", placeholder="e.g., 'Synthesize findings on AI in agriculture'")
184
- report_type = gr.Dropdown(label="Select Report Type", choices=["Narrative Prose Report", "Technical JSON Report"])
185
- generate_button = gr.Button("Generate Report", variant="primary")
186
  with gr.Tabs():
187
- with gr.TabItem("Narrative Output"):
188
- report_output_md = gr.Markdown()
189
- with gr.TabItem("JSON Output"):
190
- report_output_json = gr.JSON()
191
 
192
  with gr.TabItem("③ Direct Chat Q&A"):
193
- chatbot = gr.Chatbot(label="Chat Interface", height=550)
194
- msg_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the processed data...")
195
- msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data], [msg_input, chatbot])
196
 
197
- # --- Workflow Connections ---
198
- process_button.click(self._ingest_workflow, [session_id, url_input, text_input, file_upload], [processed_data, ingestion_summary, error_log])
199
-
200
- # **FIXED LINE**
201
- # Pass session_id to the reporting workflow instead of the non-existent auditor state
202
- generate_button.click(self._reporting_workflow, [session_id, report_type, processed_data, report_objective], [report_output_md, report_output_json])
203
 
204
  return app
205
 
206
- def _ingest_workflow(self, s_id, url, text, files):
 
207
  log(f"Starting ingestion for session {s_id}...")
208
- data, errors = self.engine.process_data_sources(s_id, url, text, files)
209
- summary = f"Processing complete. {len(data)} characters ingested. {len(errors)} errors encountered."
210
- return data, summary, "\n".join(errors)
211
 
212
- def _chat_workflow(self, message, history, context):
213
  if not context:
214
  history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
215
  return "", history
216
- response = self.engine.run_rag_query(message, context)
217
- history.append((message, response))
218
- return "", history
219
-
220
- # **FIXED METHOD**
221
- # The method now accepts session_id (s_id) and creates the auditor instance internally.
222
- def _reporting_workflow(self, s_id: str, r_type: str, context: str, objective: str):
223
- auditor = SystemAuditor(session_id=s_id) # Create auditor instance on-the-fly
224
 
 
 
 
225
  if not context:
226
- md_out = "### Error: No data has been ingested. Please process data in Tab 1 first."
227
- log(auditor.format_prompt_log(f"REPORTING FAILED: No context data. Objective: {objective}"))
228
- return md_out, None
229
-
230
  start_time = time.time()
231
- log(auditor.format_prompt_log(f"Generating report of type '{r_type}' with objective: '{objective}'"))
232
  response = self.engine.generate_report(r_type, context, objective)
233
  latency = (time.time() - start_time) * 1000
234
-
235
- log(auditor.format_response_log(response, latency, 1, 0.95)) # Log the event
236
 
237
  if r_type == "Narrative Prose Report":
238
- return response, None
239
- elif r_type == "Technical JSON Report":
240
- try:
241
- # The engine already returns a JSON string or an error object
242
- json_data = json.loads(response)
243
- return None, json_data
244
- except json.JSONDecodeError:
245
- return None, {"error": "Failed to decode the final JSON response from the engine."}
246
 
247
- def launch(self):
248
- self.app.launch(debug=Config.VERBOSE)
249
 
250
- # --- MAIN EXECUTION BLOCK ---
251
  if __name__ == "__main__":
252
  if not Config.HF_TOKEN:
253
  print("FATAL: Hugging Face token (HF_TOKEN) not found in environment variables.")
254
- print("Please set your token, e.g., `export HF_TOKEN='hf_...'`")
255
  else:
256
  log("Instantiating Maestro Engine and launching Gradio App...")
257
  maestro_engine = MaestroEngine()
 
1
  # app.py
2
 
3
  import os
4
+ import re
5
  import requests
6
  import uuid
7
  import datetime
 
11
  import secrets
12
  import time
13
  import json
14
+ from typing import List, Tuple, Any, Dict, Set, Generator
15
+ from urllib.parse import urljoin, urlparse
16
 
17
  # Third-party libraries
18
  import gradio as gr
 
26
 
27
  # --- CONFIGURATION ---
28
  class Config:
 
29
  HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
30
  HF_TOKEN = os.getenv("HF_TOKEN")
31
  VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
32
+ MAX_NEW_TOKENS_REPORT = 8192
33
+ MAX_NEW_TOKENS_CHAT = 2048
34
  REQUESTS_TIMEOUT = 15
35
  USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
36
+
37
  # --- UTILITIES ---
38
  def log(message: str) -> None:
39
  if Config.VERBOSE:
40
+ print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
41
 
42
  class SessionManager:
43
+ # ... (No changes from previous version)
44
  def __init__(self, session_id: str):
45
  self.session_id = session_id
46
  self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
 
47
  def __enter__(self) -> str:
48
  os.makedirs(self.temp_dir, exist_ok=True)
 
49
  return self.temp_dir
 
50
  def __exit__(self, exc_type, exc_val, exc_tb):
51
  shutil.rmtree(self.temp_dir, ignore_errors=True)
 
52
 
53
  # --- CORE APPLICATION ENGINE ---
54
  class MaestroEngine:
 
62
  nltk.download('punkt', quiet=True)
63
  log("MaestroEngine initialized.")
64
 
65
+ # --- File Processors ---
66
  def _read_pdf(self, file_path: str) -> str:
67
  try:
68
  reader = PdfReader(file_path)
69
  return "\n".join(page.extract_text() or "" for page in reader.pages)
70
+ except Exception as e: return f"Error reading PDF {os.path.basename(file_path)}: {e}"
 
 
71
 
72
  def _process_zip(self, zip_path: str, temp_dir: str) -> str:
73
+ # ... (Identical to previous correct version)
74
  extracted_texts = []
75
  extract_path = os.path.join(temp_dir, "zip_extract")
76
  os.makedirs(extract_path, exist_ok=True)
77
  try:
78
  with zipfile.ZipFile(zip_path, 'r') as zf:
79
  for member in zf.infolist():
80
+ if member.is_dir() or member.filename.startswith('__MACOSX'): continue
81
+ member_path = zf.extract(member, path=extract_path)
82
+ if member.filename.endswith('.pdf'): extracted_texts.append(self._read_pdf(member_path))
83
  elif member.filename.endswith('.txt'):
84
+ with open(member_path, 'r', encoding='utf-8', errors='ignore') as f: extracted_texts.append(f.read())
85
  return "\n\n".join(extracted_texts)
86
+ except Exception as e: return f"Error processing ZIP {os.path.basename(zip_path)}: {e}"
87
+
88
+ # --- Google Workspace Integration ---
89
+ def _get_google_drive_id(self, url: str) -> str | None:
90
+ match = re.search(r"/file/d/([a-zA-Z0-9_-]+)", url)
91
+ return match.group(1) if match else None
92
+
93
+ def _download_google_drive_file(self, file_id: str, temp_dir: str) -> Tuple[str | None, str | None]:
94
+ URL = "https://docs.google.com/uc?export=download&id="
95
+ try:
96
+ response = requests.get(URL + file_id, stream=True, timeout=30)
97
+ response.raise_for_status()
98
+ disposition = response.headers.get('content-disposition', '')
99
+ fname = re.findall('filename="(.+)"', disposition)
100
+ filename = fname[0] if fname else f"{file_id}_download"
101
+ file_path = os.path.join(temp_dir, filename)
102
+ with open(file_path, "wb") as f:
103
+ for chunk in response.iter_content(chunk_size=8192):
104
+ f.write(chunk)
105
+ return file_path, None
106
  except Exception as e:
107
+ return None, f"Failed to download Google Drive file ({file_id}): {e}"
 
108
 
109
+ def _fetch_google_doc(self, url: str) -> Tuple[str | None, str | None]:
110
+ try:
111
+ doc_id = re.search(r"/document/d/([a-zA-Z0-9_-]+)", url).group(1)
112
+ export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
113
+ response = requests.get(export_url, timeout=Config.REQUESTS_TIMEOUT)
114
+ response.raise_for_status()
115
+ return response.text, None
116
+ except Exception as e: return None, f"Failed to fetch Google Doc: {e}"
117
+
118
+ # --- Web Crawler ---
119
+ def _crawl_website(self, start_url: str, max_depth: int) -> Tuple[str, List[str]]:
120
+ if not start_url: return "", []
121
+ visited: Set[str] = set()
122
+ to_visit: List[Tuple[str, int]] = [(start_url, 0)]
123
+ all_text, errors = [], []
124
+ base_netloc = urlparse(start_url).netloc
125
+
126
+ while to_visit and len(visited) < 50: # Safety break
127
+ current_url, depth = to_visit.pop(0)
128
+ if current_url in visited or depth > max_depth: continue
129
+
130
+ log(f"Crawling (depth {depth}): {current_url}")
131
+ visited.add(current_url)
132
+
133
+ try:
134
+ response = requests.get(current_url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
135
+ response.raise_for_status()
136
+ soup = BeautifulSoup(response.content, 'html.parser')
137
+ all_text.append(soup.get_text(separator="\n", strip=True))
138
+
139
+ if depth < max_depth:
140
+ for link in soup.find_all('a', href=True):
141
+ abs_url = urljoin(current_url, link['href'])
142
+ if urlparse(abs_url).netloc == base_netloc and abs_url not in visited:
143
+ to_visit.append((abs_url, depth + 1))
144
+ except Exception as e:
145
+ errors.append(f"Crawl Error on {current_url}: {e}")
146
+ return "\n\n".join(all_text), errors
147
+
148
+ # --- Main Orchestrator ---
149
+ def process_data_sources(self, session_id: str, url: str, crawl_depth: int, text: str, file_paths: List[str]) -> Tuple[str, List[str]]:
150
  all_content, errors = [], []
151
  with SessionManager(session_id) as temp_dir:
152
+ # 1. Direct Text
153
+ if text: all_content.append(text)
154
+
155
+ # 2. Uploaded Files
156
+ if file_paths:
157
+ for path in file_paths:
158
+ if not path: continue
159
+ filename = os.path.basename(path)
160
+ ext = os.path.splitext(filename)[1].lower()
161
+ if ext == '.pdf': all_content.append(self._read_pdf(path))
 
 
 
 
 
 
 
162
  elif ext == '.txt':
163
+ with open(path, 'r', encoding='utf-8', errors='ignore') as f: all_content.append(f.read())
164
+ elif ext == '.zip': all_content.append(self._process_zip(path, temp_dir))
165
+ else: errors.append(f"Unsupported file type: {filename}")
166
+
167
+ # 3. URL Processing (Crawler, Google Drive, Google Docs)
168
+ if url:
169
+ if "drive.google.com/file/d/" in url:
170
+ file_id = self._get_google_drive_id(url)
171
+ if file_id:
172
+ path, err = self._download_google_drive_file(file_id, temp_dir)
173
+ if err: errors.append(err)
174
+ elif path: all_content.append(self._read_pdf(path)) # Assuming PDF for simplicity
175
+ else: errors.append(f"Invalid Google Drive URL: {url}")
176
+ elif "docs.google.com/document/d/" in url:
177
+ content, err = self._fetch_google_doc(url)
178
+ if err: errors.append(err)
179
+ else: all_content.append(content)
180
+ else: # Standard web crawling
181
+ content, crawl_errors = self._crawl_website(url, crawl_depth)
182
+ all_content.append(content)
183
+ errors.extend(crawl_errors)
184
+
185
  return "\n\n---\n\n".join(all_content), errors
186
 
187
+ # --- LLM Interaction ---
188
+ def _query_llm_stream(self, system_prompt: str, user_prompt: str, max_tokens: int) -> Generator[str, None, None]:
189
+ full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
190
  try:
191
+ for token in self.client.text_generation(full_prompt, max_new_tokens=max_tokens, stream=True, temperature=0.7, top_p=0.95):
192
+ yield token
193
  except Exception as e:
194
+ log(f"LLM stream query failed: {e}")
195
+ yield f"Error communicating with the model: {e}"
196
 
197
+ def run_rag_query(self, query: str, context: str, purpose: str) -> Generator[str, None, None]:
198
+ system_prompt = PromptLibrary.AGENT_PREFIX.format(
199
+ dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
200
+ user_purpose=purpose or "Direct Question & Answer"
201
+ )
202
+ user_prompt = f"Based *only* on the context provided below, answer the user's question.\n\nCONTEXT:\n---\n{context}\n---\n\nQUESTION: {query}"
203
+ yield from self._query_llm_stream(system_prompt, user_prompt, Config.MAX_NEW_TOKENS_CHAT)
204
 
205
  def generate_report(self, report_type: str, context: str, objective: str) -> str:
206
+ system_prompt = PromptLibrary.AGENT_PREFIX.format(
207
+ dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
208
+ user_purpose=objective
209
+ )
210
  if report_type == "Narrative Prose Report":
211
+ user_prompt = PromptLibrary.NARRATIVE_PROSE_REPORT.format(task_objective=objective, knowledge_base=context)
212
+ else: # Technical JSON Report
213
+ user_prompt = PromptLibrary.TECHNICAL_JSON_REPORT.format(task_objective=objective, knowledge_base=context)
214
+
215
+ # Reports are not streamed
216
+ full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
217
+ response = self.client.text_generation(full_prompt, max_new_tokens=Config.MAX_NEW_TOKENS_REPORT)
218
+
219
+ if report_type == "Technical JSON Report":
220
+ clean_json_str = re.sub(r'```json\s*|\s*```', '', response).strip()
221
+ try: return json.dumps(json.loads(clean_json_str), indent=2)
222
+ except json.JSONDecodeError: return json.dumps({"error": "Model returned invalid JSON", "raw_response": response})
223
+ return response
 
 
 
 
 
 
224
 
225
  # --- GRADIO APPLICATION ---
226
  class GradioApp:
 
227
  def __init__(self, engine: MaestroEngine):
228
  self.engine = engine
229
  self.app = self._build_ui()
230
 
231
  def _build_ui(self) -> gr.Blocks:
232
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="emerald", secondary_hue="green"), title="Maestro AI Engine") as app:
 
233
  session_id = gr.State(lambda: secrets.token_hex(16))
234
  processed_data = gr.State("")
235
 
 
237
 
238
  with gr.Tabs():
239
  with gr.TabItem("① Data Ingestion"):
240
+ user_purpose = gr.Textbox(label="High-Level Goal / Purpose", placeholder="e.g., 'Research AI impact on agriculture for a market report'")
241
  with gr.Row():
242
+ with gr.Column(scale=3):
243
+ url_input = gr.Textbox(label="Ingest from URL", placeholder="Enter a standard URL, Google Drive, or Google Docs link")
244
+ text_input = gr.Textbox(label="Paste Text", lines=8)
245
+ with gr.Column(scale=2):
246
+ crawl_depth = gr.Slider(label="Web Crawl Depth", minimum=0, maximum=5, value=1, step=1, info="For standard URLs only. 0=current page, 1=page + links, etc.")
247
+ file_upload = gr.File(label="Upload Files (.pdf, .txt, .zip)", file_count="multiple", type="filepath")
248
  process_button = gr.Button("🚀 Process All Sources", variant="primary")
249
+ ingestion_summary = gr.Textbox(label="Ingestion Summary", interactive=False, lines=2)
250
+ error_log = gr.Textbox(label="Errors & Warnings", interactive=False, lines=3)
251
 
252
  with gr.TabItem("② Reporting & Synthesis"):
253
+ with gr.Row():
254
+ report_type = gr.Dropdown(label="Select Report Type", choices=["Narrative Prose Report", "Technical JSON Report"], value="Narrative Prose Report")
255
+ generate_button = gr.Button("Generate Report", variant="primary")
256
  with gr.Tabs():
257
+ with gr.TabItem("Narrative Output"): report_output_md = gr.Markdown()
258
+ with gr.TabItem("JSON Output"): report_output_json = gr.JSON()
 
 
259
 
260
  with gr.TabItem("③ Direct Chat Q&A"):
261
+ chatbot = gr.Chatbot(label="Chat Interface", height=550, bubble_full_width=False)
262
+ msg_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the processed data...", scale=4)
263
+ msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data, user_purpose], [msg_input, chatbot])
264
 
265
+ process_button.click(self._ingest_workflow, [session_id, url_input, crawl_depth, text_input, file_upload, process_button], [processed_data, ingestion_summary, error_log, process_button])
266
+ generate_button.click(self._reporting_workflow, [session_id, report_type, processed_data, user_purpose, generate_button], [report_output_md, report_output_json, generate_button])
 
 
 
 
267
 
268
  return app
269
 
270
+ def _ingest_workflow(self, s_id, url, depth, text, files, btn):
271
+ yield gr.update(value="⚙️ Processing...", interactive=False)
272
  log(f"Starting ingestion for session {s_id}...")
273
+ data, errors = self.engine.process_data_sources(s_id, url, depth, text, files)
274
+ summary = f"Processing complete. {len(data):,} characters ingested from all sources. {len(errors)} errors encountered."
275
+ yield data, summary, "\n".join(errors), gr.update(value="🚀 Process All Sources", interactive=True)
276
 
277
+ def _chat_workflow(self, message, history, context, purpose):
278
  if not context:
279
  history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
280
  return "", history
281
+
282
+ history.append((message, ""))
283
+ log(f"Starting RAG query. Purpose: {purpose}")
284
+ full_response = ""
285
+ for token in self.engine.run_rag_query(message, context, purpose):
286
+ full_response += token
287
+ history[-1] = (message, full_response)
288
+ yield "", history
289
 
290
+ def _reporting_workflow(self, s_id, r_type, context, objective, btn):
291
+ yield gr.update(value="Generating...", interactive=False), None, None
292
+ auditor = SystemAuditor(session_id=s_id)
293
  if not context:
294
+ md_out = "### Error: No data ingested. Please process sources in Tab 1."
295
+ yield gr.update(value="Generate Report", interactive=True), md_out, None
296
+ return
297
+
298
  start_time = time.time()
299
+ log(auditor.format_prompt_log(f"Generating report: '{r_type}'"))
300
  response = self.engine.generate_report(r_type, context, objective)
301
  latency = (time.time() - start_time) * 1000
302
+ log(auditor.format_response_log(response, latency, 1, 0.95))
 
303
 
304
  if r_type == "Narrative Prose Report":
305
+ yield gr.update(interactive=True), response, None
306
+ else: # Technical JSON Report
307
+ try: yield gr.update(interactive=True), None, json.loads(response)
308
+ except json.JSONDecodeError: yield gr.update(interactive=True), None, {"error": "Could not parse JSON from model."}
 
 
 
 
309
 
310
+ def launch(self): self.app.launch(debug=Config.VERBOSE, share=False)
 
311
 
 
312
  if __name__ == "__main__":
313
  if not Config.HF_TOKEN:
314
  print("FATAL: Hugging Face token (HF_TOKEN) not found in environment variables.")
 
315
  else:
316
  log("Instantiating Maestro Engine and launching Gradio App...")
317
  maestro_engine = MaestroEngine()