Update app.py

app.py CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 import os
 import re
 import requests
@@ -16,45 +14,103 @@ from urllib.parse import urljoin, urlparse
 
 # Third-party libraries
 import gradio as gr
-from huggingface_hub import InferenceClient
+from huggingface_hub import InferenceClient, HfApi, hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
 from pypdf import PdfReader
 from bs4 import BeautifulSoup
 import nltk
 
-# Local imports from the enhanced prompt library
-from agent_prompt import PromptLibrary, SystemAuditor
-
 # --- CONFIGURATION ---
 class Config:
+    """Centralized configuration for the Maestro application."""
     HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
     HF_TOKEN = os.getenv("HF_TOKEN")
+    HF_DATASET_REPO = "Omnibus/tmp"  # As specified in the user's script
+    MEMORY_MAIN_PATH = "mem-test2/main.json"
+    MEMORY_INDEX_PATH = "mem-test2/index.json"
+    MEMORY_DATA_PATH = "mem-test2"
     VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
-
-
-
+    MAX_TOKENS_SYNTHESIS = 4096
+    MAX_TOKENS_REPORT = 8192
+    MAX_TOKENS_CHAT = 2048
+    MAX_DATA_CHUNK = 20000  # For processing large text bodies
+    REQUESTS_TIMEOUT = 20
     USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
-
+
+# --- PROMPT LIBRARY (Integrated for simplicity) ---
+class PromptLibrary:
+    """A centralized library of meticulously crafted prompt templates."""
+    AGENT_PREFIX = """
+You are Maestro, an Expert Information Retrieval and Synthesis Agent. Your operation is governed by these directives:
+1. Ethical Safeguard [v2.4]: Refuse to process harmful, illegal, or unethical requests.
+2. Temporal Awareness: Use the timestamp {dynamic_timestamp_utc} to evaluate data relevance.
+3. Contextual Prioritization: Analyze the user's purpose '{user_purpose}' to weigh data relevance.
+"""
+    COMPRESS_JSON = """
+Task: {task}
+Based on the AGENT_PREFIX context and the following data, generate a structured and concise JSON summary.
+
+Input Data Chunk:
+---
+{history}
+---
+
+Existing Knowledge (for context):
+---
+{knowledge}
+---
+
+Instructions:
+Compile and categorize the data above into a JSON dictionary string. Extract key information, group related entities, and ensure the output is a single, valid JSON object.
+"""
+    COMPRESS_REPORT = """
+Task: {task}
+Based on the AGENT_PREFIX context and the summarized knowledge you have, compile a detailed, exhaustive report (~8000 words).
+
+Summarized Knowledge:
+---
+{knowledge}
+---
+
+Last Chunk of Raw Data (for final context):
+---
+{history}
+---
+
+Instructions:
+Synthesize all provided information into a single, comprehensive narrative. Be thorough, detailed, and structure the report with clear headings and sections.
+"""
+    SAVE_MEMORY = """
+Task: {task}
+Data:
+---
+{history}
+---
+Instructions:
+Compile and categorize the data above into a JSON dictionary string. Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format.
+Required keys: "keywords", "title", "description", "content", "url". The "keywords" list should be comprehensive.
+"""
+    RECALL_MEMORY = """
+The user will give you a query and a list of keywords from a database index.
+Your duty is to choose the words from the list that are most closely related to the search query.
+If no keywords are relevant, return an empty list: [].
+Respond only with a single, valid JSON list of strings.
+
+USER QUERY: {prompt}
+KEYWORD LIST: {keywords}
+"""
+
 # --- UTILITIES ---
 def log(message: str) -> None:
-    if Config.VERBOSE:
-        print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
-
-class SessionManager:
-    # ... (No changes from previous version)
-    def __init__(self, session_id: str):
-        self.session_id = session_id
-        self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-    def __enter__(self) -> str:
-        os.makedirs(self.temp_dir, exist_ok=True)
-        return self.temp_dir
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        shutil.rmtree(self.temp_dir, ignore_errors=True)
+    if Config.VERBOSE: print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
 
 # --- CORE APPLICATION ENGINE ---
 class MaestroEngine:
-    """Handles all data processing and LLM interaction
+    """Handles all data processing, memory management, and LLM interaction."""
     def __init__(self):
+        if not Config.HF_TOKEN: raise ValueError("HF_TOKEN environment variable not set!")
         self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
+        self.api = HfApi(token=Config.HF_TOKEN)
         try:
             nltk.data.find("tokenizers/punkt")
         except LookupError:
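
Note on the new PromptLibrary: each template is a plain str.format() string, so every caller has to supply exactly the placeholders the template names ({task}, {history}, {knowledge} for the compress/save templates; {prompt} and {keywords} for recall). A minimal self-check sketch, using an abridged stand-in for the real template text:

import string

# Abridged stand-in for PromptLibrary.COMPRESS_JSON (the real template is longer).
COMPRESS_JSON = "Task: {task}\nInput Data Chunk:\n{history}\nExisting Knowledge:\n{knowledge}"

# Placeholder names that .format(**kwargs) will demand from the caller.
fields = {name for _, name, _, _ in string.Formatter().parse(COMPRESS_JSON) if name}
assert fields == {"task", "history", "knowledge"}
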
@@ -62,165 +118,177 @@ class MaestroEngine:
             nltk.download('punkt', quiet=True)
         log("MaestroEngine initialized.")
 
-    # ---
-    def
+    # --- Data Ingestion ---
+    def _read_pdf_from_path(self, path: str) -> str:
         try:
-
-
-        except Exception as e: return f"Error reading PDF {os.path.basename(file_path)}: {e}"
-
-    def _process_zip(self, zip_path: str, temp_dir: str) -> str:
-        # ... (Identical to previous correct version)
-        extracted_texts = []
-        extract_path = os.path.join(temp_dir, "zip_extract")
-        os.makedirs(extract_path, exist_ok=True)
-        try:
-            with zipfile.ZipFile(zip_path, 'r') as zf:
-                for member in zf.infolist():
-                    if member.is_dir() or member.filename.startswith('__MACOSX'): continue
-                    member_path = zf.extract(member, path=extract_path)
-                    if member.filename.endswith('.pdf'): extracted_texts.append(self._read_pdf(member_path))
-                    elif member.filename.endswith('.txt'):
-                        with open(member_path, 'r', encoding='utf-8', errors='ignore') as f: extracted_texts.append(f.read())
-            return "\n\n".join(extracted_texts)
-        except Exception as e: return f"Error processing ZIP {os.path.basename(zip_path)}: {e}"
-
-    # --- Google Workspace Integration ---
-    def _get_google_drive_id(self, url: str) -> str | None:
-        match = re.search(r"/file/d/([a-zA-Z0-9_-]+)", url)
-        return match.group(1) if match else None
+            return "\n".join(page.extract_text() or "" for page in PdfReader(path).pages)
+        except Exception as e: return f"Error reading PDF {os.path.basename(path)}: {e}"
 
-    def
-        URL = "https://docs.google.com/uc?export=download&id="
+    def _read_pdf_from_url(self, url: str) -> str:
         try:
-            response = requests.get(
+            response = requests.get(url, stream=True, timeout=Config.REQUESTS_TIMEOUT)
             response.raise_for_status()
-
-
-
-
-
-
-            f.write(chunk)
-            return file_path, None
-        except Exception as e:
-            return None, f"Failed to download Google Drive file ({file_id}): {e}"
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                tmp_file.write(response.content)
+            return self._read_pdf_from_path(tmp_file.name)
+        except Exception as e: return f"Failed to download or read PDF from {url}: {e}"
+        finally:
+            if 'tmp_file' in locals() and os.path.exists(tmp_file.name): os.remove(tmp_file.name)
 
-    def
+    def _get_web_text(self, url: str) -> str:
         try:
-
-            export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
-            response = requests.get(export_url, timeout=Config.REQUESTS_TIMEOUT)
+            response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
             response.raise_for_status()
-            return response.
-        except Exception as e: return
-
-
-
-        if not start_url: return "", []
-        visited: Set[str] = set()
-        to_visit: List[Tuple[str, int]] = [(start_url, 0)]
-        all_text, errors = [], []
-        base_netloc = urlparse(start_url).netloc
-
-        while to_visit and len(visited) < 50:  # Safety break
-            current_url, depth = to_visit.pop(0)
-            if current_url in visited or depth > max_depth: continue
-
-            log(f"Crawling (depth {depth}): {current_url}")
-            visited.add(current_url)
-
-            try:
-                response = requests.get(current_url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.content, 'html.parser')
-                all_text.append(soup.get_text(separator="\n", strip=True))
-
-                if depth < max_depth:
-                    for link in soup.find_all('a', href=True):
-                        abs_url = urljoin(current_url, link['href'])
-                        if urlparse(abs_url).netloc == base_netloc and abs_url not in visited:
-                            to_visit.append((abs_url, depth + 1))
-            except Exception as e:
-                errors.append(f"Crawl Error on {current_url}: {e}")
-        return "\n\n".join(all_text), errors
-
-    # --- Main Orchestrator ---
-    def process_data_sources(self, session_id: str, url: str, crawl_depth: int, text: str, file_paths: List[str]) -> Tuple[str, List[str]]:
+            return BeautifulSoup(response.content, 'lxml').get_text(separator="\n", strip=True)
+        except Exception as e: return f"Failed to fetch URL {url}: {e}"
+
+    def process_data_sources(self, text: str, files: List[str], url: str, pdf_url: str, pdf_batch: str) -> Tuple[str, List[str]]:
+        """Orchestrates data ingestion from all provided sources."""
         all_content, errors = [], []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if url:
-            if "drive.google.com/file/d/" in url:
-                file_id = self._get_google_drive_id(url)
-                if file_id:
-                    path, err = self._download_google_drive_file(file_id, temp_dir)
-                    if err: errors.append(err)
-                    elif path: all_content.append(self._read_pdf(path))  # Assuming PDF for simplicity
-                else: errors.append(f"Invalid Google Drive URL: {url}")
-            elif "docs.google.com/document/d/" in url:
-                content, err = self._fetch_google_doc(url)
-                if err: errors.append(err)
-                else: all_content.append(content)
-            else:  # Standard web crawling
-                content, crawl_errors = self._crawl_website(url, crawl_depth)
-                all_content.append(content)
-                errors.extend(crawl_errors)
-
+        if text: all_content.append(text)
+        if url: all_content.append(self._get_web_text(url))
+        if pdf_url: all_content.append(self._read_pdf_from_url(pdf_url))
+        if pdf_batch:
+            urls = [u.strip() for u in pdf_batch.split(',') if u.strip()]
+            for u in urls:
+                content = self._read_pdf_from_url(u)
+                if content.startswith("Error"): errors.append(content)
+                else: all_content.append(content)
+        if files:
+            for path in files:
+                if not path: continue
+                filename, ext = os.path.basename(path), os.path.splitext(path)[1].lower()
+                if ext == '.pdf': all_content.append(self._read_pdf_from_path(path))
+                elif ext == '.txt':
+                    with open(path, 'r', encoding='utf-8', errors='ignore') as f: all_content.append(f.read())
+                else: errors.append(f"Unsupported file type: {filename}")
         return "\n\n---\n\n".join(all_content), errors
 
     # --- LLM Interaction ---
-    def
-
-        try:
-            for token in self.client.text_generation(full_prompt, max_new_tokens=max_tokens, stream=True, temperature=0.7, top_p=0.95):
-                yield token
-        except Exception as e:
-            log(f"LLM stream query failed: {e}")
-            yield f"Error communicating with the model: {e}"
-
-    def run_rag_query(self, query: str, context: str, purpose: str) -> Generator[str, None, None]:
+    def _run_gpt(self, prompt_template: str, max_tokens: int, **kwargs) -> str:
+        """Core LLM call function."""
         system_prompt = PromptLibrary.AGENT_PREFIX.format(
             dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
-            user_purpose=
+            user_purpose=kwargs.get('task', 'completing a system task.')
         )
-
-
+        full_prompt = f"<s>[INST] {system_prompt}\n\n{prompt_template.format(**kwargs)} [/INST]"
+        log(f"Running GPT. Template: {prompt_template[:50]}...")
+        try:
+            return self.client.text_generation(full_prompt, max_new_tokens=max_tokens, temperature=0.8, top_p=0.95).strip()
+        except Exception as e:
+            log(f"LLM Error: {e}")
+            return f'{{"error": "LLM call failed", "details": "{e}"}}'
 
-    def
-
-
-
-        )
-
-
-
-
+    def _chunk_and_process(self, text: str, prompt_template: str, task: str, max_tokens: int) -> List[str]:
+        """Chunks large text and processes each chunk with an LLM."""
+        text_len = len(text)
+        if text_len == 0: return []
+        num_chunks = (text_len + Config.MAX_DATA_CHUNK - 1) // Config.MAX_DATA_CHUNK
+        chunk_size = (text_len + num_chunks - 1) // num_chunks
+
+        results, knowledge = [], ""
+        for i in range(num_chunks):
+            chunk = text[i*chunk_size : (i+1)*chunk_size]
+            log(f"Processing chunk {i+1}/{num_chunks}...")
+            resp = self._run_gpt(prompt_template, max_tokens, task=task, knowledge=knowledge, history=chunk)
+            knowledge = resp if len(resp) < 2000 else resp[:2000]  # Use response as context for next chunk
+            results.append(resp)
+        return results
+
+    # --- Synthesis & Reporting Workflow ---
+    def synthesis_workflow(self, text: str, task: str, do_summarize: bool, do_report: bool) -> Tuple[str, List[Dict]]:
+        """Handles the multi-stage summarization and reporting process."""
+        if not text: return "No data to process.", []
+        json_summary_objects, final_report = [], ""
+
+        if do_summarize or do_report:  # Summarization is a prerequisite for reporting
+            log("Starting summarization stage...")
+            summaries = self._chunk_and_process(text, PromptLibrary.COMPRESS_JSON, task, Config.MAX_TOKENS_SYNTHESIS)
+            for s in summaries:
+                try: json_summary_objects.append(json.loads(s))
+                except json.JSONDecodeError: json_summary_objects.append({"error": "Failed to parse summary JSON", "raw": s})
+            log("Summarization stage complete.")
+
+            if do_report:
+                log("Starting report generation stage...")
+                # Use the collected JSON summaries as knowledge for the final report
+                knowledge_for_report = json.dumps(json_summary_objects, indent=2)
+                final_report = self._run_gpt(PromptLibrary.COMPRESS_REPORT, Config.MAX_TOKENS_REPORT, task=task, knowledge=knowledge_for_report, history="All data chunks have been summarized.")
+                log("Report generation complete.")
+                return final_report, json_summary_objects
+
+        return "Summarization complete.", json_summary_objects
+
+    # --- Persistent Memory System ---
+    def _hf_download_json(self, repo_path: str, default: Any = []) -> Any:
+        try:
+            path = hf_hub_download(repo_id=Config.HF_DATASET_REPO, filename=repo_path, repo_type="dataset", token=Config.HF_TOKEN)
+            with open(path, 'r') as f: return json.load(f)
+        except HfHubHTTPError: return default  # File doesn't exist, return default
+        except (json.JSONDecodeError, IOError): return default
+
+    def _hf_upload_json(self, data: Any, repo_path: str):
+        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".json") as tmp_file:
+            json.dump(data, tmp_file, indent=4)
+            tmp_path = tmp_file.name
+        self.api.upload_file(path_or_fileobj=tmp_path, path_in_repo=repo_path, repo_id=Config.HF_DATASET_REPO, repo_type="dataset")
+        os.remove(tmp_path)
+
+    def save_to_memory(self, text: str, task: str) -> List[Dict]:
+        """Saves processed text to the Hugging Face Dataset repo."""
+        log("Starting memory save process...")
+        json_chunks = self._chunk_and_process(text, PromptLibrary.SAVE_MEMORY, task, Config.MAX_TOKENS_SYNTHESIS)
+        parsed_chunks, main_file = [], self._hf_download_json(Config.MEMORY_MAIN_PATH)
+
+        for i, chunk_str in enumerate(json_chunks):
+            try:
+                data = json.loads(chunk_str)
+                ts = datetime.datetime.now(datetime.timezone.utc)
+                filename = f"{ts.strftime('%Y-%m-%d-%H-%M-%S')}-{uuid.uuid4().hex[:8]}.json"
+                self._hf_upload_json(data, f"{Config.MEMORY_DATA_PATH}/{filename}")
+                main_file.append({"file_name": filename, "keywords": data.get("keywords", []), "description": data.get("description", "")})
+                parsed_chunks.append(data)
+            except json.JSONDecodeError: log(f"Could not parse memory chunk {i} into JSON.")
+
+        self._hf_upload_json(main_file, Config.MEMORY_MAIN_PATH)
+        self.update_keyword_index(main_file)
+        log("Memory save complete.")
+        return parsed_chunks
+
+    def update_keyword_index(self, main_file_content: List[Dict]):
+        log("Updating keyword index...")
+        keyword_index = {}
+        for entry in main_file_content:
+            for keyword in entry.get("keywords", []):
+                k = keyword.strip().lower()
+                if k not in keyword_index: keyword_index[k] = []
+                if entry["file_name"] not in keyword_index[k]: keyword_index[k].append(entry["file_name"])
+        self._hf_upload_json(keyword_index, Config.MEMORY_INDEX_PATH)
+        log("Keyword index updated.")
+
+    def recall_from_memory(self, query: str) -> str:
+        log("Recalling from memory...")
+        index = self._hf_download_json(Config.MEMORY_INDEX_PATH, default={})
+        if not index: return "Memory index is empty or could not be loaded."
+
+        relevant_keywords_str = self._run_gpt(PromptLibrary.RECALL_MEMORY, 256, prompt=query, keywords=list(index.keys()))
+        try:
+            relevant_keywords = json.loads(relevant_keywords_str)
+        except json.JSONDecodeError: return "Could not determine relevant keywords from memory."
+
+        if not relevant_keywords: return "Found no relevant information in memory for that query."
 
-    #
-
-
+        # Fetch data from relevant files
+        matched_files, fetched_data = set(), []
+        for k in relevant_keywords:
+            for fname in index.get(k.lower().strip(), []): matched_files.add(fname)
 
-
-
-
-
-        return
+        for fname in list(matched_files)[:5]:  # Limit fetches
+            data = self._hf_download_json(f"{Config.MEMORY_DATA_PATH}/{fname}", default={})
+            fetched_data.append(data)
+
+        return f"Recalled {len(fetched_data)} entries from memory:\n\n{json.dumps(fetched_data, indent=2)}"
+
 
 # --- GRADIO APPLICATION ---
 class GradioApp:
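
Note on _chunk_and_process: the two ceil-divisions first compute how many chunks are needed so that no chunk exceeds Config.MAX_DATA_CHUNK, then balance the chunk size so the pieces come out roughly equal instead of leaving a short tail. A standalone sketch of that arithmetic (plan_chunks is a hypothetical helper name, not part of the diff):

MAX_DATA_CHUNK = 20000  # mirrors Config.MAX_DATA_CHUNK

def plan_chunks(text_len: int) -> tuple:
    # Ceil division: number of chunks so each stays under the cap.
    num_chunks = (text_len + MAX_DATA_CHUNK - 1) // MAX_DATA_CHUNK
    # Ceil division again: balanced chunk size that still covers the whole text.
    chunk_size = (text_len + num_chunks - 1) // num_chunks
    return num_chunks, chunk_size

assert plan_chunks(45000) == (3, 15000)   # three equal chunks, not 20000+20000+5000
assert plan_chunks(20000) == (1, 20000)
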
@@ -229,91 +297,75 @@ class GradioApp:
         self.app = self._build_ui()
 
     def _build_ui(self) -> gr.Blocks:
-        with gr.Blocks(theme=gr.themes.Soft(primary_hue="
+        with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Maestro AI Engine") as app:
             session_id = gr.State(lambda: secrets.token_hex(16))
-
-
+
             gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
 
             with gr.Tabs():
-                with gr.TabItem("
-                    user_purpose = gr.Textbox(label="High-Level Goal / Purpose", placeholder="e.g., 'Research AI impact on agriculture for a market report'")
+                with gr.TabItem("⚙️ Ingestion & Synthesis"):
                    with gr.Row():
                         with gr.Column(scale=3):
-
-
-
-
-
-
-
-
-
-
+                            task_instructions = gr.Textbox(label="Primary Task / Instructions", placeholder="e.g., 'Summarize the key findings regarding renewable energy adoption'")
+                            with gr.Tabs():
+                                with gr.TabItem("Text Input"): text_input = gr.Textbox(lines=10)
+                                with gr.TabItem("File Upload"): file_upload = gr.File(label="Upload Files (.pdf, .txt)", file_count="multiple", type="filepath")
+                                with gr.TabItem("Web URL"): url_input = gr.Textbox(label="URL")
+                                with gr.TabItem("PDF URL"): pdf_url_input = gr.Textbox(label="Single PDF URL")
+                                with gr.TabItem("Batch PDF URLs"): pdf_batch_input = gr.Textbox(label="Comma-separated PDF URLs", lines=3)
+                        with gr.Column(scale=1):
+                            gr.Markdown("### Processing Options")
+                            summarize_check = gr.Checkbox(label="Create JSON Summary", value=True)
+                            report_check = gr.Checkbox(label="Generate Full Report (requires summary)", value=False)
+                            memory_check = gr.Checkbox(label="Save to Persistent Memory", value=False)
+                            process_button = gr.Button("🚀 Process & Synthesize", variant="primary", scale=2)
+
+                    gr.Markdown("### Results")
                    with gr.Row():
-
-
-
-
-
+                        final_report_output = gr.Markdown(label="Final Report")
+                        json_summary_output = gr.JSON(label="JSON Summaries")
+
+                with gr.TabItem("🔎 Memory Recall"):
+                    memory_query = gr.Textbox(label="Query Persistent Memory", placeholder="e.g., 'What do we know about market trends in 2024?'")
+                    recall_button = gr.Button("Recall", variant="primary")
+                    memory_output = gr.Textbox(label="Recalled Information", lines=20, interactive=False)
 
-
-
-
-            msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data, user_purpose], [msg_input, chatbot])
+            process_button.click(self._synthesis_workflow, [task_instructions, text_input, file_upload, url_input, pdf_url_input, pdf_batch_input, summarize_check, report_check, memory_check], [final_report_output, json_summary_output])
+            recall_button.click(self.engine.recall_from_memory, [memory_query], [memory_output])
+        return app
 
-
-
+    def _synthesis_workflow(self, task, text, files, url, pdf_url, pdf_batch, do_sum, do_rep, do_mem):
+        log("Starting synthesis workflow...")
+        # 1. Ingest Data
+        ingested_text, errors = self.engine.process_data_sources(text, files, url, pdf_url, pdf_batch)
+        if errors:
+            log(f"Ingestion errors: {errors}")
+            # For simplicity, we show errors in the log. A real app might have a dedicated error box.
+
+        if not ingested_text:
+            return "No data was successfully ingested. Please check your inputs and logs.", None
 
-
+        # 2. Save to Memory (if requested)
+        if do_mem:
+            self.engine.save_to_memory(ingested_text, task)
+            # We don't wait for this to finish for the UI, it's a background-like task
 
-
-
-
-
-                summary = f"Processing complete. {len(data):,} characters ingested from all sources. {len(errors)} errors encountered."
-                yield data, summary, "\n".join(errors), gr.update(value="🚀 Process All Sources", interactive=True)
-
-    def _chat_workflow(self, message, history, context, purpose):
-        if not context:
-            history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
-            return "", history
+        # 3. Summarize and Report
+        if do_sum or do_rep:
+            report, summaries = self.engine.synthesis_workflow(ingested_text, task, do_sum, do_rep)
+            return report, summaries
 
-
-        log(f"Starting RAG query. Purpose: {purpose}")
-        full_response = ""
-        for token in self.engine.run_rag_query(message, context, purpose):
-            full_response += token
-            history[-1] = (message, full_response)
-            yield "", history
-
-    def _reporting_workflow(self, s_id, r_type, context, objective, btn):
-        yield gr.update(value="Generating...", interactive=False), None, None
-        auditor = SystemAuditor(session_id=s_id)
-        if not context:
-            md_out = "### Error: No data ingested. Please process sources in Tab 1."
-            yield gr.update(value="Generate Report", interactive=True), md_out, None
-            return
-
-        start_time = time.time()
-        log(auditor.format_prompt_log(f"Generating report: '{r_type}'"))
-        response = self.engine.generate_report(r_type, context, objective)
-        latency = (time.time() - start_time) * 1000
-        log(auditor.format_response_log(response, latency, 1, 0.95))
-
-        if r_type == "Narrative Prose Report":
-            yield gr.update(interactive=True), response, None
-        else:  # Technical JSON Report
-            try: yield gr.update(interactive=True), None, json.loads(response)
-            except json.JSONDecodeError: yield gr.update(interactive=True), None, {"error": "Could not parse JSON from model."}
+        return "Processing complete. No synthesis option was selected.", None
 
     def launch(self): self.app.launch(debug=Config.VERBOSE, share=False)
 
 if __name__ == "__main__":
     if not Config.HF_TOKEN:
-        print("FATAL:
+        print("FATAL: HF_TOKEN environment variable not set.")
     else:
-        log("Instantiating Maestro Engine
-
-
-
+        log("Instantiating Maestro Engine...")
+        engine = MaestroEngine()
+        app = GradioApp(engine)
+        log("Launching Gradio App...")
+        app.launch()
+
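
Note on the UI wiring: _build_ui follows the standard gr.Blocks pattern, where Button.click maps a handler across a list of input components into a list of output components. A minimal runnable sketch of the same pattern (component and handler names here are illustrative, not the ones above):

import gradio as gr

def handler(task: str) -> str:
    # Stand-in for GradioApp._synthesis_workflow.
    return f"Received task: {task}"

with gr.Blocks() as demo:
    task_box = gr.Textbox(label="Task")
    out_box = gr.Textbox(label="Result")
    run_btn = gr.Button("Run")
    # The inputs list feeds the handler's arguments; the outputs list receives its return values.
    run_btn.click(handler, [task_box], [out_box])

# demo.launch()  # uncomment to serve locally
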
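Note on update_keyword_index: it builds a simple inverted index, mapping each normalized keyword to the list of memory files that mention it, which is what recall_from_memory later walks. The same logic in isolation, on made-up entries shaped like the main.json records save_to_memory appends:

# Hypothetical main.json records.
main_file = [
    {"file_name": "a.json", "keywords": ["Solar", "storage"]},
    {"file_name": "b.json", "keywords": ["solar", "grid"]},
]

keyword_index = {}
for entry in main_file:
    for keyword in entry.get("keywords", []):
        k = keyword.strip().lower()            # "Solar" and "solar" collapse together
        keyword_index.setdefault(k, [])
        if entry["file_name"] not in keyword_index[k]:
            keyword_index[k].append(entry["file_name"])

assert keyword_index == {"solar": ["a.json", "b.json"], "storage": ["a.json"], "grid": ["b.json"]}
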
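Note on the model call: _run_gpt wraps the system prompt and the filled template in Mixtral's instruction format (<s>[INST] ... [/INST]) before handing it to InferenceClient.text_generation. A minimal sketch of that call path, assuming a valid HF_TOKEN in the environment and made-up prompt text:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=os.environ["HF_TOKEN"])

system_prompt = "You are Maestro, an Expert Information Retrieval and Synthesis Agent."
user_prompt = "Summarize: solar deployments grew 30% year over year."  # made-up input
full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"

# Same sampling parameters as _run_gpt above.
reply = client.text_generation(full_prompt, max_new_tokens=256, temperature=0.8, top_p=0.95)
print(reply.strip())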