Update app.py

app.py CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 import os
 import re
 import requests
@@ -16,45 +14,103 @@ from urllib.parse import urljoin, urlparse
 
 # Third-party libraries
 import gradio as gr
-from huggingface_hub import InferenceClient
+from huggingface_hub import InferenceClient, HfApi, hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
 from pypdf import PdfReader
 from bs4 import BeautifulSoup
 import nltk
 
-# Local imports from the enhanced prompt library
-from agent_prompt import PromptLibrary, SystemAuditor
-
 # --- CONFIGURATION ---
 class Config:
+    """Centralized configuration for the Maestro application."""
     HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
     HF_TOKEN = os.getenv("HF_TOKEN")
+    HF_DATASET_REPO = "Omnibus/tmp"  # As specified in the user's script
+    MEMORY_MAIN_PATH = "mem-test2/main.json"
+    MEMORY_INDEX_PATH = "mem-test2/index.json"
+    MEMORY_DATA_PATH = "mem-test2"
     VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
-
-
-
+    MAX_TOKENS_SYNTHESIS = 4096
+    MAX_TOKENS_REPORT = 8192
+    MAX_TOKENS_CHAT = 2048
+    MAX_DATA_CHUNK = 20000  # For processing large text bodies
+    REQUESTS_TIMEOUT = 20
     USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
-
+
+# --- PROMPT LIBRARY (Integrated for simplicity) ---
+class PromptLibrary:
+    """A centralized library of meticulously crafted prompt templates."""
+    AGENT_PREFIX = """
+You are Maestro, an Expert Information Retrieval and Synthesis Agent. Your operation is governed by these directives:
+1. Ethical Safeguard [v2.4]: Refuse to process harmful, illegal, or unethical requests.
+2. Temporal Awareness: Use the timestamp {dynamic_timestamp_utc} to evaluate data relevance.
+3. Contextual Prioritization: Analyze the user's purpose '{user_purpose}' to weigh data relevance.
+"""
+    COMPRESS_JSON = """
+Task: {task}
+Based on the AGENT_PREFIX context and the following data, generate a structured and concise JSON summary.
+
+Input Data Chunk:
+---
+{history}
+---
+
+Existing Knowledge (for context):
+---
+{knowledge}
+---
+
+Instructions:
+Compile and categorize the data above into a JSON dictionary string. Extract key information, group related entities, and ensure the output is a single, valid JSON object.
+"""
+    COMPRESS_REPORT = """
+Task: {task}
+Based on the AGENT_PREFIX context and the summarized knowledge you have, compile a detailed, exhaustive report (~8000 words).
+
+Summarized Knowledge:
+---
+{knowledge}
+---
+
+Last Chunk of Raw Data (for final context):
+---
+{history}
+---
+
+Instructions:
+Synthesize all provided information into a single, comprehensive narrative. Be thorough, detailed, and structure the report with clear headings and sections.
+"""
+    SAVE_MEMORY = """
+Task: {task}
+Data:
+---
+{history}
+---
+Instructions:
+Compile and categorize the data above into a JSON dictionary string. Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format.
+Required keys: "keywords", "title", "description", "content", "url". The "keywords" list should be comprehensive.
+"""
+    RECALL_MEMORY = """
+The user will give you a query and a list of keywords from a database index.
+Your duty is to choose the words from the list that are most closely related to the search query.
+If no keywords are relevant, return an empty list: [].
+Respond only with a single, valid JSON list of strings.
+
+USER QUERY: {prompt}
+KEYWORD LIST: {keywords}
+"""
+
 # --- UTILITIES ---
 def log(message: str) -> None:
-    if Config.VERBOSE:
-        print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
-
-class SessionManager:
-    # ... (No changes from previous version)
-    def __init__(self, session_id: str):
-        self.session_id = session_id
-        self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-    def __enter__(self) -> str:
-        os.makedirs(self.temp_dir, exist_ok=True)
-        return self.temp_dir
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        shutil.rmtree(self.temp_dir, ignore_errors=True)
+    if Config.VERBOSE: print(f"[{datetime.datetime.now(datetime.timezone.utc).isoformat()}] {message}")
 
 # --- CORE APPLICATION ENGINE ---
 class MaestroEngine:
-    """Handles all data processing and LLM interaction
+    """Handles all data processing, memory management, and LLM interaction."""
     def __init__(self):
+        if not Config.HF_TOKEN: raise ValueError("HF_TOKEN environment variable not set!")
         self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
+        self.api = HfApi(token=Config.HF_TOKEN)
         try:
             nltk.data.find("tokenizers/punkt")
         except LookupError:
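
Note on the new PromptLibrary: each template is a plain str.format() string, so every caller has to supply exactly the placeholders the template names ({task}, {history}, {knowledge} for the compress/save templates; {prompt} and {keywords} for recall). A minimal self-check sketch, using an abridged stand-in for the real template text:

import string

# Abridged stand-in for PromptLibrary.COMPRESS_JSON (the real template is longer).
COMPRESS_JSON = "Task: {task}\nInput Data Chunk:\n{history}\nExisting Knowledge:\n{knowledge}"

# Placeholder names that .format(**kwargs) will demand from the caller.
fields = {name for _, name, _, _ in string.Formatter().parse(COMPRESS_JSON) if name}
assert fields == {"task", "history", "knowledge"}
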
@@ -62,165 +118,177 @@ class MaestroEngine:
             nltk.download('punkt', quiet=True)
         log("MaestroEngine initialized.")
 
-    # ---
-    def
+    # --- Data Ingestion ---
+    def _read_pdf_from_path(self, path: str) -> str:
         try:
-
-
-        except Exception as e: return f"Error reading PDF {os.path.basename(file_path)}: {e}"
-
-    def _process_zip(self, zip_path: str, temp_dir: str) -> str:
-        # ... (Identical to previous correct version)
-        extracted_texts = []
-        extract_path = os.path.join(temp_dir, "zip_extract")
-        os.makedirs(extract_path, exist_ok=True)
-        try:
-            with zipfile.ZipFile(zip_path, 'r') as zf:
-                for member in zf.infolist():
-                    if member.is_dir() or member.filename.startswith('__MACOSX'): continue
-                    member_path = zf.extract(member, path=extract_path)
-                    if member.filename.endswith('.pdf'): extracted_texts.append(self._read_pdf(member_path))
-                    elif member.filename.endswith('.txt'):
-                        with open(member_path, 'r', encoding='utf-8', errors='ignore') as f: extracted_texts.append(f.read())
-            return "\n\n".join(extracted_texts)
-        except Exception as e: return f"Error processing ZIP {os.path.basename(zip_path)}: {e}"
-
-    # --- Google Workspace Integration ---
-    def _get_google_drive_id(self, url: str) -> str | None:
-        match = re.search(r"/file/d/([a-zA-Z0-9_-]+)", url)
-        return match.group(1) if match else None
+            return "\n".join(page.extract_text() or "" for page in PdfReader(path).pages)
+        except Exception as e: return f"Error reading PDF {os.path.basename(path)}: {e}"
 
-    def
-        URL = "https://docs.google.com/uc?export=download&id="
+    def _read_pdf_from_url(self, url: str) -> str:
         try:
-            response = requests.get(
+            response = requests.get(url, stream=True, timeout=Config.REQUESTS_TIMEOUT)
             response.raise_for_status()
-
-
-
-
-
-
-            f.write(chunk)
-            return file_path, None
-        except Exception as e:
-            return None, f"Failed to download Google Drive file ({file_id}): {e}"
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                tmp_file.write(response.content)
+            return self._read_pdf_from_path(tmp_file.name)
+        except Exception as e: return f"Failed to download or read PDF from {url}: {e}"
+        finally:
+            if 'tmp_file' in locals() and os.path.exists(tmp_file.name): os.remove(tmp_file.name)
 
-    def
+    def _get_web_text(self, url: str) -> str:
         try:
-
-            export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
-            response = requests.get(export_url, timeout=Config.REQUESTS_TIMEOUT)
+            response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
             response.raise_for_status()
-            return response.
-        except Exception as e: return
-
-
-
-        if not start_url: return "", []
-        visited: Set[str] = set()
-        to_visit: List[Tuple[str, int]] = [(start_url, 0)]
-        all_text, errors = [], []
-        base_netloc = urlparse(start_url).netloc
-
-        while to_visit and len(visited) < 50:  # Safety break
-            current_url, depth = to_visit.pop(0)
-            if current_url in visited or depth > max_depth: continue
-
-            log(f"Crawling (depth {depth}): {current_url}")
-            visited.add(current_url)
-
-            try:
-                response = requests.get(current_url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.content, 'html.parser')
-                all_text.append(soup.get_text(separator="\n", strip=True))
-
-                if depth < max_depth:
-                    for link in soup.find_all('a', href=True):
-                        abs_url = urljoin(current_url, link['href'])
-                        if urlparse(abs_url).netloc == base_netloc and abs_url not in visited:
-                            to_visit.append((abs_url, depth + 1))
-            except Exception as e:
-                errors.append(f"Crawl Error on {current_url}: {e}")
-        return "\n\n".join(all_text), errors
-
-    # --- Main Orchestrator ---
-    def process_data_sources(self, session_id: str, url: str, crawl_depth: int, text: str, file_paths: List[str]) -> Tuple[str, List[str]]:
+            return BeautifulSoup(response.content, 'lxml').get_text(separator="\n", strip=True)
+        except Exception as e: return f"Failed to fetch URL {url}: {e}"
+
+    def process_data_sources(self, text: str, files: List[str], url: str, pdf_url: str, pdf_batch: str) -> Tuple[str, List[str]]:
+        """Orchestrates data ingestion from all provided sources."""
         all_content, errors = [], []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if url:
-            if "drive.google.com/file/d/" in url:
-                file_id = self._get_google_drive_id(url)
-                if file_id:
-                    path, err = self._download_google_drive_file(file_id, temp_dir)
-                    if err: errors.append(err)
-                    elif path: all_content.append(self._read_pdf(path))  # Assuming PDF for simplicity
-                else: errors.append(f"Invalid Google Drive URL: {url}")
-            elif "docs.google.com/document/d/" in url:
-                content, err = self._fetch_google_doc(url)
-                if err: errors.append(err)
-                else: all_content.append(content)
-            else:  # Standard web crawling
-                content, crawl_errors = self._crawl_website(url, crawl_depth)
-                all_content.append(content)
-                errors.extend(crawl_errors)
-
+        if text: all_content.append(text)
+        if url: all_content.append(self._get_web_text(url))
+        if pdf_url: all_content.append(self._read_pdf_from_url(pdf_url))
+        if pdf_batch:
+            urls = [u.strip() for u in pdf_batch.split(',') if u.strip()]
+            for u in urls:
+                content = self._read_pdf_from_url(u)
+                if content.startswith("Error"): errors.append(content)
+                else: all_content.append(content)
+        if files:
+            for path in files:
+                if not path: continue
+                filename, ext = os.path.basename(path), os.path.splitext(path)[1].lower()
+                if ext == '.pdf': all_content.append(self._read_pdf_from_path(path))
+                elif ext == '.txt':
+                    with open(path, 'r', encoding='utf-8', errors='ignore') as f: all_content.append(f.read())
+                else: errors.append(f"Unsupported file type: {filename}")
         return "\n\n---\n\n".join(all_content), errors
 
     # --- LLM Interaction ---
-    def
-
-        try:
-            for token in self.client.text_generation(full_prompt, max_new_tokens=max_tokens, stream=True, temperature=0.7, top_p=0.95):
-                yield token
-        except Exception as e:
-            log(f"LLM stream query failed: {e}")
-            yield f"Error communicating with the model: {e}"
-
-    def run_rag_query(self, query: str, context: str, purpose: str) -> Generator[str, None, None]:
+    def _run_gpt(self, prompt_template: str, max_tokens: int, **kwargs) -> str:
+        """Core LLM call function."""
         system_prompt = PromptLibrary.AGENT_PREFIX.format(
             dynamic_timestamp_utc=datetime.datetime.now(datetime.timezone.utc).isoformat(),
-            user_purpose=
+            user_purpose=kwargs.get('task', 'completing a system task.')
         )
-
-
+        full_prompt = f"<s>[INST] {system_prompt}\n\n{prompt_template.format(**kwargs)} [/INST]"
+        log(f"Running GPT. Template: {prompt_template[:50]}...")
+        try:
+            return self.client.text_generation(full_prompt, max_new_tokens=max_tokens, temperature=0.8, top_p=0.95).strip()
+        except Exception as e:
+            log(f"LLM Error: {e}")
+            return f'{{"error": "LLM call failed", "details": "{e}"}}'
 
-    def
-
-
-
-        )
-
-
-
-
+    def _chunk_and_process(self, text: str, prompt_template: str, task: str, max_tokens: int) -> List[str]:
+        """Chunks large text and processes each chunk with an LLM."""
+        text_len = len(text)
+        if text_len == 0: return []
+        num_chunks = (text_len + Config.MAX_DATA_CHUNK - 1) // Config.MAX_DATA_CHUNK
+        chunk_size = (text_len + num_chunks - 1) // num_chunks
+
+        results, knowledge = [], ""
+        for i in range(num_chunks):
+            chunk = text[i*chunk_size : (i+1)*chunk_size]
+            log(f"Processing chunk {i+1}/{num_chunks}...")
+            resp = self._run_gpt(prompt_template, max_tokens, task=task, knowledge=knowledge, history=chunk)
+            knowledge = resp if len(resp) < 2000 else resp[:2000]  # Use response as context for next chunk
+            results.append(resp)
+        return results
+
+    # --- Synthesis & Reporting Workflow ---
+    def synthesis_workflow(self, text: str, task: str, do_summarize: bool, do_report: bool) -> Tuple[str, List[Dict]]:
+        """Handles the multi-stage summarization and reporting process."""
+        if not text: return "No data to process.", []
+        json_summary_objects, final_report = [], ""
+
+        if do_summarize or do_report:  # Summarization is a prerequisite for reporting
+            log("Starting summarization stage...")
+            summaries = self._chunk_and_process(text, PromptLibrary.COMPRESS_JSON, task, Config.MAX_TOKENS_SYNTHESIS)
+            for s in summaries:
+                try: json_summary_objects.append(json.loads(s))
+                except json.JSONDecodeError: json_summary_objects.append({"error": "Failed to parse summary JSON", "raw": s})
+            log("Summarization stage complete.")
+
+            if do_report:
+                log("Starting report generation stage...")
+                # Use the collected JSON summaries as knowledge for the final report
+                knowledge_for_report = json.dumps(json_summary_objects, indent=2)
+                final_report = self._run_gpt(PromptLibrary.COMPRESS_REPORT, Config.MAX_TOKENS_REPORT, task=task, knowledge=knowledge_for_report, history="All data chunks have been summarized.")
+                log("Report generation complete.")
+                return final_report, json_summary_objects
+
+        return "Summarization complete.", json_summary_objects
+
+    # --- Persistent Memory System ---
+    def _hf_download_json(self, repo_path: str, default: Any = []) -> Any:
+        try:
+            path = hf_hub_download(repo_id=Config.HF_DATASET_REPO, filename=repo_path, repo_type="dataset", token=Config.HF_TOKEN)
+            with open(path, 'r') as f: return json.load(f)
+        except HfHubHTTPError: return default  # File doesn't exist, return default
+        except (json.JSONDecodeError, IOError): return default
+
+    def _hf_upload_json(self, data: Any, repo_path: str):
+        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".json") as tmp_file:
+            json.dump(data, tmp_file, indent=4)
+            tmp_path = tmp_file.name
+        self.api.upload_file(path_or_fileobj=tmp_path, path_in_repo=repo_path, repo_id=Config.HF_DATASET_REPO, repo_type="dataset")
+        os.remove(tmp_path)
+
+    def save_to_memory(self, text: str, task: str) -> List[Dict]:
+        """Saves processed text to the Hugging Face Dataset repo."""
+        log("Starting memory save process...")
+        json_chunks = self._chunk_and_process(text, PromptLibrary.SAVE_MEMORY, task, Config.MAX_TOKENS_SYNTHESIS)
+        parsed_chunks, main_file = [], self._hf_download_json(Config.MEMORY_MAIN_PATH)
+
+        for i, chunk_str in enumerate(json_chunks):
+            try:
+                data = json.loads(chunk_str)
+                ts = datetime.datetime.now(datetime.timezone.utc)
+                filename = f"{ts.strftime('%Y-%m-%d-%H-%M-%S')}-{uuid.uuid4().hex[:8]}.json"
+                self._hf_upload_json(data, f"{Config.MEMORY_DATA_PATH}/{filename}")
+                main_file.append({"file_name": filename, "keywords": data.get("keywords", []), "description": data.get("description", "")})
+                parsed_chunks.append(data)
+            except json.JSONDecodeError: log(f"Could not parse memory chunk {i} into JSON.")
+
+        self._hf_upload_json(main_file, Config.MEMORY_MAIN_PATH)
+        self.update_keyword_index(main_file)
+        log("Memory save complete.")
+        return parsed_chunks
+
+    def update_keyword_index(self, main_file_content: List[Dict]):
+        log("Updating keyword index...")
+        keyword_index = {}
+        for entry in main_file_content:
+            for keyword in entry.get("keywords", []):
+                k = keyword.strip().lower()
+                if k not in keyword_index: keyword_index[k] = []
+                if entry["file_name"] not in keyword_index[k]: keyword_index[k].append(entry["file_name"])
+        self._hf_upload_json(keyword_index, Config.MEMORY_INDEX_PATH)
+        log("Keyword index updated.")
+
+    def recall_from_memory(self, query: str) -> str:
+        log("Recalling from memory...")
+        index = self._hf_download_json(Config.MEMORY_INDEX_PATH, default={})
+        if not index: return "Memory index is empty or could not be loaded."
+
+        relevant_keywords_str = self._run_gpt(PromptLibrary.RECALL_MEMORY, 256, prompt=query, keywords=list(index.keys()))
+        try:
+            relevant_keywords = json.loads(relevant_keywords_str)
+        except json.JSONDecodeError: return "Could not determine relevant keywords from memory."
+
+        if not relevant_keywords: return "Found no relevant information in memory for that query."
 
-    #
-
-
+        # Fetch data from relevant files
+        matched_files, fetched_data = set(), []
+        for k in relevant_keywords:
+            for fname in index.get(k.lower().strip(), []): matched_files.add(fname)
 
-
-
-
-
-        return
+        for fname in list(matched_files)[:5]:  # Limit fetches
+            data = self._hf_download_json(f"{Config.MEMORY_DATA_PATH}/{fname}", default={})
+            fetched_data.append(data)
+
+        return f"Recalled {len(fetched_data)} entries from memory:\n\n{json.dumps(fetched_data, indent=2)}"
+
 
 # --- GRADIO APPLICATION ---
 class GradioApp:
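
Note on _chunk_and_process: the two ceil-divisions first compute how many chunks are needed so that no chunk exceeds Config.MAX_DATA_CHUNK, then balance the chunk size so the pieces come out roughly equal instead of leaving a short tail. A standalone sketch of that arithmetic (plan_chunks is a hypothetical helper name, not part of the diff):

MAX_DATA_CHUNK = 20000  # mirrors Config.MAX_DATA_CHUNK

def plan_chunks(text_len: int) -> tuple:
    # Ceil division: number of chunks so each stays under the cap.
    num_chunks = (text_len + MAX_DATA_CHUNK - 1) // MAX_DATA_CHUNK
    # Ceil division again: balanced chunk size that still covers the whole text.
    chunk_size = (text_len + num_chunks - 1) // num_chunks
    return num_chunks, chunk_size

assert plan_chunks(45000) == (3, 15000)   # three equal chunks, not 20000+20000+5000
assert plan_chunks(20000) == (1, 20000)
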
@@ -229,91 +297,75 @@ class GradioApp:
         self.app = self._build_ui()
 
     def _build_ui(self) -> gr.Blocks:
-        with gr.Blocks(theme=gr.themes.Soft(primary_hue="
+        with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), title="Maestro AI Engine") as app:
             session_id = gr.State(lambda: secrets.token_hex(16))
-
-
+
             gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
 
             with gr.Tabs():
-                with gr.TabItem("
-                    user_purpose = gr.Textbox(label="High-Level Goal / Purpose", placeholder="e.g., 'Research AI impact on agriculture for a market report'")
+                with gr.TabItem("⚙️ Ingestion & Synthesis"):
                    with gr.Row():
                         with gr.Column(scale=3):
-
-
-
-
-
-
-
-
-
-
+                            task_instructions = gr.Textbox(label="Primary Task / Instructions", placeholder="e.g., 'Summarize the key findings regarding renewable energy adoption'")
+                            with gr.Tabs():
+                                with gr.TabItem("Text Input"): text_input = gr.Textbox(lines=10)
+                                with gr.TabItem("File Upload"): file_upload = gr.File(label="Upload Files (.pdf, .txt)", file_count="multiple", type="filepath")
+                                with gr.TabItem("Web URL"): url_input = gr.Textbox(label="URL")
+                                with gr.TabItem("PDF URL"): pdf_url_input = gr.Textbox(label="Single PDF URL")
+                                with gr.TabItem("Batch PDF URLs"): pdf_batch_input = gr.Textbox(label="Comma-separated PDF URLs", lines=3)
+                        with gr.Column(scale=1):
+                            gr.Markdown("### Processing Options")
+                            summarize_check = gr.Checkbox(label="Create JSON Summary", value=True)
+                            report_check = gr.Checkbox(label="Generate Full Report (requires summary)", value=False)
+                            memory_check = gr.Checkbox(label="Save to Persistent Memory", value=False)
+                            process_button = gr.Button("🚀 Process & Synthesize", variant="primary", scale=2)
+
+                    gr.Markdown("### Results")
                    with gr.Row():
-
-
-
-
-
+                        final_report_output = gr.Markdown(label="Final Report")
+                        json_summary_output = gr.JSON(label="JSON Summaries")
+
+                with gr.TabItem("🔎 Memory Recall"):
+                    memory_query = gr.Textbox(label="Query Persistent Memory", placeholder="e.g., 'What do we know about market trends in 2024?'")
+                    recall_button = gr.Button("Recall", variant="primary")
+                    memory_output = gr.Textbox(label="Recalled Information", lines=20, interactive=False)
 
-
-
-
-            msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data, user_purpose], [msg_input, chatbot])
+            process_button.click(self._synthesis_workflow, [task_instructions, text_input, file_upload, url_input, pdf_url_input, pdf_batch_input, summarize_check, report_check, memory_check], [final_report_output, json_summary_output])
+            recall_button.click(self.engine.recall_from_memory, [memory_query], [memory_output])
+        return app
 
-
-
+    def _synthesis_workflow(self, task, text, files, url, pdf_url, pdf_batch, do_sum, do_rep, do_mem):
+        log("Starting synthesis workflow...")
+        # 1. Ingest Data
+        ingested_text, errors = self.engine.process_data_sources(text, files, url, pdf_url, pdf_batch)
+        if errors:
+            log(f"Ingestion errors: {errors}")
+            # For simplicity, we show errors in the log. A real app might have a dedicated error box.
+
+        if not ingested_text:
+            return "No data was successfully ingested. Please check your inputs and logs.", None
 
-
+        # 2. Save to Memory (if requested)
+        if do_mem:
+            self.engine.save_to_memory(ingested_text, task)
+            # We don't wait for this to finish for the UI, it's a background-like task
 
-
-
-
-
-                summary = f"Processing complete. {len(data):,} characters ingested from all sources. {len(errors)} errors encountered."
-                yield data, summary, "\n".join(errors), gr.update(value="🚀 Process All Sources", interactive=True)
-
-    def _chat_workflow(self, message, history, context, purpose):
-        if not context:
-            history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
-            return "", history
+        # 3. Summarize and Report
+        if do_sum or do_rep:
+            report, summaries = self.engine.synthesis_workflow(ingested_text, task, do_sum, do_rep)
+            return report, summaries
 
-
-        log(f"Starting RAG query. Purpose: {purpose}")
-        full_response = ""
-        for token in self.engine.run_rag_query(message, context, purpose):
-            full_response += token
-            history[-1] = (message, full_response)
-            yield "", history
-
-    def _reporting_workflow(self, s_id, r_type, context, objective, btn):
-        yield gr.update(value="Generating...", interactive=False), None, None
-        auditor = SystemAuditor(session_id=s_id)
-        if not context:
-            md_out = "### Error: No data ingested. Please process sources in Tab 1."
-            yield gr.update(value="Generate Report", interactive=True), md_out, None
-            return
-
-        start_time = time.time()
-        log(auditor.format_prompt_log(f"Generating report: '{r_type}'"))
-        response = self.engine.generate_report(r_type, context, objective)
-        latency = (time.time() - start_time) * 1000
-        log(auditor.format_response_log(response, latency, 1, 0.95))
-
-        if r_type == "Narrative Prose Report":
-            yield gr.update(interactive=True), response, None
-        else:  # Technical JSON Report
-            try: yield gr.update(interactive=True), None, json.loads(response)
-            except json.JSONDecodeError: yield gr.update(interactive=True), None, {"error": "Could not parse JSON from model."}
+        return "Processing complete. No synthesis option was selected.", None
 
     def launch(self): self.app.launch(debug=Config.VERBOSE, share=False)
 
 if __name__ == "__main__":
     if not Config.HF_TOKEN:
-        print("FATAL:
+        print("FATAL: HF_TOKEN environment variable not set.")
     else:
-        log("Instantiating Maestro Engine
-
-
-
+        log("Instantiating Maestro Engine...")
+        engine = MaestroEngine()
+        app = GradioApp(engine)
+        log("Launching Gradio App...")
+        app.launch()
+
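
Note on the UI wiring: _build_ui follows the standard gr.Blocks pattern, where Button.click maps a handler across a list of input components into a list of output components. A minimal runnable sketch of the same pattern (component and handler names here are illustrative, not the ones above):

import gradio as gr

def handler(task: str) -> str:
    # Stand-in for GradioApp._synthesis_workflow.
    return f"Received task: {task}"

with gr.Blocks() as demo:
    task_box = gr.Textbox(label="Task")
    out_box = gr.Textbox(label="Result")
    run_btn = gr.Button("Run")
    # The inputs list feeds the handler's arguments; the outputs list receives its return values.
    run_btn.click(handler, [task_box], [out_box])

# demo.launch()  # uncomment to serve locally
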
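Note on update_keyword_index: it builds a simple inverted index, mapping each normalized keyword to the list of memory files that mention it, which is what recall_from_memory later walks. The same logic in isolation, on made-up entries shaped like the main.json records save_to_memory appends:

# Hypothetical main.json records.
main_file = [
    {"file_name": "a.json", "keywords": ["Solar", "storage"]},
    {"file_name": "b.json", "keywords": ["solar", "grid"]},
]

keyword_index = {}
for entry in main_file:
    for keyword in entry.get("keywords", []):
        k = keyword.strip().lower()            # "Solar" and "solar" collapse together
        keyword_index.setdefault(k, [])
        if entry["file_name"] not in keyword_index[k]:
            keyword_index[k].append(entry["file_name"])

assert keyword_index == {"solar": ["a.json", "b.json"], "storage": ["a.json"], "grid": ["b.json"]}
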
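Note on the model call: _run_gpt wraps the system prompt and the filled template in Mixtral's instruction format (<s>[INST] ... [/INST]) before handing it to InferenceClient.text_generation. A minimal sketch of that call path, assuming a valid HF_TOKEN in the environment and made-up prompt text:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=os.environ["HF_TOKEN"])

system_prompt = "You are Maestro, an Expert Information Retrieval and Synthesis Agent."
user_prompt = "Summarize: solar deployments grew 30% year over year."  # made-up input
full_prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"

# Same sampling parameters as _run_gpt above.
reply = client.text_generation(full_prompt, max_new_tokens=256, temperature=0.8, top_p=0.95)
print(reply.strip())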