Spaces:

zade-frontier
/

andrej-karpathy-llm-council

Running

App Files Files Community

burtenshaw committed on 13 days ago

Commit

e996b22

1 Parent(s): 0a12050

add streaming

Browse files

Files changed (4) hide show

app.py +42 -17
backend/config.py +1 -2
backend/council.py +78 -69
backend/openrouter.py +41 -15

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from backend.council import stage1_collect_responses, stage2_collect_rankings, stage3_synthesize_final
 from backend.config import COUNCIL_MODELS, CHAIRMAN_MODEL
@@ -23,42 +23,67 @@ async def ask_council(question: str, progress=gr.Progress()):
     )
     try:
         # Stage 1: Collect individual responses
         progress(0.1, desc="Stage 1: Collecting individual responses...")
-        yield ("## 🟡 Stage 1: Collecting individual responses from council members...")
         stage1_results = await stage1_collect_responses(question)
         if not stage1_results:
-            yield "❌ The council failed to generate a response."
             return
         # Stage 2: Collect rankings
         progress(0.4, desc="Stage 2: Council members are ranking responses...")
-        yield (
-            f"## 🟢 Stage 1 Complete ({len(stage1_results)} responses received).\n\n"
-            "## 🟡 Stage 2: Council members are ranking each other's responses..."
-        )
         stage2_results, _ = await stage2_collect_rankings(question, stage1_results)
         # Stage 3: Synthesize final answer
         progress(0.7, desc="Stage 3: Chairman is synthesizing the final answer...")
-        yield (
-            "## 🟢 Stage 2 Complete (Rankings collected).\n\n"
-            "## 🟡 Stage 3: Chairman is synthesizing the final answer..."
-        )
-        stage3_result = await stage3_synthesize_final(question, stage1_results, stage2_results)
         progress(1.0, desc="Complete!")
-        response = stage3_result.get("response")
-        if not response:
-            yield "❌ The council failed to generate a final synthesis."
             return
-        yield response
     except Exception as e:
         yield f"❌ Error consulting the council: {str(e)}"
@@ -66,7 +91,7 @@ async def ask_council(question: str, progress=gr.Progress()):
 description = """
 An MCP server that consults a council of LLMs to answer questions.
-![image](https://pbs.twimg.com/media/G6ZZO7ragAAtnCZ?format=jpg)
 ⚠️ We're using 5 models in the council, so it takes a minute to answer.
 """

 import gradio as gr
+from backend.council import stage1_collect_responses, stage2_collect_rankings, stage3_synthesize_final_stream
 from backend.config import COUNCIL_MODELS, CHAIRMAN_MODEL
     )
     try:
+        buffer = ""
         # Stage 1: Collect individual responses
         progress(0.1, desc="Stage 1: Collecting individual responses...")
+        buffer += "## 🟡 Stage 1: Collecting individual responses from council members...\n\n"
+        yield buffer
         stage1_results = await stage1_collect_responses(question)
         if not stage1_results:
+            buffer += "\n❌ The council failed to generate a response."
+            yield buffer
             return
+        # Format Stage 1 results
+        buffer += f"### ✅ Received {len(stage1_results)} responses:\n"
+        for res in stage1_results:
+            model_name = res["model"].split("/")[-1]
+            preview = res["response"][:100].replace("\n", " ") + "..."
+            buffer += f"- **{model_name}**: {preview}\n"
+        buffer += "\n---\n\n"
+        yield buffer
         # Stage 2: Collect rankings
         progress(0.4, desc="Stage 2: Council members are ranking responses...")
+        buffer += "## 🟡 Stage 2: Council members are ranking each other's responses...\n\n"
+        yield buffer
         stage2_results, _ = await stage2_collect_rankings(question, stage1_results)
+        # Format Stage 2 results
+        buffer += "### ✅ Rankings Collected:\n"
+        for res in stage2_results:
+            model_name = res["model"].split("/")[-1]
+            # Extract just the ranking part if possible, or just say "Ranked"
+            buffer += f"- **{model_name}** has submitted their rankings.\n"
+        buffer += "\n---\n\n"
+        yield buffer
         # Stage 3: Synthesize final answer
         progress(0.7, desc="Stage 3: Chairman is synthesizing the final answer...")
+        buffer += "## 🟡 Stage 3: Chairman is synthesizing the final answer...\n\n"
+        yield buffer
+        full_response = ""
+        async for chunk in stage3_synthesize_final_stream(question, stage1_results, stage2_results):
+            full_response += chunk
+            yield buffer + full_response
         progress(1.0, desc="Complete!")
+        if not full_response:
+            buffer += "\n❌ The council failed to generate a final synthesis."
+            yield buffer
             return
+        # Let's keep the history but mark Stage 3 as done
+        final_buffer = buffer.replace(
+            "## 🟡 Stage 3: Chairman is synthesizing the final answer...", "## 🟢 Stage 3: Final Answer"
+        )
+        yield final_buffer + full_response
     except Exception as e:
         yield f"❌ Error consulting the council: {str(e)}"
 description = """
 An MCP server that consults a council of LLMs to answer questions.
+<img src="https://pbs.twimg.com/media/G6ZZO7ragAAtnCZ?format=jpg" alt="MCP Server" style="width: 100%; height: auto; text-align: center;">
 ⚠️ We're using 5 models in the council, so it takes a minute to answer.
 """

backend/config.py CHANGED Viewed

@@ -10,14 +10,13 @@ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 # Council members - list of OpenRouter model identifiers
 COUNCIL_MODELS = [
-    "moonshotai/Kimi-K2-Thinking:novita",
     "openai/gpt-oss-120b:hyperbolic",
     "deepseek-ai/DeepSeek-V3.2-Exp:novita",
     "Qwen/Qwen3-235B-A22B-Instruct-2507:hyperbolic",
 ]
 # Chairman model - synthesizes final response
-CHAIRMAN_MODEL = "moonshotai/Kimi-K2-Thinking:novita"
 # OpenRouter API endpoint
 OPENROUTER_API_URL = "https://router.huggingface.co/v1/chat/completions"

 # Council members - list of OpenRouter model identifiers
 COUNCIL_MODELS = [
     "openai/gpt-oss-120b:hyperbolic",
     "deepseek-ai/DeepSeek-V3.2-Exp:novita",
     "Qwen/Qwen3-235B-A22B-Instruct-2507:hyperbolic",
 ]
 # Chairman model - synthesizes final response
+CHAIRMAN_MODEL = "deepseek-ai/DeepSeek-V3.2-Exp:novita"
 # OpenRouter API endpoint
 OPENROUTER_API_URL = "https://router.huggingface.co/v1/chat/completions"

backend/council.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """3-stage LLM Council orchestration."""
 from typing import List, Dict, Any, Tuple
-from .openrouter import query_models_parallel, query_model
 from .config import COUNCIL_MODELS, CHAIRMAN_MODEL
@@ -25,18 +25,14 @@ async def stage1_collect_responses(user_query: str) -> List[Dict[str, Any]]:
     stage1_results = []
     for model, response in responses.items():
         if response is not None:  # Only include successful responses
-            stage1_results.append({
-                "model": model,
-                "response": response.get('content', '')
-            })
     print(f"STAGE 1 COMPLETE: Received {len(stage1_results)} responses.")
     return stage1_results
 async def stage2_collect_rankings(
-    user_query: str,
-    stage1_results: List[Dict[str, Any]]
 ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
     """
     Stage 2: Each model ranks the anonymized responses.
@@ -53,16 +49,12 @@ async def stage2_collect_rankings(
     labels = [chr(65 + i) for i in range(len(stage1_results))]  # A, B, C, ...
     # Create mapping from label to model name
-    label_to_model = {
-        f"Response {label}": result['model']
-        for label, result in zip(labels, stage1_results)
-    }
     # Build the ranking prompt
-    responses_text = "\n\n".join([
-        f"Response {label}:\n{result['response']}"
-        for label, result in zip(labels, stage1_results)
-    ])
     ranking_prompt = f"""You are evaluating different responses to the following question:
@@ -104,22 +96,16 @@ Now provide your evaluation and ranking:"""
     stage2_results = []
     for model, response in responses.items():
         if response is not None:
-            full_text = response.get('content', '')
             parsed = parse_ranking_from_text(full_text)
-            stage2_results.append({
-                "model": model,
-                "ranking": full_text,
-                "parsed_ranking": parsed
-            })
     print("STAGE 2 COMPLETE: Rankings collected.")
     return stage2_results, label_to_model
 async def stage3_synthesize_final(
-    user_query: str,
-    stage1_results: List[Dict[str, Any]],
-    stage2_results: List[Dict[str, Any]]
 ) -> Dict[str, Any]:
     """
     Stage 3: Chairman synthesizes final response.
@@ -134,15 +120,13 @@ async def stage3_synthesize_final(
     """
     print("STAGE 3: Chairman is synthesizing the final answer...")
     # Build comprehensive context for chairman
-    stage1_text = "\n\n".join([
-        f"Model: {result['model']}\nResponse: {result['response']}"
-        for result in stage1_results
-    ])
-    stage2_text = "\n\n".join([
-        f"Model: {result['model']}\nRanking: {result['ranking']}"
-        for result in stage2_results
-    ])
     chairman_prompt = f"""You are the Chairman of an LLM Council. Multiple AI models have provided responses to a user's question, and then ranked each other's responses.
@@ -169,16 +153,54 @@ Provide a clear, well-reasoned final answer that represents the council's collec
     if response is None:
         # Fallback if chairman fails
         print("STAGE 3 ERROR: Unable to generate final synthesis.")
-        return {
-            "model": CHAIRMAN_MODEL,
-            "response": "Error: Unable to generate final synthesis."
-        }
     print("STAGE 3 COMPLETE: Final answer synthesized.")
-    return {
-        "model": CHAIRMAN_MODEL,
-        "response": response.get('content', '')
-    }
 def parse_ranking_from_text(ranking_text: str) -> List[str]:
@@ -201,23 +223,22 @@ def parse_ranking_from_text(ranking_text: str) -> List[str]:
             ranking_section = parts[1]
             # Try to extract numbered list format (e.g., "1. Response A")
             # This pattern looks for: number, period, optional space, "Response X"
-            numbered_matches = re.findall(r'\d+\.\s*Response [A-Z]', ranking_section)
             if numbered_matches:
                 # Extract just the "Response X" part
-                return [re.search(r'Response [A-Z]', m).group() for m in numbered_matches]
             # Fallback: Extract all "Response X" patterns in order
-            matches = re.findall(r'Response [A-Z]', ranking_section)
             return matches
     # Fallback: try to find any "Response X" patterns in order
-    matches = re.findall(r'Response [A-Z]', ranking_text)
     return matches
 def calculate_aggregate_rankings(
-    stage2_results: List[Dict[str, Any]],
-    label_to_model: Dict[str, str]
 ) -> List[Dict[str, Any]]:
     """
     Calculate aggregate rankings across all models.
@@ -235,7 +256,7 @@ def calculate_aggregate_rankings(
     model_positions = defaultdict(list)
     for ranking in stage2_results:
-        ranking_text = ranking['ranking']
         # Parse the ranking from the structured format
         parsed_ranking = parse_ranking_from_text(ranking_text)
@@ -250,14 +271,12 @@ def calculate_aggregate_rankings(
     for model, positions in model_positions.items():
         if positions:
             avg_rank = sum(positions) / len(positions)
-            aggregate.append({
-                "model": model,
-                "average_rank": round(avg_rank, 2),
-                "rankings_count": len(positions)
-            })
     # Sort by average rank (lower is better)
-    aggregate.sort(key=lambda x: x['average_rank'])
     return aggregate
@@ -288,10 +307,10 @@ Title:"""
         # Fallback to a generic title
         return "New Conversation"
-    title = response.get('content', 'New Conversation').strip()
     # Clean up the title - remove quotes, limit length
-    title = title.strip('"\'')
     # Truncate if too long
     if len(title) > 50:
@@ -315,28 +334,18 @@ async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]:
     # If no models responded successfully, return error
     if not stage1_results:
-        return [], [], {
-            "model": "error",
-            "response": "All models failed to respond. Please try again."
-        }, {}
     # Stage 2: Collect rankings
     stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results)
     # Calculate aggregate rankings
     aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model)
     # Stage 3: Synthesize final answer
-    stage3_result = await stage3_synthesize_final(
-        user_query,
-        stage1_results,
-        stage2_results
-    )
     # Prepare metadata
-    metadata = {
-        "label_to_model": label_to_model,
-        "aggregate_rankings": aggregate_rankings
-    }
     return stage1_results, stage2_results, stage3_result, metadata

 """3-stage LLM Council orchestration."""
 from typing import List, Dict, Any, Tuple
+from .openrouter import query_models_parallel, query_model, query_model_stream
 from .config import COUNCIL_MODELS, CHAIRMAN_MODEL
     stage1_results = []
     for model, response in responses.items():
         if response is not None:  # Only include successful responses
+            stage1_results.append({"model": model, "response": response.get("content", "")})
     print(f"STAGE 1 COMPLETE: Received {len(stage1_results)} responses.")
     return stage1_results
 async def stage2_collect_rankings(
+    user_query: str, stage1_results: List[Dict[str, Any]]
 ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
     """
     Stage 2: Each model ranks the anonymized responses.
     labels = [chr(65 + i) for i in range(len(stage1_results))]  # A, B, C, ...
     # Create mapping from label to model name
+    label_to_model = {f"Response {label}": result["model"] for label, result in zip(labels, stage1_results)}
     # Build the ranking prompt
+    responses_text = "\n\n".join(
+        [f"Response {label}:\n{result['response']}" for label, result in zip(labels, stage1_results)]
+    )
     ranking_prompt = f"""You are evaluating different responses to the following question:
     stage2_results = []
     for model, response in responses.items():
         if response is not None:
+            full_text = response.get("content", "")
             parsed = parse_ranking_from_text(full_text)
+            stage2_results.append({"model": model, "ranking": full_text, "parsed_ranking": parsed})
     print("STAGE 2 COMPLETE: Rankings collected.")
     return stage2_results, label_to_model
 async def stage3_synthesize_final(
+    user_query: str, stage1_results: List[Dict[str, Any]], stage2_results: List[Dict[str, Any]]
 ) -> Dict[str, Any]:
     """
     Stage 3: Chairman synthesizes final response.
     """
     print("STAGE 3: Chairman is synthesizing the final answer...")
     # Build comprehensive context for chairman
+    stage1_text = "\n\n".join(
+        [f"Model: {result['model']}\nResponse: {result['response']}" for result in stage1_results]
+    )
+    stage2_text = "\n\n".join(
+        [f"Model: {result['model']}\nRanking: {result['ranking']}" for result in stage2_results]
+    )
     chairman_prompt = f"""You are the Chairman of an LLM Council. Multiple AI models have provided responses to a user's question, and then ranked each other's responses.
     if response is None:
         # Fallback if chairman fails
         print("STAGE 3 ERROR: Unable to generate final synthesis.")
+        return {"model": CHAIRMAN_MODEL, "response": "Error: Unable to generate final synthesis."}
     print("STAGE 3 COMPLETE: Final answer synthesized.")
+    return {"model": CHAIRMAN_MODEL, "response": response.get("content", "")}
+async def stage3_synthesize_final_stream(
+    user_query: str, stage1_results: List[Dict[str, Any]], stage2_results: List[Dict[str, Any]]
+):
+    """
+    Streaming variant of Stage 3: the Chairman synthesizes the final answer.
+
+    Formats the Stage 1 responses and the Stage 2 rankings into a single
+    chairman prompt, sends it to CHAIRMAN_MODEL through query_model_stream,
+    and yields every chunk it produces unchanged.
+
+    Args:
+        user_query: The user's original question.
+        stage1_results: Dicts read here via their 'model' and 'response' keys.
+        stage2_results: Dicts read here via their 'model' and 'ranking' keys.
+
+    Yields:
+        Text chunks exactly as produced by query_model_stream.
+    """
+    print("STAGE 3: Chairman is synthesizing the final answer (Streaming)...")
+
+    # One "Model / Response" section per council answer, separated by blank lines
+    stage1_sections = []
+    for result in stage1_results:
+        stage1_sections.append(f"Model: {result['model']}\nResponse: {result['response']}")
+    stage1_text = "\n\n".join(stage1_sections)
+
+    # One "Model / Ranking" section per peer evaluation, separated by blank lines
+    stage2_sections = []
+    for result in stage2_results:
+        stage2_sections.append(f"Model: {result['model']}\nRanking: {result['ranking']}")
+    stage2_text = "\n\n".join(stage2_sections)
+
+    chairman_prompt = f"""You are the Chairman of an LLM Council. Multiple AI models have provided responses to a user's question, and then ranked each other's responses.
+Original Question: {user_query}
+STAGE 1 - Individual Responses:
+{stage1_text}
+STAGE 2 - Peer Rankings:
+{stage2_text}
+Your task as Chairman is to synthesize all of this information into a single, comprehensive, accurate answer to the user's original question. Consider:
+- The individual responses and their insights
+- The peer rankings and what they reveal about response quality
+- Any patterns of agreement or disagreement
+Provide a clear, well-reasoned final answer that represents the council's collective wisdom:"""
+
+    # The whole prompt goes out as a single user turn; chunks are relayed as-is
+    chairman_stream = query_model_stream(CHAIRMAN_MODEL, [{"role": "user", "content": chairman_prompt}])
+    async for piece in chairman_stream:
+        yield piece
+    print("STAGE 3 COMPLETE: Final answer stream finished.")
 def parse_ranking_from_text(ranking_text: str) -> List[str]:
             ranking_section = parts[1]
             # Try to extract numbered list format (e.g., "1. Response A")
             # This pattern looks for: number, period, optional space, "Response X"
+            numbered_matches = re.findall(r"\d+\.\s*Response [A-Z]", ranking_section)
             if numbered_matches:
                 # Extract just the "Response X" part
+                return [re.search(r"Response [A-Z]", m).group() for m in numbered_matches]
             # Fallback: Extract all "Response X" patterns in order
+            matches = re.findall(r"Response [A-Z]", ranking_section)
             return matches
     # Fallback: try to find any "Response X" patterns in order
+    matches = re.findall(r"Response [A-Z]", ranking_text)
     return matches
 def calculate_aggregate_rankings(
+    stage2_results: List[Dict[str, Any]], label_to_model: Dict[str, str]
 ) -> List[Dict[str, Any]]:
     """
     Calculate aggregate rankings across all models.
     model_positions = defaultdict(list)
     for ranking in stage2_results:
+        ranking_text = ranking["ranking"]
         # Parse the ranking from the structured format
         parsed_ranking = parse_ranking_from_text(ranking_text)
     for model, positions in model_positions.items():
         if positions:
             avg_rank = sum(positions) / len(positions)
+            aggregate.append(
+                {"model": model, "average_rank": round(avg_rank, 2), "rankings_count": len(positions)}
+            )
     # Sort by average rank (lower is better)
+    aggregate.sort(key=lambda x: x["average_rank"])
     return aggregate
         # Fallback to a generic title
         return "New Conversation"
+    title = response.get("content", "New Conversation").strip()
     # Clean up the title - remove quotes, limit length
+    title = title.strip("\"'")
     # Truncate if too long
     if len(title) > 50:
     # If no models responded successfully, return error
     if not stage1_results:
+        return [], [], {"model": "error", "response": "All models failed to respond. Please try again."}, {}
     # Stage 2: Collect rankings
     stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results)
     # Calculate aggregate rankings
     aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model)
     # Stage 3: Synthesize final answer
+    stage3_result = await stage3_synthesize_final(user_query, stage1_results, stage2_results)
     # Prepare metadata
+    metadata = {"label_to_model": label_to_model, "aggregate_rankings": aggregate_rankings}
     return stage1_results, stage2_results, stage3_result, metadata

backend/openrouter.py CHANGED Viewed

@@ -6,9 +6,7 @@ from .config import OPENROUTER_API_KEY, OPENROUTER_API_URL
 async def query_model(
-    model: str,
-    messages: List[Dict[str, str]],
-    timeout: float = 120.0
 ) -> Optional[Dict[str, Any]]:
     """
     Query a single model via OpenRouter API.
@@ -33,29 +31,57 @@ async def query_model(
     try:
         async with httpx.AsyncClient(timeout=timeout) as client:
-            response = await client.post(
-                OPENROUTER_API_URL,
-                headers=headers,
-                json=payload
-            )
             response.raise_for_status()
             data = response.json()
-            message = data['choices'][0]['message']
-            return {
-                'content': message.get('content'),
-                'reasoning_details': message.get('reasoning_details')
-            }
     except Exception as e:
         print(f"Error querying model {model}: {e}")
         return None
 async def query_models_parallel(
-    models: List[str],
-    messages: List[Dict[str, str]]
 ) -> Dict[str, Optional[Dict[str, Any]]]:
     """
     Query multiple models in parallel.

 async def query_model(
+    model: str, messages: List[Dict[str, str]], timeout: float = 120.0
 ) -> Optional[Dict[str, Any]]:
     """
     Query a single model via OpenRouter API.
     try:
         async with httpx.AsyncClient(timeout=timeout) as client:
+            response = await client.post(OPENROUTER_API_URL, headers=headers, json=payload)
             response.raise_for_status()
             data = response.json()
+            message = data["choices"][0]["message"]
+            return {"content": message.get("content"), "reasoning_details": message.get("reasoning_details")}
     except Exception as e:
         print(f"Error querying model {model}: {e}")
         return None
+async def query_model_stream(model: str, messages: List[Dict[str, str]], timeout: float = 120.0):
+    """
+    Query a model via OpenRouter API and stream the response.
+
+    Posts a chat-completions request with ``"stream": True`` and parses the
+    ``data:`` lines of the streamed response as JSON chunks.
+
+    Args:
+        model: Model identifier placed in the request payload.
+        messages: Chat messages placed in the request payload.
+        timeout: Timeout value handed to ``httpx.AsyncClient``.
+
+    Yields:
+        Non-empty content strings as they arrive. If the request or the
+        stream fails, the error is printed and a final ``"[Error: ...]"``
+        string is yielded instead of raising.
+    """
+    # Kept function-local, as in the original, so the block does not rely on
+    # a module-level ``import json`` being present.
+    import json
+
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+    }
+    payload = {"model": model, "messages": messages, "stream": True}
+    try:
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            async with client.stream("POST", OPENROUTER_API_URL, headers=headers, json=payload) as response:
+                response.raise_for_status()
+                async for line in response.aiter_lines():
+                    # Server-sent events allow "data:" with or without a space
+                    # after the colon; anything else (comments, blanks) is skipped.
+                    if not line.startswith("data:"):
+                        continue
+                    data_str = line[5:].strip()
+                    if data_str == "[DONE]":
+                        break
+                    try:
+                        data = json.loads(data_str)
+                    except json.JSONDecodeError:
+                        # Best effort: ignore a malformed chunk, keep streaming.
+                        continue
+                    if not isinstance(data, dict):
+                        continue
+                    # Some chunks carry no choices (or no delta). Indexing them
+                    # directly used to raise and abort the whole stream, so
+                    # such chunks are skipped instead.
+                    choices = data.get("choices") or []
+                    if not choices or not isinstance(choices[0], dict):
+                        continue
+                    delta = choices[0].get("delta") or {}
+                    content = delta.get("content")
+                    if content:
+                        yield content
+    except Exception as e:
+        print(f"Error streaming model {model}: {e}")
+        yield f"[Error: {str(e)}]"
 async def query_models_parallel(
+    models: List[str], messages: List[Dict[str, str]]
 ) -> Dict[str, Optional[Dict[str, Any]]]:
     """
     Query multiple models in parallel.