from __future__ import annotations

import asyncio
import os
from dataclasses import dataclass
from typing import Literal

import dotenv

# Load OPENAI_API_KEY (and anything else defined in a local .env file) into the environment.
dotenv.load_dotenv()

# Fail fast with a clear message if the key is still missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your environment or a .env file.")

from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace

"""
This example shows the LLM-as-a-judge pattern. The first agent generates an outline for a story.
The second agent judges the outline and provides feedback. We loop until the judge is satisfied
with the outline.
"""
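

# Generator agent: drafts a very short story outline and revises it when feedback is provided.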
story_outline_generator = Agent(
    name="story_outline_generator",
    instructions=(
        "You generate a very short story outline based on the user's input. "
        "If there is any feedback provided, use it to improve the outline."
    ),
)
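

# Structured verdict from the judge: free-form feedback plus a coarse score.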
@dataclass
class EvaluationFeedback:
    feedback: str
    score: Literal["pass", "needs_improvement", "fail"]
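

# Judge agent: returns structured EvaluationFeedback so the loop below can branch on the score.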
evaluator = Agent[None](
    name="evaluator",
    instructions=(
        "You evaluate a story outline and decide if it's good enough. "
        "If it's not good enough, you provide feedback on what needs to be improved. "
        "Never give it a pass on the first try. After 5 attempts, you can give it a pass if "
        "the story outline is good enough - do not go for perfection."
    ),
    output_type=EvaluationFeedback,
)


async def main() -> None:
    msg = input("What kind of story would you like to hear? ")
    input_items: list[TResponseInputItem] = [{"content": msg, "role": "user"}]

    latest_outline: str | None = None
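
    # Run the whole generate/judge workflow inside a single trace.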
    with trace("LLM as a judge"):
        while True:
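            # Step 1: generate (or revise) the outline from the accumulated conversation.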
            story_outline_result = await Runner.run(
                story_outline_generator,
                input_items,
            )

            input_items = story_outline_result.to_input_list()
            latest_outline = ItemHelpers.text_message_outputs(story_outline_result.new_items)
            print("Story outline generated")
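
            # Step 2: the judge reviews the full conversation, including the latest outline.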
            evaluator_result = await Runner.run(evaluator, input_items)
            result: EvaluationFeedback = evaluator_result.final_output

            print(f"Evaluator score: {result.score}")

            if result.score == "pass":
                print("Story outline is good enough, exiting.")
                break

            print("Re-running with feedback")
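
            # Step 3: feed the judge's feedback back as a user message and loop again.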
            input_items.append({"content": f"Feedback: {result.feedback}", "role": "user"})

    print(f"Final story outline: {latest_outline}")


if __name__ == "__main__":
    asyncio.run(main())