File size: 5,111 Bytes
e72c8ca
4d13530
7eddc37
e72c8ca
 
 
4d13530
e72c8ca
 
7eddc37
e72c8ca
7eddc37
fb5b735
c4592de
 
2c72d2c
7eddc37
 
 
 
 
24de656
 
12579ea
2c72d2c
7eddc37
24de656
2c72d2c
 
4d13530
fb5b735
 
12579ea
24de656
e72c8ca
 
 
 
 
 
fb5b735
e72c8ca
24de656
 
 
 
 
 
 
e72c8ca
 
 
24de656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e72c8ca
 
 
 
 
24de656
 
2c72d2c
 
24de656
 
 
 
 
e72c8ca
 
24de656
 
 
 
 
2c72d2c
e72c8ca
2c72d2c
e72c8ca
 
 
24de656
 
 
 
 
 
e72c8ca
24de656
 
e72c8ca
 
 
 
 
 
 
24de656
e72c8ca
 
 
24de656
e72c8ca
24de656
 
 
e72c8ca
 
 
 
24de656
 
e72c8ca
24de656
 
 
e72c8ca
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import io

import torch
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import HTMLResponse
from PIL import Image
from transformers import AutoProcessor, Florence2ForConditionalGeneration # <--- DIRECT IMPORT

app = FastAPI()

print("⏳ Initializing Florence-2 (Hardcoded Class Mode)...")

# We use the community fork for clean config
model_id = "florence-community/Florence-2-large"
device = "cpu"

try:
    # 1. Load Processor
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=False)

    # 2. Load Model using the SPECIFIC CLASS (No "AutoModel" guessing)
    model = Florence2ForConditionalGeneration.from_pretrained(
        model_id, 
        trust_remote_code=False, 
        torch_dtype=torch.float32
    ).to(device)
    
    print("βœ… Model Loaded Successfully!")

except Exception as e:
    print(f"❌ Load Error: {e}")
    model = None
    processor = None

# --- UI ---
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Naman's AI Vision</title>
    <style>
        body { font-family: sans-serif; background: #0f172a; color: white; display: flex; flex-direction: column; align-items: center; min-height: 100vh; margin: 0; padding: 20px; }
        .box { background: #1e293b; padding: 30px; border-radius: 15px; width: 100%; max-width: 600px; text-align: center; border: 1px solid #334155; }
        h1 { margin-top: 0; color: #38bdf8; }
        button { background: #38bdf8; color: #000; border: none; padding: 10px 20px; border-radius: 5px; font-weight: bold; cursor: pointer; margin-top: 10px; }
        button:disabled { opacity: 0.5; }
        #result { margin-top: 20px; white-space: pre-wrap; text-align: left; background: #000; padding: 15px; border-radius: 5px; font-family: monospace; display: none; }
        img { max-width: 100%; border-radius: 10px; margin-top: 10px; display: none; }
    </style>
</head>
<body>
    <div class="box">
        <h1>πŸ‘οΈ Florence-2 Vision AI</h1>
        <p>Advanced OCR & Image Understanding (CPU)</p>
        
        <input type="file" id="file" accept="image/*" style="display: none;">
        <button onclick="document.getElementById('file').click()">πŸ“‚ Upload Image</button>
        
        <br><br>
        <select id="task" style="padding: 10px; border-radius: 5px;">
            <option value="<OCR>">πŸ“„ Read Text (OCR)</option>
            <option value="<CAPTION>">πŸ–ΌοΈ Describe Image</option>
            <option value="<OD>">πŸ“¦ Detect Objects</option>
        </select>
        <button onclick="runAI()" id="runBtn">Run AI</button>

        <img id="preview">
        <div id="result"></div>
    </div>

    <script>
        const fileInput = document.getElementById('file');
        const preview = document.getElementById('preview');
        const result = document.getElementById('result');
        const runBtn = document.getElementById('runBtn');
        let currentFile = null;

        fileInput.addEventListener('change', (e) => {
            currentFile = e.target.files[0];
            preview.src = URL.createObjectURL(currentFile);
            preview.style.display = 'block';
            result.style.display = 'none';
        });

        async function runAI() {
            if (!currentFile) return alert("Select an image first!");
            runBtn.innerText = "Processing...";
            runBtn.disabled = true;
            result.style.display = 'none';

            const formData = new FormData();
            formData.append('file', currentFile);
            formData.append('task_prompt', document.getElementById('task').value);

            try {
                const res = await fetch('/analyze', { method: 'POST', body: formData });
                const data = await res.json();
                result.innerText = data.result || "Error: " + JSON.stringify(data);
                result.style.display = 'block';
            } catch (e) {
                alert("Error: " + e);
            }
            runBtn.innerText = "Run AI";
            runBtn.disabled = false;
        }
    </script>
</body>
</html>
"""

@app.get("/", response_class=HTMLResponse)
def home(): return html_content

@app.post("/analyze")
async def analyze(task_prompt: str = "<OCR>", file: UploadFile = File(...)):
    if not model: return {"error": "Model failed to load"}
    try:
        img = Image.open(io.BytesIO(await file.read())).convert("RGB")
        inputs = processor(text=task_prompt, images=img, return_tensors="pt").to(device)
        
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False
        )
        text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed = processor.post_process_generation(text, task=task_prompt, image_size=img.size)
        return {"result": str(parsed)}
    except Exception as e:
        return {"error": str(e)}