# NOTE: removed 363 lines of web-scrape residue (git short hashes and a
# copied line-number gutter) that preceded the code and made this file
# invalid Python.
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from transformers import pipeline
from typing import Optional
import io
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
import uvicorn
from docx import Document
from pptx import Presentation
import pytesseract
import logging
import re

# Configure logging for the whole service; endpoints log via this module logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# CORS Configuration
# NOTE(review): wildcard origins/methods/headers let any web page call this
# API — acceptable for a demo, but tighten allow_origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Constants
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB upload cap, enforced in process_uploaded_file
SUPPORTED_FILE_TYPES = {
    "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
}

# Model caching: pipelines start as None and are loaded lazily on first use
# (see the get_* helpers below) so server startup stays fast and memory is
# only spent on models that are actually requested.
summarizer = None
qa_model = None
image_captioner = None

def get_summarizer():
    """Return the shared summarization pipeline, loading it on first call."""
    global summarizer
    if summarizer is not None:
        return summarizer
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return summarizer

def get_qa_model():
    """Return the shared question-answering pipeline, loading it on first call."""
    global qa_model
    if qa_model is not None:
        return qa_model
    qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
    return qa_model

def get_image_captioner():
    """Return the shared image-captioning pipeline, loading it on first call.

    NOTE(review): no visible endpoint in this file calls this helper —
    confirm it is used elsewhere or remove it.
    """
    global image_captioner
    if image_captioner is not None:
        return image_captioner
    image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    return image_captioner

async def process_uploaded_file(file: UploadFile):
    """Validate an upload and return ``(extension, raw_bytes)``.

    Raises HTTPException(400) for a missing filename or unsupported
    extension, and HTTPException(413) when the payload exceeds
    MAX_FILE_SIZE.
    """
    if not file.filename:
        raise HTTPException(400, "No file provided")

    # Extension is everything after the last dot, lower-cased.
    ext = file.filename.rsplit('.', 1)[-1].lower()
    if ext not in SUPPORTED_FILE_TYPES:
        raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}")

    data = await file.read()
    if len(data) > MAX_FILE_SIZE:
        raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")

    return ext, data

def extract_text(content: bytes, file_ext: str) -> str:
    """Extract plain text from an uploaded document.

    Args:
        content: raw file bytes.
        file_ext: lower-cased extension (already validated by the caller).

    Returns:
        The extracted text; an empty string when no handler matches the
        extension (previously this fell through returning None, and callers
        immediately call .strip() on the result).

    Raises:
        HTTPException(422): when the underlying parser fails.
    """
    try:
        if file_ext == "docx":
            doc = Document(io.BytesIO(content))
            return " ".join(p.text for p in doc.paragraphs if p.text.strip())

        elif file_ext in {"xls", "xlsx"}:
            # NOTE(review): only the first column is read — presumably
            # intentional, but confirm if full-sheet extraction is wanted.
            df = pd.read_excel(io.BytesIO(content))
            return " ".join(df.iloc[:, 0].dropna().astype(str).tolist())

        elif file_ext == "pptx":
            ppt = Presentation(io.BytesIO(content))
            return " ".join(shape.text for slide in ppt.slides
                          for shape in slide.shapes if hasattr(shape, "text"))

        elif file_ext == "pdf":
            # Close the fitz document deterministically — the original
            # leaked it on every request.
            pdf = fitz.open(stream=content, filetype="pdf")
            try:
                text = []
                for page in pdf:
                    page_text = page.get_text("text")
                    if page_text.strip():
                        text.append(page_text)
                return " ".join(text)
            finally:
                pdf.close()

        elif file_ext in {"jpg", "jpeg", "png"}:
            image = Image.open(io.BytesIO(content))
            # --psm 6: treat the image as a single uniform block of text.
            return pytesseract.image_to_string(image, config='--psm 6')

        # Defensive fallback for an unhandled (but validated) extension.
        return ""

    except Exception as e:
        logger.error(f"Text extraction failed: {str(e)}")
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(422, f"Failed to extract text from {file_ext} file") from e

def _chunk_text(text: str, size: int) -> list:
    """Split *text* into chunks of at most *size* characters, breaking on a
    space where possible so words are never cut in half."""
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + size, n)
        if end < n:
            # Back up to the last space inside the window, if any.
            space = text.rfind(' ', start, end)
            if space > start:
                end = space
        chunks.append(text[start:end])
        # Skip the separating space so chunks don't start with one.
        start = end + 1 if end < n and text[end] == ' ' else end
    return chunks


@app.post("/summarize")
async def summarize_document(file: UploadFile = File(...)):
    """Summarize an uploaded document.

    Extracts text, normalizes whitespace, splits it into ~1000-character
    chunks on word boundaries (the original sliced mid-word, which degrades
    BART output), summarizes each chunk, and joins the partial summaries.

    Raises:
        HTTPException(400): no extractable text.
        HTTPException(500): any unexpected failure.
    """
    try:
        file_ext, content = await process_uploaded_file(file)
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Clean and chunk text
        text = re.sub(r'\s+', ' ', text).strip()
        chunks = _chunk_text(text, 1000)

        # Renamed local: the original shadowed the module-level `summarizer`.
        model = get_summarizer()
        summaries = [
            model(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
            for chunk in chunks
        ]

        return {"summary": " ".join(summaries)}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}")
        raise HTTPException(500, "Document summarization failed")

@app.post("/qa")
async def question_answering(
    file: UploadFile = File(...),
    question: str = Form(...),
    language: str = Form("fr")
):
    """Answer a question about an uploaded document.

    Theme-style questions (French/English keywords) are routed to the
    summarizer for a topic description; everything else goes to the
    extractive QA model.

    Raises:
        HTTPException(400): no extractable text.
        HTTPException(500): any unexpected failure.
    """
    try:
        file_ext, content = await process_uploaded_file(file)
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Clean text
        text = re.sub(r'\s+', ' ', text).strip()

        # Handle theme questions
        theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
        if any(kw in question.lower() for kw in theme_keywords):
            # Use summarization for theme detection. Truncate the input: the
            # original passed the whole document, but BART cannot accept
            # arbitrarily long inputs (the /summarize endpoint chunks for
            # exactly this reason), and the opening text is usually enough
            # to identify the theme.
            model = get_summarizer()  # renamed: no longer shadows the global
            theme = model(text[:3000], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            return {
                "question": question,
                "answer": f"Le document traite principalement de : {theme}",
                "confidence": 0.95,  # NOTE(review): hard-coded, not a real model score
                "language": language
            }

        # Standard QA processing
        qa = get_qa_model()
        result = qa(question=question, context=text)

        return {
            "question": question,
            "answer": result["answer"],
            "confidence": result["score"],
            "language": language
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"QA processing failed: {str(e)}")
        raise HTTPException(500, "Document analysis failed")

# Development entry point: serve on all interfaces.
# NOTE(review): port 7860 suggests a Hugging Face Spaces deployment — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)