Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

asm-app / main.py

chenguittiMaroua

Update main.py

32c93ca verified 8 months ago

raw

history blame

22.5 kB

	from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse
	from transformers import pipeline
	from typing import Tuple, Optional
	import io
	import fitz # PyMuPDF
	from PIL import Image
	import pandas as pd
	import uvicorn
	from docx import Document
	from pptx import Presentation
	import pytesseract
	import logging
	import re
	from slowapi import Limiter
	from slowapi.util import get_remote_address
	from slowapi.errors import RateLimitExceeded
	from slowapi.middleware import SlowAPIMiddleware
	import matplotlib.pyplot as plt
	import seaborn as sns
	import tempfile
	import base64
	from io import BytesIO
	from pydantic import BaseModel
	import traceback
	import ast
	from fastapi.responses import HTMLResponse
	from fastapi import Request
	from pathlib import Path
	from fastapi.staticfiles import StaticFiles
	import numpy as np # Add this import
	import pandas as pd
	from io import BytesIO
	import os
	import torch
	# Standard library imports
	import io
	import re
	import logging
	import tempfile
	import base64
	import warnings
	from typing import Tuple, Optional
	from pathlib import Path
	from docx import Document
	from pptx import Presentation
	import re
	from concurrent.futures import ThreadPoolExecutor
	from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse, HTMLResponse
	from transformers import pipeline
	import fitz # PyMuPDF
	from PIL import Image
	import pandas as pd
	import uvicorn
	from docx import Document
	from pptx import Presentation
	import pytesseract
	from slowapi import Limiter
	from slowapi.util import get_remote_address
	from slowapi.errors import RateLimitExceeded
	from slowapi.middleware import SlowAPIMiddleware
	import matplotlib.pyplot as plt
	import seaborn as sns
	from pydantic import BaseModel
	import traceback
	import ast
	from openpyxl import Workbook
	import uuid
	# Suppress openpyxl warnings
	warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

	# Application bootstrap: rate limiter, logging and the FastAPI instance.
	# Initialize rate limiter (requests are keyed with get_remote_address)
	limiter = Limiter(key_func=get_remote_address)

	# Configure logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	app = FastAPI()


	# Serve static files (frontend) under the /static URL prefix.
	# NOTE(review): this mount is created before os.makedirs(OUTPUT_FOLDER) runs
	# further down, so the "static" directory presumably has to exist already
	# when the module is imported - confirm.
	app.mount("/static", StaticFiles(directory="static"), name="static")


	@app.get("/", response_class=HTMLResponse)
	def home():
	    """Serve the front-end landing page.

	    Returns:
	        The text of ``static/indexAI.html``; the route declares
	        ``HTMLResponse`` so it is sent back as HTML.
	    """
	    # Decode explicitly as UTF-8 so the page does not depend on the
	    # platform's default locale encoding.
	    with open("static/indexAI.html", "r", encoding="utf-8") as file:
	        return file.read()


	# Apply rate limiting middleware.
	# The limiter is stored on app.state; presumably SlowAPIMiddleware reads it
	# from there - confirm against the slowapi documentation.
	app.state.limiter = limiter
	app.add_middleware(SlowAPIMiddleware)

	# CORS Configuration: every origin, method and header is allowed.
	# NOTE(review): wildcard origins together with allow_credentials=True is very
	# permissive - confirm this is intended outside of development.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)


	# Folders for uploads and for generated output; created if missing.
	UPLOAD_FOLDER = "uploads"
	OUTPUT_FOLDER = "static"
	os.makedirs(UPLOAD_FOLDER, exist_ok=True)
	os.makedirs(OUTPUT_FOLDER, exist_ok=True)

	# Lightweight model configuration
	MODEL_NAME = "distilgpt2"
	MAX_FILE_SIZE = 2 * 1024 * 1024  # 2MB
	TIMEOUT = 10  # seconds
	MAX_ROWS = 100
	MAX_COLUMNS = 5

	# Load the text-generation model used by /generate-visualization.
	try:
	    visualization_model = pipeline(
	        "text-generation",
	        model=MODEL_NAME,
	        device=-1,  # CPU
	        framework="pt",
	    )
	except Exception as e:
	    # Deliberate best-effort: the app still starts without the model and the
	    # visualization endpoint uses its fallback plot. Report through the
	    # module logger (not print) so the failure shows up with the other logs.
	    logger.error(f"Model loading failed: {str(e)}")
	    visualization_model = None

	# Worker pool for blocking work.
	executor = ThreadPoolExecutor(max_workers=2)

	def safe_read_file(file_content, file_ext):
	    """Load at most MAX_ROWS rows of an uploaded table into a DataFrame.

	    ``file_content`` is the raw file bytes. ``file_ext`` selects the parser:
	    'csv' is read with ``pd.read_csv``, anything else with ``pd.read_excel``.
	    """
	    buffer = io.BytesIO(file_content)
	    reader = pd.read_csv if file_ext == 'csv' else pd.read_excel
	    return reader(buffer, nrows=MAX_ROWS)

	def generate_simple_plot(df, chart_type):
	    """Draw a fallback chart for *df* on a new matplotlib figure.

	    The chart is left as pyplot's current figure so the caller can save it.

	    Args:
	        df: DataFrame to plot.
	        chart_type: Requested chart kind. Only 'bar', 'line' and 'scatter'
	            are honoured (and only when at least two numeric columns
	            exist); anything else is drawn as a bar chart.

	    With two or more numeric columns the first two are plotted; with one,
	    that column is plotted as bars; with none, the value counts of the
	    first column are plotted as bars.
	    """
	    # Create the figure and axes here and hand the axes to pandas.
	    # DataFrame.plot() without ``ax`` opens its own figure, which left the
	    # 8x5 figure empty and never closed.
	    _, ax = plt.subplots(figsize=(8, 5))
	    numeric_cols = df.select_dtypes(include='number').columns

	    if len(numeric_cols) >= 2:
	        kind = chart_type if chart_type in ('bar', 'line', 'scatter') else 'bar'
	        x_col, y_col = numeric_cols[:2]
	        if kind == 'scatter':
	            # A scatter plot needs explicit x and y columns; without them
	            # pandas raises instead of plotting.
	            df.plot(kind='scatter', x=x_col, y=y_col, ax=ax)
	        else:
	            df[[x_col, y_col]].plot(kind=kind, ax=ax)
	    elif len(numeric_cols) == 1:
	        df[numeric_cols[0]].plot(kind='bar', ax=ax)
	    else:
	        df.iloc[:, 0].value_counts().plot(kind='bar', ax=ax)

	    plt.tight_layout()


	# File extensions (lower-case, without the dot) accepted by process_uploaded_file().
	# NOTE(review): extract_text() also has an "xls" branch, but "xls" is not
	# listed here - confirm whether it should be.
	SUPPORTED_FILE_TYPES = {
	"docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png", "txt"
	}

	# Model caching: these globals start as None and hold pipelines once loaded.
	summarizer = None
	qa_model = None  # only referenced by the commented-out get_qa_model() below
	image_captioner = None

	def get_summarizer():
	    """Return the shared summarization pipeline, creating it on first use."""
	    global summarizer
	    if summarizer is not None:
	        return summarizer
	    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
	    return summarizer

	#def get_qa_model():
	# global qa_model
	# if qa_model is None:
	# qa_model= pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")
	#return qa_model




	# Candidate models for the QA pipeline. initialize_qa() tries them in this
	# order and keeps the first one that loads.
	MODEL_CHOICES = [
	"cmarkea/flan-t5-base-fr", # Best for French
	"moussaKam/barthez-orangesum-abstract", # French summarization
	"google/flan-t5-xl" # Higher quality fallback
	]

	# Set by initialize_qa(): the loaded pipeline and the name of its model.
	qa_pipeline = None
	current_model = None

	def initialize_qa():
	    """Load the first usable model from MODEL_CHOICES into the QA globals.

	    Returns True once a model has loaded (``qa_pipeline`` and
	    ``current_model`` are then set), or False when every candidate failed.
	    """
	    global qa_pipeline, current_model

	    # Walk the candidates in their listed order; the first success wins.
	    for candidate in MODEL_CHOICES:
	        try:
	            logger.info(f"Attempting to load {candidate}")

	            on_gpu = torch.cuda.is_available()
	            qa_pipeline = pipeline(
	                "text2text-generation",
	                model=candidate,
	                device=0 if on_gpu else -1,
	                torch_dtype=torch.float16 if on_gpu else torch.float32,
	            )

	            current_model = candidate
	            logger.info(f"Successfully loaded {candidate}")
	            return True

	        except Exception as e:
	            # Fall through to the next candidate.
	            logger.warning(f"Failed to load {candidate}: {str(e)}")

	    logger.error("All model loading attempts failed")
	    return False

	@app.on_event("startup")
	async def startup_event():
	    """Startup hook: load the QA model and log an error if none could be loaded."""
	    qa_ready = initialize_qa()
	    if qa_ready:
	        return
	    logger.error("QA system failed to initialize")















	def get_image_captioner():
	    """Return the shared image-to-text pipeline, creating it on first use."""
	    global image_captioner
	    if image_captioner is not None:
	        return image_captioner
	    image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
	    return image_captioner

	async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
	    """Validate an uploaded file and return its extension and raw bytes.

	    Args:
	        file: The upload received by an endpoint.

	    Returns:
	        ``(file_ext, content)`` where ``file_ext`` is the lower-cased text
	        after the last dot of the filename and ``content`` is the file body.

	    Raises:
	        HTTPException: 400 if the filename is missing or the extension is
	            not in SUPPORTED_FILE_TYPES; 413 if the body exceeds
	            MAX_FILE_SIZE; 422 if a PDF cannot be opened, is encrypted with
	            a non-empty password, or has more than 50 pages.
	    """
	    if not file.filename:
	        raise HTTPException(400, "No filename provided")

	    file_ext = file.filename.split('.')[-1].lower()
	    if file_ext not in SUPPORTED_FILE_TYPES:
	        # Sort so the message is stable (set iteration order is not).
	        supported = ', '.join(sorted(SUPPORTED_FILE_TYPES))
	        raise HTTPException(400, f"Unsupported file type. Supported: {supported}")

	    # Read at most one byte past the limit: enough to detect an oversized
	    # upload without loading an arbitrarily large body into memory.
	    content = await file.read(MAX_FILE_SIZE + 1)
	    if len(content) > MAX_FILE_SIZE:
	        raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")

	    if file_ext == "pdf":
	        try:
	            with fitz.open(stream=content, filetype="pdf") as doc:
	                # Encrypted PDFs are accepted only if the empty password opens them.
	                if doc.is_encrypted and not doc.authenticate(""):
	                    raise ValueError("Encrypted PDF - cannot extract text")
	                if len(doc) > 50:
	                    raise ValueError("PDF too large (max 50 pages)")
	        except Exception as e:
	            logger.error(f"PDF validation failed: {str(e)}")
	            raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}") from e

	    # Rewind so the upload can be read again by the caller if needed.
	    await file.seek(0)
	    return file_ext, content

	def extract_text(content: bytes, file_ext: str) -> str:
	    """Extract plain text from the raw bytes of an uploaded document.

	    Args:
	        content: Raw file bytes.
	        file_ext: File extension without the dot; matched case-insensitively.
	            Handled: txt, docx, xlsx/xls, pptx, pdf, jpg/jpeg/png.

	    Returns:
	        The extracted text. For images this is the OCR result or, when OCR
	        finds nothing, a generated caption.

	    Raises:
	        HTTPException: 422 when the type is not handled or extraction fails.
	    """
	    # Normalise so that "PDF" and "pdf" take the same branch.
	    file_ext = file_ext.lower()
	    try:
	        if file_ext == "txt":
	            return content.decode("utf-8", errors="replace").strip()

	        if file_ext == "docx":
	            doc = Document(io.BytesIO(content))
	            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

	        if file_ext in {"xlsx", "xls"}:
	            # sheet_name=None returns a dict of {sheet name: DataFrame}.
	            sheets = pd.read_excel(
	                io.BytesIO(content),
	                sheet_name=None,
	                engine='openpyxl',
	                na_values=['', 'NA', 'N/A', 'NaN', 'null'],
	                keep_default_na=False,
	                parse_dates=True
	            )

	            all_text = []
	            for sheet_name, sheet_data in sheets.items():
	                sheet_text = []
	                for column in sheet_data.columns:
	                    if pd.api.types.is_datetime64_any_dtype(sheet_data[column]):
	                        sheet_data[column] = sheet_data[column].dt.strftime('%Y-%m-%d %H:%M:%S')
	                    # Blank out the string forms of missing values.
	                    col_text = sheet_data[column].astype(str).replace(['nan', 'None', 'NaT'], '').tolist()
	                    sheet_text.extend([x for x in col_text if x.strip()])

	                all_text.append(f"Sheet: {sheet_name}\n" + "\n".join(sheet_text))

	            return "\n\n".join(all_text)

	        if file_ext == "pptx":
	            ppt = Presentation(io.BytesIO(content))
	            text = []
	            for slide in ppt.slides:
	                for shape in slide.shapes:
	                    if hasattr(shape, "text") and shape.text.strip():
	                        text.append(shape.text)
	            return "\n".join(text)

	        if file_ext == "pdf":
	            # Context manager so the document is closed after reading.
	            with fitz.open(stream=content, filetype="pdf") as pdf:
	                return "\n".join(page.get_text("text") for page in pdf)

	        if file_ext in {"jpg", "jpeg", "png"}:
	            try:
	                image = Image.open(io.BytesIO(content))
	                text = pytesseract.image_to_string(image, config='--psm 6')
	                if text.strip():
	                    return text

	                # OCR found nothing: fall back to an image caption.
	                captioner = get_image_captioner()
	                result = captioner(image)
	                return result[0]['generated_text']
	            except Exception as img_e:
	                logger.error(f"Image processing failed: {str(img_e)}")
	                raise ValueError("Could not extract text or caption from image") from img_e

	        # No branch matched: fail loudly instead of returning None, which
	        # callers would then try to treat as a string.
	        raise ValueError(f"Unsupported file type: {file_ext}")

	    except Exception as e:
	        logger.error(f"Text extraction failed for {file_ext}: {str(e)}", exc_info=True)
	        raise HTTPException(422, f"Failed to extract text from {file_ext} file: {str(e)}") from e




























	from concurrent.futures import ThreadPoolExecutor
	import asyncio

	# Global thread pool for CPU-bound tasks.
	# NOTE: this rebinds the module-level `executor` that was created earlier in
	# this file with max_workers=2.
	executor = ThreadPoolExecutor(max_workers=4)

	def _chunk_sentences(text, max_chars=800):
	    """Group the sentences of *text* into chunks of about *max_chars* characters."""
	    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
	    chunks = []
	    current_chunk = []
	    current_length = 0

	    for sentence in sentences:
	        sent_length = len(sentence)
	        # Flush only a non-empty chunk: a first sentence longer than
	        # max_chars must not produce an empty chunk.
	        if current_chunk and current_length + sent_length > max_chars:
	            chunks.append(' '.join(current_chunk))
	            current_chunk = []
	            current_length = 0
	        current_chunk.append(sentence)
	        current_length += sent_length

	    if current_chunk:
	        chunks.append(' '.join(current_chunk))
	    return chunks


	def _summarize_text(text):
	    """Blocking helper: clean, chunk and summarize *text*; return the summary."""
	    # Collapse all whitespace runs before splitting into sentences.
	    text = re.sub(r'\s+', ' ', text).strip()
	    chunks = _chunk_sentences(text)
	    model = get_summarizer()

	    def summarize_chunk(chunk):
	        return model(
	            chunk,
	            max_length=120,
	            min_length=40,
	            do_sample=False,
	            truncation=True
	        )[0]["summary_text"]

	    # Summarize the chunks concurrently.
	    # NOTE(review): this assumes the pipeline may be called from several
	    # threads at once - confirm.
	    with ThreadPoolExecutor(max_workers=max(1, min(4, len(chunks)))) as pool:
	        summaries = list(pool.map(summarize_chunk, chunks))

	    combined = ' '.join(summaries)

	    # Single refinement pass when the joined summaries are still long.
	    if len(combined.split()) > 200:
	        combined = summarize_chunk(combined[:3000])  # Limit input size
	    return combined


	@app.post("/summarize")
	@limiter.limit("5/minute")
	async def summarize_document(request: Request, file: UploadFile = File(...)):
	    """Summarize an uploaded document.

	    Args:
	        request: Incoming request (taken by the rate-limit decorator).
	        file: The document to summarize.

	    Returns:
	        ``{"summary": <text>}``.

	    Raises:
	        HTTPException: 400 when no text could be extracted; errors raised by
	            file validation or extraction are passed through; any other
	            failure becomes a 500.
	    """
	    try:
	        file_ext, content = await process_uploaded_file(file)

	        # Extraction and summarization are blocking, so both run in the
	        # worker pool and the event loop stays free for other requests.
	        loop = asyncio.get_running_loop()
	        text = await loop.run_in_executor(executor, extract_text, content, file_ext)

	        if not text or not text.strip():
	            raise HTTPException(400, "No extractable text found")

	        summary = await loop.run_in_executor(executor, _summarize_text, text)
	        return {"summary": summary}

	    except HTTPException:
	        raise
	    except Exception as e:
	        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
	        raise HTTPException(500, "Document summarization failed")



	from typing import Optional

	@app.post("/qa")
	async def question_answering(
	    question: str = Form(...),
	    file: Optional[UploadFile] = File(None)
	):
	    """Answer a question in French, optionally using an uploaded document.

	    Args:
	        question: The user's question.
	        file: Optional document whose text is used as context.

	    Returns:
	        A dict with "question", "answer" and "context_used", plus "model"
	        whenever the QA pipeline produced the answer.

	    Raises:
	        HTTPException: 503 when no QA model is loaded; errors raised by file
	            validation or extraction are passed through unchanged; any
	            other failure becomes a 500.
	    """
	    if qa_pipeline is None:
	        raise HTTPException(503, detail="Service temporairement indisponible")

	    try:
	        context = None
	        if file:
	            # Use the extension returned by process_uploaded_file (already
	            # validated and lower-cased) rather than re-parsing the filename.
	            file_ext, content = await process_uploaded_file(file)
	            raw_text = extract_text(content, file_ext)
	            context = clean_and_translate_to_french(raw_text or "")[:1200]

	        # Questions about the document's theme take a dedicated path.
	        if "thème" in question.lower() or "theme" in question.lower():
	            if not context:
	                return {
	                    "question": question,
	                    "answer": "Aucun document fourni pour identifier le thème",
	                    "context_used": False
	                }

	            response = generate_theme_answer(context)

	            return {
	                "question": question,
	                "answer": response,
	                "model": current_model,
	                "context_used": True
	            }

	        # Standard QA: the prompt asks for a French answer based on the context.
	        input_text = (
	            f"En tant qu'expert, réponds en français à la question suivante "
	            f"en utilisant exclusivement le contexte fourni:\n"
	            f"Question: {question}\n"
	            f"Contexte: {context[:1000] if context else 'Aucun contexte disponible'}\n"
	            f"Réponse concise:"
	        )

	        result = qa_pipeline(
	            input_text,
	            max_length=80,
	            num_beams=2,
	            temperature=0.2,
	            repetition_penalty=3.0
	        )

	        final_answer = validate_french_response(result[0]["generated_text"])

	        return {
	            "question": question,
	            "answer": final_answer,
	            "model": current_model,
	            "context_used": context is not None
	        }

	    except HTTPException:
	        # Keep the 400/413/422 raised during file handling instead of
	        # turning them into a generic 500.
	        raise
	    except Exception as e:
	        logger.error(f"Erreur: {str(e)}", exc_info=True)
	        raise HTTPException(500, "Erreur lors du traitement")

	# New helper functions
	def clean_and_translate_to_french(text: str) -> str:
	    """Strip page-number lines and swap a few English terms for French ones.

	    This is a small fixed substitution table, not a real translation.

	    Args:
	        text: Raw extracted document text.

	    Returns:
	        The cleaned text, truncated to at most 2000 characters.
	    """
	    # Drop lines holding nothing but a number (typical page-number
	    # headers/footers), whatever the amount of space/tab padding.
	    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

	    # Only pairs that change the text are listed; "introduction" and
	    # "section" are spelled the same in both languages.
	    replacements = {
	        "welcome": "bienvenue",
	        "chapter": "chapitre",
	    }
	    for eng, fr in replacements.items():
	        text = text.replace(eng, fr)
	    return text[:2000]

	def generate_theme_answer(context: str) -> str:
	    """Build a one-sentence theme statement for *context* with two QA-pipeline calls.

	    The first call asks for keywords, the second for a one-sentence summary
	    that uses them; the summary is then trimmed to a single sentence.
	    """
	    # First pass: keywords describing the main topics.
	    keyword_prompt = (
	        "Liste 3-5 mots-clés en français représentant les sujets principaux "
	        f"de ce texte:\n{context[:1000]}"
	    )
	    keyword_output = qa_pipeline(keyword_prompt, max_length=50)
	    topics = keyword_output[0]["generated_text"]

	    # Second pass: one-sentence summary guided by those keywords.
	    sentence_prompt = (
	        "Résume en une phrase en français pour un étudiant:\n"
	        f"Mots-clés: {topics}\nTexte: {context[:800]}"
	    )
	    sentence_output = qa_pipeline(sentence_prompt, max_length=60)
	    generated = sentence_output[0]["generated_text"]

	    # Keep what follows the last colon, up to the first full stop.
	    after_colon = generated.split(":")[-1]
	    first_sentence = after_colon.split(".")[0]
	    return first_sentence.strip().capitalize() + "."

	def validate_french_response(text: str) -> str:
	    """Tidy a generated answer for display.

	    The text is trimmed. If it does not end with '.', '!' or '?', it is cut
	    at the first full stop (dropping a trailing unfinished sentence) and
	    closed with '.'. Finally the first character is upper-cased.

	    Args:
	        text: Raw text produced by the model.

	    Returns:
	        The cleaned answer, or "" when *text* is empty or whitespace.
	    """
	    # No regex-based "English removal" here: a pattern such as
	    # [A-Za-z]{3,} also matches nearly every French word and would delete
	    # most of the answer.
	    text = text.strip()
	    if not text:
	        return ""

	    if not text.endswith(('.', '!', '?')):
	        text = text.split('.')[0].strip() + '.'

	    # Upper-case only the first character; str.capitalize() would also
	    # lower-case the rest and damage proper nouns.
	    return text[0].upper() + text[1:]




	def _fallback_plot_code(df, chart_type):
	    """Return a standalone Python snippet that reproduces the fallback chart for *df*."""
	    numeric_cols = df.select_dtypes(include='number').columns.tolist()

	    if len(numeric_cols) >= 2:
	        x_col, y_col = numeric_cols[:2]
	        kind = chart_type if chart_type in ('bar', 'line', 'scatter') else 'bar'
	        if kind == 'scatter':
	            # A scatter plot needs explicit x and y columns.
	            plot_call = f"df.plot(kind='scatter', x={x_col!r}, y={y_col!r})"
	        else:
	            plot_call = f"df.plot(kind='{kind}')"
	        body = [
	            f"data = {df[[x_col, y_col]].to_dict()}",
	            "df = pd.DataFrame(data)",
	            plot_call,
	        ]
	    elif len(numeric_cols) == 1:
	        column = numeric_cols[0]
	        body = [
	            f"data = {df[column].to_dict()}",
	            # A flat {index: value} dict has to become a Series;
	            # pd.DataFrame(data) raises on all-scalar values.
	            f"series = pd.Series(data, name={column!r})",
	            "series.plot(kind='bar')",
	        ]
	    else:
	        body = [
	            f"data = {df.iloc[:, 0].value_counts().to_dict()}",
	            "df = pd.DataFrame(list(data.items()), columns=['Category', 'Count'])",
	            "df.plot(x='Category', y='Count', kind='bar')",
	        ]

	    lines = (
	        ["import pandas as pd", "import matplotlib.pyplot as plt"]
	        + body
	        + ["plt.tight_layout()", "plt.show()"]
	    )
	    return "\n" + "\n".join(lines) + "\n"


	@app.post("/generate-visualization")
	async def generate_visualization(
	    file: UploadFile = File(...),
	    request: str = Form(...),
	    chart_type: Optional[str] = Form("auto")
	):
	    """Create a chart image from an uploaded CSV/Excel file.

	    Args:
	        file: CSV or Excel upload (csv, xlsx, xls).
	        request: Free-text description sent by the client.
	            NOTE(review): it is not used when building the model prompt - confirm.
	        chart_type: Requested chart kind.

	    Returns:
	        JSONResponse with "image_url", "python_code", "columns" and "note".

	    Raises:
	        HTTPException: 400 for a missing filename, an unsupported
	            extension, an oversized file or empty data; 500 for any other
	            failure.
	    """
	    try:
	        # 1. Validate input
	        if not file.filename:
	            raise HTTPException(400, "No filename provided")
	        file_ext = file.filename.split('.')[-1].lower()
	        if file_ext not in ['csv', 'xlsx', 'xls']:
	            raise HTTPException(400, "Only CSV/Excel files accepted")

	        file_content = await file.read()
	        if len(file_content) > MAX_FILE_SIZE:
	            raise HTTPException(400, f"File size exceeds {MAX_FILE_SIZE//1024}KB limit")

	        # 2. Parse the file in the worker pool (blocking work)
	        df = await asyncio.get_running_loop().run_in_executor(
	            executor, safe_read_file, file_content, file_ext
	        )

	        # Keep only the first MAX_COLUMNS columns; drop fully empty rows
	        df = df.iloc[:, :MAX_COLUMNS].dropna(how='all')
	        if df.empty:
	            raise HTTPException(400, "No plottable data found")

	        # 3. Generate visualization
	        plt.switch_backend('Agg')
	        generated_code = None

	        if visualization_model:
	            try:
	                prompt = f"Create {chart_type} chart for {list(df.columns)}. Python code only:"
	                code = visualization_model(
	                    prompt,
	                    max_length=300,
	                    num_return_sequences=1,
	                    temperature=0.3
	                )[0]['generated_text'].split("```python")[-1].split("```")[0].strip()

	                if code:
	                    # SECURITY(review): this executes model-generated code in
	                    # the server process, and the prompt contains column
	                    # names taken from the uploaded file. Replace with a
	                    # sandbox or a fixed set of plot templates.
	                    exec(code, {'df': df, 'plt': plt})
	                    generated_code = code
	            except Exception as e:
	                logger.warning(f"Model failed, using fallback: {e}")
	                # Discard any figure the failed code left half-drawn.
	                plt.close('all')
	                generated_code = None

	        if generated_code is None:
	            # No model, empty output or failed execution: draw the simple
	            # chart and return the snippet that reproduces it.
	            generate_simple_plot(df, chart_type)
	            generated_code = _fallback_plot_code(df, chart_type)

	        # 4. Save output
	        output_id = uuid.uuid4().hex[:8]
	        image_path = f"{OUTPUT_FOLDER}/plot_{output_id}.png"
	        try:
	            plt.savefig(image_path, bbox_inches='tight', dpi=80)
	        finally:
	            # Close every open figure so none accumulates across requests.
	            plt.close('all')

	        return JSONResponse({
	            "image_url": f"/static/plot_{output_id}.png",
	            "python_code": generated_code,
	            "columns": list(df.columns),
	            "note": "Visualization generated successfully"
	        })

	    except HTTPException:
	        raise
	    except Exception as e:
	        raise HTTPException(500, f"Processing error: {str(e)}") from e

	@app.get("/static/{filename}")
	async def serve_static(filename: str):
	    """Return a file from OUTPUT_FOLDER by name.

	    NOTE(review): the app also mounts StaticFiles at "/static" earlier in
	    this file; that mount presumably answers these URLs first - confirm
	    whether this route is still reachable.

	    Raises:
	        HTTPException: 404 when *filename* is not a plain file name or the
	            file does not exist.
	    """
	    # FileResponse is not imported at module level; import it here so the
	    # handler does not fail with NameError.
	    from fastapi.responses import FileResponse

	    # Accept plain file names only, so the lookup stays inside OUTPUT_FOLDER.
	    if os.path.basename(filename) != filename:
	        raise HTTPException(404, "Image not found")

	    file_path = f"{OUTPUT_FOLDER}/{filename}"
	    if not os.path.isfile(file_path):
	        raise HTTPException(404, "Image not found")
	    return FileResponse(file_path)














	# ===== Script entry point =====
	if __name__ == "__main__":
	    # Start the FastAPI application with uvicorn when run as a script.
	    server_options = {
	        "host": "0.0.0.0",
	        "port": 8000,
	        "timeout_keep_alive": 15,
	    }
	    uvicorn.run(app, **server_options)

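	# ===== TESTING CODE (OPTIONAL) =====
	# Disabled example, kept below as a string literal; it belongs in a separate test file, not in main.py.
	# NOTE: it posts to "/visualize/natural" and reads result["image"]; neither appears in the visible part of this file.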
	"""
	def test_visualization():
	from fastapi.testclient import TestClient
	from io import BytesIO
	import base64
	from PIL import Image
	import matplotlib.pyplot as plt

	client = TestClient(app)

	test_file = "test.xlsx"
	test_prompt = "Show me a bar chart of sales by region"

	with open(test_file, "rb") as f:
	response = client.post(
	"/visualize/natural",
	files={"file": ("test.xlsx", f, "application/vnd.ms-excel")},
	data={"prompt": test_prompt}
	)

	if response.status_code == 200:
	result = response.json()
	print("Visualization generated successfully!")
	image_data = result["image"].split(",")[1]
	image_bytes = base64.b64decode(image_data)
	image = Image.open(BytesIO(image_bytes))
	plt.imshow(image)
	plt.axis("off")
	plt.show()
	else:
	print(f"Error: {response.status_code}\n{response.text}")
	"""