Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -449,7 +449,6 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
|
|
| 449 |
|
| 450 |
|
| 451 |
from typing import Optional
|
| 452 |
-
|
| 453 |
@app.post("/qa")
|
| 454 |
async def question_answering(
|
| 455 |
question: str = Form(...),
|
|
@@ -462,33 +461,10 @@ async def question_answering(
|
|
| 462 |
# Process file with enhanced cleaning
|
| 463 |
context = None
|
| 464 |
if file:
|
| 465 |
-
file_ext = file.filename.split('.')[-1].lower()
|
| 466 |
_, content = await process_uploaded_file(file)
|
| 467 |
-
|
| 468 |
-
#
|
| 469 |
-
if file_ext in {"jpg", "jpeg", "png"}:
|
| 470 |
-
# First try OCR
|
| 471 |
-
try:
|
| 472 |
-
image = Image.open(io.BytesIO(content))
|
| 473 |
-
# Pre-process image for better OCR
|
| 474 |
-
image = image.convert('L') # Convert to grayscale
|
| 475 |
-
text = pytesseract.image_to_string(image, config='--psm 6', lang='fra+eng')
|
| 476 |
-
if text.strip():
|
| 477 |
-
context = clean_and_translate_to_french(text)[:1200]
|
| 478 |
-
else:
|
| 479 |
-
# If OCR fails, try image captioning
|
| 480 |
-
captioner = get_image_captioner()
|
| 481 |
-
result = captioner(image)
|
| 482 |
-
context = clean_and_translate_to_french(result[0]['generated_text'])[:1200]
|
| 483 |
-
except Exception as img_e:
|
| 484 |
-
logger.error(f"Image processing failed: {str(img_e)}")
|
| 485 |
-
raise HTTPException(422, "Could not extract text from image")
|
| 486 |
-
else:
|
| 487 |
-
# Handle non-image files as before
|
| 488 |
-
raw_text = extract_text(content, file_ext)
|
| 489 |
-
context = clean_and_translate_to_french(raw_text)[:1200]
|
| 490 |
|
| 491 |
-
# Rest of your QA processing remains the same...
|
| 492 |
# Theme detection with strict French enforcement
|
| 493 |
if "thème" in question.lower() or "theme" in question.lower():
|
| 494 |
if not context:
|
|
@@ -535,12 +511,54 @@ async def question_answering(
|
|
| 535 |
"context_used": context is not None
|
| 536 |
}
|
| 537 |
|
| 538 |
-
except HTTPException:
|
| 539 |
-
raise
|
| 540 |
except Exception as e:
|
| 541 |
logger.error(f"Erreur: {str(e)}")
|
| 542 |
raise HTTPException(500, "Erreur lors du traitement")
|
| 543 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
|
| 546 |
|
|
|
|
| 449 |
|
| 450 |
|
| 451 |
from typing import Optional
|
|
|
|
| 452 |
@app.post("/qa")
|
| 453 |
async def question_answering(
|
| 454 |
question: str = Form(...),
|
|
|
|
| 461 |
# Process file with enhanced cleaning
|
| 462 |
context = None
|
| 463 |
if file:
|
|
|
|
| 464 |
_, content = await process_uploaded_file(file)
|
| 465 |
+
raw_text = extract_text(content, file.filename.split('.')[-1])
|
| 466 |
+
context = clean_and_translate_to_french(raw_text)[:1200] # New cleaning function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
|
|
|
| 468 |
# Theme detection with strict French enforcement
|
| 469 |
if "thème" in question.lower() or "theme" in question.lower():
|
| 470 |
if not context:
|
|
|
|
| 511 |
"context_used": context is not None
|
| 512 |
}
|
| 513 |
|
|
|
|
|
|
|
| 514 |
except Exception as e:
|
| 515 |
logger.error(f"Erreur: {str(e)}")
|
| 516 |
raise HTTPException(500, "Erreur lors du traitement")
|
| 517 |
|
| 518 |
+
# New helper functions
|
| 519 |
+
# English -> French substitutions applied by clean_and_translate_to_french.
# Identity entries (same spelling in both languages) are kept so the table
# documents every term that was considered.
_FRENCH_REPLACEMENTS = {
    "welcome": "bienvenue",
    "introduction": "introduction",
    "chapter": "chapitre",
    "section": "section",
}

# Whole-word, case-insensitive match with an optional plural "s", so that
# "Chapter" and "chapters" are translated but "unwelcome" is left alone.
_FRENCH_REPLACEMENT_RE = re.compile(
    r"\b(" + "|".join(map(re.escape, _FRENCH_REPLACEMENTS)) + r")(s?)\b",
    re.IGNORECASE,
)


def _translate_term(match):
    """Return the French form of one matched term, keeping its letter case."""
    word, plural = match.group(1), match.group(2)
    french = _FRENCH_REPLACEMENTS[word.lower()]
    if french == word.lower():
        # Same spelling in both languages: leave the original text untouched.
        return match.group(0)
    if word.isupper():
        french = french.upper()
    elif word[0].isupper():
        french = french.capitalize()
    return french + plural


def clean_and_translate_to_french(text: str) -> str:
    """Clean extracted text and translate a few common English terms.

    Lines that contain only digits (optionally surrounded by whitespace),
    such as bare page numbers, are removed. The terms listed in
    ``_FRENCH_REPLACEMENTS`` are then replaced by their French equivalent,
    matching whole words case-insensitively and preserving capitalisation.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, truncated to at most 2000 characters.
    """
    if not text:
        return ""

    # Drop digit-only lines (typically page numbers in headers/footers).
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Convert common English terms to French.
    text = _FRENCH_REPLACEMENT_RE.sub(_translate_term, text)

    return text[:2000]
|
| 533 |
+
|
| 534 |
+
def generate_theme_answer(context: str) -> str:
    """Produce a one-sentence French statement of the theme of ``context``.

    The function makes two calls to the module-level ``qa_pipeline``:
    first to obtain keywords for the text, then to obtain a one-sentence
    summary built from those keywords and the text. Each call's result is
    read as ``result[0]["generated_text"]``.

    Args:
        context: Source text. Only the first 1000 characters are used for
            the keyword prompt and the first 800 for the summary prompt.

    Returns:
        The text after the last ':' of the generated summary, cut at the
        first '.', stripped, with its first letter upper-cased and a
        trailing '.' appended. If the model yields nothing usable the
        result is just ".".
    """
    # Step 1: identify key topics.
    topics_prompt = (
        "Liste 3-5 mots-clés en français représentant les sujets principaux "
        f"de ce texte:\n{context[:1000]}"
    )
    topics = qa_pipeline(topics_prompt, max_length=50)[0]["generated_text"]

    # Step 2: generate the French summary.
    summary_prompt = (
        "Résume en une phrase en français pour un étudiant:\n"
        f"Mots-clés: {topics}\nTexte: {context[:800]}"
    )
    summary = qa_pipeline(summary_prompt, max_length=60)[0]["generated_text"]

    # Step 3: format as a theme answer. Keep what follows the last ':' (drops
    # any echoed "label:" prefix) and stop at the first '.'.
    sentence = summary.split(":")[-1].split(".")[0].strip()

    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle proper nouns and acronyms.
    return sentence[:1].upper() + sentence[1:] + "."
|
| 552 |
+
|
| 553 |
+
# Common English function words with no everyday French homograph. Words
# such as "on", "as", "an", "or" and "but" are deliberately absent because
# they are valid French words.
_ENGLISH_FRAGMENT_RE = re.compile(
    r"\b(?:the|and|is|of|this|that|these|those|with|which|what|from|"
    r"was|were|have|has|not|it|to)\b",
    re.IGNORECASE,
)


def validate_french_response(text: str) -> str:
    """Tidy a generated answer so it reads as a single French sentence.

    Steps:
      1. Remove stray English function words. The earlier pattern
         ``[A-Za-z]{3,}`` deleted every run of three or more ASCII
         letters, which also erased almost all French words.
      2. Collapse the double spaces left by those removals and trim the
         ends.
      3. If the text does not end with '.', '!' or '?', keep only what
         precedes the first '.' and terminate it with '.'.
      4. Upper-case the first character without touching the rest, so
         proper nouns keep their capital letters.

    Args:
        text: Candidate answer; ``None`` is treated as an empty string.

    Returns:
        The cleaned sentence. Empty input yields ".".
    """
    text = _ENGLISH_FRAGMENT_RE.sub("", text or "")

    # Only plain spaces are collapsed so typographic (non-breaking) spaces
    # before French punctuation are left as they are.
    text = re.sub(r" {2,}", " ", text).strip()

    # Ensure proper sentence structure.
    if not text.endswith(('.', '!', '?')):
        text = text.split('.')[0] + '.'

    return text[:1].upper() + text[1:]
|
| 561 |
+
|
| 562 |
|
| 563 |
|
| 564 |
|