chenguittiMaroua committed on
Commit
752eaa7
·
verified ·
1 Parent(s): e7a980f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +46 -28
main.py CHANGED
@@ -449,7 +449,6 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
449
 
450
 
451
  from typing import Optional
452
-
453
  @app.post("/qa")
454
  async def question_answering(
455
  question: str = Form(...),
@@ -462,33 +461,10 @@ async def question_answering(
462
  # Process file with enhanced cleaning
463
  context = None
464
  if file:
465
- file_ext = file.filename.split('.')[-1].lower()
466
  _, content = await process_uploaded_file(file)
467
-
468
- # Handle image files differently
469
- if file_ext in {"jpg", "jpeg", "png"}:
470
- # First try OCR
471
- try:
472
- image = Image.open(io.BytesIO(content))
473
- # Pre-process image for better OCR
474
- image = image.convert('L') # Convert to grayscale
475
- text = pytesseract.image_to_string(image, config='--psm 6', lang='fra+eng')
476
- if text.strip():
477
- context = clean_and_translate_to_french(text)[:1200]
478
- else:
479
- # If OCR fails, try image captioning
480
- captioner = get_image_captioner()
481
- result = captioner(image)
482
- context = clean_and_translate_to_french(result[0]['generated_text'])[:1200]
483
- except Exception as img_e:
484
- logger.error(f"Image processing failed: {str(img_e)}")
485
- raise HTTPException(422, "Could not extract text from image")
486
- else:
487
- # Handle non-image files as before
488
- raw_text = extract_text(content, file_ext)
489
- context = clean_and_translate_to_french(raw_text)[:1200]
490
 
491
- # Rest of your QA processing remains the same...
492
  # Theme detection with strict French enforcement
493
  if "thème" in question.lower() or "theme" in question.lower():
494
  if not context:
@@ -535,12 +511,54 @@ async def question_answering(
535
  "context_used": context is not None
536
  }
537
 
538
- except HTTPException:
539
- raise
540
  except Exception as e:
541
  logger.error(f"Erreur: {str(e)}")
542
  raise HTTPException(500, "Erreur lors du traitement")
543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
 
545
 
546
 
 
449
 
450
 
451
  from typing import Optional
 
452
  @app.post("/qa")
453
  async def question_answering(
454
  question: str = Form(...),
 
461
  # Process file with enhanced cleaning
462
  context = None
463
  if file:
 
464
  _, content = await process_uploaded_file(file)
465
+ raw_text = extract_text(content, file.filename.split('.')[-1])
466
+ context = clean_and_translate_to_french(raw_text)[:1200] # New cleaning function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
 
468
  # Theme detection with strict French enforcement
469
  if "thème" in question.lower() or "theme" in question.lower():
470
  if not context:
 
511
  "context_used": context is not None
512
  }
513
 
 
 
514
  except Exception as e:
515
  logger.error(f"Erreur: {str(e)}")
516
  raise HTTPException(500, "Erreur lors du traitement")
517
 
518
+ # New helper functions
519
# English -> French term map used by clean_and_translate_to_french.
# Entries spelled identically in both languages are kept on purpose to record
# that they were considered; substituting them is a no-op.
_FR_TERM_REPLACEMENTS = {
    "welcome": "bienvenue",
    "introduction": "introduction",
    "chapter": "chapitre",
    "section": "section",
}

# Whole-word, case-insensitive match with an optional plural "s", so that
# "Chapter"/"CHAPTERS" are translated while words that merely contain a term
# (e.g. "welcomed") are left untouched.
_FR_TERM_PATTERN = re.compile(
    r"\b(" + "|".join(map(re.escape, _FR_TERM_REPLACEMENTS)) + r")(s?)\b",
    re.IGNORECASE,
)

# A line that holds nothing but digits (typically a page number).
_PAGE_NUMBER_LINE = re.compile(r'^\s*\d+\s*$', re.MULTILINE)


def _translate_term(match):
    """Return the French term for a regex match, mirroring the source casing."""
    word, plural = match.group(1), match.group(2)
    french = _FR_TERM_REPLACEMENTS[word.lower()]
    if word.isupper():
        french = french.upper()
    elif word[0].isupper():
        french = french.capitalize()
    return french + plural


def clean_and_translate_to_french(text: str) -> str:
    """Clean extracted text and translate a few common English terms to French.

    Digit-only lines are blanked out, then each whole-word occurrence of a
    term in ``_FR_TERM_REPLACEMENTS`` is replaced by its French counterpart
    regardless of case (the original case-sensitive ``str.replace`` missed
    capitalised headings such as "Chapter" and corrupted words like
    "welcomed").

    Args:
        text: Raw text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, truncated to at most 2000 characters; ``""`` when
        *text* is empty or ``None``.
    """
    if not text:
        return ""
    # Remove headers/footers that consist only of a number.
    text = _PAGE_NUMBER_LINE.sub('', text)
    # Convert common English terms to French, preserving capitalisation.
    text = _FR_TERM_PATTERN.sub(_translate_term, text)
    return text[:2000]
533
+
534
def generate_theme_answer(context: str) -> str:
    """Build a one-sentence French theme description for *context*.

    Runs two generation passes through ``qa_pipeline`` (defined elsewhere in
    this file): the first asks for keywords, the second asks for a
    one-sentence summary that is seeded with those keywords.

    Args:
        context: Document text. Only the first 1000 characters are sent to
            the keyword prompt and the first 800 to the summary prompt.

    Returns:
        The text after the last ":" of the generated summary, cut at the
        first ".", stripped, with its first character upper-cased and a
        trailing "." appended. An empty generation therefore yields ".".
    """
    # Step 1: Identify key topics
    topics_prompt = (
        "Liste 3-5 mots-clés en français représentant les sujets principaux "
        f"de ce texte:\n{context[:1000]}"
    )
    topics = qa_pipeline(topics_prompt, max_length=50)[0]["generated_text"]

    # Step 2: Generate French summary
    summary_prompt = (
        "Résume en une phrase en français pour un étudiant:\n"
        f"Mots-clés: {topics}\nTexte: {context[:800]}"
    )
    summary = qa_pipeline(summary_prompt, max_length=60)[0]["generated_text"]

    # Step 3: Format as theme answer.
    # Keep what follows the last colon (drops any echoed "label:" prefix) and
    # stop at the first full stop.
    sentence = summary.split(":")[-1].split(".")[0].strip()
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle proper nouns and acronyms.
    return sentence[:1].upper() + sentence[1:] + "."
552
+
553
def validate_french_response(text: str) -> str:
    """Normalise a generated answer into a single well-formed sentence block.

    The previous implementation deleted every run of three or more ASCII
    letters in order to "remove English fragments". French is written mostly
    with those same letters, so that step erased most of the answer
    (e.g. "principal", "est", "biologie"). Language cannot be told apart by
    character class alone, so the destructive filter is dropped and only the
    structural clean-up is kept.

    Args:
        text: Generated answer text. ``None`` or blank input is accepted.

    Returns:
        ``""`` for blank input. Otherwise the stripped text; if it does not
        already end with ".", "!" or "?", it is cut at the first "." (or left
        whole when there is none) and a "." is appended. The first character
        is upper-cased and the rest is left untouched.
    """
    text = (text or "").strip()
    if not text:
        return ""
    # Ensure proper sentence structure: an answer that stops mid-sentence is
    # reduced to its first sentence and explicitly terminated.
    if not text.endswith(('.', '!', '?')):
        text = text.split('.')[0] + '.'
    # Upper-case only the first character; str.capitalize() would lower-case
    # proper nouns such as "Paris".
    return text[:1].upper() + text[1:]
561
+
562
 
563
 
564