chenguittiMaroua committed on
Commit
7d57745
·
verified ·
1 Parent(s): bd5c109

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +28 -46
main.py CHANGED
@@ -108,7 +108,7 @@ app.add_middleware(
108
  # Constants
109
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
110
  SUPPORTED_FILE_TYPES = {
111
- "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
112
  }
113
 
114
  # Model caching
@@ -166,6 +166,10 @@ async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
166
  def extract_text(content: bytes, file_ext: str) -> str:
167
  """Extract text from various file formats with enhanced Excel support"""
168
  try:
 
 
 
 
169
  if file_ext == "docx":
170
  doc = Document(io.BytesIO(content))
171
  return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
@@ -801,69 +805,47 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
801
 
802
 
803
  @app.post("/qa")
804
- @limiter.limit("5/minute")
805
  async def question_answering(
806
- request: Request,
807
- file: UploadFile = File(...),
808
  question: str = Form(...),
809
- language: str = Form("fr")
 
810
  ):
811
  try:
812
- file_ext, content = await process_uploaded_file(file)
813
- text = extract_text(content, file_ext)
 
 
 
 
 
 
814
 
 
 
 
815
  if not text.strip():
816
- raise HTTPException(400, "No extractable text found")
817
 
818
  # Clean and truncate text
819
  text = re.sub(r'\s+', ' ', text).strip()[:5000]
820
 
821
- # Theme detection
822
- theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
823
- if any(kw in question.lower() for kw in theme_keywords):
824
- try:
825
- summarizer = get_summarizer()
826
- summary_output = summarizer(
827
- text,
828
- max_length=min(100, len(text)//4),
829
- min_length=30,
830
- do_sample=False,
831
- truncation=True
832
- )
833
-
834
- theme = summary_output[0].get("summary_text", text[:200] + "...")
835
- return {
836
- "question": question,
837
- "answer": f"Le document traite principalement de : {theme}",
838
- "confidence": 0.95,
839
- "language": language
840
- }
841
- except Exception:
842
- theme = text[:200] + ("..." if len(text) > 200 else "")
843
- return {
844
- "question": question,
845
- "answer": f"D'après le document : {theme}",
846
- "confidence": 0.7,
847
- "language": language,
848
- "warning": "theme_summary_fallback"
849
- }
850
 
851
  # Standard QA
852
  qa = get_qa_model()
853
  result = qa(question=question, context=text[:3000])
854
-
855
- return {
856
- "question": question,
857
- "answer": result["answer"],
858
- "confidence": result["score"],
859
- "language": language
860
- }
861
 
862
  except HTTPException:
863
  raise
864
  except Exception as e:
865
- logger.error(f"QA processing failed: {str(e)}")
866
- raise HTTPException(500, detail=f"Analysis failed: {str(e)}")
867
  @app.post("/visualize/natural")
868
  async def natural_language_visualization(
869
  file: UploadFile = File(...),
 
108
  # Constants
109
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
110
  SUPPORTED_FILE_TYPES = {
111
+ "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png", "txt"
112
  }
113
 
114
  # Model caching
 
166
  def extract_text(content: bytes, file_ext: str) -> str:
167
  """Extract text from various file formats with enhanced Excel support"""
168
  try:
169
+ if file_ext == "txt":
170
+ # Decode plain text (handle encoding issues)
171
+ return content.decode("utf-8", errors="replace").strip()
172
+
173
  if file_ext == "docx":
174
  doc = Document(io.BytesIO(content))
175
  return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
 
805
 
806
 
807
  @app.post("/qa")
 
808
  async def question_answering(
809
+ file: UploadFile = File(None), # Make optional for plain text
 
810
  question: str = Form(...),
811
+ text_input: str = Form(None), # Alternative to file upload
812
+ language: str = Form("en")
813
  ):
814
  try:
815
+ # Case 1: User uploaded a file
816
+ if file:
817
+ file_ext, content = await process_uploaded_file(file)
818
+ text = extract_text(content, file_ext)
819
+
820
+ # Case 2: User provided raw text
821
+ elif text_input:
822
+ text = text_input.strip()
823
 
824
+ else:
825
+ raise HTTPException(400, "Either a file or text input is required.")
826
+
827
  if not text.strip():
828
+ raise HTTPException(400, "No usable text found.")
829
 
830
  # Clean and truncate text
831
  text = re.sub(r'\s+', ' ', text).strip()[:5000]
832
 
833
+ # Theme detection (if question asks for topic)
834
+ if any(kw in question.lower() for kw in ["theme", "topic", "subject"]):
835
+ summarizer = get_summarizer()
836
+ summary = summarizer(text, max_length=100, min_length=30)[0]["summary_text"]
837
+ return {"answer": f"The main topic is: {summary}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
 
839
  # Standard QA
840
  qa = get_qa_model()
841
  result = qa(question=question, context=text[:3000])
842
+ return {"answer": result["answer"], "confidence": result["score"]}
 
 
 
 
 
 
843
 
844
  except HTTPException:
845
  raise
846
  except Exception as e:
847
+ logger.error(f"QA failed: {str(e)}")
848
+ raise HTTPException(500, "Internal server error.")
849
  @app.post("/visualize/natural")
850
  async def natural_language_visualization(
851
  file: UploadFile = File(...),