chenguittiMaroua committed on
Commit
66d89ea
·
verified ·
1 Parent(s): 3388479

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +11 -6
main.py CHANGED
@@ -43,7 +43,9 @@ import base64
43
  import warnings
44
  from typing import Tuple, Optional
45
  from pathlib import Path
46
-
 
 
47
  # Third-party imports
48
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
49
  from fastapi.middleware.cors import CORSMiddleware
@@ -727,6 +729,7 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
727
  try:
728
  # Check file type
729
  filename = file.filename.lower()
 
730
 
731
  # Process different file types
732
  if filename.endswith(('.txt', '.md')):
@@ -746,10 +749,12 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
746
  content = await file.read()
747
  text = extract_text(content, 'pdf') # Your existing PDF extraction
748
  else:
749
- # Fallback to textract for other formats (rtf, etc.)
750
- content = await file.read()
751
- text = textract.process(content).decode('utf-8')
752
-
 
 
753
  if not text.strip():
754
  raise HTTPException(400, "No extractable text found")
755
 
@@ -769,7 +774,7 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
769
  except HTTPException:
770
  raise
771
  except Exception as e:
772
- logger.error(f"Summarization failed: {str(e)}")
773
  raise HTTPException(500, "Document summarization failed")
774
 
775
  @app.post("/qa")
 
43
  import warnings
44
  from typing import Tuple, Optional
45
  from pathlib import Path
46
+ from docx import Document
47
+ from pptx import Presentation
48
+ import re
49
  # Third-party imports
50
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
51
  from fastapi.middleware.cors import CORSMiddleware
 
729
  try:
730
  # Check file type
731
  filename = file.filename.lower()
732
+ text = ""
733
 
734
  # Process different file types
735
  if filename.endswith(('.txt', '.md')):
 
749
  content = await file.read()
750
  text = extract_text(content, 'pdf') # Your existing PDF extraction
751
  else:
752
+ # For unsupported formats, try to read as plain text
753
+ try:
754
+ text = (await file.read()).decode('utf-8')
755
+ except UnicodeDecodeError:
756
+ raise HTTPException(400, "Unsupported file format")
757
+
758
  if not text.strip():
759
  raise HTTPException(400, "No extractable text found")
760
 
 
774
  except HTTPException:
775
  raise
776
  except Exception as e:
777
+ logger.error(f"Summarization failed: {str(e)}", exc_info=True)
778
  raise HTTPException(500, "Document summarization failed")
779
 
780
  @app.post("/qa")