chenguittiMaroua committed on
Commit
15f5c7f
·
verified ·
1 Parent(s): 66d89ea

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +66 -36
main.py CHANGED
@@ -727,55 +727,85 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
727
  @limiter.limit("5/minute")
728
  async def summarize_document(request: Request, file: UploadFile = File(...)):
729
  try:
730
- # Check file type
731
  filename = file.filename.lower()
732
- text = ""
733
 
734
- # Process different file types
735
- if filename.endswith(('.txt', '.md')):
736
- text = (await file.read()).decode('utf-8')
737
- elif filename.endswith('.docx'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
  doc = Document(file.file)
739
- text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
740
- elif filename.endswith('.pptx'):
741
  prs = Presentation(file.file)
742
  text = []
743
  for slide in prs.slides:
744
  for shape in slide.shapes:
745
  if hasattr(shape, "text"):
746
  text.append(shape.text)
747
- text = "\n".join(text)
748
- elif filename.endswith('.pdf'):
 
 
 
 
749
  content = await file.read()
750
- text = extract_text(content, 'pdf') # Your existing PDF extraction
751
- else:
752
- # For unsupported formats, try to read as plain text
753
  try:
754
- text = (await file.read()).decode('utf-8')
755
  except UnicodeDecodeError:
756
- raise HTTPException(400, "Unsupported file format")
757
-
758
- if not text.strip():
759
- raise HTTPException(400, "No extractable text found")
760
-
761
- # Clean and chunk text
762
- text = re.sub(r'\s+', ' ', text).strip()
763
- chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
764
-
765
- # Summarize each chunk
766
- summarizer = get_summarizer()
767
- summaries = []
768
- for chunk in chunks:
769
- summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
770
- summaries.append(summary)
771
-
772
- return {"summary": " ".join(summaries)}
773
-
774
- except HTTPException:
775
- raise
776
  except Exception as e:
777
- logger.error(f"Summarization failed: {str(e)}", exc_info=True)
778
- raise HTTPException(500, "Document summarization failed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
779
 
780
  @app.post("/qa")
781
  @limiter.limit("5/minute")
 
727
  @limiter.limit("5/minute")
728
  async def summarize_document(request: Request, file: UploadFile = File(...)):
729
  try:
730
+ # Validate file type
731
  filename = file.filename.lower()
732
+ file_ext = None
733
 
734
+ for ext in SUPPORTED_EXTENSIONS:
735
+ if filename.endswith(ext):
736
+ file_ext = ext
737
+ break
738
+
739
+ if not file_ext:
740
+ supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
741
+ raise HTTPException(
742
+ status_code=400,
743
+ detail=f"Unsupported file format. Supported formats: {supported_formats}"
744
+ )
745
+
746
+ # Process file based on extension
747
+ text = await extract_text_from_file(file, file_ext)
748
+
749
+ if not text.strip():
750
+ raise HTTPException(400, "The document appears to be empty or contains no extractable text")
751
+
752
+ # Clean and chunk text
753
+ text = clean_text(text)
754
+ summary = await generate_summary(text)
755
+
756
+ return {"summary": summary}
757
+
758
+ except HTTPException as he:
759
+ logger.warning(f"Client error: {he.detail}")
760
+ raise
761
+ except Exception as e:
762
+ logger.error(f"Summarization failed: {str(e)}", exc_info=True)
763
+ raise HTTPException(500, "Document summarization failed")
764
+
765
async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
    """Extract plain text from an uploaded file, dispatching on its extension.

    Args:
        file: The uploaded file. Its content is read either through
            ``await file.read()`` or through the underlying ``file.file``
            object, depending on the format.
        file_ext: Extension (including the leading dot) that selects the
            extraction strategy: ``.txt``, ``.md``, ``.rtf``, ``.docx``,
            ``.pptx``, ``.pdf`` or ``.odt``.

    Returns:
        The extracted text.

    Raises:
        HTTPException: Status 400 when extraction fails, when an ODT upload
            cannot be decoded as UTF-8, or when ``file_ext`` matches none of
            the handled extensions.
    """
    try:
        if file_ext in ('.txt', '.md', '.rtf'):
            # NOTE(review): '.rtf' is decoded verbatim, so any RTF control
            # words in the upload end up in the returned text.
            return (await file.read()).decode('utf-8')

        if file_ext == '.docx':
            doc = Document(file.file)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)

        if file_ext == '.pptx':
            prs = Presentation(file.file)
            slide_texts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Only shapes exposing a ``text`` attribute contribute.
                    if hasattr(shape, "text"):
                        slide_texts.append(shape.text)
            return "\n".join(slide_texts)

        if file_ext == '.pdf':
            content = await file.read()
            return extract_text(content, 'pdf')

        if file_ext == '.odt':
            # Only a UTF-8 decode is attempted; a real ODT archive is a zip
            # file and is reported to the client as unsupported.
            content = await file.read()
            try:
                return content.decode('utf-8')
            except UnicodeDecodeError:
                raise HTTPException(
                    400, "ODT file parsing requires additional libraries"
                ) from None
    except HTTPException:
        # Deliberate client errors raised above must reach the caller intact
        # instead of being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(400, f"Failed to extract text from file: {str(e)}") from e

    # No branch matched: fail explicitly rather than implicitly returning
    # None, which would violate the declared ``str`` return type.
    raise HTTPException(400, f"Unsupported file format: {file_ext}")
794
+
795
def clean_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: Raw text to normalize.

    Returns:
        ``text`` with every run of whitespace replaced by one space and
        leading/trailing whitespace removed.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
799
+
800
async def generate_summary(text: str, chunk_size: int = 1000) -> str:
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is cut into consecutive slices of at most ``chunk_size``
    characters. Each slice is passed to the callable returned by
    ``get_summarizer()`` and the resulting ``summary_text`` values are joined
    with single spaces.

    Args:
        text: Text to summarize.
        chunk_size: Maximum number of characters per slice; must be positive.

    Returns:
        The space-joined chunk summaries, or ``""`` when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    import asyncio  # local import keeps this block self-contained

    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently yields no chunks; reject both up front.
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        # Nothing to summarize, so skip obtaining the summarizer entirely.
        return ""

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    summarizer = get_summarizer()

    def _summarize_chunks() -> str:
        """Run the summarizer over every chunk, in order."""
        return " ".join(
            summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
            for chunk in chunks
        )

    # The summarizer calls are synchronous; running them in the default
    # executor keeps the event loop free to serve other requests meanwhile.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _summarize_chunks)
809
 
810
  @app.post("/qa")
811
  @limiter.limit("5/minute")