Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -727,55 +727,85 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
| 727 |
@limiter.limit("5/minute")
|
| 728 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
| 729 |
try:
|
| 730 |
-
#
|
| 731 |
filename = file.filename.lower()
|
| 732 |
-
|
| 733 |
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
doc = Document(file.file)
|
| 739 |
-
|
| 740 |
-
elif
|
| 741 |
prs = Presentation(file.file)
|
| 742 |
text = []
|
| 743 |
for slide in prs.slides:
|
| 744 |
for shape in slide.shapes:
|
| 745 |
if hasattr(shape, "text"):
|
| 746 |
text.append(shape.text)
|
| 747 |
-
|
| 748 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
content = await file.read()
|
| 750 |
-
text = extract_text(content, 'pdf') # Your existing PDF extraction
|
| 751 |
-
else:
|
| 752 |
-
# For unsupported formats, try to read as plain text
|
| 753 |
try:
|
| 754 |
-
|
| 755 |
except UnicodeDecodeError:
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
if not text.strip():
|
| 759 |
-
raise HTTPException(400, "No extractable text found")
|
| 760 |
-
|
| 761 |
-
# Clean and chunk text
|
| 762 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
| 763 |
-
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 764 |
-
|
| 765 |
-
# Summarize each chunk
|
| 766 |
-
summarizer = get_summarizer()
|
| 767 |
-
summaries = []
|
| 768 |
-
for chunk in chunks:
|
| 769 |
-
summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
|
| 770 |
-
summaries.append(summary)
|
| 771 |
-
|
| 772 |
-
return {"summary": " ".join(summaries)}
|
| 773 |
-
|
| 774 |
-
except HTTPException:
|
| 775 |
-
raise
|
| 776 |
except Exception as e:
|
| 777 |
-
|
| 778 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
|
| 780 |
@app.post("/qa")
|
| 781 |
@limiter.limit("5/minute")
|
|
|
|
| 727 |
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """Summarize an uploaded document.

    Validates the file extension against SUPPORTED_EXTENSIONS, extracts the
    document text, normalizes whitespace, and returns a chunked abstractive
    summary.

    Args:
        request: incoming request (required by the rate limiter decorator).
        file: the uploaded document.

    Returns:
        dict: ``{"summary": <joined per-chunk summaries>}``.

    Raises:
        HTTPException(400): unsupported file format or no extractable text.
        HTTPException(500): any unexpected failure during summarization.
    """
    try:
        # Validate file type.  file.filename may be None (clients are not
        # required to send one) — treat that as an unsupported format rather
        # than crashing with AttributeError on .lower().
        filename = (file.filename or "").lower()
        file_ext = None

        for ext in SUPPORTED_EXTENSIONS:
            if filename.endswith(ext):
                file_ext = ext
                break

        if not file_ext:
            supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format. Supported formats: {supported_formats}"
            )

        # Process file based on extension
        text = await extract_text_from_file(file, file_ext)

        if not text.strip():
            raise HTTPException(400, "The document appears to be empty or contains no extractable text")

        # Clean and chunk text
        text = clean_text(text)
        summary = await generate_summary(text)

        return {"summary": summary}

    except HTTPException as he:
        # Client-caused errors: log at warning level and propagate unchanged.
        logger.warning(f"Client error: {he.detail}")
        raise
    except Exception as e:
        # Anything else is a server-side failure; hide internals from the client.
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
|
| 764 |
+
|
| 765 |
+
async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
    """Extract plain text from an uploaded file based on its extension.

    Supports ``.txt``/``.md``/``.rtf`` (raw UTF-8 decode), ``.docx``,
    ``.pptx``, ``.pdf`` (delegated to the existing ``extract_text`` helper),
    and a best-effort ``.odt`` decode.

    Args:
        file: the uploaded file; read either via ``await file.read()`` or
            through its underlying ``file.file`` object.
        file_ext: the already-validated extension, including the leading dot.

    Returns:
        The extracted text.

    Raises:
        HTTPException(400): when the content cannot be extracted.
    """
    try:
        if file_ext in ('.txt', '.md', '.rtf'):
            return (await file.read()).decode('utf-8')
        elif file_ext == '.docx':
            doc = Document(file.file)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        elif file_ext == '.pptx':
            prs = Presentation(file.file)
            text = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, charts, ...).
                    if hasattr(shape, "text"):
                        text.append(shape.text)
            return "\n".join(text)
        elif file_ext == '.pdf':
            content = await file.read()
            return extract_text(content, 'pdf')  # Your existing PDF extraction
        elif file_ext == '.odt':
            # ODT is really a zip archive; proper parsing would need extra
            # libraries, so attempt a naive decode and reject binary payloads.
            content = await file.read()
            try:
                return content.decode('utf-8')
            except UnicodeDecodeError:
                raise HTTPException(400, "ODT file parsing requires additional libraries")
        else:
            # The caller validates extensions, but fail loudly (instead of
            # implicitly returning None) if an unknown one slips through.
            raise HTTPException(400, f"Unsupported file extension: {file_ext}")
    except HTTPException:
        # Bug fix: HTTPException subclasses Exception, so without this clause
        # the specific 400 messages above were swallowed by the generic
        # handler below and re-wrapped with a less useful detail string.
        raise
    except Exception as e:
        raise HTTPException(400, f"Failed to extract text from file: {str(e)}")
|
| 794 |
+
|
| 795 |
+
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 799 |
+
|
| 800 |
+
async def generate_summary(text: str, chunk_size: int = 1000) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is sliced into fixed-size character chunks, each chunk is run
    through the summarization pipeline, and the per-chunk summaries are
    concatenated with single spaces.
    """
    summarizer = get_summarizer()
    pieces = (text[start:start + chunk_size] for start in range(0, len(text), chunk_size))
    summaries = [
        summarizer(piece, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        for piece in pieces
    ]
    return " ".join(summaries)
|
| 809 |
|
| 810 |
@app.post("/qa")
|
| 811 |
@limiter.limit("5/minute")
|