Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -727,55 +727,85 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
| 727 |
@limiter.limit("5/minute")
|
| 728 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
| 729 |
try:
|
| 730 |
-
#
|
| 731 |
filename = file.filename.lower()
|
| 732 |
-
|
| 733 |
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
doc = Document(file.file)
|
| 739 |
-
|
| 740 |
-
elif
|
| 741 |
prs = Presentation(file.file)
|
| 742 |
text = []
|
| 743 |
for slide in prs.slides:
|
| 744 |
for shape in slide.shapes:
|
| 745 |
if hasattr(shape, "text"):
|
| 746 |
text.append(shape.text)
|
| 747 |
-
|
| 748 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
content = await file.read()
|
| 750 |
-
text = extract_text(content, 'pdf') # Your existing PDF extraction
|
| 751 |
-
else:
|
| 752 |
-
# For unsupported formats, try to read as plain text
|
| 753 |
try:
|
| 754 |
-
|
| 755 |
except UnicodeDecodeError:
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
if not text.strip():
|
| 759 |
-
raise HTTPException(400, "No extractable text found")
|
| 760 |
-
|
| 761 |
-
# Clean and chunk text
|
| 762 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
| 763 |
-
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 764 |
-
|
| 765 |
-
# Summarize each chunk
|
| 766 |
-
summarizer = get_summarizer()
|
| 767 |
-
summaries = []
|
| 768 |
-
for chunk in chunks:
|
| 769 |
-
summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
|
| 770 |
-
summaries.append(summary)
|
| 771 |
-
|
| 772 |
-
return {"summary": " ".join(summaries)}
|
| 773 |
-
|
| 774 |
-
except HTTPException:
|
| 775 |
-
raise
|
| 776 |
except Exception as e:
|
| 777 |
-
|
| 778 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
|
| 780 |
@app.post("/qa")
|
| 781 |
@limiter.limit("5/minute")
|
|
|
|
| 727 |
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """Summarize an uploaded document.

    Validates the file extension against SUPPORTED_EXTENSIONS, extracts the
    document text, normalizes whitespace, and returns a chunked abstractive
    summary.

    Args:
        request: incoming request (required by the rate limiter decorator).
        file: the uploaded document.

    Returns:
        dict: ``{"summary": <joined per-chunk summaries>}``.

    Raises:
        HTTPException(400): unsupported file format or no extractable text.
        HTTPException(500): any unexpected failure during summarization.
    """
    try:
        # Validate file type.  file.filename may be None (clients are not
        # required to send one) — treat that as an unsupported format rather
        # than crashing with AttributeError on .lower().
        filename = (file.filename or "").lower()
        file_ext = None

        for ext in SUPPORTED_EXTENSIONS:
            if filename.endswith(ext):
                file_ext = ext
                break

        if not file_ext:
            supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format. Supported formats: {supported_formats}"
            )

        # Process file based on extension
        text = await extract_text_from_file(file, file_ext)

        if not text.strip():
            raise HTTPException(400, "The document appears to be empty or contains no extractable text")

        # Clean and chunk text
        text = clean_text(text)
        summary = await generate_summary(text)

        return {"summary": summary}

    except HTTPException as he:
        # Client-caused errors: log at warning level and propagate unchanged.
        logger.warning(f"Client error: {he.detail}")
        raise
    except Exception as e:
        # Anything else is a server-side failure; hide internals from the client.
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
|
| 764 |
+
|
| 765 |
+
async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
    """Extract plain text from an uploaded file based on its extension.

    Supports ``.txt``/``.md``/``.rtf`` (raw UTF-8 decode), ``.docx``,
    ``.pptx``, ``.pdf`` (delegated to the existing ``extract_text`` helper),
    and a best-effort ``.odt`` decode.

    Args:
        file: the uploaded file; read either via ``await file.read()`` or
            through its underlying ``file.file`` object.
        file_ext: the already-validated extension, including the leading dot.

    Returns:
        The extracted text.

    Raises:
        HTTPException(400): when the content cannot be extracted.
    """
    try:
        if file_ext in ('.txt', '.md', '.rtf'):
            return (await file.read()).decode('utf-8')
        elif file_ext == '.docx':
            doc = Document(file.file)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        elif file_ext == '.pptx':
            prs = Presentation(file.file)
            text = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, charts, ...).
                    if hasattr(shape, "text"):
                        text.append(shape.text)
            return "\n".join(text)
        elif file_ext == '.pdf':
            content = await file.read()
            return extract_text(content, 'pdf')  # Your existing PDF extraction
        elif file_ext == '.odt':
            # ODT is really a zip archive; proper parsing would need extra
            # libraries, so attempt a naive decode and reject binary payloads.
            content = await file.read()
            try:
                return content.decode('utf-8')
            except UnicodeDecodeError:
                raise HTTPException(400, "ODT file parsing requires additional libraries")
        else:
            # The caller validates extensions, but fail loudly (instead of
            # implicitly returning None) if an unknown one slips through.
            raise HTTPException(400, f"Unsupported file extension: {file_ext}")
    except HTTPException:
        # Bug fix: HTTPException subclasses Exception, so without this clause
        # the specific 400 messages above were swallowed by the generic
        # handler below and re-wrapped with a less useful detail string.
        raise
    except Exception as e:
        raise HTTPException(400, f"Failed to extract text from file: {str(e)}")
|
| 794 |
+
|
| 795 |
+
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 799 |
+
|
| 800 |
+
async def generate_summary(text: str, chunk_size: int = 1000) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is sliced into fixed-size character chunks, each chunk is run
    through the summarization pipeline, and the per-chunk summaries are
    concatenated with single spaces.
    """
    summarizer = get_summarizer()
    pieces = (text[start:start + chunk_size] for start in range(0, len(text), chunk_size))
    summaries = [
        summarizer(piece, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        for piece in pieces
    ]
    return " ".join(summaries)
|
| 809 |
|
| 810 |
@app.post("/qa")
|
| 811 |
@limiter.limit("5/minute")
|