Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -728,30 +728,78 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
| 728 |
@app.post("/summarize")
|
| 729 |
@limiter.limit("5/minute")
|
| 730 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
try:
|
|
|
|
| 732 |
file_ext, content = await process_uploaded_file(file)
|
|
|
|
|
|
|
| 733 |
text = extract_text(content, file_ext)
|
| 734 |
|
| 735 |
if not text.strip():
|
| 736 |
raise HTTPException(400, "No extractable text found")
|
| 737 |
|
| 738 |
-
# Clean
|
| 739 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 740 |
-
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 741 |
|
| 742 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
summarizer = get_summarizer()
|
|
|
|
|
|
|
| 744 |
summaries = []
|
| 745 |
for chunk in chunks:
|
| 746 |
-
|
| 747 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 748 |
|
| 749 |
-
return {"summary":
|
| 750 |
|
| 751 |
except HTTPException:
|
| 752 |
raise
|
| 753 |
except Exception as e:
|
| 754 |
-
logger.error(f"Summarization failed: {str(e)}")
|
| 755 |
raise HTTPException(500, "Document summarization failed")
|
| 756 |
@app.post("/qa")
|
| 757 |
@limiter.limit("5/minute")
|
|
|
|
| 728 |
def _split_into_chunks(text: str, max_chars: int = 1000) -> list:
    """Pack the sentences of *text* into chunks of at most *max_chars* characters.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed by
    spaces. Consecutive sentences are joined with a single space for as long
    as the chunk stays within the limit.

    A "sentence" longer than *max_chars* (for example text that contains no
    sentence punctuation at all) is sliced into fixed-size pieces, so no chunk
    grows without bound. No empty chunk is ever emitted.
    """
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?]) +', text):
        # range() yields nothing for an empty sentence, so empties are skipped.
        for start in range(0, len(sentence), max_chars):
            piece = sentence[start:start + max_chars]
            candidate = f"{current} {piece}" if current else piece
            if len(candidate) <= max_chars:
                current = candidate
            else:
                # Only reachable when `current` is non-empty: a lone piece is
                # never longer than max_chars.
                chunks.append(current.strip())
                current = piece
    if current.strip():
        chunks.append(current.strip())
    return chunks


@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Summarize the text content of an uploaded document.

    The upload is validated and read by ``process_uploaded_file`` and turned
    into text by ``extract_text``; the supported formats are whatever those
    helpers accept (the original docstring lists PDF, Word, Excel, PowerPoint
    and images).

    Args:
        request: Incoming request. Not used in the body; presumably required
            by the rate-limiter decorator.
        file: The uploaded document.

    Returns:
        ``{"summary": <str>}`` with a condensed version of the document text.

    Raises:
        HTTPException: 400 when no text could be extracted; 500 on any other
            unexpected failure. HTTPExceptions raised by the helpers are
            propagated unchanged.
    """
    try:
        file_ext, content = await process_uploaded_file(file)
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Collapse all whitespace runs so chunk sizes reflect real content.
        text = re.sub(r'\s+', ' ', text).strip()

        # Sentence-aware chunking, bounded even for punctuation-free text.
        chunks = _split_into_chunks(text)

        summarizer = get_summarizer()

        # NOTE(review): summarizer(...) is called synchronously inside this
        # async handler; if it is CPU-heavy it will hold up the event loop.
        # Consider offloading it to a worker thread.
        summaries = []
        for chunk in chunks:
            try:
                summary = summarizer(
                    chunk,
                    max_length=150,
                    min_length=50,
                    do_sample=False,
                    truncation=True,
                )[0]["summary_text"]
            except Exception as chunk_error:
                # Best effort: one bad chunk must not fail the whole request.
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                # Fallback: the first 3 sentences of the chunk, with their
                # original punctuation preserved.
                summary = " ".join(re.split(r'(?<=[.!?]) +', chunk)[:3])
            summaries.append(summary)

        combined_summary = " ".join(summaries)
        combined_summary = re.sub(r'\s+', ' ', combined_summary).strip()

        # If the stitched summary is still long, condense it once more.
        if len(combined_summary.split()) > 300:
            try:
                combined_summary = summarizer(
                    combined_summary,
                    max_length=200,
                    min_length=100,
                    do_sample=False,
                    truncation=True,
                )[0]["summary_text"]
            except Exception as final_error:
                # Keep the longer per-chunk summary rather than discarding
                # work that already succeeded.
                logger.warning(f"Failed to condense combined summary: {str(final_error)}")

        return {"summary": combined_summary}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
|
| 804 |
@app.post("/qa")
|
| 805 |
@limiter.limit("5/minute")
|