Update main.py
main.py CHANGED
@@ -723,18 +723,26 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
 
 
 
+SUPPORTED_EXTENSIONS: Dict[str, str] = {
+    '.txt': 'text/plain',
+    '.md': 'text/markdown',
+    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    '.pdf': 'application/pdf',
+    '.rtf': 'application/rtf',
+    '.odt': 'application/vnd.oasis.opendocument.text'
+}
+
 @app.post("/summarize")
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
         # Validate file type
+        if not file.filename:
+            raise HTTPException(status_code=400, detail="No filename provided")
+
         filename = file.filename.lower()
-        file_ext = None
-
-        for ext in SUPPORTED_EXTENSIONS:
-            if filename.endswith(ext):
-                file_ext = ext
-                break
+        file_ext = next((ext for ext in SUPPORTED_EXTENSIONS if filename.endswith(ext)), None)
 
         if not file_ext:
             supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
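Note on the extension lookup above: next() with a default of None returns the first supported suffix that matches, or None when nothing matches, which is what the later "if not file_ext" branch checks. A small self-contained sketch of that behaviour (the two-entry mapping and detect_extension helper here are illustrative, not part of the diff):

from typing import Optional

SUPPORTED_EXTENSIONS = {'.txt': 'text/plain', '.pdf': 'application/pdf'}

def detect_extension(filename: str) -> Optional[str]:
    # First supported suffix wins; None signals an unsupported file type.
    name = filename.lower()
    return next((ext for ext in SUPPORTED_EXTENSIONS if name.endswith(ext)), None)

print(detect_extension("Report.PDF"))   # '.pdf'
print(detect_extension("archive.zip"))  # None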
@@ -747,7 +755,10 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
         text = await extract_text_from_file(file, file_ext)
 
         if not text.strip():
-            raise HTTPException(
+            raise HTTPException(
+                status_code=400,
+                detail="The document appears to be empty or contains no extractable text"
+            )
 
         # Clean and chunk text
         text = clean_text(text)
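For reference, FastAPI serializes an HTTPException like the one added above into a JSON body of the form {"detail": ...} with the given status code. A minimal sketch using a hypothetical /empty route that mirrors the error branch (not the real /summarize endpoint):

from fastapi import FastAPI, HTTPException
from fastapi.testclient import TestClient

app = FastAPI()

@app.post("/empty")
def empty():
    # Mirrors the empty-document branch from the diff.
    raise HTTPException(
        status_code=400,
        detail="The document appears to be empty or contains no extractable text"
    )

client = TestClient(app)
response = client.post("/empty")
print(response.status_code)  # 400
print(response.json())       # {'detail': 'The document appears to be empty or contains no extractable text'}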
@@ -787,10 +798,15 @@ async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
     try:
         return content.decode('utf-8')
     except UnicodeDecodeError:
-
-
+        raise HTTPException(
+            status_code=400,
+            detail="ODT file parsing requires additional libraries"
+        )
     except Exception as e:
-        raise HTTPException(
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to extract text from file: {str(e)}"
+        )
 
 def clean_text(text: str) -> str:
     """Clean and normalize text"""
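One detail worth noting in the hunk above: UnicodeDecodeError is a subclass of Exception, so the specific except clause has to come before the generic one or it would never be reached. A standalone sketch of that ordering (not the module's extract_text_from_file):

def decode_or_report(content: bytes) -> str:
    try:
        return content.decode('utf-8')
    except UnicodeDecodeError:
        # Hit for byte sequences that are not valid UTF-8.
        return "<not valid UTF-8>"
    except Exception as e:
        # Any other failure falls through to the generic handler.
        return f"<unexpected error: {e}>"

print(decode_or_report(b"hello"))         # hello
print(decode_or_report(b"\xff\xfe\x9c"))  # <not valid UTF-8>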
@@ -803,10 +819,18 @@ async def generate_summary(text: str, chunk_size: int = 1000) -> str:
     summarizer = get_summarizer()
     summaries = []
     for chunk in chunks:
-        summary = summarizer(
+        summary = summarizer(
+            chunk,
+            max_length=150,
+            min_length=50,
+            do_sample=False
+        )[0]["summary_text"]
         summaries.append(summary)
     return " ".join(summaries)
 
+
+
+
 @app.post("/qa")
 @limiter.limit("5/minute")
 async def question_answering(
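The summarizer call added above follows the Hugging Face transformers pipeline API: the pipeline returns a list of dicts, and [0]["summary_text"] picks the generated summary for the single input chunk. A sketch of what get_summarizer() is assumed to return (the actual loader and model are not shown in this diff; the checkpoint below is a placeholder):

from functools import lru_cache
from transformers import pipeline

@lru_cache(maxsize=1)
def get_summarizer():
    # Assumption: a cached summarization pipeline; the model name is illustrative.
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

summarizer = get_summarizer()
chunk = "Long input text that needs summarizing. " * 40
summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
print(summary)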