Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -108,7 +108,7 @@ app.add_middleware(
|
|
| 108 |
# Constants
|
| 109 |
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
|
| 110 |
SUPPORTED_FILE_TYPES = {
|
| 111 |
-
"docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
|
| 112 |
}
|
| 113 |
|
| 114 |
# Model caching
|
|
@@ -166,6 +166,10 @@ async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
|
|
| 166 |
def extract_text(content: bytes, file_ext: str) -> str:
|
| 167 |
"""Extract text from various file formats with enhanced Excel support"""
|
| 168 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
if file_ext == "docx":
|
| 170 |
doc = Document(io.BytesIO(content))
|
| 171 |
return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
|
|
@@ -801,69 +805,47 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
|
|
| 801 |
|
| 802 |
|
| 803 |
@app.post("/qa")
|
| 804 |
-
@limiter.limit("5/minute")
|
| 805 |
async def question_answering(
|
| 806 |
-
|
| 807 |
-
file: UploadFile = File(...),
|
| 808 |
question: str = Form(...),
|
| 809 |
-
|
|
|
|
| 810 |
):
|
| 811 |
try:
|
| 812 |
-
|
| 813 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
|
|
|
|
|
|
|
|
|
|
| 815 |
if not text.strip():
|
| 816 |
-
raise HTTPException(400, "No
|
| 817 |
|
| 818 |
# Clean and truncate text
|
| 819 |
text = re.sub(r'\s+', ' ', text).strip()[:5000]
|
| 820 |
|
| 821 |
-
# Theme detection
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
summary_output = summarizer(
|
| 827 |
-
text,
|
| 828 |
-
max_length=min(100, len(text)//4),
|
| 829 |
-
min_length=30,
|
| 830 |
-
do_sample=False,
|
| 831 |
-
truncation=True
|
| 832 |
-
)
|
| 833 |
-
|
| 834 |
-
theme = summary_output[0].get("summary_text", text[:200] + "...")
|
| 835 |
-
return {
|
| 836 |
-
"question": question,
|
| 837 |
-
"answer": f"Le document traite principalement de : {theme}",
|
| 838 |
-
"confidence": 0.95,
|
| 839 |
-
"language": language
|
| 840 |
-
}
|
| 841 |
-
except Exception:
|
| 842 |
-
theme = text[:200] + ("..." if len(text) > 200 else "")
|
| 843 |
-
return {
|
| 844 |
-
"question": question,
|
| 845 |
-
"answer": f"D'après le document : {theme}",
|
| 846 |
-
"confidence": 0.7,
|
| 847 |
-
"language": language,
|
| 848 |
-
"warning": "theme_summary_fallback"
|
| 849 |
-
}
|
| 850 |
|
| 851 |
# Standard QA
|
| 852 |
qa = get_qa_model()
|
| 853 |
result = qa(question=question, context=text[:3000])
|
| 854 |
-
|
| 855 |
-
return {
|
| 856 |
-
"question": question,
|
| 857 |
-
"answer": result["answer"],
|
| 858 |
-
"confidence": result["score"],
|
| 859 |
-
"language": language
|
| 860 |
-
}
|
| 861 |
|
| 862 |
except HTTPException:
|
| 863 |
raise
|
| 864 |
except Exception as e:
|
| 865 |
-
logger.error(f"QA
|
| 866 |
-
raise HTTPException(500,
|
| 867 |
@app.post("/visualize/natural")
|
| 868 |
async def natural_language_visualization(
|
| 869 |
file: UploadFile = File(...),
|
|
|
|
| 108 |
# Constants
# Upload size ceiling in bytes: 10 * 1024 * 1024 = 10,485,760 (10MB).
MAX_FILE_SIZE = 10 * 1024 ** 2

# File extensions accepted for upload, written lowercase and without a
# leading dot.
SUPPORTED_FILE_TYPES = {
    "docx",
    "xlsx",
    "pptx",
    "pdf",
    "jpg",
    "jpeg",
    "png",
    "txt",
}
|
| 113 |
|
| 114 |
# Model caching
|
|
|
|
| 166 |
def extract_text(content: bytes, file_ext: str) -> str:
|
| 167 |
"""Extract text from various file formats with enhanced Excel support"""
|
| 168 |
try:
|
| 169 |
+
if file_ext == "txt":
|
| 170 |
+
# Decode plain text (handle encoding issues)
|
| 171 |
+
return content.decode("utf-8", errors="replace").strip()
|
| 172 |
+
|
| 173 |
if file_ext == "docx":
|
| 174 |
doc = Document(io.BytesIO(content))
|
| 175 |
return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
|
|
|
|
| 805 |
|
| 806 |
|
| 807 |
@app.post("/qa")
async def question_answering(
    file: UploadFile = File(None),  # Make optional for plain text
    question: str = Form(...),
    text_input: str = Form(None),  # Alternative to file upload
    language: str = Form("en")
):
    """Answer a question about an uploaded document or about raw text.

    The text comes from ``file`` when one is uploaded, otherwise from
    ``text_input``. Whitespace is collapsed and the text is capped at 5000
    characters. Questions containing "theme", "topic" or "subject" are
    answered with a summary of the text; every other question goes to the
    QA model, which sees the first 3000 characters as context.

    Args:
        file: Optional uploaded document. Takes precedence over
            ``text_input`` when present.
        question: The question to answer (required form field).
        text_input: Optional raw text, used when no file is uploaded.
        language: Accepted as a form field; not used by this handler.

    Returns:
        ``{"answer": ...}`` for topic questions, or
        ``{"answer": ..., "confidence": ...}`` for standard QA.

    Raises:
        HTTPException: 400 when the question is blank, when neither a file
            nor text is supplied, or when no usable text is found; 500 on
            any other failure.
    """
    try:
        # Reject a blank question up front as a client error instead of
        # letting it reach the model and surface as a 500.
        if not question.strip():
            raise HTTPException(400, "Question must not be empty.")

        # Case 1: User uploaded a file.
        # An empty file part (no filename) is treated as "no upload" so the
        # request can still fall back to text_input.
        if file is not None and file.filename:
            file_ext, content = await process_uploaded_file(file)
            text = extract_text(content, file_ext)

        # Case 2: User provided raw text
        elif text_input:
            text = text_input.strip()

        else:
            raise HTTPException(400, "Either a file or text input is required.")

        if not text.strip():
            raise HTTPException(400, "No usable text found.")

        # Clean and truncate text
        text = re.sub(r'\s+', ' ', text).strip()[:5000]

        # Theme detection (if question asks for topic)
        if any(kw in question.lower() for kw in ("theme", "topic", "subject")):
            summarizer = get_summarizer()
            # truncation=True keeps the input within the model's limit;
            # 5000 characters may otherwise be too long for it.
            summary = summarizer(
                text,
                max_length=100,
                min_length=30,
                truncation=True,
            )[0]["summary_text"]
            return {"answer": f"The main topic is: {summary}"}

        # Standard QA
        qa = get_qa_model()
        result = qa(question=question, context=text[:3000])
        return {"answer": result["answer"], "confidence": result["score"]}

    except HTTPException:
        # Deliberate client/server errors pass through unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback; the client only sees a
        # generic message.
        logger.exception("QA failed: %s", e)
        raise HTTPException(500, "Internal server error.") from e
|
| 849 |
@app.post("/visualize/natural")
|
| 850 |
async def natural_language_visualization(
|
| 851 |
file: UploadFile = File(...),
|