Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -70,16 +70,27 @@ def get_qa_model():
|
|
| 70 |
|
| 71 |
|
| 72 |
#########################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def extract_text_from_file(file_content: bytes, file_ext: str):
|
| 74 |
text = ""
|
| 75 |
-
|
| 76 |
try:
|
| 77 |
if file_ext == "docx":
|
| 78 |
doc = Document(io.BytesIO(file_content))
|
| 79 |
text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
|
| 80 |
elif file_ext in ["xls", "xlsx"]:
|
| 81 |
df = pd.read_excel(io.BytesIO(file_content))
|
| 82 |
-
text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
|
| 83 |
elif file_ext == "pptx":
|
| 84 |
ppt = Presentation(io.BytesIO(file_content))
|
| 85 |
text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
|
|
@@ -88,7 +99,7 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
|
|
| 88 |
text = " ".join([page.get_text("text") for page in pdf])
|
| 89 |
elif file_ext in ["jpg", "jpeg", "png"]:
|
| 90 |
image = Image.open(io.BytesIO(file_content))
|
| 91 |
-
text = pytesseract.image_to_string(image
|
| 92 |
else:
|
| 93 |
raise HTTPException(status_code=400, detail="Unsupported file format.")
|
| 94 |
except Exception as e:
|
|
@@ -96,7 +107,6 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
|
|
| 96 |
|
| 97 |
if not text.strip():
|
| 98 |
raise HTTPException(status_code=400, detail="No extractable text found.")
|
| 99 |
-
|
| 100 |
return text
|
| 101 |
|
| 102 |
########################################################
|
|
@@ -150,26 +160,19 @@ async def summarize_document(file: UploadFile = File(...)):
|
|
| 150 |
#################################################################
|
| 151 |
@app.post("/qa")
|
| 152 |
async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
# 🔥 Step 2: Use summarized text for QA
|
| 166 |
-
qa_model = get_qa_model()
|
| 167 |
-
answer = qa_model(question=question, context=summarized_text) # Fixed argument format
|
| 168 |
|
| 169 |
-
return {"question": question, "answer": answer["answer"], "context_used": summarized_text}
|
| 170 |
-
|
| 171 |
-
except Exception as e:
|
| 172 |
-
raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")
|
| 173 |
###############################################
|
| 174 |
|
| 175 |
@app.post("/api/caption")
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
#########################################################
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Loaded pipelines, filled lazily by get_model() and keyed by model name only.
models_cache: Dict[str, pipeline] = {}


def get_model(model_name: str, task: str):
    """Return the pipeline for ``model_name``, building it on first use.

    The cache is keyed by ``model_name`` alone: ``task`` is only consulted
    when the pipeline has to be constructed, so a later call with the same
    model name but a different task returns the pipeline built earlier.

    Args:
        model_name: Model identifier forwarded to ``pipeline(..., model=...)``.
        task: Task name forwarded as the first argument of ``pipeline``.

    Returns:
        The cached (or freshly created and cached) pipeline object.
    """
    # Fast path: this model has already been loaded.
    if model_name in models_cache:
        return models_cache[model_name]

    loaded = pipeline(task, model=model_name)
    models_cache[model_name] = loaded
    return loaded
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
def extract_text_from_file(file_content: bytes, file_ext: str):
|
| 86 |
text = ""
|
|
|
|
| 87 |
try:
|
| 88 |
if file_ext == "docx":
|
| 89 |
doc = Document(io.BytesIO(file_content))
|
| 90 |
text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
|
| 91 |
elif file_ext in ["xls", "xlsx"]:
|
| 92 |
df = pd.read_excel(io.BytesIO(file_content))
|
| 93 |
+
text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
|
| 94 |
elif file_ext == "pptx":
|
| 95 |
ppt = Presentation(io.BytesIO(file_content))
|
| 96 |
text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
|
|
|
|
| 99 |
text = " ".join([page.get_text("text") for page in pdf])
|
| 100 |
elif file_ext in ["jpg", "jpeg", "png"]:
|
| 101 |
image = Image.open(io.BytesIO(file_content))
|
| 102 |
+
text = pytesseract.image_to_string(image, config='--psm 6')
|
| 103 |
else:
|
| 104 |
raise HTTPException(status_code=400, detail="Unsupported file format.")
|
| 105 |
except Exception as e:
|
|
|
|
| 107 |
|
| 108 |
if not text.strip():
|
| 109 |
raise HTTPException(status_code=400, detail="No extractable text found.")
|
|
|
|
| 110 |
return text
|
| 111 |
|
| 112 |
########################################################
|
|
|
|
| 160 |
#################################################################
|
| 161 |
@app.post("/qa")
async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
    """Answer ``question`` using the text extracted from the uploaded ``file``.

    The extension (text after the last "." of the filename, lower-cased) and
    the raw bytes are handed to ``extract_text_from_file``. When the extracted
    text is longer than 2000 characters, its first 2000 characters are
    summarized and the summary replaces the text as the QA context.

    Args:
        file: Uploaded document or image.
        question: Question to answer against the document text.

    Returns:
        A dict with the keys ``"question"``, ``"answer"`` and
        ``"context_used"`` (the text actually given to the QA model).

    Raises:
        HTTPException: 400 when the upload carries no filename, so no
            extension can be derived from it.
    """
    # UploadFile.filename may be missing; without this guard the split below
    # raises AttributeError and the client gets an opaque 500.
    if not file.filename:
        raise HTTPException(status_code=400, detail="Uploaded file has no filename.")

    content = await file.read()
    file_ext = file.filename.rsplit(".", 1)[-1].lower()
    extracted_text = extract_text_from_file(content, file_ext)

    if len(extracted_text) > 2000:
        # Load the summarizer only when it is needed. The "summarization" task
        # requires a sequence-to-sequence checkpoint; the extractive-QA BERT
        # checkpoint used previously cannot be loaded for this task.
        # NOTE(review): if the summarize endpoint elsewhere in this file
        # already uses a summarization model, reuse that name here — confirm.
        summarizer = get_model("facebook/bart-large-cnn", "summarization")
        extracted_text = summarizer(
            extracted_text[:2000],
            max_length=500,
            min_length=100,
            do_sample=False,
        )[0]["summary_text"]

    qa_model = get_model("distilbert-base-cased-distilled-squad", "question-answering")
    answer = qa_model(question=question, context=extracted_text)

    return {"question": question, "answer": answer["answer"], "context_used": extracted_text}
|
|
|
|
|
|
|
|
|
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
###############################################
|
| 177 |
|
| 178 |
@app.post("/api/caption")
|