Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 9

Commit

297e3be

verified ·

1 Parent(s): 0940d8b

Update main.py

Browse files

Files changed (1) hide show

main.py +26 -23

main.py CHANGED Viewed

@@ -70,16 +70,27 @@ def get_qa_model():
 #########################################################
 def extract_text_from_file(file_content: bytes, file_ext: str):
     text = ""
     try:
         if file_ext == "docx":
             doc = Document(io.BytesIO(file_content))
             text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
         elif file_ext in ["xls", "xlsx"]:
             df = pd.read_excel(io.BytesIO(file_content))
-            text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())  # Extract first column text
         elif file_ext == "pptx":
             ppt = Presentation(io.BytesIO(file_content))
             text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
@@ -88,7 +99,7 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
             text = " ".join([page.get_text("text") for page in pdf])
         elif file_ext in ["jpg", "jpeg", "png"]:
             image = Image.open(io.BytesIO(file_content))
-            text = pytesseract.image_to_string(image)  # OCR for text extraction
         else:
             raise HTTPException(status_code=400, detail="Unsupported file format.")
     except Exception as e:
@@ -96,7 +107,6 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
     if not text.strip():
         raise HTTPException(status_code=400, detail="No extractable text found.")
     return text
     ########################################################
@@ -150,26 +160,19 @@ async def summarize_document(file: UploadFile = File(...)):
 #################################################################
 @app.post("/qa")
 async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
-    try:
-        content = await file.read()
-        file_ext = file.filename.split(".")[-1].lower()
-        extracted_text = extract_text_from_file(content, file_ext)
-        # 🔥 Step 1: Summarize first (if text is too long)
-        if len(extracted_text) > 2000:
-            summarizer = get_summarizer()
-            summarized_text = summarizer(extracted_text[:2000], max_length=500, min_length=100, do_sample=False)[0]["summary_text"]
-        else:
-            summarized_text = extracted_text
-        # 🔥 Step 2: Use summarized text for QA
-        qa_model = get_qa_model()
-        answer = qa_model(question=question, context=summarized_text)  # Fixed argument format
-        return {"question": question, "answer": answer["answer"], "context_used": summarized_text}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")
 ###############################################
 @app.post("/api/caption")

 #########################################################
+models_cache: Dict[str, pipeline] = {}  # module-level cache of already-built pipelines, keyed by model name
+def get_model(model_name: str, task: str):  # return the pipeline for model_name, building and caching it on first use
+    if model_name not in models_cache:  # NOTE(review): the key ignores `task`; the same model requested under a different task would get the first-built pipeline - confirm this is intended
+        models_cache[model_name] = pipeline(task, model=model_name)  # built once per model name, then reused
+    return models_cache[model_name]
 def extract_text_from_file(file_content: bytes, file_ext: str):
     text = ""
     try:
         if file_ext == "docx":
             doc = Document(io.BytesIO(file_content))
             text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
         elif file_ext in ["xls", "xlsx"]:
             df = pd.read_excel(io.BytesIO(file_content))
+            text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
         elif file_ext == "pptx":
             ppt = Presentation(io.BytesIO(file_content))
             text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
             text = " ".join([page.get_text("text") for page in pdf])
         elif file_ext in ["jpg", "jpeg", "png"]:
             image = Image.open(io.BytesIO(file_content))
+            text = pytesseract.image_to_string(image, config='--psm 6')
         else:
             raise HTTPException(status_code=400, detail="Unsupported file format.")
     except Exception as e:
     if not text.strip():
         raise HTTPException(status_code=400, detail="No extractable text found.")
     return text
     ########################################################
 #################################################################
 @app.post("/qa")
 async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
+    content = await file.read()
+    file_ext = file.filename.split(".")[-1].lower()
+    extracted_text = extract_text_from_file(content, file_ext)
+    # Use a pipeline as a high-level helper
+    summarizer = get_model("google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", "summarization")
+    if len(extracted_text) > 2000:
+        extracted_text = summarizer(extracted_text[:2000], max_length=500, min_length=100, do_sample=False)[0]["summary_text"]
+    qa_model = get_model("distilbert-base-cased-distilled-squad", "question-answering")
+    answer = qa_model(question=question, context=extracted_text)
+    return {"question": question, "answer": answer["answer"], "context_used": extracted_text}
 ###############################################
 @app.post("/api/caption")