chenguittiMaroua committed on
Commit
297e3be
·
verified ·
1 Parent(s): 0940d8b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +26 -23
main.py CHANGED
@@ -70,16 +70,27 @@ def get_qa_model():
70
 
71
 
72
  #########################################################
 
 
 
 
 
 
 
 
 
 
 
 
73
  def extract_text_from_file(file_content: bytes, file_ext: str):
74
  text = ""
75
-
76
  try:
77
  if file_ext == "docx":
78
  doc = Document(io.BytesIO(file_content))
79
  text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
80
  elif file_ext in ["xls", "xlsx"]:
81
  df = pd.read_excel(io.BytesIO(file_content))
82
- text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist()) # Extract first column text
83
  elif file_ext == "pptx":
84
  ppt = Presentation(io.BytesIO(file_content))
85
  text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
@@ -88,7 +99,7 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
88
  text = " ".join([page.get_text("text") for page in pdf])
89
  elif file_ext in ["jpg", "jpeg", "png"]:
90
  image = Image.open(io.BytesIO(file_content))
91
- text = pytesseract.image_to_string(image) # OCR for text extraction
92
  else:
93
  raise HTTPException(status_code=400, detail="Unsupported file format.")
94
  except Exception as e:
@@ -96,7 +107,6 @@ def extract_text_from_file(file_content: bytes, file_ext: str):
96
 
97
  if not text.strip():
98
  raise HTTPException(status_code=400, detail="No extractable text found.")
99
-
100
  return text
101
 
102
  ########################################################
@@ -150,26 +160,19 @@ async def summarize_document(file: UploadFile = File(...)):
150
  #################################################################
151
  @app.post("/qa")
152
  async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
153
- try:
154
- content = await file.read()
155
- file_ext = file.filename.split(".")[-1].lower()
156
- extracted_text = extract_text_from_file(content, file_ext)
157
-
158
- # 🔥 Step 1: Summarize first (if text is too long)
159
- if len(extracted_text) > 2000:
160
- summarizer = get_summarizer()
161
- summarized_text = summarizer(extracted_text[:2000], max_length=500, min_length=100, do_sample=False)[0]["summary_text"]
162
- else:
163
- summarized_text = extracted_text
164
-
165
- # 🔥 Step 2: Use summarized text for QA
166
- qa_model = get_qa_model()
167
- answer = qa_model(question=question, context=summarized_text) # Fixed argument format
168
 
169
- return {"question": question, "answer": answer["answer"], "context_used": summarized_text}
170
-
171
- except Exception as e:
172
- raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")
173
  ###############################################
174
 
175
  @app.post("/api/caption")
 
70
 
71
 
72
  #########################################################
73
+
74
+
75
# Process-wide cache of already-constructed pipelines, keyed by
# "<task>:<model_name>" so each (task, checkpoint) pair is built at most once.
models_cache: Dict[str, pipeline] = {}


def get_model(model_name: str, task: str):
    """Return a cached pipeline for ``model_name`` configured for ``task``.

    The pipeline is created on first request via ``pipeline(task,
    model=model_name)`` and reused afterwards.

    Args:
        model_name: Model identifier passed as ``model=`` to ``pipeline``.
        task: Task name passed as the first argument to ``pipeline``.

    Returns:
        The object returned by ``pipeline`` for this task/model pair.
    """
    # Key on both task and model: keying on the model name alone would hand
    # back a pipeline built for a different task if the same checkpoint were
    # ever requested under two tasks.
    cache_key = f"{task}:{model_name}"
    if cache_key not in models_cache:
        models_cache[cache_key] = pipeline(task, model=model_name)
    return models_cache[cache_key]
81
+
82
+
83
+
84
+
85
  def extract_text_from_file(file_content: bytes, file_ext: str):
86
  text = ""
 
87
  try:
88
  if file_ext == "docx":
89
  doc = Document(io.BytesIO(file_content))
90
  text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
91
  elif file_ext in ["xls", "xlsx"]:
92
  df = pd.read_excel(io.BytesIO(file_content))
93
+ text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
94
  elif file_ext == "pptx":
95
  ppt = Presentation(io.BytesIO(file_content))
96
  text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
 
99
  text = " ".join([page.get_text("text") for page in pdf])
100
  elif file_ext in ["jpg", "jpeg", "png"]:
101
  image = Image.open(io.BytesIO(file_content))
102
+ text = pytesseract.image_to_string(image, config='--psm 6')
103
  else:
104
  raise HTTPException(status_code=400, detail="Unsupported file format.")
105
  except Exception as e:
 
107
 
108
  if not text.strip():
109
  raise HTTPException(status_code=400, detail="No extractable text found.")
 
110
  return text
111
 
112
  ########################################################
 
160
  #################################################################
161
@app.post("/qa")
async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
    """Answer ``question`` from the text extracted out of the uploaded ``file``.

    The upload is read in full, the text after the last dot of its filename
    (lower-cased) is passed to ``extract_text_from_file`` as the extension,
    and the extracted text becomes the QA context. When that text is longer
    than 2000 characters, its first 2000 characters are summarized and the
    summary is used as the context instead.

    Args:
        file: Uploaded document or image.
        question: Question to answer against the document text.

    Returns:
        A dict with the keys ``question``, ``answer`` and ``context_used``.

    Raises:
        HTTPException: Anything raised by ``extract_text_from_file``
            propagates unchanged (there is no try/except in this handler).
    """
    content = await file.read()
    # ``filename`` can be missing on an upload; fall back to "" so the request
    # reaches the extractor's unsupported-format branch instead of failing
    # with AttributeError on None.
    file_ext = (file.filename or "").split(".")[-1].lower()
    extracted_text = extract_text_from_file(content, file_ext)

    if len(extracted_text) > 2000:
        # Load the summarizer only when it is needed, and use the file's
        # summarization helper. A BERT SQuAD checkpoint is an extractive-QA
        # model and cannot back a "summarization" pipeline.
        # NOTE(review): assumes get_summarizer() is still defined in this
        # module (the pre-change handler called it) - confirm.
        summarizer = get_summarizer()
        extracted_text = summarizer(
            extracted_text[:2000], max_length=500, min_length=100, do_sample=False
        )[0]["summary_text"]

    qa_model = get_model("distilbert-base-cased-distilled-squad", "question-answering")
    answer = qa_model(question=question, context=extracted_text)

    return {"question": question, "answer": answer["answer"], "context_used": extracted_text}
 
 
 
175
 
 
 
 
 
176
  ###############################################
177
 
178
  @app.post("/api/caption")