Update main.py
main.py
CHANGED
@@ -64,87 +64,42 @@ def get_image_captioning():
 @lru_cache()
 def get_translator():
     return pipeline("translation", model="facebook/nllb-200-distilled-600M")
-
 @lru_cache()
 def get_qa_model():
-
-
-
-
-
-
-
-    tokenizer, model = get_qa_model()
-
+    return pipeline("question-answering", model="deepset/roberta-base-squad2")
+
+
+#########################################################
+def extract_text_from_file(file_content: bytes, file_ext: str):
+    text = ""
+
     try:
-
-        inputs = tokenizer(
-            question,
-            context,
-            max_length=512,
-            truncation="only_second",
-            padding="max_length",
-            return_tensors="pt"
-        )
-
-        with torch.no_grad():
-            outputs = model(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"]
-            )
-
-        answer_start = torch.argmax(outputs.start_logits)
-        answer_end = torch.argmax(outputs.end_logits) + 1
-        answer = tokenizer.decode(
-            inputs["input_ids"][0][answer_start:answer_end],
-            skip_special_tokens=True
-        ).strip()
-
-        # Calculate confidence
-        start_score = torch.max(torch.nn.functional.softmax(outputs.start_logits, dim=1)).item()
-        end_score = torch.max(torch.nn.functional.softmax(outputs.end_logits, dim=1)).item()
-        confidence = (start_score + end_score) / 2
-
-        # If no answer found, try sentence by sentence
-        if not answer or confidence < 0.5:
-            sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 10]
-            for sentence in sentences:
-                if any(word in sentence.lower() for word in question.lower().split()):
-                    inputs = tokenizer(
-                        question,
-                        sentence,
-                        max_length=512,
-                        truncation="only_second",
-                        padding="max_length",
-                        return_tensors="pt"
-                    )
-                    with torch.no_grad():
-                        outputs = model(
-                            input_ids=inputs["input_ids"],
-                            attention_mask=inputs["attention_mask"]
-                        )
-                    temp_start = torch.argmax(outputs.start_logits)
-                    temp_end = torch.argmax(outputs.end_logits) + 1
-                    temp_answer = tokenizer.decode(
-                        inputs["input_ids"][0][temp_start:temp_end],
-                        skip_special_tokens=True
-                    ).strip()
-                    if temp_answer:
-                        return {
-                            "answer": temp_answer,
-                            "confidence": 0.7,  # Slightly lower confidence for fallback
-                            "context_used": sentence
-                        }
-
-        return {
-            "answer": answer if answer else "No answer found in the given context",
-            "confidence": confidence
-        }
+        if file_ext == "docx":
+            doc = Document(io.BytesIO(file_content))
+            text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
+        elif file_ext in ["xls", "xlsx"]:
+            df = pd.read_excel(io.BytesIO(file_content))
+            text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())  # Extract first column text
+        elif file_ext == "pptx":
+            ppt = Presentation(io.BytesIO(file_content))
+            text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
+        elif file_ext == "pdf":
+            pdf = fitz.open(stream=file_content, filetype="pdf")
+            text = " ".join([page.get_text("text") for page in pdf])
+        elif file_ext in ["jpg", "jpeg", "png"]:
+            image = Image.open(io.BytesIO(file_content))
+            text = pytesseract.image_to_string(image)  # OCR for text extraction
+        else:
+            raise HTTPException(status_code=400, detail="Unsupported file format.")
     except Exception as e:
-
-
-
-
+        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
+
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="No extractable text found.")
+
+    return text
+
+########################################################
 @app.get("/", response_class=HTMLResponse)
 def home ():
     with open("static/indexAI.html","r") as file :
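This first hunk collapses QA model loading into a single transformers question-answering pipeline and routes every upload format (docx, xls/xlsx, pptx, pdf, and OCR for images) through one extract_text_from_file(file_content, file_ext) helper. Because the helper raises HTTPException itself, route handlers can let its errors pass straight through. A minimal sketch of exercising the helper outside FastAPI; "sample.pdf" is a placeholder path, and this assumes main.py is importable:

# Hypothetical smoke test for the new helper; "sample.pdf" is a placeholder.
from pathlib import Path

from main import extract_text_from_file  # assumes main.py is on the import path

raw = Path("sample.pdf").read_bytes()
ext = "sample.pdf".rsplit(".", 1)[-1].lower()  # same extension parsing the endpoint uses
print(extract_text_from_file(raw, ext)[:200])  # raises HTTPException on unsupported or empty input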
@@ -192,195 +147,30 @@ async def summarize_document(file: UploadFile = File(...)):
         return {"summary": summary}
     except Exception as e:
         raise HTTPException(500, f"Error processing document: {str(e)}")
-
-@app.post("/
-async def
-    question: str = Form(...),
-    file: Optional[UploadFile] = File(None),
-    text: Optional[str] = Form(None)
-):
+#################################################################
+@app.post("/qa")
+async def question_answering(file: UploadFile = File(...), question: str = Form(...)):
     try:
-
-
-
-            raise HTTPException(400, "No extractable content found")
-
-        # 2. Clean and prepare context
-        context = clean_context(context)
-
-        # 3. Try primary QA model
-        qa_result = answer_with_model(question, context)
-
-        # 4. If high confidence but no answer found, try sentence-level analysis
-        if qa_result["confidence"] > 0.6 and is_no_answer(qa_result["answer"]):
-            sentence_result = answer_from_sentences(question, context)
-            if sentence_result:
-                return format_response(sentence_result, context)
-
-        # 5. If low confidence, try semantic similarity
-        if qa_result["confidence"] < 0.4:
-            semantic_result = answer_with_semantic_search(question, context)
-            if semantic_result["confidence"] > qa_result["confidence"]:
-                return format_response(semantic_result, context)
-
-        # 6. Final fallback to keyword matching
-        if is_no_answer(qa_result["answer"]):
-            keyword_result = answer_with_keywords(question, context)
-            if keyword_result:
-                return format_response(keyword_result, context)
-
-        # 7. Return whatever answer we have
-        return format_response(qa_result, context)
+        content = await file.read()
+        file_ext = file.filename.split(".")[-1].lower()
+        extracted_text = extract_text_from_file(content, file_ext)
 
-
-
+        # 🔥 Step 1: Summarize first (if text is too long)
+        if len(extracted_text) > 2000:
+            summarizer = get_summarizer()
+            summarized_text = summarizer(extracted_text[:2000], max_length=500, min_length=100, do_sample=False)[0]["summary_text"]
+        else:
+            summarized_text = extracted_text
 
+        # 🔥 Step 2: Use summarized text for QA
+        qa_model = get_qa_model()
+        answer = qa_model(question=question, context=summarized_text)  # Fixed argument format
 
-
-
-
-
-
-    file_ext = file.filename.split(".")[-1].lower()
-
-    if file_ext == "pdf":
-        pdf = fitz.open(stream=content, filetype="pdf")
-        return " ".join([page.get_text("text") for page in pdf])
-    elif file_ext == "docx":
-        doc = Document(io.BytesIO(content))
-        return " ".join([p.text for p in doc.paragraphs if p.text.strip()])
-    elif file_ext in ["xls", "xlsx"]:
-        df = pd.read_excel(io.BytesIO(content))
-        return " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
-    elif file_ext == "pptx":
-        ppt = Presentation(io.BytesIO(content))
-        return " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
-    elif file_ext in ["jpg", "jpeg", "png"]:
-        image = Image.open(io.BytesIO(content))
-        try:
-            context = pytesseract.image_to_string(image)
-            return context if context.strip() else get_image_captioning()(image)[0]['generated_text']
-        except:
-            return get_image_captioning()(image)[0]['generated_text']
-    return text or ""
-
-def clean_context(context: str) -> str:
-    """Clean and normalize context text"""
-    context = " ".join(context.split())  # Remove excessive whitespace
-    return context[:10000]  # Limit context size
-
-def answer_with_model(question: str, context: str) -> dict:
-    """Use QA model to find answer"""
-    tokenizer, model = get_qa_model()
-    inputs = tokenizer(
-        question,
-        context,
-        max_length=512,
-        truncation="only_second",
-        padding="max_length",
-        return_tensors="pt"
-    )
-
-    with torch.no_grad():
-        outputs = model(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"]
-        )
-
-    answer_start = torch.argmax(outputs.start_logits)
-    answer_end = torch.argmax(outputs.end_logits) + 1
-    answer = tokenizer.decode(
-        inputs["input_ids"][0][answer_start:answer_end],
-        skip_special_tokens=True
-    ).strip()
-
-    confidence = (torch.max(torch.nn.functional.softmax(outputs.start_logits, dim=1)).item() +
-                  torch.max(torch.nn.functional.softmax(outputs.end_logits, dim=1)).item()) / 2
-
-    return {
-        "answer": answer if answer else "No answer found",
-        "confidence": confidence
-    }
-
-def answer_from_sentences(question: str, context: str) -> Optional[dict]:
-    """Try to find answer by analyzing individual sentences"""
-    sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
-    for sentence in sentences:
-        if any(word in sentence.lower() for word in question.lower().split() if len(word) > 3):
-            tokenizer, model = get_qa_model()
-            inputs = tokenizer(
-                question,
-                sentence,
-                max_length=512,
-                truncation="only_second",
-                padding="max_length",
-                return_tensors="pt"
-            )
-
-            with torch.no_grad():
-                outputs = model(
-                    input_ids=inputs["input_ids"],
-                    attention_mask=inputs["attention_mask"]
-                )
-
-            answer_start = torch.argmax(outputs.start_logits)
-            answer_end = torch.argmax(outputs.end_logits) + 1
-            answer = tokenizer.decode(
-                inputs["input_ids"][0][answer_start:answer_end],
-                skip_special_tokens=True
-            ).strip()
-
-            if answer and answer.lower() not in ["no answer", "no answer found"]:
-                return {
-                    "answer": answer,
-                    "confidence": 0.7  # Slightly lower confidence for fallback
-                }
-    return None
-
-def answer_with_semantic_search(question: str, context: str) -> dict:
-    """Use semantic similarity to find relevant answer"""
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 20]
-
-    if not sentences:
-        return {"answer": "No answer found", "confidence": 0.0}
-
-    question_embedding = model.encode(question, convert_to_tensor=True)
-    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
-    cos_scores = util.cos_sim(question_embedding, sentence_embeddings)[0]
-    best_idx = torch.argmax(cos_scores).item()
-
-    if cos_scores[best_idx] > 0.5:
-        result = answer_with_model(question, sentences[best_idx])
-        if not is_no_answer(result["answer"]):
-            result["confidence"] = min(result["confidence"] + 0.1, 0.9)  # Boost confidence slightly
-        return result
-
-    return {"answer": "No answer found", "confidence": 0.0}
-
-def answer_with_keywords(question: str, context: str) -> Optional[dict]:
-    """Simple keyword matching fallback"""
-    keywords = [word for word in question.lower().split() if len(word) > 3]
-    sentences = [s.strip() for s in context.split('.') if any(kw in s.lower() for kw in keywords)]
-
-    if sentences:
-        return {
-            "answer": sentences[0],
-            "confidence": 0.6
-        }
-    return None
-
-def is_no_answer(answer: str) -> bool:
-    """Check if answer indicates no answer found"""
-    return answer.lower() in ["no answer", "no answer found", "no answer found in the given context"]
-
-def format_response(result: dict, context: str) -> dict:
-    """Format final response"""
-    return {
-        "answer": result["answer"],
-        "confidence": result["confidence"],
-        "context_used": context[:500] + "..." if len(context) > 500 else context
-    }
+        return {"question": question, "answer": answer["answer"], "context_used": summarized_text}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")
+###############################################
 
 @app.post("/api/caption")
 async def caption_image(file: UploadFile = File(...)):
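The new /qa route takes a required file upload plus a question form field, summarizes long documents first, then runs the question-answering pipeline on that condensed context. Note that only the first 2000 characters are summarized, so answers buried later in a long document can be missed. A minimal client sketch, assuming the app is served locally at http://localhost:7860; the URL, file name, and question are placeholders:

# Hypothetical client for the new /qa route; URL and file name are placeholders.
import requests

with open("report.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:7860/qa",
        files={"file": ("report.pdf", fh)},   # multipart file field expected by File(...)
        data={"question": "What is the main conclusion?"},  # form field expected by Form(...)
    )
resp.raise_for_status()
body = resp.json()
print(body["answer"])        # answer span from the QA pipeline
print(body["context_used"])  # the (possibly summarized) context the model saw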