Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -41,7 +41,7 @@ def fin_ext_bis(text):
|
|
| 41 |
results = fin_model_bis(split_in_sentences(text))
|
| 42 |
return make_spans(text, results)
|
| 43 |
|
| 44 |
-
def
|
| 45 |
if not pdf1 or not pdf2:
|
| 46 |
return [], []
|
| 47 |
|
|
@@ -57,13 +57,12 @@ def extract_and_summarize(pdf1, pdf2):
|
|
| 57 |
|
| 58 |
start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
|
| 59 |
start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
|
| 60 |
-
|
| 61 |
paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
|
| 62 |
paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
return paragraphs_1, paragraphs_2
|
| 68 |
|
| 69 |
# Gradio interface setup
|
|
@@ -152,7 +151,58 @@ def process_and_compare(file1, sheet1, file2, sheet2):
|
|
| 152 |
|
| 153 |
return file_path
|
| 154 |
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
stored_paragraphs_1 = []
|
| 157 |
stored_paragraphs_2 = []
|
| 158 |
|
|
@@ -173,7 +223,7 @@ with gr.Blocks() as demo:
|
|
| 173 |
|
| 174 |
def update_paragraphs(pdf1, pdf2):
|
| 175 |
global stored_paragraphs_1, stored_paragraphs_2
|
| 176 |
-
stored_paragraphs_1, stored_paragraphs_2 =
|
| 177 |
updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
|
| 178 |
updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
|
| 179 |
return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)
|
|
@@ -236,5 +286,10 @@ with gr.Blocks() as demo:
|
|
| 236 |
|
| 237 |
b1 = gr.Button("Compare Data")
|
| 238 |
b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
demo.launch()
|
|
|
|
| 41 |
results = fin_model_bis(split_in_sentences(text))
|
| 42 |
return make_spans(text, results)
|
| 43 |
|
| 44 |
+
def extract_and_paragraph(pdf1, pdf2, paragraph):
|
| 45 |
if not pdf1 or not pdf2:
|
| 46 |
return [], []
|
| 47 |
|
|
|
|
| 57 |
|
| 58 |
start_index1, end_index1 = lib.read_pdf.find_text_range(paragraphs_1, start_keyword, end_keywords)
|
| 59 |
start_index2, end_index2 = lib.read_pdf.find_text_range(paragraphs_2, start_keyword, end_keywords)
|
|
|
|
| 60 |
paragraphs_1 = lib.read_pdf.extract_relevant_text(paragraphs_1, start_index1, end_index1)
|
| 61 |
paragraphs_2 = lib.read_pdf.extract_relevant_text(paragraphs_2, start_index2, end_index2)
|
| 62 |
+
if paragraph:
|
| 63 |
+
paragraphs_1 = lib.read_pdf.split_text_into_paragraphs(paragraphs_1, 0)
|
| 64 |
+
paragraphs_2 = lib.read_pdf.split_text_into_paragraphs(paragraphs_2, 0)
|
| 65 |
+
|
| 66 |
return paragraphs_1, paragraphs_2
|
| 67 |
|
| 68 |
# Gradio interface setup
|
|
|
|
| 151 |
|
| 152 |
return file_path
|
| 153 |
|
| 154 |
+
def find_sentences_with_keywords(text, keywords):
    """Return the sentences of *text* that mention any of *keywords*.

    The text is split after sentence-ending punctuation (``.``, ``!``, ``?``)
    followed by whitespace. A keyword matches only as a whole word and the
    comparison is case-insensitive.

    Args:
        text: The text to search. Empty or falsy input yields an empty list.
        keywords: An iterable of keyword strings. A single string is treated
            as one keyword rather than being iterated character by character.

    Returns:
        A list of matching sentences. Results are grouped by keyword in the
        order the keywords are given; a sentence that matches several
        keywords is reported once, under the first keyword that matches it.
    """
    if not text:
        return []

    # A bare string would otherwise be searched one letter at a time.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Split text into sentences on whitespace that follows ., ! or ?
    sentences = re.split(r'(?<=[.!?])\s+', text)

    matched_sentences = []
    # Track positions, not text, so only the same sentence occurrence is deduplicated.
    reported = set()

    for keyword in keywords:
        keyword_pattern = re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)  # whole-word match

        for position, sentence in enumerate(sentences):
            if position in reported:
                continue
            if keyword_pattern.search(sentence):
                reported.add(position)
                matched_sentences.append(sentence)

    return matched_sentences
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Main function to process both PDFs based on the Excel file names and the sheet name
|
| 172 |
+
def process_pdfs(file1, file2, sheet):
    """Collect, for both PDFs, the sentences that mention the selected sheet's topic.

    Each PDF path is derived from the corresponding Excel path by replacing a
    trailing ``.xlsx`` with ``.pdf``. The sheet name selects a list of search
    keywords; a sheet name without a predefined list is used as its own
    keyword.

    Args:
        file1: Path of the first Excel file.
        file2: Path of the second Excel file.
        sheet: Name of the selected sheet.

    Returns:
        A dict with the keys ``"PDF 1"`` and ``"PDF 2"``, each mapping to a
        dict holding the derived PDF path (``"File"``), the keyword list
        (``"Keyword"``) and the matching sentences (``"Sentences"``).
    """
    # Keywords searched for each known sheet name (every value is a list).
    sheet_keywords = {
        'GDP': ['GDP'],
        'HICP': ['HICP'],
        'RRE prices': ['RRE', 'residential'],
        'Unemployment': ['Unemployment'],
        'CRE prices': ['CRE', 'commercial'],
    }

    def _pdf_path(excel_path):
        # Swap only the extension; leave other ".xlsx" occurrences in the path alone.
        if excel_path and excel_path.endswith(".xlsx"):
            return excel_path[:-len(".xlsx")] + ".pdf"
        return excel_path

    # Derive PDF file names from the Excel file paths
    pdf_file1 = _pdf_path(file1)
    pdf_file2 = _pdf_path(file2)

    keywords = sheet_keywords.get(sheet)
    if keywords is None:
        # Unknown sheet: fall back to the sheet name itself as the keyword.
        keywords = [sheet] if sheet else []

    # Extract text from both PDFs
    pdf_text1, pdf_text2 = extract_and_paragraph(pdf_file1, pdf_file2, False)

    # extract_and_paragraph returns empty lists when a PDF is missing;
    # skip the regex search in that case instead of failing on a non-string.
    matched_sentences1 = find_sentences_with_keywords(pdf_text1, keywords) if pdf_text1 else []
    matched_sentences2 = find_sentences_with_keywords(pdf_text2, keywords) if pdf_text2 else []

    # Format the results for output
    return {
        "PDF 1": {
            "File": pdf_file1,
            "Keyword": keywords,
            "Sentences": matched_sentences1,
        },
        "PDF 2": {
            "File": pdf_file2,
            "Keyword": keywords,
            "Sentences": matched_sentences2,
        },
    }
|
| 205 |
+
|
| 206 |
stored_paragraphs_1 = []
|
| 207 |
stored_paragraphs_2 = []
|
| 208 |
|
|
|
|
| 223 |
|
| 224 |
def update_paragraphs(pdf1, pdf2):
    """Re-extract the paragraphs of both PDFs and refresh the two dropdowns.

    The extracted paragraphs are stored in the module-level
    ``stored_paragraphs_1`` / ``stored_paragraphs_2`` variables. Each dropdown
    entry shows the paragraph number followed by the first 100 characters of
    the paragraph.

    Returns:
        A pair of ``gr.update`` objects carrying the new choices for the
        first and second dropdown.
    """
    global stored_paragraphs_1, stored_paragraphs_2

    def _labels(paragraphs):
        # One preview label per paragraph, numbered from 1.
        return [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(paragraphs)]

    extracted = extract_and_paragraph(pdf1, pdf2, True)
    stored_paragraphs_1, stored_paragraphs_2 = extracted

    choices_1 = _labels(stored_paragraphs_1)
    choices_2 = _labels(stored_paragraphs_2)
    return gr.update(choices=choices_1), gr.update(choices=choices_2)
|
|
|
|
| 286 |
|
| 287 |
b1 = gr.Button("Compare Data")
|
| 288 |
b1.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)
|
| 289 |
+
with gr.Row():
|
| 290 |
+
with gr.Column():
|
| 291 |
+
result = gr.JSON(label="Comparison Result")
|
| 292 |
+
b2 = gr.Button("Extract text information")
|
| 293 |
+
b2.click(fn=process_pdfs, inputs=[file1, file2, sheet], outputs=result)
|
| 294 |
|
| 295 |
demo.launch()
|