try again
Browse files- extractor_compare.py +101 -114
extractor_compare.py
CHANGED
|
@@ -217,7 +217,17 @@ def create_interface():
|
|
| 217 |
label="PDF Document",
|
| 218 |
value='''
|
| 219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
| 220 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
| 222 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
|
| 223 |
Click "Load PDFs" to start viewing documents.
|
|
@@ -336,80 +346,41 @@ def create_interface():
|
|
| 336 |
outputs=[extractor2_text]
|
| 337 |
)
|
| 338 |
|
| 339 |
-
#
|
| 340 |
demo.load(
|
| 341 |
-
None,
|
| 342 |
js="""
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
return;
|
| 355 |
-
}
|
| 356 |
-
|
| 357 |
-
// Revoke previous Blob URL to free memory
|
| 358 |
-
if (window.currentPdfBlobUrl) {
|
| 359 |
-
console.log('Revoking previous Blob URL:', window.currentPdfBlobUrl);
|
| 360 |
-
URL.revokeObjectURL(window.currentPdfBlobUrl);
|
| 361 |
-
window.currentPdfBlobUrl = null;
|
| 362 |
-
}
|
| 363 |
-
|
| 364 |
-
if (base64Data && base64Data.length > 100) { // Ensure there's actual content
|
| 365 |
-
try {
|
| 366 |
-
// Hide fallback message
|
| 367 |
-
if (fallbackDiv) fallbackDiv.style.display = 'none';
|
| 368 |
-
|
| 369 |
-
// Decode Base64
|
| 370 |
-
const byteCharacters = atob(base64Data);
|
| 371 |
-
console.log('Base64 decoded successfully, length:', byteCharacters.length);
|
| 372 |
-
const byteNumbers = new Array(byteCharacters.length);
|
| 373 |
-
for (let i = 0; i < byteCharacters.length; i++) {
|
| 374 |
-
byteNumbers[i] = byteCharacters.charCodeAt(i);
|
| 375 |
-
}
|
| 376 |
-
const byteArray = new Uint8Array(byteNumbers);
|
| 377 |
-
|
| 378 |
-
// Create Blob and URL
|
| 379 |
-
const blob = new Blob([byteArray], {type: 'application/pdf'});
|
| 380 |
-
window.currentPdfBlobUrl = URL.createObjectURL(blob);
|
| 381 |
-
console.log('Created new Blob URL:', window.currentPdfBlobUrl);
|
| 382 |
-
|
| 383 |
-
// Update iframe source
|
| 384 |
-
iframe.src = window.currentPdfBlobUrl;
|
| 385 |
-
console.log('Iframe src updated to Blob URL');
|
| 386 |
-
} catch (e) {
|
| 387 |
-
console.error('Error processing Base64 data or creating Blob URL:', e);
|
| 388 |
-
if (fallbackDiv) {
|
| 389 |
-
fallbackDiv.innerHTML = '<div style="color:red;">Error loading PDF: ' + e.message + '</div>';
|
| 390 |
-
fallbackDiv.style.display = 'flex';
|
| 391 |
-
}
|
| 392 |
-
iframe.src = 'about:blank'; // Clear iframe on error
|
| 393 |
}
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
| 399 |
}
|
| 400 |
-
iframe.src = 'about:blank'; // Clear iframe if no data
|
| 401 |
-
}
|
| 402 |
-
}
|
| 403 |
-
|
| 404 |
-
// MutationObserver to watch the hidden Textbox
|
| 405 |
-
const targetNode = document.getElementById('pdf_base64_data');
|
| 406 |
-
if (targetNode) {
|
| 407 |
-
// Find the actual textarea inside the Gradio component structure
|
| 408 |
-
const hiddenTextArea = targetNode.querySelector('textarea');
|
| 409 |
-
if(hiddenTextArea){
|
| 410 |
-
console.log('Found hidden textarea to observe.');
|
| 411 |
-
const observerConfig = { characterData: true, childList: true, subtree: true, attributes: true }; // Watch for all changes
|
| 412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
const observer = new MutationObserver(function(mutationsList) {
|
| 414 |
console.log('Mutation detected, checking textarea value');
|
| 415 |
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
|
@@ -418,55 +389,71 @@ def create_interface():
|
|
| 418 |
}
|
| 419 |
});
|
| 420 |
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
} else {
|
| 433 |
-
console.error('Could not find the textarea within #pdf_base64_data!');
|
| 434 |
}
|
| 435 |
-
}
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
}
|
| 444 |
-
let targetButtonId = null;
|
| 445 |
-
const key = event.key;
|
| 446 |
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
targetButton.click();
|
| 455 |
-
}
|
| 456 |
-
}
|
| 457 |
-
});
|
| 458 |
-
console.log('Keydown listener added.');
|
| 459 |
-
|
| 460 |
-
// Additional style for basic font
|
| 461 |
-
const additionalStyle = document.createElement('style');
|
| 462 |
-
additionalStyle.textContent = `
|
| 463 |
-
.extraction-text textarea {
|
| 464 |
-
font-family: Arial, Helvetica, sans-serif !important;
|
| 465 |
-
font-size: 14px !important;
|
| 466 |
}
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
"""
|
| 471 |
)
|
| 472 |
|
|
|
|
| 217 |
label="PDF Document",
|
| 218 |
value='''
|
| 219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
| 220 |
+
<style>
|
| 221 |
+
@font-face {
|
| 222 |
+
font-family: 'Local Arial';
|
| 223 |
+
src: local('Arial');
|
| 224 |
+
}
|
| 225 |
+
body {
|
| 226 |
+
font-family: 'Local Arial', sans-serif;
|
| 227 |
+
}
|
| 228 |
+
</style>
|
| 229 |
+
<meta http-equiv="Content-Security-Policy" content="default-src * blob:; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
|
| 230 |
+
<iframe id="pdf-iframe" width="100%" height="100%" style="border:none;" src="about:blank" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
|
| 231 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
| 232 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
|
| 233 |
Click "Load PDFs" to start viewing documents.
|
|
|
|
| 346 |
outputs=[extractor2_text]
|
| 347 |
)
|
| 348 |
|
| 349 |
+
# JavaScript for PDF handling
|
| 350 |
demo.load(
|
| 351 |
+
fn=None,
|
| 352 |
js="""
|
| 353 |
+
// Function to safely setup the MutationObserver for the PDF data
|
| 354 |
+
function setupPdfDataObserver() {
|
| 355 |
+
console.log('Setting up PDF data observer...');
|
| 356 |
+
|
| 357 |
+
// Wait for Gradio components to fully render
|
| 358 |
+
setTimeout(() => {
|
| 359 |
+
try {
|
| 360 |
+
const targetNode = document.getElementById('pdf_base64_data');
|
| 361 |
+
if (!targetNode) {
|
| 362 |
+
console.error('PDF data container not found!');
|
| 363 |
+
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
}
|
| 365 |
+
|
| 366 |
+
// Find the textarea within the Gradio component
|
| 367 |
+
const hiddenTextArea = targetNode.querySelector('textarea');
|
| 368 |
+
if (!hiddenTextArea) {
|
| 369 |
+
console.error('Hidden textarea not found within the container!');
|
| 370 |
+
return;
|
| 371 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
+
console.log('Found hidden textarea to observe');
|
| 374 |
+
|
| 375 |
+
// Setup observer configuration
|
| 376 |
+
const observerConfig = {
|
| 377 |
+
characterData: true,
|
| 378 |
+
childList: true,
|
| 379 |
+
subtree: true,
|
| 380 |
+
attributes: true
|
| 381 |
+
};
|
| 382 |
+
|
| 383 |
+
// Create and attach the observer
|
| 384 |
const observer = new MutationObserver(function(mutationsList) {
|
| 385 |
console.log('Mutation detected, checking textarea value');
|
| 386 |
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
|
|
|
| 389 |
}
|
| 390 |
});
|
| 391 |
|
| 392 |
+
// Observe the textarea itself, not its parent
|
| 393 |
+
observer.observe(hiddenTextArea, observerConfig);
|
| 394 |
+
console.log('MutationObserver attached to textarea');
|
| 395 |
+
|
| 396 |
+
// Also check initial value
|
| 397 |
+
if (hiddenTextArea.value && hiddenTextArea.value.length > 100) {
|
| 398 |
+
console.log('Initial valid value found, displaying PDF');
|
| 399 |
+
displayPdfBlob(hiddenTextArea.value);
|
| 400 |
+
}
|
| 401 |
+
} catch (error) {
|
| 402 |
+
console.error('Error setting up observer:', error);
|
|
|
|
|
|
|
| 403 |
}
|
| 404 |
+
}, 1000); // Wait 1 second for components to render
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
/**
 * Render a base64-encoded PDF inside the `#pdf-iframe` element.
 *
 * The payload is decoded into a Blob, exposed through an object URL and
 * assigned to the iframe. The URL currently on display is remembered in
 * `window.currentPdfBlobUrl` so it can be revoked when the next document
 * replaces it; without this every call would leak one Blob per PDF.
 *
 * On any failure (for example invalid base64) the iframe is cleared, the
 * `#pdf-fallback` element is shown again with the error message, and the
 * previously tracked object URL is released.
 *
 * @param {string} base64Data - Base64 text of the PDF file (no data-URL prefix).
 * @returns {void}
 */
function displayPdfBlob(base64Data) {
    const iframe = document.getElementById('pdf-iframe');
    const fallback = document.getElementById('pdf-fallback');

    if (!iframe || !fallback) {
        console.error('PDF viewer elements not found');
        return;
    }

    const previousUrl = window.currentPdfBlobUrl;

    try {
        // Convert base64 to binary
        const binaryString = atob(base64Data);
        const len = binaryString.length;
        const bytes = new Uint8Array(len);

        for (let i = 0; i < len; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }

        // Create blob and URL
        const blob = new Blob([bytes], { type: 'application/pdf' });
        const objectUrl = URL.createObjectURL(blob);

        // Point the iframe at the new document first, then release the old
        // URL so the viewer never references a revoked resource.
        iframe.src = objectUrl;
        window.currentPdfBlobUrl = objectUrl;
        if (previousUrl) {
            URL.revokeObjectURL(previousUrl);
        }

        // Hide fallback message
        fallback.style.display = 'none';

        console.log('PDF displayed successfully');
    } catch (error) {
        console.error('Error displaying PDF:', error);

        // Do not leave a stale document on screen: clear the viewer, free the
        // old Blob and surface the problem to the user. textContent is used so
        // the error text can never be interpreted as markup.
        iframe.src = 'about:blank';
        if (previousUrl) {
            URL.revokeObjectURL(previousUrl);
        }
        window.currentPdfBlobUrl = null;
        fallback.textContent = `Error loading PDF: ${error.message}`;
        fallback.style.display = 'flex';
    }
}
|
| 444 |
+
|
| 445 |
+
/**
 * Start the PDF data observer exactly once.
 *
 * A flag on `window` guards against repeated initialisation so that only a
 * single MutationObserver is ever attached to the hidden textarea; a second
 * observer would make every change render the PDF twice.
 */
function initPdfObserverOnce() {
    if (window.pdfObserverInitialized) {
        return;
    }
    window.pdfObserverInitialized = true;
    console.log('Window loaded, initializing PDF observer...');
    setupPdfDataObserver();
}

// A listener registered after its event has already fired never runs, so
// check the current document state instead of relying on 'load' or
// 'DOMContentLoaded' alone.
if (document.readyState === 'complete') {
    initPdfObserverOnce();
} else {
    window.addEventListener('load', initPdfObserverOnce, { once: true });
}
|
| 457 |
"""
|
| 458 |
)
|
| 459 |
|