Work around the iframe sandbox on HF Spaces: render PDFs with PDF.js instead of an <object> embed
Browse files- extractor_compare.py +203 -55
extractor_compare.py
CHANGED
|
@@ -198,7 +198,11 @@ def create_interface():
|
|
| 198 |
}
|
| 199 |
"""
|
| 200 |
|
| 201 |
-
with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
gr.Markdown("## PDF Extractor Comparer")
|
| 203 |
|
| 204 |
with gr.Row():
|
|
@@ -217,18 +221,7 @@ def create_interface():
|
|
| 217 |
label="PDF Document",
|
| 218 |
value='''
|
| 219 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
| 220 |
-
<style>
|
| 221 |
-
@font-face {
|
| 222 |
-
font-family: 'Local Arial';
|
| 223 |
-
src: local('Arial');
|
| 224 |
-
}
|
| 225 |
-
body {
|
| 226 |
-
font-family: 'Local Arial', sans-serif;
|
| 227 |
-
}
|
| 228 |
-
</style>
|
| 229 |
-
<object id="pdf-object" type="application/pdf" width="100%" height="100%" style="display:none;">
|
| 230 |
-
<p>PDF cannot be displayed</p>
|
| 231 |
-
</object>
|
| 232 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
| 233 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
|
| 234 |
Click "Load PDFs" to start viewing documents.
|
|
@@ -352,92 +345,247 @@ def create_interface():
|
|
| 352 |
fn=None,
|
| 353 |
js="""
|
| 354 |
function() {
|
| 355 |
-
console.log("Setting up PDF viewer");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
-
//
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
// Function to display PDF from base64 data
|
| 361 |
-
function displayPdfFromBase64(base64Data) {
|
| 362 |
try {
|
| 363 |
if (!base64Data || base64Data.length < 100) {
|
| 364 |
console.log("No valid PDF data received");
|
| 365 |
document.getElementById('pdf-fallback').style.display = 'flex';
|
| 366 |
-
document.getElementById('pdf-
|
| 367 |
return;
|
| 368 |
}
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
}
|
| 376 |
|
| 377 |
-
//
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
}
|
| 383 |
|
| 384 |
-
//
|
| 385 |
-
const
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
-
//
|
| 389 |
-
const
|
| 390 |
-
|
|
|
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
}
|
| 400 |
} catch (error) {
|
| 401 |
-
console.error("Error
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
|
|
|
| 407 |
}
|
| 408 |
}
|
| 409 |
|
| 410 |
-
// Check for PDF data
|
| 411 |
-
function
|
| 412 |
const dataElement = document.getElementById('pdf_base64_data');
|
| 413 |
if (!dataElement) {
|
| 414 |
console.log("PDF data element not found, will retry");
|
| 415 |
-
setTimeout(
|
| 416 |
return;
|
| 417 |
}
|
| 418 |
|
| 419 |
const textarea = dataElement.querySelector('textarea');
|
| 420 |
if (!textarea) {
|
| 421 |
console.log("Textarea not found, will retry");
|
| 422 |
-
setTimeout(
|
| 423 |
return;
|
| 424 |
}
|
| 425 |
|
|
|
|
|
|
|
| 426 |
// Display initial data if available
|
| 427 |
if (textarea.value && textarea.value.length > 100) {
|
| 428 |
displayPdfFromBase64(textarea.value);
|
| 429 |
}
|
| 430 |
|
| 431 |
-
//
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
if (textarea.value && textarea.value.length > 100) {
|
| 434 |
displayPdfFromBase64(textarea.value);
|
| 435 |
}
|
| 436 |
-
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
}
|
| 438 |
|
| 439 |
// Start checking for PDF data
|
| 440 |
-
setTimeout(
|
| 441 |
|
| 442 |
// Add keyboard shortcuts
|
| 443 |
document.addEventListener('keydown', function(event) {
|
|
|
|
| 198 |
}
|
| 199 |
"""
|
| 200 |
|
| 201 |
+
with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css, head=
|
| 202 |
+
"""
|
| 203 |
+
<script src="https://unpkg.com/[email protected]/build/pdf.min.js"></script>
|
| 204 |
+
"""
|
| 205 |
+
) as demo:
|
| 206 |
gr.Markdown("## PDF Extractor Comparer")
|
| 207 |
|
| 208 |
with gr.Row():
|
|
|
|
| 221 |
label="PDF Document",
|
| 222 |
value='''
|
| 223 |
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
|
| 224 |
+
<div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
<div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
|
| 226 |
display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
|
| 227 |
Click "Load PDFs" to start viewing documents.
|
|
|
|
| 345 |
fn=None,
|
| 346 |
js="""
|
| 347 |
function() {
|
| 348 |
+
console.log("Setting up PDF.js viewer");

// Base URL of the PDF.js build directory on the CDN; the library and its
// worker must come from the same build, so both URLs derive from this value.
// NOTE(review): the "[email protected]" segment looks like an email-obfuscation
// artefact of the original "<package>@<version>" path — restore the real
// package name and version before relying on this URL.
const PDFJS_BUILD_URL = "https://unpkg.com/[email protected]/build";

// Point PDF.js at its worker script. Only call once window.pdfjsLib exists.
function configurePdfJsWorker() {
  window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_BUILD_URL + "/pdf.worker.min.js";
}

if (window.pdfjsLib) {
  configurePdfJsWorker();
  console.log("PDF.js configured with worker");
} else {
  console.warn("PDF.js not found in head, attempting to load dynamically");
  // Fallback to load PDF.js dynamically if not in the head
  const pdfJsScript = document.createElement('script');
  // Handlers are attached before the element is inserted so neither outcome
  // can be missed.
  pdfJsScript.onload = function() {
    configurePdfJsWorker();
    console.log("PDF.js loaded dynamically");
  };
  pdfJsScript.onerror = function() {
    // Without this the viewer would wait forever with no hint as to why.
    console.error("Failed to load PDF.js from " + pdfJsScript.src);
  };
  pdfJsScript.src = PDFJS_BUILD_URL + "/pdf.min.js";
  document.head.appendChild(pdfJsScript);
}

// To track when we should force a refresh: fingerprint of the PDF that is
// currently displayed, or "" when nothing is displayed / a refresh is forced.
let currentPdfHash = "";
|
| 369 |
+
|
| 370 |
+
// Render one page of a loaded PDF.js document onto a new canvas inside
// `container`.
//
// Parameters:
//   pdf        - object exposing getPage(pageNumber) that resolves to a page
//                with getViewport({ scale }) and render({ canvasContext,
//                viewport }).promise (presumably a PDF.js document proxy).
//   pageNumber - page index passed straight to pdf.getPage.
//   container  - element that receives the page wrapper; its clientWidth
//                determines the render scale.
//
// Returns a promise resolving to true when the page was rendered and false
// when anything failed (the error is logged, never thrown).
async function renderPage(pdf, pageNumber, container) {
  // Create and attach the wrapper synchronously, before the first await, so
  // that concurrent calls leave their pages in call order rather than in the
  // order their renders happen to finish.
  const pageContainer = document.createElement('div');
  pageContainer.className = 'pdf-page';
  pageContainer.style.position = 'relative';
  pageContainer.style.margin = '10px auto';
  pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';
  container.appendChild(pageContainer);

  try {
    const page = await pdf.getPage(pageNumber);

    // Create canvas for this page
    const canvas = document.createElement('canvas');
    const context = canvas.getContext('2d');
    pageContainer.appendChild(canvas);

    // Fit the page to the container width (minus 30px for margins). A hidden
    // or detached container reports a width of 0, which would give a zero or
    // negative scale, so fall back to the page's natural size in that case.
    const availableWidth = container.clientWidth - 30;
    const naturalViewport = page.getViewport({ scale: 1 });
    const scale = availableWidth > 0 ? availableWidth / naturalViewport.width : 1;
    const viewport = page.getViewport({ scale });

    // Set canvas dimensions
    canvas.width = viewport.width;
    canvas.height = viewport.height;

    // Render the PDF page into canvas context
    await page.render({
      canvasContext: context,
      viewport: viewport
    }).promise;

    return true;
  } catch (error) {
    console.error(`Error rendering page ${pageNumber}:`, error);
    // Do not leave an empty placeholder behind for a page that failed.
    pageContainer.remove();
    return false;
  }
}
|
| 412 |
+
|
| 413 |
+
// Cheap fingerprint of a (possibly multi-megabyte) string, used to detect
// whether the PDF payload changed.
//
// At most `maxSamples` characters are folded into a 32-bit rolling hash, but
// they are taken at an even stride across the WHOLE string rather than only
// from its start: payloads that share a long common prefix and have the same
// length would otherwise get the same fingerprint. For strings no longer than
// `maxSamples` every character is hashed, which gives the same value as
// hashing the prefix did.
//
// Parameters:
//   str        - string to fingerprint.
//   maxSamples - upper bound on characters inspected (default 10000).
//
// Returns "<hash>_<length>" for non-empty input and the number 0 for the
// empty string (kept as-is for compatibility with existing callers).
function hashData(str, maxSamples = 10000) {
  let hash = 0;
  if (str.length === 0) return hash;

  const sampleBudget = Math.max(1, Math.floor(maxSamples));
  const stride = Math.max(1, Math.ceil(str.length / sampleBudget));
  for (let i = 0; i < str.length; i += stride) {
    // hash * 31 + charCode, truncated to a signed 32-bit integer.
    hash = (((hash << 5) - hash) + str.charCodeAt(i)) | 0;
  }

  // Also include the length as PDFs with same start can be different
  return `${hash}_${str.length}`;
}
|
| 425 |
|
| 426 |
// Incremented for every render that is started (and whenever the viewer is
// cleared); an in-flight render compares its own number against this to find
// out that it has been superseded.
let pdfRenderGeneration = 0;

// Timer id of the pending "PDF.js is not loaded yet" retry, or null. Keeps
// repeated calls from stacking up one retry chain each.
let pdfLibraryRetryTimer = null;

// Show a plain-text message in the fallback panel. The text is assigned via
// textContent so error messages are never interpreted as HTML.
function showPdfFallbackMessage(message, color) {
  const fallback = document.getElementById('pdf-fallback');
  const box = document.createElement('div');
  box.style.fontFamily = 'Arial, sans-serif';
  if (color) {
    box.style.color = color;
  }
  box.textContent = message;
  fallback.textContent = '';
  fallback.appendChild(box);
  fallback.style.display = 'flex';
}

// Function to display PDF from base64 data.
//
// Decodes `base64Data`, loads it with window.pdfjsLib and renders every page
// into #pdf-container through renderPage. Payloads shorter than 100
// characters are treated as "no document": the viewer is cleared and the
// fallback panel is shown. A payload whose hashData fingerprint equals
// currentPdfHash is assumed to be on screen already and is skipped.
//
// Never throws: failures are logged, reported in the fallback panel, and
// currentPdfHash is reset so the same payload can be retried.
async function displayPdfFromBase64(base64Data) {
  let generation = null;
  let failedStep = 'processing';
  try {
    if (!base64Data || base64Data.length < 100) {
      console.log("No valid PDF data received");
      document.getElementById('pdf-fallback').style.display = 'flex';
      document.getElementById('pdf-container').innerHTML = '';
      // The viewer is empty now: forget the fingerprint so the previous
      // document renders again if it comes back, and cancel any render that
      // is still in flight.
      currentPdfHash = "";
      pdfRenderGeneration++;
      return;
    }

    // Check if this is the same PDF we already have displayed
    const dataHash = hashData(base64Data);
    if (dataHash === currentPdfHash) {
      console.log("Same PDF already displayed, skipping render");
      return;
    }

    // Check if PDF.js is loaded. This must happen BEFORE the fingerprint is
    // recorded; otherwise the retry below would be skipped as "same PDF" and
    // the document would never be rendered.
    if (!window.pdfjsLib) {
      console.warn("PDF.js not loaded yet, waiting...");
      showPdfFallbackMessage('Loading PDF viewer...');
      if (pdfLibraryRetryTimer === null) {
        pdfLibraryRetryTimer = setTimeout(() => {
          pdfLibraryRetryTimer = null;
          displayPdfFromBase64(base64Data);
        }, 500);
      }
      return;
    }

    // Update the current PDF hash
    currentPdfHash = dataHash;
    generation = ++pdfRenderGeneration;
    console.log("PDF changed, rendering new document");

    // Convert base64 to array buffer
    const binaryString = atob(base64Data);
    const bytes = new Uint8Array(binaryString.length);
    for (let i = 0; i < binaryString.length; i++) {
      bytes[i] = binaryString.charCodeAt(i);
    }

    // Clear existing content
    const container = document.getElementById('pdf-container');
    container.innerHTML = '';
    document.getElementById('pdf-fallback').style.display = 'none';

    // Show loading indicator
    const loadingIndicator = document.createElement('div');
    loadingIndicator.style.padding = '20px';
    loadingIndicator.style.textAlign = 'center';
    loadingIndicator.innerText = 'Loading PDF...';
    container.appendChild(loadingIndicator);

    // Load document
    failedStep = 'loading';
    const pdf = await window.pdfjsLib.getDocument({ data: bytes }).promise;
    if (generation !== pdfRenderGeneration) {
      return; // a newer request took over while this document was loading
    }

    // Clear the loading indicator
    container.innerHTML = '';

    // Pages go into a wrapper owned by this render. When a newer render
    // clears the container the wrapper is detached with it, so a page that
    // finishes late cannot show up in the newer document.
    const pagesHost = document.createElement('div');
    container.appendChild(pagesHost);

    console.log(`PDF loaded with ${pdf.numPages} pages`);

    // Render pages one after another: this keeps them in page order and lets
    // a superseded render stop early.
    let failedPages = 0;
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const rendered = await renderPage(pdf, pageNumber, pagesHost);
      if (generation !== pdfRenderGeneration) {
        return;
      }
      if (!rendered) {
        failedPages++;
      }
    }
    if (failedPages > 0) {
      console.warn(`${failedPages} of ${pdf.numPages} pages failed to render`);
    }
    console.log("All pages rendered");

    // Scroll to top
    container.scrollTop = 0;
  } catch (error) {
    if (failedStep === 'loading') {
      console.error("Error loading PDF:", error);
    } else {
      console.error("Error processing PDF data:", error);
    }
    if (generation !== null && generation !== pdfRenderGeneration) {
      return; // failure of a superseded render: leave the newer one alone
    }
    const detail = (error && error.message) || 'Unknown error';
    showPdfFallbackMessage(`Error ${failedStep} PDF: ${detail}`, 'red');
    currentPdfHash = ""; // Reset hash to allow retry
  }
}
|
| 518 |
|
| 519 |
+
// Locate the element with id 'pdf_base64_data' and its inner <textarea>
// (presumably a hidden Gradio textbox carrying the base64 PDF — verify against
// the Python side), then keep the viewer in sync with the textarea's value.
// If either element is missing the whole setup is retried after one second.
function setupPdfListener() {
  const retrySetup = () => setTimeout(setupPdfListener, 1000);

  const dataElement = document.getElementById('pdf_base64_data');
  if (!dataElement) {
    console.log("PDF data element not found, will retry");
    retrySetup();
    return;
  }

  const textarea = dataElement.querySelector('textarea');
  if (!textarea) {
    console.log("Textarea not found, will retry");
    retrySetup();
    return;
  }

  console.log("Found PDF data element, setting up listeners");

  // Hand the textarea's current value to the viewer when it is long enough
  // to be a real payload (more than 100 characters).
  const showCurrentValue = () => {
    const payload = textarea.value;
    if (payload && payload.length > 100) {
      displayPdfFromBase64(payload);
    }
  };

  // Display initial data if available
  showCurrentValue();

  // Use both an observer and polling for robustness
  // 1. MutationObserver: any reported mutation triggers one check.
  const observer = new MutationObserver(() => showCurrentValue());
  observer.observe(textarea, {
    attributes: true,
    characterData: true,
    subtree: true,
    childList: true
  });

  // 2. Polling once per second as a fallback.
  setInterval(showCurrentValue, 1000);

  // Monitor the next/prev buttons to force PDF refresh: clearing the stored
  // fingerprint makes the next check render again.
  const navigationButtons = [
    ['prev_button', 'Prev'],
    ['next_button', 'Next'],
  ];
  for (const [buttonId, label] of navigationButtons) {
    const button = document.getElementById(buttonId);
    if (!button) {
      continue;
    }
    button.addEventListener('click', () => {
      console.log(`${label} button clicked, forcing PDF refresh`);
      currentPdfHash = ""; // Reset hash to force refresh
    });
  }
}
|
| 586 |
|
| 587 |
// Start checking for PDF data
|
| 588 |
+
setTimeout(setupPdfListener, 1000);
|
| 589 |
|
| 590 |
// Add keyboard shortcuts
|
| 591 |
document.addEventListener('keydown', function(event) {
|