Update generate_indexes.py
Browse files

generate_indexes.py  +7 -38  CHANGED

@@ -7,6 +7,7 @@ from typing import List, Dict
 import numpy as np
 import faiss
 import pandas as pd
+import tabula
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 
@@ -44,46 +45,14 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
         chunks.append(" ".join(current_chunk))
     return chunks
 
-import pdfplumber
-import pandas as pd
-
-def read_pdf_tables(pdf_path, pages="all"):
-    """
-    Extracts tables from a PDF using pdfplumber, similar to tabula.read_pdf(..., multiple_tables=True)
-
-    Args:
-        pdf_path (str): Path to the PDF file
-        pages (str or list): Pages to extract from ("all" or list of page numbers, 1-based)
-
-    Returns:
-        List[pd.DataFrame]: List of tables extracted from the PDF
-    """
-    tables = []
-
-    with pdfplumber.open(pdf_path) as pdf:
-        if pages == "all":
-            page_numbers = range(len(pdf.pages))
-        else:
-            # Convert 1-based to 0-based indices
-            page_numbers = [p-1 for p in pages]
-
-        for i in page_numbers:
-            page = pdf.pages[i]
-            # Extract tables from this page
-            page_tables = page.extract_tables()
-
-            for table in page_tables:
-                if table:  # ignore empty tables
-                    df = pd.DataFrame(table[1:], columns=table[0])  # first row as header
-                    # convert all columns to str to mimic pandas_options={'dtype': str}
-                    df = df.astype(str)
-                    tables.append(df)
-
-    return tables
-
 def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
     """Extract tables from financial PDF into structured row-year-value dicts."""
-    tables = read_pdf_tables(pdf_path, pages)
+    tables = tabula.read_pdf(
+        pdf_path,
+        pages=pages,
+        multiple_tables=True,
+        pandas_options={'dtype': str}
+    )
 
     table_rows = []
     row_id = 0