Update generate_indexes.py
Browse files

generate_indexes.py  +7 -38  CHANGED

@@ -7,6 +7,7 @@ from typing import List, Dict
 import numpy as np
 import faiss
 import pandas as pd
+import tabula
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 
@@ -44,46 +45,14 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
         chunks.append(" ".join(current_chunk))
     return chunks
 
-import pdfplumber
-import pandas as pd
-
-def read_pdf_tables(pdf_path, pages="all"):
-    """
-    Extracts tables from a PDF using pdfplumber, similar to tabula.read_pdf(..., multiple_tables=True)
-
-    Args:
-        pdf_path (str): Path to the PDF file
-        pages (str or list): Pages to extract from ("all" or list of page numbers, 1-based)
-
-    Returns:
-        List[pd.DataFrame]: List of tables extracted from the PDF
-    """
-    tables = []
-
-    with pdfplumber.open(pdf_path) as pdf:
-        if pages == "all":
-            page_numbers = range(len(pdf.pages))
-        else:
-            # Convert 1-based to 0-based indices
-            page_numbers = [p-1 for p in pages]
-
-        for i in page_numbers:
-            page = pdf.pages[i]
-            # Extract tables from this page
-            page_tables = page.extract_tables()
-
-            for table in page_tables:
-                if table:  # ignore empty tables
-                    df = pd.DataFrame(table[1:], columns=table[0])  # first row as header
-                    # convert all columns to str to mimic pandas_options={'dtype': str}
-                    df = df.astype(str)
-                    tables.append(df)
-
-    return tables
-
 def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
     """Extract tables from financial PDF into structured row-year-value dicts."""
-    tables = read_pdf_tables(pdf_path, pages)
+    tables = tabula.read_pdf(
+        pdf_path,
+        pages=pages,
+        multiple_tables=True,
+        pandas_options={'dtype': str}
+    )
 
     table_rows = []
     row_id = 0