rishabhsetiya committed on
Commit
ef1795c
·
verified ·
1 Parent(s): 2d70a09

Update generate_indexes.py

Browse files
Files changed (1) hide show
  1. generate_indexes.py +7 -38
generate_indexes.py CHANGED
@@ -7,6 +7,7 @@ from typing import List, Dict
7
  import numpy as np
8
  import faiss
9
  import pandas as pd
 
10
  from sentence_transformers import SentenceTransformer
11
  from rank_bm25 import BM25Okapi
12
 
@@ -44,46 +45,14 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
44
  chunks.append(" ".join(current_chunk))
45
  return chunks
46
 
47
- import pdfplumber
48
- import pandas as pd
49
-
50
- def read_pdf_tables(pdf_path, pages="all"):
51
- """
52
- Extracts tables from a PDF using pdfplumber, similar to tabula.read_pdf(..., multiple_tables=True)
53
-
54
- Args:
55
- pdf_path (str): Path to the PDF file
56
- pages (str or list): Pages to extract from ("all" or list of page numbers, 1-based)
57
-
58
- Returns:
59
- List[pd.DataFrame]: List of tables extracted from the PDF
60
- """
61
- tables = []
62
-
63
- with pdfplumber.open(pdf_path) as pdf:
64
- if pages == "all":
65
- page_numbers = range(len(pdf.pages))
66
- else:
67
- # Convert 1-based to 0-based indices
68
- page_numbers = [p-1 for p in pages]
69
-
70
- for i in page_numbers:
71
- page = pdf.pages[i]
72
- # Extract tables from this page
73
- page_tables = page.extract_tables()
74
-
75
- for table in page_tables:
76
- if table: # ignore empty tables
77
- df = pd.DataFrame(table[1:], columns=table[0]) # first row as header
78
- # convert all columns to str to mimic pandas_options={'dtype': str}
79
- df = df.astype(str)
80
- tables.append(df)
81
-
82
- return tables
83
-
84
  def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
85
  """Extract tables from financial PDF into structured row-year-value dicts."""
86
- tables = read_pdf_tables(pdf_path)
 
 
 
 
 
87
 
88
  table_rows = []
89
  row_id = 0
 
7
  import numpy as np
8
  import faiss
9
  import pandas as pd
10
+ import tabula
11
  from sentence_transformers import SentenceTransformer
12
  from rank_bm25 import BM25Okapi
13
 
 
45
  chunks.append(" ".join(current_chunk))
46
  return chunks
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
49
  """Extract tables from financial PDF into structured row-year-value dicts."""
50
+ tables = tabula.read_pdf(
51
+ pdf_path,
52
+ pages=pages,
53
+ multiple_tables=True,
54
+ pandas_options={'dtype': str}
55
+ )
56
 
57
  table_rows = []
58
  row_id = 0