import logging
import os
import re
import torch
import faiss
import numpy as np
from typing import Dict, List, Any, Tuple, Optional
from . import Common_MyUtils as MyUtils
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class DirectFaissIndexer:
"""
1) FaissPath (.faiss): chỉ chứa vectors,
2) MapDataPath (.json): content + index,
3) MappingPath (.json): ánh xạ key <-> index.
"""
def __init__(
self,
indexer: Any,
device: str = "cpu",
batch_size: int = 32,
show_progress: bool = False,
flatten_mode: str = "split",
join_sep: str = "\n",
allowed_schema_types: Tuple[str, ...] = ("string", "array", "dict"),
max_chars_per_text: Optional[int] = None,
normalize: bool = True,
verbose: bool = False,
list_policy: str = "split", # "merge" | "split"
):
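        """
        Args (selected):
            indexer: any encoder exposing a SentenceTransformer-style
                .encode(sentences=..., batch_size=..., convert_to_tensor=...,
                device=..., show_progress_bar=...) method (assumption inferred
                from how it is called in _encode_texts).
            flatten_mode / join_sep: forwarded to MyUtils.flatten_json.
            list_policy: "merge" joins lists of plain strings/numbers into one
                text block; "split" keeps each element as its own entry.
            normalize: L2-normalize embeddings so inner-product search behaves
                like cosine similarity.
        """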
self.indexer = indexer
self.device = device
self.batch_size = batch_size
self.show_progress = show_progress
self.flatten_mode = flatten_mode
self.join_sep = join_sep
self.allowed_schema_types = allowed_schema_types
self.max_chars_per_text = max_chars_per_text
self.normalize = normalize
self.verbose = verbose
self.list_policy = list_policy
self._non_keep_pattern = re.compile(r"[^\w\s\(\)\.\,\;\:\-–]", flags=re.UNICODE)
    # ---------- Schema & field selection ----------
@staticmethod
def _base_key_for_schema(key: str) -> str:
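        # Strip list indices such as "[0]" so a key like "sections[2]" matches the schema entry "sections".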
return re.sub(r"\[\d+\]", "", key)
def _eligible_by_schema(self, key: str, schema: Optional[Dict[str, str]]) -> bool:
if schema is None:
return True
base_key = self._base_key_for_schema(key)
typ = schema.get(base_key)
return (typ in self.allowed_schema_types) if typ is not None else False
    # ---------- Preprocessing & flatten ----------
    def _preprocess_data(self, data: Any) -> Any:
        if MyUtils and hasattr(MyUtils, "preprocess_data"):
            return MyUtils.preprocess_data(
                data,
                non_keep_pattern=self._non_keep_pattern,
                max_chars_per_text=self.max_chars_per_text
            )
        # Fall back to the raw data when no preprocessing helper is available.
        return data
def _flatten_json(self, data: Any) -> Dict[str, Any]:
"""
Flatten JSON theo list_policy:
- merge: gộp list/dict chứa chuỗi thành 1 đoạn text duy nhất
- split: tách từng phần tử
"""
        # For "merge", pre-process the JSON before flattening
if self.list_policy == "merge":
def _merge_lists(obj):
if isinstance(obj, dict):
return {k: _merge_lists(v) for k, v in obj.items()}
elif isinstance(obj, list):
                    # If the list contains only strings/numbers, join them into one string
if all(isinstance(i, (str, int, float)) for i in obj):
return self.join_sep.join(map(str, obj))
                    # If the list contains dicts or nested lists, recurse into each element
return [_merge_lists(v) for v in obj]
else:
return obj
data = _merge_lists(data)
        # Then delegate to MyUtils.flatten_json as before
return MyUtils.flatten_json(
data,
prefix="",
flatten_mode=self.flatten_mode,
join_sep=self.join_sep
)
    # ---------- Batch encode with CPU fallback on CUDA OOM ----------
def _encode_texts(self, texts: List[str]) -> torch.Tensor:
try:
embs = self.indexer.encode(
sentences=texts,
batch_size=self.batch_size,
convert_to_tensor=True,
device=self.device,
show_progress_bar=self.show_progress,
)
return embs
except RuntimeError as e:
if "CUDA out of memory" in str(e):
                logging.warning("⚠️ CUDA OOM → falling back to CPU.")
try:
self.indexer.to("cpu")
except Exception:
pass
embs = self.indexer.encode(
sentences=texts,
batch_size=self.batch_size,
convert_to_tensor=True,
device="cpu",
show_progress_bar=self.show_progress,
)
return embs
raise
# ---------- Build FAISS ----------
@staticmethod
def _l2_normalize(mat: np.ndarray) -> np.ndarray:
norms = np.linalg.norm(mat, axis=1, keepdims=True)
norms[norms == 0.0] = 1.0
return mat / norms
def _create_faiss_index(self, matrix: np.ndarray) -> faiss.Index:
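        # Exact inner-product index; with L2-normalized vectors this is equivalent to cosine similarity.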
dim = int(matrix.shape[1])
index = faiss.IndexFlatIP(dim)
index.add(matrix.astype("float32"))
return index
    # ================================================================
    # Deduplicate texts while still grouping their corresponding chunks
    # ================================================================
def deduplicates_with_mask(
self,
pairs: List[Tuple[str, str]],
chunk_map: List[int]
) -> Tuple[List[Tuple[str, str]], List[List[int]]]:
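        """
        Drop duplicate (key, text) pairs per base key, and record for every
        retained pair the list of chunk ids it appeared in.
        """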
        assert len(pairs) == len(chunk_map), "pairs and chunk_map must have the same length"
seen_per_key: Dict[str, Dict[str, int]] = {}
        # base_key -> text_norm -> index into filtered_pairs
filtered_pairs: List[Tuple[str, str]] = []
        chunk_groups: List[List[int]] = []  # parallel to filtered_pairs
for (key, text), c in zip(pairs, chunk_map):
text_norm = text.strip()
if not text_norm:
continue
base_key = re.sub(r"\[\d+\]", "", key)
if base_key not in seen_per_key:
seen_per_key[base_key] = {}
            # Text already seen → add this chunk to the existing group
if text_norm in seen_per_key[base_key]:
idx = seen_per_key[base_key][text_norm]
if c not in chunk_groups[idx]:
chunk_groups[idx].append(c)
continue
            # Not seen yet → create a new entry
seen_per_key[base_key][text_norm] = len(filtered_pairs)
filtered_pairs.append((key, text_norm))
chunk_groups.append([c])
return filtered_pairs, chunk_groups
    # ================================================================
    # Write the chunk mapping
    # ================================================================
def write_chunk_mapping(self, MapChunkPath: str, SegmentPath: str, chunk_groups: List[List[int]]) -> None:
        # Write the chunk mapping compactly: one index per line
with open(MapChunkPath, "w", encoding="utf-8") as f:
f.write('{\n')
f.write(' "index_to_chunk": {\n')
items = list(enumerate(chunk_groups))
for i, (idx, group) in enumerate(items):
group_str = "[" + ", ".join(map(str, group)) + "]"
comma = "," if i < len(items) - 1 else ""
f.write(f' "{idx}": {group_str}{comma}\n')
f.write(' },\n')
f.write(' "meta": {\n')
f.write(f' "count": {len(chunk_groups)},\n')
f.write(f' "source": "{os.path.basename(SegmentPath)}"\n')
f.write(' }\n')
f.write('}\n')
    # ================================================================
    # Main entry point: build_from_json
    # ================================================================
    def build_from_json(
        self,
        SegmentPath: str,
        SchemaDict: Optional[Dict[str, str]],
        FaissPath: str,
        MapDataPath: str,
        MappingPath: str,
        MapChunkPath: Optional[str] = None,
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
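        """
        Read SegmentPath, flatten each chunk, deduplicate the texts, encode
        them, build the FAISS index at FaissPath and (optionally) write the
        chunk mapping, then return the (Mapping, MapData) dictionaries
        (writing MappingPath / MapDataPath is left to the caller).
        """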
        assert os.path.exists(SegmentPath), f"JSON file not found: {SegmentPath}"
os.makedirs(os.path.dirname(FaissPath), exist_ok=True)
os.makedirs(os.path.dirname(MapDataPath), exist_ok=True)
os.makedirs(os.path.dirname(MappingPath), exist_ok=True)
if MapChunkPath:
os.makedirs(os.path.dirname(MapChunkPath), exist_ok=True)
schema = SchemaDict
# 1️⃣ Read JSON
data_obj = MyUtils.read_json(SegmentPath)
data_list = data_obj if isinstance(data_obj, list) else [data_obj]
        # 2️⃣ Flatten + record chunk_id
pair_list: List[Tuple[str, str]] = []
chunk_map: List[int] = []
for chunk_id, item in enumerate(data_list, start=1):
processed = self._preprocess_data(item)
flat = self._flatten_json(processed)
for k, v in flat.items():
if not self._eligible_by_schema(k, schema):
continue
if isinstance(v, str) and v.strip():
pair_list.append((k, v.strip()))
chunk_map.append(chunk_id)
if not pair_list:
            raise ValueError("No valid text content found to encode.")
        # 3️⃣ Deduplicate while grouping chunks
pair_list, chunk_groups = self.deduplicates_with_mask(pair_list, chunk_map)
# 4️⃣ Encode
keys = [k for k, _ in pair_list]
texts = [t for _, t in pair_list]
embs_t = self._encode_texts(texts)
embs = embs_t.detach().cpu().numpy()
if self.normalize:
embs = self._l2_normalize(embs)
# 5️⃣ FAISS
index = self._create_faiss_index(embs)
faiss.write_index(index, FaissPath)
        logging.info(f"✅ Built FAISS index: {FaissPath}")
# 6️⃣ Mapping + MapData
index_to_key = {str(i): k for i, k in enumerate(keys)}
Mapping = {
"meta": {
"count": len(keys),
"dim": int(embs.shape[1]),
"metric": "ip",
"normalized": bool(self.normalize),
},
"index_to_key": index_to_key,
}
MapData = {
"items": [{"index": i, "key": k, "text": t} for i, (k, t) in enumerate(pair_list)],
"meta": {
"count": len(keys),
"flatten_mode": self.flatten_mode,
"schema_used": schema is not None,
"list_policy": self.list_policy
}
}
        if MapChunkPath:
            self.write_chunk_mapping(MapChunkPath, SegmentPath, chunk_groups)
return Mapping, MapData
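
# ----------------------------------------------------------------
# Minimal usage sketch (assumptions, not part of the original module):
# it assumes a SentenceTransformer encoder and hypothetical file paths,
# and has to be run as a package module (e.g. python -m Libraries.Faiss_Embedding)
# because of the relative import of Common_MyUtils above.
# ----------------------------------------------------------------
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer  # assumed encoder backend

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # hypothetical model choice
    builder = DirectFaissIndexer(indexer=model, device="cpu", list_policy="split")

    # Hypothetical paths: a segmented JSON file in, FAISS + mapping files out.
    mapping, map_data = builder.build_from_json(
        SegmentPath="Data/segments.json",
        SchemaDict=None,                      # None -> index every flattened string field
        FaissPath="Data/index.faiss",
        MapDataPath="Data/map_data.json",
        MappingPath="Data/mapping.json",
        MapChunkPath="Data/map_chunk.json",
    )
    logging.info("Indexed %d texts (dim=%d)", mapping["meta"]["count"], mapping["meta"]["dim"])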