File size: 2,747 Bytes
dbe2c62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e92cb9
dbe2c62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import logging
import os
import faiss

# Root logger: INFO level, "<timestamp> - <level> - <message>" records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Process-wide environment flags, each set to "1" at import time.
# NOTE(review): judging by their names these are read by huggingface_hub and
# PyTorch/CUDA; confirm this module is imported before those libraries start.
os.environ.update({
    "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
    "CUDA_LAUNCH_BLOCKING": "1",
    "TORCH_USE_CUDA_DSA": "1",
})

def ConfigValues(service="Search", inputs="file.pdf"):
    """Build the settings dictionary for one service.

    Args:
        service: Name interpolated into the ``./Documents`` and
            ``./Database`` paths.
        inputs: File name appended to the ``./Private/Tests`` folder.

    Returns:
        A dict of settings: path strings, the two data-key names, model
        identifiers, the ``faiss.IndexFlatIP`` attribute (under
        ``"SEARCH_EGINE"``) and the integer ``"WORD_LIMIT"``.
    """
    # Inputs
    input_path = f"./Private/Tests/{inputs}"

    # Assets
    assets_dir = "./Assets"

    # Documents: the .pdf and .docx paths share one stem
    doc_stem = f"./Documents/{service}"

    # Database: every artifact shares the "<folder>/<service>/<service>" prefix
    db_stem = f"./Database/{service}/{service}"
    extract_stem = f"{db_stem}_Extract"
    chunks_stem = f"{db_stem}_Chunks"
    embedding_stem = f"{db_stem}_Embedding"

    # Key spellings are kept exactly as-is; they are part of the returned
    # interface that callers index into.
    return {
        "inputPath": input_path,
        "PdfPath": f"{doc_stem}.pdf",
        "DocPath": f"{doc_stem}.docx",
        "exceptPath": f"{assets_dir}/ex.exceptions.json",
        "markerPath": f"{assets_dir}/ex.markers.json",
        "statusPath": f"{assets_dir}/ex.status.json",
        "RawDataPath": f"{extract_stem}_Raw.json",
        "RawLvlsPath": f"{extract_stem}_Levels.json",
        "StructsPath": f"{chunks_stem}_Struct.json",
        "SegmentPath": f"{chunks_stem}_Segment.json",
        "SchemaPath": f"{chunks_stem}_Schema.json",
        "FaissPath": f"{embedding_stem}_Index.faiss",
        "MappingPath": f"{embedding_stem}_Mapping.json",
        "MapDataPath": f"{embedding_stem}_MapData.json",
        "MapChunkPath": f"{embedding_stem}_MapChunk.json",
        "MetaPath": f"{embedding_stem}_Meta.json",
        # Keys
        "DATA_KEY": "contents",
        "EMBE_KEY": "embeddings",
        # Models
        "SEARCH_EGINE": faiss.IndexFlatIP,
        "RERANK_MODEL": "BAAI/bge-reranker-base",
        "RESPON_MODEL": "gpt-3.5-turbo",
        "CHUNKS_MODEL": "paraphrase-multilingual-MiniLM-L12-v2",
        "EMBEDD_MODEL": "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
        "SUMARY_MODEL": "LongK171/bartpho-syllable-vnexpress",
        "WORD_LIMIT": 1000,
    }