Spaces:

Artteiv
/

crop-diag-module

Running on CPU Upgrade

App Files Files Community

Sontranwakumo commited on May 28

Commit

88cc76c

1 Parent(s): 77d75db

init: move from github

Browse files

Files changed (32) hide show

.DS_Store +0 -0
.env.example +5 -0
.gitattributes +2 -0
.gitignore +114 -0
README.md +128 -1
app/.DS_Store +0 -0
app/__init__.py +0 -0
app/api/__init__.py +0 -0
app/api/dto/kg_query.py +23 -0
app/api/routes.py +40 -0
app/core/__init__.py +0 -0
app/core/config.py +29 -0
app/core/dependencies.py +28 -0
app/core/type.py +46 -0
app/data/faiss_index.index +3 -0
app/data/image_faiss_index.index +3 -0
app/data/vector_embeddings.db +3 -0
app/main.py +84 -0
app/models/__init__.py +0 -0
app/models/crop_clip.py +98 -0
app/models/gemini_caller.py +41 -0
app/models/knowledge_graph.py +126 -0
app/services/__init__.py +0 -0
app/services/predict.py +60 -0
app/utils/constant.py +2 -0
app/utils/data_mapping.py +65 -0
app/utils/extract_entity.py +25 -0
app/utils/prompt.py +85 -0
environment.yml +167 -0
prepare_script/image_caption_embeddings.py +207 -0
prepare_script/sync_neo4j_node.py +172 -0
requirements.txt +8 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.env.example ADDED Viewed

	@@ -0,0 +1,5 @@

+NEO4J_URI=
+NEO4J_USER=neo4j
+NEO4J_PASSWORD=
+OPENAI_API_KEY=
+GEMINI_API_KEY=

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.index filter=lfs diff=lfs merge=lfs -text
+*.db filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,114 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+weights/
+## Big data
+/Data
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+# FastAPI specific
+.pytest_cache/

README.md CHANGED Viewed

@@ -8,4 +8,131 @@ pinned: false
 short_description: Crop diagnosis module
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Crop diagnosis module
 ---
+# Crop Diagnosis Knowledge Graph Module
+A powerful tool for querying crop diagnosis knowledge graphs using LangChain and Neo4j. This module provides an API interface to interact with a knowledge graph containing agricultural and crop disease information.
+## Features
+- Natural language querying of crop diagnosis knowledge graph
+- Integration with LangChain for intelligent query processing
+- Neo4j database backend for efficient graph operations
+- RESTful API interface
+- Environment-based configuration
+## Prerequisites
+- Python 3.9+ (the code uses built-in generic annotations such as `list[Node]`)
+- Neo4j Database (version 5.x)
+- OpenAI API key (for LangChain integration)
+## Installation
+1. Clone the repository:
+```bash
+git clone [repository-url]
+cd crop-diag-module
+```
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+3. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+## Configuration
+1. Create a `.env` file in the project root:
+```bash
+cp .env.example .env
+```
+2. Edit the `.env` file with your configuration:
+```env
+# Neo4j Configuration
+NEO4J_URI=bolt://localhost:7687
+NEO4J_USER=neo4j
+NEO4J_PASSWORD=your_password
+# API Configuration
+API_HOST=0.0.0.0
+API_PORT=8000
+DEBUG=True
+# LangChain / Gemini Configuration
+OPENAI_API_KEY=your_openai_api_key
+GEMINI_API_KEY=your_gemini_api_key
+```
+Replace the following values:
+- `NEO4J_URI`: Your Neo4j database URI
+- `NEO4J_USER`: Neo4j username
+- `NEO4J_PASSWORD`: Neo4j password
+- `OPENAI_API_KEY`: Your OpenAI API key
+- `GEMINI_API_KEY`: Your Google Gemini API key
+## Running the Application
+1. Start the FastAPI server:
+```bash
+uvicorn app.main:app --reload
+```
+2. Access the API documentation:
+- Swagger UI: http://localhost:8000/docs
+- ReDoc: http://localhost:8000/redoc
+## API Usage
+### Query the Knowledge Graph
+```bash
+curl -X POST "http://localhost:8000/api/kg-query" \
+     -H "Content-Type: application/json" \
+     -d '{"crop_id": "luaNuoc", "additional_info": "Lá lúa xuất hiện đốm nâu và bị vàng"}'
+```
+## Project Structure
+```
+crop-diag-module/
+├── app/
+│   ├── api/          # API routes and endpoints
+│   ├── core/         # Core functionality
+│   ├── models/       # Data models
+│   └── utils/        # Utility functions
+├── KG/               # Knowledge Graph data
+├── tests/            # Test cases
+├── requirements.txt  # Project dependencies
+└── .env             # Environment configuration
+```
+## Development
+### Running Tests
+```bash
+pytest tests/
+```
+### Code Style
+This project follows PEP 8 style guidelines. Note that the commands below start the development server (the first one with an explicit host and port); they do not check code style:
+```bash
+uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
+uvicorn app.main:app --reload
+```
+## License
+[Add your license information here]
+## Contributing
+[Add contribution guidelines here]

app/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app/__init__.py ADDED Viewed

File without changes

app/api/__init__.py ADDED Viewed

File without changes

app/api/dto/kg_query.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from typing import List, Optional
+from pydantic import BaseModel
+from app.core.type import Node
+class QueryContext(BaseModel):
+    """Accumulated state of a diagnosis session; every field is optional."""
+    # Identifier of the crop the query is about
+    crop_id: Optional[str] = None
+    # Knowledge-graph nodes gathered so far
+    nodes: Optional[List[Node]] = None
+    # NOTE(review): declared as plain strings, while PredictedLabel below is an
+    # object with crop_name/label/confidence - confirm which shape clients send.
+    predicted_labels: Optional[List[str]] = None
+class PredictedLabel(BaseModel):
+    """One classification result: a crop, a class label and its confidence."""
+    crop_name: str
+    label: str
+    confidence: float
+class KGQueryRequest(BaseModel):
+    """Request body for a knowledge-graph query; every field is optional."""
+    # Context carried over from a previous call, if any
+    context: Optional[QueryContext] = None
+    crop_id: Optional[str] = None
+    # Free-text description supplied by the caller
+    additional_info: Optional[str] = None
+class KGQueryResponse(BaseModel):
+    """Answer text together with the list of sources backing it."""
+    answer: str
+    sources: List[str]

app/api/routes.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
+from pydantic import BaseModel
+from typing import List, Optional
+from app.api.dto.kg_query import KGQueryRequest, KGQueryResponse, PredictedLabel, QueryContext
+from app.core.dependencies import get_all_models, get_clip_model, get_data_mapper
+from app.core.type import Node
+from app.models.crop_clip import CLIPModule
+from app.services.predict import PredictService, get_predict_service
+from app.utils.extract_entity import extract_entities
+router = APIRouter()
+class QueryRequest(BaseModel):
+    """Question plus optional context strings."""
+    # NOTE(review): not referenced by either route defined in this module.
+    question: str
+    context: Optional[List[str]] = None
+class QueryResponse(BaseModel):
+    """Answer text plus the sources it was derived from."""
+    # NOTE(review): not referenced by either route defined in this module.
+    answer: str
+    sources: List[str]
+@router.post("/analyze")
+async def analyze(
+    image: UploadFile = File(None),
+    predict_service: PredictService = Depends(get_predict_service)
+):
+    predicted_label = predict_service.predict_image(image)
+    return {
+        "crop_id": predicted_label[0].crop_name,
+        "predicted_labels": predicted_label,
+        "nodes": []
+    }
+@router.post("/kg-query")
+async def query_kg(
+    request: KGQueryRequest,
+    predict_service: PredictService = Depends(get_predict_service),
+):
+    """Forward the request to PredictService.retrieve_kg and return its result."""
+    return predict_service.retrieve_kg(request)

app/core/__init__.py ADDED Viewed

File without changes

app/core/config.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from pydantic_settings import BaseSettings
+from functools import lru_cache
+class Settings(BaseSettings):
+    """Application settings; values are read from the environment and `.env`."""
+    # Neo4j Configuration
+    neo4j_uri: str
+    neo4j_user: str
+    neo4j_password: str
+    neo4j_database: str = "neo4j"
+    # API Configuration
+    api_host: str = "0.0.0.0"
+    api_port: int = 8000
+    debug: bool = True
+    # LLM provider keys (no defaults, so both must be supplied)
+    openai_api_key: str
+    gemini_api_key: str
+    # Feature flags: which components are loaded at application startup
+    load_clip_model: bool = True
+    load_gemini_model: bool = True
+    load_data_mapper: bool = True
+    load_knowledge_graph: bool = True
+    class Config:
+        env_file = ".env"
+@lru_cache()
+def get_settings() -> Settings:
+    """Return the Settings instance; lru_cache makes it a process-wide singleton."""
+    return Settings()

app/core/dependencies.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from fastapi import Depends, Request
+from app.models.crop_clip import CLIPModule
+from app.utils.data_mapping import DataMapping
+from app.models.knowledge_graph import KnowledgeGraphUtils, Neo4jConnection
+def get_clip_model(request: Request) -> CLIPModule:
+    """Return the CLIP model stored on app.state.model_loader."""
+    return request.app.state.model_loader.clip_model
+def get_data_mapper(request: Request) -> DataMapping:
+    """Return the DataMapping instance stored on app.state.model_loader."""
+    return request.app.state.model_loader.data_mapper
+def get_knowledge_graph(request: Request) -> KnowledgeGraphUtils:
+    """Return the KnowledgeGraphUtils instance stored on app.state.model_loader."""
+    return request.app.state.model_loader.knowledge_graph
+def get_all_models(
+    clip_model: CLIPModule = Depends(get_clip_model),
+    data_mapper: DataMapping = Depends(get_data_mapper),
+    knowledge_graph: KnowledgeGraphUtils = Depends(get_knowledge_graph)
+):
+    """Bundle all models from app.state into one dict keyed by component name."""
+    return {
+        "clip_model": clip_model,
+        "data_mapper": data_mapper,
+        "knowledge_graph": knowledge_graph
+    }

app/core/type.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from pydantic import BaseModel
+from typing import List, Optional
+import json
+class Node(BaseModel):
+    id: str
+    label: str
+    name: str
+    properties: dict
+    score: Optional[float] = None
+    @staticmethod
+    def map_json_to_node(json_data: dict, label: str = None) -> 'Node':
+        node_data = {
+            "name": json_data.pop("name") if "name" in json_data else json_data["id"],
+            "id": json_data.pop("id"),
+            "label": label if label else json_data.pop("label"),
+            "properties": json_data
+        }
+        return Node(**node_data)
+    @staticmethod
+    def data_row_to_node(data_row: list[str], score = None) -> 'Node':
+        return Node(
+            id=data_row[1],
+            name=data_row[2],
+            label=data_row[3],
+            properties=json.loads(data_row[4]),
+            score=score
+        )
+class Relationship(BaseModel):
+    """A typed edge between two nodes, referenced by source and target strings."""
+    source: str
+    target: str
+    type: str
+    properties: dict
+class KnowledgeGraph(BaseModel):
+    """A graph expressed as a list of nodes and a list of relationships."""
+    nodes: List[Node]
+    relationships: List[Relationship]
+class GraphQuery(BaseModel):
+    """A named Cypher query with optional parameters and description."""
+    key: str
+    cypher: str
+    parameters: Optional[dict] = None
+    description: Optional[str] = None

app/data/faiss_index.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1af6778f8fb10ee5a2d44f2815bc288c0ebac355c74a2b144d95720af5b8171
+size 1188909

app/data/image_faiss_index.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1f42c9ecb4da4a64cf34d90dc887564accc467233a8bfde986e6fa02b788b10
+size 41824301

app/data/vector_embeddings.db ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b9b2d7cab4f196a4a739830702944fedfe10f123e2c8ca3f4285857f56e7996
+size 5394432

app/main.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import logging
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from app.core.config import get_settings
+from app.api.routes import router as api_router
+from app.models.crop_clip import CLIPModule
+from app.models.gemini_caller import GeminiGenerator
+from app.utils.data_mapping import DataMapping, SingletonModel
+from app.models.knowledge_graph import KnowledgeGraphUtils, Neo4jConnection
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+settings = get_settings()
+class ModelLoader:
+    def __init__(self):
+        self.clip_model = None
+        self.gemini_model = None
+        self.sentence_transformer = None
+        self.neo4j_connection = None
+    def load_models(self):
+        try:
+            if settings.load_clip_model:
+                logger.info("Loading CLIP model...")
+                self.clip_model = CLIPModule()
+                logger.info("CLIP model loaded successfully")
+            if settings.load_gemini_model:
+                logger.info("Loading Gemini model...")
+                self.gemini_model = GeminiGenerator()
+                logger.info("Gemini model loaded successfully")
+            if settings.load_data_mapper:
+                logger.info("Loading DataMapper model...")
+                self.data_mapper = DataMapping()
+                logger.info("DataMapper model loaded successfully")
+            if settings.load_knowledge_graph:
+                logger.info("Connecting to Knowledge Graph...")
+                self.knowledge_graph = KnowledgeGraphUtils()
+                logger.info("Knowledge Graph connection established")
+        except Exception as e:
+            logger.error(f"Failed to load models: {e}")
+            raise
+    def close(self):
+        if self.neo4j_connection:
+            logger.info("Closing Neo4j connection...")
+            self.neo4j_connection.close()
+        self.clip_model = None
+        self.gemini_model = None
+        self.sentence_transformer = None
+        logger.info("Models released")
+# Lifespan event handler
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    loop = asyncio.get_event_loop()
+    with ThreadPoolExecutor() as pool:
+        await loop.run_in_executor(pool, app.state.model_loader.load_models)
+    logger.info("Application startup complete")
+    yield
+    app.state.model_loader.close()
+    logger.info("Application shutdown complete")
+app = FastAPI(
+    title="Crop Diagnosis Knowledge Graph API",
+    description="API for querying crop diagnosis knowledge graph using LangChain",
+    version="1.0.0",
+    debug=settings.debug,
+    lifespan=lifespan
+)
+app.state.model_loader = ModelLoader()
+app.include_router(api_router, prefix="/api")
+@app.get("/")
+async def root():
+    """Return a static welcome message."""
+    return {"message": "Welcome to Crop Diagnosis Knowledge Graph API"}

app/models/__init__.py ADDED Viewed

File without changes

app/models/crop_clip.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from typing import List
+import torch.nn as nn
+import torch
+from torchvision import transforms
+import clip
+from PIL import Image
+import os
+from app.api.dto.kg_query import PredictedLabel
+CLASS_NAMES = ['benhVerticilliumWiltCaChua', 'benhChayLaCaChua', 'benhXoanLaCaChua', 'benhDomLaCaChua',
+               'benhNhenXanhSan', 'benhKhamLaSan', 'cassava healthy', 'benhDomNau',
+               'boCanhCungHaiLaNgo', 'corn healthy', 'benhChayLaNgo', 'benhRiSatNgo', 'benhSocLaNgo',
+               'benhDomLaNgo', 'benhBacLaLua', 'benhDaoOnLua', 'benhDomNauLuaNuoc']
+CROP_NAMES = ['caChua', 'caChua', 'caChua', 'caChua', 'san', 'san', 'san', 'san',
+              'ngo', 'ngo', 'ngo', 'ngo', 'ngo', 'ngo', 'luaNuoc', 'luaNuoc', 'luaNuoc']
+WEIGHTS_PATH = os.path.join(os.path.dirname(__file__), 'weights', 'clip_finetuned.pth')
+class CLIPFineTuner(nn.Module):
+    """CLIP image encoder followed by a linear classification head."""
+    def __init__(self, model, num_classes):
+        super(CLIPFineTuner, self).__init__()
+        self.model = model
+        # The head maps the visual embedding (model.visual.output_dim) to class logits
+        self.classifier = nn.Linear(model.visual.output_dim, num_classes)
+    def forward(self, x):
+        """Return class logits for an image batch; the encoder runs under no_grad."""
+        with torch.no_grad():
+            features = self.model.encode_image(x).float()  # Convert to float32
+        return self.classifier(features)
+class CLIPModule:
+    def __init__(self):
+        model, preprocess = clip.load("ViT-B/32", jit=False)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = CLIPFineTuner(model, 17)
+        self.model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=self.device))
+        self.model.to(self.device)
+        self.model.eval()
+        self.classes = CLASS_NAMES
+        self.transform = preprocess
+    def predict_image(self, image: Image.Image):
+        output = self.__predict(image)
+        probabilities = torch.nn.functional.softmax(output, dim=1)[0]
+        predictions: List[PredictedLabel] = []
+        for idx, prob in enumerate(probabilities):
+            predictions.append(PredictedLabel(
+                crop_name=CROP_NAMES[idx],
+                label=self.classes[idx],
+                confidence=float(prob)
+            ))
+        # Sắp xếp giảm dần theo xác suất
+        predictions.sort(key=lambda x: x.confidence, reverse=True)
+        return predictions
+    def __predict(self, image_input):
+        """
+        Dự đoán nhãn cho một ảnh.
+        Args:
+            image_input: Đường dẫn file ảnh (str) hoặc đối tượng PIL.Image
+            device: Thiết bị chạy mô hình ('cuda' hoặc 'cpu').
+        Returns:
+            str: Nhãn dự đoán (e.g., "cassava_leaf beetle").
+        """
+        try:
+            image = self.__handle_image(image_input)
+            image_tensor = self.transform(image)
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            raise ValueError(f"Không thể xử lý ảnh đầu vào: {str(e)}")
+        if image_tensor.dim() == 3:
+            image_tensor = image_tensor.unsqueeze(0)
+        print(image_tensor.shape)
+        image_tensor = image_tensor.to(self.device)
+        with torch.no_grad():
+            output = self.model(image_tensor)
+        return output ## an array of 17 values, no softmax
+    def __handle_image(self, image_input):
+        if isinstance(image_input, str):
+            image = Image.open(image_input).convert('RGB')
+        elif isinstance(image_input, Image.Image):
+            image = image_input
+        else:
+            raise ValueError("Invalid image input")
+        return image

app/models/gemini_caller.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import os
+import google.generativeai as genai
+from dotenv import load_dotenv
+load_dotenv()
+class GeminiGenerator:
+    """Thin wrapper around a google.generativeai GenerativeModel."""
+    def __init__(self, model_name="gemini-2.0-flash", temperature=0):
+        # The key is read from the GEMINI_API_KEY environment variable
+        self.key = os.environ.get("GEMINI_API_KEY")
+        genai.configure(api_key=self.key)
+        # Generation config
+        self.generation_config = {
+            "temperature": temperature
+        }
+        # Default system prompt
+        self.system_prompt = "Bạn là một trợ lý AI hữu ích cho các dự án IT về cây trồng và bệnh cây trồng. Bạn có khả năng trích xuất thông tin từ văn bản được cung cấp và trả dữ liệu bằng tiếng Việt theo yêu cầu."
+        # Initialise the model
+        self.model = genai.GenerativeModel(
+            model_name=model_name,
+            generation_config=self.generation_config,
+            system_instruction=self.system_prompt
+        )
+    def generate(self, prompt="Hello, world!", system_prompt=None):
+        """Generate content for `prompt` and return the response object."""
+        # Use the custom system prompt when one is supplied; this builds a
+        # separate model instance for the call.
+        if system_prompt:
+            model = genai.GenerativeModel(
+                model_name=self.model.model_name,
+                generation_config=self.generation_config,
+                system_instruction=system_prompt
+            )
+            response = model.generate_content(prompt)
+        else:
+            response = self.model.generate_content(prompt)
+        return response
+if __name__ == "__main__":
+    generator = GeminiGenerator()

app/models/knowledge_graph.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import os
+import sys
+from fastapi import Depends
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.core.config import Settings, get_settings
+from utils.data_mapping import DataMapping
+from utils.extract_entity import extract_entities
+from core.type import Node
+from neo4j import GraphDatabase
+from utils.constant import NEO4J_LABELS, NEO4J_RELATIONS
+NEO4J_URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
+NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
+NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
+NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")
+class Neo4jConnection:
+    """Neo4j driver wrapper configured from the module-level NEO4J_* values."""
+    def __init__(self):
+        """Open the driver and read the database info."""
+        self.uri = NEO4J_URI
+        self.user = NEO4J_USER
+        self.password = NEO4J_PASSWORD
+        self.database = NEO4J_DATABASE
+        self.driver = GraphDatabase.driver(
+            self.uri,
+            auth=(self.user, self.password),
+            database=self.database
+        )
+        self.entity_types = []
+        self.relations = []
+        # Runs one query at construction time and stores its single record
+        with self.driver.session() as session:
+            result = session.run("CALL db.info()")
+            self.database_info = result.single().data()
+        self.entity_types = NEO4J_LABELS
+        self.relations = NEO4J_RELATIONS
+    def get_database_info(self):
+        """Return the info of the connected database captured at construction."""
+        return self.database_info
+    def close(self):
+        """Close the Neo4j driver."""
+        if self.driver is not None:
+            self.driver.close()
+    def execute_query(self, query, parameters=None):
+        """Run an arbitrary Cypher query and return its records as a list."""
+        with self.driver.session() as session:
+            result = session.run(query, parameters)
+            return [record for record in result]
+class KnowledgeGraphUtils:
+    def get_disease_from_env_factors(self, crop_id: str, params: list[Node]):
+        envFactors = [param.id for param in params if param.label == "EnvironmentalFactor"]
+        query = f"""
+            MATCH (c:Crop {{id: "{crop_id}"}})
+            WITH c
+            MATCH (d:Disease)-[:AFFECTS]-(c)
+            OPTIONAL MATCH (ef:EnvironmentalFactor)-[:FAVORS]-(d)
+            WHERE ef.id IN {envFactors}
+            OPTIONAL MATCH (ef2:EnvironmentalFactor)-[:FAVORS]-(cause:Cause)-[:CAUSES|AFFECTS]-(d)
+            WHERE ef2.id IN {envFactors}
+            WITH d, COLLECT(DISTINCT ef.id) AS direct_env, COLLECT(DISTINCT ef2.id) AS indirect_env
+            WHERE SIZE(direct_env) > 0 OR SIZE(indirect_env) > 0
+            RETURN DISTINCT d, direct_env, indirect_env
+        """
+        kg = Neo4jConnection()
+        result = kg.execute_query(query)
+        print(result)
+        final_result = []
+        for record in result:
+            record_dict = dict(record)
+            disease = Node.map_json_to_node(dict(record_dict["d"]), "Disease")
+            env_ids = list(record_dict["direct_env"]) + list(record_dict["indirect_env"])
+            print(env_ids)
+            score = 0
+            for env_id in env_ids:
+                for param in params:
+                    if param.id == env_id:
+                        score = max(score, param.score)
+            disease.score = score
+            final_result.append({
+                "disease": disease,
+                "env_ids": env_ids
+            })
+        final_result.sort(key=lambda x: x["disease"].score, reverse=True)
+        return final_result
+    def get_disease_from_symptoms(self, crop_id: str, params: list[Node]) -> list:
+        symptoms = [param.id for param in params if param.label == "Symptom"]
+        query = f"""
+            MATCH (c:Crop {{id: "{crop_id}"}})
+            WITH c
+            MATCH (d:Disease)-[:AFFECTS]-(c)
+            OPTIONAL MATCH (sym1:Symptom)-[:HAS_SYMPTOM]-(d)
+            WHERE sym1.id IN {symptoms}
+            OPTIONAL MATCH (sym2:Symptom)-[:HAS_SYMPTOM|LOCATED_ON]-(p:PlantPart)-[:CONTAINS]-(d)
+            WHERE sym2.id IN {symptoms}
+            WITH d, p, c, sym1, sym2, COLLECT(DISTINCT sym1.id) AS direct_env, COLLECT(DISTINCT sym2.id) AS indirect_env
+            WHERE SIZE(direct_env) > 0 OR SIZE(indirect_env) > 0
+            RETURN d, c, p, sym1, sym2
+        """
+        kg = Neo4jConnection()
+        result = kg.execute_query(query)
+        final_result = []
+        for record in result:
+            record_dict = dict(record)
+            disease = Node.map_json_to_node(dict(record_dict["d"]), "Disease")
+            symptom_ids = list(record_dict["sym1"]) + list(record_dict["sym2"])
+            score = 0
+            for symptom_id in symptom_ids:
+                for param in params:
+                    if param.id == symptom_id:
+                        score = max(score, param.score)
+            disease.score = score
+            final_result.append({
+                "disease": disease,
+                "symptom_ids": symptom_ids
+            })
+        final_result.sort(key=lambda x: x["disease"].score, reverse=True)
+        return final_result

app/services/__init__.py ADDED Viewed

File without changes

app/services/predict.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from fastapi import Depends, UploadFile
+import torch
+from torchvision import transforms
+from PIL import Image
+from app.api.dto.kg_query import KGQueryRequest, QueryContext
+from app.core.dependencies import get_all_models
+from app.core.type import Node
+from app.models.crop_clip import CLIPModule
+from app.models.knowledge_graph import KnowledgeGraphUtils
+from app.utils.data_mapping import DataMapping
+from app.utils.extract_entity import extract_entities
+class PredictService:
+    def __init__(self, models):
+        self.models = models
+    def predict_image(self, image: UploadFile):
+        clip_model: CLIPModule = self.models["clip_model"]
+        image_content = image.file.read()
+        pil_image = Image.open(Image.io.BytesIO(image_content)).convert('RGB')
+        return clip_model.predict_image(pil_image)
+    def retrieve_kg(self, request: KGQueryRequest):
+        try:
+            kg: KnowledgeGraphUtils = self.models["knowledge_graph"]
+            if not request.context:
+                request.context = QueryContext()
+            if request.crop_id:
+                request.context.crop_id = request.crop_id
+            if request.additional_info:
+                request.context.nodes = self.__get_nodes_from_additional_info(request.additional_info, self.models["data_mapper"])
+            env_result = kg.get_disease_from_env_factors(request.context.crop_id, request.context.nodes)
+            symptom_result = kg.get_disease_from_symptoms(request.context.crop_id, request.context.nodes)
+            context = request.context
+            context.nodes.extend([env_result["disease"] for env_result in env_result])
+            context.nodes.extend([symptom_result["disease"] for symptom_result in symptom_result])
+            context.nodes.sort(key=lambda x: x.score, reverse=True)
+            return {
+                "context": context,
+                "env_result": env_result,
+                "symptom_result": symptom_result
+            }
+        except Exception as e:
+            print(e)
+            raise e
+    def __get_nodes_from_additional_info(self, additional_info: str, data_mapper: DataMapping):
+        entities = extract_entities(additional_info)
+        top_results: list[Node] = []
+        for entity in entities:
+            top_result = data_mapper.get_top_result_by_text(entity.name, 3)
+            print([result.name for result in top_result])
+            for result in top_result:
+                top_results.append(result)
+        return top_results
+def get_predict_service(models = Depends(get_all_models)):
+    """FastAPI dependency: build a PredictService around the injected models dict."""
+    return PredictService(models)

app/utils/constant.py ADDED Viewed

	@@ -0,0 +1,2 @@


+NEO4J_LABELS = ['Disease', 'Symptom', 'Treatment', 'Cause', 'Effect', 'Prevention', 'EnvironmentalFactor', 'Stage', 'Crop', 'CropType', 'PlantPart', 'SoilType', 'DiagnosisMethod']
+NEO4J_RELATIONS = ['CAUSES', 'HAS_SYMPTOM', 'PRODUCES', 'FAVORS', 'IS_TREATED_BY', 'PREVENTS', 'OCCURS_AT', 'BELONGS_TO', 'CONTAINS', 'LOCATED_ON', 'AFFECTS', 'IS_APPLIED_TO']

app/utils/data_mapping.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import os
+from sentence_transformers import SentenceTransformer
+import faiss
+from pyvi.ViTokenizer import tokenize
+import sqlite3
+from app.core.type import Node
+FAISS_INDEX_PATH = 'app/data/faiss_index.index'
+VECTOR_EMBEDDINGS_DB_PATH = 'app/data/vector_embeddings.db'
+class SingletonModel:
+    """Holds one shared SentenceTransformer so the model is loaded only once."""
+    _instance = None
+    def __new__(cls):
+        # The first call creates the instance and loads the model; later calls
+        # return that same instance.
+        if cls._instance is None:
+            cls._instance = super(SingletonModel, cls).__new__(cls)
+            cls._instance.model = SentenceTransformer('dangvantuan/vietnamese-embedding')
+        return cls._instance
+class DataMapping:
+    def __init__(self):
+        try:
+            self.model: SentenceTransformer = SingletonModel().model
+            self.index: faiss.IndexFlatL2 = self.__load_faiss_index()
+            self.conn = sqlite3.connect(VECTOR_EMBEDDINGS_DB_PATH, check_same_thread=False)
+            self.cursor = self.conn.cursor()
+        except Exception as e:
+            print(f"Error while initializing DataMapping: {e}")
+            raise
+    def __del__(self):
+        self.cursor.close()
+        self.conn.close()
+    def __load_faiss_index(self, index_file = FAISS_INDEX_PATH):
+        if os.path.exists(index_file):
+            index = faiss.read_index(index_file)
+            print(f"Đã nạp FAISS index từ {index_file}")
+            return index
+        return None
+    def get_top_index_by_text(self, text, top_k=1, distance_threshold=float(0.6)):
+        if not text or top_k < 1:
+            raise ValueError("Invalid input: text cannot be empty and top_k must be positive")
+        q_token = tokenize(text)
+        q_vec = self.model.encode([q_token])
+        faiss.normalize_L2(q_vec)
+        D, I = self.index.search(q_vec, top_k)
+        mask = D[0] >= distance_threshold
+        filtered_indices = I[0][mask].tolist()
+        distances = D[0][mask].tolist()
+        return filtered_indices, distances
+    def get_embedding_by_id(self, id):
+        self.cursor.execute("SELECT * FROM embeddings WHERE e_index = ?", (id,))
+        return self.cursor.fetchone()
+    def get_top_result_by_text(self, text, top_k = 1, type = None) -> list[Node]:
+        top_index, distances = self.get_top_index_by_text(text, top_k)
+        results = [self.get_embedding_by_id(int(index)) for index in top_index]
+        if type:
+            results = [result for result in results if result[3] == type]
+        return [Node.data_row_to_node(result, distance) for result, distance in zip(results, distances)]

app/utils/extract_entity.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import json
+from string import Template
+from fastapi import Depends
+from langchain_google_genai import ChatGoogleGenerativeAI
+import dotenv
+import os
+from app.models.gemini_caller import GeminiGenerator
+from app.core.type import Node
+from app.utils.prompt import EXTRACT_ENTITIES_PROMPT
+dotenv.load_dotenv()
+def extract_entities(text: str) -> list[Node]:
+    """Ask Gemini to extract entities from `text` and return them as Nodes.
+
+    Best effort: any failure is printed and an empty list is returned.
+    """
+    try:
+        # A new GeminiGenerator is created on every call
+        gemini = GeminiGenerator()
+        # Presumably the prompt template contains a `$ctext` placeholder - TODO confirm
+        prompt = Template(EXTRACT_ENTITIES_PROMPT).substitute(ctext=text)
+        entities = gemini.generate(prompt)
+        # The reply is parsed as JSON with a top-level "entities" list
+        entities = (json.loads(clean_text(entities.text)))["entities"]
+        return [Node.map_json_to_node(entity) for entity in entities]
+    except Exception as e:
+        print(f"Error while extract knowledge entities: {str(e)}")
+        return []
+def clean_text(text: str):
+  """Remove Markdown code-fence markers (```json and ```) from `text`."""
+  text = text.replace("```json", "").replace("```", "")
+  return text

app/utils/prompt.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Vietnamese prompt template for entity extraction. It lists the allowed
+# entity types with their attributes and asks for a JSON object holding a
+# single `entities` list. `$ctext` is a string.Template placeholder that is
+# replaced with the input text before the prompt is sent.
+EXTRACT_ENTITIES_PROMPT = """
+Từ mô tả bên dưới, hãy trích xuất các Thực thể được mô tả theo định dạng được chỉ định. Đảm bảo kết quả hoàn chỉnh, không thiếu thông tin.
+0. LUÔN LUÔN HOÀN TẤT KẾT QUẢ. Không gửi kết quả bị thiếu
+1. Trích xuất Thực thể (entities)
+- Mỗi thực thể phải có thuộc tính `id` là chuỗi chữ và số duy nhất, định dạng **camelCase** (ví dụ: `benhDaoOn`, `laVang`). Thuộc tính `id` được sử dụng để liên kết trong mối quan hệ.
+- Chỉ tạo thực thể thuộc các loại được liệt kê, không tạo loại mới.
+- Đảm bảo các thuộc tính (`name`, `description`, v.v.) khớp với nội dung văn bản.
+Các loại thực thể:
+- **Disease**: Tình trạng cây bị hại bởi vi sinh vật, nấm, hoặc yếu tố môi trường. Đảm bảo trong ngữ cảnh đầu vào chỉ đang nói đến một bệnh duy nhất.
+  - `id`: Tên bệnh ở dạng camelCase, bao gồm thông tin cây trồng nếu bệnh xuất hiện trên cây cụ thể (ví dụ: `benhDomNauSan` cho bệnh đốm nâu trên sắn, `benhDomNauCaChua` cho bệnh đốm nâu trên cà chua). Không có các giới từ như "trên".
+  - `name`: Tên bệnh trong văn bản, bao gồm thông tin cây trồng nếu có (ví dụ: "Bệnh đốm nâu trên sắn"). Nếu không có thông tin cây trồng, sử dụng tên bệnh chung (ví dụ: "Bệnh đốm nâu").
+  - `description`: Mô tả tình trạng bệnh, ưu tiên đề cập cây trồng nếu có (ví dụ: "Bệnh đốm nâu trên cây sắn do nấm gây ra"). Nếu không có thông tin, dùng "Không có mô tả cụ thể".
+- **Symptom**: Dấu hiệu bất thường trên cây (lá vàng, héo, đốm).
+  - `id`: Tên triệu chứng ở dạng camelCase (ví dụ: `laVang`).
+  - `name`: Tên triệu chứng trong văn bản (ví dụ: "Lá vàng").
+  - `description`: Mô tả triệu chứng.
+- **Treatment**: Biện pháp kiểm soát bệnh/sâu hại (thuốc, sinh học).
+  - `id`: Tên biện pháp ở dạng camelCase, gắn với hoạt chất hoặc loại thuốc cụ thể (ví dụ: `thuocDietNamThiophanate`).
+  - `name`: Tên biện pháp trong văn bản, phản ánh hoạt chất hoặc loại thuốc (ví dụ: "Thuốc Diệt Nấm chứa Thiophanate").
+  - `method`: Cách thực hiện biện pháp. (ví dụ: "Phun thuốc lên lá")
+  - `activeIngredient` (tùy chọn): Tên hoạt chất chính, bao gồm nồng độ nếu có (ví dụ: "Thiophanate 0.20%"). Nếu không xác định, để trống.
+- **Cause**: Tác nhân gây bệnh/sâu hại (nấm, virus, côn trùng).
+  - `id`: Tên tác nhân ở dạng camelCase, gắn liền với tên của tác nhân viết gọn (ví dụ: `namMHenningsii`).
+  - `name`: Tên tác nhân trong văn bản (ví dụ: "Nấm Mycosphaerella henningsi")
+  - `type`: Loại tác nhân (nấm, virus, côn trùng, vi khuẩn, ...).
+- **Effect**: Tác động của bệnh/sâu hại (giảm năng suất, cây chết).
+  - `id`: Tên tác động ở dạng camelCase, sử dụng dạng ngắn gọn và chung nhất (ví dụ: `giamNangSuat` cho mọi trường hợp liên quan đến giảm năng suất, thay vì `nangSuatGiamDangKe`).
+  - `name`: Tên tác động được chuẩn hóa, sử dụng dạng chung nhất từ văn bản (ví dụ: "Giảm năng suất" thay vì "Giảm năng suất đáng kể"). Loại bỏ các từ ngữ bổ nghĩa như "đáng kể", "nghiêm trọng".
+  - `impact`: Mô tả ngắn gọn mức độ ảnh hưởng, ưu tiên sử dụng cụm từ chung (ví dụ: "Ảnh hưởng đến sản lượng" thay vì sao chép toàn bộ mô tả chi tiết từ văn bản).
+- **Prevention**: Biện pháp ngăn ngừa bệnh/sâu hại (luân canh, giống kháng).
+  - `id`: Tên biện pháp ở dạng camelCase (ví dụ: `luanCanh`).
+  - `name`: Tên biện pháp trong văn bản.
+  - `method`: Cách thực hiện biện pháp.
+- **EnvironmentalFactor**: Yếu tố tự nhiên ảnh hưởng cây (nhiệt độ, độ ẩm, ...).
+  - `id`: Tên yếu tố ở dạng camelCase (ví dụ: `doAmCao`).
+  - `name`: Tên yếu tố trong văn bản.
+  - `description`: Mô tả yếu tố.
+- **Stage**: Giai đoạn phát triển của cây.
+  - `id`: Tên giai đoạn ở dạng camelCase (ví dụ: `giaiDoanRaHoa`).
+  - `start`: Thời gian bắt đầu (tháng, kiểu float).
+  - `end`: Thời gian kết thúc (tháng, kiểu float).
+- **Crop**: Cây trồng, không được tạo ngoài danh sách: "Lúa", "Sắn", "Cà chua", "Ngô"
+  - `id`: Tên cây ở dạng camelCase, chỉ nằm trong danh sách: [lua,san,caChua,ngo].
+  - `name`: Tên cây trong văn bản (ví dụ: "Sắn").
+- **CropType**: Phân loại cây (lương thực, ăn quả, công nghiệp).
+  - `id`: Tên loại cây ở dạng camelCase (ví dụ: `luongThuc`).
+  - `name`: Tên loại cây trong văn bản.
+- **PlantPart**: Phần cây bị ảnh hưởng (lá, thân, rễ, quả).
+  - `id`: Tên phần cây ở dạng camelCase (ví dụ: `la`).
+  - `name`: Tên phần cây trong văn bản.
+- **SoilType**: Loại đất trồng cây.
+  - `id`: Tên loại đất ở dạng camelCase (ví dụ: `datPhuSa`).
+  - `name`: Tên loại đất trong văn bản.
+- **DiagnosisMethod**: Kỹ thuật xác định bệnh/sâu hại (quan sát, xét nghiệm).
+  - `id`: Tên kỹ thuật ở dạng camelCase (ví dụ: `quanSat`).
+  - `name`: Tên kỹ thuật trong văn bản.
+  - `technique`: Cách thực hiện kỹ thuật.
+2. Trả về kết quả dưới dạng JSON:
+- Trả về JSON với một trường duy nhất là `entities`
+  - `entities`: Danh sách các thực thể, mỗi thực thể là một object với các thuộc tính theo loại.
+Ví dụ:
+```json
+{
+    "entities": [{"label":"Disease","id":string,"name":string,"description":string}]
+}
+```
+Ngữ cảnh:
+$ctext
+"""

environment.yml ADDED Viewed

	@@ -0,0 +1,167 @@

+name: graduated2
+channels:
+  - defaults
+  - https://repo.anaconda.com/pkgs/main
+  - https://repo.anaconda.com/pkgs/r
+dependencies:
+  - ca-certificates=2025.2.25=hecd8cb5_0
+  - libcxx=14.0.6=h9765a3e_0
+  - libffi=3.4.4=hecd8cb5_1
+  - ncurses=6.4=hcec6c5f_0
+  - openssl=3.0.16=h184c1cd_0
+  - pip=25.1=pyhc872135_2
+  - python=3.9.21=hce00570_1
+  - readline=8.2=hca72f7f_0
+  - setuptools=78.1.1=py39hecd8cb5_0
+  - sqlite=3.45.3=h6c40b1e_0
+  - tk=8.6.14=h4d00af3_0
+  - tzdata=2025b=h04d1e81_0
+  - wheel=0.45.1=py39hecd8cb5_0
+  - xz=5.6.4=h46256e1_1
+  - zlib=1.2.13=h4b97444_1
+  - pip:
+      - aiohappyeyeballs==2.6.1
+      - aiohttp==3.11.18
+      - aiosignal==1.3.2
+      - annotated-types==0.7.0
+      - anyio==4.9.0
+      - asgiref==3.8.1
+      - async-timeout==4.0.3
+      - attrs==25.3.0
+      - backoff==2.2.1
+      - bcrypt==4.3.0
+      - build==1.2.2.post1
+      - cachetools==5.5.2
+      - certifi==2025.4.26
+      - charset-normalizer==3.4.2
+      - chromadb==1.0.8
+      - click==8.1.8
+      - coloredlogs==15.0.1
+      - dataclasses-json==0.6.7
+      - deprecated==1.2.18
+      - distro==1.9.0
+      - durationpy==0.9
+      - exceptiongroup==1.3.0
+      - fastapi==0.115.9
+      - filelock==3.18.0
+      - filetype==1.2.0
+      - flatbuffers==25.2.10
+      - frozenlist==1.6.0
+      - fsspec==2024.12.0
+      - google-ai-generativelanguage==0.6.18
+      - google-api-core==2.24.2
+      - google-auth==2.40.1
+      - googleapis-common-protos==1.70.0
+      - greenlet==3.2.2
+      - grpcio==1.72.0rc1
+      - grpcio-status==1.72.0rc1
+      - h11==0.16.0
+      - hf-xet==1.1.0
+      - httpcore==1.0.9
+      - httptools==0.6.4
+      - httpx==0.28.1
+      - httpx-sse==0.4.0
+      - huggingface-hub==0.31.1
+      - humanfriendly==10.0
+      - idna==3.10
+      - importlib-metadata==8.6.1
+      - importlib-resources==6.5.2
+      - jinja2==3.1.6
+      - jiter==0.10.0
+      - json-repair==0.39.1
+      - jsonpatch==1.33
+      - jsonpointer==3.0.0
+      - jsonschema==4.23.0
+      - jsonschema-specifications==2025.4.1
+      - kubernetes==32.0.1
+      - langchain==0.3.25
+      - langchain-community==0.3.24
+      - langchain-core==0.3.60
+      - langchain-google-genai==2.1.4
+      - langchain-neo4j==0.4.0
+      - langchain-text-splitters==0.3.8
+      - langsmith==0.3.42
+      - markdown-it-py==3.0.0
+      - markupsafe==3.0.2
+      - marshmallow==3.26.1
+      - mdurl==0.1.2
+      - mmh3==5.1.0
+      - mpmath==1.3.0
+      - multidict==6.4.4
+      - mypy-extensions==1.1.0
+      - neo4j==5.28.1
+      - neo4j-graphrag==1.7.0
+      - networkx==3.2.1
+      - numpy==2.0.2
+      - oauthlib==3.2.2
+      - onnxruntime==1.19.2
+      - openai==1.79.0
+      - opentelemetry-api==1.33.0
+      - opentelemetry-exporter-otlp-proto-common==1.33.0
+      - opentelemetry-exporter-otlp-proto-grpc==1.33.0
+      - opentelemetry-instrumentation==0.54b0
+      - opentelemetry-instrumentation-asgi==0.54b0
+      - opentelemetry-instrumentation-fastapi==0.54b0
+      - opentelemetry-proto==1.33.0
+      - opentelemetry-sdk==1.33.0
+      - opentelemetry-semantic-conventions==0.54b0
+      - opentelemetry-util-http==0.54b0
+      - orjson==3.10.18
+      - overrides==7.7.0
+      - packaging==24.2
+      - pillow==11.2.1
+      - posthog==4.0.1
+      - propcache==0.3.1
+      - proto-plus==1.26.1
+      - protobuf==6.31.0
+      - pyasn1==0.6.1
+      - pyasn1-modules==0.4.2
+      - pydantic==2.11.4
+      - pydantic-core==2.33.2
+      - pydantic-settings==2.9.1
+      - pygments==2.19.1
+      - pypdf==5.5.0
+      - pypika==0.48.9
+      - pyproject-hooks==1.2.0
+      - python-dateutil==2.9.0.post0
+      - python-dotenv==1.1.0
+      - pytz==2025.2
+      - pyyaml==6.0.2
+      - referencing==0.36.2
+      - regex==2024.11.6
+      - requests==2.32.3
+      - requests-oauthlib==2.0.0
+      - requests-toolbelt==1.0.0
+      - rich==14.0.0
+      - rpds-py==0.24.0
+      - rsa==4.9.1
+      - safetensors==0.5.3
+      - shellingham==1.5.4
+      - six==1.17.0
+      - sniffio==1.3.1
+      - sqlalchemy==2.0.41
+      - starlette==0.45.3
+      - sympy==1.14.0
+      - tenacity==9.1.2
+      - tokenizers==0.21.1
+      - tomli==2.2.1
+      - torch==2.2.2
+      - torchvision==0.17.2
+      - tqdm==4.67.1
+      - transformers==4.51.3
+      - typer==0.15.3
+      - types-pyyaml==6.0.12.20250516
+      - typing-extensions==4.13.2
+      - typing-inspect==0.9.0
+      - typing-inspection==0.4.0
+      - urllib3==2.4.0
+      - uvicorn==0.34.2
+      - uvloop==0.21.0
+      - watchfiles==1.0.5
+      - websocket-client==1.8.0
+      - websockets==15.0.1
+      - wrapt==1.17.2
+      - yarl==1.20.0
+      - zipp==3.21.0
+      - zstandard==0.23.0
+prefix: /Users/artteiv/miniconda3/envs/graduated2

prepare_script/image_caption_embeddings.py ADDED Viewed

	@@ -0,0 +1,207 @@

+import json
+import sqlite3
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import torch
+from PIL import Image
+import clip
+import faiss
+import numpy as np
+import glob
+# Storage paths (relative to the current working directory)
+VECTOR_EMBEDDINGS_DB_PATH = 'app/data/vector_embeddings.db'
+IMAGE_FAISS_INDEX_PATH = 'app/data/image_faiss_index.index'
+TEXT_FAISS_INDEX_PATH = 'app/data/text_faiss_index.index'
+# Data paths
+# NOTE(review): DATA_ROOT is an absolute path on one developer's machine;
+# adjust it before running this script elsewhere.
+DATA_ROOT = '/Users/artteiv/Desktop/Graduated/chore-graduated/Data'
+MAIN_DATA_PATH = os.path.join(DATA_ROOT, 'main_data')
+CAPTIONS_PATH = os.path.join(DATA_ROOT, 'captions')
+# Connect to SQLite (runs at import time; the connection is shared by the
+# helper functions below)
+conn = sqlite3.connect(VECTOR_EMBEDDINGS_DB_PATH)
+cursor = conn.cursor()
+# Create the metadata tables for image and text embeddings. The tables hold
+# only descriptive columns keyed by e_index, not the vectors themselves.
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS image_embeddings (
+        e_index INTEGER PRIMARY KEY,
+        image_path TEXT NOT NULL,
+        caption TEXT NOT NULL,
+        category TEXT NOT NULL,
+        subcategory TEXT NOT NULL
+    )
+''')
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS text_embeddings (
+        e_index INTEGER PRIMARY KEY,
+        text TEXT NOT NULL,
+        category TEXT NOT NULL,
+        subcategory TEXT NOT NULL
+    )
+''')
+def insert_image_embedding(e_index, image_path, caption, category, subcategory):
+    """Insert one image metadata row into ``image_embeddings`` and commit."""
+    row = (e_index, image_path, caption, category, subcategory)
+    cursor.execute('''
+        INSERT INTO image_embeddings (e_index, image_path, caption, category, subcategory)
+        VALUES (?, ?, ?, ?, ?)
+    ''', row)
+    conn.commit()
+    print(f"Đã thêm embedding ảnh: {image_path}")
+def insert_text_embedding(e_index, text, category, subcategory):
+    """Insert one text metadata row into ``text_embeddings`` and commit."""
+    row = (e_index, text, category, subcategory)
+    cursor.execute('''
+        INSERT INTO text_embeddings (e_index, text, category, subcategory)
+        VALUES (?, ?, ?, ?)
+    ''', row)
+    conn.commit()
+    # Only the first 50 characters of the text are echoed.
+    print(f"Đã thêm embedding văn bản: {text[:50]}...")
+def save_faiss_index(index, index_file):
+    """Write ``index`` to ``index_file`` via ``faiss.write_index`` and report it."""
+    faiss.write_index(index, index_file)
+    message = f"Đã lưu FAISS index vào {index_file}"
+    print(message)
+def load_faiss_index(index_file):
+    """Read a FAISS index from ``index_file``; return ``None`` if the file is absent."""
+    if not os.path.exists(index_file):
+        return None
+    loaded = faiss.read_index(index_file)
+    print(f"Đã nạp FAISS index từ {index_file}")
+    return loaded
+def compute_embeddings():
+    """Build the CLIP text-embedding FAISS index from captioned images.
+    Walks ``MAIN_DATA_PATH/<category>/<subcategory>/`` for files, keeps those
+    that have a ``<basename>.txt`` caption under the matching directory of
+    ``CAPTIONS_PATH``, encodes the captions with CLIP ``ViT-B/32``,
+    L2-normalises them, stores them in an inner-product FAISS index saved to
+    ``TEXT_FAISS_INDEX_PATH`` and records one ``text_embeddings`` row per
+    caption.
+    Returns:
+        ``(image_index, text_index)``. ``image_index`` is always ``None``
+        while the image step below stays disabled; ``text_index`` is ``None``
+        when no caption was found.
+    """
+    print("Loading CLIP model...")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model, preprocess = clip.load("ViT-B/32", device=device)
+    print("Model loaded")
+    # Default both results to None so the final return never touches an
+    # unassigned name (the image step is currently commented out).
+    image_index = None
+    text_index = None
+    # List the category sub-directories
+    categories = [d for d in os.listdir(MAIN_DATA_PATH) if os.path.isdir(os.path.join(MAIN_DATA_PATH, d))]
+    image_paths = []
+    captions = []
+    texts = []
+    categories_list = []
+    subcategories_list = []
+    # Collect the data
+    print("Processing data from directories...")
+    for category in categories:
+        category_path = os.path.join(MAIN_DATA_PATH, category)
+        subcategories = [d for d in os.listdir(category_path) if os.path.isdir(os.path.join(category_path, d))]
+        for subcategory in subcategories:
+            # Image and caption directories of this subcategory
+            subcategory_image_path = os.path.join(category_path, subcategory)
+            subcategory_caption_path = os.path.join(CAPTIONS_PATH, category, subcategory)
+            image_files = glob.glob(os.path.join(subcategory_image_path, '*.*'))
+            for img_path in image_files:
+                # The caption file shares the image's base name
+                base_name = os.path.splitext(os.path.basename(img_path))[0]
+                caption_file = os.path.join(subcategory_caption_path, f"{base_name}.txt")
+                if not os.path.exists(caption_file):
+                    continue
+                try:
+                    with open(caption_file, 'r', encoding='utf-8') as f:
+                        caption = f.read().strip()
+                except (OSError, UnicodeDecodeError) as e:
+                    print(f"Error processing {img_path}: {e}")
+                    continue
+                image_paths.append(img_path)
+                captions.append(caption)
+                texts.append(caption)  # the caption doubles as the text entry
+                categories_list.append(category)
+                subcategories_list.append(subcategory)
+    # NOTE: the image-embedding step is disabled; kept for reference.
+    # if image_paths:
+    #     print("Computing image embeddings...")
+    #     image_embeddings = []
+    #     for idx, img_path in enumerate(image_paths):
+    #         try:
+    #             image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
+    #             with torch.no_grad():
+    #                 image_features = model.encode_image(image)
+    #                 image_features = image_features.cpu().numpy()
+    #                 faiss.normalize_L2(image_features)
+    #                 image_embeddings.append(image_features[0])
+    #                 insert_image_embedding(idx, img_path, captions[idx], categories_list[idx], subcategories_list[idx])
+    #         except Exception as e:
+    #             print(f"Error processing image {img_path}: {e}")
+    #             continue
+    #     if image_embeddings:
+    #         image_embeddings = np.array(image_embeddings)
+    #         d = image_embeddings.shape[1]
+    #         image_index = faiss.IndexFlatIP(d)
+    #         image_index.add(image_embeddings)
+    #         save_faiss_index(image_index, IMAGE_FAISS_INDEX_PATH)
+    # Compute the text embeddings
+    if texts:
+        print("Computing text embeddings...")
+        text_tokens = clip.tokenize(texts, truncate=True).to(device)
+        print("Kích thước của text_tokens:", text_tokens.shape)
+        with torch.no_grad():
+            text_features = model.encode_text(text_tokens)
+        # FAISS expects float32 input; cast explicitly in case the model
+        # emits half-precision features.
+        text_features = text_features.cpu().numpy().astype(np.float32)
+        faiss.normalize_L2(text_features)
+        text_index = faiss.IndexFlatIP(text_features.shape[1])
+        text_index.add(text_features)
+        # Store the text metadata in SQLite, keyed by position in the index
+        for idx, (text, category, subcategory) in enumerate(zip(texts, categories_list, subcategories_list)):
+            insert_text_embedding(idx, text, category, subcategory)
+        save_faiss_index(text_index, TEXT_FAISS_INDEX_PATH)
+    print("Processing completed")
+    return image_index, text_index
+def predict_image(image_path):
+    """Search the saved image FAISS index for the 10 nearest neighbours of an image.
+    Args:
+        image_path: Path of the image to encode with CLIP ``ViT-B/32``.
+    Returns:
+        ``(distances, indices)`` as returned by ``index.search`` with ``k=10``.
+    Raises:
+        FileNotFoundError: If no index file exists at ``IMAGE_FAISS_INDEX_PATH``.
+    """
+    # Check the index first so a missing file fails with a clear error before
+    # the model is loaded.
+    index = load_faiss_index(IMAGE_FAISS_INDEX_PATH)
+    if index is None:
+        raise FileNotFoundError(f"FAISS index not found: {IMAGE_FAISS_INDEX_PATH}")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model, preprocess = clip.load("ViT-B/32", device=device)
+    # Close the image file as soon as it has been turned into a tensor.
+    with Image.open(image_path) as img:
+        image = preprocess(img).unsqueeze(0).to(device)
+    with torch.no_grad():
+        image_features = model.encode_image(image)
+    # FAISS expects float32 input; cast explicitly in case the model emits
+    # half-precision features.
+    image_features = image_features.cpu().numpy().astype(np.float32)
+    faiss.normalize_L2(image_features)
+    distances, indices = index.search(image_features, k=10)
+    return distances, indices
+if __name__ == '__main__':
+    # Build the embeddings/indexes and report their sizes; the module-level
+    # SQLite connection is closed even if the computation fails.
+    try:
+        image_index, text_index = compute_embeddings()
+        if image_index:
+            print(f"Image index ready with {image_index.ntotal} embeddings")
+        if text_index:
+            print(f"Text index ready with {text_index.ntotal} embeddings")
+    finally:
+        conn.close()
+        print("SQLite connection closed")

prepare_script/sync_neo4j_node.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import json
+import sqlite3
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.models.knowledge_graph import Neo4jConnection
+from sentence_transformers import SentenceTransformer
+from pyvi.ViTokenizer import tokenize
+import faiss
+import numpy as np
+# English summary of the note below: this script fetches the entities from a
+# remote Neo4j, stores them as rows in SQLite, and creates embeddings based on
+# each row.
+"""
+Script này thực hiện lấy các entity từ neo4j từ xa về và tạo ra các data lưu trong sqlite, đồng thời tạo các embeddings
+dựa trên từng row.
+"""
+# Connect to SQLite (runs at import time; the connection is shared by the
+# helper functions below)
+VECTOR_EMBEDDINGS_DB_PATH = 'app/data/vector_embeddings.db'
+FAISS_INDEX_PATH = 'app/data/faiss_index.index'
+conn = sqlite3.connect(VECTOR_EMBEDDINGS_DB_PATH)
+cursor = conn.cursor()
+# Create the embeddings table if it does not exist yet
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS embeddings (
+        e_index INTEGER PRIMARY KEY,
+        id TEXT NOT NULL,
+        name TEXT NOT NULL,
+        label TEXT NOT NULL,
+        properties TEXT NOT NULL
+    )
+''')
+def insert_embedding(e_index, id, name, label, properties):
+    """Insert one node row into ``embeddings`` (properties stored as JSON) and commit."""
+    serialized_properties = json.dumps(properties)
+    row = (e_index, id, name, label, serialized_properties)
+    cursor.execute('''
+        INSERT INTO embeddings (e_index, id, name, label, properties)
+        VALUES (?, ?, ?, ?, ?)
+    ''', row)
+    conn.commit()
+    print(f"Đã thêm embedding: {name}")
+def update_embedding(embedding_id, id, name, label, properties):
+    """Overwrite the ``embeddings`` row whose ``e_index`` is ``embedding_id`` and commit."""
+    serialized_properties = json.dumps(properties)
+    params = (id, name, label, serialized_properties, embedding_id)
+    cursor.execute('''
+        UPDATE embeddings
+        SET id = ?, name = ?, label = ?, properties = ?
+        WHERE e_index = ?
+    ''', params)
+    conn.commit()
+    print(f"Đã cập nhật embedding ID: {embedding_id}")
+def get_all_embeddings():
+    """Return every row of the ``embeddings`` table."""
+    query = 'SELECT * FROM embeddings'
+    cursor.execute(query)
+    rows = cursor.fetchall()
+    return rows
+def get_embedding_by_id(embedding_id):
+    """Return the ``embeddings`` row whose ``e_index`` is ``embedding_id`` (``fetchone`` result)."""
+    query = 'SELECT * FROM embeddings WHERE e_index = ?'
+    params = (embedding_id,)
+    cursor.execute(query, params)
+    return cursor.fetchone()
+def save_faiss_index(index, index_file=FAISS_INDEX_PATH):
+    """Write ``index`` to ``index_file`` via ``faiss.write_index`` and report it."""
+    faiss.write_index(index, index_file)
+    message = f"Đã lưu FAISS index vào {index_file}"
+    print(message)
+def load_faiss_index(index_file=FAISS_INDEX_PATH):
+    """Read a FAISS index from ``index_file``; return ``None`` if the file is absent."""
+    if not os.path.exists(index_file):
+        return None
+    loaded = faiss.read_index(index_file)
+    print(f"Đã nạp FAISS index từ {index_file}")
+    return loaded
+def compute_and_save_embeddings(index_file=FAISS_INDEX_PATH):
+    """Rebuild the SQLite metadata rows and the FAISS index from every Neo4j node.
+    Each node returned by ``MATCH (n) RETURN n`` is stored in ``embeddings``
+    under its enumeration position (``e_index``). Its ``name`` (or ``id`` when
+    it has no name) is tokenised, encoded, L2-normalised and added to an
+    inner-product FAISS index written to ``index_file``.
+    Args:
+        index_file: Destination path of the FAISS index.
+    Returns:
+        ``(index, corpus, embeddings)``: the FAISS index, the list of encoded
+        names and the normalised embedding matrix.
+    Raises:
+        ValueError: If the Neo4j query returned no nodes.
+    """
+    print("Loading model...")
+    model = SentenceTransformer('dangvantuan/vietnamese-embedding')
+    print("Model loaded")
+    # Fetch the data from Neo4j
+    neo4j = Neo4jConnection()
+    result = neo4j.execute_query("MATCH (n) RETURN n")
+    corpus = []
+    # Build the corpus and store the metadata in SQLite
+    print("Processing Neo4j data and saving to SQLite...")
+    for row_index, record in enumerate(result):
+        print(record)
+        node = record["n"]
+        label = list(node.labels)[0]
+        print(label)
+        properties = dict(node)
+        node_id = properties.pop('id')
+        # Fall back to the id when the node has no name
+        name = properties.pop('name', node_id)
+        corpus.append(name)
+        # Update the row if this position already exists, insert otherwise
+        cursor.execute('SELECT e_index FROM embeddings WHERE e_index = ?', (row_index,))
+        if cursor.fetchone():
+            update_embedding(row_index, node_id, name, label, properties)
+        else:
+            insert_embedding(row_index, node_id, name, label, properties)
+    if not corpus:
+        raise ValueError("No nodes returned from Neo4j; nothing to embed")
+    # Drop rows left over from an earlier, larger run: the new index only
+    # holds positions 0..len(corpus)-1, so higher rows would be orphaned.
+    cursor.execute('DELETE FROM embeddings WHERE e_index >= ?', (len(corpus),))
+    conn.commit()
+    # Compute the embeddings
+    print("Tokenizing and encoding...")
+    tokenized = [tokenize(s) for s in corpus]
+    embeddings = model.encode(tokenized, show_progress_bar=False)
+    print("Encoding done")
+    # Normalise so the inner product equals cosine similarity
+    print("Normalizing...")
+    faiss.normalize_L2(embeddings)
+    print("Normalized")
+    # Create and save the FAISS index
+    faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
+    faiss_index.add(embeddings)
+    save_faiss_index(faiss_index, index_file)
+    print("Processing completed")
+    return faiss_index, corpus, embeddings
+def load_or_compute_embeddings(index_file=FAISS_INDEX_PATH):
+    """Return ``(index, corpus)``, reusing saved data when both parts exist.
+    The index comes from ``index_file`` and the corpus from the ``name``
+    column of the SQLite rows; if either is missing, both are recomputed.
+    """
+    index = load_faiss_index(index_file)
+    # Column 2 of each row is the name
+    corpus = [row[2] for row in get_all_embeddings()]
+    if index is not None and corpus:
+        print("Loaded existing index and corpus")
+        return index, corpus
+    print("No saved index or corpus found, computing new ones...")
+    index, corpus, _ = compute_and_save_embeddings(index_file)
+    return index, corpus
+def get_qvec_by_text(model, text):
+    """Tokenise ``text``, encode it with ``model`` and return the L2-normalised vector."""
+    query_vector = model.encode([tokenize(text)])
+    # normalize_L2 works in place on the encoded array
+    faiss.normalize_L2(query_vector)
+    return query_vector
+if __name__ == "__main__":
+    # Load (or build) the index, then answer queries typed by the user until
+    # Ctrl+C; the SQLite connection is closed on the way out in every case.
+    try:
+        index, corpus = load_or_compute_embeddings()
+        print(f"Index ready with {index.ntotal} embeddings, corpus size: {len(corpus)}")
+        model = SentenceTransformer('dangvantuan/vietnamese-embedding')
+        while True:
+            try:
+                query = input("Nhập câu truy vấn (nhấn Ctrl+C để thoát): ")
+                q_vec = get_qvec_by_text(model, query)
+                k = 1  # number of results to retrieve
+                D, I = index.search(q_vec, k)
+                print("Câu truy vấn:", query)
+                print(I[0][0])
+                print(type(I[0][0]))
+                # I[0][0] is cast to int before it is used as the SQLite lookup key
+                print("Câu gần nhất:", get_embedding_by_id(int(I[0][0])), "(khoảng cách:", D[0][0], ")")
+                print("-" * 50)
+            except KeyboardInterrupt:
+                print("\nĐã dừng chương trình!")
+                break
+    finally:
+        conn.close()
+        print("SQLite connection closed")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+langchain>=0.3.23
+neo4j>=5.28.1
+python-dotenv>=1.0.1
+fastapi>=0.115.12
+uvicorn>=0.34.2
+pydantic>=2.10.6
+faiss-cpu>=1.11.0