# Install dependencies
# !pip install openai langchain pandas

import os
import io
import json
import mimetypes
import datetime
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import dotenv
# Load environment variables from .env file if present
dotenv.load_dotenv()

# Set your OpenAI API Key (better: use Colab "Secrets" to avoid hardcoding)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

custom_metadata = {"company":"accusaga"}

def create_file_metadata(file_path: str):
    """
    Create metadata JSON for any file type using LangChain + OpenAI.
    Includes df.info() and df.describe() if tabular.
    """

    # --- Basic File Info ---
    file_name = os.path.basename(file_path)
    file_type, _ = mimetypes.guess_type(file_path)
    file_size = os.path.getsize(file_path)
    created_date = datetime.datetime.fromtimestamp(os.path.getctime(file_path)).strftime("%Y-%m-%d")
    modified_date = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d")

    # --- Extract Structured Info if CSV/Excel ---
    all_columns_list = []
    data_types = {}
    sheet_info = []
    df_info_str = None
    df_describe_dict = None

    try:
        if file_path.endswith(".csv"):
            df = pd.read_csv(file_path)
            buf = io.StringIO()
            df.info(buf=buf)
            df_info_str = buf.getvalue()
            df_describe_dict = df.describe(include="all").to_dict()

            sheet_info = [{
                "sheet_name": "CSV",
                "num_rows": len(df),
                "num_columns": len(df.columns),
                "columns": df.columns.tolist(),
                "sample_data": df.head(3).to_dict(orient="records")
            }]
            all_columns_list.extend(df.columns.tolist())
            data_types.update({col: str(df[col].dtype) for col in df.columns})

        elif file_path.endswith((".xls", ".xlsx")):
            xls = pd.ExcelFile(file_path)
            for sheet in xls.sheet_names:
                df = pd.read_excel(file_path, sheet_name=sheet)
                buf = io.StringIO()
                df.info(buf=buf)
                df_info_str = buf.getvalue()
                df_describe_dict = df.describe(include="all").to_dict()

                sheet_info.append({
                    "sheet_name": sheet,
                    "num_rows": len(df),
                    "num_columns": len(df.columns),
                    "columns": df.columns.tolist(),
                    "sample_data": df.head(3).to_dict(orient="records")
                })
                all_columns_list.extend(df.columns.tolist())
                data_types.update({col: str(df[col].dtype) for col in df.columns})
    except Exception as e:
        sheet_info = []

    # --- Text preview for LLM ---
    file_preview = ""
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            file_preview = f.read(1000)  # first 1000 chars
    except:
        file_preview = "Binary file preview not available."

    # --- OpenAI LLM for summary + tags ---
    llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=os.environ["OPENAI_API_KEY"])

    # Ensure that sheet_info is not empty before passing it to the LLM
    sheets = sheet_info if sheet_info else "No sheets available"

    prompt = PromptTemplate(
    input_variables=["file_name", "file_type", "columns", "sheets", "preview"],
    template="""
You are a metadata extractor for data files.
Given this file:
- File Name: {file_name}
- File Type: {file_type}
- Sheets: {sheets}
- Columns: {columns}
- Preview Content: {preview}

Generate strictly valid JSON with keys:
- summary_text: A concise 2-3 sentence description of the file.
- tags: 8-12 recommended metadata search keywords. These tags should be:
  * Derived from the file name, sheet names, column names, and content
  * Consider the inferred data domain (healthcare, finance, etc.)
  * Focus on analysis use cases (e.g., "trends", "forecasting", "customer behavior")
"""
)

    ai_response = llm.predict(prompt.format(file_name=file_name, file_type=file_type, columns=all_columns_list, sheets=sheets, preview=file_preview))

    # Try parsing the AI response to valid JSON
    try:
        ai_json = json.loads(ai_response)
    except:
        # In case the AI response is not valid JSON, fallback to simple text
        ai_json = {"summary_text": ai_response.strip(), "tags": ["data", "file", "metadata"]}
    

     
    
   

    # --- Build Final Metadata JSON ---
    metadata = {
        "file_name": file_name,
        "file_path": file_path,
        "file_type": file_type if file_type else "unknown",
        "file_size_bytes": file_size,
        "num_sheets": len(sheet_info) if sheet_info else None,
        "sheets": sheet_info,
        "all_columns_list": list(set(all_columns_list)),
        "data_types": data_types,
        "df_info": df_info_str,
        "df_describe": df_describe_dict,
        "tags": ai_json.get("tags", []),
        "summary_text": ai_json.get("summary_text", ""),
        "created_date": created_date,
        "modified_date": modified_date,
        "creator": "system_user",
        "custom_user_given_metadata": custom_metadata
        
        
    }

    #add custom metadata to the metadata
   
    return metadata 


# ===============================
# 🔹 Run Example
# ===============================
if __name__ == "__main__":
    test_file = r"C:\\Users\\Dell\\Documents\\MR-AI\\files\\healthcare-data-30.csv"  # replace with your uploaded file in Colab
    meta = create_file_metadata(test_file)
    print(json.dumps(meta, indent=2))