# Install dependencies # !pip install openai langchain pandas import os import io import json import mimetypes import datetime import pandas as pd from langchain.chat_models import ChatOpenAI from langchain.prompts import PromptTemplate import dotenv # Load environment variables from .env file if present dotenv.load_dotenv() # Set your OpenAI API Key (better: use Colab "Secrets" to avoid hardcoding) os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") custom_metadata = {"company":"accusaga"} def create_file_metadata(file_path: str): """ Create metadata JSON for any file type using LangChain + OpenAI. Includes df.info() and df.describe() if tabular. """ # --- Basic File Info --- file_name = os.path.basename(file_path) file_type, _ = mimetypes.guess_type(file_path) file_size = os.path.getsize(file_path) created_date = datetime.datetime.fromtimestamp(os.path.getctime(file_path)).strftime("%Y-%m-%d") modified_date = datetime.datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d") # --- Extract Structured Info if CSV/Excel --- all_columns_list = [] data_types = {} sheet_info = [] df_info_str = None df_describe_dict = None try: if file_path.endswith(".csv"): df = pd.read_csv(file_path) buf = io.StringIO() df.info(buf=buf) df_info_str = buf.getvalue() df_describe_dict = df.describe(include="all").to_dict() sheet_info = [{ "sheet_name": "CSV", "num_rows": len(df), "num_columns": len(df.columns), "columns": df.columns.tolist(), "sample_data": df.head(3).to_dict(orient="records") }] all_columns_list.extend(df.columns.tolist()) data_types.update({col: str(df[col].dtype) for col in df.columns}) elif file_path.endswith((".xls", ".xlsx")): xls = pd.ExcelFile(file_path) for sheet in xls.sheet_names: df = pd.read_excel(file_path, sheet_name=sheet) buf = io.StringIO() df.info(buf=buf) df_info_str = buf.getvalue() df_describe_dict = df.describe(include="all").to_dict() sheet_info.append({ "sheet_name": sheet, "num_rows": len(df), "num_columns": len(df.columns), "columns": df.columns.tolist(), "sample_data": df.head(3).to_dict(orient="records") }) all_columns_list.extend(df.columns.tolist()) data_types.update({col: str(df[col].dtype) for col in df.columns}) except Exception as e: sheet_info = [] # --- Text preview for LLM --- file_preview = "" try: with open(file_path, "r", encoding="utf-8", errors="ignore") as f: file_preview = f.read(1000) # first 1000 chars except: file_preview = "Binary file preview not available." # --- OpenAI LLM for summary + tags --- llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=os.environ["OPENAI_API_KEY"]) # Ensure that sheet_info is not empty before passing it to the LLM sheets = sheet_info if sheet_info else "No sheets available" prompt = PromptTemplate( input_variables=["file_name", "file_type", "columns", "sheets", "preview"], template=""" You are a metadata extractor for data files. Given this file: - File Name: {file_name} - File Type: {file_type} - Sheets: {sheets} - Columns: {columns} - Preview Content: {preview} Generate strictly valid JSON with keys: - summary_text: A concise 2-3 sentence description of the file. - tags: 8-12 recommended metadata search keywords. These tags should be: * Derived from the file name, sheet names, column names, and content * Consider the inferred data domain (healthcare, finance, etc.) * Focus on analysis use cases (e.g., "trends", "forecasting", "customer behavior") """ ) ai_response = llm.predict(prompt.format(file_name=file_name, file_type=file_type, columns=all_columns_list, sheets=sheets, preview=file_preview)) # Try parsing the AI response to valid JSON try: ai_json = json.loads(ai_response) except: # In case the AI response is not valid JSON, fallback to simple text ai_json = {"summary_text": ai_response.strip(), "tags": ["data", "file", "metadata"]} # --- Build Final Metadata JSON --- metadata = { "file_name": file_name, "file_path": file_path, "file_type": file_type if file_type else "unknown", "file_size_bytes": file_size, "num_sheets": len(sheet_info) if sheet_info else None, "sheets": sheet_info, "all_columns_list": list(set(all_columns_list)), "data_types": data_types, "df_info": df_info_str, "df_describe": df_describe_dict, "tags": ai_json.get("tags", []), "summary_text": ai_json.get("summary_text", ""), "created_date": created_date, "modified_date": modified_date, "creator": "system_user", "custom_user_given_metadata": custom_metadata } #add custom metadata to the metadata return metadata # =============================== # 🔹 Run Example # =============================== if __name__ == "__main__": test_file = r"C:\\Users\\Dell\\Documents\\MR-AI\\files\\healthcare-data-30.csv" # replace with your uploaded file in Colab meta = create_file_metadata(test_file) print(json.dumps(meta, indent=2))