Spaces:

srivatsavdamaraju
/

mr_mvp_dev

Sleeping

App Files Files Community

srivatsavdamaraju committed on Nov 22, 2025

Commit

75c39de

verified ·

1 Parent(s): 2287980

Create r8.py

Browse files

Files changed (1) hide show

s3/r8.py +593 -0

s3/r8.py ADDED Viewed

	@@ -0,0 +1,593 @@

+#with chunking and proper file metadata upload including file_hash
+from fastapi import FastAPI, UploadFile, File, HTTPException, Query, APIRouter, Form
+from fastapi.responses import JSONResponse
+import pandas as pd
+from autoviz.AutoViz_Class import AutoViz_Class
+import io, os, boto3, tempfile, glob, matplotlib, json, hashlib, shutil
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import sys
+from pathlib import Path
+from typing import List, Optional
+import httpx
+import datetime
+import numpy as np
+# --- Project Root Setup ---
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+from retrieve_secret import *
+from s3.meta_data_creation_from_s3 import create_file_metadata_from_df
+from s3.create_dataset_graphs import create_data_set_graphs_dict
+# --- File Validation ---
+MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024  # 100 MiB
+MAX_ROWS_ALLOWED = 1_000_000
+ALLOWED_EXTENSIONS = {".csv", ".xlsx", ".xls", ".ods"}
+ALLOWED_METADATA_EXTENSIONS = {".json", ".csv", ".xlsx", ".xls"}
+# --- AWS S3 Config ---
+print("AWS S3 config:", AWS_S3_CREDS_KEY_ID, AWS_S3_CREDS_SECRET_KEY, BUCKET_NAME)
+ACCESS_KEY = AWS_S3_CREDS_KEY_ID
+SECRET_KEY = AWS_S3_CREDS_SECRET_KEY
+BUCKET_NAME = BUCKET_NAME
+REGION_NAME = "us-east-1"
+s3 = boto3.client(
+    "s3",
+    aws_access_key_id=ACCESS_KEY,
+    aws_secret_access_key=SECRET_KEY,
+    region_name=REGION_NAME
+)
+ENDPOINT_URL = f"https://s3.{REGION_NAME}.amazonaws.com"
+# --- FastAPI Router ---
+s3_bucket_router1 = APIRouter(prefix="/s3/v3", tags=["s3_v3"])
+# --- Helper: S3 Key ---
+def make_key(path: str, filename: str) -> str:
+    return f"{path.strip('/')}/{filename}" if path else filename
+# --- Sanitize for JSON ---
+def sanitize_for_json(obj):
+    """Recursively sanitize objects to be JSON serializable"""
+    if isinstance(obj, dict):
+        return {k: sanitize_for_json(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [sanitize_for_json(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return [sanitize_for_json(item) for item in obj]
+    elif isinstance(obj, (pd.Timestamp, pd.DatetimeTZDtype, datetime.datetime, datetime.date, datetime.time)):
+        return obj.isoformat()
+    elif isinstance(obj, (np.integer, np.int64, np.int32, np.int16, np.int8)):
+        return int(obj)
+    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
+        return float(obj)
+    elif isinstance(obj, (np.bool_, bool)):
+        return bool(obj)
+    elif isinstance(obj, np.ndarray):
+        return sanitize_for_json(obj.tolist())
+    elif pd.isna(obj):
+        return None
+    elif isinstance(obj, (int, float, str, bool, type(None))):
+        return obj
+    else:
+        return str(obj)
+# --- Vector DB Placeholders ---
+def check_vdb(user_id: str):
+    print(f"Checking VDB for user: {user_id}")
+async def add_metadata_only(collection_name: str, metadata: dict):
+    print(f"Adding metadata to collection: {collection_name}")
+    return {"status": "success", "collection": collection_name}
+# --- Convert to Parquet ---
+import pyarrow as pa
+import pyarrow.parquet as pq
+def convert_df_to_parquet(df: pd.DataFrame) -> io.BytesIO:
+    """
+    Robust DataFrame → Parquet conversion with automated dtype correction.
+    Solves:
+    - Mixed-type object columns
+    - Non-JSON-serializable datetime values
+    - NaN / NaT issues
+    """
+    df_copy = df.copy()
+    for col in df_copy.columns:
+        # Skip if already correct dtype
+        if pd.api.types.is_numeric_dtype(df_copy[col]) or \
+           pd.api.types.is_datetime64_any_dtype(df_copy[col]) or \
+           pd.api.types.is_bool_dtype(df_copy[col]):
+            continue
+        # Clean object columns
+        if df_copy[col].dtype == "object":
+            sample = df_copy[col].dropna()
+            if len(sample) == 0:
+                df_copy[col] = None
+                continue
+            # Detect if column has datetime-like values
+            sample_values = sample.head(50)
+            has_datetime = any(
+                isinstance(x, (pd.Timestamp, datetime.date, datetime.time)) or
+                (isinstance(x, str) and any(k in x.lower() for k in ["-", "/", ":"]))
+                for x in sample_values
+            )
+            if has_datetime:
+                try:
+                    df_copy[col] = pd.to_datetime(df_copy[col], errors="coerce")
+                    # Convert to ISO string for Parquet
+                    df_copy[col] = df_copy[col].dt.strftime("%Y-%m-%d %H:%M:%S")
+                    continue
+                except Exception:
+                    pass
+            # Try numeric conversion
+            try:
+                numeric_conv = pd.to_numeric(df_copy[col], errors="coerce")
+                if numeric_conv.notna().sum() / sample.count() > 0.70:
+                    df_copy[col] = numeric_conv
+                    continue
+            except Exception:
+                pass
+            # Final fallback → string
+            df_copy[col] = df_copy[col].astype(str)
+    # Replace remaining bad values (for Parquet safety)
+    df_copy = df_copy.replace({"nan": None, "NaT": None, "None": None})
+    # Ensure all timezone-aware datetime columns convert safely
+    for col in df_copy.columns:
+        if pd.api.types.is_datetime64_any_dtype(df_copy[col]):
+            df_copy[col] = df_copy[col].dt.tz_localize(None).astype(str)
+    # Convert to Parquet binary buffer
+    buffer = io.BytesIO()
+    table = pa.Table.from_pandas(df_copy)
+    pq.write_table(table, buffer, compression="snappy")
+    buffer.seek(0)
+    print(f"📦 Parquet conversion OK — {len(buffer.getvalue()):,} bytes")
+    return buffer
+# --- Check File Hash Exists ---
+async def check_file_hash_exists(user_id: str, file_hash: str) -> dict:
+    """Check if a file hash already exists for a user."""
+    url = f"https://mr-mvp-api-dev.dev.ingenspark.com/auth/UserMetadata/{user_id}/check_file_hash?file_hash={file_hash}"
+    headers = {"accept": "application/json"}
+    async with httpx.AsyncClient(timeout=10.0) as client:
+        try:
+            r = await client.get(url, headers=headers)
+            r.raise_for_status()
+            result = r.json()
+            print(f"Hash check API response: {result}")
+            exists = result.get("exists", None)
+            if exists is None and "message" in result:
+                message_lower = result["message"].lower()
+                exists = "already existed" in message_lower or "already exists" in message_lower or "duplicate" in message_lower
+            if exists is None:
+                exists = False
+            return {
+                "success": True,
+                "exists": exists,
+                "data": result
+            }
+        except httpx.HTTPStatusError as e:
+            print(f"Hash check HTTP error: {e.response.status_code} - {e.response.text}")
+            return {
+                "success": False,
+                "exists": False,
+                "message": f"HTTP {e.response.status_code}",
+                "error_detail": e.response.text
+            }
+        except Exception as e:
+            print(f"Hash check exception: {str(e)}")
+            return {
+                "success": False,
+                "exists": False,
+                "message": f"Request failed: {str(e)}"
+            }
+# --- PostgreSQL Metadata Upload ---
+async def user_metadata_upload_pg(
+    user_id: str,
+    user_metadata: str,
+    path: str,
+    url: str,
+    filename: str,
+    file_type: str,
+    file_size_bytes: int,
+    file_hash: str,
+    timeout: float = 10.0,
+    data_sets_preview_graph: str = None,
+    is_metadata_file: bool = False,
+    metadata_file_path: str = None
+):
+    payload = {
+        "user_id": user_id,
+        "user_metadata": user_metadata,
+        "path": path,
+        "url": url,
+        "filename": filename,
+        "file_type": file_type,
+        "file_size_bytes": file_size_bytes,
+        "file_hash": file_hash,
+        "data_sets_preview_graph": data_sets_preview_graph,
+        "is_metadata_file": is_metadata_file,
+        "metadata_file_path": metadata_file_path
+    }
+    print(f"PostgreSQL payload file_hash: {payload['file_hash']}")
+    async with httpx.AsyncClient() as client:
+        try:
+            r = await client.post(
+                "https://mr-mvp-api-dev.dev.ingenspark.com/auth/UserMetadataCreate",
+                json=payload,
+                timeout=timeout
+            )
+            r.raise_for_status()
+            result = r.json()
+            result["file_hash"] = file_hash
+            return {"success": True, "data": result}
+        except httpx.HTTPStatusError as e:
+            return {
+                "success": False,
+                "error": "HTTP error",
+                "status_code": e.response.status_code,
+                "detail": e.response.text,
+                "file_hash": file_hash
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": "Request failed",
+                "detail": str(e),
+                "file_hash": file_hash
+            }
+# --- DEBUG ENDPOINT ---
+@s3_bucket_router1.get("/debug/check_hash/{user_id}/{file_hash}")
+async def debug_check_hash(user_id: str, file_hash: str):
+    """Debug endpoint to test hash checking"""
+    result = await check_file_hash_exists(user_id, file_hash)
+    return {
+        "raw_result": result,
+        "exists_value": result.get("exists"),
+        "exists_type": type(result.get("exists")).__name__,
+        "success_value": result.get("success"),
+        "interpretation": "File exists" if result.get("exists") is True else "File does not exist or check failed"
+    }
+# --- MAIN ENDPOINT WITH OPTIONAL METADATA FILE ---
+@s3_bucket_router1.post("/upload_datasets_v3/")
+async def upload_file(
+    file: UploadFile = File(..., description="Main data file"),
+    user_id: str = Query(..., description="User ID"),
+    path: str = Query("", description="Optional subpath"),
+    is_metadata_file: bool = Form(False, description="Toggle for separate metadata file upload"),
+    metadata_file: Optional[UploadFile] = File(None, description="Optional separate metadata file")
+):
+    """
+    Upload dataset with optional separate metadata file.
+    - If is_metadata_file=False (default): Single file upload, metadata generated from data
+    - If is_metadata_file=True: Main file + separate metadata file required
+
+    Pipeline: validate extensions -> read the upload -> SHA-256 duplicate check
+    -> parse into a DataFrame -> convert to Parquet -> upload to S3 (plus the
+    raw metadata file, if any) -> build metadata and preview graphs -> record
+    them through the vector-DB stub and the metadata API -> return a summary.
+
+    Returns:
+        A dict describing the upload (S3 key/URL, hash, sizes, row/column
+        counts, metadata, and the vector-DB / PostgreSQL outcomes), or a
+        409 JSONResponse when the hash check reports the file as a duplicate.
+
+    Raises:
+        HTTPException 400: unsupported extension, empty file, missing or
+            empty metadata file, or the data file cannot be parsed.
+        HTTPException 413: file larger than MAX_FILE_SIZE_BYTES or more rows
+            than MAX_ROWS_ALLOWED.
+        HTTPException 500: metadata-file upload failure or any other
+            unexpected error.
+
+    NOTE(review): the ``path`` query parameter is accepted but not read in the
+    visible body; the S3 key is always built from ``user_id`` and the filename.
+    """
+    # NOTE(review): html_tmp_dir / bokeh_tmp_dir are never assigned in the
+    # visible body, so the clean-up in `finally` currently has nothing to remove.
+    html_tmp_dir = None
+    bokeh_tmp_dir = None
+    file_content = None
+    metadata_content = None
+    try:
+        # 1. Validate main file extension
+        # NOTE(review): file.filename comes from the client; a missing filename
+        # would make splitext raise and surface as a 500 via the generic handler.
+        file_ext = os.path.splitext(file.filename)[1].lower()
+        if file_ext not in ALLOWED_EXTENSIONS:
+            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_ext}")
+        # 2. Handle metadata file toggle
+        if is_metadata_file:
+            if not metadata_file:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Metadata file is required when is_metadata_file=True"
+                )
+            # Validate metadata file extension
+            metadata_ext = os.path.splitext(metadata_file.filename)[1].lower()
+            if metadata_ext not in ALLOWED_METADATA_EXTENSIONS:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Unsupported metadata file type: {metadata_ext}"
+                )
+            # Read metadata file
+            metadata_content = await metadata_file.read()
+            if not metadata_content:
+                raise HTTPException(status_code=400, detail="Empty metadata file")
+            print(f"📋 Separate metadata file provided: {metadata_file.filename}")
+        # 3. Read main file
+        # NOTE(review): the size limit is enforced only after the whole upload
+        # has been read into memory.
+        file_content = await file.read()
+        if not file_content:
+            raise HTTPException(status_code=400, detail="Empty file")
+        if len(file_content) > MAX_FILE_SIZE_BYTES:
+            raise HTTPException(status_code=413, detail="File exceeds 100 MiB limit")
+        # 4. Generate hash
+        file_hash = hashlib.sha256(file_content).hexdigest()
+        print(f"Generated file hash: {file_hash}")
+        # 5. Check hash via API
+        hash_result = await check_file_hash_exists(user_id, file_hash)
+        print(f"Hash check result: {hash_result}")
+        # A failed check only logs a warning: the upload continues unless the
+        # check explicitly reports exists=True.
+        if not hash_result.get("success", False):
+            print(f"⚠️ Warning: Hash check API failed: {hash_result.get('message')}")
+        if hash_result.get("exists") is True:
+            print(f"🚫 Duplicate file detected: {file_hash}")
+            return JSONResponse(
+                status_code=409,
+                content={
+                    "message": "File already uploaded.",
+                    "reason": "Duplicate file detected via SHA-256 hash.",
+                    "file_hash": file_hash,
+                    "user_id": user_id,
+                    "filename": file.filename,
+                    "action": "skipped",
+                    "existing_file_info": hash_result.get("data")
+                }
+            )
+        print("✅ Hash check passed. New file - proceeding with upload.")
+        # 6. Load DataFrame
+        # Every extension in ALLOWED_EXTENSIONS has a branch below, so `df` is
+        # always bound after a successful parse.
+        try:
+            if file_ext == ".csv":
+                df = pd.read_csv(io.BytesIO(file_content))
+            elif file_ext in {".xlsx", ".xls"}:
+                engine = 'openpyxl' if file_ext == ".xlsx" else 'xlrd'
+                df = pd.read_excel(io.BytesIO(file_content), engine=engine)
+            elif file_ext == ".ods":
+                df = pd.read_excel(io.BytesIO(file_content), engine='odf')
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Failed to parse file: {str(e)}")
+        if len(df) > MAX_ROWS_ALLOWED:
+            raise HTTPException(status_code=413, detail=f"Too many rows: {len(df):,} > {MAX_ROWS_ALLOWED:,}")
+        # 7. Convert to Parquet
+        parquet_buffer = convert_df_to_parquet(df)
+        parquet_size = parquet_buffer.getbuffer().nbytes
+        # 8. Upload Parquet to S3
+        # NOTE(review): the key is derived from the client-supplied filename
+        # without sanitizing, and uploads whose names differ only by extension
+        # map to the same .parquet key -- confirm whether overwriting is intended.
+        base_filename = os.path.splitext(file.filename)[0]
+        parquet_filename = f"{base_filename}.parquet"
+        file_key = f"{user_id}/files/datasets/{parquet_filename}"
+        file_url = f"{ENDPOINT_URL}/{BUCKET_NAME}/{file_key}"
+        s3.upload_fileobj(parquet_buffer, BUCKET_NAME, file_key,
+                          ExtraArgs={'ContentType': 'application/octet-stream'})
+        print(f"Uploaded Parquet: {file_url}")
+        # 9. Handle metadata file upload to S3 (if separate metadata file provided)
+        metadata_file_s3_url = None
+        metadata_file_s3_key = None
+        if is_metadata_file and metadata_content:
+            # Upload metadata file directly to S3 without conversion
+            print("📋 Uploading separate metadata file to S3...")
+            try:
+                metadata_filename = metadata_file.filename
+                metadata_ext = os.path.splitext(metadata_filename)[1].lower()
+                # Determine content type
+                content_type_map = {
+                    '.json': 'application/json',
+                    '.csv': 'text/csv',
+                    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                    '.xls': 'application/vnd.ms-excel'
+                }
+                content_type = content_type_map.get(metadata_ext, 'application/octet-stream')
+                # Upload to S3 in metadata subfolder
+                metadata_file_s3_key = f"{user_id}/files/datasets/{base_filename}/metadata/{metadata_filename}"
+                metadata_file_s3_url = f"{ENDPOINT_URL}/{BUCKET_NAME}/{metadata_file_s3_key}"
+                s3.put_object(
+                    Bucket=BUCKET_NAME,
+                    Key=metadata_file_s3_key,
+                    Body=metadata_content,
+                    ContentType=content_type
+                )
+                print(f"✅ Uploaded metadata file to S3: {metadata_file_s3_url}")
+            except Exception as e:
+                # The Parquet object uploaded in step 8 is not removed on this path.
+                print(f"⚠️ Failed to upload metadata file to S3: {e}")
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Failed to upload metadata file: {str(e)}"
+                )
+        # 10. Generate auto metadata from data
+        print("🔄 Auto-generating metadata from data...")
+        try:
+            metadata = create_file_metadata_from_df(df, parquet_filename, file_key)
+            print("✅ Raw metadata generated")
+        except Exception as e:
+            print(f"⚠️ Error generating metadata: {e}")
+            import traceback
+            traceback.print_exc()
+            # Fallback minimal metadata
+            metadata = {
+                "filename": parquet_filename,
+                "s3_path": file_key,
+                "rows": len(df),
+                "columns": len(df.columns),
+                "column_names": list(df.columns),
+                "error": f"Metadata generation failed: {str(e)}"
+            }
+        # ✅ CRITICAL FIX: Sanitize metadata immediately after creation
+        print("🔄 Sanitizing metadata for JSON compatibility...")
+        metadata = sanitize_for_json(metadata)
+        print("✅ Metadata sanitized successfully")
+        # Add metadata file info if provided
+        # NOTE(review): the item assignments below assume the metadata value is
+        # a dict -- confirm against create_file_metadata_from_df.
+        if is_metadata_file and metadata_file_s3_url:
+            metadata["metadata_source"] = "separate_file"
+            metadata["metadata_file"] = {
+                "filename": metadata_file.filename,
+                "s3_path": metadata_file_s3_key,
+                "s3_url": metadata_file_s3_url,
+                "size_bytes": len(metadata_content)
+            }
+        else:
+            metadata["metadata_source"] = "auto_generated"
+        # Add common metadata fields
+        # file_content is non-empty here (checked in step 3), so the ratio
+        # division cannot be by zero.
+        metadata.update({
+            "user_id": user_id,
+            "s3_path": file_key,
+            "s3_url": file_url,
+            "source_file": file.filename,
+            "source_file_type": file_ext[1:],
+            "file_type": "parquet",
+            "original_file_size_bytes": len(file_content),
+            "parquet_file_size_bytes": parquet_size,
+            "compression_ratio": f"{(1 - parquet_size/len(file_content))*100:.1f}%",
+            "file_hash": file_hash,
+            "has_separate_metadata": is_metadata_file
+        })
+        # 11. Generate dataset preview graphs
+        print("Generating dataset preview graphs...")
+        dataset_graphs = None
+        try:
+            dataset_graphs = create_data_set_graphs_dict(df, max_rows=200)
+            # ✅ Sanitize graphs immediately
+            dataset_graphs = sanitize_for_json(dataset_graphs)
+            print(f"✅ Generated graphs for {len(dataset_graphs.get('columnSummaries', []))} columns")
+        except Exception as e:
+            # Graph generation is best-effort: record the error and continue.
+            print(f"⚠️ Failed to generate dataset graphs: {e}")
+            import traceback
+            traceback.print_exc()
+            dataset_graphs = {
+                "error": str(e),
+                "columnSummaries": []
+            }
+        # Ensure dataset_graphs is sanitized
+        if dataset_graphs is None:
+            dataset_graphs = {"columnSummaries": []}
+        # 12. Vector DB
+        check_vdb(user_id)
+        try:
+            vdb_res = await add_metadata_only("sri_1_files_&_files_metadata", metadata)
+            vdb_success = vdb_res.get("status") == "success"
+            print(f"VDB upload success: {vdb_success}")
+        except Exception as e:
+            print(f"⚠️ VDB upload failed: {e}")
+            vdb_success = False
+        # 13. PostgreSQL Metadata
+        # A failure here does not fail the request; it is reported through
+        # upload_dataset_pg / pg_details in the response.
+        try:
+            # Convert metadata to JSON string
+            metadata_json_str = json.dumps(metadata)
+            dataset_graphs_json = dataset_graphs  # Already sanitized, pass as dict
+            pg_result = await user_metadata_upload_pg(
+                user_id=user_id,
+                user_metadata=metadata_json_str,
+                path=file_key,
+                url=file_url,
+                filename=parquet_filename,
+                file_type="parquet",
+                file_size_bytes=parquet_size,
+                file_hash=file_hash,
+                data_sets_preview_graph=dataset_graphs_json,
+                is_metadata_file=is_metadata_file,
+                metadata_file_path=metadata_file_s3_key
+            )
+            print(f"PostgreSQL upload result: {pg_result}")
+            pg_success = pg_result.get("success", False)
+        except Exception as e:
+            print(f"⚠️ PostgreSQL upload failed: {e}")
+            import traceback
+            traceback.print_exc()
+            pg_success = False
+            pg_result = {"success": False, "error": str(e)}
+        graphs_count = len(dataset_graphs.get("columnSummaries", []))
+        # 14. Return success
+        response_data = {
+            "message": "Upload successful.",
+            "filename": parquet_filename,
+            "original_filename": file.filename,
+            "user_id": user_id,
+            "file_path": file_key,
+            "file_url": file_url,
+            "file_hash": file_hash,
+            "source_file_type": file_ext[1:],
+            "file_type": "parquet",
+            "original_file_size_bytes": len(file_content),
+            "parquet_file_size_bytes": parquet_size,
+            "compression_ratio": f"{(1 - parquet_size/len(file_content))*100:.1f}%",
+            "rows": len(df),
+            "columns": len(df.columns),
+            "has_separate_metadata": is_metadata_file,
+            "metadata": metadata,
+            "upload_dataset_vdb": vdb_success,
+            "upload_dataset_pg": pg_success,
+            "pg_details": pg_result,
+            "graphs_generated": graphs_count
+        }
+        if is_metadata_file and metadata_file_s3_url:
+            response_data["metadata_file_uploaded"] = {
+                "filename": metadata_file.filename,
+                "s3_path": metadata_file_s3_key,
+                "s3_url": metadata_file_s3_url,
+                "size_bytes": len(metadata_content)
+            }
+        return response_data
+    except HTTPException:
+        # Deliberate HTTP errors raised above pass through unchanged.
+        raise
+    except Exception as e:
+        # Anything unexpected is logged with a traceback and reported as a 500.
+        print(f"Unexpected error: {e}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # Clean up temp directories
+        for d in (html_tmp_dir, bokeh_tmp_dir):
+            if d and os.path.exists(d):
+                shutil.rmtree(d, ignore_errors=True)