Spaces: Running on Zero

moqingyan123 committed · Commit f71f431 · 1 Parent(s): 19446f1

updates
- app.py +6 -0
- src/vine_hf/OVERVIEW.md +0 -218
- src/vine_hf/README.md +0 -355
- src/vine_hf/__init__.py +0 -23
- src/vine_hf/__pycache__/__init__.cpython-310.pyc +0 -0
- src/vine_hf/__pycache__/flattening.cpython-310.pyc +0 -0
- src/vine_hf/__pycache__/vine_config.cpython-310.pyc +0 -0
- src/vine_hf/__pycache__/vine_model.cpython-310.pyc +0 -0
- src/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc +0 -0
- src/vine_hf/__pycache__/vis_utils.cpython-310.pyc +0 -0
- src/vine_hf/convert_inference.py +0 -288
- src/vine_hf/example_ensemble_weights.py +0 -333
- src/vine_hf/example_sam2_masks.py +0 -331
- src/vine_hf/example_usage.ipynb +0 -310
- src/vine_hf/example_usage.py +0 -283
- src/vine_hf/example_visualization.py +0 -146
- src/vine_hf/example_with_pretrained_vine.py +0 -287
- src/vine_hf/flattening.py +0 -124
- src/vine_hf/push_to_hub.py +0 -232
- src/vine_hf/setup.py +0 -63
- src/vine_hf/vine_config.py +0 -108
- src/vine_hf/vine_hf.egg-info/PKG-INFO +0 -401
- src/vine_hf/vine_hf.egg-info/SOURCES.txt +0 -21
- src/vine_hf/vine_hf.egg-info/dependency_links.txt +0 -1
- src/vine_hf/vine_hf.egg-info/entry_points.txt +0 -2
- src/vine_hf/vine_hf.egg-info/requires.txt +0 -16
- src/vine_hf/vine_hf.egg-info/top_level.txt +0 -1
- src/vine_hf/vine_model.py +0 -702
- src/vine_hf/vine_pipeline.py +0 -691
- src/vine_hf/vis_utils.py +0 -941
- test_vine.py +5 -2
- vine_hf/__init__.py +15 -1
- vine_hf/__pycache__/__init__.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vine_config.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vine_model.cpython-310.pyc +0 -0
- vine_hf/convert_inference.py +6 -2
- vine_hf/example_ensemble_weights.py +6 -2
- vine_hf/example_sam2_masks.py +6 -3
- vine_hf/example_usage.py +6 -2
- vine_hf/example_visualization.py +6 -2
- vine_hf/example_with_pretrained_vine.py +6 -2
- vine_hf/push_to_hub.py +6 -2
- vine_hf/push_to_video_fm.py +5 -4
- vine_hf/vine_model.py +7 -0
- vine_hf/vine_pipeline.py +7 -0
- vine_hf/vis_utils.py +9 -1
app.py
CHANGED
@@ -7,6 +7,12 @@ import tempfile
 import os
 import sys
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 import spaces  # <-- ZeroGPU integration
 import gradio as gr
 import torch
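For context, the inserted hunk is equivalent to the following standalone sketch. It assumes `Path` is already imported from `pathlib` earlier in app.py (the hunk itself does not show that import):

```python
import sys
from pathlib import Path  # assumed import; the hunk relies on Path already being available

# Resolve the directory containing app.py and its src/ subfolder.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir / "src"

# Prepend src/ so packages vendored under src/ (LASER, video-sam2, GroundingDINO)
# become importable, but only if the directory exists and is not already on the path.
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
```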
src/vine_hf/OVERVIEW.md
DELETED
@@ -1,218 +0,0 @@

# VINE HuggingFace Interface - Complete Overview

This directory contains a complete HuggingFace-compatible interface for the VINE (Video Understanding with Natural Language) model. The interface allows you to easily use, share, and deploy your VINE model through the HuggingFace ecosystem.

## 📁 Directory Structure

```
vine_hf/
├── __init__.py            # Package initialization and exports
├── vine_config.py         # VineConfig class (PretrainedConfig)
├── vine_model.py          # VineModel class (PreTrainedModel)
├── vine_pipeline.py       # VinePipeline class (Pipeline)
├── example_usage.py       # Comprehensive usage examples
├── convert_inference.py   # Migration guide from inference.py
├── push_to_hub.py         # Script to push model to HF Hub
├── setup.py               # Package setup configuration
├── README.md              # Detailed documentation
└── OVERVIEW.md            # This file
```

## 🏗️ Architecture Components

### 1. VineConfig (`vine_config.py`)
- Inherits from `PretrainedConfig`
- Configures model parameters, segmentation methods, and processing options
- Compatible with HuggingFace configuration system

### 2. VineModel (`vine_model.py`)
- Inherits from `PreTrainedModel`
- Implements the core VINE model with three CLIP backbones
- Supports categorical, unary, and binary predictions
- Provides both `forward()` and `predict()` methods

### 3. VinePipeline (`vine_pipeline.py`)
- Inherits from `Pipeline`
- Handles end-to-end video processing workflow
- Integrates segmentation (SAM2, Grounding DINO + SAM2)
- Provides user-friendly interface for video understanding

## 🚀 Key Features

✅ **Full HuggingFace Compatibility**
- Compatible with `transformers` library
- Supports `AutoModel` and `pipeline` interfaces
- Can be pushed to and loaded from HuggingFace Hub

✅ **Flexible Segmentation**
- Support for SAM2 automatic segmentation
- Support for Grounding DINO + SAM2 text-guided segmentation
- Configurable thresholds and parameters

✅ **Multi-Modal Understanding**
- Categorical classification (object types)
- Unary predicates (single object actions)
- Binary relations (object-object relationships)

✅ **Easy Integration**
- Simple pipeline interface for end users
- Direct model access for researchers
- Comprehensive configuration options

## 📖 Usage Examples

### Quick Start with Pipeline
```python
from transformers import pipeline
from vine_hf import VineModel, VinePipeline

# Create pipeline
vine_pipeline = pipeline(
    "vine-video-understanding",
    model="your-username/vine-model",
    trust_remote_code=True
)

# Process video
results = vine_pipeline(
    "video.mp4",
    categorical_keywords=['human', 'dog', 'frisbee'],
    unary_keywords=['running', 'jumping'],
    binary_keywords=['chasing', 'behind']
)
```

### Direct Model Usage
```python
from vine_hf import VineConfig, VineModel

config = VineConfig(segmentation_method="grounding_dino_sam2")
model = VineModel(config)

results = model.predict(
    video_frames=video_tensor,
    masks=masks_dict,
    bboxes=bboxes_dict,
    categorical_keywords=['human', 'dog'],
    unary_keywords=['running', 'sitting'],
    binary_keywords=['chasing', 'near']
)
```

## 🔧 Migration from Original Code

The `convert_inference.py` script shows how to migrate from the original `inference.py` workflow:

**Original Approach:**
- Manual model loading and configuration
- Direct handling of segmentation pipeline
- Custom result processing
- Complex setup requirements

**New HuggingFace Interface:**
- Standardized model configuration
- Automatic preprocessing/postprocessing
- Simple pipeline interface
- Easy sharing via HuggingFace Hub

## 📤 Sharing Your Model

Use the `push_to_hub.py` script to share your trained model:

```bash
python vine_hf/push_to_hub.py \
    --weights path/to/your/model.pth \
    --repo your-username/vine-model \
    --login
```

## 🛠️ Installation & Setup

1. **Install Dependencies:**
   ```bash
   pip install transformers torch torchvision opencv-python pillow numpy
   ```

2. **Install Segmentation Models (Optional):**
   - SAM2: https://github.com/facebookresearch/sam2
   - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO

3. **Install VINE HF Interface:**
   ```bash
   cd vine_hf
   pip install -e .
   ```

## 🎯 Configuration Options

The `VineConfig` class supports extensive configuration:

- **Model Settings:** CLIP backbone, hidden dimensions
- **Segmentation:** Method, thresholds, target FPS
- **Processing:** Alpha values, top-k results, video length limits
- **Performance:** Multi-class mode, output format options

## 📊 Output Format

The interface returns structured predictions:

```python
{
    "categorical_predictions": {obj_id: [(prob, category), ...]},
    "unary_predictions": {(frame, obj): [(prob, action), ...]},
    "binary_predictions": {(frame, pair): [(prob, relation), ...]},
    "confidence_scores": {"categorical": float, "unary": float, "binary": float},
    "summary": {
        "num_objects_detected": int,
        "top_categories": [(category, prob), ...],
        "top_actions": [(action, prob), ...],
        "top_relations": [(relation, prob), ...]
    }
}
```

## 🔍 Testing & Validation

Run the example scripts to test your setup:

```bash
# Test basic functionality
python vine_hf/example_usage.py

# Test migration from original code
python vine_hf/convert_inference.py
```

## 🤝 Contributing

To contribute or customize:

1. **Modify Configuration:** Edit `vine_config.py` for new parameters
2. **Extend Model:** Add functionality to `vine_model.py`
3. **Enhance Pipeline:** Improve preprocessing/postprocessing in `vine_pipeline.py`
4. **Add Features:** Create additional utility scripts

## 📝 Next Steps

1. **Load Your Weights:** Use your trained VINE model weights
2. **Test Segmentation:** Set up Grounding DINO and SAM2 models
3. **Validate Results:** Compare with original inference.py output
4. **Share Model:** Push to HuggingFace Hub for community use
5. **Deploy:** Use in applications, demos, or research projects

## 🐛 Troubleshooting

**Common Issues:**
- **Import Errors:** Check PYTHONPATH and package installation
- **Segmentation Failures:** Verify Grounding DINO/SAM2 setup
- **Weight Loading:** Adjust weight loading logic in `convert_inference.py`
- **CUDA Issues:** Check GPU availability and PyTorch installation

**Support:**
- Check the README.md for detailed documentation
- Review example_usage.py for working code examples
- Examine convert_inference.py for migration guidance

---

This HuggingFace interface makes VINE accessible to the broader ML community while maintaining all the powerful video understanding capabilities of the original model. The standardized interface enables easy sharing, deployment, and integration with existing HuggingFace workflows.
src/vine_hf/README.md
DELETED
@@ -1,355 +0,0 @@

# VINE HuggingFace Interface

VINE (Video Understanding with Natural Language) is a model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.

This package provides a HuggingFace-compatible interface for the VINE model, making it easy to use for video understanding tasks.

## Features

- **Categorical Classification**: Classify objects in videos (e.g., "human", "dog", "frisbee")
- **Unary Predicates**: Detect actions on single objects (e.g., "running", "jumping", "sitting")
- **Binary Relations**: Detect relationships between object pairs (e.g., "behind", "in front of", "chasing")
- **Multiple Segmentation Methods**: Support for SAM2 and Grounding DINO + SAM2
- **HuggingFace Integration**: Full compatibility with HuggingFace transformers and pipelines
- **Visualization Hooks**: Optional high-level visualizations plus lightweight debug mask dumps for quick sanity checks

## Installation

```bash
# Install the package (assuming it's in your Python path)
pip install transformers torch torchvision
pip install opencv-python pillow numpy

# For segmentation functionality, you'll also need:
# - SAM2: https://github.com/facebookresearch/sam2
# - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
```

## Segmentation Model Configuration

`VinePipeline` lazily brings up the segmentation stack the first time a call needs masks. Thresholds, FPS, visualization toggles, and device selection live in `VineConfig`; the pipeline constructor tells it where to fetch SAM2 / GroundingDINO weights or lets you inject already-instantiated modules.

### Provide file paths at construction (most common)

```python
from vine_hf import VineConfig, VineModel, VinePipeline

vine_config = VineConfig(
    segmentation_method="grounding_dino_sam2",  # or "sam2"
    box_threshold=0.35,
    text_threshold=0.25,
    target_fps=5,
    visualization_dir="output/visualizations",  # where to write visualizations (and debug visualizations if enabled)
    debug_visualizations=True,  # write videos of the GroundingDINO/SAM2/binary/unary, etc. outputs
    pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
    device="cuda:0",  # accepts int, str, or torch.device
)

vine_model = VineModel(vine_config)

vine_pipeline = VinePipeline(
    model=vine_model,
    tokenizer=None,
    sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
    sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
    gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
    device=vine_config._device,
)
```

When `segmentation_method="grounding_dino_sam2"`, both SAM2 and GroundingDINO must be reachable. The pipeline validates the paths; missing files raise a `ValueError`. If you pick `"sam2"`, only the SAM2 config and checkpoint are required.

### Reuse pre-initialized segmentation modules

If you build the segmentation stack elsewhere, inject the components with `set_segmentation_models` before running the pipeline:

```python
from sam2.build_sam import build_sam2_video_predictor, build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from groundingdino.util.inference import Model as GroundingDINOModel

sam_predictor = build_sam2_video_predictor(..., device=vine_config._device)
mask_generator = SAM2AutomaticMaskGenerator(build_sam2(..., device=vine_config._device))
grounding_model = GroundingDINOModel(..., device=vine_config._device)

vine_pipeline.set_segmentation_models(
    sam_predictor=sam_predictor,
    mask_generator=mask_generator,
    grounding_model=grounding_model,
)
```

Any argument left as `None` is initialized lazily from the file paths when the pipeline first needs that backend.

## Quick Start

## Requirements

- torch
- torchvision
- transformers
- opencv-python
- matplotlib
- seaborn
- pandas
- numpy
- ipywidgets
- tqdm
- scikit-learn
- sam2 (from Facebook Research): https://github.com/video-fm/video-sam2
- sam2 weights (downloaded separately, e.g. https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
- groundingdino (from IDEA Research)
- groundingdino weights (downloaded separately, e.g. https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth)
- spacy-fastlang
- en-core-web-sm (for spacy-fastlang)
- ffmpeg (for video processing)
- (optional) LASER weights/full model checkpoint (downloaded separately, e.g. https://huggingface.co/video-fm/vine_v0)

Usually, running laser/environments/laser_env.yml from the LASER repo installs most dependencies. You will need to manually install sam2 and groundingdino per their instructions.

### Using the Pipeline (Recommended)
```python
from transformers.pipelines import PIPELINE_REGISTRY
from vine_hf import VineConfig, VineModel, VinePipeline

PIPELINE_REGISTRY.register_pipeline(
    "vine-video-understanding",
    pipeline_class=VinePipeline,
    pt_model=VineModel,
    type="multimodal",
)

config = VineConfig(
    segmentation_method="grounding_dino_sam2",
    pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
    visualization_dir="output",
    visualize=True,
    device="cuda:0",
)

model = VineModel(config)

vine_pipeline = VinePipeline(
    model=model,
    tokenizer=None,
    sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
    sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
    gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
    device=config._device,
)

results = vine_pipeline(
    "/path/to/video.mp4",
    categorical_keywords=["dog", "human"],
    unary_keywords=["running"],
    binary_keywords=["chasing"],
    object_pairs=[(0, 1)],
    return_top_k=3,
    include_visualizations=True,
)
print(results["summary"])
```

### Using the Model Directly (Advanced)

For advanced users who want to provide their own segmentation:

```python
from vine_hf import VineConfig, VineModel
import torch

# Create configuration
config = VineConfig(
    pretrained_vine_path="/path/to/your/vine/weights"  # Optional: your fine-tuned weights
)

# Initialize model
model = VineModel(config)

# If you have your own video frames, masks, and bboxes from external segmentation
video_frames = torch.randn(3, 224, 224, 3) * 255  # Your video frames
masks = {0: {1: torch.ones(224, 224, 1)}}          # Your segmentation masks
bboxes = {0: {1: [50, 50, 150, 150]}}              # Your bounding boxes

# Run prediction
results = model.predict(
    video_frames=video_frames,
    masks=masks,
    bboxes=bboxes,
    categorical_keywords=['human', 'dog', 'frisbee'],
    unary_keywords=['running', 'jumping'],
    binary_keywords=['chasing', 'following'],
    object_pairs=[(1, 2)],
    return_top_k=3
)
```

**Note**: For most users, the pipeline approach above is recommended as it handles video loading and segmentation automatically.

## Configuration Options

The `VineConfig` class supports the following parameters (non-exhaustive):

- `model_name`: CLIP model backbone (default: `"openai/clip-vit-large-patch14-336"`)
- `pretrained_vine_path`: Optional path or Hugging Face repo with pretrained VINE weights
- `segmentation_method`: `"sam2"` or `"grounding_dino_sam2"` (default: `"grounding_dino_sam2"`)
- `box_threshold` / `text_threshold`: Grounding DINO thresholds
- `target_fps`: Target FPS for video processing (default: `1`)
- `alpha`, `white_alpha`: Rendering parameters used when extracting masked crops
- `topk_cate`: Top-k categories to return per object (default: `3`)
- `max_video_length`: Maximum frames to process (default: `100`)
- `visualize`: When `True`, pipeline post-processing attempts to create stitched visualizations
- `visualization_dir`: Optional base directory where visualization assets are written
- `debug_visualizations`: When `True`, the model saves a single first-frame mask composite for quick inspection
- `debug_visualization_path`: Target filepath for the debug mask composite (must point to a writable file)
- `return_flattened_segments`, `return_valid_pairs`, `interested_object_pairs`: Advanced geometry outputs for downstream consumers

## Output Format

The model returns a dictionary with the following structure:

```python
{
    "masks": {},
    "boxes": {},
    "categorical_predictions": {
        object_id: [(probability, category), ...]
    },
    "unary_predictions": {
        (frame_id, object_id): [(probability, action), ...]
    },
    "binary_predictions": {
        (frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
    },
    "confidence_scores": {
        "categorical": max_categorical_confidence,
        "unary": max_unary_confidence,
        "binary": max_binary_confidence
    },
    "summary": {
        "num_objects_detected": int,
        "top_categories": [(category, probability), ...],
        "top_actions": [(action, probability), ...],
        "top_relations": [(relation, probability), ...]
    }
}
```

## Visualization & Debugging

There are two complementary visualization layers:

- **Post-process visualizations** (`include_visualizations=True` in the pipeline call) produce a high-level stitched video summarizing detections, actions, and relations over time.

- **Debug visualizations** (`debug_visualizations=True` in `VineConfig`) dump videos of intermediate segmentation masks and outputs from GroundingDINO, SAM2, unary, binary, etc. for quick sanity checks.

If you plan to enable either option, ensure the relevant output directories exist before running the pipeline.

## Segmentation Methods

### Grounding DINO + SAM2 (Recommended)

Uses Grounding DINO for object detection based on text prompts, then SAM2 for precise segmentation.

Requirements:
- Grounding DINO model and weights
- SAM2 model and weights
- Properly configured paths to model checkpoints

### SAM2 Only

Uses SAM2's automatic mask generation without text-based object detection.

Requirements:
- SAM2 model and weights

## Model Architecture

VINE is built on top of CLIP and uses three separate CLIP models for different tasks:
- **Categorical Model**: For object classification
- **Unary Model**: For single-object action recognition
- **Binary Model**: For relationship detection between object pairs

Each model processes both visual and textual features to compute similarity scores and probability distributions.

## Pushing to HuggingFace Hub

```python
from vine_hf import VineConfig, VineModel

# Create and configure your model
config = VineConfig()
model = VineModel(config)

# Load your pretrained weights
# model.load_state_dict(torch.load('path/to/your/weights.pth'))

# Register for auto classes
config.register_for_auto_class()
model.register_for_auto_class("AutoModel")

# Push to Hub
config.push_to_hub('your-username/vine-model')
model.push_to_hub('your-username/vine-model')
```

## Loading from HuggingFace Hub

```python
from transformers import AutoModel, pipeline

# Load model
model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)

# Or use with pipeline
vine_pipeline = pipeline(
    'vine-video-understanding',
    model='your-username/vine-model',
    trust_remote_code=True
)
```

## Examples

See `example_usage.py` for comprehensive examples including:
- Direct model usage
- Pipeline usage
- HuggingFace Hub integration
- Real video processing

## Requirements

- Python 3.7+
- PyTorch 1.9+
- transformers 4.20+
- OpenCV
- PIL/Pillow
- NumPy

For segmentation:
- SAM2 (Facebook Research)
- Grounding DINO (IDEA Research)

## Citation

If you use VINE in your research, please cite:

```bibtex
@article{vine2024,
  title={VINE: Video Understanding with Natural Language},
  author={Your Authors},
  journal={Your Journal},
  year={2024}
}
```

## License

[Your License Here]

## Contact

[Your Contact Information Here]
src/vine_hf/__init__.py
DELETED
@@ -1,23 +0,0 @@

"""
VINE HuggingFace Interface

VINE (Video Understanding with Natural Language) is a model that processes videos
along with categorical, unary, and binary keywords to return probability
distributions over those keywords for detected objects and their relationships.

This package provides a HuggingFace-compatible interface for the VINE model,
including configuration, model, and pipeline classes.
"""

from .vine_config import VineConfig
from .vine_model import VineModel
from .vine_pipeline import VinePipeline

__version__ = "1.0.0"
__author__ = "LASER Team"

__all__ = [
    "VineConfig",
    "VineModel",
    "VinePipeline"
]
src/vine_hf/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (780 Bytes)
src/vine_hf/__pycache__/flattening.cpython-310.pyc
DELETED
Binary file (3.31 kB)
src/vine_hf/__pycache__/vine_config.cpython-310.pyc
DELETED
Binary file (4.41 kB)
src/vine_hf/__pycache__/vine_model.cpython-310.pyc
DELETED
Binary file (16.3 kB)
src/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc
DELETED
Binary file (18.1 kB)
src/vine_hf/__pycache__/vis_utils.cpython-310.pyc
DELETED
Binary file (25.1 kB)
src/vine_hf/convert_inference.py
DELETED
@@ -1,288 +0,0 @@

"""
Script to convert existing inference.py workflow to use VINE HuggingFace interface

This script demonstrates how to migrate from the original inference.py approach
to the new HuggingFace-compatible interface.
"""

import os
import sys
import torch
import numpy as np
from typing import Dict, List, Tuple, Any

# Add paths for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
    """
    Load a pretrained VINE model from the original format into HuggingFace format.

    Args:
        model_dir: Directory containing the model
        model_name: Name of the model file (without .{epoch}.model extension)
        epoch: Epoch number to load

    Returns:
        VineModel instance with loaded weights
    """
    print(f"Loading pretrained VINE model from {model_dir}")

    # Create configuration (adjust parameters as needed)
    # We expect local ensemble weights in `model_dir`, so configure
    # VineConfig to load from local directory/filename.
    model_file = f"{model_name}.{epoch}.model"
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        box_threshold=0.35,
        text_threshold=0.25,
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=model_file,
    )

    # Initialize model (VineModel will consult the config when loading)
    vine_model = VineModel(config)

    # Load original weights
    model_file = f"{model_name}.{epoch}.model"
    model_path = os.path.join(model_dir, model_file)

    if os.path.exists(model_path):
        print(f"Loading weights from: {model_path}")
        try:
            # Add safe globals for PyTorch 2.6+
            import torch.serialization
            from laser.models.llava_clip_model_v3 import PredicateModel
            torch.serialization.add_safe_globals([PredicateModel])

            # Load the original model
            original_model = torch.load(model_path, map_location='cpu', weights_only=False)

            # Transfer weights to HuggingFace model
            # This assumes the original model has the same structure
            # You may need to adjust this based on your specific model structure

            if hasattr(original_model, 'clip_cate_model'):
                vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
            if hasattr(original_model, 'clip_unary_model'):
                vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
            if hasattr(original_model, 'clip_binary_model'):
                vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
            if hasattr(original_model, 'clip_tokenizer'):
                vine_model.clip_tokenizer = original_model.clip_tokenizer
            if hasattr(original_model, 'clip_processor'):
                vine_model.clip_processor = original_model.clip_processor

            print("✓ Weights transferred successfully")

        except Exception as e:
            print(f"✗ Error loading weights: {e}")
            print("You may need to adjust the weight loading logic for your specific model")

    else:
        print(f"✗ Model file not found: {model_path}")

    return vine_model


def convert_inference_workflow():
    """
    Convert the original inference.py workflow to use HuggingFace interface.

    This function demonstrates how to replicate the original inference workflow
    using the new HuggingFace-compatible components.
    """
    print("=== Converting Inference Workflow ===")

    # Original parameters from inference.py
    video_id = 'v1'
    target_fps = 1
    classes = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'sitting', 'standing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

    # Paths (adjust these to match your setup)
    demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
    video_dir = os.path.join(demo_dir, "videos")
    video_path = os.path.join(video_dir, f"{video_id}.mp4")

    # Model paths (adjust these to match your setup)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"

    # Segmentation model paths (adjust these to your actual paths)
    sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
    sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
    gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
    gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"

    print(f"Video path: {video_path}")
    print(f"Model dir: {model_dir}")
    print(f"SAM2 config: {sam_config_path}")
    print(f"GroundingDINO config: {gd_config_path}")

    # Check if video exists
    if not os.path.exists(video_path):
        print(f"✗ Video not found: {video_path}")
        print("Please adjust the video path or use your own video file")
        return

    # 1. Load video (same as original)
    print(f"Loading video: {video_id}")
    video_tensor = load_video(video_path, target_fps=target_fps)
    print(f"Video shape: {video_tensor.shape}")

    # 2. Load VINE model with HuggingFace interface
    print("Loading VINE model...")
    if os.path.exists(model_dir):
        vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
    else:
        print(f"Model directory not found: {model_dir}")
        print("Creating new model with random weights for demonstration")
        config = VineConfig()
        vine_model = VineModel(config)

    # 3. Create pipeline for easier use
    print("Creating VINE pipeline...")
    from transformers.pipelines import PIPELINE_REGISTRY

    # Register pipeline if not already registered
    try:
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
    except Exception:
        pass  # Already registered

    # Create pipeline instance with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        # GroundingDINO configuration
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path
    )

    # 4. Process video with new interface
    print("Processing video with VINE HuggingFace interface...")

    try:
        # Use the pipeline to process the video
        results = vine_pipeline(
            video_path,
            categorical_keywords=classes,
            unary_keywords=unary_keywords,
            binary_keywords=binary_keywords,
            object_pairs=[(1, 2), (2, 3)],  # Example object pairs
            segmentation_method='grounding_dino_sam2',
            target_fps=target_fps,
            return_top_k=3,
            include_visualizations=False
        )

        # 5. Display results (similar to original format)
        print("\n=== VINE Results (HuggingFace Interface) ===")

        # Categorical predictions
        print("\nCategorical Predictions:")
        for obj_id, predictions in results['categorical_predictions'].items():
            print(f"  Object {obj_id}:")
            for prob, category in predictions:
                print(f"    {prob:.3f}: {category}")

        # Unary predictions
        print("\nUnary Predictions:")
        for (frame_id, obj_id), predictions in results['unary_predictions'].items():
            print(f"  Frame {frame_id}, Object {obj_id}:")
            for prob, action in predictions:
                print(f"    {prob:.3f}: {action}")

        # Binary predictions
        print("\nBinary Predictions:")
        for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
            print(f"  Frame {frame_id}, Objects {obj_pair}:")
            for prob, relation in predictions:
                print(f"    {prob:.3f}: {relation}")

        # Summary
        print(f"\nSummary:")
        print(f"  Objects detected: {results['summary']['num_objects_detected']}")
        print(f"  Top categories: {results['summary']['top_categories']}")
        print(f"  Top actions: {results['summary']['top_actions']}")
        print(f"  Top relations: {results['summary']['top_relations']}")

        print("\n✓ Successfully processed video with VINE HuggingFace interface!")

    except Exception as e:
        print(f"✗ Error processing video: {e}")
        print("This may be due to missing segmentation models or other dependencies")
        print("The interface is set up correctly, but full functionality requires:")
        print("  1. Properly installed Grounding DINO and SAM2")
        print("  2. Correct model weights")
        print("  3. Proper configuration paths")


def compare_interfaces():
    """
    Compare the original inference.py approach with the new HuggingFace interface.
    """
    print("\n=== Interface Comparison ===")

    print("\nOriginal inference.py approach:")
    print("✓ Direct access to model internals")
    print("✓ Full control over segmentation pipeline")
    print("✗ Complex setup and configuration")
    print("✗ Not compatible with HuggingFace ecosystem")
    print("✗ Requires manual handling of all components")

    print("\nNew HuggingFace interface:")
    print("✓ Easy to use pipeline interface")
    print("✓ Compatible with HuggingFace Hub")
    print("✓ Standardized configuration")
    print("✓ Automatic handling of preprocessing/postprocessing")
    print("✓ Easy sharing and distribution")
    print("✓ Configurable segmentation model paths")
    print("✗ Slightly less direct control (can still access model directly)")

    print("\nMigration benefits:")
    print("• Share your model easily on HuggingFace Hub")
    print("• Users can load your model with a single line")
    print("• Standardized interface for video understanding")
    print("• Better integration with other HuggingFace tools")
    print("• Simplified deployment and inference")
    print("• Flexible segmentation model configuration")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Conversion")
    print("=" * 50)

    # Run conversion demonstration
    convert_inference_workflow()

    # Show comparison
    compare_interfaces()

    print("\n" + "=" * 50)
    print("Next steps:")
    print("1. Install SAM2 and GroundingDINO dependencies")
    print("2. Download the required model checkpoints")
    print("3. Update the paths in this script to point to your models")
    print("4. Test the interface with your specific model weights")
    print("5. Adjust configuration parameters as needed")
    print("6. Push your model to HuggingFace Hub using push_to_hub.py")
    print("7. Share with the community!")
src/vine_hf/example_ensemble_weights.py
DELETED
@@ -1,333 +0,0 @@

"""
Example demonstrating how to load and use VINE ensemble weights

This script shows the correct way to load your pretrained VINE ensemble weights
and use them with the HuggingFace interface, based on the actual inference.py workflow.
"""

import os
import sys
import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

# os.environ["OPENAI_API_KEY"] = "dummy-key"  # Set your OpenAI API key here or via environment variable

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_load_ensemble_weights():
    """Example of loading ensemble weights correctly."""
    print("=== Loading Ensemble VINE Weights ===")

    # Path to your ensemble model (adjust this to your actual path)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    print(f"Looking for ensemble weights in: {model_dir}")

    if os.path.exists(model_dir):
        print("✓ Model directory found")

        # List available model files
        model_files = [f for f in os.listdir(model_dir) if f.endswith('.model')]
        print(f"Available model files: {model_files}")

        if model_files:
            # Create configuration with ensemble path (local directory with .model files)
            config = VineConfig(
                segmentation_method="grounding_dino_sam2",
                use_hf_repo=False,
                local_dir=model_dir,
                local_filename=None,
            )

            print("Creating VINE model with ensemble weights...")
            vine_model = VineModel(config)

            print("✓ VINE model created with ensemble weights!")
            return vine_model
        else:
            print("✗ No .model files found in directory")
            return None
    else:
        print(f"✗ Model directory not found: {model_dir}")
        print("Please adjust the path to point to your ensemble weights")
        return None


def example_direct_ensemble_loading():
    """Example of loading ensemble weights using from_pretrained_vine."""
    print("\n=== Direct Ensemble Loading ===")

    # Path to specific ensemble file
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if os.path.exists(model_dir):
        try:
            # Use the class method for direct loading
            vine_model = VineModel.from_pretrained_vine(
                model_path=model_dir,
                epoch=0  # Load epoch 0
            )

            print("✓ Model loaded using from_pretrained_vine!")
            return vine_model

        except Exception as e:
            print(f"✗ Error loading with from_pretrained_vine: {e}")
            return None
    else:
        print(f"✗ Model directory not found: {model_dir}")
        return None


def example_compare_original_vs_hf():
    """Compare the original inference.py approach with HuggingFace interface."""
    print("\n=== Comparing Original vs HuggingFace Interface ===")

    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"
    epoch = 0

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    print("Original approach (from inference.py):")
    print("```python")
    print("def load_model(model_dir, model_name, epoch, device):")
    print("    model_name = model_name + f'.{epoch}.model'")
    print("    predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
    print("    return predicate_model")
    print("")
    print("predicate_model = load_model(model_dir, model_name, epoch, device)")
    print("```")

    print("\nNew HuggingFace approach:")
    print("```python")
    print("config = VineConfig(pretrained_vine_path=model_dir)")
    print("vine_model = VineModel(config)")
    print("# or")
    print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
    print("```")

    # Try to load with both approaches if possible
    try:
        # Original approach
        def load_model(model_dir, model_name, epoch, device):
            model_name = model_name + f'.{epoch}.model'
            model_path = os.path.join(model_dir, model_name)
            if os.path.exists(model_path):
                return torch.load(model_path, map_location=device, weights_only=False)
            else:
                print(f"Model file not found: {model_path}")
                return None

        device = "cuda" if torch.cuda.is_available() else "cpu"
        original_model = load_model(model_dir, model_name, epoch, device)

        if original_model:
            print(f"✓ Original model loaded: {type(original_model)}")
            print(f"  Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}")

        # HuggingFace approach
        vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)

        if vine_model:
            print(f"✓ HuggingFace model loaded: {type(vine_model)}")
            print(f"  Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}")

        print("\n✓ Both approaches work! HuggingFace interface successfully loads ensemble weights.")

    except Exception as e:
        print(f"Error in comparison: {e}")


def example_ensemble_with_pipeline():
    """Example using ensemble weights with the pipeline."""
    print("\n=== Using Ensemble Weights with Pipeline ===")

    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    # Register pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Create model with ensemble weights (local directory)
    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )

    vine_model = VineModel(config)
    # Create pipeline with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # GroundingDINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    print("✓ Pipeline created with ensemble VINE weights")

    # Check for demo video
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")

        # Use the same keywords as in the original inference.py
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

        print("Example pipeline usage:")
        print("```python")
        print("results = vine_pipeline(")
        print(f"    '{demo_video}',")
        print(f"    categorical_keywords={categorical_keywords},")
        print(f"    unary_keywords={unary_keywords},")
        print(f"    binary_keywords={binary_keywords},")
        print("    segmentation_method='grounding_dino_sam2'")
        print(")")
        print("```")

        # Uncomment to actually run (requires segmentation models)
        # try:
        #     results = vine_pipeline(
        #         demo_video,
        #         categorical_keywords=categorical_keywords,
        #         unary_keywords=unary_keywords,
        #         binary_keywords=binary_keywords,
        #         segmentation_method='grounding_dino_sam2'
        #     )
        #     print("Results:", results['summary'])
        # except Exception as e:
        #     print(f"Pipeline execution failed: {e}")
        #     print("This is expected if segmentation models are not set up")

    return vine_pipeline


def demonstrate_weight_transfer():
    """Demonstrate how weights are transferred from ensemble to HuggingFace format."""
    print("\n=== Weight Transfer Demonstration ===")

    print("The ensemble model structure (PredicateModel):")
    print("- clip_cate_model: CLIP model for categorical classification")
    print("- clip_unary_model: CLIP model for unary predicates")
    print("- clip_binary_model: CLIP model for binary relations")
    print("- clip_tokenizer: Tokenizer for text processing")
    print("- clip_processor: Processor for image processing")

    print("\nWeight transfer process:")
    print("1. Load ensemble model with torch.load()")
    print("2. Initialize base CLIP models in HuggingFace format")
    print("3. Transfer state_dict from ensemble to HuggingFace models:")
    print("   - ensemble.clip_cate_model → hf.clip_cate_model")
    print("   - ensemble.clip_unary_model → hf.clip_unary_model")
    print("   - ensemble.clip_binary_model → hf.clip_binary_model")
    print("4. Transfer tokenizer and processor")

    print("\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!")


def troubleshooting_guide():
    """Provide troubleshooting guide for common issues."""
    print("\n=== Troubleshooting Guide ===")

    print("Common Issues:")
    print("1. 'No model file found for epoch X'")
    print("   → Check that .model files exist in the directory")
    print("   → Verify the epoch number is correct")
    print("   → List files: ls /path/to/model/dir/*.model")

    print("\n2. 'Error loading VINE weights'")
    print("   → Check file permissions")
    print("   → Verify the model file is not corrupted")
    print("   → Try loading with torch.load() directly first")

    print("\n3. 'CLIP model mismatch'")
    print("   → Ensure config.model_name matches the base model used in training")

    print("\n4. 'Device mismatch errors'")
    print("   → Models are loaded to CPU first, then moved to device")
    print("   → Check CUDA availability with torch.cuda.is_available()")
|
| 284 |
-
|
| 285 |
-
print("\nDebugging steps:")
|
| 286 |
-
print("1. Test loading ensemble model directly:")
|
| 287 |
-
print(" model = torch.load('path/to/model.0.model', map_location='cpu')")
|
| 288 |
-
print("2. Check model attributes:")
|
| 289 |
-
print(" print(dir(model))")
|
| 290 |
-
print("3. Verify state_dict keys:")
|
| 291 |
-
print(" print(model.clip_cate_model.state_dict().keys())")
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
if __name__ == "__main__":
|
| 295 |
-
print("VINE Ensemble Weights Loading Examples")
|
| 296 |
-
print("=" * 50)
|
| 297 |
-
|
| 298 |
-
# Test ensemble weight loading
|
| 299 |
-
try:
|
| 300 |
-
model1 = example_load_ensemble_weights()
|
| 301 |
-
except Exception as e:
|
| 302 |
-
print(f"Ensemble loading example failed: {e}")
|
| 303 |
-
|
| 304 |
-
try:
|
| 305 |
-
model2 = example_direct_ensemble_loading()
|
| 306 |
-
except Exception as e:
|
| 307 |
-
print(f"Direct loading example failed: {e}")
|
| 308 |
-
|
| 309 |
-
# Compare approaches
|
| 310 |
-
try:
|
| 311 |
-
example_compare_original_vs_hf()
|
| 312 |
-
except Exception as e:
|
| 313 |
-
print(f"Comparison example failed: {e}")
|
| 314 |
-
|
| 315 |
-
# Test pipeline with ensemble weights
|
| 316 |
-
try:
|
| 317 |
-
pipeline = example_ensemble_with_pipeline()
|
| 318 |
-
except Exception as e:
|
| 319 |
-
print(f"Pipeline example failed: {e}")
|
| 320 |
-
|
| 321 |
-
# Educational content
|
| 322 |
-
demonstrate_weight_transfer()
|
| 323 |
-
troubleshooting_guide()
|
| 324 |
-
|
| 325 |
-
print("\n" + "=" * 50)
|
| 326 |
-
print("Key Points:")
|
| 327 |
-
print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
|
| 328 |
-
print("2. Use torch.load() to load the ensemble, then transfer weights")
|
| 329 |
-
print("3. The HuggingFace interface preserves your fine-tuned weights")
|
| 330 |
-
print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
|
| 331 |
-
print("5. Use VineModel.from_pretrained_vine() for direct loading")
|
| 332 |
-
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vine_hf/example_sam2_masks.py
DELETED
@@ -1,331 +0,0 @@
"""
Example demonstrating SAM2 mask generation in VINE HuggingFace interface

This script shows how to use both SAM2-only and Grounding DINO + SAM2
segmentation methods with the VINE model.
"""

import os
import sys
import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Add the parent directory to the path to import vine_hf

#Either uncomment the below or set a environemental key, though it isn't needed to run.
#os.environ['OPENAI_API_KEY'] = 'dummy-key'

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_sam2_only_segmentation():
    """Example using SAM2 automatic mask generation only."""
    print("=== SAM2-Only Segmentation Example ===")

    # Create configuration for SAM2-only
    config = VineConfig(
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="sam2",  # Use SAM2 only
        target_fps=1,
        debug_visualizations=True,
    )

    # Register pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Create model and pipeline with SAM2 paths
    vine_model = VineModel(config)
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/your/sam2/sam_config.yaml",
        sam_checkpoint_path="path/to/your/sam2/sam_checkpoint.pth",
        gd_config_path="path/to/your/groundingdino/config.py",
        gd_checkpoint_path="path/to/your/groundingdino/checkpoint.pth",
    )

    # Check for demo video
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")

    if os.path.exists(demo_video):
        print(f"Processing video: {demo_video}")

        # Define keywords (SAM2 will find all objects, then classify them)
        categorical_keywords = ['human', 'dog', 'frisbee', 'object', 'person', 'animal']
        unary_keywords = ['running', 'jumping', 'sitting', 'standing', 'moving', 'static']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing', 'following']
        object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]


        print("Using SAM2 automatic mask generation...")
        print("This will find all objects in the video automatically")

        try:
            # Process with SAM2 only
            results = vine_pipeline(
                demo_video,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method="sam2",
                return_top_k=3,
                debug_visualizations=True,
                debug_visualization_path=os.path.join(os.getcwd(), "sam2_debug_masks.png"),
            )

            print("\n✓ SAM2 segmentation completed!")
            print("Results summary:")
            print(f"  Objects detected: {results['summary']['num_objects_detected']}")
            print(f"  Top categories: {results['summary']['top_categories']}")
            print(f"  Top actions: {results['summary']['top_actions']}")

            return results

        except Exception as e:
            print(f"SAM2 segmentation failed: {e}")
            print("Make sure SAM2 models are properly installed")
            return None
    else:
        print(f"Demo video not found: {demo_video}")
        return None

def example_grounding_dino_sam2_segmentation():
    """Example using Grounding DINO + SAM2 text-guided segmentation."""
    print("\n=== Grounding DINO + SAM2 Segmentation Example ===")

    # Create configuration for Grounding DINO + SAM2
    config = VineConfig(
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="grounding_dino_sam2",  # Use text-guided segmentation
        box_threshold=0.35,
        text_threshold=0.25,
        target_fps=1,
        debug_visualizations=True,
    )

    # Create model and pipeline with both SAM2 and GroundingDINO paths
    vine_model = VineModel(config)
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )

    # Check for demo video
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")

    if os.path.exists(demo_video):
        print(f"Processing video: {demo_video}")

        # Define keywords (Grounding DINO will look specifically for these)
        categorical_keywords = ['human', 'dog', 'frisbee']  # Specific objects to find
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'chasing', 'next to', 'throwing to']
        object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]
        print("Using Grounding DINO + SAM2 text-guided segmentation...")
        print(f"Looking specifically for: {categorical_keywords}")

        try:
            # Process with Grounding DINO + SAM2
            results = vine_pipeline(
                demo_video,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method="grounding_dino_sam2",
                box_threshold=0.35,
                text_threshold=0.25,
                return_top_k=3,
                debug_visualizations=True,
            )

            print("\n✓ Grounding DINO + SAM2 segmentation completed!")
            print("Results summary:")
            print(f"  Objects detected: {results['summary']['num_objects_detected']}")
            print(f"  Top categories: {results['summary']['top_categories']}")
            print(f"  Top actions: {results['summary']['top_actions']}")
            print(f"  Top relations: {results['summary']['top_relations']}")

            return results

        except Exception as e:
            print(f"Grounding DINO + SAM2 segmentation failed: {e}")
            print("Make sure both Grounding DINO and SAM2 models are properly installed")
            return None
    else:
        print(f"Demo video not found: {demo_video}")
        return None


def compare_segmentation_methods():
    """Compare SAM2-only vs Grounding DINO + SAM2 approaches."""
    print("\n=== Comparing Segmentation Methods ===")

    print("\nSAM2-Only Approach:")
    print("✓ Finds all objects automatically")
    print("✓ No need to specify what to look for")
    print("✓ Good for exploratory analysis")
    print("✗ May find too many irrelevant objects")
    print("✗ Less precise for specific object types")

    print("\nGrounding DINO + SAM2 Approach:")
    print("✓ Finds specific objects based on text prompts")
    print("✓ More precise and targeted")
    print("✓ Better for known object categories")
    print("✓ Integrates object detection with segmentation")
    print("✗ Limited to specified categories")
    print("✗ Requires knowing what objects to look for")


def demonstrate_mask_processing():
    """Demonstrate how masks are processed internally."""
    print("\n=== Mask Processing Demonstration ===")

    # Load a video to show the processing pipeline
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")

    if os.path.exists(demo_video):
        print("Loading video for mask processing demo...")

        # Load video tensor
        video_tensor = np.asarray(load_video(demo_video, target_fps=1))
        print(f"Video shape: {video_tensor.shape}")

        # Create pipeline with segmentation model paths
        config = VineConfig(segmentation_method="sam2")
        vine_model = VineModel(config)
        vine_pipeline = VinePipeline(
            model=vine_model,
            tokenizer=None,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )

        try:
            # Process just the first few frames to show the pipeline
            print("\nProcessing first 2 frames with SAM2...")

            # Manually call the preprocessing to show the steps
            processed_data = vine_pipeline.preprocess(
                video_tensor[:2],  # Just first 2 frames
                segmentation_method="sam2",
                categorical_keywords=['object']
            )

            print("Mask processing results:")
            print(f"  Number of frames processed: {processed_data['num_frames']}")
            print(f"  Frames with masks: {list(processed_data['masks'].keys())}")

            # Show mask details
            for frame_id, frame_masks in processed_data['masks'].items():
                print(f"  Frame {frame_id}: {len(frame_masks)} objects detected")
                for obj_id, mask in frame_masks.items():
                    print(f"    Object {obj_id}: mask shape {mask.shape}")

            print("\nBounding box extraction:")
            for frame_id, frame_bboxes in processed_data['bboxes'].items():
                print(f"  Frame {frame_id}: {len(frame_bboxes)} bounding boxes")
                for obj_id, bbox in frame_bboxes.items():
                    print(f"    Object {obj_id}: bbox {bbox}")

        except Exception as e:
            print(f"Mask processing failed: {e}")
            print("This is expected if SAM2 models are not properly set up")
    else:
        print(f"Demo video not found: {demo_video}")


def test_mask_formats():
    """Test different mask input formats."""
    print("\n=== Testing Mask Formats ===")

    # Create dummy data to test mask processing
    height, width = 224, 224

    # Test different mask formats
    print("Testing mask format conversions...")

    # Format 1: NumPy boolean array
    mask_np = np.random.rand(height, width) > 0.5
    print(f"NumPy mask: {mask_np.shape}, dtype: {mask_np.dtype}")

    # Format 2: PyTorch tensor
    mask_torch = torch.from_numpy(mask_np)
    print(f"PyTorch mask: {mask_torch.shape}, dtype: {mask_torch.dtype}")

    # Format 3: 3D mask with singleton dimension
    mask_3d = mask_torch.unsqueeze(-1)
    print(f"3D mask: {mask_3d.shape}")

    # Test bounding box extraction
    from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox

    try:
        bbox = mask_to_bbox(mask_torch)
        print(f"Extracted bbox: {bbox}")
        print("✓ Mask format testing successful")
    except Exception as e:
        print(f"Mask format testing failed: {e}")


if __name__ == "__main__":
    print("VINE SAM2 Mask Generation Examples")
    print("=" * 50)

    # Test SAM2-only approach
    try:
        sam2_results = example_sam2_only_segmentation()
    except Exception as e:
        print(f"SAM2-only example failed: {e}")

    # Test Grounding DINO + SAM2 approach
    try:
        gd_sam2_results = example_grounding_dino_sam2_segmentation()
    except Exception as e:
        print(f"Grounding DINO + SAM2 example failed: {e}")

    # Compare approaches
    compare_segmentation_methods()

    # Demonstrate mask processing
    try:
        demonstrate_mask_processing()
    except Exception as e:
        print(f"Mask processing demo failed: {e}")

    # Test mask formats
    try:
        test_mask_formats()
    except Exception as e:
        print(f"Mask format testing failed: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nKey takeaways:")
    print("1. SAM2-only: Automatic object detection and segmentation")
    print("2. Grounding DINO + SAM2: Text-guided object detection and segmentation")
    print("3. Both methods provide masks and bounding boxes for VINE model")
    print("4. Choose method based on whether you know what objects to look for")
src/vine_hf/example_usage.ipynb
DELETED
@@ -1,310 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44d53281",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kevinx/miniconda3/envs/laser_env/lib/python3.10/site-packages/pydantic/_internal/_config.py:383: UserWarning: Valid config keys have changed in V2:\n",
      "* 'schema_extra' has been renamed to 'json_schema_extra'\n",
      " warnings.warn(message, UserWarning)\n",
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import torch\n",
    "from transformers import pipeline, AutoModel\n",
    "from transformers.pipelines import PIPELINE_REGISTRY\n",
    "\n",
    "# Uncomment or set your own\n",
    "#os.environ['OPENAI_API_KEY'] = 'dummy-key'\n",
    "from vine_hf import VineConfig, VineModel, VinePipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "174e479f",
   "metadata": {},
   "outputs": [],
   "source": [
    "PIPELINE_REGISTRY.register_pipeline(\n",
    " \"vine-video-understanding\",\n",
    " pipeline_class=VinePipeline,\n",
    " pt_model=VineModel,\n",
    " type=\"multimodal\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9af2770",
   "metadata": {},
   "outputs": [],
   "source": [
    "vine_config = VineConfig(\n",
    " model_name=\"openai/clip-vit-base-patch32\",\n",
    " # Local file example: set use_hf_repo=False and provide local_dir/local_filename\n",
    " use_hf_repo=False,\n",
    " local_dir=os.path.dirname('/path/to/your/pretrained/model.pt'),\n",
    " local_filename=os.path.basename('/path/to/your/pretrained/model.pt'), # Local file path\n",
    " segmentation_method=\"grounding_dino_sam2\",\n",
    " visualize=True,\n",
    " visualization_dir=\"path/to/visualization/dir\",\n",
    " debug_visualizations=True,\n",
    " device=0, # Change to your desired device\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "274e6515",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded state type: <class 'collections.OrderedDict'>\n"
     ]
    }
   ],
   "source": [
    "vine_pipeline = VinePipeline(\n",
    " model=VineModel(vine_config), \n",
    " tokenizer=None,\n",
    " sam_config_path=\"path/to/sam2/configs/sam2_hiera_base_plus.yaml\",\n",
    " sam_checkpoint_path=\"path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt\",\n",
    " gd_config_path=\"path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py\",\n",
    " gd_checkpoint_path=\"path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "123a090d",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_keywords = ['human', 'dog', 'frisbee']\n",
    "unary_keywords = ['running', 'jumping', 'catching', 'throwing']\n",
    "binary_keywords = ['behind', 'in front of', 'next to', 'chasing']\n",
    "object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, dog-frisbee relationships "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0b42f032",
   "metadata": {},
   "outputs": [],
   "source": [
    "demo_video_path = \"/home/kevinx/LASER/LASER/demo/videos/v1.mp4\" # Replace with your video file path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8202c654",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Segmentation method: grounding_dino_sam2\n",
      "Generating Grounding DINO + SAM2 masks...\n",
      "<class 'int'>\n",
      "✓ SAM2 models initialized successfully\n",
      "<class 'int'>\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4314.)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "final text_encoder_type: bert-base-uncased\n",
      "✓ GroundingDINO model initialized successfully\n",
      "Start detecting objects at time 05:08:58.178592\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detecting objects: 0%| | 0/3 [00:00<?, ?it/s]FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
      "UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
      "UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
      "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
      "Detecting objects: 100%|██████████| 3/3 [00:01<00:00, 2.82it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished detecting objects at time 05:08:59.250419\n",
      "Loading inference state at time 05:08:59.544425\n",
      "Number of frames: 3\n",
      "None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing frames: 100%|██████████| 3/3 [00:00<00:00, 11.77it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Annotated frames: []\n",
      "Find the most dense prompt at time 05:09:01.413703\n",
      "Most dense frame: 0\n",
      "\n",
      "\n",
      "Start propagating objects at time 05:09:01.416367\n",
      "Pass count: 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 20.20it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: 1\n",
      "\n",
      "\n",
      "Pass count: 1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: 2\n",
      "\n",
      "\n",
      "Pass count: 2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 25.92it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: -1\n",
      "\n",
      "\n",
      "\n",
      "Results:\n",
      "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    " results = vine_pipeline(\n",
    " demo_video_path,\n",
    " categorical_keywords=categorical_keywords,\n",
    " unary_keywords=unary_keywords,\n",
    " binary_keywords=binary_keywords,\n",
    " object_pairs=object_pairs,\n",
    " segmentation_method='grounding_dino_sam2',\n",
    " return_top_k=3,\n",
    " include_visualizations=False,\n",
    " debug_visualizations=False,\n",
    " )\n",
    " \n",
    " print(\"\\nResults:\")\n",
    " print(f\"Summary: {results['summary']}\")\n",
    " \n",
    "except Exception as e:\n",
    " print(f\"Note: Full execution requires segmentation models to be properly set up.\")\n",
    " print(f\"Error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "414ede9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
     ]
    }
   ],
   "source": [
    "print(f\"Summary: {results['summary']}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "laser_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
src/vine_hf/example_usage.py
DELETED
@@ -1,283 +0,0 @@
"""
Example usage of VINE HuggingFace interface

This script demonstrates how to use the VINE model through the HuggingFace interface
for video understanding with categorical, unary, and binary keyword predictions.
"""

import os
import sys
import torch
from transformers import pipeline, AutoModel
from transformers.pipelines import PIPELINE_REGISTRY

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Uncomment or set your own
#os.environ['OPENAI_API_KEY'] = 'dummy-key'
from vine_hf import VineConfig, VineModel, VinePipeline

def example_direct_model_usage():
    """Example of using the VINE model directly."""
    print("=== Direct Model Usage ===")

    # Create configuration
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
        target_fps=30,
        box_threshold=0.35,
        text_threshold=0.25
    )

    # Initialize model
    model = VineModel(config)

    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")

    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()

    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
    }

    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
    }

    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2

    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3
    )

    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")


def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")

    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )
    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipe = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )


    print("Pipeline created successfully!")

    # Example usage with video path
    video_path = "path/to/your/video.mp4"  # Replace with actual video path

    # For demonstration, we'll show the expected usage format
    print(f"\nExample pipeline call (replace with actual video path):")
    print(f"results = vine_pipeline(")
    print(f"    '{video_path}',")
    print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
    print(f"    unary_keywords=['running', 'jumping', 'sitting'],")
    print(f"    binary_keywords=['behind', 'in front of', 'next to'],")
    print(f"    object_pairs=[(1, 2)],")
    print(f"    segmentation_method='grounding_dino_sam2',")
    print(f"    return_top_k=3,")
    print(f"    return_flattened_segments=True,")
    print(f"    return_valid_pairs=True,")
    print(f"    include_visualizations=True,")
    print(f"    debug_visualizations=True")
    print(f")")

    # Note: Actual execution would require proper video file and segmentation models


def example_huggingface_hub_usage():
    """Example of how to push and load from HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")

    # Example of preparing model for Hub
    config = VineConfig()
    model = VineModel(config)

    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")

    print("Model registered for auto classes")

    # Example push to hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')

    # Example load from hub (commented out - requires actual model on hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)

    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")


def example_with_real_video():
    """Example showing how to use with a real video file."""
    print("\n=== Real Video Usage Example ===")

    # Check if demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")

        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )

        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )

        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )

        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, dog-frisbee relationships

        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")

        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )

            print("\nResults:")
            print(f"Summary: {results['summary']}")

        except Exception as e:
            print(f"Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")

    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)

    # Run examples
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")

    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")

    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")

    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")
src/vine_hf/example_visualization.py
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
# Example visualization runner for VINE
|
| 2 |
-
# - Loads a video (path, demo, or random)
|
| 3 |
-
# - Runs the VINE pipeline
|
| 4 |
-
# - Saves annotated frames and an MP4 if available
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import sys
|
| 8 |
-
import argparse
|
| 9 |
-
import cv2
|
| 10 |
-
import numpy as np
|
| 11 |
-
from collections.abc import Mapping, Sequence
|
| 12 |
-
|
| 13 |
-
from transformers.pipelines import PIPELINE_REGISTRY
|
| 14 |
-
from transformers import pipeline
|
| 15 |
-
|
| 16 |
-
# Set your OpenAI API key here or via environment variable
|
| 17 |
-
os.environ['OPENAI_API_KEY'] = "dummy-key"
|
| 18 |
-
|
| 19 |
-
# Local imports (workspace)
|
| 20 |
-
sys.path.append(os.path.dirname(__file__))
|
| 21 |
-
|
| 22 |
-
from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
|
| 23 |
-
from vine_hf.vine_model import VineModel
|
| 24 |
-
from vine_hf.vine_config import VineConfig
|
| 25 |
-
from laser.loading import load_video
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def build_pipeline(args) -> VinePipeline:
|
| 29 |
-
# Register pipeline type
|
| 30 |
-
PIPELINE_REGISTRY.register_pipeline(
|
| 31 |
-
"vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        model_name="openai/clip-vit-base-patch32",
        # Example: load from HF repo
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        # Alternatively use a local path by setting use_hf_repo=False and local_dir/local_filename
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        debug_visualizations=True,
        device=args.device,
    )

    model = VineModel(config)

    # Create pipeline instance with segmentation model paths (if provided)
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe


def resolve_video(args) -> np.ndarray | str:
    # Priority: user --video -> demo video -> random frames
    if args.video and os.path.exists(args.video):
        return args.video

    demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
    demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
    if os.path.exists(demo_video):
        return demo_video
    if os.path.exists(demo_alt):
        return demo_alt

    # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
    print("No video found; using random frames.")
    rng = np.random.default_rng(0)
    frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
    return frames


def main():
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")

    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Keywords similar to examples/tests
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )

    results = vine_pipe(
        video,
        **call_kwargs,
    )

    # Normalize pipeline output to a dict (can be dict or list[dict])
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # Print brief summary
    summary = result.get("summary", {}) if isinstance(result, dict) else {}
    print("Summary:", summary)


if __name__ == "__main__":
    main()
src/vine_hf/example_with_pretrained_vine.py
DELETED
@@ -1,287 +0,0 @@
"""
Example usage of VINE HuggingFace interface with pretrained VINE weights

This script demonstrates how to use the VINE model with your pretrained weights
from the ensemble format or from video-fm/vine_v0.
"""

import os
import sys
import torch
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Set your OpenAI API key here or via environment variable
# os.environ['OPENAI_API_KEY'] = "dummy-key"

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline


def example_with_local_pretrained_weights():
    print("=== Using Local Pretrained VINE Weights ===")

    # Download https://huggingface.co/video-fm/vine_v0/tree/main/laser_model_v1.pt
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path

    # Create configuration with your pretrained path (local file)
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )

    # Method 1: Initialize model directly
    print("Method 1: Direct model initialization")
    vine_model = VineModel(config)
    print(f"✓ Model initialized with pretrained weights from: {pretrained_vine_file}")

    # Method 2: Use the from_pretrained_vine class method
    print("\nMethod 2: Using from_pretrained_vine class method")
    vine_model_2 = VineModel.from_pretrained_vine(
        model_path=pretrained_vine_file,
        config=config,
        epoch=0  # Specify epoch number
    )
    print("✓ Model loaded using from_pretrained_vine method")

    return vine_model


def example_with_huggingface_hub():
    """Example using VINE weights from HuggingFace Hub."""
    print("\n=== Using HuggingFace Hub Weights ===")

    # Create configuration to use HuggingFace Hub weights
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
    )

    try:
        # Initialize model (will try to load from HF Hub)
        vine_model = VineModel(config)
        print("✓ Model loaded from HuggingFace Hub: video-fm/vine_v0")
        return vine_model
    except Exception as e:
        print(f"✗ Could not load from HuggingFace Hub: {e}")
        print("Make sure your model is pushed to video-fm/vine_v0")
        return None


def example_pipeline_with_pretrained():
    """Example using pipeline with pretrained VINE weights."""
    print("\n=== Pipeline with Pretrained VINE ===")

    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Create configuration with your weights
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )

    # Create model with pretrained weights
    vine_model = VineModel(config)

    # Create pipeline with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0
    )

    print("✓ Pipeline created with pretrained VINE weights")

    # Example usage (would require actual video file)
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")
        print("Example pipeline call:")
        print(f"results = vine_pipeline(")
        print(f"    '{demo_video}',")
        print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
        print(f"    unary_keywords=['running', 'jumping', 'sitting'],")
        print(f"    binary_keywords=['behind', 'chasing', 'next to']")
        print(f"    debug_visualizations=True")
        print(f")")

        # Uncomment to actually run (requires segmentation models)
        # results = vine_pipeline(
        #     demo_video,
        #     categorical_keywords=['human', 'dog', 'frisbee'],
        #     unary_keywords=['running', 'jumping', 'sitting'],
        #     binary_keywords=['behind', 'chasing', 'next to'],
        #     debug_visualizations=True,
        # )
        # print("Results:", results['summary'])

    return vine_pipeline


def example_manual_weight_loading():
    """Example of manually loading weights after model creation."""
    print("\n=== Manual Weight Loading ===")

    # Create model with base CLIP weights
    # No pretrained path: create base config (no HF repo or local file configured)
    config = VineConfig()
    vine_model = VineModel(config)
    print("✓ Model created with base CLIP weights")
    model_dir = "/path/to/your/local/ensemble/model_dir.pt"  # Replace with your model directory

    if os.path.exists(model_dir):
        success = vine_model.load_pretrained_vine_weights(model_dir, epoch=0)
        if success:
            print("✓ Successfully loaded pretrained VINE weights manually")
        else:
            print("✗ Failed to load pretrained weights")
    else:
        print(f"✗ Model directory not found: {model_dir}")

    return vine_model


def compare_model_outputs():
    """Compare outputs between base CLIP and pretrained VINE."""
    print("\n=== Comparing Model Outputs ===")

    # Create dummy data for testing
    video_frames = torch.randn(3, 224, 224, 3) * 255  # 3 frames
    video_frames = video_frames.clamp(0, 255).byte()

    masks = {
        0: {1: torch.ones(224, 224, 1)},
        1: {1: torch.ones(224, 224, 1)},
        2: {1: torch.ones(224, 224, 1)}
    }

    bboxes = {
        0: {1: [50, 50, 150, 150]},
        1: {1: [52, 52, 152, 152]},
        2: {1: [54, 54, 154, 154]}
    }

    keywords = ['human', 'dog', 'frisbee']

    # Model 1: Base CLIP
    print("Creating model with base CLIP weights...")
    config_base = VineConfig()
    model_base = VineModel(config_base)

    # Model 2: Pretrained VINE (if available)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if os.path.exists(model_dir):
        print("Creating model with pretrained VINE weights...")
        config_vine = VineConfig(
            use_hf_repo=False,
            local_dir=model_dir,
            local_filename=None,
        )
        model_vine = VineModel(config_vine)

        print("\nComparing predictions...")

        # Get predictions from both models
        with torch.no_grad():
            results_base = model_base.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3
            )

            results_vine = model_vine.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3
            )

        print("Base CLIP confidence scores:", results_base['confidence_scores'])
        print("Pretrained VINE confidence scores:", results_vine['confidence_scores'])

        print("✓ Successfully compared both models")
    else:
        print(f"Pretrained model not found at: {model_dir}")
        print("Skipping comparison")


if __name__ == "__main__":
    print("VINE HuggingFace Interface - Pretrained Weights Examples")
    print("=" * 60)

    try:
        # Test local pretrained weights
        model1 = example_with_local_pretrained_weights()
    except Exception as e:
        print(f"Local weights example failed: {e}")

    try:
        # Test HuggingFace Hub weights
        model2 = example_with_huggingface_hub()
    except Exception as e:
        print(f"HuggingFace Hub example failed: {e}")

    try:
        # Test pipeline with pretrained weights
        pipeline = example_pipeline_with_pretrained()
    except Exception as e:
        print(f"Pipeline example failed: {e}")

    # try:
    #     # Test manual weight loading
    #     # model3 = example_manual_weight_loading()
    # except Exception as e:
    #     print(f"Manual loading example failed: {e}")

    # try:
    #     # Compare model outputs
    #     # compare_model_outputs()
    # except Exception as e:
    #     print(f"Comparison example failed: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("\nUsage Summary:")
    print("1. Configure VineConfig with `use_hf_repo` + `model_repo` for Hub models, or `use_hf_repo=False` + `local_dir`/`local_filename` for local weights")
    print("2. Use VineModel.from_pretrained_vine() for direct loading")
src/vine_hf/flattening.py
DELETED
@@ -1,124 +0,0 @@
from __future__ import annotations

from collections import defaultdict
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch


MaskType = Union[np.ndarray, torch.Tensor]


def _to_numpy_mask(mask: MaskType) -> np.ndarray:
    """
    Convert assorted mask formats to a 2D numpy boolean array.
    """
    if isinstance(mask, torch.Tensor):
        mask_np = mask.detach().cpu().numpy()
    else:
        mask_np = np.asarray(mask)

    # Remove singleton dimensions at the front/back
    while mask_np.ndim > 2 and mask_np.shape[0] == 1:
        mask_np = np.squeeze(mask_np, axis=0)
    if mask_np.ndim > 2 and mask_np.shape[-1] == 1:
        mask_np = np.squeeze(mask_np, axis=-1)

    if mask_np.ndim != 2:
        raise ValueError(f"Expected mask to be 2D after squeezing, got shape {mask_np.shape}")

    return mask_np.astype(bool)


def _mask_to_bbox(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
    """
    Compute a bounding box for a 2D boolean mask.
    """
    if not mask.any():
        return None
    rows, cols = np.nonzero(mask)
    y_min, y_max = rows.min(), rows.max()
    x_min, x_max = cols.min(), cols.max()
    return x_min, y_min, x_max, y_max


def flatten_segments_for_batch(
    video_id: int,
    segments: Dict[int, Dict[int, MaskType]],
    bbox_min_dim: int = 5,
) -> Dict[str, List]:
    """
    Flatten nested segmentation data into batched lists suitable for predicate
    models or downstream visualizations. Mirrors the notebook helper but is
    robust to differing mask dtypes/shapes.
    """
    batched_object_ids: List[Tuple[int, int, int]] = []
    batched_masks: List[np.ndarray] = []
    batched_bboxes: List[Tuple[int, int, int, int]] = []
    frame_pairs: List[Tuple[int, int, Tuple[int, int]]] = []

    for frame_id, frame_objects in segments.items():
        valid_objects: List[int] = []
        for object_id, raw_mask in frame_objects.items():
            mask = _to_numpy_mask(raw_mask)
            bbox = _mask_to_bbox(mask)
            if bbox is None:
                continue

            x_min, y_min, x_max, y_max = bbox
            if abs(y_max - y_min) < bbox_min_dim or abs(x_max - x_min) < bbox_min_dim:
                continue

            valid_objects.append(object_id)
            batched_object_ids.append((video_id, frame_id, object_id))
            batched_masks.append(mask)
            batched_bboxes.append(bbox)

        for i in valid_objects:
            for j in valid_objects:
                if i == j:
                    continue
                frame_pairs.append((video_id, frame_id, (i, j)))

    return {
        "object_ids": batched_object_ids,
        "masks": batched_masks,
        "bboxes": batched_bboxes,
        "pairs": frame_pairs,
    }


def extract_valid_object_pairs(
    batched_object_ids: Sequence[Tuple[int, int, int]],
    interested_object_pairs: Optional[Iterable[Tuple[int, int]]] = None,
) -> List[Tuple[int, int, Tuple[int, int]]]:
    """
    Filter object pairs per frame. If `interested_object_pairs` is provided, only
    emit those combinations when both objects are present; otherwise emit all
    permutations (i, j) with i != j for each frame.
    """
    frame_to_objects: Dict[Tuple[int, int], set] = defaultdict(set)
    for vid, fid, oid in batched_object_ids:
        frame_to_objects[(vid, fid)].add(oid)

    interested = (
        list(interested_object_pairs)
        if interested_object_pairs is not None
        else None
    )

    valid_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
    for (vid, fid), object_ids in frame_to_objects.items():
        if interested:
            for src, dst in interested:
                if src in object_ids and dst in object_ids:
                    valid_pairs.append((vid, fid, (src, dst)))
        else:
            for src in object_ids:
                for dst in object_ids:
                    if src == dst:
                        continue
                    valid_pairs.append((vid, fid, (src, dst)))

    return valid_pairs
src/vine_hf/push_to_hub.py
DELETED
@@ -1,232 +0,0 @@
"""
Script to push VINE model to HuggingFace Hub

This script helps you push your trained VINE model to the HuggingFace Hub
for easy sharing and distribution.
"""

import os
import sys
import torch
import argparse
from huggingface_hub import notebook_login
from transformers.pipelines import PIPELINE_REGISTRY

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

os.environ['OPENAI_API_KEY'] = "dummy-key"
from vine_hf import VineConfig, VineModel, VinePipeline


def push_vine_to_hub(
    model_weights_path: str,
    repo_name: str,
    model_name: str = "openai/clip-vit-base-patch32",
    segmentation_method: str = "grounding_dino_sam2",
    commit_message: str = "Upload VINE model",
    private: bool = False
):
    """
    Push VINE model to HuggingFace Hub.

    Args:
        model_weights_path: Path to the trained model weights (.pth file)
        repo_name: Name for the repository (e.g., "username/vine-model")
        model_name: CLIP model backbone name
        segmentation_method: Segmentation method used
        commit_message: Commit message for the push
        private: Whether to create a private repository
    """

    print("=== Pushing VINE Model to HuggingFace Hub ===")

    # 1. Create configuration
    print(f"Creating configuration with backbone: {model_name}")
    config = VineConfig(
        model_name=model_name,
        segmentation_method=segmentation_method
    )

    # 2. Initialize model
    print("Initializing model...")
    model = VineModel(config)

    # 3. Load trained weights
    if os.path.exists(model_weights_path):
        print(f"Loading weights from: {model_weights_path}")
        try:
            # Try loading with weights_only=False for compatibility
            weights = torch.load(model_weights_path, map_location='cpu', weights_only=False)

            # Handle different weight formats
            if isinstance(weights, dict):
                if 'state_dict' in weights:
                    model.load_state_dict(weights['state_dict'])
                elif 'model' in weights:
                    model.load_state_dict(weights['model'])
                else:
                    model.load_state_dict(weights)
            else:
                # Assume it's the model directly
                model = weights

            print("✓ Weights loaded successfully")
        except Exception as e:
            print(f"✗ Error loading weights: {e}")
            print("Please check your weights file format")
            return False
    else:
        print(f"✗ Weights file not found: {model_weights_path}")
        return False

    # 4. Register for auto classes
    print("Registering for auto classes...")
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")

    # 5. Register pipeline
    print("Registering pipeline...")
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # 6. Create pipeline instance
    print("Creating pipeline...")
    vine_pipeline = VinePipeline(model=model, tokenizer=None)

    try:
        # 7. Push configuration to hub
        print(f"Pushing configuration to {repo_name}...")
        config.push_to_hub(
            repo_name,
            commit_message=f"{commit_message} - config",
            private=private
        )
        print("✓ Configuration pushed successfully")

        # 8. Push model to hub
        print(f"Pushing model to {repo_name}...")
        model.push_to_hub(
            repo_name,
            commit_message=f"{commit_message} - model",
            private=private
        )
        print("✓ Model pushed successfully")

        # 9. Push pipeline to hub
        print(f"Pushing pipeline to {repo_name}...")
        vine_pipeline.push_to_hub(
            repo_name,
            commit_message=f"{commit_message} - pipeline",
            private=private
        )
        print("✓ Pipeline pushed successfully")

        print(f"\n🎉 Successfully pushed VINE model to: https://huggingface.co/{repo_name}")
        print(f"\nTo use your model:")
        print(f"```python")
        print(f"from transformers import pipeline")
        print(f"")
        print(f"vine_pipeline = pipeline(")
        print(f"    'vine-video-understanding',")
        print(f"    model='{repo_name}',")
        print(f"    trust_remote_code=True")
        print(f")")
        print(f"")
        print(f"results = vine_pipeline(")
        print(f"    'path/to/video.mp4',")
        print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
        print(f"    unary_keywords=['running', 'jumping'],")
        print(f"    binary_keywords=['chasing', 'behind']")
        print(f")")
        print(f"```")

        return True

    except Exception as e:
        print(f"✗ Error pushing to hub: {e}")
        print("Please check your HuggingFace credentials and repository permissions")
        return False


def main():
    parser = argparse.ArgumentParser(description="Push VINE model to HuggingFace Hub")

    parser.add_argument(
        "--weights",
        type=str,
        required=True,
        help="Path to the trained model weights (.pth file)"
    )

    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="Repository name (e.g., 'username/vine-model')"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="openai/clip-vit-base-patch32",
        help="CLIP model backbone name"
    )

    parser.add_argument(
        "--segmentation",
        type=str,
        default="grounding_dino_sam2",
        choices=["sam2", "grounding_dino_sam2"],
        help="Segmentation method"
    )

    parser.add_argument(
        "--message",
        type=str,
        default="Upload VINE model",
        help="Commit message"
    )

    parser.add_argument(
        "--private",
        action="store_true",
        help="Create private repository"
    )

    parser.add_argument(
        "--login",
        action="store_true",
        help="Login to HuggingFace Hub first"
    )

    args = parser.parse_args()

    # Login if requested
    if args.login:
        print("Logging in to HuggingFace Hub...")
        notebook_login()

    # Push model
    success = push_vine_to_hub(
        model_weights_path=args.weights,
        repo_name=args.repo,
        model_name=args.model_name,
        segmentation_method=args.segmentation,
        commit_message=args.message,
        private=args.private
    )

    if success:
        print("\n✅ Model successfully pushed to HuggingFace Hub!")
    else:
        print("\n❌ Failed to push model to HuggingFace Hub")
        sys.exit(1)


if __name__ == "__main__":
    main()
src/vine_hf/setup.py
DELETED
@@ -1,63 +0,0 @@
"""
Setup script for VINE HuggingFace Interface
"""

from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="vine-hf",
    version="1.0.0",
    author="LASER Team",
    author_email="[email protected]",
    description="HuggingFace interface for VINE (Video Understanding with Natural Language)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/your-username/vine-hf",
    packages=["vine_hf"],
    package_dir={"vine_hf": "."},
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Multimedia :: Video",
    ],
    python_requires=">=3.7",
    install_requires=[
        "torch>=1.9.0",
        "torchvision>=0.10.0",
        "transformers>=4.20.0",
        "opencv-python>=4.5.0",
        "pillow>=8.0.0",
        "numpy>=1.20.0",
        "huggingface-hub>=0.10.0",
        "tqdm>=4.60.0",
    ],
    extras_require={
        "dev": [
            "pytest>=6.0",
            "black>=22.0",
            "flake8>=4.0",
            "isort>=5.0",
        ],
        "segmentation": [
            # Note: SAM2 and Grounding DINO need to be installed separately
            # as they're not available on PyPI
        ],
    },
    entry_points={
        "console_scripts": [
            "vine-push-to-hub=vine_hf.push_to_hub:main",
        ],
    },
)
src/vine_hf/vine_config.py
DELETED
@@ -1,108 +0,0 @@
import torch
from transformers import PretrainedConfig
from typing import List, Optional, Dict, Any, Tuple
from pathlib import Path


class VineConfig(PretrainedConfig):
    """
    Configuration class for VINE (Video Understanding with Natural Language) model.

    VINE is a video understanding model that processes categorical (object class names),
    unary keywords (actions on one object), and binary keywords (relations between two objects),
    and returns probability distributions over all of them when passed a video.

    Args:
        model_name (str): The CLIP model name to use as backbone. Default: "openai/clip-vit-large-patch14-336"
        hidden_dim (int): Hidden dimension size. Default: 768
        num_top_pairs (int): Number of top object pairs to consider. Default: 10
        segmentation_method (str): Segmentation method to use ("sam2" or "grounding_dino_sam2"). Default: "grounding_dino_sam2"
        box_threshold (float): Box threshold for Grounding DINO. Default: 0.35
        text_threshold (float): Text threshold for Grounding DINO. Default: 0.25
        target_fps (int): Target FPS for video processing. Default: 1
        alpha (float): Alpha value for object extraction. Default: 0.5
        white_alpha (float): White alpha value for background blending. Default: 0.8
        topk_cate (int): Top-k categories to return. Default: 3
        multi_class (bool): Whether to use multi-class classification. Default: False
        output_logit (bool): Whether to output logits instead of probabilities. Default: False
        max_video_length (int): Maximum number of frames to process. Default: 100
        bbox_min_dim (int): Minimum bounding box dimension. Default: 5
        visualize (bool): Whether to visualize results. Default: False
        visualization_dir (str, optional): Directory to save visualizations. Default: None
        debug_visualizations (bool): Whether to save debug visualizations. Default: False
        return_flattened_segments (bool): Whether to return flattened segments. Default: False
        return_valid_pairs (bool): Whether to return valid object pairs. Default: False
        interested_object_pairs (List[Tuple[int, int]], optional): List of interested object pairs
    """

    model_type = "vine"

    def __init__(
        self,
        model_name: str = "openai/clip-vit-base-patch32",
        hidden_dim = 768,

        use_hf_repo: bool = True,
        model_repo: Optional[str] = "KevinX-Penn28/testing",
        model_file: Optional[str] = None,
        local_dir: Optional[str] = str(Path(__file__).resolve().parent),
        local_filename: Optional[str] = "laser_model_v1.pkl",

        num_top_pairs: int = 18,
        segmentation_method: str = "grounding_dino_sam2",
        box_threshold: float = 0.35,
        text_threshold: float = 0.25,
        target_fps: int = 1,
        alpha: float = 0.5,
        white_alpha: float = 0.8,
        topk_cate: int = 3,
        multi_class: bool = False,
        output_logit: bool = False,
        max_video_length: int = 100,
        bbox_min_dim: int = 5,
        visualize: bool = False,
        visualization_dir: Optional[str] = None,
        return_flattened_segments: bool = False,
        return_valid_pairs: bool = False,
        interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
        debug_visualizations: bool = False,
        device: Optional[str | int] = None,
        **kwargs
    ):
        self.model_name = model_name
        self.use_hf_repo = use_hf_repo
        if use_hf_repo:
            self.model_repo = model_repo
            self.model_file = model_file
            self.local_dir = None
            self.local_filename = None
        else:
            self.model_repo = None
            self.model_file = None
            self.local_dir = local_dir
            self.local_filename = local_filename
        self.hidden_dim = hidden_dim
        self.num_top_pairs = num_top_pairs
        self.segmentation_method = segmentation_method
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        self.target_fps = target_fps
        self.alpha = alpha
        self.white_alpha = white_alpha
        self.topk_cate = topk_cate
        self.multi_class = multi_class
        self.output_logit = output_logit
        self.max_video_length = max_video_length
        self.bbox_min_dim = bbox_min_dim
        self.visualize = visualize
        self.visualization_dir = visualization_dir
        self.return_flattened_segments = return_flattened_segments
        self.return_valid_pairs = return_valid_pairs
        self.interested_object_pairs = interested_object_pairs or []
        self.debug_visualizations = debug_visualizations
        if device is int:
            self._device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
        else:
            self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        super().__init__(**kwargs)
src/vine_hf/vine_hf.egg-info/PKG-INFO
DELETED
|
@@ -1,401 +0,0 @@
|
|
| 1 |
-
Metadata-Version: 2.4
|
| 2 |
-
Name: vine-hf
|
| 3 |
-
Version: 1.0.0
|
| 4 |
-
Summary: HuggingFace interface for VINE (Video Understanding with Natural Language)
|
| 5 |
-
Home-page: https://github.com/your-username/vine-hf
|
| 6 |
-
Author: LASER Team
|
| 7 |
-
Author-email: [email protected]
|
| 8 |
-
Classifier: Development Status :: 4 - Beta
|
| 9 |
-
Classifier: Intended Audience :: Developers
|
| 10 |
-
Classifier: Intended Audience :: Science/Research
|
| 11 |
-
Classifier: License :: OSI Approved :: MIT License
|
| 12 |
-
Classifier: Operating System :: OS Independent
|
| 13 |
-
Classifier: Programming Language :: Python :: 3
|
| 14 |
-
Classifier: Programming Language :: Python :: 3.7
|
| 15 |
-
Classifier: Programming Language :: Python :: 3.8
|
| 16 |
-
Classifier: Programming Language :: Python :: 3.9
|
| 17 |
-
Classifier: Programming Language :: Python :: 3.10
|
| 18 |
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 19 |
-
Classifier: Topic :: Multimedia :: Video
|
| 20 |
-
Requires-Python: >=3.7
|
| 21 |
-
Description-Content-Type: text/markdown
|
| 22 |
-
Requires-Dist: torch>=1.9.0
|
| 23 |
-
Requires-Dist: torchvision>=0.10.0
|
| 24 |
-
Requires-Dist: transformers>=4.20.0
|
| 25 |
-
Requires-Dist: opencv-python>=4.5.0
|
| 26 |
-
Requires-Dist: pillow>=8.0.0
|
| 27 |
-
Requires-Dist: numpy>=1.20.0
|
| 28 |
-
Requires-Dist: huggingface-hub>=0.10.0
|
| 29 |
-
Requires-Dist: tqdm>=4.60.0
|
| 30 |
-
Provides-Extra: dev
|
| 31 |
-
Requires-Dist: pytest>=6.0; extra == "dev"
|
| 32 |
-
Requires-Dist: black>=22.0; extra == "dev"
|
| 33 |
-
Requires-Dist: flake8>=4.0; extra == "dev"
|
| 34 |
-
Requires-Dist: isort>=5.0; extra == "dev"
|
| 35 |
-
Provides-Extra: segmentation
|
| 36 |
-
Dynamic: author
|
| 37 |
-
Dynamic: author-email
|
| 38 |
-
Dynamic: classifier
|
| 39 |
-
Dynamic: description
|
| 40 |
-
Dynamic: description-content-type
|
| 41 |
-
Dynamic: home-page
|
| 42 |
-
Dynamic: provides-extra
|
| 43 |
-
Dynamic: requires-dist
|
| 44 |
-
Dynamic: requires-python
|
| 45 |
-
Dynamic: summary
|
| 46 |
-
|
| 47 |
-
# VINE HuggingFace Interface
|
| 48 |
-
|
| 49 |
-
VINE (Video Understanding with Natural Language) is a model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.
|
| 50 |
-
|
| 51 |
-
This package provides a HuggingFace-compatible interface for the VINE model, making it easy to use for video understanding tasks.
|
| 52 |
-
|
| 53 |
-
## Features
|
| 54 |
-
|
| 55 |
-
- **Categorical Classification**: Classify objects in videos (e.g., "human", "dog", "frisbee")
|
| 56 |
-
- **Unary Predicates**: Detect actions on single objects (e.g., "running", "jumping", "sitting")
|
| 57 |
-
- **Binary Relations**: Detect relationships between object pairs (e.g., "behind", "in front of", "chasing")
|
| 58 |
-
- **Multiple Segmentation Methods**: Support for SAM2 and Grounding DINO + SAM2
|
| 59 |
-
- **HuggingFace Integration**: Full compatibility with HuggingFace transformers and pipelines
|
| 60 |
-
- **Visualization Hooks**: Optional high-level visualizations plus lightweight debug mask dumps for quick sanity checks
|
| 61 |
-
|
| 62 |
-
## Installation
|
| 63 |
-
|
| 64 |
-
```bash
|
| 65 |
-
# Install the package (assuming it's in your Python path)
|
| 66 |
-
pip install transformers torch torchvision
|
| 67 |
-
pip install opencv-python pillow numpy
|
| 68 |
-
|
| 69 |
-
# For segmentation functionality, you'll also need:
|
| 70 |
-
# - SAM2: https://github.com/facebookresearch/sam2
|
| 71 |
-
# - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
|
| 72 |
-
```
|
| 73 |
-
|
| 74 |
-
## Segmentation Model Configuration
|
| 75 |
-
|
| 76 |
-
`VinePipeline` lazily brings up the segmentation stack the first time a call needs masks. Thresholds, FPS, visualization toggles, and device selection live in `VineConfig`; the pipeline constructor tells it where to fetch SAM2 / GroundingDINO weights or lets you inject already-instantiated modules.
|
| 77 |
-
|
| 78 |
-
### Provide file paths at construction (most common)
|
| 79 |
-
|
| 80 |
-
```python
|
| 81 |
-
from vine_hf import VineConfig, VineModel, VinePipeline
|
| 82 |
-
|
| 83 |
-
vine_config = VineConfig(
|
| 84 |
-
segmentation_method="grounding_dino_sam2", # or "sam2"
|
| 85 |
-
box_threshold=0.35,
|
| 86 |
-
text_threshold=0.25,
|
| 87 |
-
target_fps=5,
|
| 88 |
-
visualization_dir="output/visualizations", # where to write visualizations (and debug visualizations if enabled)
|
| 89 |
-
debug_visualizations=True, # Write videos of the groundingDINO/SAM2/Binary/Unary, etc... outputs
|
| 90 |
-
pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
|
| 91 |
-
device="cuda:0", # accepts int, str, or torch.device
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
-
vine_model = VineModel(vine_config)
|
| 95 |
-
|
| 96 |
-
vine_pipeline = VinePipeline(
|
| 97 |
-
model=vine_model,
|
| 98 |
-
tokenizer=None,
|
| 99 |
-
sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
|
| 100 |
-
sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
|
| 101 |
-
gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
|
| 102 |
-
gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
|
| 103 |
-
device=vine_config._device,
|
| 104 |
-
)
|
| 105 |
-
```
|
| 106 |
-
|
| 107 |
-
When `segmentation_method="grounding_dino_sam2"`, both SAM2 and GroundingDINO must be reachable. The pipeline validates the paths; missing files raise a `ValueError`. If you pick `"sam2"`, only the SAM2 config and checkpoint are required.
|
| 108 |
-
|
| 109 |
-
### Reuse pre-initialized segmentation modules
|
| 110 |
-
|
| 111 |
-
If you build the segmentation stack elsewhere, inject the components with `set_segmentation_models` before running the pipeline:
|
| 112 |
-
|
| 113 |
-
```python
|
| 114 |
-
from sam2.build_sam import build_sam2_video_predictor, build_sam2
|
| 115 |
-
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
|
| 116 |
-
from groundingdino.util.inference import Model as GroundingDINOModel
|
| 117 |
-
|
| 118 |
-
sam_predictor = build_sam2_video_predictor(..., device=vine_config._device)
|
| 119 |
-
mask_generator = SAM2AutomaticMaskGenerator(build_sam2(..., device=vine_config._device))
|
| 120 |
-
grounding_model = GroundingDINOModel(..., device=vine_config._device)
|
| 121 |
-
|
| 122 |
-
vine_pipeline.set_segmentation_models(
|
| 123 |
-
sam_predictor=sam_predictor,
|
| 124 |
-
mask_generator=mask_generator,
|
| 125 |
-
grounding_model=grounding_model,
|
| 126 |
-
)
|
| 127 |
-
```
|
| 128 |
-
|
| 129 |
-
Any argument left as `None` is initialized lazily from the file paths when the pipeline first needs that backend.
|
| 130 |
-
|
| 131 |
-
## Quick Start
|
| 132 |
-
|
| 133 |
-
## Requirements
|
| 134 |
-
-torch
|
| 135 |
-
-torchvision
|
| 136 |
-
-transformers
|
| 137 |
-
-opencv-python
|
| 138 |
-
-matplotlib
|
| 139 |
-
-seaborn
|
| 140 |
-
-pandas
|
| 141 |
-
-numpy
|
| 142 |
-
-ipywidgets
|
| 143 |
-
-tqdm
|
| 144 |
-
-scikit-learn
|
| 145 |
-
-sam2 (from Facebook Research) "https://github.com/video-fm/video-sam2"
|
| 146 |
-
-sam2 weights (downloaded separately. EX: https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
|
| 147 |
-
-groundingdino (from IDEA Research)
|
| 148 |
-
-groundingdino weights (downloaded separately. EX:https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth)
|
| 149 |
-
-spacy-fastlang
|
| 150 |
-
-en-core-web-sm (for spacy-fastlang)
|
| 151 |
-
-ffmpeg (for video processing)
|
| 152 |
-
-(optional) laser weights/full model checkpoint (downloaded separately. EX: https://huggingface.co/video-fm/vine_v0)
|
| 153 |
-
|
| 154 |
-
Usually, by running the laser/environments/laser_env.yml from the LASER repo, most dependencies will be installed. You will need to manually install sam2 and groundingdino as per their instructions.
|
| 155 |
-
|
| 156 |
-
### Using the Pipeline (Recommended)
|
| 157 |
-
```python
|
| 158 |
-
from transformers.pipelines import PIPELINE_REGISTRY
|
| 159 |
-
from vine_hf import VineConfig, VineModel, VinePipeline
|
| 160 |
-
|
| 161 |
-
PIPELINE_REGISTRY.register_pipeline(
|
| 162 |
-
"vine-video-understanding",
|
| 163 |
-
pipeline_class=VinePipeline,
|
| 164 |
-
pt_model=VineModel,
|
| 165 |
-
type="multimodal",
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
config = VineConfig(
|
| 169 |
-
segmentation_method="grounding_dino_sam2",
|
| 170 |
-
pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
|
| 171 |
-
visualization_dir="output",
|
| 172 |
-
visualize=True,
|
| 173 |
-
device="cuda:0",
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
-
model = VineModel(config)
|
| 177 |
-
|
| 178 |
-
vine_pipeline = VinePipeline(
|
| 179 |
-
model=model,
|
| 180 |
-
tokenizer=None,
|
| 181 |
-
sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
|
| 182 |
-
sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
|
| 183 |
-
gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
|
| 184 |
-
gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
|
| 185 |
-
device=config._device,
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
results = vine_pipeline(
|
| 189 |
-
"/path/to/video.mp4",
|
| 190 |
-
categorical_keywords=["dog", "human"],
|
| 191 |
-
unary_keywords=["running"],
|
| 192 |
-
binary_keywords=["chasing"],
|
| 193 |
-
object_pairs=[(0, 1)],
|
| 194 |
-
return_top_k=3,
|
| 195 |
-
include_visualizations=True,
|
| 196 |
-
)
|
| 197 |
-
print(results["summary"])
|
| 198 |
-
```
|
| 199 |
-
|
| 200 |
-
### Using the Model Directly (Advanced)
|
| 201 |
-
|
| 202 |
-
For advanced users who want to provide their own segmentation:
|
| 203 |
-
|
| 204 |
-
```python
|
| 205 |
-
from vine_hf import VineConfig, VineModel
|
| 206 |
-
import torch
|
| 207 |
-
|
| 208 |
-
# Create configuration
|
| 209 |
-
config = VineConfig(
|
| 210 |
-
pretrained_vine_path="/path/to/your/vine/weights" # Optional: your fine-tuned weights
|
| 211 |
-
)
|
| 212 |
-
|
| 213 |
-
# Initialize model
|
| 214 |
-
model = VineModel(config)
|
| 215 |
-
|
| 216 |
-
# If you have your own video frames, masks, and bboxes from external segmentation
|
| 217 |
-
video_frames = torch.randn(3, 224, 224, 3) * 255 # Your video frames
|
| 218 |
-
masks = {0: {1: torch.ones(224, 224, 1)}} # Your segmentation masks
|
| 219 |
-
bboxes = {0: {1: [50, 50, 150, 150]}} # Your bounding boxes
|
| 220 |
-
|
| 221 |
-
# Run prediction
|
| 222 |
-
results = model.predict(
|
| 223 |
-
video_frames=video_frames,
|
| 224 |
-
masks=masks,
|
| 225 |
-
bboxes=bboxes,
|
| 226 |
-
categorical_keywords=['human', 'dog', 'frisbee'],
|
| 227 |
-
unary_keywords=['running', 'jumping'],
|
| 228 |
-
binary_keywords=['chasing', 'following'],
|
| 229 |
-
object_pairs=[(1, 2)],
|
| 230 |
-
return_top_k=3
|
| 231 |
-
)
|
| 232 |
-
```
|
| 233 |
-
|
| 234 |
-
**Note**: For most users, the pipeline approach above is recommended as it handles video loading and segmentation automatically.
|
| 235 |
-
|
| 236 |
-
## Configuration Options
|
| 237 |
-
|
| 238 |
-
The `VineConfig` class supports the following parameters (non-exhaustive):
|
| 239 |
-
|
| 240 |
-
- `model_name`: CLIP model backbone (default: `"openai/clip-vit-large-patch14-336"`)
|
| 241 |
-
- `pretrained_vine_path`: Optional path or Hugging Face repo with pretrained VINE weights
|
| 242 |
-
- `segmentation_method`: `"sam2"` or `"grounding_dino_sam2"` (default: `"grounding_dino_sam2"`)
|
| 243 |
-
- `box_threshold` / `text_threshold`: Grounding DINO thresholds
|
| 244 |
-
- `target_fps`: Target FPS for video processing (default: `1`)
|
| 245 |
-
- `alpha`, `white_alpha`: Rendering parameters used when extracting masked crops
|
| 246 |
-
- `topk_cate`: Top-k categories to return per object (default: `3`)
|
| 247 |
-
- `max_video_length`: Maximum frames to process (default: `100`)
|
| 248 |
-
- `visualize`: When `True`, pipeline post-processing attempts to create stitched visualizations
|
| 249 |
-
- `visualization_dir`: Optional base directory where visualization assets are written
|
| 250 |
-
- `debug_visualizations`: When `True`, the model saves a single first-frame mask composite for quick inspection
|
| 251 |
-
- `debug_visualization_path`: Target filepath for the debug mask composite (must point to a writable file)
|
| 252 |
-
- `return_flattened_segments`, `return_valid_pairs`, `interested_object_pairs`: Advanced geometry outputs for downstream consumers
|
| 253 |
-
|
| 254 |
-
## Output Format
|
| 255 |
-
|
| 256 |
-
The model returns a dictionary with the following structure:
|
| 257 |
-
|
| 258 |
-
```python
|
| 259 |
-
{
|
| 260 |
-
"masks" : {},
|
| 261 |
-
|
| 262 |
-
"boxes" : {},
|
| 263 |
-
|
| 264 |
-
"categorical_predictions": {
|
| 265 |
-
object_id: [(probability, category), ...]
|
| 266 |
-
},
|
| 267 |
-
"unary_predictions": {
|
| 268 |
-
(frame_id, object_id): [(probability, action), ...]
|
| 269 |
-
},
|
| 270 |
-
"binary_predictions": {
|
| 271 |
-
(frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
|
| 272 |
-
},
|
| 273 |
-
"confidence_scores": {
|
| 274 |
-
"categorical": max_categorical_confidence,
|
| 275 |
-
"unary": max_unary_confidence,
|
| 276 |
-
"binary": max_binary_confidence
|
| 277 |
-
},
|
| 278 |
-
"summary": {
|
| 279 |
-
"num_objects_detected": int,
|
| 280 |
-
"top_categories": [(category, probability), ...],
|
| 281 |
-
"top_actions": [(action, probability), ...],
|
| 282 |
-
"top_relations": [(relation, probability), ...]
|
| 283 |
-
}
|
| 284 |
-
}
|
| 285 |
-
```
|
| 286 |
-
|
| 287 |
-
## Visualization & Debugging
|
| 288 |
-
|
| 289 |
-
There are two complementary visualization layers:
|
| 290 |
-
|
| 291 |
-
- **Post-process visualizations** (`include_visualizations=True` in the pipeline call) produces a high-level stitched video summarizing detections, actions, and relations over time.
|
| 292 |
-
|
| 293 |
-
- **Debug visualizations** (`debug_visualizations=True` in `VineConfig`) dumps videos of intermediate segmentation masks and outputs from GroundingDINO, SAM2, Unary, Binary, etc. for quick sanity checks.
|
| 294 |
-
|
| 295 |
-
If you plan to enable either option, ensure the relevant output directories exist before running the pipeline.
|
| 296 |
-
|
| 297 |
-
## Segmentation Methods
|
| 298 |
-
|
| 299 |
-
### Grounding DINO + SAM2 (Recommended)
|
| 300 |
-
|
| 301 |
-
Uses Grounding DINO for object detection based on text prompts, then SAM2 for precise segmentation.
|
| 302 |
-
|
| 303 |
-
Requirements:
|
| 304 |
-
- Grounding DINO model and weights
|
| 305 |
-
- SAM2 model and weights
|
| 306 |
-
- Properly configured paths to model checkpoints
|
| 307 |
-
|
| 308 |
-
### SAM2 Only
|
| 309 |
-
|
| 310 |
-
Uses SAM2's automatic mask generation without text-based object detection.
|
| 311 |
-
|
| 312 |
-
Requirements:
|
| 313 |
-
- SAM2 model and weights
|
| 314 |
-
|
| 315 |
-
## Model Architecture
|
| 316 |
-
|
| 317 |
-
VINE is built on top of CLIP and uses three separate CLIP models for different tasks:
|
| 318 |
-
- **Categorical Model**: For object classification
|
| 319 |
-
- **Unary Model**: For single-object action recognition
|
| 320 |
-
- **Binary Model**: For relationship detection between object pairs
|
| 321 |
-
|
| 322 |
-
Each model processes both visual and textual features to compute similarity scores and probability distributions.
|
| 323 |
-
|
| 324 |
-
## Pushing to HuggingFace Hub
|
| 325 |
-
|
| 326 |
-
```python
|
| 327 |
-
from vine_hf import VineConfig, VineModel
|
| 328 |
-
|
| 329 |
-
# Create and configure your model
|
| 330 |
-
config = VineConfig()
|
| 331 |
-
model = VineModel(config)
|
| 332 |
-
|
| 333 |
-
# Load your pretrained weights
|
| 334 |
-
# model.load_state_dict(torch.load('path/to/your/weights.pth'))
|
| 335 |
-
|
| 336 |
-
# Register for auto classes
|
| 337 |
-
config.register_for_auto_class()
|
| 338 |
-
model.register_for_auto_class("AutoModel")
|
| 339 |
-
|
| 340 |
-
# Push to Hub
|
| 341 |
-
config.push_to_hub('your-username/vine-model')
|
| 342 |
-
model.push_to_hub('your-username/vine-model')
|
| 343 |
-
```
|
| 344 |
-
|
| 345 |
-
## Loading from HuggingFace Hub
|
| 346 |
-
|
| 347 |
-
```python
|
| 348 |
-
from transformers import AutoModel, pipeline
|
| 349 |
-
|
| 350 |
-
# Load model
|
| 351 |
-
model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
|
| 352 |
-
|
| 353 |
-
# Or use with pipeline
|
| 354 |
-
vine_pipeline = pipeline(
|
| 355 |
-
'vine-video-understanding',
|
| 356 |
-
model='your-username/vine-model',
|
| 357 |
-
trust_remote_code=True
|
| 358 |
-
)
|
| 359 |
-
```

## Examples

See `example_usage.py` for comprehensive examples including:
- Direct model usage
- Pipeline usage
- HuggingFace Hub integration
- Real video processing

## Requirements

- Python 3.7+
- PyTorch 1.9+
- transformers 4.20+
- OpenCV
- PIL/Pillow
- NumPy

For segmentation:
- SAM2 (Facebook Research)
- Grounding DINO (IDEA Research)

## Citation

If you use VINE in your research, please cite:

```bibtex
@article{vine2024,
  title={VINE: Video Understanding with Natural Language},
  author={Your Authors},
  journal={Your Journal},
  year={2024}
}
```

## License

[Your License Here]

## Contact

[Your Contact Information Here]

src/vine_hf/vine_hf.egg-info/SOURCES.txt
DELETED
@@ -1,21 +0,0 @@
README.md
setup.py
./__init__.py
./convert_inference.py
./example_ensemble_weights.py
./example_sam2_masks.py
./example_usage.py
./example_visualization.py
./example_with_pretrained_vine.py
./flattening.py
./push_to_hub.py
./vine_config.py
./vine_model.py
./vine_pipeline.py
./vis_utils.py
vine_hf.egg-info/PKG-INFO
vine_hf.egg-info/SOURCES.txt
vine_hf.egg-info/dependency_links.txt
vine_hf.egg-info/entry_points.txt
vine_hf.egg-info/requires.txt
vine_hf.egg-info/top_level.txt

src/vine_hf/vine_hf.egg-info/dependency_links.txt
DELETED
@@ -1 +0,0 @@

src/vine_hf/vine_hf.egg-info/entry_points.txt
DELETED
@@ -1,2 +0,0 @@
[console_scripts]
vine-push-to-hub = vine_hf.push_to_hub:main

src/vine_hf/vine_hf.egg-info/requires.txt
DELETED
@@ -1,16 +0,0 @@
torch>=1.9.0
torchvision>=0.10.0
transformers>=4.20.0
opencv-python>=4.5.0
pillow>=8.0.0
numpy>=1.20.0
huggingface-hub>=0.10.0
tqdm>=4.60.0

[dev]
pytest>=6.0
black>=22.0
flake8>=4.0
isort>=5.0

[segmentation]
src/vine_hf/vine_hf.egg-info/top_level.txt
DELETED
@@ -1 +0,0 @@
vine_hf

src/vine_hf/vine_model.py
DELETED
|
@@ -1,702 +0,0 @@
|
|
| 1 |
-
from flax import config
|
| 2 |
-
import torch
|
| 3 |
-
from torch import nn
|
| 4 |
-
import torch.nn.functional as F
|
| 5 |
-
import torch.utils.checkpoint as cp
|
| 6 |
-
from transformers import PreTrainedModel, AutoTokenizer, AutoModel, AutoProcessor
|
| 7 |
-
from typing import Dict, List, Tuple, Optional, Any, Union
|
| 8 |
-
import numpy as np
|
| 9 |
-
import os
|
| 10 |
-
import cv2
|
| 11 |
-
from collections import defaultdict
|
| 12 |
-
import builtins
|
| 13 |
-
import sys
|
| 14 |
-
from laser.models import llava_clip_model_v3
|
| 15 |
-
sys.modules["llava_clip_model_v3"] = llava_clip_model_v3
|
| 16 |
-
from safetensors.torch import load_file
|
| 17 |
-
|
| 18 |
-
import inspect
|
| 19 |
-
from transformers.models.clip import modeling_clip
|
| 20 |
-
import transformers
|
| 21 |
-
from huggingface_hub import snapshot_download
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
from .vine_config import VineConfig
|
| 27 |
-
from laser.models.model_utils import (
|
| 28 |
-
extract_single_object,
|
| 29 |
-
extract_object_subject,
|
| 30 |
-
crop_image_contain_bboxes,
|
| 31 |
-
segment_list
|
| 32 |
-
)
|
| 33 |
-
from .flattening import (
|
| 34 |
-
extract_valid_object_pairs,
|
| 35 |
-
flatten_segments_for_batch,
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
from .vis_utils import save_mask_one_image
|
| 39 |
-
|
| 40 |
-
class VineModel(PreTrainedModel):
|
| 41 |
-
"""
|
| 42 |
-
VINE (Video Understanding with Natural Language) Model
|
| 43 |
-
|
| 44 |
-
This model processes videos along with categorical, unary, and binary keywords
|
| 45 |
-
to return probability distributions over those keywords for detected objects
|
| 46 |
-
and their relationships in the video.
|
| 47 |
-
"""
|
| 48 |
-
|
| 49 |
-
config_class = VineConfig
|
| 50 |
-
|
| 51 |
-
def __init__(self, config: VineConfig):
|
| 52 |
-
super().__init__(config)
|
| 53 |
-
|
| 54 |
-
self.config = config
|
| 55 |
-
self.visualize = getattr(config, "visualize", False)
|
| 56 |
-
self.visualization_dir = getattr(config, "visualization_dir", None)
|
| 57 |
-
self.debug_visualizations = getattr(config, "debug_visualizations", False)
|
| 58 |
-
self._device = getattr(config, "_device")
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# Initialize CLIP components
|
| 63 |
-
self.clip_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
|
| 64 |
-
if self.clip_tokenizer.pad_token is None:
|
| 65 |
-
self.clip_tokenizer.pad_token = (
|
| 66 |
-
self.clip_tokenizer.unk_token
|
| 67 |
-
if self.clip_tokenizer.unk_token
|
| 68 |
-
else self.clip_tokenizer.eos_token
|
| 69 |
-
)
|
| 70 |
-
self.clip_processor = AutoProcessor.from_pretrained(config.model_name)
|
| 71 |
-
self.clip_cate_model = AutoModel.from_pretrained(config.model_name)
|
| 72 |
-
self.clip_unary_model = AutoModel.from_pretrained(config.model_name)
|
| 73 |
-
self.clip_binary_model = AutoModel.from_pretrained(config.model_name)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# Then try to load pretrained VINE weights if specified
|
| 77 |
-
if config.use_hf_repo:
|
| 78 |
-
self._load_huggingface_vine_weights(config.model_repo, config.model_file)
|
| 79 |
-
else:
|
| 80 |
-
self._load_local_pretrained_vine_weights(config.local_dir, config.local_filename)
|
| 81 |
-
|
| 82 |
-
# Move models to device
|
| 83 |
-
self.to(self._device)
|
| 84 |
-
|
| 85 |
-
def _load_huggingface_vine_weights(self, model_repo: str, model_file: Optional[str] = None):
|
| 86 |
-
"""
|
| 87 |
-
Load pretrained VINE weights from HuggingFace Hub.
|
| 88 |
-
"""
|
| 89 |
-
try:
|
| 90 |
-
print(f"Loading VINE weights from HuggingFace repo: {model_repo}")
|
| 91 |
-
repo_path = snapshot_download(model_repo, revision=model_file or "main")
|
| 92 |
-
weights = load_file(os.path.join(repo_path, "model.safetensors"))
|
| 93 |
-
self.load_state_dict(weights, strict=False)
|
| 94 |
-
print("✓ Successfully loaded VINE weights from HuggingFace Hub")
|
| 95 |
-
return True
|
| 96 |
-
except Exception as e:
|
| 97 |
-
print(f"✗ Error loading VINE weights from HuggingFace Hub: {e}")
|
| 98 |
-
print("Using base CLIP models instead")
|
| 99 |
-
return False
|
| 100 |
-
|
| 101 |
-
def _load_local_pretrained_vine_weights(self, local_dir: str, local_filename: Optional[str] = None, epoch: int = 0):
|
| 102 |
-
"""
|
| 103 |
-
Load pretrained VINE weights from a saved .pt file or ensemble format.
|
| 104 |
-
"""
|
| 105 |
-
#try: # simple .pt or .pth checkpoint
|
| 106 |
-
|
| 107 |
-
# x = torch.load(pretrained_path, map_location=self._device, weights_only=False)
|
| 108 |
-
# print(f"Loaded VINE checkpoint type: {type(x)}")
|
| 109 |
-
full_path = os.path.join(local_dir, local_filename) if local_filename else local_dir
|
| 110 |
-
|
| 111 |
-
if full_path.endswith(".pkl"):
|
| 112 |
-
print(f"Loading VINE weights from: {full_path}")
|
| 113 |
-
loaded_vine_model = torch.load(full_path, map_location=self._device, weights_only=False)
|
| 114 |
-
|
| 115 |
-
print(f"Loaded state type: {type(loaded_vine_model)}")
|
| 116 |
-
if not isinstance(loaded_vine_model, dict):
|
| 117 |
-
if hasattr(loaded_vine_model, 'clip_cate_model'):
|
| 118 |
-
self.clip_cate_model.load_state_dict(loaded_vine_model.clip_cate_model.state_dict())
|
| 119 |
-
if hasattr(loaded_vine_model, 'clip_unary_model'):
|
| 120 |
-
self.clip_unary_model.load_state_dict(loaded_vine_model.clip_unary_model.state_dict())
|
| 121 |
-
if hasattr(loaded_vine_model, 'clip_binary_model'):
|
| 122 |
-
self.clip_binary_model.load_state_dict(loaded_vine_model.clip_binary_model.state_dict())
|
| 123 |
-
return True
|
| 124 |
-
|
| 125 |
-
elif full_path.endswith(".pt") or full_path.endswith(".pth"):
|
| 126 |
-
state = torch.load(full_path, map_location=self._device, weights_only=True)
|
| 127 |
-
print(f"Loaded state type: {type(state)}")
|
| 128 |
-
self.load_state_dict(state)
|
| 129 |
-
return True
|
| 130 |
-
|
| 131 |
-
# handle directory + epoch format
|
| 132 |
-
if os.path.isdir(full_path):
|
| 133 |
-
model_files = [f for f in os.listdir(full_path) if f.endswith(f'.{epoch}.model')]
|
| 134 |
-
if model_files:
|
| 135 |
-
model_file = os.path.join(full_path, model_files[0])
|
| 136 |
-
print(f"Loading VINE weights from: {model_file}")
|
| 137 |
-
pretrained_model = torch.load(model_file, map_location="cpu")
|
| 138 |
-
|
| 139 |
-
# Conversion from PredicateModel-like object to VineModel
|
| 140 |
-
# Only copy if attributes exist
|
| 141 |
-
if hasattr(pretrained_model, 'clip_cate_model'):
|
| 142 |
-
self.clip_cate_model.load_state_dict(pretrained_model.clip_cate_model.state_dict())
|
| 143 |
-
if hasattr(pretrained_model, 'clip_unary_model'):
|
| 144 |
-
self.clip_unary_model.load_state_dict(pretrained_model.clip_unary_model.state_dict())
|
| 145 |
-
if hasattr(pretrained_model, 'clip_binary_model'):
|
| 146 |
-
self.clip_binary_model.load_state_dict(pretrained_model.clip_binary_model.state_dict())
|
| 147 |
-
print("✓ Loaded all sub-model weights from ensemble format")
|
| 148 |
-
return True
|
| 149 |
-
else:
|
| 150 |
-
print(f"No model file found for epoch {epoch} in {full_path}")
|
| 151 |
-
return False
|
| 152 |
-
|
| 153 |
-
print("Unsupported format for pretrained_vine_path")
|
| 154 |
-
return False
|
| 155 |
-
|
| 156 |
-
# except Exception as e:
|
| 157 |
-
# print(f"✗ Error loading VINE weights: {e}")
|
| 158 |
-
# print("Using base CLIP models instead")
|
| 159 |
-
# return False
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
# def _load_pretrained_vine_weights(self, pretrained_path: str, epoch: int = 0):
|
| 164 |
-
# """
|
| 165 |
-
# Load pretrained VINE weights from local ensemble format.
|
| 166 |
-
|
| 167 |
-
# Args:
|
| 168 |
-
# pretrained_path: Path to the pretrained model directory or HF model name
|
| 169 |
-
# epoch: Epoch number to load (for ensemble format)
|
| 170 |
-
# """
|
| 171 |
-
# if pretrained_path == "video-fm/vine_v0":
|
| 172 |
-
# # Try to load from HuggingFace Hubtry:
|
| 173 |
-
# # ✅ TODO FIXED: Added support for loading .pt/.pth checkpoints with state dicts
|
| 174 |
-
# if pretrained_path.endswith(".pt") or pretrained_path.endswith(".pth"):
|
| 175 |
-
# print(f"Loading VINE weights from: {pretrained_path}")
|
| 176 |
-
# state = torch.load(pretrained_path, map_location="cpu")
|
| 177 |
-
|
| 178 |
-
# if "clip_cate_model" in state:
|
| 179 |
-
# self.clip_cate_model.load_state_dict(state["clip_cate_model"])
|
| 180 |
-
# print("✓ Loaded categorical model weights")
|
| 181 |
-
# if "clip_unary_model" in state:
|
| 182 |
-
# self.clip_unary_model.load_state_dict(state["clip_unary_model"])
|
| 183 |
-
# print("✓ Loaded unary model weights")
|
| 184 |
-
# if "clip_binary_model" in state:
|
| 185 |
-
# self.clip_binary_model.load_state_dict(state["clip_binary_model"])
|
| 186 |
-
# print("✓ Loaded binary model weights")
|
| 187 |
-
|
| 188 |
-
# if "clip_tokenizer" in state:
|
| 189 |
-
# self.clip_tokenizer = state["clip_tokenizer"]
|
| 190 |
-
# print("✓ Loaded tokenizer")
|
| 191 |
-
# if "clip_processor" in state:
|
| 192 |
-
# self.clip_processor = state["clip_processor"]
|
| 193 |
-
# print("✓ Loaded processor")
|
| 194 |
-
|
| 195 |
-
# print("✓ All VINE weights loaded successfully")
|
| 196 |
-
# return True
|
| 197 |
-
|
| 198 |
-
# # Load from local ensemble format
|
| 199 |
-
# try:
|
| 200 |
-
# if os.path.isdir(pretrained_path):
|
| 201 |
-
# # Directory format - look for ensemble file
|
| 202 |
-
# model_files = [f for f in os.listdir(pretrained_path) if f.endswith(f'.{epoch}.model')]
|
| 203 |
-
# if model_files:
|
| 204 |
-
# model_file = os.path.join(pretrained_path, model_files[0])
|
| 205 |
-
# else:
|
| 206 |
-
# print(f"No model file found for epoch {epoch} in {pretrained_path}")
|
| 207 |
-
# return False
|
| 208 |
-
# else:
|
| 209 |
-
# # Direct file path
|
| 210 |
-
# model_file = pretrained_path
|
| 211 |
-
|
| 212 |
-
# print(f"Loading VINE weights from: {model_file}")
|
| 213 |
-
|
| 214 |
-
# # Load the ensemble model (PredicateModel instance)
|
| 215 |
-
# # TODO: conversion from PredicateModel to VineModel
|
| 216 |
-
# pretrained_model = torch.load(model_file, map_location='cpu', weights_only=False)
|
| 217 |
-
|
| 218 |
-
# # Transfer weights from the pretrained model to our HuggingFace models
|
| 219 |
-
# if hasattr(pretrained_model, 'clip_cate_model'):
|
| 220 |
-
# self.clip_cate_model.load_state_dict(pretrained_model.clip_cate_model.state_dict())
|
| 221 |
-
# print("✓ Loaded categorical model weights")
|
| 222 |
-
|
| 223 |
-
# if hasattr(pretrained_model, 'clip_unary_model'):
|
| 224 |
-
# self.clip_unary_model.load_state_dict(pretrained_model.clip_unary_model.state_dict())
|
| 225 |
-
# print("✓ Loaded unary model weights")
|
| 226 |
-
|
| 227 |
-
# if hasattr(pretrained_model, 'clip_binary_model'):
|
| 228 |
-
# self.clip_binary_model.load_state_dict(pretrained_model.clip_binary_model.state_dict())
|
| 229 |
-
# print("✓ Loaded binary model weights")
|
| 230 |
-
|
| 231 |
-
# # Also transfer tokenizer and processor if available
|
| 232 |
-
# if hasattr(pretrained_model, 'clip_tokenizer'):
|
| 233 |
-
# self.clip_tokenizer = pretrained_model.clip_tokenizer
|
| 234 |
-
# print("✓ Loaded tokenizer")
|
| 235 |
-
|
| 236 |
-
# if hasattr(pretrained_model, 'clip_processor'):
|
| 237 |
-
# self.clip_processor = pretrained_model.clip_processor
|
| 238 |
-
# print("✓ Loaded processor")
|
| 239 |
-
|
| 240 |
-
# print("✓ Successfully loaded all VINE weights")
|
| 241 |
-
# return True
|
| 242 |
-
|
| 243 |
-
# except Exception as e:
|
| 244 |
-
# print(f"✗ Error loading VINE weights: {e}")
|
| 245 |
-
# print("Using base CLIP models instead")
|
| 246 |
-
# return False
|
| 247 |
-
|
| 248 |
-
@classmethod
|
| 249 |
-
def from_pretrained_vine(
|
| 250 |
-
cls,
|
| 251 |
-
model_path: str,
|
| 252 |
-
config: Optional[VineConfig] = None,
|
| 253 |
-
epoch: int = 0,
|
| 254 |
-
**kwargs
|
| 255 |
-
):
|
| 256 |
-
"""
|
| 257 |
-
Create VineModel from pretrained VINE weights.
|
| 258 |
-
|
| 259 |
-
Args:
|
| 260 |
-
model_path: Path to pretrained VINE model
|
| 261 |
-
config: Optional config, will create default if None
|
| 262 |
-
epoch: Epoch number to load
|
| 263 |
-
**kwargs: Additional arguments
|
| 264 |
-
|
| 265 |
-
Returns:
|
| 266 |
-
VineModel instance with loaded weights
|
| 267 |
-
"""
|
| 268 |
-
# Normalize the incoming model_path into the new VineConfig fields.
|
| 269 |
-
if config is None:
|
| 270 |
-
# Heuristics: if path looks like a HF repo (contains a "/" and
|
| 271 |
-
# doesn't exist on disk) treat it as a repo. Otherwise treat as local.
|
| 272 |
-
if model_path and ("/" in model_path and not os.path.exists(model_path)):
|
| 273 |
-
config = VineConfig(use_hf_repo=True, model_repo=model_path)
|
| 274 |
-
else:
|
| 275 |
-
# Local path: could be a file or directory
|
| 276 |
-
if os.path.isdir(model_path):
|
| 277 |
-
config = VineConfig(use_hf_repo=False, local_dir=model_path)
|
| 278 |
-
else:
|
| 279 |
-
config = VineConfig(
|
| 280 |
-
use_hf_repo=False,
|
| 281 |
-
local_dir=os.path.dirname(model_path) or None,
|
| 282 |
-
local_filename=os.path.basename(model_path) or None,
|
| 283 |
-
)
|
| 284 |
-
else:
|
| 285 |
-
# Update provided config to reflect the requested pretrained path
|
| 286 |
-
if model_path and ("/" in model_path and not os.path.exists(model_path)):
|
| 287 |
-
config.use_hf_repo = True
|
| 288 |
-
config.model_repo = model_path
|
| 289 |
-
config.model_file = None
|
| 290 |
-
config.local_dir = None
|
| 291 |
-
config.local_filename = None
|
| 292 |
-
else:
|
| 293 |
-
config.use_hf_repo = False
|
| 294 |
-
if os.path.isdir(model_path):
|
| 295 |
-
config.local_dir = model_path
|
| 296 |
-
config.local_filename = None
|
| 297 |
-
else:
|
| 298 |
-
config.local_dir = os.path.dirname(model_path) or None
|
| 299 |
-
config.local_filename = os.path.basename(model_path) or None
|
| 300 |
-
|
| 301 |
-
# Create model instance (will automatically load weights)
|
| 302 |
-
model = cls(config, **kwargs)
|
| 303 |
-
|
| 304 |
-
return model
|
| 305 |
-
|
| 306 |
-
def _text_features_checkpoint(self, model, tokens):
|
| 307 |
-
"""Extract text features with gradient checkpointing."""
|
| 308 |
-
token_keys = list(tokens.keys())
|
| 309 |
-
|
| 310 |
-
def get_text_features_wrapped(*inputs):
|
| 311 |
-
kwargs = {key: value for key, value in zip(token_keys, inputs)}
|
| 312 |
-
return model.get_text_features(**kwargs)
|
| 313 |
-
|
| 314 |
-
token_values = [tokens[key] for key in token_keys]
|
| 315 |
-
return cp.checkpoint(get_text_features_wrapped, *token_values, use_reentrant=False)
|
| 316 |
-
|
| 317 |
-
def _image_features_checkpoint(self, model, images):
|
| 318 |
-
"""Extract image features with gradient checkpointing."""
|
| 319 |
-
return cp.checkpoint(model.get_image_features, images, use_reentrant=False)
|
| 320 |
-
|
| 321 |
-
def clip_sim(self, model, nl_feat, img_feat):
|
| 322 |
-
img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
|
| 323 |
-
nl_feat = nl_feat / nl_feat.norm(p=2, dim=-1, keepdim=True)
|
| 324 |
-
logits = torch.matmul(img_feat, nl_feat.T)
|
| 325 |
-
if hasattr(model, "logit_scale"):
|
| 326 |
-
logits = logits * model.logit_scale.exp()
|
| 327 |
-
return logits
|
| 328 |
-
|
| 329 |
-
def forward(
|
| 330 |
-
self,
|
| 331 |
-
video_frames: torch.Tensor,
|
| 332 |
-
masks: Dict[int, Dict[int, torch.Tensor]],
|
| 333 |
-
bboxes: Dict[int, Dict[int, List]],
|
| 334 |
-
categorical_keywords: List[str],
|
| 335 |
-
unary_keywords: Optional[List[str]] = None,
|
| 336 |
-
binary_keywords: Optional[List[str]] = None,
|
| 337 |
-
object_pairs: Optional[List[Tuple[int, int]]] = None,
|
| 338 |
-
return_flattened_segments: Optional[bool] = None,
|
| 339 |
-
return_valid_pairs: Optional[bool] = None,
|
| 340 |
-
interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
|
| 341 |
-
debug_visualizations: Optional[bool] = None,
|
| 342 |
-
**kwargs
|
| 343 |
-
) -> Dict[str, Any]:
|
| 344 |
-
"""
|
| 345 |
-
Forward pass of the VINE model.
|
| 346 |
-
|
| 347 |
-
Args:
|
| 348 |
-
video_frames: Tensor of shape (num_frames, height, width, 3)
|
| 349 |
-
masks: Dict mapping frame_id -> object_id -> mask tensor
|
| 350 |
-
bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2]
|
| 351 |
-
categorical_keywords: List of category names to classify objects
|
| 352 |
-
unary_keywords: Optional list of unary predicates (actions on single objects)
|
| 353 |
-
binary_keywords: Optional list of binary predicates (relations between objects)
|
| 354 |
-
object_pairs: Optional list of (obj1_id, obj2_id) pairs for binary classification
|
| 355 |
-
|
| 356 |
-
Returns:
|
| 357 |
-
Dict containing probability distributions for categorical, unary, and binary predictions
|
| 358 |
-
"""
|
| 359 |
-
if unary_keywords is None:
|
| 360 |
-
unary_keywords = []
|
| 361 |
-
if binary_keywords is None:
|
| 362 |
-
binary_keywords = []
|
| 363 |
-
if object_pairs is None:
|
| 364 |
-
object_pairs = []
|
| 365 |
-
if return_flattened_segments is None:
|
| 366 |
-
return_flattened_segments = self.config.return_flattened_segments
|
| 367 |
-
if return_valid_pairs is None:
|
| 368 |
-
return_valid_pairs = self.config.return_valid_pairs
|
| 369 |
-
if interested_object_pairs is None or len(interested_object_pairs) == 0:
|
| 370 |
-
interested_object_pairs = getattr(self.config, "interested_object_pairs", []) or []
|
| 371 |
-
if debug_visualizations is None:
|
| 372 |
-
debug_visualizations = self.debug_visualizations
|
| 373 |
-
|
| 374 |
-
# Prepare dummy strings for empty categories
|
| 375 |
-
dummy_str = ""
|
| 376 |
-
|
| 377 |
-
# Fill empty categories with dummy strings
|
| 378 |
-
if len(categorical_keywords) == 0:
|
| 379 |
-
categorical_keywords = [dummy_str]
|
| 380 |
-
if len(unary_keywords) == 0:
|
| 381 |
-
unary_keywords = [dummy_str]
|
| 382 |
-
if len(binary_keywords) == 0:
|
| 383 |
-
binary_keywords = [dummy_str]
|
| 384 |
-
|
| 385 |
-
# Extract text features for all keyword types
|
| 386 |
-
categorical_features = self._extract_text_features(
|
| 387 |
-
self.clip_cate_model, categorical_keywords
|
| 388 |
-
)
|
| 389 |
-
unary_features = self._extract_text_features(
|
| 390 |
-
self.clip_unary_model, unary_keywords
|
| 391 |
-
)
|
| 392 |
-
binary_features = self._extract_text_features(
|
| 393 |
-
self.clip_binary_model, binary_keywords
|
| 394 |
-
)
|
| 395 |
-
|
| 396 |
-
# Process video frames and extract object features
|
| 397 |
-
categorical_probs = {}
|
| 398 |
-
unary_probs = {}
|
| 399 |
-
binary_probs = {}
|
| 400 |
-
|
| 401 |
-
# Process each frame
|
| 402 |
-
for frame_id, frame_masks in masks.items():
|
| 403 |
-
if frame_id >= len(video_frames):
|
| 404 |
-
continue
|
| 405 |
-
|
| 406 |
-
frame = self._frame_to_numpy(video_frames[frame_id])
|
| 407 |
-
frame_bboxes = bboxes.get(frame_id, {})
|
| 408 |
-
|
| 409 |
-
# Extract object features for categorical classification
|
| 410 |
-
for obj_id, mask in frame_masks.items():
|
| 411 |
-
if obj_id not in frame_bboxes:
|
| 412 |
-
continue
|
| 413 |
-
|
| 414 |
-
bbox = frame_bboxes[obj_id]
|
| 415 |
-
|
| 416 |
-
# Extract single object image
|
| 417 |
-
mask_np = self._mask_to_numpy(mask)
|
| 418 |
-
|
| 419 |
-
obj_image = extract_single_object(
|
| 420 |
-
frame, mask_np, alpha=self.config.alpha
|
| 421 |
-
)
|
| 422 |
-
|
| 423 |
-
# Get image features
|
| 424 |
-
obj_features = self._extract_image_features(
|
| 425 |
-
self.clip_cate_model, obj_image
|
| 426 |
-
)
|
| 427 |
-
|
| 428 |
-
# Compute similarities for categorical classification
|
| 429 |
-
cat_similarities = self.clip_sim(
|
| 430 |
-
self.clip_cate_model, categorical_features, obj_features
|
| 431 |
-
)
|
| 432 |
-
cat_probs = F.softmax(cat_similarities, dim=-1)
|
| 433 |
-
|
| 434 |
-
# Store categorical predictions
|
| 435 |
-
for i, keyword in enumerate(categorical_keywords):
|
| 436 |
-
if keyword != dummy_str:
|
| 437 |
-
categorical_probs[(obj_id, keyword)] = cat_probs[0, i].item()
|
| 438 |
-
|
| 439 |
-
# Compute unary predictions
|
| 440 |
-
if len(unary_keywords) > 0 and unary_keywords[0] != dummy_str:
|
| 441 |
-
unary_similarities = self.clip_sim(
|
| 442 |
-
self.clip_unary_model, unary_features, obj_features
|
| 443 |
-
)
|
| 444 |
-
unary_probs_tensor = F.softmax(unary_similarities, dim=-1)
|
| 445 |
-
|
| 446 |
-
for i, keyword in enumerate(unary_keywords):
|
| 447 |
-
if keyword != dummy_str:
|
| 448 |
-
unary_probs[(frame_id, obj_id, keyword)] = unary_probs_tensor[0, i].item()
|
| 449 |
-
|
| 450 |
-
# Process binary relationships
|
| 451 |
-
if len(binary_keywords) > 0 and binary_keywords[0] != dummy_str and len(object_pairs) > 0:
|
| 452 |
-
for obj1_id, obj2_id in object_pairs:
|
| 453 |
-
for frame_id, frame_masks in masks.items():
|
| 454 |
-
if frame_id >= len(video_frames):
|
| 455 |
-
continue
|
| 456 |
-
if (obj1_id in frame_masks and obj2_id in frame_masks and
|
| 457 |
-
obj1_id in bboxes.get(frame_id, {}) and obj2_id in bboxes.get(frame_id, {})):
|
| 458 |
-
|
| 459 |
-
frame = self._frame_to_numpy(video_frames[frame_id])
|
| 460 |
-
mask1 = frame_masks[obj1_id]
|
| 461 |
-
mask2 = frame_masks[obj2_id]
|
| 462 |
-
|
| 463 |
-
mask1_np = self._mask_to_numpy(mask1)
|
| 464 |
-
mask2_np = self._mask_to_numpy(mask2)
|
| 465 |
-
|
| 466 |
-
# Extract object pair image
|
| 467 |
-
pair_image = extract_object_subject(
|
| 468 |
-
frame, mask1_np[..., None], mask2_np[..., None],
|
| 469 |
-
alpha=self.config.alpha,
|
| 470 |
-
white_alpha=self.config.white_alpha
|
| 471 |
-
)
|
| 472 |
-
|
| 473 |
-
# Crop to contain both objects
|
| 474 |
-
bbox1 = bboxes[frame_id][obj1_id]
|
| 475 |
-
bbox2 = bboxes[frame_id][obj2_id]
|
| 476 |
-
|
| 477 |
-
# Bounding box overlap check
|
| 478 |
-
if bbox1[0] >= bbox2[2] or bbox2[1] >= bbox1[3] or \
|
| 479 |
-
bbox2[0] >= bbox1[2] or bbox1[1] >= bbox2[3]:
|
| 480 |
-
continue
|
| 481 |
-
|
| 482 |
-
cropped_image = crop_image_contain_bboxes(
|
| 483 |
-
pair_image, [bbox1, bbox2], f"frame_{frame_id}"
|
| 484 |
-
)
|
| 485 |
-
|
| 486 |
-
# Get image features
|
| 487 |
-
pair_features = self._extract_image_features(
|
| 488 |
-
self.clip_binary_model, cropped_image
|
| 489 |
-
)
|
| 490 |
-
|
| 491 |
-
# Compute similarities for binary classification
|
| 492 |
-
binary_similarities = self.clip_sim(
|
| 493 |
-
self.clip_binary_model, binary_features, pair_features
|
| 494 |
-
)
|
| 495 |
-
binary_probs_tensor = F.softmax(binary_similarities, dim=-1)
|
| 496 |
-
|
| 497 |
-
for i, keyword in enumerate(binary_keywords):
|
| 498 |
-
if keyword != dummy_str:
|
| 499 |
-
binary_probs[(frame_id, (obj1_id, obj2_id), keyword)] = binary_probs_tensor[0, i].item()
|
| 500 |
-
|
| 501 |
-
# Calculate dummy probability (for compatibility)
|
| 502 |
-
dummy_prob = 1.0 / max(len(categorical_keywords), len(unary_keywords), len(binary_keywords))
|
| 503 |
-
|
| 504 |
-
result: Dict[str, Any] = {
|
| 505 |
-
"categorical_probs": {0: categorical_probs}, # Video ID 0
|
| 506 |
-
"unary_probs": {0: unary_probs},
|
| 507 |
-
"binary_probs": [binary_probs], # List format for compatibility
|
| 508 |
-
"dummy_prob": dummy_prob
|
| 509 |
-
}
|
| 510 |
-
|
| 511 |
-
if return_flattened_segments or return_valid_pairs:
|
| 512 |
-
flattened = flatten_segments_for_batch(
|
| 513 |
-
video_id=0,
|
| 514 |
-
segments=masks,
|
| 515 |
-
bbox_min_dim=self.config.bbox_min_dim,
|
| 516 |
-
)
|
| 517 |
-
if return_flattened_segments:
|
| 518 |
-
result["flattened_segments"] = flattened
|
| 519 |
-
if return_valid_pairs:
|
| 520 |
-
interested_pairs = interested_object_pairs if interested_object_pairs else None
|
| 521 |
-
result["valid_pairs"] = extract_valid_object_pairs(
|
| 522 |
-
flattened["object_ids"],
|
| 523 |
-
interested_pairs,
|
| 524 |
-
)
|
| 525 |
-
if interested_pairs is None:
|
| 526 |
-
# Provide all generated pairs for clarity when auto-generated.
|
| 527 |
-
result["valid_pairs_metadata"] = {"pair_source": "all_pairs"}
|
| 528 |
-
else:
|
| 529 |
-
result["valid_pairs_metadata"] = {"pair_source": "filtered", "requested_pairs": interested_pairs}
|
| 530 |
-
|
| 531 |
-
return result
|
| 532 |
-
|
| 533 |
-
def _frame_to_numpy(self, frame: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
|
| 534 |
-
"""Convert a frame tensor/array to a contiguous numpy array."""
|
| 535 |
-
if torch.is_tensor(frame):
|
| 536 |
-
frame_np = frame.detach().cpu().numpy()
|
| 537 |
-
else:
|
| 538 |
-
frame_np = np.asarray(frame)
|
| 539 |
-
return np.ascontiguousarray(frame_np)
|
| 540 |
-
|
| 541 |
-
def _mask_to_numpy(self, mask: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
|
| 542 |
-
"""Convert a mask tensor/array to a 2D boolean numpy array."""
|
| 543 |
-
if torch.is_tensor(mask):
|
| 544 |
-
mask_np = mask.detach().cpu().numpy()
|
| 545 |
-
else:
|
| 546 |
-
mask_np = np.asarray(mask)
|
| 547 |
-
|
| 548 |
-
if mask_np.ndim == 3:
|
| 549 |
-
if mask_np.shape[0] == 1:
|
| 550 |
-
mask_np = mask_np.squeeze(0)
|
| 551 |
-
elif mask_np.shape[2] == 1:
|
| 552 |
-
mask_np = mask_np.squeeze(2)
|
| 553 |
-
|
| 554 |
-
if mask_np.ndim != 2:
|
| 555 |
-
raise ValueError(f"Mask must be 2D after squeezing, got shape {mask_np.shape}")
|
| 556 |
-
|
| 557 |
-
return mask_np.astype(bool, copy=False)
|
| 558 |
-
|
| 559 |
-
def _extract_text_features(self, model, keywords):
|
| 560 |
-
"""Extract text features for given keywords."""
|
| 561 |
-
tokens = self.clip_tokenizer(
|
| 562 |
-
keywords,
|
| 563 |
-
return_tensors="pt",
|
| 564 |
-
max_length=75,
|
| 565 |
-
truncation=True,
|
| 566 |
-
padding='max_length'
|
| 567 |
-
).to(self._device)
|
| 568 |
-
|
| 569 |
-
return self._text_features_checkpoint(model, tokens)
|
| 570 |
-
|
| 571 |
-
def _extract_image_features(self, model, image):
|
| 572 |
-
"""Extract image features for given image."""
|
| 573 |
-
# Ensure image is in correct format
|
| 574 |
-
if isinstance(image, np.ndarray):
|
| 575 |
-
if image.dtype != np.uint8:
|
| 576 |
-
image = image.astype(np.uint8)
|
| 577 |
-
# Convert BGR to RGB if needed
|
| 578 |
-
if len(image.shape) == 3 and image.shape[2] == 3:
|
| 579 |
-
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 580 |
-
|
| 581 |
-
# Process image with CLIP processor
|
| 582 |
-
inputs = self.clip_processor(
|
| 583 |
-
images=image,
|
| 584 |
-
return_tensors="pt"
|
| 585 |
-
).to(self._device)
|
| 586 |
-
|
| 587 |
-
return self._image_features_checkpoint(model, inputs['pixel_values'])
|
| 588 |
-
#TODO: return masks and bboxes and their corresponding index
|
| 589 |
-
def predict(
|
| 590 |
-
self,
|
| 591 |
-
video_frames: torch.Tensor,
|
| 592 |
-
masks: Dict[int, Dict[int, torch.Tensor]],
|
| 593 |
-
bboxes: Dict[int, Dict[int, List]],
|
| 594 |
-
categorical_keywords: List[str],
|
| 595 |
-
unary_keywords: Optional[List[str]] = None,
|
| 596 |
-
binary_keywords: Optional[List[str]] = None,
|
| 597 |
-
object_pairs: Optional[List[Tuple[int, int]]] = None,
|
| 598 |
-
return_top_k: int = 3,
|
| 599 |
-
return_flattened_segments: Optional[bool] = None,
|
| 600 |
-
return_valid_pairs: Optional[bool] = None,
|
| 601 |
-
interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
|
| 602 |
-
debug_visualizations: Optional[bool] = None,
|
| 603 |
-
) -> Dict[str, Any]:
|
| 604 |
-
"""
|
| 605 |
-
High-level prediction method that returns formatted results.
|
| 606 |
-
|
| 607 |
-
Args:
|
| 608 |
-
video_frames: Tensor of shape (num_frames, height, width, 3)
|
| 609 |
-
masks: Dict mapping frame_id -> object_id -> mask tensor
|
| 610 |
-
bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2]
|
| 611 |
-
categorical_keywords: List of category names
|
| 612 |
-
unary_keywords: Optional list of unary predicates
|
| 613 |
-
binary_keywords: Optional list of binary predicates
|
| 614 |
-
object_pairs: Optional list of object pairs for binary relations
|
| 615 |
-
return_top_k: Number of top predictions to return
|
| 616 |
-
return_flattened_segments: Whether to include flattened mask/bbox tensors
|
| 617 |
-
return_valid_pairs: Whether to compute valid object pairs per frame
|
| 618 |
-
interested_object_pairs: Optional subset of object pairs to track
|
| 619 |
-
|
| 620 |
-
Returns:
|
| 621 |
-
Formatted prediction results
|
| 622 |
-
"""
|
| 623 |
-
|
| 624 |
-
with torch.no_grad():
|
| 625 |
-
outputs = self.forward(
|
| 626 |
-
video_frames=video_frames,
|
| 627 |
-
masks=masks,
|
| 628 |
-
bboxes=bboxes,
|
| 629 |
-
categorical_keywords=categorical_keywords,
|
| 630 |
-
unary_keywords=unary_keywords,
|
| 631 |
-
binary_keywords=binary_keywords,
|
| 632 |
-
object_pairs=object_pairs,
|
| 633 |
-
return_flattened_segments=return_flattened_segments,
|
| 634 |
-
return_valid_pairs=return_valid_pairs,
|
| 635 |
-
interested_object_pairs=interested_object_pairs,
|
| 636 |
-
debug_visualizations=debug_visualizations,
|
| 637 |
-
)
|
| 638 |
-
|
| 639 |
-
# Format categorical results
|
| 640 |
-
formatted_categorical = {}
|
| 641 |
-
for (obj_id, category), prob in outputs["categorical_probs"][0].items():
|
| 642 |
-
if obj_id not in formatted_categorical:
|
| 643 |
-
formatted_categorical[obj_id] = []
|
| 644 |
-
formatted_categorical[obj_id].append((prob, category))
|
| 645 |
-
|
| 646 |
-
# Sort and take top-k for each object
|
| 647 |
-
for obj_id in formatted_categorical:
|
| 648 |
-
formatted_categorical[obj_id] = sorted(
|
| 649 |
-
formatted_categorical[obj_id], reverse=True
|
| 650 |
-
)[:return_top_k]
|
| 651 |
-
|
| 652 |
-
# Format unary results
|
| 653 |
-
formatted_unary = {}
|
| 654 |
-
for (frame_id, obj_id, predicate), prob in outputs["unary_probs"][0].items():
|
| 655 |
-
key = (frame_id, obj_id)
|
| 656 |
-
if key not in formatted_unary:
|
| 657 |
-
formatted_unary[key] = []
|
| 658 |
-
formatted_unary[key].append((prob, predicate))
|
| 659 |
-
|
| 660 |
-
# Sort and take top-k
|
| 661 |
-
for key in formatted_unary:
|
| 662 |
-
formatted_unary[key] = sorted(
|
| 663 |
-
formatted_unary[key], reverse=True
|
| 664 |
-
)[:return_top_k]
|
| 665 |
-
|
| 666 |
-
# Format binary results
|
| 667 |
-
formatted_binary = {}
|
| 668 |
-
if len(outputs["binary_probs"]) > 0:
|
| 669 |
-
for (frame_id, obj_pair, predicate), prob in outputs["binary_probs"][0].items():
|
| 670 |
-
key = (frame_id, obj_pair)
|
| 671 |
-
if key not in formatted_binary:
|
| 672 |
-
formatted_binary[key] = []
|
| 673 |
-
formatted_binary[key].append((prob, predicate))
|
| 674 |
-
|
| 675 |
-
# Sort and take top-k
|
| 676 |
-
for key in formatted_binary:
|
| 677 |
-
formatted_binary[key] = sorted(
|
| 678 |
-
formatted_binary[key], reverse=True
|
| 679 |
-
)[:return_top_k]
|
| 680 |
-
|
| 681 |
-
result: Dict[str, Any] = {
|
| 682 |
-
"categorical_predictions": formatted_categorical,
|
| 683 |
-
"unary_predictions": formatted_unary,
|
| 684 |
-
"binary_predictions": formatted_binary,
|
| 685 |
-
"confidence_scores": {
|
| 686 |
-
"categorical": max([max([p for p, _ in preds], default=0.0)
|
| 687 |
-
for preds in formatted_categorical.values()], default=0.0),
|
| 688 |
-
"unary": max([max([p for p, _ in preds], default=0.0)
|
| 689 |
-
for preds in formatted_unary.values()], default=0.0),
|
| 690 |
-
"binary": max([max([p for p, _ in preds], default=0.0)
|
| 691 |
-
for preds in formatted_binary.values()], default=0.0)
|
| 692 |
-
}
|
| 693 |
-
}
|
| 694 |
-
|
| 695 |
-
if "flattened_segments" in outputs:
|
| 696 |
-
result["flattened_segments"] = outputs["flattened_segments"]
|
| 697 |
-
if "valid_pairs" in outputs:
|
| 698 |
-
result["valid_pairs"] = outputs["valid_pairs"]
|
| 699 |
-
if "valid_pairs_metadata" in outputs:
|
| 700 |
-
result["valid_pairs_metadata"] = outputs["valid_pairs_metadata"]
|
| 701 |
-
|
| 702 |
-
return result
|
src/vine_hf/vine_pipeline.py
DELETED
|
@@ -1,691 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import numpy as np
|
| 3 |
-
import cv2
|
| 4 |
-
import os
|
| 5 |
-
from typing import Dict, List, Tuple, Optional, Any, Union
|
| 6 |
-
from transformers import Pipeline
|
| 7 |
-
import tempfile
|
| 8 |
-
import uuid
|
| 9 |
-
|
| 10 |
-
from .vine_config import VineConfig
|
| 11 |
-
from .vine_model import VineModel
|
| 12 |
-
from .vis_utils import render_dino_frames, render_sam_frames, render_vine_frame_sets
|
| 13 |
-
from laser.loading import load_video
|
| 14 |
-
from laser.preprocess.mask_generation_grounding_dino import generate_masks_grounding_dino
|
| 15 |
-
|
| 16 |
-
class VinePipeline(Pipeline):
|
| 17 |
-
"""
|
| 18 |
-
Pipeline for VINE model that handles end-to-end video understanding.
|
| 19 |
-
|
| 20 |
-
This pipeline takes a video file or frames, along with segmentation method
|
| 21 |
-
and keyword lists, and returns probability distributions over the keywords.
|
| 22 |
-
|
| 23 |
-
Segmentation Model Configuration:
|
| 24 |
-
The pipeline requires SAM2 and GroundingDINO models for mask generation.
|
| 25 |
-
You can configure custom paths via constructor kwargs:
|
| 26 |
-
|
| 27 |
-
- sam_config_path: Path to SAM2 config (e.g., "configs/sam2.1/sam2.1_hiera_b+.yaml")
|
| 28 |
-
- sam_checkpoint_path: Path to SAM2 checkpoint (e.g., "checkpoints/sam2.1_hiera_base_plus.pt")
|
| 29 |
-
- gd_config_path: Path to GroundingDINO config (e.g., "groundingdino/config/GroundingDINO_SwinT_OGC.py")
|
| 30 |
-
- gd_checkpoint_path: Path to GroundingDINO checkpoint (e.g., "checkpoints/groundingdino_swint_ogc.pth")
|
| 31 |
-
|
| 32 |
-
Old:
|
| 33 |
-
- SAM2: ~/research/sam2/ or /home/asethi04/LASER_NEW/LASER/sam2/
|
| 34 |
-
- GroundingDINO: /home/asethi04/LASER_NEW/LASER/GroundingDINO/
|
| 35 |
-
|
| 36 |
-
Alternative: Use set_segmentation_models() to provide pre-initialized model instances.
|
| 37 |
-
"""
|
| 38 |
-
|
| 39 |
-
def __init__(
|
| 40 |
-
self,
|
| 41 |
-
sam_config_path: Optional[str] = None,
|
| 42 |
-
sam_checkpoint_path: Optional[str] = None,
|
| 43 |
-
gd_config_path: Optional[str] = None,
|
| 44 |
-
gd_checkpoint_path: Optional[str] = None,
|
| 45 |
-
**kwargs
|
| 46 |
-
):
|
| 47 |
-
self.grounding_model = None
|
| 48 |
-
self.sam_predictor = None
|
| 49 |
-
self.mask_generator = None
|
| 50 |
-
|
| 51 |
-
self.sam_config_path = sam_config_path
|
| 52 |
-
self.sam_checkpoint_path = sam_checkpoint_path
|
| 53 |
-
self.gd_config_path = gd_config_path
|
| 54 |
-
self.gd_checkpoint_path = gd_checkpoint_path
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
super().__init__(**kwargs)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
# Set default parameters from config
|
| 61 |
-
self.segmentation_method = getattr(self.model.config, 'segmentation_method', 'grounding_dino_sam2')
|
| 62 |
-
self.box_threshold = getattr(self.model.config, 'box_threshold', 0.35)
|
| 63 |
-
self.text_threshold = getattr(self.model.config, 'text_threshold', 0.25)
|
| 64 |
-
self.target_fps = getattr(self.model.config, 'target_fps', 1)
|
| 65 |
-
self.visualize = getattr(self.model.config, 'visualize', False)
|
| 66 |
-
self.visualization_dir = getattr(self.model.config, 'visualization_dir', None)
|
| 67 |
-
self.debug_visualizations = getattr(self.model.config, 'debug_visualizations', False)
|
| 68 |
-
self._device = getattr(self.model.config, '_device')
|
| 69 |
-
if kwargs.get("device") is not None:
|
| 70 |
-
self._device = kwargs.get("device")
|
| 71 |
-
|
| 72 |
-
def set_segmentation_models(
|
| 73 |
-
self,
|
| 74 |
-
*,
|
| 75 |
-
sam_predictor=None,
|
| 76 |
-
mask_generator=None,
|
| 77 |
-
grounding_model=None
|
| 78 |
-
):
|
| 79 |
-
"""
|
| 80 |
-
Set pre-initialized segmentation models, bypassing automatic initialization.
|
| 81 |
-
|
| 82 |
-
Args:
|
| 83 |
-
sam_predictor: Pre-built SAM2 video predictor
|
| 84 |
-
mask_generator: Pre-built SAM2 automatic mask generator
|
| 85 |
-
grounding_model: Pre-built GroundingDINO model
|
| 86 |
-
"""
|
| 87 |
-
if sam_predictor is not None:
|
| 88 |
-
self.sam_predictor = sam_predictor
|
| 89 |
-
if mask_generator is not None:
|
| 90 |
-
self.mask_generator = mask_generator
|
| 91 |
-
if grounding_model is not None:
|
| 92 |
-
self.grounding_model = grounding_model
|
| 93 |
-
|
| 94 |
-
def _sanitize_parameters(self, **kwargs):
|
| 95 |
-
"""Sanitize parameters for different pipeline stages."""
|
| 96 |
-
preprocess_kwargs = {}
|
| 97 |
-
forward_kwargs = {}
|
| 98 |
-
postprocess_kwargs = {}
|
| 99 |
-
|
| 100 |
-
# Preprocess parameters
|
| 101 |
-
if "segmentation_method" in kwargs:
|
| 102 |
-
preprocess_kwargs["segmentation_method"] = kwargs["segmentation_method"]
|
| 103 |
-
if "target_fps" in kwargs:
|
| 104 |
-
preprocess_kwargs["target_fps"] = kwargs["target_fps"]
|
| 105 |
-
if "box_threshold" in kwargs:
|
| 106 |
-
preprocess_kwargs["box_threshold"] = kwargs["box_threshold"]
|
| 107 |
-
if "text_threshold" in kwargs:
|
| 108 |
-
preprocess_kwargs["text_threshold"] = kwargs["text_threshold"]
|
| 109 |
-
if "categorical_keywords" in kwargs:
|
| 110 |
-
preprocess_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
|
| 111 |
-
|
| 112 |
-
# Forward parameters
|
| 113 |
-
if "categorical_keywords" in kwargs:
|
| 114 |
-
forward_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
|
| 115 |
-
if "unary_keywords" in kwargs:
|
| 116 |
-
forward_kwargs["unary_keywords"] = kwargs["unary_keywords"]
|
| 117 |
-
if "binary_keywords" in kwargs:
|
| 118 |
-
forward_kwargs["binary_keywords"] = kwargs["binary_keywords"]
|
| 119 |
-
if "object_pairs" in kwargs:
|
| 120 |
-
forward_kwargs["object_pairs"] = kwargs["object_pairs"]
|
| 121 |
-
if "return_flattened_segments" in kwargs:
|
| 122 |
-
forward_kwargs["return_flattened_segments"] = kwargs["return_flattened_segments"]
|
| 123 |
-
if "return_valid_pairs" in kwargs:
|
| 124 |
-
forward_kwargs["return_valid_pairs"] = kwargs["return_valid_pairs"]
|
| 125 |
-
if "interested_object_pairs" in kwargs:
|
| 126 |
-
forward_kwargs["interested_object_pairs"] = kwargs["interested_object_pairs"]
|
| 127 |
-
if "debug_visualizations" in kwargs:
|
| 128 |
-
forward_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
|
| 129 |
-
postprocess_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
|
| 130 |
-
|
| 131 |
-
# Postprocess parameters
|
| 132 |
-
if "return_top_k" in kwargs:
|
| 133 |
-
postprocess_kwargs["return_top_k"] = kwargs["return_top_k"]
|
| 134 |
-
if "self.visualize" in kwargs:
|
| 135 |
-
postprocess_kwargs["self.visualize"] = kwargs["self.visualize"]
|
| 136 |
-
|
| 137 |
-
return preprocess_kwargs, forward_kwargs, postprocess_kwargs
|
| 138 |
-
|
| 139 |
-
def preprocess(
|
| 140 |
-
self,
|
| 141 |
-
video_input: Union[str, np.ndarray, torch.Tensor],
|
| 142 |
-
segmentation_method: str = None,
|
| 143 |
-
target_fps: int = None,
|
| 144 |
-
box_threshold: float = None,
|
| 145 |
-
text_threshold: float = None,
|
| 146 |
-
categorical_keywords: List[str] = None,
|
| 147 |
-
**kwargs
|
| 148 |
-
) -> Dict[str, Any]:
|
| 149 |
-
"""
|
| 150 |
-
Preprocess video input and generate masks.
|
| 151 |
-
|
| 152 |
-
Args:
|
| 153 |
-
video_input: Path to video file, or video tensor/array
|
| 154 |
-
segmentation_method: "sam2" or "grounding_dino_sam2"
|
| 155 |
-
target_fps: Target FPS for video processing
|
| 156 |
-
box_threshold: Box threshold for Grounding DINO
|
| 157 |
-
text_threshold: Text threshold for Grounding DINO
|
| 158 |
-
categorical_keywords: Keywords for Grounding DINO segmentation
|
| 159 |
-
|
| 160 |
-
Returns:
|
| 161 |
-
Dict containing video frames, masks, and bboxes
|
| 162 |
-
"""
|
| 163 |
-
# Use defaults from config if not provided
|
| 164 |
-
if segmentation_method is None:
|
| 165 |
-
segmentation_method = self.segmentation_method
|
| 166 |
-
if target_fps is None:
|
| 167 |
-
target_fps = self.target_fps
|
| 168 |
-
if box_threshold is None:
|
| 169 |
-
box_threshold = self.box_threshold
|
| 170 |
-
if text_threshold is None:
|
| 171 |
-
text_threshold = self.text_threshold
|
| 172 |
-
if categorical_keywords is None:
|
| 173 |
-
categorical_keywords = ["object"] # Default generic category
|
| 174 |
-
|
| 175 |
-
if isinstance(video_input, str):
|
| 176 |
-
# Video file path
|
| 177 |
-
video_tensor = load_video(video_input, target_fps=target_fps)
|
| 178 |
-
if isinstance(video_tensor, list):
|
| 179 |
-
video_tensor = np.array(video_tensor)
|
| 180 |
-
elif isinstance(video_tensor, torch.Tensor):
|
| 181 |
-
video_tensor = video_tensor.cpu().numpy()
|
| 182 |
-
|
| 183 |
-
elif isinstance(video_input, (np.ndarray, torch.Tensor)):
|
| 184 |
-
# Video tensor/array
|
| 185 |
-
if isinstance(video_input, torch.Tensor):
|
| 186 |
-
video_tensor = video_input.numpy()
|
| 187 |
-
else:
|
| 188 |
-
video_tensor = video_input
|
| 189 |
-
else:
|
| 190 |
-
raise ValueError(f"Unsupported video input type: {type(video_input)}")
|
| 191 |
-
|
| 192 |
-
# Ensure video tensor is numpy array
|
| 193 |
-
if not isinstance(video_tensor, np.ndarray):
|
| 194 |
-
video_tensor = np.array(video_tensor)
|
| 195 |
-
|
| 196 |
-
# Ensure video tensor is in correct format
|
| 197 |
-
if len(video_tensor.shape) != 4:
|
| 198 |
-
raise ValueError(f"Expected video tensor shape (frames, height, width, channels), got {video_tensor.shape}")
|
| 199 |
-
|
| 200 |
-
# Generate masks and bboxes based on segmentation method
|
| 201 |
-
visualization_data: Dict[str, Any] = {}
|
| 202 |
-
print(f"Segmentation method: {segmentation_method}")
|
| 203 |
-
if segmentation_method == "sam2":
|
| 204 |
-
masks, bboxes, vis_data = self._generate_sam2_masks(video_tensor)
|
| 205 |
-
elif segmentation_method == "grounding_dino_sam2":
|
| 206 |
-
masks, bboxes, vis_data = self._generate_grounding_dino_sam2_masks(
|
| 207 |
-
video_tensor, categorical_keywords, box_threshold, text_threshold, video_input
|
| 208 |
-
)
|
| 209 |
-
else:
|
| 210 |
-
raise ValueError(f"Unsupported segmentation method: {segmentation_method}")
|
| 211 |
-
if vis_data:
|
| 212 |
-
visualization_data.update(vis_data)
|
| 213 |
-
visualization_data.setdefault("sam_masks", masks)
|
| 214 |
-
|
| 215 |
-
return {
|
| 216 |
-
"video_frames": torch.tensor(video_tensor),
|
| 217 |
-
"masks": masks,
|
| 218 |
-
"bboxes": bboxes,
|
| 219 |
-
"num_frames": len(video_tensor),
|
| 220 |
-
"visualization_data": visualization_data,
|
| 221 |
-
}
|
| 222 |
-
|
| 223 |
-
def _generate_sam2_masks(self, video_tensor: np.ndarray) -> Tuple[Dict, Dict, Dict[str, Any]]:
|
| 224 |
-
"""Generate masks using SAM2 automatic mask generation."""
|
| 225 |
-
# Initialize SAM2 models if not already done
|
| 226 |
-
print("Generating SAM2 masks...")
|
| 227 |
-
if self.mask_generator is None:
|
| 228 |
-
self._initialize_segmentation_models()
|
| 229 |
-
|
| 230 |
-
if self.mask_generator is None:
|
| 231 |
-
raise ValueError("SAM2 mask generator not available")
|
| 232 |
-
|
| 233 |
-
masks: Dict[int, Dict[int, torch.Tensor]] = {}
|
| 234 |
-
bboxes: Dict[int, Dict[int, List[int]]] = {}
|
| 235 |
-
|
| 236 |
-
for frame_id, frame in enumerate(video_tensor):
|
| 237 |
-
if isinstance(frame, np.ndarray) and frame.dtype != np.uint8:
|
| 238 |
-
frame = (frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8)
|
| 239 |
-
|
| 240 |
-
height, width, _ = frame.shape
|
| 241 |
-
frame_masks = self.mask_generator.generate(frame)
|
| 242 |
-
|
| 243 |
-
masks[frame_id] = {}
|
| 244 |
-
bboxes[frame_id] = {}
|
| 245 |
-
|
| 246 |
-
for obj_id, mask_data in enumerate(frame_masks):
|
| 247 |
-
mask = mask_data["segmentation"]
|
| 248 |
-
if isinstance(mask, np.ndarray):
|
| 249 |
-
mask = torch.from_numpy(mask)
|
| 250 |
-
|
| 251 |
-
if len(mask.shape) == 2:
|
| 252 |
-
mask = mask.unsqueeze(-1)
|
| 253 |
-
elif len(mask.shape) == 3 and mask.shape[0] == 1:
|
| 254 |
-
mask = mask.permute(1, 2, 0)
|
| 255 |
-
|
| 256 |
-
wrapped_id = obj_id + 1
|
| 257 |
-
masks[frame_id][wrapped_id] = mask
|
| 258 |
-
|
| 259 |
-
mask_np = mask.squeeze().numpy() if isinstance(mask, torch.Tensor) else mask.squeeze()
|
| 260 |
-
|
| 261 |
-
coords = np.where(mask_np > 0)
|
| 262 |
-
if len(coords[0]) > 0:
|
| 263 |
-
y1, y2 = coords[0].min(), coords[0].max()
|
| 264 |
-
                    x1, x2 = coords[1].min(), coords[1].max()
                    bboxes[frame_id][wrapped_id] = [x1, y1, x2, y2]

        return masks, bboxes, {"sam_masks": masks}

    def _generate_grounding_dino_sam2_masks(
        self,
        video_tensor: np.ndarray,
        categorical_keywords: List[str],
        box_threshold: float,
        text_threshold: float,
        video_path: str,
    ) -> Tuple[Dict, Dict, Dict[str, Any]]:
        """Generate masks using Grounding DINO + SAM2."""
        # Initialize models if not already done
        print("Generating Grounding DINO + SAM2 masks...")
        if self.grounding_model is None or self.sam_predictor is None:
            self._initialize_segmentation_models()

        if self.grounding_model is None or self.sam_predictor is None:
            raise ValueError("GroundingDINO or SAM2 models not available")

        temp_video_path = None
        if video_path is None or not isinstance(video_path, str):
            temp_video_path = self._create_temp_video(video_tensor)
            video_path = temp_video_path

        CHUNK = 5
        classes_ls = [categorical_keywords[i:i + CHUNK] for i in range(0, len(categorical_keywords), CHUNK)]
        video_segments, oid_class_pred, _ = generate_masks_grounding_dino(
            self.grounding_model,
            box_threshold,
            text_threshold,
            self.sam_predictor,
            self.mask_generator,
            video_tensor,
            video_path,
            "temp_video",
            out_dir=tempfile.gettempdir(),
            classes_ls=classes_ls,
            target_fps=self.target_fps,
            visualize=self.debug_visualizations,
            frames=None,
            max_prop_time=10
        )

        masks: Dict[int, Dict[int, torch.Tensor]] = {}
        bboxes: Dict[int, Dict[int, List[int]]] = {}

        for frame_id, frame_masks in video_segments.items():
            masks[frame_id] = {}
            bboxes[frame_id] = {}

            for obj_id, mask in frame_masks.items():
                if not isinstance(mask, torch.Tensor):
                    mask = torch.tensor(mask)
                masks[frame_id][obj_id] = mask
                mask_np = mask.numpy()
                if mask_np.ndim == 3 and mask_np.shape[0] == 1:
                    mask_np = np.squeeze(mask_np, axis=0)

                coords = np.where(mask_np > 0)
                if len(coords[0]) > 0:
                    y1, y2 = coords[0].min(), coords[0].max()
                    x1, x2 = coords[1].min(), coords[1].max()
                    bboxes[frame_id][obj_id] = [x1, y1, x2, y2]

        if temp_video_path and os.path.exists(temp_video_path):
            os.remove(temp_video_path)

        vis_data: Dict[str, Any] = {
            "sam_masks": masks,
            "dino_labels": oid_class_pred,
        }
        return masks, bboxes, vis_data

    def _initialize_segmentation_models(self):
        """Initialize segmentation models based on the requested method and configured paths."""
        if (self.sam_predictor is None or self.mask_generator is None):
            self._initialize_sam2_models()

        if self.grounding_model is None:
            self._initialize_grounding_dino_model()

    def _initialize_sam2_models(self):
        """Initialize SAM2 video predictor and mask generator."""
        try:
            from sam2.build_sam import build_sam2_video_predictor, build_sam2
            from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
        except ImportError as e:
            print(f"Warning: Could not import SAM2: {e}")
            return

        # Resolve SAM2 paths
        config_path, checkpoint_path = self._resolve_sam2_paths()

        # Validate paths if custom ones were provided
        if self.sam_config_path is not None and not os.path.exists(config_path):
            raise ValueError(f"SAM2 config path not found: {config_path}")
        if self.sam_checkpoint_path is not None and not os.path.exists(checkpoint_path):
            raise ValueError(f"SAM2 checkpoint path not found: {checkpoint_path}")

        # Only proceed if we have valid paths
        if not os.path.exists(checkpoint_path):
            print(f"Warning: SAM2 checkpoint not found at {checkpoint_path}")
            print("SAM2 functionality will be unavailable")
            return

        try:
            device = self._device

            print(type(device))
            # Video predictor
            self.sam_predictor = build_sam2_video_predictor(
                config_path, checkpoint_path, device=device
            )

            # Mask generator
            sam2_model = build_sam2(config_path, checkpoint_path, device=device, apply_postprocessing=False)
            self.mask_generator = SAM2AutomaticMaskGenerator(
                model=sam2_model,
                points_per_side=32,
                points_per_batch=32,
                pred_iou_thresh=0.7,
                stability_score_thresh=0.8,
                crop_n_layers=2,
                box_nms_thresh=0.6,
                crop_n_points_downscale_factor=2,
                min_mask_region_area=100,
                use_m2m=True,
            )
            print("✓ SAM2 models initialized successfully")

        except Exception as e:
            raise ValueError(f"Failed to initialize SAM2 with custom paths: {e}")

    def _initialize_grounding_dino_model(self):
        """Initialize GroundingDINO model."""
        try:
            from groundingdino.util.inference import Model as gd_Model
        except ImportError as e:
            print(f"Warning: Could not import GroundingDINO: {e}")
            return

        # Resolve GroundingDINO paths
        config_path, checkpoint_path = self._resolve_grounding_dino_paths()

        # Validate paths if custom ones were provided
        if self.gd_config_path is not None and not os.path.exists(config_path):
            raise ValueError(f"GroundingDINO config path not found: {config_path}")
        if self.gd_checkpoint_path is not None and not os.path.exists(checkpoint_path):
            raise ValueError(f"GroundingDINO checkpoint path not found: {checkpoint_path}")

        # Only proceed if we have valid paths
        if not (os.path.exists(config_path) and os.path.exists(checkpoint_path)):
            print(f"Warning: GroundingDINO models not found at {config_path} / {checkpoint_path}")
            print("GroundingDINO functionality will be unavailable")
            return

        try:
            device = self._device
            print(type(device))
            self.grounding_model = gd_Model(
                model_config_path=config_path,
                model_checkpoint_path=checkpoint_path,
                device=device
            )
            print("✓ GroundingDINO model initialized successfully")

        except Exception as e:
            raise ValueError(f"Failed to initialize GroundingDINO with custom paths: {e}")

    def _resolve_sam2_paths(self):
        """Resolve SAM2 config and checkpoint paths."""
        # Use custom paths if provided
        if self.sam_config_path and self.sam_checkpoint_path:
            return self.sam_config_path, self.sam_checkpoint_path

    def _resolve_grounding_dino_paths(self):
        """Resolve GroundingDINO config and checkpoint paths."""
        # Use custom paths if provided
        if self.gd_config_path and self.gd_checkpoint_path:
            return self.gd_config_path, self.gd_checkpoint_path

    def _prepare_visualization_dir(self, name: str, enabled: bool) -> Optional[str]:
        """
        Ensure a directory exists for visualization artifacts and return it.
        If visualization is disabled, returns None.
        """
        if not enabled:
            return None

        if self.visualization_dir:
            target_dir = os.path.join(self.visualization_dir, name) if name else self.visualization_dir
            os.makedirs(target_dir, exist_ok=True)
            return target_dir

        return tempfile.mkdtemp(prefix=f"vine_{name}_")

    def _create_temp_video(self, video_tensor: np.ndarray, base_dir: Optional[str] = None, prefix: str = "temp_video") -> str:
        """Create a temporary video file from video tensor."""
        if base_dir is None:
            base_dir = tempfile.mkdtemp(prefix=f"vine_{prefix}_")
        else:
            os.makedirs(base_dir, exist_ok=True)
        file_name = f"{prefix}_{uuid.uuid4().hex}.mp4"
        temp_path = os.path.join(base_dir, file_name)

        # Use OpenCV to write video
        height, width = video_tensor.shape[1:3]
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(temp_path, fourcc, self.target_fps, (width, height))

        for frame in video_tensor:
            # Convert RGB to BGR for OpenCV
            if len(frame.shape) == 3 and frame.shape[2] == 3:
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            else:
                frame_bgr = frame
            out.write(frame_bgr.astype(np.uint8))

        out.release()
        return temp_path

    def _forward(self, model_inputs: Dict[str, Any], **forward_kwargs) -> Dict[str, Any]:
        """Forward pass through the model."""
        outputs = self.model.predict(
            video_frames=model_inputs["video_frames"],
            masks=model_inputs["masks"],
            bboxes=model_inputs["bboxes"],
            **forward_kwargs
        )
        outputs.setdefault("video_frames", model_inputs.get("video_frames"))
        outputs.setdefault("bboxes", model_inputs.get("bboxes"))
        outputs.setdefault("masks", model_inputs.get("masks"))
        outputs.setdefault("visualization_data", model_inputs.get("visualization_data"))
        return outputs

    def postprocess(
        self,
        model_outputs: Dict[str, Any],
        return_top_k: int = 3,
        visualize: Optional[bool] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Postprocess model outputs into user-friendly format.

        Args:
            model_outputs: Raw model outputs
            return_top_k: Number of top predictions to return
            self.visualize: Whether to include visualization data

        Returns:
            Formatted results
        """
        results = {
            "categorical_predictions": model_outputs.get("categorical_predictions", {}),
            "unary_predictions": model_outputs.get("unary_predictions", {}),
            "binary_predictions": model_outputs.get("binary_predictions", {}),
            "confidence_scores": model_outputs.get("confidence_scores", {}),
            "summary": self._generate_summary(model_outputs)
        }
        if "flattened_segments" in model_outputs:
            results["flattened_segments"] = model_outputs["flattened_segments"]
        if "valid_pairs" in model_outputs:
            results["valid_pairs"] = model_outputs["valid_pairs"]
        if "valid_pairs_metadata" in model_outputs:
            results["valid_pairs_metadata"] = model_outputs["valid_pairs_metadata"]
        if "visualization_data" in model_outputs:
            results["visualization_data"] = model_outputs["visualization_data"]

        if self.visualize and "video_frames" in model_outputs and "bboxes" in model_outputs:
            frames_tensor = model_outputs["video_frames"]
            if isinstance(frames_tensor, torch.Tensor):
                frames_np = frames_tensor.detach().cpu().numpy()
            else:
                frames_np = np.asarray(frames_tensor)
            if frames_np.dtype != np.uint8:
                if np.issubdtype(frames_np.dtype, np.floating):
                    max_val = frames_np.max() if frames_np.size else 0.0
                    scale = 255.0 if max_val <= 1.0 else 1.0
                    frames_np = (frames_np * scale).clip(0, 255).astype(np.uint8)
                else:
                    frames_np = frames_np.clip(0, 255).astype(np.uint8)

            cat_label_lookup: Dict[int, Tuple[str, float]] = {}
            for obj_id, preds in model_outputs.get("categorical_predictions", {}).items():
                if preds:
                    prob, label = preds[0]
                    cat_label_lookup[obj_id] = (label, prob)

            unary_preds = model_outputs.get("unary_predictions", {})
            unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]] = {}
            for (frame_id, obj_id), preds in unary_preds.items():
                if preds:
                    unary_lookup.setdefault(frame_id, {})[obj_id] = preds

            binary_preds = model_outputs.get("binary_predictions", {})
            binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]] = {}
            for (frame_id, obj_pair), preds in binary_preds.items():
                if preds:
                    binary_lookup.setdefault(frame_id, []).append((obj_pair, preds))

            bboxes = model_outputs["bboxes"]
            visualization_data = model_outputs.get("visualization_data", {})
            visualizations: Dict[str, Dict[str, Any]] = {}
            debug_visualizations = kwargs.get("debug_visualizations")
            if debug_visualizations is None:
                debug_visualizations = self.debug_visualizations

            vine_frame_sets = render_vine_frame_sets(
                frames_np,
                bboxes,
                cat_label_lookup,
                unary_lookup,
                binary_lookup,
                visualization_data.get("sam_masks"),
            )

            vine_visuals: Dict[str, Dict[str, Any]] = {}
            final_frames = vine_frame_sets.get("all", [])
            if final_frames:
                final_entry: Dict[str, Any] = {"frames": final_frames, "video_path": None}
                final_dir = self._prepare_visualization_dir("all", enabled=self.visualize)
                final_entry["video_path"] = self._create_temp_video(
                    np.stack(final_frames, axis=0),
                    base_dir=final_dir,
                    prefix="all_visualization"
                )
                vine_visuals["all"] = final_entry

            if debug_visualizations:
                sam_masks = visualization_data.get("sam_masks")
                if sam_masks:
                    sam_frames = render_sam_frames(frames_np, sam_masks, visualization_data.get("dino_labels"))
                    sam_entry = {"frames": sam_frames, "video_path": None}
                    if sam_frames:
                        sam_dir = self._prepare_visualization_dir("sam", enabled=self.visualize)
                        sam_entry["video_path"] = self._create_temp_video(
                            np.stack(sam_frames, axis=0),
                            base_dir=sam_dir,
                            prefix="sam_visualization"
                        )
                    visualizations["sam"] = sam_entry

                dino_labels = visualization_data.get("dino_labels")
                if dino_labels:
                    dino_frames = render_dino_frames(frames_np, bboxes, dino_labels)
                    dino_entry = {"frames": dino_frames, "video_path": None}
                    if dino_frames:
                        dino_dir = self._prepare_visualization_dir("dino", enabled=self.visualize)
                        dino_entry["video_path"] = self._create_temp_video(
                            np.stack(dino_frames, axis=0),
                            base_dir=dino_dir,
                            prefix="dino_visualization"
                        )
                    visualizations["dino"] = dino_entry

                for name in ("object", "unary", "binary"):
                    frames_list = vine_frame_sets.get(name, [])
                    entry: Dict[str, Any] = {"frames": frames_list, "video_path": None}
                    if frames_list:
                        vine_dir = self._prepare_visualization_dir(name, enabled=self.visualize)
                        entry["video_path"] = self._create_temp_video(
                            np.stack(frames_list, axis=0),
                            base_dir=vine_dir,
                            prefix=f"{name}_visualization"
                        )
                    vine_visuals[name] = entry

            if vine_visuals:
                visualizations["vine"] = vine_visuals

            if visualizations:
                results["visualizations"] = visualizations

        return results

    def _generate_summary(self, model_outputs: Dict[str, Any]) -> Dict[str, Any]:
        """Generate a summary of the predictions."""
        categorical_preds = model_outputs.get("categorical_predictions", {})
        unary_preds = model_outputs.get("unary_predictions", {})
        binary_preds = model_outputs.get("binary_predictions", {})

        summary = {
            "num_objects_detected": len(categorical_preds),
            "num_unary_predictions": len(unary_preds),
            "num_binary_predictions": len(binary_preds),
            "top_categories": [],
            "top_actions": [],
            "top_relations": []
        }

        # Extract top categories
        all_categories = []
        for obj_preds in categorical_preds.values():
            if obj_preds:
                all_categories.extend(obj_preds)

        if all_categories:
            sorted_categories = sorted(all_categories, reverse=True)
            summary["top_categories"] = [(cat, prob) for prob, cat in sorted_categories[:3]]

        # Extract top actions
        all_actions = []
        for action_preds in unary_preds.values():
            if action_preds:
                all_actions.extend(action_preds)

        if all_actions:
            sorted_actions = sorted(all_actions, reverse=True)
            summary["top_actions"] = [(act, prob) for prob, act in sorted_actions[:3]]

        # Extract top relations
        all_relations = []
        for rel_preds in binary_preds.values():
            if rel_preds:
                all_relations.extend(rel_preds)

        if all_relations:
            sorted_relations = sorted(all_relations, reverse=True)
            summary["top_relations"] = [(rel, prob) for prob, rel in sorted_relations[:3]]

        return summary
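The `postprocess` method above returns a nested dictionary: flat prediction maps, a `summary` block, and an optional `visualizations` block in which `sam` and `dino` entries are flat `{"frames", "video_path"}` dicts while `vine` nests per-view entries (`all`, `object`, `unary`, `binary`). The following is a minimal sketch of consuming such an output; the `results` value and the helper name `iter_video_paths` are illustrative, not part of the deleted pipeline.

```python
# Sketch only: `results` stands in for the dict returned by VinePipeline.postprocess.
results = {
    "summary": {
        "num_objects_detected": 2,
        "top_categories": [("person", 0.91)],
        "top_actions": [],
        "top_relations": [],
    },
    "visualizations": {
        "vine": {"all": {"frames": [], "video_path": "/tmp/all_visualization_demo.mp4"}},
    },
}

def iter_video_paths(visualizations):
    """Yield (name, video_path) pairs from both flat and nested visualization entries."""
    for group, entry in visualizations.items():
        if "video_path" in entry:              # flat entries such as "sam" / "dino"
            yield group, entry["video_path"]
        else:                                  # nested entries such as "vine" -> {"all": {...}}
            for name, sub in entry.items():
                yield f"{group}/{name}", sub.get("video_path")

print("objects detected:", results["summary"]["num_objects_detected"])
for label, prob in results["summary"]["top_categories"]:
    print(f"  {label}: {prob:.2f}")
for name, path in iter_video_paths(results.get("visualizations", {})):
    print(f"{name} video -> {path}")
```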
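Both mask-generation paths in the pipeline above derive bounding boxes from binary masks the same way: take the min/max of the nonzero row and column indices. A standalone sketch of that conversion (the function name `mask_to_xyxy` is hypothetical, chosen only for illustration):

```python
import numpy as np

def mask_to_xyxy(mask: np.ndarray):
    """Return [x1, y1, x2, y2] for a 2D binary mask, or None if the mask is empty."""
    coords = np.where(mask > 0)
    if len(coords[0]) == 0:
        return None
    y1, y2 = coords[0].min(), coords[0].max()
    x1, x2 = coords[1].min(), coords[1].max()
    return [int(x1), int(y1), int(x2), int(y2)]

# Example: a 3x5 mask with a 2x2 foreground patch yields [2, 1, 3, 2].
demo = np.zeros((3, 5), dtype=np.uint8)
demo[1:3, 2:4] = 1
print(mask_to_xyxy(demo))
```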
src/vine_hf/vis_utils.py
DELETED
@@ -1,941 +0,0 @@
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import random
import math
from matplotlib.patches import Rectangle
import itertools
from typing import Any, Dict, List, Tuple, Optional, Union

from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox

########################################################################################
##########               Visualization Library                                 ########
########################################################################################
# This module renders SAM masks, GroundingDINO boxes, and VINE predictions.
#
# Conventions (RGB frames, pixel coords):
# - Frames: list[np.ndarray] with shape (H, W, 3) in RGB, or np.ndarray with shape (T, H, W, 3).
# - Masks: 2D boolean arrays (H, W) or tensors convertible to that; (H, W, 1) is also accepted.
# - BBoxes: (x1, y1, x2, y2) integer pixel coordinates with x2 > x1 and y2 > y1.
#
# Per-frame stores use one of:
# - Dict[int(frame_id) -> Dict[int(obj_id) -> value]]
# - List indexed by frame_id (each item may be a dict of obj_id->value or a list in order)
#
# Renderer inputs/outputs:
# 1) render_sam_frames(frames, sam_masks, dino_labels=None) -> List[np.ndarray]
#    - sam_masks: Dict[frame_id, Dict[obj_id, Mask]] or a list; Mask can be np.ndarray or torch.Tensor.
#    - dino_labels: Optional Dict[obj_id, str] to annotate boxes derived from masks.
#
# 2) render_dino_frames(frames, bboxes, dino_labels=None) -> List[np.ndarray]
#    - bboxes: Dict[frame_id, Dict[obj_id, Sequence[float]]] or a list; each bbox as [x1, y1, x2, y2].
#
# 3) render_vine_frames(frames, bboxes, cat_label_lookup, unary_lookup, binary_lookup, masks=None)
#    -> List[np.ndarray] (the "all" view)
#    - cat_label_lookup: Dict[obj_id, (label: str, prob: float)]
#    - unary_lookup: Dict[frame_id, Dict[obj_id, List[(prob: float, label: str)]]]
#    - binary_lookup: Dict[frame_id, List[((sub_id: int, obj_id: int), List[(prob: float, relation: str)])]]
#    - masks: Optional; same structure as sam_masks, used for translucent overlays when unary labels exist.
#
# Ground-truth helpers used by plotting utilities:
# - For a single frame, gt_relations is represented as List[(subject_label, object_label, relation_label)].
#
# All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
########################################################################################

def clean_label(label):
    """Replace underscores and slashes with spaces for uniformity."""
    return label.replace("_", " ").replace("/", " ")

# Should be performed somewhere else I believe
def format_cate_preds(cate_preds):
    # Group object predictions from the model output.
    obj_pred_dict = {}
    for (oid, label), prob in cate_preds.items():
        # Clean the predicted label as well.
        clean_pred = clean_label(label)
        if oid not in obj_pred_dict:
            obj_pred_dict[oid] = []
        obj_pred_dict[oid].append((clean_pred, prob))
    for oid in obj_pred_dict:
        obj_pred_dict[oid].sort(key=lambda x: x[1], reverse=True)
    return obj_pred_dict

def format_binary_cate_preds(binary_preds):
    frame_binary_preds = []
    for key, score in binary_preds.items():
        # Expect key format: (frame_id, (subject, object), predicted_relation)
        try:
            f_id, (subj, obj), pred_rel = key
            frame_binary_preds.append((f_id, subj, obj, pred_rel, score))
        except Exception as e:
            print("Skipping key with unexpected format:", key)
            continue
    frame_binary_preds.sort(key=lambda x: x[3], reverse=True)
    return frame_binary_preds

_FONT = cv2.FONT_HERSHEY_SIMPLEX


def _to_numpy_mask(mask: Union[np.ndarray, torch.Tensor, None]) -> Optional[np.ndarray]:
    if mask is None:
        return None
    if isinstance(mask, torch.Tensor):
        mask_np = mask.detach().cpu().numpy()
    else:
        mask_np = np.asarray(mask)
    if mask_np.ndim == 0:
        return None
    if mask_np.ndim == 3:
        mask_np = np.squeeze(mask_np)
    if mask_np.ndim != 2:
        return None
    if mask_np.dtype == bool:
        return mask_np
    return mask_np > 0


def _sanitize_bbox(bbox: Union[List[float], Tuple[float, ...], None], width: int, height: int) -> Optional[Tuple[int, int, int, int]]:
    if bbox is None:
        return None
    if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
        x1, y1, x2, y2 = [float(b) for b in bbox[:4]]
    elif isinstance(bbox, np.ndarray) and bbox.size >= 4:
        x1, y1, x2, y2 = [float(b) for b in bbox.flat[:4]]
    else:
        return None
    x1 = int(np.clip(round(x1), 0, width - 1))
    y1 = int(np.clip(round(y1), 0, height - 1))
    x2 = int(np.clip(round(x2), 0, width - 1))
    y2 = int(np.clip(round(y2), 0, height - 1))
    if x2 <= x1 or y2 <= y1:
        return None
    return (x1, y1, x2, y2)


def _object_color_bgr(obj_id: int) -> Tuple[int, int, int]:
    color = get_color(obj_id)
    rgb = [int(np.clip(c, 0.0, 1.0) * 255) for c in color[:3]]
    return (rgb[2], rgb[1], rgb[0])


def _background_color(color: Tuple[int, int, int]) -> Tuple[int, int, int]:
    return tuple(int(0.25 * 255 + 0.75 * channel) for channel in color)


def _draw_label_block(
    image: np.ndarray,
    lines: List[str],
    anchor: Tuple[int, int],
    color: Tuple[int, int, int],
    font_scale: float = 0.5,
    thickness: int = 1,
    direction: str = "up",
) -> None:
    if not lines:
        return
    img_h, img_w = image.shape[:2]
    x, y = anchor
    x = int(np.clip(x, 0, img_w - 1))
    y_cursor = int(np.clip(y, 0, img_h - 1))
    bg_color = _background_color(color)

    if direction == "down":
        for text in lines:
            text = str(text)
            (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
            left_x = x
            right_x = min(left_x + tw + 8, img_w - 1)
            top_y = int(np.clip(y_cursor + 6, 0, img_h - 1))
            bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
            if bottom_y <= top_y:
                break
            cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
            text_x = left_x + 4
            text_y = min(bottom_y - baseline - 2, img_h - 1)
            cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
            y_cursor = bottom_y
    else:
        for text in lines:
            text = str(text)
            (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
            top_y = max(y_cursor - th - baseline - 6, 0)
            left_x = x
            right_x = min(left_x + tw + 8, img_w - 1)
            bottom_y = min(top_y + th + baseline + 6, img_h - 1)
            cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
            text_x = left_x + 4
            text_y = min(bottom_y - baseline - 2, img_h - 1)
            cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
            y_cursor = top_y


def _draw_centered_label(
    image: np.ndarray,
    text: str,
    center: Tuple[int, int],
    color: Tuple[int, int, int],
    font_scale: float = 0.5,
    thickness: int = 1,
) -> None:
    text = str(text)
    img_h, img_w = image.shape[:2]
    (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
    cx = int(np.clip(center[0], 0, img_w - 1))
    cy = int(np.clip(center[1], 0, img_h - 1))
    left_x = int(np.clip(cx - tw // 2 - 4, 0, img_w - 1))
    top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
    right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
    bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
    cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), _background_color(color), -1)
    text_x = left_x + 4
    text_y = min(bottom_y - baseline - 2, img_h - 1)
    cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)


def _extract_frame_entities(store: Union[Dict[int, Dict[int, Any]], List, None], frame_idx: int) -> Dict[int, Any]:
    if isinstance(store, dict):
        frame_entry = store.get(frame_idx, {})
    elif isinstance(store, list) and 0 <= frame_idx < len(store):
        frame_entry = store[frame_idx]
    else:
        frame_entry = {}
    if isinstance(frame_entry, dict):
        return frame_entry
    if isinstance(frame_entry, list):
        return {i: value for i, value in enumerate(frame_entry)}
    return {}


def _label_anchor_and_direction(
    bbox: Tuple[int, int, int, int],
    position: str,
) -> Tuple[Tuple[int, int], str]:
    x1, y1, x2, y2 = bbox
    if position == "bottom":
        return (x1, y2), "down"
    return (x1, y1), "up"


def _draw_bbox_with_label(
    image: np.ndarray,
    bbox: Tuple[int, int, int, int],
    obj_id: int,
    title: Optional[str] = None,
    sub_lines: Optional[List[str]] = None,
    label_position: str = "top",
) -> None:
    color = _object_color_bgr(obj_id)
    cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
    head = title if title else f"#{obj_id}"
    if not head.startswith("#"):
        head = f"#{obj_id} {head}"
    lines = [head]
    if sub_lines:
        lines.extend(sub_lines)
    anchor, direction = _label_anchor_and_direction(bbox, label_position)
    _draw_label_block(image, lines, anchor, color, direction=direction)


def render_sam_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    sam_masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None],
    dino_labels: Optional[Dict[int, str]] = None,
) -> List[np.ndarray]:
    results: List[np.ndarray] = []
    frames_iterable = frames if isinstance(frames, list) else list(frames)
    dino_labels = dino_labels or {}

    for frame_idx, frame in enumerate(frames_iterable):
        if frame is None:
            continue
        frame_rgb = np.asarray(frame)
        frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        overlay = frame_bgr.astype(np.float32)
        masks_for_frame = _extract_frame_entities(sam_masks, frame_idx)

        for obj_id, mask in masks_for_frame.items():
            mask_np = _to_numpy_mask(mask)
            if mask_np is None or not np.any(mask_np):
                continue
            color = _object_color_bgr(obj_id)
            alpha = 0.45
            overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(color, dtype=np.float32)

        annotated = np.clip(overlay, 0, 255).astype(np.uint8)
        frame_h, frame_w = annotated.shape[:2]

        for obj_id, mask in masks_for_frame.items():
            mask_np = _to_numpy_mask(mask)
            if mask_np is None or not np.any(mask_np):
                continue
            bbox = mask_to_bbox(mask_np)
            bbox = _sanitize_bbox(bbox, frame_w, frame_h)
            if not bbox:
                continue
            label = dino_labels.get(obj_id)
            title = f"{label}" if label else None
            _draw_bbox_with_label(annotated, bbox, obj_id, title=title)

        results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))

    return results


def render_dino_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    dino_labels: Optional[Dict[int, str]] = None,
) -> List[np.ndarray]:
    results: List[np.ndarray] = []
    frames_iterable = frames if isinstance(frames, list) else list(frames)
    dino_labels = dino_labels or {}

    for frame_idx, frame in enumerate(frames_iterable):
        if frame is None:
            continue
        frame_rgb = np.asarray(frame)
        annotated = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        frame_h, frame_w = annotated.shape[:2]
        frame_bboxes = _extract_frame_entities(bboxes, frame_idx)

        for obj_id, bbox_values in frame_bboxes.items():
            bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
            if not bbox:
                continue
            label = dino_labels.get(obj_id)
            title = f"{label}" if label else None
            _draw_bbox_with_label(annotated, bbox, obj_id, title=title)

        results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))

    return results


def render_vine_frame_sets(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
) -> Dict[str, List[np.ndarray]]:
    frame_groups: Dict[str, List[np.ndarray]] = {
        "object": [],
        "unary": [],
        "binary": [],
        "all": [],
    }
    frames_iterable = frames if isinstance(frames, list) else list(frames)

    for frame_idx, frame in enumerate(frames_iterable):
        if frame is None:
            continue
        frame_rgb = np.asarray(frame)
        base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        frame_h, frame_w = base_bgr.shape[:2]
        frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
        frame_masks = _extract_frame_entities(masks, frame_idx) if masks is not None else {}

        objects_bgr = base_bgr.copy()
        unary_bgr = base_bgr.copy()
        binary_bgr = base_bgr.copy()
        all_bgr = base_bgr.copy()

        bbox_lookup: Dict[int, Tuple[int, int, int, int]] = {}
        unary_lines_lookup: Dict[int, List[str]] = {}
        titles_lookup: Dict[int, Optional[str]] = {}

        for obj_id, bbox_values in frame_bboxes.items():
            bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
            if not bbox:
                continue
            bbox_lookup[obj_id] = bbox
            cat_label, cat_prob = cat_label_lookup.get(obj_id, (None, None))
            title_parts = []
            if cat_label:
                if cat_prob is not None:
                    title_parts.append(f"{cat_label} {cat_prob:.2f}")
                else:
                    title_parts.append(cat_label)
            titles_lookup[obj_id] = " ".join(title_parts) if title_parts else None
            unary_preds = unary_lookup.get(frame_idx, {}).get(obj_id, [])
            unary_lines = [f"{label} {prob:.2f}" for prob, label in unary_preds]
            unary_lines_lookup[obj_id] = unary_lines

        for obj_id, bbox in bbox_lookup.items():
            unary_lines = unary_lines_lookup.get(obj_id, [])
            if not unary_lines:
                continue
            mask_raw = frame_masks.get(obj_id)
            mask_np = _to_numpy_mask(mask_raw)
            if mask_np is None or not np.any(mask_np):
                continue
            color = np.array(_object_color_bgr(obj_id), dtype=np.float32)
            alpha = 0.45
            for target in (unary_bgr, all_bgr):
                target_vals = target[mask_np].astype(np.float32)
                blended = (1.0 - alpha) * target_vals + alpha * color
                target[mask_np] = np.clip(blended, 0, 255).astype(np.uint8)

        for obj_id, bbox in bbox_lookup.items():
            title = titles_lookup.get(obj_id)
            unary_lines = unary_lines_lookup.get(obj_id, [])
            _draw_bbox_with_label(objects_bgr, bbox, obj_id, title=title, label_position="top")
            _draw_bbox_with_label(unary_bgr, bbox, obj_id, title=title, label_position="top")
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(unary_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
            _draw_bbox_with_label(binary_bgr, bbox, obj_id, title=title, label_position="top")
            _draw_bbox_with_label(all_bgr, bbox, obj_id, title=title, label_position="top")
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(all_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)

        for obj_pair, relation_preds in binary_lookup.get(frame_idx, []):
            if len(obj_pair) != 2 or not relation_preds:
                continue
            subj_id, obj_id = obj_pair
            subj_bbox = bbox_lookup.get(subj_id)
            obj_bbox = bbox_lookup.get(obj_id)
            if not subj_bbox or not obj_bbox:
                continue
            start, end = relation_line(subj_bbox, obj_bbox)
            color = tuple(int(c) for c in np.clip(
                (np.array(_object_color_bgr(subj_id), dtype=np.float32) +
                 np.array(_object_color_bgr(obj_id), dtype=np.float32)) / 2.0,
                0, 255
            ))
            prob, relation = relation_preds[0]
            label_text = f"{relation} {prob:.2f}"
            mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
            cv2.line(binary_bgr, start, end, color, 6, cv2.LINE_AA)
            cv2.line(all_bgr, start, end, color, 6, cv2.LINE_AA)
            _draw_centered_label(binary_bgr, label_text, mid_point, color)
            _draw_centered_label(all_bgr, label_text, mid_point, color)

        frame_groups["object"].append(cv2.cvtColor(objects_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["unary"].append(cv2.cvtColor(unary_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["binary"].append(cv2.cvtColor(binary_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["all"].append(cv2.cvtColor(all_bgr, cv2.COLOR_BGR2RGB))

    return frame_groups


def render_vine_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
) -> List[np.ndarray]:
    return render_vine_frame_sets(
        frames,
        bboxes,
        cat_label_lookup,
        unary_lookup,
        binary_lookup,
        masks,
    ).get("all", [])

def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
    all_colors = []
    all_texts = []
    for (obj_id, bbox, gt_label) in gt_labels:
        preds = obj_pred_dict.get(obj_id, [])
        if len(preds) == 0:
            top1 = "N/A"
            box_color = (0, 0, 255)  # bright red if no prediction
        else:
            top1, prob1 = preds[0]
            topk_labels = [p[0] for p in preds[:topk_object]]
            # Compare cleaned labels.
            if top1.lower() == gt_label.lower():
                box_color = (0, 255, 0)  # bright green for correct
            elif gt_label.lower() in [p.lower() for p in topk_labels]:
                box_color = (0, 165, 255)  # bright orange for partial match
            else:
                box_color = (0, 0, 255)  # bright red for incorrect

        label_text = f"ID:{obj_id}/P:{top1}/GT:{gt_label}"
        all_colors.append(box_color)
        all_texts.append(label_text)
    return all_colors, all_texts

def plot_unary(frame_img, gt_labels, all_colors, all_texts):

    for (obj_id, bbox, gt_label), box_color, label_text in zip(gt_labels, all_colors, all_texts):
        x1, y1, x2, y2 = map(int, bbox)
        cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
        (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(frame_img, (x1, y1 - th - baseline - 4), (x1 + tw, y1), box_color, -1)
        cv2.putText(frame_img, label_text, (x1, y1 - 2), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 0, 0), 1, cv2.LINE_AA)

    return frame_img

def get_white_pane(pane_height,
                   pane_width=600,
                   header_height = 50,
                   header_font = cv2.FONT_HERSHEY_SIMPLEX,
                   header_font_scale = 0.7,
                   header_thickness = 2,
                   header_color = (0, 0, 0)):
    # Create an expanded white pane to display text info.
    white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)

    # --- Adjust pane split: make predictions column wider (60% vs. 40%) ---
    left_width = int(pane_width * 0.6)
    right_width = pane_width - left_width
    left_pane = white_pane[:, :left_width, :].copy()
    right_pane = white_pane[:, left_width:, :].copy()

    cv2.putText(left_pane, "Binary Predictions", (10, header_height - 30),
                header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
    cv2.putText(right_pane, "Ground Truth", (10, header_height - 30),
                header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)

    return white_pane

# This is for ploting binary prediction results with frame-based scene graphs
def plot_binary_sg(frame_img,
                   white_pane,
                   bin_preds,
                   gt_relations,
                   topk_binary,
                   header_height=50,
                   indicator_size=20,
                   pane_width=600):
    # Leave vertical space for the headers.
    line_height = 30  # vertical spacing per line
    x_text = 10  # left margin for text
    y_text_left = header_height + 10  # starting y for left pane text
    y_text_right = header_height + 10  # starting y for right pane text

    # Left section: top-k binary predictions.
    left_width = int(pane_width * 0.6)
    right_width = pane_width - left_width
    left_pane = white_pane[:, :left_width, :].copy()
    right_pane = white_pane[:, left_width:, :].copy()

    for (subj, pred_rel, obj, score) in bin_preds[:topk_binary]:
        correct = any((subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
                      for gt in gt_relations)
        indicator_color = (0, 255, 0) if correct else (0, 0, 255)
        cv2.rectangle(left_pane, (x_text, y_text_left - indicator_size + 5),
                      (x_text + indicator_size, y_text_left + 5), indicator_color, -1)
        text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
        cv2.putText(left_pane, text, (x_text + indicator_size + 5, y_text_left + 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
        y_text_left += line_height

    # Right section: ground truth binary relations.
    for gt in gt_relations:
        if len(gt) != 3:
            continue
        text = f"{gt[0]} - {gt[2]} - {gt[1]}"
        cv2.putText(right_pane, text, (x_text, y_text_right + 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
        y_text_right += line_height

    # Combine the two text panes and then with the frame image.
    combined_pane = np.hstack((left_pane, right_pane))
    combined_image = np.hstack((frame_img, combined_pane))
    return combined_image

def visualized_frame(frame_img,
                     bboxes,
                     object_ids,
                     gt_labels,
                     cate_preds,
                     binary_preds,
                     gt_relations,
                     topk_object,
                     topk_binary,
                     phase="unary"):

    """Return the combined annotated frame for frame index i as an image (in BGR)."""
    # Get the frame image (assuming batched_data['batched_reshaped_raw_videos'] is a list of frames)

    # --- Process Object Predictions (for overlaying bboxes) ---
    if phase == "unary":
        objs = []
        for ((_, f_id, obj_id), bbox, gt_label) in zip(object_ids, bboxes, gt_labels):
            gt_label = clean_label(gt_label)
            objs.append((obj_id, bbox, gt_label))

        formatted_cate_preds = format_cate_preds(cate_preds)
        all_colors, all_texts = color_for_cate_correctness(formatted_cate_preds, gt_labels, topk_object)
        updated_frame_img = plot_unary(frame_img, gt_labels, all_colors, all_texts)
        return updated_frame_img

    else:
        # --- Process Binary Predictions & Ground Truth for the Text Pane ---
        formatted_binary_preds = format_binary_cate_preds(binary_preds)

        # Ground truth binary relations for the frame.
        # Clean ground truth relations.
        gt_relations = [(clean_label(str(s)), clean_label(str(o)), clean_label(rel)) for s, o, rel in gt_relations]

        pane_width = 600  # increased pane width for more horizontal space
        pane_height = frame_img.shape[0]

        # --- Add header labels to each text pane with extra space ---
        header_height = 50  # increased header space
        white_pane = get_white_pane(pane_height, pane_width, header_height=header_height)

        combined_image = plot_binary_sg(frame_img, white_pane, formatted_binary_preds, gt_relations, topk_binary)

        return combined_image

def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
    # Ensure mask is a numpy array
    mask = np.array(mask)
    # Handle different mask shapes
    if mask.ndim == 3:
        # (1, H, W) -> (H, W)
        if mask.shape[0] == 1:
            mask = mask.squeeze(0)
        # (H, W, 1) -> (H, W)
        elif mask.shape[2] == 1:
            mask = mask.squeeze(2)
    # Now mask should be (H, W)
    assert mask.ndim == 2, f"Mask must be 2D after squeezing, got shape {mask.shape}"

    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
    else:
        cmap = plt.get_cmap("gist_rainbow")
        cmap_idx = 0 if obj_id is None else obj_id
        color = list(cmap((cmap_idx * 47) % 256))
        color[3] = 0.5
        color = np.array(color)

    # Expand mask to (H, W, 1) for broadcasting
    mask_expanded = mask[..., None]
    mask_image = mask_expanded * color.reshape(1, 1, -1)

    # draw a box around the mask with the det_class as the label
    if not det_class is None:
        # Find the bounding box coordinates
        y_indices, x_indices = np.where(mask > 0)
        if y_indices.size > 0 and x_indices.size > 0:
            x_min, x_max = x_indices.min(), x_indices.max()
            y_min, y_max = y_indices.min(), y_indices.max()
            rect = Rectangle(
                (x_min, y_min),
                x_max - x_min,
                y_max - y_min,
                linewidth=1.5,
                edgecolor=color[:3],
                facecolor="none",
                alpha=color[3]
            )
            ax.add_patch(rect)
            ax.text(
                x_min,
                y_min - 5,
                f"{det_class}",
                color="white",
                fontsize=6,
                backgroundcolor=np.array(color),
                alpha=1
            )
    ax.imshow(mask_image)

def save_mask_one_image(frame_image, masks, save_path):
    """Render masks on top of a frame and store the visualization on disk."""
    fig, ax = plt.subplots(1, figsize=(6, 6))

    frame_np = (
        frame_image.detach().cpu().numpy()
        if torch.is_tensor(frame_image)
        else np.asarray(frame_image)
    )
    frame_np = np.ascontiguousarray(frame_np)

    if isinstance(masks, dict):
        mask_iter = masks.items()
    else:
        mask_iter = enumerate(masks)

    prepared_masks = {
        obj_id: (
            mask.detach().cpu().numpy()
            if torch.is_tensor(mask)
            else np.asarray(mask)
        )
        for obj_id, mask in mask_iter
    }

    ax.imshow(frame_np)
    ax.axis("off")

    for obj_id, mask_np in prepared_masks.items():
        show_mask(mask_np, ax, obj_id=obj_id, det_class=None, random_color=False)

    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    return save_path

def get_video_masks_visualization(video_tensor,
                                  video_masks,
                                  video_id,
                                  video_save_base_dir,
                                  oid_class_pred=None,
                                  sample_rate = 1):

    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        if frame_id not in video_masks:
            print("No mask for Frame", frame_id)
            continue

        masks = video_masks[frame_id]
        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        get_mask_one_image(image, masks, oid_class_pred)

def get_mask_one_image(frame_image, masks, oid_class_pred=None):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis('off')

    if type(masks) == list:
        masks = {i: m for i, m in enumerate(masks)}

    # Add the masks
    for obj_id, mask in masks.items():
        det_class = f"{obj_id}. {oid_class_pred[obj_id]}" if not oid_class_pred is None else None
        show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)

    # Show the plot
    return fig, ax

def save_video(frames, output_filename, output_fps):

    # --- Create a video from all frames ---
    num_frames = len(frames)
    frame_h, frame_w = frames.shape[:2]

    # Use a codec supported by VS Code (H.264 via 'avc1').
    fourcc = cv2.VideoWriter_fourcc(*'avc1')
    out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))

    print(f"Processing {num_frames} frames...")
    for i in range(num_frames):
        vis_frame = get_visualized_frame(i)
        out.write(vis_frame)
        if i % 10 == 0:
            print(f"Processed frame {i+1}/{num_frames}")

    out.release()
    print(f"Video saved as {output_filename}")


def list_depth(lst):
    """Calculates the depth of a nested list."""
    if not (isinstance(lst, list) or isinstance(lst, torch.Tensor)):
        return 0
    elif (isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])) or (isinstance(lst, list) and len(lst) == 0):
        return 1
    else:
        return 1 + max(list_depth(item) for item in lst)

def normalize_prompt(points, labels):
    if list_depth(points) == 3:
        points = torch.stack([p.unsqueeze(0) for p in points])
        labels = torch.stack([l.unsqueeze(0) for l in labels])
    return points, labels


def show_box(box, ax, object_id):
    if len(box) == 0:
        return

    cmap = plt.get_cmap("gist_rainbow")
    cmap_idx = 0 if object_id is None else object_id
    color = list(cmap((cmap_idx * 47) % 256))

    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0,0,0,0), lw=2))

def show_points(coords, labels, ax, object_id=None, marker_size=375):
    if len(labels) == 0:
        return

    pos_points = coords[labels==1]
    neg_points = coords[labels==0]

    cmap = plt.get_cmap("gist_rainbow")
    cmap_idx = 0 if object_id is None else object_id
    color = list(cmap((cmap_idx * 47) % 256))

    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='P', s=marker_size, edgecolor=color, linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='s', s=marker_size, edgecolor=color, linewidth=1.25)

def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis('off')

    points, labels = normalize_prompt(points, labels)
    if type(boxes) == torch.Tensor:
        for object_id, box in enumerate(boxes):
            # Add the bounding boxes
            if not box is None:
                show_box(box.cpu(), ax, object_id=object_id)
    elif type(boxes) == dict:
        for object_id, box in boxes.items():
            # Add the bounding boxes
            if not box is None:
                show_box(box.cpu(), ax, object_id=object_id)
    elif type(boxes) == list and len(boxes) == 0:
        pass
    else:
        raise Exception()

    for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
        if not len(point_ls) == 0:
            show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)

    # Show the plot
    plt.savefig(save_path)
    plt.close()

def save_video_prompts_visualization(video_tensor, video_boxes, video_points, video_labels, video_id, video_save_base_dir):
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        boxes, points, labels = [], [], []

        if frame_id in video_boxes:
            boxes = video_boxes[frame_id]

        if frame_id in video_points:
            points = video_points[frame_id]
        if frame_id in video_labels:
            labels = video_labels[frame_id]

        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_prompts_one_image(image, boxes, points, labels, save_path)


def save_video_masks_visualization(video_tensor, video_masks, video_id, video_save_base_dir, oid_class_pred=None, sample_rate = 1):
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        if random.random() > sample_rate:
            continue
        if frame_id not in video_masks:
            print("No mask for Frame", frame_id)
            continue
        masks = video_masks[frame_id]
        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_mask_one_image(image, masks, save_path)


def get_color(obj_id, cmap_name="gist_rainbow",alpha=0.5):
    cmap = plt.get_cmap(cmap_name)
    cmap_idx = 0 if obj_id is None else obj_id
    color = list(cmap((cmap_idx * 47) % 256))
    color[3] = 0.5
    color = np.array(color)
    return color


def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
    return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)


def relation_line(
    bbox1: Tuple[int, int, int, int],
    bbox2: Tuple[int, int, int, int],
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Returns integer pixel centers suitable for drawing a relation line. For
    coincident boxes, nudges the target center to ensure the segment has span.
    """
    center1 = _bbox_center(bbox1)
    center2 = _bbox_center(bbox2)
    if math.isclose(center1[0], center2[0], abs_tol=1e-3) and math.isclose(center1[1], center2[1], abs_tol=1e-3):
        offset = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
        center2 = (center2[0] + offset, center2[1])
    start = (int(round(center1[0])), int(round(center1[1])))
    end = (int(round(center2[0])), int(round(center2[1])))
    if start == end:
        end = (end[0] + 1, end[1])
    return start, end

def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis('off')

    all_objs_to_show = set()
    all_lines_to_show = []

    # print(rel_pred_ls[0])
    for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
        all_objs_to_show.add(from_obj_id)
        all_objs_to_show.add(to_obj_id)

        from_mask = masks[from_obj_id]
        bbox1 = mask_to_bbox(from_mask)
        to_mask = masks[to_obj_id]
        bbox2 = mask_to_bbox(to_mask)

        c1, c2 = shortest_line_between_bboxes(bbox1, bbox2)
| 910 |
-
|
| 911 |
-
line_color = get_color(from_obj_id)
|
| 912 |
-
face_color = get_color(to_obj_id)
|
| 913 |
-
line = c1, c2, face_color, line_color, rel_text
|
| 914 |
-
all_lines_to_show.append(line)
|
| 915 |
-
|
| 916 |
-
masks_to_show = {}
|
| 917 |
-
for oid in all_objs_to_show:
|
| 918 |
-
masks_to_show[oid] = masks[oid]
|
| 919 |
-
|
| 920 |
-
# Add the masks
|
| 921 |
-
for obj_id, mask in masks_to_show.items():
|
| 922 |
-
show_mask(mask, ax, obj_id=obj_id, random_color=False)
|
| 923 |
-
|
| 924 |
-
for (from_pt_x, from_pt_y), (to_pt_x, to_pt_y), face_color, line_color, rel_text in all_lines_to_show:
|
| 925 |
-
|
| 926 |
-
plt.plot([from_pt_x, to_pt_x], [from_pt_y, to_pt_y], color=line_color, linestyle='-', linewidth=3)
|
| 927 |
-
mid_pt_x = (from_pt_x + to_pt_x) / 2
|
| 928 |
-
mid_pt_y = (from_pt_y + to_pt_y) / 2
|
| 929 |
-
ax.text(
|
| 930 |
-
mid_pt_x - 5,
|
| 931 |
-
mid_pt_y,
|
| 932 |
-
rel_text,
|
| 933 |
-
color="white",
|
| 934 |
-
fontsize=6,
|
| 935 |
-
backgroundcolor=np.array(line_color),
|
| 936 |
-
bbox=dict(facecolor=face_color, edgecolor=line_color, boxstyle='round,pad=1'),
|
| 937 |
-
alpha=1
|
| 938 |
-
)
|
| 939 |
-
|
| 940 |
-
# Show the plot
|
| 941 |
-
return fig, ax
|
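As a reading aid only (not part of the commit), here is a self-contained sketch of what the deleted relation_line helper computes: the segment between two bounding-box centers, nudged so a relation line drawn between coincident boxes never collapses to a point. The demo_relation_line name is hypothetical.

import math

def bbox_center(bbox):
    # (x1, y1, x2, y2) -> geometric center
    return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)

def demo_relation_line(bbox1, bbox2):
    c1, c2 = bbox_center(bbox1), bbox_center(bbox2)
    # Coincident centers: nudge the target so the segment has nonzero span.
    if math.isclose(c1[0], c2[0], abs_tol=1e-3) and math.isclose(c1[1], c2[1], abs_tol=1e-3):
        c2 = (c2[0] + max(1.0, (bbox2[2] - bbox2[0]) * 0.05), c2[1])
    start = (int(round(c1[0])), int(round(c1[1])))
    end = (int(round(c2[0])), int(round(c2[1])))
    if start == end:
        end = (end[0] + 1, end[1])
    return start, end

print(demo_relation_line((0, 0, 10, 10), (20, 0, 30, 10)))  # ((5, 5), (25, 5))
print(demo_relation_line((0, 0, 10, 10), (0, 0, 10, 10)))   # ((5, 5), (6, 5))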
test_vine.py
CHANGED
@@ -8,8 +8,11 @@ import torch
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-# Add src to path
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 # Determine device
 device = "cuda" if torch.cuda.is_available() else "cpu"

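The hunk above is the same bootstrap that app.py gained, and an almost identical snippet recurs in every vine_hf module below. As a sketch only (this helper does not exist in the commit; the add_src_to_path name and levels_up parameter are hypothetical), the pattern could be expressed once:

import sys
from pathlib import Path

def add_src_to_path(anchor: Path, levels_up: int = 0) -> None:
    """Prepend <anchor's directory>/(..)^levels_up/src to sys.path if it exists."""
    root = anchor.resolve().parent
    for _ in range(levels_up):
        root = root.parent
    src_dir = root / "src"
    if src_dir.is_dir() and str(src_dir) not in sys.path:
        sys.path.insert(0, str(src_dir))

# Top-level scripts (app.py, test_vine.py) would call add_src_to_path(Path(__file__));
# modules inside vine_hf/ would call add_src_to_path(Path(__file__), levels_up=1),
# which is why the diffs below use current_dir.parent / "src".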
vine_hf/__init__.py
CHANGED
@@ -2,13 +2,27 @@
 VINE HuggingFace Interface
 
 VINE (Video Understanding with Natural Language) is a model that processes videos
-along with categorical, unary, and binary keywords to return probability
+along with categorical, unary, and binary keywords to return probability
 distributions over those keywords for detected objects and their relationships.
 
 This package provides a HuggingFace-compatible interface for the VINE model,
 including configuration, model, and pipeline classes.
 """
 
+import sys
+from pathlib import Path
+
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
+# Add LASER directory to sys.path (laser module is inside src/LASER/)
+laser_dir = src_dir / "LASER"
+if laser_dir.is_dir() and str(laser_dir) not in sys.path:
+    sys.path.insert(0, str(laser_dir))
+
 from .vine_config import VineConfig
 from .vine_model import VineModel
 from .vine_pipeline import VinePipeline

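The extra LASER entry exists because the importable laser package sits one level deeper, at src/LASER/laser/. A minimal check of what the bootstrap is meant to achieve (the layout comments and the can_import helper below are illustrative assumptions, not files in this commit):

# Assumed repository layout, inferred from the path arithmetic above:
#   <repo>/
#     app.py, test_vine.py      -> add <repo>/src         (current_dir / "src")
#     vine_hf/__init__.py, ...  -> add <repo>/src         (current_dir.parent / "src")
#     src/
#       LASER/laser/...         -> needs <repo>/src/LASER on sys.path for `import laser`
#       GroundingDINO/, video-sam2/
import importlib.util

def can_import(name: str) -> bool:
    """Report whether a top-level module would resolve with the current sys.path."""
    return importlib.util.find_spec(name) is not None

# e.g. after `import vine_hf` has run the bootstrap:
# print(can_import("laser"), can_import("vine_hf"))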
vine_hf/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/vine_hf/__pycache__/__init__.cpython-310.pyc and b/vine_hf/__pycache__/__init__.cpython-310.pyc differ

vine_hf/__pycache__/vine_config.cpython-310.pyc
CHANGED
Binary files a/vine_hf/__pycache__/vine_config.cpython-310.pyc and b/vine_hf/__pycache__/vine_config.cpython-310.pyc differ

vine_hf/__pycache__/vine_model.cpython-310.pyc
CHANGED
Binary files a/vine_hf/__pycache__/vine_model.cpython-310.pyc and b/vine_hf/__pycache__/vine_model.cpython-310.pyc differ

vine_hf/convert_inference.py
CHANGED
@@ -7,12 +7,16 @@ to the new HuggingFace-compatible interface.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from typing import Dict, List, Tuple, Any
 
-# Add
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 from laser.loading import load_video

vine_hf/example_ensemble_weights.py
CHANGED
@@ -7,14 +7,18 @@ and use them with the HuggingFace interface, based on the actual inference.py wo
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from transformers.pipelines import PIPELINE_REGISTRY
 
 #os.environ["OPENAI_API_KEY"]="dummy-key" # Set your OpenAI API key here or via environment variable
 
-# Add
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 from laser.loading import load_video

vine_hf/example_sam2_masks.py
CHANGED
@@ -7,13 +7,16 @@ segmentation methods with the VINE model.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add
-
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 #Either uncomment the below or set a environemental key, though it isn't needed to run.
 #os.environ['OPENAI_API_KEY'] = 'dummy-key'

vine_hf/example_usage.py
CHANGED
@@ -7,12 +7,16 @@ for video understanding with categorical, unary, and binary keyword predictions.
 
 import os
 import sys
+from pathlib import Path
 import torch
 from transformers import pipeline, AutoModel
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 # Uncomment or set your own
 #os.environ['OPENAI_API_KEY'] = 'dummy-key'

vine_hf/example_visualization.py
CHANGED
@@ -5,6 +5,7 @@
 
 import os
 import sys
+from pathlib import Path
 import argparse
 import cv2
 import numpy as np
@@ -16,8 +17,11 @@ from transformers import pipeline
 # Set your OpenAI API key here or via environment variable
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-#
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf.vine_pipeline import VinePipeline  # https://github.com link not needed; local path used
 from vine_hf.vine_model import VineModel

vine_hf/example_with_pretrained_vine.py
CHANGED
@@ -7,6 +7,7 @@ from the ensemble format or from video-fm/vine_v0.
 
 import os
 import sys
+from pathlib import Path
 import torch
 from transformers import pipeline
 from transformers.pipelines import PIPELINE_REGISTRY
@@ -14,8 +15,11 @@ from transformers.pipelines import PIPELINE_REGISTRY
 # Set your OpenAI API key here or via environment variable
 #os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-# Add
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 

vine_hf/push_to_hub.py
CHANGED
@@ -7,13 +7,17 @@ for easy sharing and distribution.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import argparse
 from huggingface_hub import notebook_login
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 from vine_hf import VineConfig, VineModel, VinePipeline

vine_hf/push_to_video_fm.py
CHANGED
@@ -15,10 +15,11 @@ from transformers.pipelines import PIPELINE_REGISTRY
 from transformers import AutoModel
 from safetensors.torch import save_file
 
-# Add
-current_dir = Path(__file__).parent
-
-
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 

vine_hf/vine_model.py
CHANGED
@@ -1,7 +1,14 @@
 import os
 import sys
+from pathlib import Path
 from typing import Dict, List, Tuple, Optional, Any, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 import cv2
 import numpy as np
 import torch

vine_hf/vine_pipeline.py
CHANGED
@@ -1,10 +1,17 @@
 import os
+import sys
 import uuid
 import hashlib
 import tempfile
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional, Any, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 import cv2
 import numpy as np
 import torch

vine_hf/vis_utils.py
CHANGED
@@ -1,6 +1,8 @@
 import os
+import sys
+from pathlib import Path
 import cv2
-import numpy as np
+import numpy as np
 import matplotlib.pyplot as plt
 import torch
 import random
@@ -9,6 +11,12 @@ from matplotlib.patches import Rectangle
 import itertools
 from typing import Any, Dict, List, Tuple, Optional, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
 
 ########################################################################################