moqingyan123 committed
Commit f71f431 · 1 Parent(s): 19446f1
Files changed (46)
  1. app.py +6 -0
  2. src/vine_hf/OVERVIEW.md +0 -218
  3. src/vine_hf/README.md +0 -355
  4. src/vine_hf/__init__.py +0 -23
  5. src/vine_hf/__pycache__/__init__.cpython-310.pyc +0 -0
  6. src/vine_hf/__pycache__/flattening.cpython-310.pyc +0 -0
  7. src/vine_hf/__pycache__/vine_config.cpython-310.pyc +0 -0
  8. src/vine_hf/__pycache__/vine_model.cpython-310.pyc +0 -0
  9. src/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc +0 -0
  10. src/vine_hf/__pycache__/vis_utils.cpython-310.pyc +0 -0
  11. src/vine_hf/convert_inference.py +0 -288
  12. src/vine_hf/example_ensemble_weights.py +0 -333
  13. src/vine_hf/example_sam2_masks.py +0 -331
  14. src/vine_hf/example_usage.ipynb +0 -310
  15. src/vine_hf/example_usage.py +0 -283
  16. src/vine_hf/example_visualization.py +0 -146
  17. src/vine_hf/example_with_pretrained_vine.py +0 -287
  18. src/vine_hf/flattening.py +0 -124
  19. src/vine_hf/push_to_hub.py +0 -232
  20. src/vine_hf/setup.py +0 -63
  21. src/vine_hf/vine_config.py +0 -108
  22. src/vine_hf/vine_hf.egg-info/PKG-INFO +0 -401
  23. src/vine_hf/vine_hf.egg-info/SOURCES.txt +0 -21
  24. src/vine_hf/vine_hf.egg-info/dependency_links.txt +0 -1
  25. src/vine_hf/vine_hf.egg-info/entry_points.txt +0 -2
  26. src/vine_hf/vine_hf.egg-info/requires.txt +0 -16
  27. src/vine_hf/vine_hf.egg-info/top_level.txt +0 -1
  28. src/vine_hf/vine_model.py +0 -702
  29. src/vine_hf/vine_pipeline.py +0 -691
  30. src/vine_hf/vis_utils.py +0 -941
  31. test_vine.py +5 -2
  32. vine_hf/__init__.py +15 -1
  33. vine_hf/__pycache__/__init__.cpython-310.pyc +0 -0
  34. vine_hf/__pycache__/vine_config.cpython-310.pyc +0 -0
  35. vine_hf/__pycache__/vine_model.cpython-310.pyc +0 -0
  36. vine_hf/convert_inference.py +6 -2
  37. vine_hf/example_ensemble_weights.py +6 -2
  38. vine_hf/example_sam2_masks.py +6 -3
  39. vine_hf/example_usage.py +6 -2
  40. vine_hf/example_visualization.py +6 -2
  41. vine_hf/example_with_pretrained_vine.py +6 -2
  42. vine_hf/push_to_hub.py +6 -2
  43. vine_hf/push_to_video_fm.py +5 -4
  44. vine_hf/vine_model.py +7 -0
  45. vine_hf/vine_pipeline.py +7 -0
  46. vine_hf/vis_utils.py +9 -1
app.py CHANGED
@@ -7,6 +7,12 @@ import tempfile
7
  import os
8
  import sys
9
 
10
+ # Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
11
+ current_dir = Path(__file__).resolve().parent
12
+ src_dir = current_dir / "src"
13
+ if src_dir.is_dir() and str(src_dir) not in sys.path:
14
+     sys.path.insert(0, str(src_dir))
15
+
16
  import spaces # <-- ZeroGPU integration
17
  import gradio as gr
18
  import torch
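For reference, the block added above is the common vendored-`src/` bootstrap pattern. A minimal self-contained sketch of the same idea follows; note that `Path` comes from `pathlib`, whose import is not shown in this hunk and is assumed to exist elsewhere in app.py:

```python
# Sketch of the sys.path bootstrap added to app.py.
# Assumes the repository layout <repo>/app.py with vendored packages under <repo>/src/.
import sys
from pathlib import Path  # assumed: app.py imports Path outside this hunk

current_dir = Path(__file__).resolve().parent
src_dir = current_dir / "src"

# Prepend src/ so packages vendored there (e.g. LASER, video-sam2, GroundingDINO)
# can be imported without a separate pip install.
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
```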
src/vine_hf/OVERVIEW.md DELETED
@@ -1,218 +0,0 @@
1
- # VINE HuggingFace Interface - Complete Overview
2
-
3
- This directory contains a complete HuggingFace-compatible interface for the VINE (Video Understanding with Natural Language) model. The interface allows you to easily use, share, and deploy your VINE model through the HuggingFace ecosystem.
4
-
5
- ## 📁 Directory Structure
6
-
7
- ```
8
- vine_hf/
9
- ├── __init__.py # Package initialization and exports
10
- ├── vine_config.py # VineConfig class (PretrainedConfig)
11
- ├── vine_model.py # VineModel class (PreTrainedModel)
12
- ├── vine_pipeline.py # VinePipeline class (Pipeline)
13
- ├── example_usage.py # Comprehensive usage examples
14
- ├── convert_inference.py # Migration guide from inference.py
15
- ├── push_to_hub.py # Script to push model to HF Hub
16
- ├── setup.py # Package setup configuration
17
- ├── README.md # Detailed documentation
18
- └── OVERVIEW.md # This file
19
- ```
20
-
21
- ## 🏗️ Architecture Components
22
-
23
- ### 1. VineConfig (`vine_config.py`)
24
- - Inherits from `PretrainedConfig`
25
- - Configures model parameters, segmentation methods, and processing options
26
- - Compatible with HuggingFace configuration system
27
-
28
- ### 2. VineModel (`vine_model.py`)
29
- - Inherits from `PreTrainedModel`
30
- - Implements the core VINE model with three CLIP backbones
31
- - Supports categorical, unary, and binary predictions
32
- - Provides both `forward()` and `predict()` methods
33
-
34
- ### 3. VinePipeline (`vine_pipeline.py`)
35
- - Inherits from `Pipeline`
36
- - Handles end-to-end video processing workflow
37
- - Integrates segmentation (SAM2, Grounding DINO + SAM2)
38
- - Provides user-friendly interface for video understanding
39
-
40
- ## 🚀 Key Features
41
-
42
- ✅ **Full HuggingFace Compatibility**
43
- - Compatible with `transformers` library
44
- - Supports `AutoModel` and `pipeline` interfaces
45
- - Can be pushed to and loaded from HuggingFace Hub
46
-
47
- ✅ **Flexible Segmentation**
48
- - Support for SAM2 automatic segmentation
49
- - Support for Grounding DINO + SAM2 text-guided segmentation
50
- - Configurable thresholds and parameters
51
-
52
- ✅ **Multi-Modal Understanding**
53
- - Categorical classification (object types)
54
- - Unary predicates (single object actions)
55
- - Binary relations (object-object relationships)
56
-
57
- ✅ **Easy Integration**
58
- - Simple pipeline interface for end users
59
- - Direct model access for researchers
60
- - Comprehensive configuration options
61
-
62
- ## 📖 Usage Examples
63
-
64
- ### Quick Start with Pipeline
65
- ```python
66
- from transformers import pipeline
67
- from vine_hf import VineModel, VinePipeline
68
-
69
- # Create pipeline
70
- vine_pipeline = pipeline(
71
- "vine-video-understanding",
72
- model="your-username/vine-model",
73
- trust_remote_code=True
74
- )
75
-
76
- # Process video
77
- results = vine_pipeline(
78
- "video.mp4",
79
- categorical_keywords=['human', 'dog', 'frisbee'],
80
- unary_keywords=['running', 'jumping'],
81
- binary_keywords=['chasing', 'behind']
82
- )
83
- ```
84
-
85
- ### Direct Model Usage
86
- ```python
87
- from vine_hf import VineConfig, VineModel
88
-
89
- config = VineConfig(segmentation_method="grounding_dino_sam2")
90
- model = VineModel(config)
91
-
92
- results = model.predict(
93
- video_frames=video_tensor,
94
- masks=masks_dict,
95
- bboxes=bboxes_dict,
96
- categorical_keywords=['human', 'dog'],
97
- unary_keywords=['running', 'sitting'],
98
- binary_keywords=['chasing', 'near']
99
- )
100
- ```
101
-
102
- ## 🔧 Migration from Original Code
103
-
104
- The `convert_inference.py` script shows how to migrate from the original `inference.py` workflow:
105
-
106
- **Original Approach:**
107
- - Manual model loading and configuration
108
- - Direct handling of segmentation pipeline
109
- - Custom result processing
110
- - Complex setup requirements
111
-
112
- **New HuggingFace Interface:**
113
- - Standardized model configuration
114
- - Automatic preprocessing/postprocessing
115
- - Simple pipeline interface
116
- - Easy sharing via HuggingFace Hub
117
-
118
- ## 📤 Sharing Your Model
119
-
120
- Use the `push_to_hub.py` script to share your trained model:
121
-
122
- ```bash
123
- python vine_hf/push_to_hub.py \
124
- --weights path/to/your/model.pth \
125
- --repo your-username/vine-model \
126
- --login
127
- ```
128
-
129
- ## 🛠️ Installation & Setup
130
-
131
- 1. **Install Dependencies:**
132
- ```bash
133
- pip install transformers torch torchvision opencv-python pillow numpy
134
- ```
135
-
136
- 2. **Install Segmentation Models (Optional):**
137
- - SAM2: https://github.com/facebookresearch/sam2
138
- - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
139
-
140
- 3. **Install VINE HF Interface:**
141
- ```bash
142
- cd vine_hf
143
- pip install -e .
144
- ```
145
-
146
- ## 🎯 Configuration Options
147
-
148
- The `VineConfig` class supports extensive configuration:
149
-
150
- - **Model Settings:** CLIP backbone, hidden dimensions
151
- - **Segmentation:** Method, thresholds, target FPS
152
- - **Processing:** Alpha values, top-k results, video length limits
153
- - **Performance:** Multi-class mode, output format options
154
-
155
- ## 📊 Output Format
156
-
157
- The interface returns structured predictions:
158
-
159
- ```python
160
- {
161
- "categorical_predictions": {obj_id: [(prob, category), ...]},
162
- "unary_predictions": {(frame, obj): [(prob, action), ...]},
163
- "binary_predictions": {(frame, pair): [(prob, relation), ...]},
164
- "confidence_scores": {"categorical": float, "unary": float, "binary": float},
165
- "summary": {
166
- "num_objects_detected": int,
167
- "top_categories": [(category, prob), ...],
168
- "top_actions": [(action, prob), ...],
169
- "top_relations": [(relation, prob), ...]
170
- }
171
- }
172
- ```
173
-
174
- ## 🔍 Testing & Validation
175
-
176
- Run the example scripts to test your setup:
177
-
178
- ```bash
179
- # Test basic functionality
180
- python vine_hf/example_usage.py
181
-
182
- # Test migration from original code
183
- python vine_hf/convert_inference.py
184
- ```
185
-
186
- ## 🤝 Contributing
187
-
188
- To contribute or customize:
189
-
190
- 1. **Modify Configuration:** Edit `vine_config.py` for new parameters
191
- 2. **Extend Model:** Add functionality to `vine_model.py`
192
- 3. **Enhance Pipeline:** Improve preprocessing/postprocessing in `vine_pipeline.py`
193
- 4. **Add Features:** Create additional utility scripts
194
-
195
- ## 📝 Next Steps
196
-
197
- 1. **Load Your Weights:** Use your trained VINE model weights
198
- 2. **Test Segmentation:** Set up Grounding DINO and SAM2 models
199
- 3. **Validate Results:** Compare with original inference.py output
200
- 4. **Share Model:** Push to HuggingFace Hub for community use
201
- 5. **Deploy:** Use in applications, demos, or research projects
202
-
203
- ## 🐛 Troubleshooting
204
-
205
- **Common Issues:**
206
- - **Import Errors:** Check PYTHONPATH and package installation
207
- - **Segmentation Failures:** Verify Grounding DINO/SAM2 setup
208
- - **Weight Loading:** Adjust weight loading logic in `convert_inference.py`
209
- - **CUDA Issues:** Check GPU availability and PyTorch installation
210
-
211
- **Support:**
212
- - Check the README.md for detailed documentation
213
- - Review example_usage.py for working code examples
214
- - Examine convert_inference.py for migration guidance
215
-
216
- ---
217
-
218
- This HuggingFace interface makes VINE accessible to the broader ML community while maintaining all the powerful video understanding capabilities of the original model. The standardized interface enables easy sharing, deployment, and integration with existing HuggingFace workflows.
 
src/vine_hf/README.md DELETED
@@ -1,355 +0,0 @@
1
- # VINE HuggingFace Interface
2
-
3
- VINE (Video Understanding with Natural Language) is a model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.
4
-
5
- This package provides a HuggingFace-compatible interface for the VINE model, making it easy to use for video understanding tasks.
6
-
7
- ## Features
8
-
9
- - **Categorical Classification**: Classify objects in videos (e.g., "human", "dog", "frisbee")
10
- - **Unary Predicates**: Detect actions on single objects (e.g., "running", "jumping", "sitting")
11
- - **Binary Relations**: Detect relationships between object pairs (e.g., "behind", "in front of", "chasing")
12
- - **Multiple Segmentation Methods**: Support for SAM2 and Grounding DINO + SAM2
13
- - **HuggingFace Integration**: Full compatibility with HuggingFace transformers and pipelines
14
- - **Visualization Hooks**: Optional high-level visualizations plus lightweight debug mask dumps for quick sanity checks
15
-
16
- ## Installation
17
-
18
- ```bash
19
- # Install the package (assuming it's in your Python path)
20
- pip install transformers torch torchvision
21
- pip install opencv-python pillow numpy
22
-
23
- # For segmentation functionality, you'll also need:
24
- # - SAM2: https://github.com/facebookresearch/sam2
25
- # - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
26
- ```
27
-
28
- ## Segmentation Model Configuration
29
-
30
- `VinePipeline` lazily brings up the segmentation stack the first time a call needs masks. Thresholds, FPS, visualization toggles, and device selection live in `VineConfig`; the pipeline constructor tells it where to fetch SAM2 / GroundingDINO weights or lets you inject already-instantiated modules.
31
-
32
- ### Provide file paths at construction (most common)
33
-
34
- ```python
35
- from vine_hf import VineConfig, VineModel, VinePipeline
36
-
37
- vine_config = VineConfig(
38
- segmentation_method="grounding_dino_sam2", # or "sam2"
39
- box_threshold=0.35,
40
- text_threshold=0.25,
41
- target_fps=5,
42
- visualization_dir="output/visualizations", # where to write visualizations (and debug visualizations if enabled)
43
- debug_visualizations=True, # Write videos of the groundingDINO/SAM2/Binary/Unary, etc... outputs
44
- pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
45
- device="cuda:0", # accepts int, str, or torch.device
46
- )
47
-
48
- vine_model = VineModel(vine_config)
49
-
50
- vine_pipeline = VinePipeline(
51
- model=vine_model,
52
- tokenizer=None,
53
- sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
54
- sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
55
- gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
56
- gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
57
- device=vine_config._device,
58
- )
59
- ```
60
-
61
- When `segmentation_method="grounding_dino_sam2"`, both SAM2 and GroundingDINO must be reachable. The pipeline validates the paths; missing files raise a `ValueError`. If you pick `"sam2"`, only the SAM2 config and checkpoint are required.
62
-
63
- ### Reuse pre-initialized segmentation modules
64
-
65
- If you build the segmentation stack elsewhere, inject the components with `set_segmentation_models` before running the pipeline:
66
-
67
- ```python
68
- from sam2.build_sam import build_sam2_video_predictor, build_sam2
69
- from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
70
- from groundingdino.util.inference import Model as GroundingDINOModel
71
-
72
- sam_predictor = build_sam2_video_predictor(..., device=vine_config._device)
73
- mask_generator = SAM2AutomaticMaskGenerator(build_sam2(..., device=vine_config._device))
74
- grounding_model = GroundingDINOModel(..., device=vine_config._device)
75
-
76
- vine_pipeline.set_segmentation_models(
77
- sam_predictor=sam_predictor,
78
- mask_generator=mask_generator,
79
- grounding_model=grounding_model,
80
- )
81
- ```
82
-
83
- Any argument left as `None` is initialized lazily from the file paths when the pipeline first needs that backend.
84
-
85
- ## Quick Start
86
-
87
- ## Requirements
88
- - torch
89
- - torchvision
90
- - transformers
91
- - opencv-python
92
- - matplotlib
93
- - seaborn
94
- - pandas
95
- - numpy
96
- - ipywidgets
97
- - tqdm
98
- - scikit-learn
99
- - sam2 (from Facebook Research): https://github.com/video-fm/video-sam2
100
- - sam2 weights (downloaded separately, e.g. https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
101
- - groundingdino (from IDEA Research)
102
- - groundingdino weights (downloaded separately, e.g. https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth)
103
- - spacy-fastlang
104
- - en-core-web-sm (for spacy-fastlang)
105
- - ffmpeg (for video processing)
106
- - (optional) laser weights/full model checkpoint (downloaded separately, e.g. https://huggingface.co/video-fm/vine_v0)
107
-
108
- Usually, by running the laser/environments/laser_env.yml from the LASER repo, most dependencies will be installed. You will need to manually install sam2 and groundingdino as per their instructions.
109
-
110
- ### Using the Pipeline (Recommended)
111
- ```python
112
- from transformers.pipelines import PIPELINE_REGISTRY
113
- from vine_hf import VineConfig, VineModel, VinePipeline
114
-
115
- PIPELINE_REGISTRY.register_pipeline(
116
- "vine-video-understanding",
117
- pipeline_class=VinePipeline,
118
- pt_model=VineModel,
119
- type="multimodal",
120
- )
121
-
122
- config = VineConfig(
123
- segmentation_method="grounding_dino_sam2",
124
- pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
125
- visualization_dir="output",
126
- visualize=True,
127
- device="cuda:0",
128
- )
129
-
130
- model = VineModel(config)
131
-
132
- vine_pipeline = VinePipeline(
133
- model=model,
134
- tokenizer=None,
135
- sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
136
- sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
137
- gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
138
- gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
139
- device=config._device,
140
- )
141
-
142
- results = vine_pipeline(
143
- "/path/to/video.mp4",
144
- categorical_keywords=["dog", "human"],
145
- unary_keywords=["running"],
146
- binary_keywords=["chasing"],
147
- object_pairs=[(0, 1)],
148
- return_top_k=3,
149
- include_visualizations=True,
150
- )
151
- print(results["summary"])
152
- ```
153
-
154
- ### Using the Model Directly (Advanced)
155
-
156
- For advanced users who want to provide their own segmentation:
157
-
158
- ```python
159
- from vine_hf import VineConfig, VineModel
160
- import torch
161
-
162
- # Create configuration
163
- config = VineConfig(
164
- pretrained_vine_path="/path/to/your/vine/weights" # Optional: your fine-tuned weights
165
- )
166
-
167
- # Initialize model
168
- model = VineModel(config)
169
-
170
- # If you have your own video frames, masks, and bboxes from external segmentation
171
- video_frames = torch.randn(3, 224, 224, 3) * 255 # Your video frames
172
- masks = {0: {1: torch.ones(224, 224, 1)}} # Your segmentation masks
173
- bboxes = {0: {1: [50, 50, 150, 150]}} # Your bounding boxes
174
-
175
- # Run prediction
176
- results = model.predict(
177
- video_frames=video_frames,
178
- masks=masks,
179
- bboxes=bboxes,
180
- categorical_keywords=['human', 'dog', 'frisbee'],
181
- unary_keywords=['running', 'jumping'],
182
- binary_keywords=['chasing', 'following'],
183
- object_pairs=[(1, 2)],
184
- return_top_k=3
185
- )
186
- ```
187
-
188
- **Note**: For most users, the pipeline approach above is recommended as it handles video loading and segmentation automatically.
189
-
190
- ## Configuration Options
191
-
192
- The `VineConfig` class supports the following parameters (non-exhaustive):
193
-
194
- - `model_name`: CLIP model backbone (default: `"openai/clip-vit-large-patch14-336"`)
195
- - `pretrained_vine_path`: Optional path or Hugging Face repo with pretrained VINE weights
196
- - `segmentation_method`: `"sam2"` or `"grounding_dino_sam2"` (default: `"grounding_dino_sam2"`)
197
- - `box_threshold` / `text_threshold`: Grounding DINO thresholds
198
- - `target_fps`: Target FPS for video processing (default: `1`)
199
- - `alpha`, `white_alpha`: Rendering parameters used when extracting masked crops
200
- - `topk_cate`: Top-k categories to return per object (default: `3`)
201
- - `max_video_length`: Maximum frames to process (default: `100`)
202
- - `visualize`: When `True`, pipeline post-processing attempts to create stitched visualizations
203
- - `visualization_dir`: Optional base directory where visualization assets are written
204
- - `debug_visualizations`: When `True`, the model saves a single first-frame mask composite for quick inspection
205
- - `debug_visualization_path`: Target filepath for the debug mask composite (must point to a writable file)
206
- - `return_flattened_segments`, `return_valid_pairs`, `interested_object_pairs`: Advanced geometry outputs for downstream consumers
207
-
208
- ## Output Format
209
-
210
- The model returns a dictionary with the following structure:
211
-
212
- ```python
213
- {
214
- "masks" : {},
215
-
216
- "boxes" : {},
217
-
218
- "categorical_predictions": {
219
- object_id: [(probability, category), ...]
220
- },
221
- "unary_predictions": {
222
- (frame_id, object_id): [(probability, action), ...]
223
- },
224
- "binary_predictions": {
225
- (frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
226
- },
227
- "confidence_scores": {
228
- "categorical": max_categorical_confidence,
229
- "unary": max_unary_confidence,
230
- "binary": max_binary_confidence
231
- },
232
- "summary": {
233
- "num_objects_detected": int,
234
- "top_categories": [(category, probability), ...],
235
- "top_actions": [(action, probability), ...],
236
- "top_relations": [(relation, probability), ...]
237
- }
238
- }
239
- ```
240
-
241
- ## Visualization & Debugging
242
-
243
- There are two complementary visualization layers:
244
-
245
- - **Post-process visualizations** (`include_visualizations=True` in the pipeline call) produces a high-level stitched video summarizing detections, actions, and relations over time.
246
-
247
- - **Debug visualizations** (`debug_visualizations=True` in `VineConfig`) dumps videos of intermediate segmentation masks and outputs from GroundingDINO, SAM2, Unary, Binary, etc. for quick sanity checks.
248
-
249
- If you plan to enable either option, ensure the relevant output directories exist before running the pipeline.
250
-
251
- ## Segmentation Methods
252
-
253
- ### Grounding DINO + SAM2 (Recommended)
254
-
255
- Uses Grounding DINO for object detection based on text prompts, then SAM2 for precise segmentation.
256
-
257
- Requirements:
258
- - Grounding DINO model and weights
259
- - SAM2 model and weights
260
- - Properly configured paths to model checkpoints
261
-
262
- ### SAM2 Only
263
-
264
- Uses SAM2's automatic mask generation without text-based object detection.
265
-
266
- Requirements:
267
- - SAM2 model and weights
268
-
269
- ## Model Architecture
270
-
271
- VINE is built on top of CLIP and uses three separate CLIP models for different tasks:
272
- - **Categorical Model**: For object classification
273
- - **Unary Model**: For single-object action recognition
274
- - **Binary Model**: For relationship detection between object pairs
275
-
276
- Each model processes both visual and textual features to compute similarity scores and probability distributions.
277
-
278
- ## Pushing to HuggingFace Hub
279
-
280
- ```python
281
- from vine_hf import VineConfig, VineModel
282
-
283
- # Create and configure your model
284
- config = VineConfig()
285
- model = VineModel(config)
286
-
287
- # Load your pretrained weights
288
- # model.load_state_dict(torch.load('path/to/your/weights.pth'))
289
-
290
- # Register for auto classes
291
- config.register_for_auto_class()
292
- model.register_for_auto_class("AutoModel")
293
-
294
- # Push to Hub
295
- config.push_to_hub('your-username/vine-model')
296
- model.push_to_hub('your-username/vine-model')
297
- ```
298
-
299
- ## Loading from HuggingFace Hub
300
-
301
- ```python
302
- from transformers import AutoModel, pipeline
303
-
304
- # Load model
305
- model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
306
-
307
- # Or use with pipeline
308
- vine_pipeline = pipeline(
309
- 'vine-video-understanding',
310
- model='your-username/vine-model',
311
- trust_remote_code=True
312
- )
313
- ```
314
-
315
- ## Examples
316
-
317
- See `example_usage.py` for comprehensive examples including:
318
- - Direct model usage
319
- - Pipeline usage
320
- - HuggingFace Hub integration
321
- - Real video processing
322
-
323
- ## Requirements
324
-
325
- - Python 3.7+
326
- - PyTorch 1.9+
327
- - transformers 4.20+
328
- - OpenCV
329
- - PIL/Pillow
330
- - NumPy
331
-
332
- For segmentation:
333
- - SAM2 (Facebook Research)
334
- - Grounding DINO (IDEA Research)
335
-
336
- ## Citation
337
-
338
- If you use VINE in your research, please cite:
339
-
340
- ```bibtex
341
- @article{vine2024,
342
- title={VINE: Video Understanding with Natural Language},
343
- author={Your Authors},
344
- journal={Your Journal},
345
- year={2024}
346
- }
347
- ```
348
-
349
- ## License
350
-
351
- [Your License Here]
352
-
353
- ## Contact
354
-
355
- [Your Contact Information Here]
 
src/vine_hf/__init__.py DELETED
@@ -1,23 +0,0 @@
1
- """
2
- VINE HuggingFace Interface
3
-
4
- VINE (Video Understanding with Natural Language) is a model that processes videos
5
- along with categorical, unary, and binary keywords to return probability
6
- distributions over those keywords for detected objects and their relationships.
7
-
8
- This package provides a HuggingFace-compatible interface for the VINE model,
9
- including configuration, model, and pipeline classes.
10
- """
11
-
12
- from .vine_config import VineConfig
13
- from .vine_model import VineModel
14
- from .vine_pipeline import VinePipeline
15
-
16
- __version__ = "1.0.0"
17
- __author__ = "LASER Team"
18
-
19
- __all__ = [
20
- "VineConfig",
21
- "VineModel",
22
- "VinePipeline"
23
- ]
 
src/vine_hf/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (780 Bytes)
 
src/vine_hf/__pycache__/flattening.cpython-310.pyc DELETED
Binary file (3.31 kB)
 
src/vine_hf/__pycache__/vine_config.cpython-310.pyc DELETED
Binary file (4.41 kB)
 
src/vine_hf/__pycache__/vine_model.cpython-310.pyc DELETED
Binary file (16.3 kB)
 
src/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc DELETED
Binary file (18.1 kB)
 
src/vine_hf/__pycache__/vis_utils.cpython-310.pyc DELETED
Binary file (25.1 kB)
 
src/vine_hf/convert_inference.py DELETED
@@ -1,288 +0,0 @@
1
- """
2
- Script to convert existing inference.py workflow to use VINE HuggingFace interface
3
-
4
- This script demonstrates how to migrate from the original inference.py approach
5
- to the new HuggingFace-compatible interface.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- import numpy as np
12
- from typing import Dict, List, Tuple, Any
13
-
14
- # Add paths for imports
15
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
-
17
- from vine_hf import VineConfig, VineModel, VinePipeline
18
- from laser.loading import load_video
19
-
20
-
21
- def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
22
- """
23
- Load a pretrained VINE model from the original format into HuggingFace format.
24
-
25
- Args:
26
- model_dir: Directory containing the model
27
- model_name: Name of the model file (without .{epoch}.model extension)
28
- epoch: Epoch number to load
29
-
30
- Returns:
31
- VineModel instance with loaded weights
32
- """
33
- print(f"Loading pretrained VINE model from {model_dir}")
34
-
35
- # Create configuration (adjust parameters as needed)
36
- # We expect local ensemble weights in `model_dir`, so configure
37
- # VineConfig to load from local directory/filename.
38
- model_file = f"{model_name}.{epoch}.model"
39
- config = VineConfig(
40
- model_name="openai/clip-vit-base-patch32",
41
- segmentation_method="grounding_dino_sam2",
42
- target_fps=1,
43
- box_threshold=0.35,
44
- text_threshold=0.25,
45
- use_hf_repo=False,
46
- local_dir=model_dir,
47
- local_filename=model_file,
48
- )
49
-
50
- # Initialize model (VineModel will consult the config when loading)
51
- vine_model = VineModel(config)
52
-
53
- # Load original weights
54
- model_file = f"{model_name}.{epoch}.model"
55
- model_path = os.path.join(model_dir, model_file)
56
-
57
- if os.path.exists(model_path):
58
- print(f"Loading weights from: {model_path}")
59
- try:
60
- # Add safe globals for PyTorch 2.6+
61
- import torch.serialization
62
- from laser.models.llava_clip_model_v3 import PredicateModel
63
- torch.serialization.add_safe_globals([PredicateModel])
64
-
65
- # Load the original model
66
- original_model = torch.load(model_path, map_location='cpu', weights_only=False)
67
-
68
- # Transfer weights to HuggingFace model
69
- # This assumes the original model has the same structure
70
- # You may need to adjust this based on your specific model structure
71
-
72
- if hasattr(original_model, 'clip_cate_model'):
73
- vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
74
- if hasattr(original_model, 'clip_unary_model'):
75
- vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
76
- if hasattr(original_model, 'clip_binary_model'):
77
- vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
78
- if hasattr(original_model, 'clip_tokenizer'):
79
- vine_model.clip_tokenizer = original_model.clip_tokenizer
80
- if hasattr(original_model, 'clip_processor'):
81
- vine_model.clip_processor = original_model.clip_processor
82
-
83
- print("✓ Weights transferred successfully")
84
-
85
- except Exception as e:
86
- print(f"✗ Error loading weights: {e}")
87
- print("You may need to adjust the weight loading logic for your specific model")
88
-
89
- else:
90
- print(f"✗ Model file not found: {model_path}")
91
-
92
- return vine_model
93
-
94
-
95
- def convert_inference_workflow():
96
- """
97
- Convert the original inference.py workflow to use HuggingFace interface.
98
-
99
- This function demonstrates how to replicate the original inference workflow
100
- using the new HuggingFace-compatible components.
101
- """
102
- print("=== Converting Inference Workflow ===")
103
-
104
- # Original parameters from inference.py
105
- video_id = 'v1'
106
- target_fps = 1
107
- classes = ['human', 'dog', 'frisbee']
108
- unary_keywords = ['running', 'jumping', 'sitting', 'standing']
109
- binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
110
-
111
- # Paths (adjust these to match your setup)
112
- demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
113
- video_dir = os.path.join(demo_dir, "videos")
114
- video_path = os.path.join(video_dir, f"{video_id}.mp4")
115
-
116
- # Model paths (adjust these to match your setup)
117
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
118
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
119
- model_name = "ensemble-2025-02-10-14-57-22"
120
-
121
- # Segmentation model paths (adjust these to your actual paths)
122
- sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
123
- sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
124
- gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
125
- gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
126
-
127
- print(f"Video path: {video_path}")
128
- print(f"Model dir: {model_dir}")
129
- print(f"SAM2 config: {sam_config_path}")
130
- print(f"GroundingDINO config: {gd_config_path}")
131
-
132
- # Check if video exists
133
- if not os.path.exists(video_path):
134
- print(f"✗ Video not found: {video_path}")
135
- print("Please adjust the video path or use your own video file")
136
- return
137
-
138
- # 1. Load video (same as original)
139
- print(f"Loading video: {video_id}")
140
- video_tensor = load_video(video_path, target_fps=target_fps)
141
- print(f"Video shape: {video_tensor.shape}")
142
-
143
- # 2. Load VINE model with HuggingFace interface
144
- print("Loading VINE model...")
145
- if os.path.exists(model_dir):
146
- vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
147
- else:
148
- print(f"Model directory not found: {model_dir}")
149
- print("Creating new model with random weights for demonstration")
150
- config = VineConfig()
151
- vine_model = VineModel(config)
152
-
153
- # 3. Create pipeline for easier use
154
- print("Creating VINE pipeline...")
155
- from transformers.pipelines import PIPELINE_REGISTRY
156
-
157
- # Register pipeline if not already registered
158
- try:
159
- PIPELINE_REGISTRY.register_pipeline(
160
- "vine-video-understanding",
161
- pipeline_class=VinePipeline,
162
- pt_model=VineModel,
163
- type="multimodal",
164
- )
165
- except Exception:
166
- pass # Already registered
167
-
168
- # Create pipeline instance with segmentation model paths
169
- vine_pipeline = VinePipeline(
170
- model=vine_model,
171
- tokenizer=None,
172
- # SAM2 configuration
173
- sam_config_path=sam_config_path,
174
- sam_checkpoint_path=sam_checkpoint_path,
175
- # GroundingDINO configuration
176
- gd_config_path=gd_config_path,
177
- gd_checkpoint_path=gd_checkpoint_path
178
- )
179
-
180
- # 4. Process video with new interface
181
- print("Processing video with VINE HuggingFace interface...")
182
-
183
- try:
184
- # Use the pipeline to process the video
185
- results = vine_pipeline(
186
- video_path,
187
- categorical_keywords=classes,
188
- unary_keywords=unary_keywords,
189
- binary_keywords=binary_keywords,
190
- object_pairs=[(1, 2), (2, 3)], # Example object pairs
191
- segmentation_method='grounding_dino_sam2',
192
- target_fps=target_fps,
193
- return_top_k=3,
194
- include_visualizations=False
195
- )
196
-
197
- # 5. Display results (similar to original format)
198
- print("\n=== VINE Results (HuggingFace Interface) ===")
199
-
200
- # Categorical predictions
201
- print("\nCategorical Predictions:")
202
- for obj_id, predictions in results['categorical_predictions'].items():
203
- print(f" Object {obj_id}:")
204
- for prob, category in predictions:
205
- print(f" {prob:.3f}: {category}")
206
-
207
- # Unary predictions
208
- print("\nUnary Predictions:")
209
- for (frame_id, obj_id), predictions in results['unary_predictions'].items():
210
- print(f" Frame {frame_id}, Object {obj_id}:")
211
- for prob, action in predictions:
212
- print(f" {prob:.3f}: {action}")
213
-
214
- # Binary predictions
215
- print("\nBinary Predictions:")
216
- for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
217
- print(f" Frame {frame_id}, Objects {obj_pair}:")
218
- for prob, relation in predictions:
219
- print(f" {prob:.3f}: {relation}")
220
-
221
- # Summary
222
- print(f"\nSummary:")
223
- print(f" Objects detected: {results['summary']['num_objects_detected']}")
224
- print(f" Top categories: {results['summary']['top_categories']}")
225
- print(f" Top actions: {results['summary']['top_actions']}")
226
- print(f" Top relations: {results['summary']['top_relations']}")
227
-
228
- print("\n✓ Successfully processed video with VINE HuggingFace interface!")
229
-
230
- except Exception as e:
231
- print(f"✗ Error processing video: {e}")
232
- print("This may be due to missing segmentation models or other dependencies")
233
- print("The interface is set up correctly, but full functionality requires:")
234
- print(" 1. Properly installed Grounding DINO and SAM2")
235
- print(" 2. Correct model weights")
236
- print(" 3. Proper configuration paths")
237
-
238
-
239
- def compare_interfaces():
240
- """
241
- Compare the original inference.py approach with the new HuggingFace interface.
242
- """
243
- print("\n=== Interface Comparison ===")
244
-
245
- print("\nOriginal inference.py approach:")
246
- print("✓ Direct access to model internals")
247
- print("✓ Full control over segmentation pipeline")
248
- print("✗ Complex setup and configuration")
249
- print("✗ Not compatible with HuggingFace ecosystem")
250
- print("✗ Requires manual handling of all components")
251
-
252
- print("\nNew HuggingFace interface:")
253
- print("✓ Easy to use pipeline interface")
254
- print("✓ Compatible with HuggingFace Hub")
255
- print("✓ Standardized configuration")
256
- print("✓ Automatic handling of preprocessing/postprocessing")
257
- print("✓ Easy sharing and distribution")
258
- print("✓ Configurable segmentation model paths")
259
- print("✗ Slightly less direct control (can still access model directly)")
260
-
261
- print("\nMigration benefits:")
262
- print("• Share your model easily on HuggingFace Hub")
263
- print("• Users can load your model with a single line")
264
- print("• Standardized interface for video understanding")
265
- print("• Better integration with other HuggingFace tools")
266
- print("• Simplified deployment and inference")
267
- print("• Flexible segmentation model configuration")
268
-
269
-
270
- if __name__ == "__main__":
271
- print("VINE HuggingFace Interface Conversion")
272
- print("=" * 50)
273
-
274
- # Run conversion demonstration
275
- convert_inference_workflow()
276
-
277
- # Show comparison
278
- compare_interfaces()
279
-
280
- print("\n" + "=" * 50)
281
- print("Next steps:")
282
- print("1. Install SAM2 and GroundingDINO dependencies")
283
- print("2. Download the required model checkpoints")
284
- print("3. Update the paths in this script to point to your models")
285
- print("4. Test the interface with your specific model weights")
286
- print("5. Adjust configuration parameters as needed")
287
- print("6. Push your model to HuggingFace Hub using push_to_hub.py")
288
- print("7. Share with the community!")
 
src/vine_hf/example_ensemble_weights.py DELETED
@@ -1,333 +0,0 @@
1
- """
2
- Example demonstrating how to load and use VINE ensemble weights
3
-
4
- This script shows the correct way to load your pretrained VINE ensemble weights
5
- and use them with the HuggingFace interface, based on the actual inference.py workflow.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- import numpy as np
12
- from transformers.pipelines import PIPELINE_REGISTRY
13
-
14
- #os.environ["OPENAI_API_KEY"]="dummy-key" # Set your OpenAI API key here or via environment variable
15
-
16
- # Add the parent directory to the path to import vine_hf
17
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
-
19
- from vine_hf import VineConfig, VineModel, VinePipeline
20
- from laser.loading import load_video
21
-
22
-
23
- def example_load_ensemble_weights():
24
- """Example of loading ensemble weights correctly."""
25
- print("=== Loading Ensemble VINE Weights ===")
26
-
27
- # Path to your ensemble model (adjust this to your actual path)
28
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
29
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
30
-
31
- print(f"Looking for ensemble weights in: {model_dir}")
32
-
33
- if os.path.exists(model_dir):
34
- print("✓ Model directory found")
35
-
36
- # List available model files
37
- model_files = [f for f in os.listdir(model_dir) if f.endswith('.model')]
38
- print(f"Available model files: {model_files}")
39
-
40
- if model_files:
41
- # Create configuration with ensemble path (local directory with .model files)
42
- config = VineConfig(
43
- segmentation_method="grounding_dino_sam2",
44
- use_hf_repo=False,
45
- local_dir=model_dir,
46
- local_filename=None,
47
- )
48
-
49
- print("Creating VINE model with ensemble weights...")
50
- vine_model = VineModel(config)
51
-
52
- print("✓ VINE model created with ensemble weights!")
53
- return vine_model
54
- else:
55
- print("✗ No .model files found in directory")
56
- return None
57
- else:
58
- print(f"✗ Model directory not found: {model_dir}")
59
- print("Please adjust the path to point to your ensemble weights")
60
- return None
61
-
62
-
63
- def example_direct_ensemble_loading():
64
- """Example of loading ensemble weights using from_pretrained_vine."""
65
- print("\n=== Direct Ensemble Loading ===")
66
-
67
- # Path to specific ensemble file
68
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
69
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
70
-
71
- if os.path.exists(model_dir):
72
- try:
73
- # Use the class method for direct loading
74
- vine_model = VineModel.from_pretrained_vine(
75
- model_path=model_dir,
76
- epoch=0 # Load epoch 0
77
- )
78
-
79
- print("✓ Model loaded using from_pretrained_vine!")
80
- return vine_model
81
-
82
- except Exception as e:
83
- print(f"✗ Error loading with from_pretrained_vine: {e}")
84
- return None
85
- else:
86
- print(f"✗ Model directory not found: {model_dir}")
87
- return None
88
-
89
-
90
- def example_compare_original_vs_hf():
91
- """Compare the original inference.py approach with HuggingFace interface."""
92
- print("\n=== Comparing Original vs HuggingFace Interface ===")
93
-
94
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
95
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
96
- model_name = "ensemble-2025-02-10-14-57-22"
97
- epoch = 0
98
-
99
- if not os.path.exists(model_dir):
100
- print(f"Model directory not found: {model_dir}")
101
- return
102
-
103
- print("Original approach (from inference.py):")
104
- print("```python")
105
- print("def load_model(model_dir, model_name, epoch, device):")
106
- print(" model_name = model_name + f'.{epoch}.model'")
107
- print(" predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
108
- print(" return predicate_model")
109
- print("")
110
- print("predicate_model = load_model(model_dir, model_name, epoch, device)")
111
- print("```")
112
-
113
- print("\nNew HuggingFace approach:")
114
- print("```python")
115
- print("config = VineConfig(pretrained_vine_path=model_dir)")
116
- print("vine_model = VineModel(config)")
117
- print("# or")
118
- print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
119
- print("```")
120
-
121
- # Try to load with both approaches if possible
122
- try:
123
- # Original approach
124
- def load_model(model_dir, model_name, epoch, device):
125
- model_name = model_name + f'.{epoch}.model'
126
- model_path = os.path.join(model_dir, model_name)
127
- if os.path.exists(model_path):
128
- return torch.load(model_path, map_location=device, weights_only=False)
129
- else:
130
- print(f"Model file not found: {model_path}")
131
- return None
132
-
133
- device = "cuda" if torch.cuda.is_available() else "cpu"
134
- original_model = load_model(model_dir, model_name, epoch, device)
135
-
136
- if original_model:
137
- print(f"✓ Original model loaded: {type(original_model)}")
138
- print(f" Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}")
139
- print(f" Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}")
140
- print(f" Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}")
141
-
142
- # HuggingFace approach
143
- vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)
144
-
145
- if vine_model:
146
- print(f"✓ HuggingFace model loaded: {type(vine_model)}")
147
- print(f" Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}")
148
- print(f" Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}")
149
- print(f" Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}")
150
-
151
- print("\n✓ Both approaches work! HuggingFace interface successfully loads ensemble weights.")
152
-
153
- except Exception as e:
154
- print(f"Error in comparison: {e}")
155
-
156
-
157
- def example_ensemble_with_pipeline():
158
- """Example using ensemble weights with the pipeline."""
159
- print("\n=== Using Ensemble Weights with Pipeline ===")
160
-
161
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
162
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
163
-
164
- if not os.path.exists(model_dir):
165
- print(f"Model directory not found: {model_dir}")
166
- return
167
-
168
- # Register pipeline
169
- PIPELINE_REGISTRY.register_pipeline(
170
- "vine-video-understanding",
171
- pipeline_class=VinePipeline,
172
- pt_model=VineModel,
173
- type="multimodal",
174
- )
175
-
176
- # Create model with ensemble weights (local directory)
177
- config = VineConfig(
178
- segmentation_method="grounding_dino_sam2",
179
- use_hf_repo=False,
180
- local_dir=model_dir,
181
- local_filename=None,
182
- )
183
-
184
- vine_model = VineModel(config)
185
- # Create pipeline with segmentation model paths
186
- vine_pipeline = VinePipeline(
187
- model=vine_model,
188
- tokenizer=None,
189
- # SAM2 configuration
190
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
191
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
192
- # GroundingDINO configuration
193
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
194
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
195
- device="cuda" if torch.cuda.is_available() else "cpu",
196
- )
197
-
198
- print("✓ Pipeline created with ensemble VINE weights")
199
-
200
- # Check for demo video
201
- demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
202
-
203
- if os.path.exists(demo_video):
204
- print(f"Found demo video: {demo_video}")
205
-
206
- # Use the same keywords as in the original inference.py
207
- categorical_keywords = ['human', 'dog', 'frisbee']
208
- unary_keywords = ['running', 'jumping', 'catching', 'throwing']
209
- binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
210
-
211
- print("Example pipeline usage:")
212
- print("```python")
213
- print("results = vine_pipeline(")
214
- print(f" '{demo_video}',")
215
- print(f" categorical_keywords={categorical_keywords},")
216
- print(f" unary_keywords={unary_keywords},")
217
- print(f" binary_keywords={binary_keywords},")
218
- print(" segmentation_method='grounding_dino_sam2'")
219
- print(")")
220
- print("```")
221
-
222
- # Uncomment to actually run (requires segmentation models)
223
- # try:
224
- # results = vine_pipeline(
225
- # demo_video,
226
- # categorical_keywords=categorical_keywords,
227
- # unary_keywords=unary_keywords,
228
- # binary_keywords=binary_keywords,
229
- # segmentation_method='grounding_dino_sam2'
230
- # )
231
- # print("Results:", results['summary'])
232
- # except Exception as e:
233
- # print(f"Pipeline execution failed: {e}")
234
- # print("This is expected if segmentation models are not set up")
235
-
236
- return vine_pipeline
237
-
238
-
239
-
240
- def demonstrate_weight_transfer():
241
- """Demonstrate how weights are transferred from ensemble to HuggingFace format."""
242
- print("\n=== Weight Transfer Demonstration ===")
243
-
244
- print("The ensemble model structure (PredicateModel):")
245
- print("- clip_cate_model: CLIP model for categorical classification")
246
- print("- clip_unary_model: CLIP model for unary predicates")
247
- print("- clip_binary_model: CLIP model for binary relations")
248
- print("- clip_tokenizer: Tokenizer for text processing")
249
- print("- clip_processor: Processor for image processing")
250
-
251
- print("\nWeight transfer process:")
252
- print("1. Load ensemble model with torch.load()")
253
- print("2. Initialize base CLIP models in HuggingFace format")
254
- print("3. Transfer state_dict from ensemble to HuggingFace models:")
255
- print(" - ensemble.clip_cate_model → hf.clip_cate_model")
256
- print(" - ensemble.clip_unary_model → hf.clip_unary_model")
257
- print(" - ensemble.clip_binary_model → hf.clip_binary_model")
258
- print("4. Transfer tokenizer and processor")
259
-
260
- print("\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!")
261
-
262
-
263
- def troubleshooting_guide():
264
- """Provide troubleshooting guide for common issues."""
265
- print("\n=== Troubleshooting Guide ===")
266
-
267
- print("Common Issues:")
268
- print("1. 'No model file found for epoch X'")
269
- print(" → Check that .model files exist in the directory")
270
- print(" → Verify the epoch number is correct")
271
- print(" → List files: ls /path/to/model/dir/*.model")
272
-
273
- print("\n2. 'Error loading VINE weights'")
274
- print(" → Check file permissions")
275
- print(" → Verify the model file is not corrupted")
276
- print(" → Try loading with torch.load() directly first")
277
-
278
- print("\n3. 'CLIP model mismatch'")
279
- print(" → Ensure config.model_name matches the base model used in training")
280
-
281
- print("\n4. 'Device mismatch errors'")
282
- print(" → Models are loaded to CPU first, then moved to device")
283
- print(" → Check CUDA availability with torch.cuda.is_available()")
284
-
285
- print("\nDebugging steps:")
286
- print("1. Test loading ensemble model directly:")
287
- print(" model = torch.load('path/to/model.0.model', map_location='cpu')")
288
- print("2. Check model attributes:")
289
- print(" print(dir(model))")
290
- print("3. Verify state_dict keys:")
291
- print(" print(model.clip_cate_model.state_dict().keys())")
292
-
293
-
294
- if __name__ == "__main__":
295
- print("VINE Ensemble Weights Loading Examples")
296
- print("=" * 50)
297
-
298
- # Test ensemble weight loading
299
- try:
300
- model1 = example_load_ensemble_weights()
301
- except Exception as e:
302
- print(f"Ensemble loading example failed: {e}")
303
-
304
- try:
305
- model2 = example_direct_ensemble_loading()
306
- except Exception as e:
307
- print(f"Direct loading example failed: {e}")
308
-
309
- # Compare approaches
310
- try:
311
- example_compare_original_vs_hf()
312
- except Exception as e:
313
- print(f"Comparison example failed: {e}")
314
-
315
- # Test pipeline with ensemble weights
316
- try:
317
- pipeline = example_ensemble_with_pipeline()
318
- except Exception as e:
319
- print(f"Pipeline example failed: {e}")
320
-
321
- # Educational content
322
- demonstrate_weight_transfer()
323
- troubleshooting_guide()
324
-
325
- print("\n" + "=" * 50)
326
- print("Key Points:")
327
- print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
328
- print("2. Use torch.load() to load the ensemble, then transfer weights")
329
- print("3. The HuggingFace interface preserves your fine-tuned weights")
330
- print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
331
- print("5. Use VineModel.from_pretrained_vine() for direct loading")
332
-
333
-
 
src/vine_hf/example_sam2_masks.py DELETED
@@ -1,331 +0,0 @@
1
- """
2
- Example demonstrating SAM2 mask generation in VINE HuggingFace interface
3
-
4
- This script shows how to use both SAM2-only and Grounding DINO + SAM2
5
- segmentation methods with the VINE model.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- import numpy as np
12
- from transformers.pipelines import PIPELINE_REGISTRY
13
-
14
- # Add the parent directory to the path to import vine_hf
15
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
- # Add the parent directory to the path to import vine_hf
17
-
18
- # Either uncomment the line below or set the OPENAI_API_KEY environment variable; it isn't required to run this example.
19
- #os.environ['OPENAI_API_KEY'] = 'dummy-key'
20
-
21
- from vine_hf import VineConfig, VineModel, VinePipeline
22
- from laser.loading import load_video
23
-
24
-
25
- def example_sam2_only_segmentation():
26
- """Example using SAM2 automatic mask generation only."""
27
- print("=== SAM2-Only Segmentation Example ===")
28
-
29
- # Create configuration for SAM2-only
30
- config = VineConfig(
31
- use_hf_repo=True,
32
- model_repo="video-fm/vine_v0",
33
- segmentation_method="sam2", # Use SAM2 only
34
- target_fps=1,
35
- debug_visualizations=True,
36
- )
37
-
38
- # Register pipeline
39
- PIPELINE_REGISTRY.register_pipeline(
40
- "vine-video-understanding",
41
- pipeline_class=VinePipeline,
42
- pt_model=VineModel,
43
- type="multimodal",
44
- )
45
-
46
- # Create model and pipeline with SAM2 paths
47
- vine_model = VineModel(config)
48
- vine_pipeline = VinePipeline(
49
- model=vine_model,
50
- tokenizer=None,
51
- sam_config_path="path/to/your/sam2/sam_config.yaml",
52
- sam_checkpoint_path="path/to/your/sam2/sam_checkpoint.pth",
53
- gd_config_path="path/to/your/groundingdino/config.py",
54
- gd_checkpoint_path="path/to/your/groundingdino/checkpoint.pth",
55
- )
56
-
57
- # Check for demo video
58
- demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
59
-
60
- if os.path.exists(demo_video):
61
- print(f"Processing video: {demo_video}")
62
-
63
- # Define keywords (SAM2 will find all objects, then classify them)
64
- categorical_keywords = ['human', 'dog', 'frisbee', 'object', 'person', 'animal']
65
- unary_keywords = ['running', 'jumping', 'sitting', 'standing', 'moving', 'static']
66
- binary_keywords = ['behind', 'in front of', 'next to', 'chasing', 'following']
67
- object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]
68
-
69
-
70
- print("Using SAM2 automatic mask generation...")
71
- print("This will find all objects in the video automatically")
72
-
73
- try:
74
- # Process with SAM2 only
75
- results = vine_pipeline(
76
- demo_video,
77
- categorical_keywords=categorical_keywords,
78
- unary_keywords=unary_keywords,
79
- binary_keywords=binary_keywords,
80
- object_pairs=object_pairs,
81
- segmentation_method="sam2",
82
- return_top_k=3,
83
- debug_visualizations=True,
84
- debug_visualization_path=os.path.join(os.getcwd(), "sam2_debug_masks.png"),
85
- )
86
-
87
- print("\n✓ SAM2 segmentation completed!")
88
- print("Results summary:")
89
- print(f" Objects detected: {results['summary']['num_objects_detected']}")
90
- print(f" Top categories: {results['summary']['top_categories']}")
91
- print(f" Top actions: {results['summary']['top_actions']}")
92
-
93
- return results
94
-
95
- except Exception as e:
96
- print(f"SAM2 segmentation failed: {e}")
97
- print("Make sure SAM2 models are properly installed")
98
- return None
99
- else:
100
- print(f"Demo video not found: {demo_video}")
101
- return None
102
-
103
- def example_grounding_dino_sam2_segmentation():
104
- """Example using Grounding DINO + SAM2 text-guided segmentation."""
105
- print("\n=== Grounding DINO + SAM2 Segmentation Example ===")
106
-
107
- # Create configuration for Grounding DINO + SAM2
108
- config = VineConfig(
109
- use_hf_repo=True,
110
- model_repo="video-fm/vine_v0",
111
- segmentation_method="grounding_dino_sam2", # Use text-guided segmentation
112
- box_threshold=0.35,
113
- text_threshold=0.25,
114
- target_fps=1,
115
- debug_visualizations=True,
116
- )
117
-
118
- # Create model and pipeline with both SAM2 and GroundingDINO paths
119
- vine_model = VineModel(config)
120
- vine_pipeline = VinePipeline(
121
- model=vine_model,
122
- tokenizer=None,
123
- # SAM2 configuration
124
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
125
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
126
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
127
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
128
- device=0,
129
- )
130
-
131
- # Check for demo video
132
- demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
133
-
134
- if os.path.exists(demo_video):
135
- print(f"Processing video: {demo_video}")
136
-
137
- # Define keywords (Grounding DINO will look specifically for these)
138
- categorical_keywords = ['human', 'dog', 'frisbee'] # Specific objects to find
139
- unary_keywords = ['running', 'jumping', 'catching', 'throwing']
140
- binary_keywords = ['behind', 'chasing', 'next to', 'throwing to']
141
- object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]
142
- print("Using Grounding DINO + SAM2 text-guided segmentation...")
143
- print(f"Looking specifically for: {categorical_keywords}")
144
-
145
- try:
146
- # Process with Grounding DINO + SAM2
147
- results = vine_pipeline(
148
- demo_video,
149
- categorical_keywords=categorical_keywords,
150
- unary_keywords=unary_keywords,
151
- binary_keywords=binary_keywords,
152
- object_pairs=object_pairs,
153
- segmentation_method="grounding_dino_sam2",
154
- box_threshold=0.35,
155
- text_threshold=0.25,
156
- return_top_k=3,
157
- debug_visualizations=True,
158
- )
159
-
160
- print("\n✓ Grounding DINO + SAM2 segmentation completed!")
161
- print("Results summary:")
162
- print(f" Objects detected: {results['summary']['num_objects_detected']}")
163
- print(f" Top categories: {results['summary']['top_categories']}")
164
- print(f" Top actions: {results['summary']['top_actions']}")
165
- print(f" Top relations: {results['summary']['top_relations']}")
166
-
167
- return results
168
-
169
- except Exception as e:
170
- print(f"Grounding DINO + SAM2 segmentation failed: {e}")
171
- print("Make sure both Grounding DINO and SAM2 models are properly installed")
172
- return None
173
- else:
174
- print(f"Demo video not found: {demo_video}")
175
- return None
176
-
177
-
178
- def compare_segmentation_methods():
179
- """Compare SAM2-only vs Grounding DINO + SAM2 approaches."""
180
- print("\n=== Comparing Segmentation Methods ===")
181
-
182
- print("\nSAM2-Only Approach:")
183
- print("✓ Finds all objects automatically")
184
- print("✓ No need to specify what to look for")
185
- print("✓ Good for exploratory analysis")
186
- print("✗ May find too many irrelevant objects")
187
- print("✗ Less precise for specific object types")
188
-
189
- print("\nGrounding DINO + SAM2 Approach:")
190
- print("✓ Finds specific objects based on text prompts")
191
- print("✓ More precise and targeted")
192
- print("✓ Better for known object categories")
193
- print("✓ Integrates object detection with segmentation")
194
- print("✗ Limited to specified categories")
195
- print("✗ Requires knowing what objects to look for")
196
-
197
-
198
- def demonstrate_mask_processing():
199
- """Demonstrate how masks are processed internally."""
200
- print("\n=== Mask Processing Demonstration ===")
201
-
202
- # Load a video to show the processing pipeline
203
- demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
204
-
205
- if os.path.exists(demo_video):
206
- print("Loading video for mask processing demo...")
207
-
208
- # Load video tensor
209
- video_tensor = np.asarray(load_video(demo_video, target_fps=1))
210
- print(f"Video shape: {video_tensor.shape}")
211
-
212
- # Create pipeline with segmentation model paths
213
- config = VineConfig(segmentation_method="sam2")
214
- vine_model = VineModel(config)
215
- vine_pipeline = VinePipeline(
216
- model=vine_model,
217
- tokenizer=None,
218
- # SAM2 configuration
219
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
220
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
221
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
222
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
223
- )
224
-
225
- try:
226
- # Process just the first few frames to show the pipeline
227
- print("\nProcessing first 2 frames with SAM2...")
228
-
229
- # Manually call the preprocessing to show the steps
230
- processed_data = vine_pipeline.preprocess(
231
- video_tensor[:2], # Just first 2 frames
232
- segmentation_method="sam2",
233
- categorical_keywords=['object']
234
- )
235
-
236
- print("Mask processing results:")
237
- print(f" Number of frames processed: {processed_data['num_frames']}")
238
- print(f" Frames with masks: {list(processed_data['masks'].keys())}")
239
-
240
- # Show mask details
241
- for frame_id, frame_masks in processed_data['masks'].items():
242
- print(f" Frame {frame_id}: {len(frame_masks)} objects detected")
243
- for obj_id, mask in frame_masks.items():
244
- print(f" Object {obj_id}: mask shape {mask.shape}")
245
-
246
- print("\nBounding box extraction:")
247
- for frame_id, frame_bboxes in processed_data['bboxes'].items():
248
- print(f" Frame {frame_id}: {len(frame_bboxes)} bounding boxes")
249
- for obj_id, bbox in frame_bboxes.items():
250
- print(f" Object {obj_id}: bbox {bbox}")
251
-
252
- except Exception as e:
253
- print(f"Mask processing failed: {e}")
254
- print("This is expected if SAM2 models are not properly set up")
255
- else:
256
- print(f"Demo video not found: {demo_video}")
257
-
258
-
259
- def test_mask_formats():
260
- """Test different mask input formats."""
261
- print("\n=== Testing Mask Formats ===")
262
-
263
- # Create dummy data to test mask processing
264
- height, width = 224, 224
265
-
266
- # Test different mask formats
267
- print("Testing mask format conversions...")
268
-
269
- # Format 1: NumPy boolean array
270
- mask_np = np.random.rand(height, width) > 0.5
271
- print(f"NumPy mask: {mask_np.shape}, dtype: {mask_np.dtype}")
272
-
273
- # Format 2: PyTorch tensor
274
- mask_torch = torch.from_numpy(mask_np)
275
- print(f"PyTorch mask: {mask_torch.shape}, dtype: {mask_torch.dtype}")
276
-
277
- # Format 3: 3D mask with singleton dimension
278
- mask_3d = mask_torch.unsqueeze(-1)
279
- print(f"3D mask: {mask_3d.shape}")
280
-
281
- # Test bounding box extraction
282
- from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
283
-
284
- try:
285
- bbox = mask_to_bbox(mask_torch)
286
- print(f"Extracted bbox: {bbox}")
287
- print("✓ Mask format testing successful")
288
- except Exception as e:
289
- print(f"Mask format testing failed: {e}")
290
-
291
-
292
- if __name__ == "__main__":
293
- print("VINE SAM2 Mask Generation Examples")
294
- print("=" * 50)
295
-
296
- # Test SAM2-only approach
297
- try:
298
- sam2_results = example_sam2_only_segmentation()
299
- except Exception as e:
300
- print(f"SAM2-only example failed: {e}")
301
-
302
- # Test Grounding DINO + SAM2 approach
303
- try:
304
- gd_sam2_results = example_grounding_dino_sam2_segmentation()
305
- except Exception as e:
306
- print(f"Grounding DINO + SAM2 example failed: {e}")
307
-
308
- # Compare approaches
309
- compare_segmentation_methods()
310
-
311
- # Demonstrate mask processing
312
- try:
313
- demonstrate_mask_processing()
314
- except Exception as e:
315
- print(f"Mask processing demo failed: {e}")
316
-
317
- # Test mask formats
318
- try:
319
- test_mask_formats()
320
- except Exception as e:
321
- print(f"Mask format testing failed: {e}")
322
-
323
- print("\n" + "=" * 50)
324
- print("Examples completed!")
325
- print("\nKey takeaways:")
326
- print("1. SAM2-only: Automatic object detection and segmentation")
327
- print("2. Grounding DINO + SAM2: Text-guided object detection and segmentation")
328
- print("3. Both methods provide masks and bounding boxes for VINE model")
329
- print("4. Choose method based on whether you know what objects to look for")
330
-
331
src/vine_hf/example_usage.ipynb DELETED
@@ -1,310 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "44d53281",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stderr",
11
- "output_type": "stream",
12
- "text": [
13
- "/home/kevinx/miniconda3/envs/laser_env/lib/python3.10/site-packages/pydantic/_internal/_config.py:383: UserWarning: Valid config keys have changed in V2:\n",
14
- "* 'schema_extra' has been renamed to 'json_schema_extra'\n",
15
- " warnings.warn(message, UserWarning)\n",
16
- "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
17
- "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
18
- ]
19
- }
20
- ],
21
- "source": [
22
- "import os\n",
23
- "import sys\n",
24
- "import torch\n",
25
- "from transformers import pipeline, AutoModel\n",
26
- "from transformers.pipelines import PIPELINE_REGISTRY\n",
27
- "\n",
28
- "# Uncomment or set your own\n",
29
- "#os.environ['OPENAI_API_KEY'] = 'dummy-key'\n",
30
- "from vine_hf import VineConfig, VineModel, VinePipeline"
31
- ]
32
- },
33
- {
34
- "cell_type": "code",
35
- "execution_count": 2,
36
- "id": "174e479f",
37
- "metadata": {},
38
- "outputs": [],
39
- "source": [
40
- "PIPELINE_REGISTRY.register_pipeline(\n",
41
- " \"vine-video-understanding\",\n",
42
- " pipeline_class=VinePipeline,\n",
43
- " pt_model=VineModel,\n",
44
- " type=\"multimodal\",\n",
45
- ")"
46
- ]
47
- },
48
- {
49
- "cell_type": "code",
50
- "execution_count": null,
51
- "id": "a9af2770",
52
- "metadata": {},
53
- "outputs": [],
54
- "source": [
55
- "vine_config = VineConfig(\n",
56
- " model_name=\"openai/clip-vit-base-patch32\",\n",
57
- " # Local file example: set use_hf_repo=False and provide local_dir/local_filename\n",
58
- " use_hf_repo=False,\n",
59
- " local_dir=os.path.dirname('/path/to/your/pretrained/model.pt'),\n",
60
- " local_filename=os.path.basename('/path/to/your/pretrained/model.pt'), # Local file path\n",
61
- " segmentation_method=\"grounding_dino_sam2\",\n",
62
- " visualize=True,\n",
63
- " visualization_dir=\"path/to/visualization/dir\",\n",
64
- " debug_visualizations=True,\n",
65
- " device=0, # Change to your desired device\n",
66
- ")"
67
- ]
68
- },
69
- {
70
- "cell_type": "code",
71
- "execution_count": null,
72
- "id": "274e6515",
73
- "metadata": {},
74
- "outputs": [
75
- {
76
- "name": "stdout",
77
- "output_type": "stream",
78
- "text": [
79
- "Loaded state type: <class 'collections.OrderedDict'>\n"
80
- ]
81
- }
82
- ],
83
- "source": [
84
- "vine_pipeline = VinePipeline(\n",
85
- " model=VineModel(vine_config), \n",
86
- " tokenizer=None,\n",
87
- " sam_config_path=\"path/to/sam2/configs/sam2_hiera_base_plus.yaml\",\n",
88
- " sam_checkpoint_path=\"path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt\",\n",
89
- " gd_config_path=\"path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py\",\n",
90
- " gd_checkpoint_path=\"path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth\",\n",
91
- ")"
92
- ]
93
- },
94
- {
95
- "cell_type": "code",
96
- "execution_count": 6,
97
- "id": "123a090d",
98
- "metadata": {},
99
- "outputs": [],
100
- "source": [
101
- "categorical_keywords = ['human', 'dog', 'frisbee']\n",
102
- "unary_keywords = ['running', 'jumping', 'catching', 'throwing']\n",
103
- "binary_keywords = ['behind', 'in front of', 'next to', 'chasing']\n",
104
- "object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, dog-frisbee relationships "
105
- ]
106
- },
107
- {
108
- "cell_type": "code",
109
- "execution_count": 7,
110
- "id": "0b42f032",
111
- "metadata": {},
112
- "outputs": [],
113
- "source": [
114
- "demo_video_path = \"/home/kevinx/LASER/LASER/demo/videos/v1.mp4\" # Replace with your video file path"
115
- ]
116
- },
117
- {
118
- "cell_type": "code",
119
- "execution_count": 8,
120
- "id": "8202c654",
121
- "metadata": {},
122
- "outputs": [
123
- {
124
- "name": "stdout",
125
- "output_type": "stream",
126
- "text": [
127
- "Segmentation method: grounding_dino_sam2\n",
128
- "Generating Grounding DINO + SAM2 masks...\n",
129
- "<class 'int'>\n",
130
- "✓ SAM2 models initialized successfully\n",
131
- "<class 'int'>\n"
132
- ]
133
- },
134
- {
135
- "name": "stderr",
136
- "output_type": "stream",
137
- "text": [
138
- "UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4314.)\n"
139
- ]
140
- },
141
- {
142
- "name": "stdout",
143
- "output_type": "stream",
144
- "text": [
145
- "final text_encoder_type: bert-base-uncased\n",
146
- "✓ GroundingDINO model initialized successfully\n",
147
- "Start detecting objects at time 05:08:58.178592\n"
148
- ]
149
- },
150
- {
151
- "name": "stderr",
152
- "output_type": "stream",
153
- "text": [
154
- "Detecting objects: 0%| | 0/3 [00:00<?, ?it/s]FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
155
- "UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
156
- "UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
157
- "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
158
- "Detecting objects: 100%|██████████| 3/3 [00:01<00:00, 2.82it/s]\n"
159
- ]
160
- },
161
- {
162
- "name": "stdout",
163
- "output_type": "stream",
164
- "text": [
165
- "Finished detecting objects at time 05:08:59.250419\n",
166
- "Loading inference state at time 05:08:59.544425\n",
167
- "Number of frames: 3\n",
168
- "None\n"
169
- ]
170
- },
171
- {
172
- "name": "stderr",
173
- "output_type": "stream",
174
- "text": [
175
- "Processing frames: 100%|██████████| 3/3 [00:00<00:00, 11.77it/s]\n"
176
- ]
177
- },
178
- {
179
- "name": "stdout",
180
- "output_type": "stream",
181
- "text": [
182
- "Annotated frames: []\n",
183
- "Find the most dense prompt at time 05:09:01.413703\n",
184
- "Most dense frame: 0\n",
185
- "\n",
186
- "\n",
187
- "Start propagating objects at time 05:09:01.416367\n",
188
- "Pass count: 0\n"
189
- ]
190
- },
191
- {
192
- "name": "stderr",
193
- "output_type": "stream",
194
- "text": [
195
- "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 20.20it/s]\n",
196
- "propagate in video: 0it [00:00, ?it/s]\n"
197
- ]
198
- },
199
- {
200
- "name": "stdout",
201
- "output_type": "stream",
202
- "text": [
203
- "Most dense frame: 1\n",
204
- "\n",
205
- "\n",
206
- "Pass count: 1\n"
207
- ]
208
- },
209
- {
210
- "name": "stderr",
211
- "output_type": "stream",
212
- "text": [
213
- "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]\n",
214
- "propagate in video: 0it [00:00, ?it/s]\n"
215
- ]
216
- },
217
- {
218
- "name": "stdout",
219
- "output_type": "stream",
220
- "text": [
221
- "Most dense frame: 2\n",
222
- "\n",
223
- "\n",
224
- "Pass count: 2\n"
225
- ]
226
- },
227
- {
228
- "name": "stderr",
229
- "output_type": "stream",
230
- "text": [
231
- "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 25.92it/s]\n",
232
- "propagate in video: 0it [00:00, ?it/s]\n"
233
- ]
234
- },
235
- {
236
- "name": "stdout",
237
- "output_type": "stream",
238
- "text": [
239
- "Most dense frame: -1\n",
240
- "\n",
241
- "\n",
242
- "\n",
243
- "Results:\n",
244
- "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
245
- ]
246
- }
247
- ],
248
- "source": [
249
- "try:\n",
250
- " results = vine_pipeline(\n",
251
- " demo_video_path,\n",
252
- " categorical_keywords=categorical_keywords,\n",
253
- " unary_keywords=unary_keywords,\n",
254
- " binary_keywords=binary_keywords,\n",
255
- " object_pairs=object_pairs,\n",
256
- " segmentation_method='grounding_dino_sam2',\n",
257
- " return_top_k=3,\n",
258
- " include_visualizations=False,\n",
259
- " debug_visualizations=False,\n",
260
- " )\n",
261
- " \n",
262
- " print(\"\\nResults:\")\n",
263
- " print(f\"Summary: {results['summary']}\")\n",
264
- " \n",
265
- "except Exception as e:\n",
266
- " print(f\"Note: Full execution requires segmentation models to be properly set up.\")\n",
267
- " print(f\"Error: {e}\")"
268
- ]
269
- },
270
- {
271
- "cell_type": "code",
272
- "execution_count": 9,
273
- "id": "414ede9b",
274
- "metadata": {},
275
- "outputs": [
276
- {
277
- "name": "stdout",
278
- "output_type": "stream",
279
- "text": [
280
- "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
281
- ]
282
- }
283
- ],
284
- "source": [
285
- "print(f\"Summary: {results['summary']}\")"
286
- ]
287
- }
288
- ],
289
- "metadata": {
290
- "kernelspec": {
291
- "display_name": "laser_env",
292
- "language": "python",
293
- "name": "python3"
294
- },
295
- "language_info": {
296
- "codemirror_mode": {
297
- "name": "ipython",
298
- "version": 3
299
- },
300
- "file_extension": ".py",
301
- "mimetype": "text/x-python",
302
- "name": "python",
303
- "nbconvert_exporter": "python",
304
- "pygments_lexer": "ipython3",
305
- "version": "3.10.0"
306
- }
307
- },
308
- "nbformat": 4,
309
- "nbformat_minor": 5
310
- }
src/vine_hf/example_usage.py DELETED
@@ -1,283 +0,0 @@
1
- """
2
- Example usage of VINE HuggingFace interface
3
-
4
- This script demonstrates how to use the VINE model through the HuggingFace interface
5
- for video understanding with categorical, unary, and binary keyword predictions.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- from transformers import pipeline, AutoModel
12
- from transformers.pipelines import PIPELINE_REGISTRY
13
-
14
- # Add the parent directory to the path to import vine_hf
15
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
-
17
- # Uncomment or set your own
18
- #os.environ['OPENAI_API_KEY'] = 'dummy-key'
19
- from vine_hf import VineConfig, VineModel, VinePipeline
20
-
21
- def example_direct_model_usage():
22
- """Example of using the VINE model directly."""
23
- print("=== Direct Model Usage ===")
24
-
25
- # Create configuration
26
- config = VineConfig(
27
- model_name="openai/clip-vit-base-patch32",
28
- segmentation_method="grounding_dino_sam2",
29
- use_hf_repo=True,
30
- model_repo="video-fm/vine_v0", # Your HF Hub model
31
- debug_visualizations=True,
32
- debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
33
- target_fps=30,
34
- box_threshold=0.35,
35
- text_threshold=0.25
36
- )
37
-
38
- # Initialize model
39
- model = VineModel(config)
40
-
41
- print(f"Model initialized with CLIP backbone: {config.model_name}")
42
- print(f"Segmentation method: {config.segmentation_method}")
43
- print(f"Device: {model.device}")
44
-
45
- # Example video data (placeholder - in real usage, load from video file)
46
- num_frames, height, width = 3, 224, 224
47
- video_frames = torch.randn(num_frames, height, width, 3) * 255
48
- video_frames = video_frames.clamp(0, 255).byte()
49
-
50
- # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
51
- masks = {
52
- 0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
53
- 1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
54
- 2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
55
- }
56
-
57
- bboxes = {
58
- 0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
59
- 1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
60
- 2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
61
- }
62
-
63
- # Define keywords
64
- categorical_keywords = ["human", "dog", "frisbee"]
65
- unary_keywords = ["running", "jumping", "sitting", "standing"]
66
- binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
67
- object_pairs = [(1, 2)] # Object 1 relates to Object 2
68
-
69
- # Run prediction
70
- print("\nRunning prediction...")
71
- results = model.predict(
72
- video_frames=video_frames,
73
- masks=masks,
74
- bboxes=bboxes,
75
- categorical_keywords=categorical_keywords,
76
- unary_keywords=unary_keywords,
77
- binary_keywords=binary_keywords,
78
- object_pairs=object_pairs,
79
- return_top_k=3
80
- )
81
-
82
- print("\nResults:")
83
- print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
84
- print(f"Unary predictions: {len(results['unary_predictions'])} actions")
85
- print(f"Binary predictions: {len(results['binary_predictions'])} relations")
86
- print(f"Confidence scores: {results['confidence_scores']}")
87
-
88
-
89
- def example_pipeline_usage():
90
- """Example of using the VINE pipeline."""
91
- print("\n=== Pipeline Usage ===")
92
-
93
- # Register the pipeline
94
- PIPELINE_REGISTRY.register_pipeline(
95
- "vine-video-understanding",
96
- pipeline_class=VinePipeline,
97
- pt_model=VineModel,
98
- type="multimodal",
99
- )
100
- vine_config = VineConfig(
101
- model_name="openai/clip-vit-base-patch32",
102
- use_hf_repo=True,
103
- model_repo="video-fm/vine_v0", # Your HF Hub model
104
- segmentation_method="grounding_dino_sam2",
105
- debug_visualizations=True,
106
- )
107
-
108
- vine_pipe = VinePipeline(
109
- model=VineModel(vine_config),
110
- tokenizer=None,
111
- trust_remote_code=True,
112
- # SAM2 configuration
113
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
114
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
115
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
116
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
117
- device=0,
118
- )
119
-
120
-
121
- print("Pipeline created successfully!")
122
-
123
- # Example usage with video path
124
- video_path = "path/to/your/video.mp4" # Replace with actual video path
125
-
126
- # For demonstration, we'll show the expected usage format
127
- print(f"\nExample pipeline call (replace with actual video path):")
128
- print(f"results = vine_pipeline(")
129
- print(f" '{video_path}',")
130
- print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
131
- print(f" unary_keywords=['running', 'jumping', 'sitting'],")
132
- print(f" binary_keywords=['behind', 'in front of', 'next to'],")
133
- print(f" object_pairs=[(1, 2)],")
134
- print(f" segmentation_method='grounding_dino_sam2',")
135
- print(f" return_top_k=3,")
136
- print(f" return_flattened_segments=True,")
137
- print(f" return_valid_pairs=True,")
138
- print(f" include_visualizations=True,")
139
- print(f" debug_visualizations=True")
140
- print(f")")
141
-
142
- # Note: Actual execution would require proper video file and segmentation models
143
-
144
-
145
- def example_huggingface_hub_usage():
146
- """Example of how to push and load from HuggingFace Hub."""
147
- print("\n=== HuggingFace Hub Usage ===")
148
-
149
- # Example of preparing model for Hub
150
- config = VineConfig()
151
- model = VineModel(config)
152
-
153
- # Register for auto classes
154
- config.register_for_auto_class()
155
- model.register_for_auto_class("AutoModel")
156
-
157
- print("Model registered for auto classes")
158
-
159
- # Example push to hub (commented out - requires actual model weights and credentials)
160
- # config.push_to_hub('your-username/vine-model')
161
- # model.push_to_hub('your-username/vine-model')
162
-
163
- # Example load from hub (commented out - requires actual model on hub)
164
- # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
165
- # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)
166
-
167
- print("To push to Hub:")
168
- print("1. config.push_to_hub('your-username/vine-model')")
169
- print("2. model.push_to_hub('your-username/vine-model')")
170
- print("\nTo load from Hub:")
171
- print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
172
- print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")
173
-
174
-
175
- def example_with_real_video():
176
- """Example showing how to use with a real video file."""
177
- print("\n=== Real Video Usage Example ===")
178
-
179
- # Check if demo video exists
180
- demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
181
-
182
- if os.path.exists(demo_video_path):
183
- print(f"Found demo video: {demo_video_path}")
184
-
185
- # Create pipeline with segmentation model paths
186
- PIPELINE_REGISTRY.register_pipeline(
187
- "vine-video-understanding",
188
- pipeline_class=VinePipeline,
189
- pt_model=VineModel,
190
- type="multimodal",
191
- )
192
-
193
- vine_config = VineConfig(
194
- model_name="openai/clip-vit-base-patch32",
195
- use_hf_repo=True,
196
- model_repo="video-fm/vine_v0", # Your HF Hub model
197
- segmentation_method="grounding_dino_sam2",
198
- debug_visualizations=True,
199
- debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
200
- )
201
-
202
- vine_pipeline = VinePipeline(
203
- model=VineModel(vine_config),
204
- tokenizer=None,
205
- trust_remote_code=True,
206
- # SAM2 configuration
207
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
208
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
209
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
210
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
211
- )
212
-
213
- # Define keywords based on the demo
214
- categorical_keywords = ['human', 'dog', 'frisbee']
215
- unary_keywords = ['running', 'jumping', 'catching', 'throwing']
216
- binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
217
- object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, dog-frisbee relationships
218
-
219
- print("\nProcessing video with VINE...")
220
- print("Keywords:")
221
- print(f" Categorical: {categorical_keywords}")
222
- print(f" Unary: {unary_keywords}")
223
- print(f" Binary: {binary_keywords}")
224
- print(f" Object pairs: {object_pairs}")
225
-
226
- # Note: This would require proper segmentation models to be set up
227
- try:
228
- results = vine_pipeline(
229
- demo_video_path,
230
- categorical_keywords=categorical_keywords,
231
- unary_keywords=unary_keywords,
232
- binary_keywords=binary_keywords,
233
- object_pairs=object_pairs,
234
- segmentation_method='grounding_dino_sam2',
235
- return_top_k=3,
236
- include_visualizations=False,
237
- debug_visualizations=True,
238
- )
239
-
240
- print("\nResults:")
241
- print(f"Summary: {results['summary']}")
242
-
243
- except Exception as e:
244
- print(f"Note: Full execution requires segmentation models to be properly set up.")
245
- print(f"Error: {e}")
246
-
247
- else:
248
- print(f"Demo video not found at: {demo_video_path}")
249
- print("To use with a real video, provide the path to your video file.")
250
-
251
-
252
- if __name__ == "__main__":
253
- print("VINE HuggingFace Interface Examples")
254
- print("=" * 50)
255
-
256
- # Run examples
257
- try:
258
- example_direct_model_usage()
259
- except Exception as e:
260
- print(f"Direct model usage failed: {e}")
261
-
262
- try:
263
- example_pipeline_usage()
264
- except Exception as e:
265
- print(f"Pipeline usage failed: {e}")
266
-
267
- try:
268
- example_huggingface_hub_usage()
269
- except Exception as e:
270
- print(f"Hub usage example failed: {e}")
271
-
272
- try:
273
- example_with_real_video()
274
- except Exception as e:
275
- print(f"Real video example failed: {e}")
276
-
277
- print("\n" + "=" * 50)
278
- print("Examples completed!")
279
- print("\nNext steps:")
280
- print("1. Set up Grounding DINO and SAM2 models for segmentation")
281
- print("2. Load your pretrained VINE model weights")
282
- print("3. Test with your own videos")
283
- print("4. Push to HuggingFace Hub for sharing")
src/vine_hf/example_visualization.py DELETED
@@ -1,146 +0,0 @@
1
- # Example visualization runner for VINE
2
- # - Loads a video (path, demo, or random)
3
- # - Runs the VINE pipeline
4
- # - Saves annotated frames and an MP4 if available
5
-
6
- import os
7
- import sys
8
- import argparse
9
- import cv2
10
- import numpy as np
11
- from collections.abc import Mapping, Sequence
12
-
13
- from transformers.pipelines import PIPELINE_REGISTRY
14
- from transformers import pipeline
15
-
16
- # Set your OpenAI API key here or via environment variable
17
- os.environ['OPENAI_API_KEY'] = "dummy-key"
18
-
19
- # Local imports (workspace)
20
- sys.path.append(os.path.dirname(__file__))
21
-
22
- from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
23
- from vine_hf.vine_model import VineModel
24
- from vine_hf.vine_config import VineConfig
25
- from laser.loading import load_video
26
-
27
-
28
- def build_pipeline(args) -> VinePipeline:
29
- # Register pipeline type
30
- PIPELINE_REGISTRY.register_pipeline(
31
- "vine-video-understanding",
32
- pipeline_class=VinePipeline,
33
- pt_model=VineModel,
34
- type="multimodal",
35
- )
36
-
37
- config = VineConfig(
38
- segmentation_method="grounding_dino_sam2",
39
- model_name="openai/clip-vit-base-patch32",
40
- # Example: load from HF repo
41
- use_hf_repo=True,
42
- model_repo="video-fm/vine_v0",
43
- # Alternatively use a local path by setting use_hf_repo=False and local_dir/local_filename
44
- box_threshold=args.box_threshold,
45
- text_threshold=args.text_threshold,
46
- target_fps=args.fps,
47
- topk_cate=args.topk_cate,
48
- visualization_dir=args.out_dir,
49
- visualize=True,
50
- debug_visualizations=True,
51
- device=args.device,
52
- )
53
-
54
- model = VineModel(config)
55
-
56
- # Create pipeline instance with segmentation model paths (if provided)
57
- vine_pipe = VinePipeline(
58
- model=model,
59
- tokenizer=None,
60
- sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
61
- sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
62
- gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
63
- gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
64
- device=args.device,
65
- trust_remote_code=True,
66
- )
67
- return vine_pipe
68
-
69
-
70
- def resolve_video(args) -> np.ndarray | str:
71
- # Priority: user --video -> demo video -> random frames
72
- if args.video and os.path.exists(args.video):
73
- return args.video
74
-
75
- demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
76
- demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
77
- if os.path.exists(demo_video):
78
- return demo_video
79
- if os.path.exists(demo_alt):
80
- return demo_alt
81
-
82
- # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
83
- print("No video found; using random frames.")
84
- rng = np.random.default_rng(0)
85
- frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
86
- return frames
87
-
88
-
89
-
90
- def main():
91
- parser = argparse.ArgumentParser(description="VINE visualization example")
92
- parser.add_argument("--video", type=str, default=None, help="Path to a video file")
93
- parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
94
- parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
95
- parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
96
- parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
97
- parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
98
- parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
99
- parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
100
- parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
101
-
102
-
103
- args = parser.parse_args()
104
-
105
- vine_pipe = build_pipeline(args)
106
- video = resolve_video(args)
107
-
108
- # Keywords similar to examples/tests
109
- categorical_keywords = ["dog", "frisbee", "cat"]
110
- unary_keywords = ["running", "jumping", "sitting", "flying"]
111
- binary_keywords = ["behind", "next to", "chasing","biting"]
112
- object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3)]
113
-
114
- print("Running VINE pipeline...")
115
- call_kwargs = dict(
116
- categorical_keywords=categorical_keywords,
117
- unary_keywords=unary_keywords,
118
- binary_keywords=binary_keywords,
119
- object_pairs=object_pairs,
120
- segmentation_method=args.method,
121
- return_top_k=args.topk_cate,
122
- include_visualizations=True,
123
- debug_visualizations=args.debug_visualizations,
124
- )
125
-
126
-
127
- results = vine_pipe(
128
- video,
129
- **call_kwargs,
130
- )
131
-
132
- # Normalize pipeline output to a dict (can be dict or list[dict])
133
- if isinstance(results, Mapping):
134
- result = results
135
- elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
136
- result = results[0]
137
- else:
138
- result = {}
139
-
140
- # Print brief summary
141
- summary = result.get("summary", {}) if isinstance(result, dict) else {}
142
- print("Summary:", summary)
143
-
144
-
145
- if __name__ == "__main__":
146
- main()
src/vine_hf/example_with_pretrained_vine.py DELETED
@@ -1,287 +0,0 @@
1
- """
2
- Example usage of VINE HuggingFace interface with pretrained VINE weights
3
-
4
- This script demonstrates how to use the VINE model with your pretrained weights
5
- from the ensemble format or from video-fm/vine_v0.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- from transformers import pipeline
12
- from transformers.pipelines import PIPELINE_REGISTRY
13
-
14
- # Set your OpenAI API key here or via environment variable
15
- #os.environ['OPENAI_API_KEY'] = "dummy-key"
16
-
17
- # Add the parent directory to the path to import vine_hf
18
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
-
20
- from vine_hf import VineConfig, VineModel, VinePipeline
21
-
22
-
23
- def example_with_local_pretrained_weights():
24
- print("=== Using Local Pretrained VINE Weights ===")
25
-
26
-
27
- # Download https://huggingface.co/video-fm/vine_v0/tree/main/laser_model_v1.pt
28
- pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt" # Replace with your local path
29
-
30
-
31
- # Create configuration with your pretrained path (local file)
32
- config = VineConfig(
33
- model_name="openai/clip-vit-base-patch32",
34
- segmentation_method="grounding_dino_sam2",
35
- target_fps=1,
36
- visualize=True,
37
- visualization_dir="path/to/visualization/dir",
38
- debug_visualizations=True,
39
- use_hf_repo=False,
40
- local_dir=os.path.dirname(pretrained_vine_file),
41
- local_filename=os.path.basename(pretrained_vine_file),
42
- )
43
-
44
- # Method 1: Initialize model directly
45
- print("Method 1: Direct model initialization")
46
- vine_model = VineModel(config)
47
- print(f"✓ Model initialized with pretrained weights from: {pretrained_vine_file}")
48
-
49
- # Method 2: Use the from_pretrained_vine class method
50
- print("\nMethod 2: Using from_pretrained_vine class method")
51
- vine_model_2 = VineModel.from_pretrained_vine(
52
- model_path=pretrained_vine_file,
53
- config=config,
54
- epoch=0 # Specify epoch number
55
- )
56
- print("✓ Model loaded using from_pretrained_vine method")
57
-
58
- return vine_model
59
-
60
-
61
- def example_with_huggingface_hub():
62
- """Example using VINE weights from HuggingFace Hub."""
63
- print("\n=== Using HuggingFace Hub Weights ===")
64
-
65
- # Create configuration to use HuggingFace Hub weights
66
- config = VineConfig(
67
- model_name="openai/clip-vit-base-patch32",
68
- use_hf_repo=True,
69
- model_repo="video-fm/vine_v0", # Your HF Hub model
70
- segmentation_method="grounding_dino_sam2",
71
- visualize=True,
72
- visualization_dir="path/to/visualization/dir",
73
- debug_visualizations=True,
74
- )
75
-
76
- try:
77
- # Initialize model (will try to load from HF Hub)
78
- vine_model = VineModel(config)
79
- print("✓ Model loaded from HuggingFace Hub: video-fm/vine_v0")
80
- return vine_model
81
- except Exception as e:
82
- print(f"✗ Could not load from HuggingFace Hub: {e}")
83
- print("Make sure your model is pushed to video-fm/vine_v0")
84
- return None
85
-
86
-
87
- def example_pipeline_with_pretrained():
88
- """Example using pipeline with pretrained VINE weights."""
89
- print("\n=== Pipeline with Pretrained VINE ===")
90
-
91
- # Register the pipeline
92
- PIPELINE_REGISTRY.register_pipeline(
93
- "vine-video-understanding",
94
- pipeline_class=VinePipeline,
95
- pt_model=VineModel,
96
- type="multimodal",
97
- )
98
-
99
- # Create configuration with your weights
100
- pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt" # Replace with your local path
101
- config = VineConfig(
102
- model_name="openai/clip-vit-base-patch32",
103
- segmentation_method="grounding_dino_sam2",
104
- visualize=True,
105
- visualization_dir="path/to/visualization/dir",
106
- debug_visualizations=True,
107
- use_hf_repo=False,
108
- local_dir=os.path.dirname(pretrained_vine_file),
109
- local_filename=os.path.basename(pretrained_vine_file),
110
- )
111
-
112
- # Create model with pretrained weights
113
- vine_model = VineModel(config)
114
-
115
- # Create pipeline with segmentation model paths
116
- vine_pipeline = VinePipeline(
117
- model=vine_model,
118
- tokenizer=None,
119
- sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
120
- sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
121
- gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
122
- gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
123
- device=0
124
- )
125
-
126
- print("✓ Pipeline created with pretrained VINE weights")
127
-
128
- # Example usage (would require actual video file)
129
- demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
130
-
131
- if os.path.exists(demo_video):
132
- print(f"Found demo video: {demo_video}")
133
- print("Example pipeline call:")
134
- print(f"results = vine_pipeline(")
135
- print(f" '{demo_video}',")
136
- print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
137
- print(f" unary_keywords=['running', 'jumping', 'sitting'],")
138
- print(f" binary_keywords=['behind', 'chasing', 'next to']")
139
- print(f" debug_visualizations=True")
140
- print(f")")
141
-
142
- # Uncomment to actually run (requires segmentation models)
143
- # results = vine_pipeline(
144
- # demo_video,
145
- # categorical_keywords=['human', 'dog', 'frisbee'],
146
- # unary_keywords=['running', 'jumping', 'sitting'],
147
- # binary_keywords=['behind', 'chasing', 'next to'],
148
- # debug_visualizations=True,
149
- # )
150
- # print("Results:", results['summary'])
151
-
152
- return vine_pipeline
153
-
154
-
155
-
156
- def example_manual_weight_loading():
157
- """Example of manually loading weights after model creation."""
158
- print("\n=== Manual Weight Loading ===")
159
-
160
- # Create model with base CLIP weights
161
- # No pretrained path: create base config (no HF repo or local file configured)
162
- config = VineConfig()
163
- vine_model = VineModel(config)
164
- print("✓ Model created with base CLIP weights")
165
- model_dir = "/path/to/your/local/ensemble/model_dir.pt" # Replace with your model directory
166
-
167
- if os.path.exists(model_dir):
168
- success = vine_model.load_pretrained_vine_weights(model_dir, epoch=0)
169
- if success:
170
- print("✓ Successfully loaded pretrained VINE weights manually")
171
- else:
172
- print("✗ Failed to load pretrained weights")
173
- else:
174
- print(f"✗ Model directory not found: {model_dir}")
175
-
176
- return vine_model
177
-
178
-
179
- def compare_model_outputs():
180
- """Compare outputs between base CLIP and pretrained VINE."""
181
- print("\n=== Comparing Model Outputs ===")
182
-
183
- # Create dummy data for testing
184
- video_frames = torch.randn(3, 224, 224, 3) * 255 # 3 frames
185
- video_frames = video_frames.clamp(0, 255).byte()
186
-
187
- masks = {
188
- 0: {1: torch.ones(224, 224, 1)},
189
- 1: {1: torch.ones(224, 224, 1)},
190
- 2: {1: torch.ones(224, 224, 1)}
191
- }
192
-
193
- bboxes = {
194
- 0: {1: [50, 50, 150, 150]},
195
- 1: {1: [52, 52, 152, 152]},
196
- 2: {1: [54, 54, 154, 154]}
197
- }
198
-
199
- keywords = ['human', 'dog', 'frisbee']
200
-
201
- # Model 1: Base CLIP
202
- print("Creating model with base CLIP weights...")
203
- config_base = VineConfig()
204
- model_base = VineModel(config_base)
205
-
206
- # Model 2: Pretrained VINE (if available)
207
- data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
208
- model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
209
-
210
- if os.path.exists(model_dir):
211
- print("Creating model with pretrained VINE weights...")
212
- config_vine = VineConfig(
213
- use_hf_repo=False,
214
- local_dir=model_dir,
215
- local_filename=None,
216
- )
217
- model_vine = VineModel(config_vine)
218
-
219
- print("\nComparing predictions...")
220
-
221
- # Get predictions from both models
222
- with torch.no_grad():
223
- results_base = model_base.predict(
224
- video_frames=video_frames,
225
- masks=masks,
226
- bboxes=bboxes,
227
- categorical_keywords=keywords,
228
- return_top_k=3
229
- )
230
-
231
- results_vine = model_vine.predict(
232
- video_frames=video_frames,
233
- masks=masks,
234
- bboxes=bboxes,
235
- categorical_keywords=keywords,
236
- return_top_k=3
237
- )
238
-
239
- print("Base CLIP confidence scores:", results_base['confidence_scores'])
240
- print("Pretrained VINE confidence scores:", results_vine['confidence_scores'])
241
-
242
- print("✓ Successfully compared both models")
243
- else:
244
- print(f"Pretrained model not found at: {model_dir}")
245
- print("Skipping comparison")
246
-
247
-
248
- if __name__ == "__main__":
249
- print("VINE HuggingFace Interface - Pretrained Weights Examples")
250
- print("=" * 60)
251
-
252
- try:
253
- # Test local pretrained weights
254
- model1 = example_with_local_pretrained_weights()
255
- except Exception as e:
256
- print(f"Local weights example failed: {e}")
257
-
258
- try:
259
- # Test HuggingFace Hub weights
260
- model2 = example_with_huggingface_hub()
261
- except Exception as e:
262
- print(f"HuggingFace Hub example failed: {e}")
263
-
264
- try:
265
- # Test pipeline with pretrained weights
266
- pipeline = example_pipeline_with_pretrained()
267
- except Exception as e:
268
- print(f"Pipeline example failed: {e}")
269
-
270
- # try:
271
- # # Test manual weight loading
272
- # #model3 = example_manual_weight_loading()
273
- # except Exception as e:
274
- # print(f"Manual loading example failed: {e}")
275
-
276
- # try:
277
- # # Compare model outputs
278
- # #compare_model_outputs()
279
- # except Exception as e:
280
- # print(f"Comparison example failed: {e}")
281
-
282
- print("\n" + "=" * 60)
283
- print("Examples completed!")
284
- print("\nUsage Summary:")
285
- print("1. Configure VineConfig with `use_hf_repo` + `model_repo` for Hub models, or `use_hf_repo=False` + `local_dir`/`local_filename` for local weights")
286
- print("2. Use VineModel.from_pretrained_vine() for direct loading")
287
-
src/vine_hf/flattening.py DELETED
@@ -1,124 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
5
-
6
- import numpy as np
7
- import torch
8
-
9
-
10
- MaskType = Union[np.ndarray, torch.Tensor]
11
-
12
-
13
- def _to_numpy_mask(mask: MaskType) -> np.ndarray:
14
- """
15
- Convert assorted mask formats to a 2D numpy boolean array.
16
- """
17
- if isinstance(mask, torch.Tensor):
18
- mask_np = mask.detach().cpu().numpy()
19
- else:
20
- mask_np = np.asarray(mask)
21
-
22
- # Remove singleton dimensions at the front/back
23
- while mask_np.ndim > 2 and mask_np.shape[0] == 1:
24
- mask_np = np.squeeze(mask_np, axis=0)
25
- if mask_np.ndim > 2 and mask_np.shape[-1] == 1:
26
- mask_np = np.squeeze(mask_np, axis=-1)
27
-
28
- if mask_np.ndim != 2:
29
- raise ValueError(f"Expected mask to be 2D after squeezing, got shape {mask_np.shape}")
30
-
31
- return mask_np.astype(bool)
32
-
33
-
34
- def _mask_to_bbox(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
35
- """
36
- Compute a bounding box for a 2D boolean mask.
37
- """
38
- if not mask.any():
39
- return None
40
- rows, cols = np.nonzero(mask)
41
- y_min, y_max = rows.min(), rows.max()
42
- x_min, x_max = cols.min(), cols.max()
43
- return x_min, y_min, x_max, y_max
44
-
45
-
46
- def flatten_segments_for_batch(
47
- video_id: int,
48
- segments: Dict[int, Dict[int, MaskType]],
49
- bbox_min_dim: int = 5,
50
- ) -> Dict[str, List]:
51
- """
52
- Flatten nested segmentation data into batched lists suitable for predicate
53
- models or downstream visualizations. Mirrors the notebook helper but is
54
- robust to differing mask dtypes/shapes.
55
- """
56
- batched_object_ids: List[Tuple[int, int, int]] = []
57
- batched_masks: List[np.ndarray] = []
58
- batched_bboxes: List[Tuple[int, int, int, int]] = []
59
- frame_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
60
-
61
- for frame_id, frame_objects in segments.items():
62
- valid_objects: List[int] = []
63
- for object_id, raw_mask in frame_objects.items():
64
- mask = _to_numpy_mask(raw_mask)
65
- bbox = _mask_to_bbox(mask)
66
- if bbox is None:
67
- continue
68
-
69
- x_min, y_min, x_max, y_max = bbox
70
- if abs(y_max - y_min) < bbox_min_dim or abs(x_max - x_min) < bbox_min_dim:
71
- continue
72
-
73
- valid_objects.append(object_id)
74
- batched_object_ids.append((video_id, frame_id, object_id))
75
- batched_masks.append(mask)
76
- batched_bboxes.append(bbox)
77
-
78
- for i in valid_objects:
79
- for j in valid_objects:
80
- if i == j:
81
- continue
82
- frame_pairs.append((video_id, frame_id, (i, j)))
83
-
84
- return {
85
- "object_ids": batched_object_ids,
86
- "masks": batched_masks,
87
- "bboxes": batched_bboxes,
88
- "pairs": frame_pairs,
89
- }
90
-
91
-
92
- def extract_valid_object_pairs(
93
- batched_object_ids: Sequence[Tuple[int, int, int]],
94
- interested_object_pairs: Optional[Iterable[Tuple[int, int]]] = None,
95
- ) -> List[Tuple[int, int, Tuple[int, int]]]:
96
- """
97
- Filter object pairs per frame. If `interested_object_pairs` is provided, only
98
- emit those combinations when both objects are present; otherwise emit all
99
- permutations (i, j) with i != j for each frame.
100
- """
101
- frame_to_objects: Dict[Tuple[int, int], set] = defaultdict(set)
102
- for vid, fid, oid in batched_object_ids:
103
- frame_to_objects[(vid, fid)].add(oid)
104
-
105
- interested = (
106
- list(interested_object_pairs)
107
- if interested_object_pairs is not None
108
- else None
109
- )
110
-
111
- valid_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
112
- for (vid, fid), object_ids in frame_to_objects.items():
113
- if interested:
114
- for src, dst in interested:
115
- if src in object_ids and dst in object_ids:
116
- valid_pairs.append((vid, fid, (src, dst)))
117
- else:
118
- for src in object_ids:
119
- for dst in object_ids:
120
- if src == dst:
121
- continue
122
- valid_pairs.append((vid, fid, (src, dst)))
123
-
124
- return valid_pairs
src/vine_hf/push_to_hub.py DELETED
@@ -1,232 +0,0 @@
1
- """
2
- Script to push VINE model to HuggingFace Hub
3
-
4
- This script helps you push your trained VINE model to the HuggingFace Hub
5
- for easy sharing and distribution.
6
- """
7
-
8
- import os
9
- import sys
10
- import torch
11
- import argparse
12
- from huggingface_hub import notebook_login
13
- from transformers.pipelines import PIPELINE_REGISTRY
14
-
15
- # Add the parent directory to the path to import vine_hf
16
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17
-
18
- os.environ['OPENAI_API_KEY'] = "dummy-key"
19
- from vine_hf import VineConfig, VineModel, VinePipeline
20
-
21
-
22
- def push_vine_to_hub(
23
- model_weights_path: str,
24
- repo_name: str,
25
- model_name: str = "openai/clip-vit-base-patch32",
26
- segmentation_method: str = "grounding_dino_sam2",
27
- commit_message: str = "Upload VINE model",
28
- private: bool = False
29
- ):
30
- """
31
- Push VINE model to HuggingFace Hub.
32
-
33
- Args:
34
- model_weights_path: Path to the trained model weights (.pth file)
35
- repo_name: Name for the repository (e.g., "username/vine-model")
36
- model_name: CLIP model backbone name
37
- segmentation_method: Segmentation method used
38
- commit_message: Commit message for the push
39
- private: Whether to create a private repository
40
- """
41
-
42
- print("=== Pushing VINE Model to HuggingFace Hub ===")
43
-
44
- # 1. Create configuration
45
- print(f"Creating configuration with backbone: {model_name}")
46
- config = VineConfig(
47
- model_name=model_name,
48
- segmentation_method=segmentation_method
49
- )
50
-
51
- # 2. Initialize model
52
- print("Initializing model...")
53
- model = VineModel(config)
54
-
55
- # 3. Load trained weights
56
- if os.path.exists(model_weights_path):
57
- print(f"Loading weights from: {model_weights_path}")
58
- try:
59
- # Try loading with weights_only=False for compatibility
60
- weights = torch.load(model_weights_path, map_location='cpu', weights_only=False)
61
-
62
- # Handle different weight formats
63
- if isinstance(weights, dict):
64
- if 'state_dict' in weights:
65
- model.load_state_dict(weights['state_dict'])
66
- elif 'model' in weights:
67
- model.load_state_dict(weights['model'])
68
- else:
69
- model.load_state_dict(weights)
70
- else:
71
- # Assume it's the model directly
72
- model = weights
73
-
74
- print("✓ Weights loaded successfully")
75
- except Exception as e:
76
- print(f"✗ Error loading weights: {e}")
77
- print("Please check your weights file format")
78
- return False
79
- else:
80
- print(f"✗ Weights file not found: {model_weights_path}")
81
- return False
82
-
83
- # 4. Register for auto classes
84
- print("Registering for auto classes...")
85
- config.register_for_auto_class()
86
- model.register_for_auto_class("AutoModel")
87
-
88
- # 5. Register pipeline
89
- print("Registering pipeline...")
90
- PIPELINE_REGISTRY.register_pipeline(
91
- "vine-video-understanding",
92
- pipeline_class=VinePipeline,
93
- pt_model=VineModel,
94
- type="multimodal",
95
- )
96
-
97
- # 6. Create pipeline instance
98
- print("Creating pipeline...")
99
- vine_pipeline = VinePipeline(model=model, tokenizer=None)
100
-
101
- try:
102
- # 7. Push configuration to hub
103
- print(f"Pushing configuration to {repo_name}...")
104
- config.push_to_hub(
105
- repo_name,
106
- commit_message=f"{commit_message} - config",
107
- private=private
108
- )
109
- print("✓ Configuration pushed successfully")
110
-
111
- # 8. Push model to hub
112
- print(f"Pushing model to {repo_name}...")
113
- model.push_to_hub(
114
- repo_name,
115
- commit_message=f"{commit_message} - model",
116
- private=private
117
- )
118
- print("✓ Model pushed successfully")
119
-
120
- # 9. Push pipeline to hub
121
- print(f"Pushing pipeline to {repo_name}...")
122
- vine_pipeline.push_to_hub(
123
- repo_name,
124
- commit_message=f"{commit_message} - pipeline",
125
- private=private
126
- )
127
- print("✓ Pipeline pushed successfully")
128
-
129
- print(f"\n🎉 Successfully pushed VINE model to: https://huggingface.co/{repo_name}")
130
- print(f"\nTo use your model:")
131
- print(f"```python")
132
- print(f"from transformers import pipeline")
133
- print(f"")
134
- print(f"vine_pipeline = pipeline(")
135
- print(f" 'vine-video-understanding',")
136
- print(f" model='{repo_name}',")
137
- print(f" trust_remote_code=True")
138
- print(f")")
139
- print(f"")
140
- print(f"results = vine_pipeline(")
141
- print(f" 'path/to/video.mp4',")
142
- print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
143
- print(f" unary_keywords=['running', 'jumping'],")
144
- print(f" binary_keywords=['chasing', 'behind']")
145
- print(f")")
146
- print(f"```")
147
-
148
- return True
149
-
150
- except Exception as e:
151
- print(f"✗ Error pushing to hub: {e}")
152
- print("Please check your HuggingFace credentials and repository permissions")
153
- return False
154
-
155
-
156
- def main():
157
- parser = argparse.ArgumentParser(description="Push VINE model to HuggingFace Hub")
158
-
159
- parser.add_argument(
160
- "--weights",
161
- type=str,
162
- required=True,
163
- help="Path to the trained model weights (.pth file)"
164
- )
165
-
166
- parser.add_argument(
167
- "--repo",
168
- type=str,
169
- required=True,
170
- help="Repository name (e.g., 'username/vine-model')"
171
- )
172
-
173
- parser.add_argument(
174
- "--model-name",
175
- type=str,
176
- default="openai/clip-vit-base-patch32",
177
- help="CLIP model backbone name"
178
- )
179
-
180
- parser.add_argument(
181
- "--segmentation",
182
- type=str,
183
- default="grounding_dino_sam2",
184
- choices=["sam2", "grounding_dino_sam2"],
185
- help="Segmentation method"
186
- )
187
-
188
- parser.add_argument(
189
- "--message",
190
- type=str,
191
- default="Upload VINE model",
192
- help="Commit message"
193
- )
194
-
195
- parser.add_argument(
196
- "--private",
197
- action="store_true",
198
- help="Create private repository"
199
- )
200
-
201
- parser.add_argument(
202
- "--login",
203
- action="store_true",
204
- help="Login to HuggingFace Hub first"
205
- )
206
-
207
- args = parser.parse_args()
208
-
209
- # Login if requested
210
- if args.login:
211
- print("Logging in to HuggingFace Hub...")
212
- notebook_login()
213
-
214
- # Push model
215
- success = push_vine_to_hub(
216
- model_weights_path=args.weights,
217
- repo_name=args.repo,
218
- model_name=args.model_name,
219
- segmentation_method=args.segmentation,
220
- commit_message=args.message,
221
- private=args.private
222
- )
223
-
224
- if success:
225
- print("\n✅ Model successfully pushed to HuggingFace Hub!")
226
- else:
227
- print("\n❌ Failed to push model to HuggingFace Hub")
228
- sys.exit(1)
229
-
230
-
231
- if __name__ == "__main__":
232
- main()
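
The deleted `push_to_hub.py` above chains the config, model, and pipeline uploads behind one CLI. A minimal programmatic sketch of the same flow, assuming you already have a trained weights file and Hub write access (the repo name and weights path below are placeholders):

```python
# Sketch of the push flow from the deleted push_to_hub.py (placeholder names/paths).
import torch
from vine_hf import VineConfig, VineModel

config = VineConfig(segmentation_method="grounding_dino_sam2")
model = VineModel(config)
# Placeholder path; unwrap 'state_dict'/'model' keys first if your checkpoint nests them.
model.load_state_dict(torch.load("path/to/weights.pth", map_location="cpu"))

# Register the custom classes so AutoConfig/AutoModel can resolve them from the Hub.
config.register_for_auto_class()
model.register_for_auto_class("AutoModel")

config.push_to_hub("your-username/vine-model", commit_message="Upload VINE model - config")
model.push_to_hub("your-username/vine-model", commit_message="Upload VINE model - model")
```
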
src/vine_hf/setup.py DELETED
@@ -1,63 +0,0 @@
1
- """
2
- Setup script for VINE HuggingFace Interface
3
- """
4
-
5
- from setuptools import setup, find_packages
6
-
7
- with open("README.md", "r", encoding="utf-8") as fh:
8
- long_description = fh.read()
9
-
10
- setup(
11
- name="vine-hf",
12
- version="1.0.0",
13
- author="LASER Team",
14
- author_email="[email protected]",
15
- description="HuggingFace interface for VINE (Video Understanding with Natural Language)",
16
- long_description=long_description,
17
- long_description_content_type="text/markdown",
18
- url="https://github.com/your-username/vine-hf",
19
- packages=["vine_hf"],
20
- package_dir={"vine_hf": "."},
21
- classifiers=[
22
- "Development Status :: 4 - Beta",
23
- "Intended Audience :: Developers",
24
- "Intended Audience :: Science/Research",
25
- "License :: OSI Approved :: MIT License",
26
- "Operating System :: OS Independent",
27
- "Programming Language :: Python :: 3",
28
- "Programming Language :: Python :: 3.7",
29
- "Programming Language :: Python :: 3.8",
30
- "Programming Language :: Python :: 3.9",
31
- "Programming Language :: Python :: 3.10",
32
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
33
- "Topic :: Multimedia :: Video",
34
- ],
35
- python_requires=">=3.7",
36
- install_requires=[
37
- "torch>=1.9.0",
38
- "torchvision>=0.10.0",
39
- "transformers>=4.20.0",
40
- "opencv-python>=4.5.0",
41
- "pillow>=8.0.0",
42
- "numpy>=1.20.0",
43
- "huggingface-hub>=0.10.0",
44
- "tqdm>=4.60.0",
45
- ],
46
- extras_require={
47
- "dev": [
48
- "pytest>=6.0",
49
- "black>=22.0",
50
- "flake8>=4.0",
51
- "isort>=5.0",
52
- ],
53
- "segmentation": [
54
- # Note: SAM2 and Grounding DINO need to be installed separately
55
- # as they're not available on PyPI
56
- ],
57
- },
58
- entry_points={
59
- "console_scripts": [
60
- "vine-push-to-hub=vine_hf.push_to_hub:main",
61
- ],
62
- },
63
- )
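
The setup above registers `vine-push-to-hub` as a console script pointing at `vine_hf.push_to_hub:main`. A hedged sketch of driving that same entry point from Python instead of the shell (the weights path and repo name are placeholders):

```python
# Sketch: invoke the console-script target directly; equivalent to running
#   vine-push-to-hub --weights <file> --repo <user/repo>
import sys
from vine_hf.push_to_hub import main

sys.argv = [
    "vine-push-to-hub",
    "--weights", "checkpoints/vine_weights.pth",   # placeholder path
    "--repo", "your-username/vine-model",          # placeholder repo
]
main()
```
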
src/vine_hf/vine_config.py DELETED
@@ -1,108 +0,0 @@
1
- import torch
2
- from transformers import PretrainedConfig
3
- from typing import List, Optional, Dict, Any, Tuple
4
- from pathlib import Path
5
-
6
-
7
- class VineConfig(PretrainedConfig):
8
- """
9
- Configuration class for VINE (Video Understanding with Natural Language) model.
10
-
11
- VINE is a video understanding model that processes categorical (object class names),
12
- unary keywords (actions on one object), and binary keywords (relations between two objects),
13
- and returns probability distributions over all of them when passed a video.
14
-
15
- Args:
16
- model_name (str): The CLIP model name to use as backbone. Default: "openai/clip-vit-base-patch32"
17
- hidden_dim (int): Hidden dimension size. Default: 768
18
- num_top_pairs (int): Number of top object pairs to consider. Default: 18
19
- segmentation_method (str): Segmentation method to use ("sam2" or "grounding_dino_sam2"). Default: "grounding_dino_sam2"
20
- box_threshold (float): Box threshold for Grounding DINO. Default: 0.35
21
- text_threshold (float): Text threshold for Grounding DINO. Default: 0.25
22
- target_fps (int): Target FPS for video processing. Default: 1
23
- alpha (float): Alpha value for object extraction. Default: 0.5
24
- white_alpha (float): White alpha value for background blending. Default: 0.8
25
- topk_cate (int): Top-k categories to return. Default: 3
26
- multi_class (bool): Whether to use multi-class classification. Default: False
27
- output_logit (bool): Whether to output logits instead of probabilities. Default: False
28
- max_video_length (int): Maximum number of frames to process. Default: 100
29
- bbox_min_dim (int): Minimum bounding box dimension. Default: 5
30
- visualize (bool): Whether to visualize results. Default: False
31
- visualization_dir (str, optional): Directory to save visualizations. Default: None
32
- debug_visualizations (bool): Whether to save debug visualizations. Default: False
33
- return_flattened_segments (bool): Whether to return flattened segments. Default: False
34
- return_valid_pairs (bool): Whether to return valid object pairs. Default: False
35
- interested_object_pairs (List[Tuple[int, int]], optional): List of interested object pairs
36
- """
37
-
38
- model_type = "vine"
39
-
40
- def __init__(
41
- self,
42
- model_name: str = "openai/clip-vit-base-patch32",
43
- hidden_dim = 768,
44
-
45
- use_hf_repo: bool = True,
46
- model_repo: Optional[str] = "KevinX-Penn28/testing",
47
- model_file: Optional[str] = None,
48
- local_dir: Optional[str] = str(Path(__file__).resolve().parent),
49
- local_filename: Optional[str] = "laser_model_v1.pkl",
50
-
51
- num_top_pairs: int = 18,
52
- segmentation_method: str = "grounding_dino_sam2",
53
- box_threshold: float = 0.35,
54
- text_threshold: float = 0.25,
55
- target_fps: int = 1,
56
- alpha: float = 0.5,
57
- white_alpha: float = 0.8,
58
- topk_cate: int = 3,
59
- multi_class: bool = False,
60
- output_logit: bool = False,
61
- max_video_length: int = 100,
62
- bbox_min_dim: int = 5,
63
- visualize: bool = False,
64
- visualization_dir: Optional[str] = None,
65
- return_flattened_segments: bool = False,
66
- return_valid_pairs: bool = False,
67
- interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
68
- debug_visualizations: bool = False,
69
- device: Optional[str | int] = None,
70
- **kwargs
71
- ):
72
- self.model_name = model_name
73
- self.use_hf_repo = use_hf_repo
74
- if use_hf_repo:
75
- self.model_repo = model_repo
76
- self.model_file = model_file
77
- self.local_dir = None
78
- self.local_filename = None
79
- else:
80
- self.model_repo = None
81
- self.model_file = None
82
- self.local_dir = local_dir
83
- self.local_filename = local_filename
84
- self.hidden_dim = hidden_dim
85
- self.num_top_pairs = num_top_pairs
86
- self.segmentation_method = segmentation_method
87
- self.box_threshold = box_threshold
88
- self.text_threshold = text_threshold
89
- self.target_fps = target_fps
90
- self.alpha = alpha
91
- self.white_alpha = white_alpha
92
- self.topk_cate = topk_cate
93
- self.multi_class = multi_class
94
- self.output_logit = output_logit
95
- self.max_video_length = max_video_length
96
- self.bbox_min_dim = bbox_min_dim
97
- self.visualize = visualize
98
- self.visualization_dir = visualization_dir
99
- self.return_flattened_segments = return_flattened_segments
100
- self.return_valid_pairs = return_valid_pairs
101
- self.interested_object_pairs = interested_object_pairs or []
102
- self.debug_visualizations = debug_visualizations
103
- if isinstance(device, int):
104
- self._device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
105
- else:
106
- self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")
107
-
108
- super().__init__(**kwargs)
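
`VineConfig` above supports two mutually exclusive weight sources (`use_hf_repo` switches between a Hub repo and a local file) and normalizes `device` from an int, string, or `None`. A short sketch of both modes; the repo name and paths are illustrative:

```python
# Sketch: the two weight-source modes of VineConfig (values are illustrative).
from vine_hf import VineConfig

# Pull VINE weights from a Hugging Face repo (the default mode).
hub_cfg = VineConfig(use_hf_repo=True, model_repo="video-fm/vine_v0", device=0)
print(hub_cfg._device)  # "cuda:0" when CUDA is available, otherwise "cpu"

# Load a local checkpoint instead; the repo fields are cleared automatically.
local_cfg = VineConfig(
    use_hf_repo=False,
    local_dir="/abs/path/to/checkpoints",   # placeholder directory
    local_filename="laser_model_v1.pkl",
)
print(local_cfg.model_repo)  # None in local mode
```
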
src/vine_hf/vine_hf.egg-info/PKG-INFO DELETED
@@ -1,401 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: vine-hf
3
- Version: 1.0.0
4
- Summary: HuggingFace interface for VINE (Video Understanding with Natural Language)
5
- Home-page: https://github.com/your-username/vine-hf
6
- Author: LASER Team
7
- Author-email: [email protected]
8
- Classifier: Development Status :: 4 - Beta
9
- Classifier: Intended Audience :: Developers
10
- Classifier: Intended Audience :: Science/Research
11
- Classifier: License :: OSI Approved :: MIT License
12
- Classifier: Operating System :: OS Independent
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.7
15
- Classifier: Programming Language :: Python :: 3.8
16
- Classifier: Programming Language :: Python :: 3.9
17
- Classifier: Programming Language :: Python :: 3.10
18
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
- Classifier: Topic :: Multimedia :: Video
20
- Requires-Python: >=3.7
21
- Description-Content-Type: text/markdown
22
- Requires-Dist: torch>=1.9.0
23
- Requires-Dist: torchvision>=0.10.0
24
- Requires-Dist: transformers>=4.20.0
25
- Requires-Dist: opencv-python>=4.5.0
26
- Requires-Dist: pillow>=8.0.0
27
- Requires-Dist: numpy>=1.20.0
28
- Requires-Dist: huggingface-hub>=0.10.0
29
- Requires-Dist: tqdm>=4.60.0
30
- Provides-Extra: dev
31
- Requires-Dist: pytest>=6.0; extra == "dev"
32
- Requires-Dist: black>=22.0; extra == "dev"
33
- Requires-Dist: flake8>=4.0; extra == "dev"
34
- Requires-Dist: isort>=5.0; extra == "dev"
35
- Provides-Extra: segmentation
36
- Dynamic: author
37
- Dynamic: author-email
38
- Dynamic: classifier
39
- Dynamic: description
40
- Dynamic: description-content-type
41
- Dynamic: home-page
42
- Dynamic: provides-extra
43
- Dynamic: requires-dist
44
- Dynamic: requires-python
45
- Dynamic: summary
46
-
47
- # VINE HuggingFace Interface
48
-
49
- VINE (Video Understanding with Natural Language) is a model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.
50
-
51
- This package provides a HuggingFace-compatible interface for the VINE model, making it easy to use for video understanding tasks.
52
-
53
- ## Features
54
-
55
- - **Categorical Classification**: Classify objects in videos (e.g., "human", "dog", "frisbee")
56
- - **Unary Predicates**: Detect actions on single objects (e.g., "running", "jumping", "sitting")
57
- - **Binary Relations**: Detect relationships between object pairs (e.g., "behind", "in front of", "chasing")
58
- - **Multiple Segmentation Methods**: Support for SAM2 and Grounding DINO + SAM2
59
- - **HuggingFace Integration**: Full compatibility with HuggingFace transformers and pipelines
60
- - **Visualization Hooks**: Optional high-level visualizations plus lightweight debug mask dumps for quick sanity checks
61
-
62
- ## Installation
63
-
64
- ```bash
65
- # Install the package (assuming it's in your Python path)
66
- pip install transformers torch torchvision
67
- pip install opencv-python pillow numpy
68
-
69
- # For segmentation functionality, you'll also need:
70
- # - SAM2: https://github.com/facebookresearch/sam2
71
- # - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
72
- ```
73
-
74
- ## Segmentation Model Configuration
75
-
76
- `VinePipeline` lazily brings up the segmentation stack the first time a call needs masks. Thresholds, FPS, visualization toggles, and device selection live in `VineConfig`; the pipeline constructor tells it where to fetch SAM2 / GroundingDINO weights or lets you inject already-instantiated modules.
77
-
78
- ### Provide file paths at construction (most common)
79
-
80
- ```python
81
- from vine_hf import VineConfig, VineModel, VinePipeline
82
-
83
- vine_config = VineConfig(
84
- segmentation_method="grounding_dino_sam2", # or "sam2"
85
- box_threshold=0.35,
86
- text_threshold=0.25,
87
- target_fps=5,
88
- visualization_dir="output/visualizations", # where to write visualizations (and debug visualizations if enabled)
89
- debug_visualizations=True, # write videos of the intermediate GroundingDINO / SAM2 / unary / binary outputs
90
- pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
91
- device="cuda:0", # accepts int, str, or torch.device
92
- )
93
-
94
- vine_model = VineModel(vine_config)
95
-
96
- vine_pipeline = VinePipeline(
97
- model=vine_model,
98
- tokenizer=None,
99
- sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
100
- sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
101
- gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
102
- gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
103
- device=vine_config._device,
104
- )
105
- ```
106
-
107
- When `segmentation_method="grounding_dino_sam2"`, both SAM2 and GroundingDINO must be reachable. The pipeline validates the paths; missing files raise a `ValueError`. If you pick `"sam2"`, only the SAM2 config and checkpoint are required.
108
-
109
- ### Reuse pre-initialized segmentation modules
110
-
111
- If you build the segmentation stack elsewhere, inject the components with `set_segmentation_models` before running the pipeline:
112
-
113
- ```python
114
- from sam2.build_sam import build_sam2_video_predictor, build_sam2
115
- from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
116
- from groundingdino.util.inference import Model as GroundingDINOModel
117
-
118
- sam_predictor = build_sam2_video_predictor(..., device=vine_config._device)
119
- mask_generator = SAM2AutomaticMaskGenerator(build_sam2(..., device=vine_config._device))
120
- grounding_model = GroundingDINOModel(..., device=vine_config._device)
121
-
122
- vine_pipeline.set_segmentation_models(
123
- sam_predictor=sam_predictor,
124
- mask_generator=mask_generator,
125
- grounding_model=grounding_model,
126
- )
127
- ```
128
-
129
- Any argument left as `None` is initialized lazily from the file paths when the pipeline first needs that backend.
130
-
131
- ## Quick Start
132
-
133
- ## Requirements
134
- - torch
135
- - torchvision
136
- - transformers
137
- - opencv-python
138
- - matplotlib
139
- - seaborn
140
- - pandas
141
- - numpy
142
- - ipywidgets
143
- - tqdm
144
- - scikit-learn
145
- - sam2 (from Facebook Research): https://github.com/video-fm/video-sam2
146
- - sam2 weights (downloaded separately, e.g. https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
147
- - groundingdino (from IDEA Research)
148
- - groundingdino weights (downloaded separately, e.g. https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth)
149
- - spacy-fastlang
150
- - en-core-web-sm (for spacy-fastlang)
151
- - ffmpeg (for video processing)
152
- - (optional) LASER weights / full model checkpoint (downloaded separately, e.g. https://huggingface.co/video-fm/vine_v0)
153
-
154
- Most of these dependencies are installed by creating the environment from laser/environments/laser_env.yml in the LASER repo; sam2 and groundingdino must be installed manually following their own instructions.
155
-
156
- ### Using the Pipeline (Recommended)
157
- ```python
158
- from transformers.pipelines import PIPELINE_REGISTRY
159
- from vine_hf import VineConfig, VineModel, VinePipeline
160
-
161
- PIPELINE_REGISTRY.register_pipeline(
162
- "vine-video-understanding",
163
- pipeline_class=VinePipeline,
164
- pt_model=VineModel,
165
- type="multimodal",
166
- )
167
-
168
- config = VineConfig(
169
- segmentation_method="grounding_dino_sam2",
170
- pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
171
- visualization_dir="output",
172
- visualize=True,
173
- device="cuda:0",
174
- )
175
-
176
- model = VineModel(config)
177
-
178
- vine_pipeline = VinePipeline(
179
- model=model,
180
- tokenizer=None,
181
- sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
182
- sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
183
- gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
184
- gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
185
- device=config._device,
186
- )
187
-
188
- results = vine_pipeline(
189
- "/path/to/video.mp4",
190
- categorical_keywords=["dog", "human"],
191
- unary_keywords=["running"],
192
- binary_keywords=["chasing"],
193
- object_pairs=[(0, 1)],
194
- return_top_k=3,
195
- include_visualizations=True,
196
- )
197
- print(results["summary"])
198
- ```
199
-
200
- ### Using the Model Directly (Advanced)
201
-
202
- For advanced users who want to provide their own segmentation:
203
-
204
- ```python
205
- from vine_hf import VineConfig, VineModel
206
- import torch
207
-
208
- # Create configuration
209
- config = VineConfig(
210
- pretrained_vine_path="/path/to/your/vine/weights" # Optional: your fine-tuned weights
211
- )
212
-
213
- # Initialize model
214
- model = VineModel(config)
215
-
216
- # If you have your own video frames, masks, and bboxes from external segmentation
217
- video_frames = torch.randn(3, 224, 224, 3) * 255 # Your video frames
218
- masks = {0: {1: torch.ones(224, 224, 1)}} # Your segmentation masks
219
- bboxes = {0: {1: [50, 50, 150, 150]}} # Your bounding boxes
220
-
221
- # Run prediction
222
- results = model.predict(
223
- video_frames=video_frames,
224
- masks=masks,
225
- bboxes=bboxes,
226
- categorical_keywords=['human', 'dog', 'frisbee'],
227
- unary_keywords=['running', 'jumping'],
228
- binary_keywords=['chasing', 'following'],
229
- object_pairs=[(1, 2)],
230
- return_top_k=3
231
- )
232
- ```
233
-
234
- **Note**: For most users, the pipeline approach above is recommended as it handles video loading and segmentation automatically.
235
-
236
- ## Configuration Options
237
-
238
- The `VineConfig` class supports the following parameters (non-exhaustive):
239
-
240
- - `model_name`: CLIP model backbone (default: `"openai/clip-vit-large-patch14-336"`)
241
- - `pretrained_vine_path`: Optional path or Hugging Face repo with pretrained VINE weights
242
- - `segmentation_method`: `"sam2"` or `"grounding_dino_sam2"` (default: `"grounding_dino_sam2"`)
243
- - `box_threshold` / `text_threshold`: Grounding DINO thresholds
244
- - `target_fps`: Target FPS for video processing (default: `1`)
245
- - `alpha`, `white_alpha`: Rendering parameters used when extracting masked crops
246
- - `topk_cate`: Top-k categories to return per object (default: `3`)
247
- - `max_video_length`: Maximum frames to process (default: `100`)
248
- - `visualize`: When `True`, pipeline post-processing attempts to create stitched visualizations
249
- - `visualization_dir`: Optional base directory where visualization assets are written
250
- - `debug_visualizations`: When `True`, the model saves a single first-frame mask composite for quick inspection
251
- - `debug_visualization_path`: Target filepath for the debug mask composite (must point to a writable file)
252
- - `return_flattened_segments`, `return_valid_pairs`, `interested_object_pairs`: Advanced geometry outputs for downstream consumers
253
-
254
- ## Output Format
255
-
256
- The model returns a dictionary with the following structure:
257
-
258
- ```python
259
- {
260
- "masks" : {},
261
-
262
- "boxes" : {},
263
-
264
- "categorical_predictions": {
265
- object_id: [(probability, category), ...]
266
- },
267
- "unary_predictions": {
268
- (frame_id, object_id): [(probability, action), ...]
269
- },
270
- "binary_predictions": {
271
- (frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
272
- },
273
- "confidence_scores": {
274
- "categorical": max_categorical_confidence,
275
- "unary": max_unary_confidence,
276
- "binary": max_binary_confidence
277
- },
278
- "summary": {
279
- "num_objects_detected": int,
280
- "top_categories": [(category, probability), ...],
281
- "top_actions": [(action, probability), ...],
282
- "top_relations": [(relation, probability), ...]
283
- }
284
- }
285
- ```
286
-
287
- ## Visualization & Debugging
288
-
289
- There are two complementary visualization layers:
290
-
291
- - **Post-process visualizations** (`include_visualizations=True` in the pipeline call) produces a high-level stitched video summarizing detections, actions, and relations over time.
292
-
293
- - **Debug visualizations** (`debug_visualizations=True` in `VineConfig`) dumps videos of intermediate segmentation masks and outputs from GroundingDINO, SAM2, Unary, Binary, etc. for quick sanity checks.
294
-
295
- If you plan to enable either option, ensure the relevant output directories exist before running the pipeline.
296
-
297
- ## Segmentation Methods
298
-
299
- ### Grounding DINO + SAM2 (Recommended)
300
-
301
- Uses Grounding DINO for object detection based on text prompts, then SAM2 for precise segmentation.
302
-
303
- Requirements:
304
- - Grounding DINO model and weights
305
- - SAM2 model and weights
306
- - Properly configured paths to model checkpoints
307
-
308
- ### SAM2 Only
309
-
310
- Uses SAM2's automatic mask generation without text-based object detection.
311
-
312
- Requirements:
313
- - SAM2 model and weights
314
-
315
- ## Model Architecture
316
-
317
- VINE is built on top of CLIP and uses three separate CLIP models for different tasks:
318
- - **Categorical Model**: For object classification
319
- - **Unary Model**: For single-object action recognition
320
- - **Binary Model**: For relationship detection between object pairs
321
-
322
- Each model processes both visual and textual features to compute similarity scores and probability distributions.
323
-
324
- ## Pushing to HuggingFace Hub
325
-
326
- ```python
327
- from vine_hf import VineConfig, VineModel
328
-
329
- # Create and configure your model
330
- config = VineConfig()
331
- model = VineModel(config)
332
-
333
- # Load your pretrained weights
334
- # model.load_state_dict(torch.load('path/to/your/weights.pth'))
335
-
336
- # Register for auto classes
337
- config.register_for_auto_class()
338
- model.register_for_auto_class("AutoModel")
339
-
340
- # Push to Hub
341
- config.push_to_hub('your-username/vine-model')
342
- model.push_to_hub('your-username/vine-model')
343
- ```
344
-
345
- ## Loading from HuggingFace Hub
346
-
347
- ```python
348
- from transformers import AutoModel, pipeline
349
-
350
- # Load model
351
- model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
352
-
353
- # Or use with pipeline
354
- vine_pipeline = pipeline(
355
- 'vine-video-understanding',
356
- model='your-username/vine-model',
357
- trust_remote_code=True
358
- )
359
- ```
360
-
361
- ## Examples
362
-
363
- See `example_usage.py` for comprehensive examples including:
364
- - Direct model usage
365
- - Pipeline usage
366
- - HuggingFace Hub integration
367
- - Real video processing
368
-
369
- ## Requirements
370
-
371
- - Python 3.7+
372
- - PyTorch 1.9+
373
- - transformers 4.20+
374
- - OpenCV
375
- - PIL/Pillow
376
- - NumPy
377
-
378
- For segmentation:
379
- - SAM2 (Facebook Research)
380
- - Grounding DINO (IDEA Research)
381
-
382
- ## Citation
383
-
384
- If you use VINE in your research, please cite:
385
-
386
- ```bibtex
387
- @article{vine2024,
388
- title={VINE: Video Understanding with Natural Language},
389
- author={Your Authors},
390
- journal={Your Journal},
391
- year={2024}
392
- }
393
- ```
394
-
395
- ## License
396
-
397
- [Your License Here]
398
-
399
- ## Contact
400
-
401
- [Your Contact Information Here]
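
The Output Format section of the README above documents the nested result dictionaries but not how to consume them. A small sketch that walks that structure (the key layout is taken from the README; `results` is assumed to come from a pipeline call):

```python
# Sketch: summarize a VINE result dict shaped as documented in the README above.
def summarize(results, top_k=3):
    # {object_id: [(probability, category), ...]}
    for obj_id, preds in results["categorical_predictions"].items():
        best = sorted(preds, reverse=True)[:top_k]
        print(f"object {obj_id}:", ", ".join(f"{cat} ({p:.2f})" for p, cat in best))

    # {(frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]}
    for (frame_id, pair), preds in results["binary_predictions"].items():
        prob, relation = max(preds)
        print(f"frame {frame_id} {pair}: {relation} ({prob:.2f})")

    print("confidence:", results["confidence_scores"])
```
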
src/vine_hf/vine_hf.egg-info/SOURCES.txt DELETED
@@ -1,21 +0,0 @@
1
- README.md
2
- setup.py
3
- ./__init__.py
4
- ./convert_inference.py
5
- ./example_ensemble_weights.py
6
- ./example_sam2_masks.py
7
- ./example_usage.py
8
- ./example_visualization.py
9
- ./example_with_pretrained_vine.py
10
- ./flattening.py
11
- ./push_to_hub.py
12
- ./vine_config.py
13
- ./vine_model.py
14
- ./vine_pipeline.py
15
- ./vis_utils.py
16
- vine_hf.egg-info/PKG-INFO
17
- vine_hf.egg-info/SOURCES.txt
18
- vine_hf.egg-info/dependency_links.txt
19
- vine_hf.egg-info/entry_points.txt
20
- vine_hf.egg-info/requires.txt
21
- vine_hf.egg-info/top_level.txt
src/vine_hf/vine_hf.egg-info/dependency_links.txt DELETED
@@ -1 +0,0 @@
1
-
src/vine_hf/vine_hf.egg-info/entry_points.txt DELETED
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- vine-push-to-hub = vine_hf.push_to_hub:main
src/vine_hf/vine_hf.egg-info/requires.txt DELETED
@@ -1,16 +0,0 @@
1
- torch>=1.9.0
2
- torchvision>=0.10.0
3
- transformers>=4.20.0
4
- opencv-python>=4.5.0
5
- pillow>=8.0.0
6
- numpy>=1.20.0
7
- huggingface-hub>=0.10.0
8
- tqdm>=4.60.0
9
-
10
- [dev]
11
- pytest>=6.0
12
- black>=22.0
13
- flake8>=4.0
14
- isort>=5.0
15
-
16
- [segmentation]
src/vine_hf/vine_hf.egg-info/top_level.txt DELETED
@@ -1 +0,0 @@
1
- vine_hf
src/vine_hf/vine_model.py DELETED
@@ -1,702 +0,0 @@
1
- from flax import config
2
- import torch
3
- from torch import nn
4
- import torch.nn.functional as F
5
- import torch.utils.checkpoint as cp
6
- from transformers import PreTrainedModel, AutoTokenizer, AutoModel, AutoProcessor
7
- from typing import Dict, List, Tuple, Optional, Any, Union
8
- import numpy as np
9
- import os
10
- import cv2
11
- from collections import defaultdict
12
- import builtins
13
- import sys
14
- from laser.models import llava_clip_model_v3
15
- sys.modules["llava_clip_model_v3"] = llava_clip_model_v3
16
- from safetensors.torch import load_file
17
-
18
- import inspect
19
- from transformers.models.clip import modeling_clip
20
- import transformers
21
- from huggingface_hub import snapshot_download
22
-
23
-
24
-
25
-
26
- from .vine_config import VineConfig
27
- from laser.models.model_utils import (
28
- extract_single_object,
29
- extract_object_subject,
30
- crop_image_contain_bboxes,
31
- segment_list
32
- )
33
- from .flattening import (
34
- extract_valid_object_pairs,
35
- flatten_segments_for_batch,
36
- )
37
-
38
- from .vis_utils import save_mask_one_image
39
-
40
- class VineModel(PreTrainedModel):
41
- """
42
- VINE (Video Understanding with Natural Language) Model
43
-
44
- This model processes videos along with categorical, unary, and binary keywords
45
- to return probability distributions over those keywords for detected objects
46
- and their relationships in the video.
47
- """
48
-
49
- config_class = VineConfig
50
-
51
- def __init__(self, config: VineConfig):
52
- super().__init__(config)
53
-
54
- self.config = config
55
- self.visualize = getattr(config, "visualize", False)
56
- self.visualization_dir = getattr(config, "visualization_dir", None)
57
- self.debug_visualizations = getattr(config, "debug_visualizations", False)
58
- self._device = getattr(config, "_device")
59
-
60
-
61
-
62
- # Initialize CLIP components
63
- self.clip_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
64
- if self.clip_tokenizer.pad_token is None:
65
- self.clip_tokenizer.pad_token = (
66
- self.clip_tokenizer.unk_token
67
- if self.clip_tokenizer.unk_token
68
- else self.clip_tokenizer.eos_token
69
- )
70
- self.clip_processor = AutoProcessor.from_pretrained(config.model_name)
71
- self.clip_cate_model = AutoModel.from_pretrained(config.model_name)
72
- self.clip_unary_model = AutoModel.from_pretrained(config.model_name)
73
- self.clip_binary_model = AutoModel.from_pretrained(config.model_name)
74
-
75
-
76
- # Then try to load pretrained VINE weights if specified
77
- if config.use_hf_repo:
78
- self._load_huggingface_vine_weights(config.model_repo, config.model_file)
79
- else:
80
- self._load_local_pretrained_vine_weights(config.local_dir, config.local_filename)
81
-
82
- # Move models to device
83
- self.to(self._device)
84
-
85
- def _load_huggingface_vine_weights(self, model_repo: str, model_file: Optional[str] = None):
86
- """
87
- Load pretrained VINE weights from HuggingFace Hub.
88
- """
89
- try:
90
- print(f"Loading VINE weights from HuggingFace repo: {model_repo}")
91
- repo_path = snapshot_download(model_repo, revision=model_file or "main")
92
- weights = load_file(os.path.join(repo_path, "model.safetensors"))
93
- self.load_state_dict(weights, strict=False)
94
- print("✓ Successfully loaded VINE weights from HuggingFace Hub")
95
- return True
96
- except Exception as e:
97
- print(f"✗ Error loading VINE weights from HuggingFace Hub: {e}")
98
- print("Using base CLIP models instead")
99
- return False
100
-
101
- def _load_local_pretrained_vine_weights(self, local_dir: str, local_filename: Optional[str] = None, epoch: int = 0):
102
- """
103
- Load pretrained VINE weights from a saved .pkl/.pt/.pth file or an ensemble checkpoint directory.
104
- """
105
- #try: # simple .pt or .pth checkpoint
106
-
107
- # x = torch.load(pretrained_path, map_location=self._device, weights_only=False)
108
- # print(f"Loaded VINE checkpoint type: {type(x)}")
109
- full_path = os.path.join(local_dir, local_filename) if local_filename else local_dir
110
-
111
- if full_path.endswith(".pkl"):
112
- print(f"Loading VINE weights from: {full_path}")
113
- loaded_vine_model = torch.load(full_path, map_location=self._device, weights_only=False)
114
-
115
- print(f"Loaded state type: {type(loaded_vine_model)}")
116
- if not isinstance(loaded_vine_model, dict):
117
- if hasattr(loaded_vine_model, 'clip_cate_model'):
118
- self.clip_cate_model.load_state_dict(loaded_vine_model.clip_cate_model.state_dict())
119
- if hasattr(loaded_vine_model, 'clip_unary_model'):
120
- self.clip_unary_model.load_state_dict(loaded_vine_model.clip_unary_model.state_dict())
121
- if hasattr(loaded_vine_model, 'clip_binary_model'):
122
- self.clip_binary_model.load_state_dict(loaded_vine_model.clip_binary_model.state_dict())
123
- return True
124
-
125
- elif full_path.endswith(".pt") or full_path.endswith(".pth"):
126
- state = torch.load(full_path, map_location=self._device, weights_only=True)
127
- print(f"Loaded state type: {type(state)}")
128
- self.load_state_dict(state)
129
- return True
130
-
131
- # handle directory + epoch format
132
- if os.path.isdir(full_path):
133
- model_files = [f for f in os.listdir(full_path) if f.endswith(f'.{epoch}.model')]
134
- if model_files:
135
- model_file = os.path.join(full_path, model_files[0])
136
- print(f"Loading VINE weights from: {model_file}")
137
- pretrained_model = torch.load(model_file, map_location="cpu")
138
-
139
- # Conversion from PredicateModel-like object to VineModel
140
- # Only copy if attributes exist
141
- if hasattr(pretrained_model, 'clip_cate_model'):
142
- self.clip_cate_model.load_state_dict(pretrained_model.clip_cate_model.state_dict())
143
- if hasattr(pretrained_model, 'clip_unary_model'):
144
- self.clip_unary_model.load_state_dict(pretrained_model.clip_unary_model.state_dict())
145
- if hasattr(pretrained_model, 'clip_binary_model'):
146
- self.clip_binary_model.load_state_dict(pretrained_model.clip_binary_model.state_dict())
147
- print("✓ Loaded all sub-model weights from ensemble format")
148
- return True
149
- else:
150
- print(f"No model file found for epoch {epoch} in {full_path}")
151
- return False
152
-
153
- print("Unsupported format for pretrained_vine_path")
154
- return False
155
-
156
- # except Exception as e:
157
- # print(f"✗ Error loading VINE weights: {e}")
158
- # print("Using base CLIP models instead")
159
- # return False
160
-
161
-
162
-
163
- # def _load_pretrained_vine_weights(self, pretrained_path: str, epoch: int = 0):
164
- # """
165
- # Load pretrained VINE weights from local ensemble format.
166
-
167
- # Args:
168
- # pretrained_path: Path to the pretrained model directory or HF model name
169
- # epoch: Epoch number to load (for ensemble format)
170
- # """
171
- # if pretrained_path == "video-fm/vine_v0":
172
- # # Try to load from HuggingFace Hubtry:
173
- # # ✅ TODO FIXED: Added support for loading .pt/.pth checkpoints with state dicts
174
- # if pretrained_path.endswith(".pt") or pretrained_path.endswith(".pth"):
175
- # print(f"Loading VINE weights from: {pretrained_path}")
176
- # state = torch.load(pretrained_path, map_location="cpu")
177
-
178
- # if "clip_cate_model" in state:
179
- # self.clip_cate_model.load_state_dict(state["clip_cate_model"])
180
- # print("✓ Loaded categorical model weights")
181
- # if "clip_unary_model" in state:
182
- # self.clip_unary_model.load_state_dict(state["clip_unary_model"])
183
- # print("✓ Loaded unary model weights")
184
- # if "clip_binary_model" in state:
185
- # self.clip_binary_model.load_state_dict(state["clip_binary_model"])
186
- # print("✓ Loaded binary model weights")
187
-
188
- # if "clip_tokenizer" in state:
189
- # self.clip_tokenizer = state["clip_tokenizer"]
190
- # print("✓ Loaded tokenizer")
191
- # if "clip_processor" in state:
192
- # self.clip_processor = state["clip_processor"]
193
- # print("✓ Loaded processor")
194
-
195
- # print("✓ All VINE weights loaded successfully")
196
- # return True
197
-
198
- # # Load from local ensemble format
199
- # try:
200
- # if os.path.isdir(pretrained_path):
201
- # # Directory format - look for ensemble file
202
- # model_files = [f for f in os.listdir(pretrained_path) if f.endswith(f'.{epoch}.model')]
203
- # if model_files:
204
- # model_file = os.path.join(pretrained_path, model_files[0])
205
- # else:
206
- # print(f"No model file found for epoch {epoch} in {pretrained_path}")
207
- # return False
208
- # else:
209
- # # Direct file path
210
- # model_file = pretrained_path
211
-
212
- # print(f"Loading VINE weights from: {model_file}")
213
-
214
- # # Load the ensemble model (PredicateModel instance)
215
- # # TODO: conversion from PredicateModel to VineModel
216
- # pretrained_model = torch.load(model_file, map_location='cpu', weights_only=False)
217
-
218
- # # Transfer weights from the pretrained model to our HuggingFace models
219
- # if hasattr(pretrained_model, 'clip_cate_model'):
220
- # self.clip_cate_model.load_state_dict(pretrained_model.clip_cate_model.state_dict())
221
- # print("✓ Loaded categorical model weights")
222
-
223
- # if hasattr(pretrained_model, 'clip_unary_model'):
224
- # self.clip_unary_model.load_state_dict(pretrained_model.clip_unary_model.state_dict())
225
- # print("✓ Loaded unary model weights")
226
-
227
- # if hasattr(pretrained_model, 'clip_binary_model'):
228
- # self.clip_binary_model.load_state_dict(pretrained_model.clip_binary_model.state_dict())
229
- # print("✓ Loaded binary model weights")
230
-
231
- # # Also transfer tokenizer and processor if available
232
- # if hasattr(pretrained_model, 'clip_tokenizer'):
233
- # self.clip_tokenizer = pretrained_model.clip_tokenizer
234
- # print("✓ Loaded tokenizer")
235
-
236
- # if hasattr(pretrained_model, 'clip_processor'):
237
- # self.clip_processor = pretrained_model.clip_processor
238
- # print("✓ Loaded processor")
239
-
240
- # print("✓ Successfully loaded all VINE weights")
241
- # return True
242
-
243
- # except Exception as e:
244
- # print(f"✗ Error loading VINE weights: {e}")
245
- # print("Using base CLIP models instead")
246
- # return False
247
-
248
- @classmethod
249
- def from_pretrained_vine(
250
- cls,
251
- model_path: str,
252
- config: Optional[VineConfig] = None,
253
- epoch: int = 0,
254
- **kwargs
255
- ):
256
- """
257
- Create VineModel from pretrained VINE weights.
258
-
259
- Args:
260
- model_path: Path to pretrained VINE model
261
- config: Optional config, will create default if None
262
- epoch: Epoch number to load
263
- **kwargs: Additional arguments
264
-
265
- Returns:
266
- VineModel instance with loaded weights
267
- """
268
- # Normalize the incoming model_path into the new VineConfig fields.
269
- if config is None:
270
- # Heuristics: if path looks like a HF repo (contains a "/" and
271
- # doesn't exist on disk) treat it as a repo. Otherwise treat as local.
272
- if model_path and ("/" in model_path and not os.path.exists(model_path)):
273
- config = VineConfig(use_hf_repo=True, model_repo=model_path)
274
- else:
275
- # Local path: could be a file or directory
276
- if os.path.isdir(model_path):
277
- config = VineConfig(use_hf_repo=False, local_dir=model_path)
278
- else:
279
- config = VineConfig(
280
- use_hf_repo=False,
281
- local_dir=os.path.dirname(model_path) or None,
282
- local_filename=os.path.basename(model_path) or None,
283
- )
284
- else:
285
- # Update provided config to reflect the requested pretrained path
286
- if model_path and ("/" in model_path and not os.path.exists(model_path)):
287
- config.use_hf_repo = True
288
- config.model_repo = model_path
289
- config.model_file = None
290
- config.local_dir = None
291
- config.local_filename = None
292
- else:
293
- config.use_hf_repo = False
294
- if os.path.isdir(model_path):
295
- config.local_dir = model_path
296
- config.local_filename = None
297
- else:
298
- config.local_dir = os.path.dirname(model_path) or None
299
- config.local_filename = os.path.basename(model_path) or None
300
-
301
- # Create model instance (will automatically load weights)
302
- model = cls(config, **kwargs)
303
-
304
- return model
305
-
306
- def _text_features_checkpoint(self, model, tokens):
307
- """Extract text features with gradient checkpointing."""
308
- token_keys = list(tokens.keys())
309
-
310
- def get_text_features_wrapped(*inputs):
311
- kwargs = {key: value for key, value in zip(token_keys, inputs)}
312
- return model.get_text_features(**kwargs)
313
-
314
- token_values = [tokens[key] for key in token_keys]
315
- return cp.checkpoint(get_text_features_wrapped, *token_values, use_reentrant=False)
316
-
317
- def _image_features_checkpoint(self, model, images):
318
- """Extract image features with gradient checkpointing."""
319
- return cp.checkpoint(model.get_image_features, images, use_reentrant=False)
320
-
321
- def clip_sim(self, model, nl_feat, img_feat):
322
- img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
323
- nl_feat = nl_feat / nl_feat.norm(p=2, dim=-1, keepdim=True)
324
- logits = torch.matmul(img_feat, nl_feat.T)
325
- if hasattr(model, "logit_scale"):
326
- logits = logits * model.logit_scale.exp()
327
- return logits
328
-
329
- def forward(
330
- self,
331
- video_frames: torch.Tensor,
332
- masks: Dict[int, Dict[int, torch.Tensor]],
333
- bboxes: Dict[int, Dict[int, List]],
334
- categorical_keywords: List[str],
335
- unary_keywords: Optional[List[str]] = None,
336
- binary_keywords: Optional[List[str]] = None,
337
- object_pairs: Optional[List[Tuple[int, int]]] = None,
338
- return_flattened_segments: Optional[bool] = None,
339
- return_valid_pairs: Optional[bool] = None,
340
- interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
341
- debug_visualizations: Optional[bool] = None,
342
- **kwargs
343
- ) -> Dict[str, Any]:
344
- """
345
- Forward pass of the VINE model.
346
-
347
- Args:
348
- video_frames: Tensor of shape (num_frames, height, width, 3)
349
- masks: Dict mapping frame_id -> object_id -> mask tensor
350
- bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2]
351
- categorical_keywords: List of category names to classify objects
352
- unary_keywords: Optional list of unary predicates (actions on single objects)
353
- binary_keywords: Optional list of binary predicates (relations between objects)
354
- object_pairs: Optional list of (obj1_id, obj2_id) pairs for binary classification
355
-
356
- Returns:
357
- Dict containing probability distributions for categorical, unary, and binary predictions
358
- """
359
- if unary_keywords is None:
360
- unary_keywords = []
361
- if binary_keywords is None:
362
- binary_keywords = []
363
- if object_pairs is None:
364
- object_pairs = []
365
- if return_flattened_segments is None:
366
- return_flattened_segments = self.config.return_flattened_segments
367
- if return_valid_pairs is None:
368
- return_valid_pairs = self.config.return_valid_pairs
369
- if interested_object_pairs is None or len(interested_object_pairs) == 0:
370
- interested_object_pairs = getattr(self.config, "interested_object_pairs", []) or []
371
- if debug_visualizations is None:
372
- debug_visualizations = self.debug_visualizations
373
-
374
- # Prepare dummy strings for empty categories
375
- dummy_str = ""
376
-
377
- # Fill empty categories with dummy strings
378
- if len(categorical_keywords) == 0:
379
- categorical_keywords = [dummy_str]
380
- if len(unary_keywords) == 0:
381
- unary_keywords = [dummy_str]
382
- if len(binary_keywords) == 0:
383
- binary_keywords = [dummy_str]
384
-
385
- # Extract text features for all keyword types
386
- categorical_features = self._extract_text_features(
387
- self.clip_cate_model, categorical_keywords
388
- )
389
- unary_features = self._extract_text_features(
390
- self.clip_unary_model, unary_keywords
391
- )
392
- binary_features = self._extract_text_features(
393
- self.clip_binary_model, binary_keywords
394
- )
395
-
396
- # Process video frames and extract object features
397
- categorical_probs = {}
398
- unary_probs = {}
399
- binary_probs = {}
400
-
401
- # Process each frame
402
- for frame_id, frame_masks in masks.items():
403
- if frame_id >= len(video_frames):
404
- continue
405
-
406
- frame = self._frame_to_numpy(video_frames[frame_id])
407
- frame_bboxes = bboxes.get(frame_id, {})
408
-
409
- # Extract object features for categorical classification
410
- for obj_id, mask in frame_masks.items():
411
- if obj_id not in frame_bboxes:
412
- continue
413
-
414
- bbox = frame_bboxes[obj_id]
415
-
416
- # Extract single object image
417
- mask_np = self._mask_to_numpy(mask)
418
-
419
- obj_image = extract_single_object(
420
- frame, mask_np, alpha=self.config.alpha
421
- )
422
-
423
- # Get image features
424
- obj_features = self._extract_image_features(
425
- self.clip_cate_model, obj_image
426
- )
427
-
428
- # Compute similarities for categorical classification
429
- cat_similarities = self.clip_sim(
430
- self.clip_cate_model, categorical_features, obj_features
431
- )
432
- cat_probs = F.softmax(cat_similarities, dim=-1)
433
-
434
- # Store categorical predictions
435
- for i, keyword in enumerate(categorical_keywords):
436
- if keyword != dummy_str:
437
- categorical_probs[(obj_id, keyword)] = cat_probs[0, i].item()
438
-
439
- # Compute unary predictions
440
- if len(unary_keywords) > 0 and unary_keywords[0] != dummy_str:
441
- unary_similarities = self.clip_sim(
442
- self.clip_unary_model, unary_features, obj_features
443
- )
444
- unary_probs_tensor = F.softmax(unary_similarities, dim=-1)
445
-
446
- for i, keyword in enumerate(unary_keywords):
447
- if keyword != dummy_str:
448
- unary_probs[(frame_id, obj_id, keyword)] = unary_probs_tensor[0, i].item()
449
-
450
- # Process binary relationships
451
- if len(binary_keywords) > 0 and binary_keywords[0] != dummy_str and len(object_pairs) > 0:
452
- for obj1_id, obj2_id in object_pairs:
453
- for frame_id, frame_masks in masks.items():
454
- if frame_id >= len(video_frames):
455
- continue
456
- if (obj1_id in frame_masks and obj2_id in frame_masks and
457
- obj1_id in bboxes.get(frame_id, {}) and obj2_id in bboxes.get(frame_id, {})):
458
-
459
- frame = self._frame_to_numpy(video_frames[frame_id])
460
- mask1 = frame_masks[obj1_id]
461
- mask2 = frame_masks[obj2_id]
462
-
463
- mask1_np = self._mask_to_numpy(mask1)
464
- mask2_np = self._mask_to_numpy(mask2)
465
-
466
- # Extract object pair image
467
- pair_image = extract_object_subject(
468
- frame, mask1_np[..., None], mask2_np[..., None],
469
- alpha=self.config.alpha,
470
- white_alpha=self.config.white_alpha
471
- )
472
-
473
- # Crop to contain both objects
474
- bbox1 = bboxes[frame_id][obj1_id]
475
- bbox2 = bboxes[frame_id][obj2_id]
476
-
477
- # Bounding box overlap check
478
- if bbox1[0] >= bbox2[2] or bbox2[1] >= bbox1[3] or \
479
- bbox2[0] >= bbox1[2] or bbox1[1] >= bbox2[3]:
480
- continue
481
-
482
- cropped_image = crop_image_contain_bboxes(
483
- pair_image, [bbox1, bbox2], f"frame_{frame_id}"
484
- )
485
-
486
- # Get image features
487
- pair_features = self._extract_image_features(
488
- self.clip_binary_model, cropped_image
489
- )
490
-
491
- # Compute similarities for binary classification
492
- binary_similarities = self.clip_sim(
493
- self.clip_binary_model, binary_features, pair_features
494
- )
495
- binary_probs_tensor = F.softmax(binary_similarities, dim=-1)
496
-
497
- for i, keyword in enumerate(binary_keywords):
498
- if keyword != dummy_str:
499
- binary_probs[(frame_id, (obj1_id, obj2_id), keyword)] = binary_probs_tensor[0, i].item()
500
-
501
- # Calculate dummy probability (for compatibility)
502
- dummy_prob = 1.0 / max(len(categorical_keywords), len(unary_keywords), len(binary_keywords))
503
-
504
- result: Dict[str, Any] = {
505
- "categorical_probs": {0: categorical_probs}, # Video ID 0
506
- "unary_probs": {0: unary_probs},
507
- "binary_probs": [binary_probs], # List format for compatibility
508
- "dummy_prob": dummy_prob
509
- }
510
-
511
- if return_flattened_segments or return_valid_pairs:
512
- flattened = flatten_segments_for_batch(
513
- video_id=0,
514
- segments=masks,
515
- bbox_min_dim=self.config.bbox_min_dim,
516
- )
517
- if return_flattened_segments:
518
- result["flattened_segments"] = flattened
519
- if return_valid_pairs:
520
- interested_pairs = interested_object_pairs if interested_object_pairs else None
521
- result["valid_pairs"] = extract_valid_object_pairs(
522
- flattened["object_ids"],
523
- interested_pairs,
524
- )
525
- if interested_pairs is None:
526
- # Provide all generated pairs for clarity when auto-generated.
527
- result["valid_pairs_metadata"] = {"pair_source": "all_pairs"}
528
- else:
529
- result["valid_pairs_metadata"] = {"pair_source": "filtered", "requested_pairs": interested_pairs}
530
-
531
- return result
532
-
533
- def _frame_to_numpy(self, frame: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
534
- """Convert a frame tensor/array to a contiguous numpy array."""
535
- if torch.is_tensor(frame):
536
- frame_np = frame.detach().cpu().numpy()
537
- else:
538
- frame_np = np.asarray(frame)
539
- return np.ascontiguousarray(frame_np)
540
-
541
- def _mask_to_numpy(self, mask: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
542
- """Convert a mask tensor/array to a 2D boolean numpy array."""
543
- if torch.is_tensor(mask):
544
- mask_np = mask.detach().cpu().numpy()
545
- else:
546
- mask_np = np.asarray(mask)
547
-
548
- if mask_np.ndim == 3:
549
- if mask_np.shape[0] == 1:
550
- mask_np = mask_np.squeeze(0)
551
- elif mask_np.shape[2] == 1:
552
- mask_np = mask_np.squeeze(2)
553
-
554
- if mask_np.ndim != 2:
555
- raise ValueError(f"Mask must be 2D after squeezing, got shape {mask_np.shape}")
556
-
557
- return mask_np.astype(bool, copy=False)
558
-
559
- def _extract_text_features(self, model, keywords):
560
- """Extract text features for given keywords."""
561
- tokens = self.clip_tokenizer(
562
- keywords,
563
- return_tensors="pt",
564
- max_length=75,
565
- truncation=True,
566
- padding='max_length'
567
- ).to(self._device)
568
-
569
- return self._text_features_checkpoint(model, tokens)
570
-
571
- def _extract_image_features(self, model, image):
572
- """Extract image features for given image."""
573
- # Ensure image is in correct format
574
- if isinstance(image, np.ndarray):
575
- if image.dtype != np.uint8:
576
- image = image.astype(np.uint8)
577
- # Convert BGR to RGB if needed
578
- if len(image.shape) == 3 and image.shape[2] == 3:
579
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
580
-
581
- # Process image with CLIP processor
582
- inputs = self.clip_processor(
583
- images=image,
584
- return_tensors="pt"
585
- ).to(self._device)
586
-
587
- return self._image_features_checkpoint(model, inputs['pixel_values'])
588
- #TODO: return masks and bboxes and their corresponding index
589
- def predict(
590
- self,
591
- video_frames: torch.Tensor,
592
- masks: Dict[int, Dict[int, torch.Tensor]],
593
- bboxes: Dict[int, Dict[int, List]],
594
- categorical_keywords: List[str],
595
- unary_keywords: Optional[List[str]] = None,
596
- binary_keywords: Optional[List[str]] = None,
597
- object_pairs: Optional[List[Tuple[int, int]]] = None,
598
- return_top_k: int = 3,
599
- return_flattened_segments: Optional[bool] = None,
600
- return_valid_pairs: Optional[bool] = None,
601
- interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
602
- debug_visualizations: Optional[bool] = None,
603
- ) -> Dict[str, Any]:
604
- """
605
- High-level prediction method that returns formatted results.
606
-
607
- Args:
608
- video_frames: Tensor of shape (num_frames, height, width, 3)
609
- masks: Dict mapping frame_id -> object_id -> mask tensor
610
- bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2]
611
- categorical_keywords: List of category names
612
- unary_keywords: Optional list of unary predicates
613
- binary_keywords: Optional list of binary predicates
614
- object_pairs: Optional list of object pairs for binary relations
615
- return_top_k: Number of top predictions to return
616
- return_flattened_segments: Whether to include flattened mask/bbox tensors
617
- return_valid_pairs: Whether to compute valid object pairs per frame
618
- interested_object_pairs: Optional subset of object pairs to track
619
-
620
- Returns:
621
- Formatted prediction results
622
- """
623
-
624
- with torch.no_grad():
625
- outputs = self.forward(
626
- video_frames=video_frames,
627
- masks=masks,
628
- bboxes=bboxes,
629
- categorical_keywords=categorical_keywords,
630
- unary_keywords=unary_keywords,
631
- binary_keywords=binary_keywords,
632
- object_pairs=object_pairs,
633
- return_flattened_segments=return_flattened_segments,
634
- return_valid_pairs=return_valid_pairs,
635
- interested_object_pairs=interested_object_pairs,
636
- debug_visualizations=debug_visualizations,
637
- )
638
-
639
- # Format categorical results
640
- formatted_categorical = {}
641
- for (obj_id, category), prob in outputs["categorical_probs"][0].items():
642
- if obj_id not in formatted_categorical:
643
- formatted_categorical[obj_id] = []
644
- formatted_categorical[obj_id].append((prob, category))
645
-
646
- # Sort and take top-k for each object
647
- for obj_id in formatted_categorical:
648
- formatted_categorical[obj_id] = sorted(
649
- formatted_categorical[obj_id], reverse=True
650
- )[:return_top_k]
651
-
652
- # Format unary results
653
- formatted_unary = {}
654
- for (frame_id, obj_id, predicate), prob in outputs["unary_probs"][0].items():
655
- key = (frame_id, obj_id)
656
- if key not in formatted_unary:
657
- formatted_unary[key] = []
658
- formatted_unary[key].append((prob, predicate))
659
-
660
- # Sort and take top-k
661
- for key in formatted_unary:
662
- formatted_unary[key] = sorted(
663
- formatted_unary[key], reverse=True
664
- )[:return_top_k]
665
-
666
- # Format binary results
667
- formatted_binary = {}
668
- if len(outputs["binary_probs"]) > 0:
669
- for (frame_id, obj_pair, predicate), prob in outputs["binary_probs"][0].items():
670
- key = (frame_id, obj_pair)
671
- if key not in formatted_binary:
672
- formatted_binary[key] = []
673
- formatted_binary[key].append((prob, predicate))
674
-
675
- # Sort and take top-k
676
- for key in formatted_binary:
677
- formatted_binary[key] = sorted(
678
- formatted_binary[key], reverse=True
679
- )[:return_top_k]
680
-
681
- result: Dict[str, Any] = {
682
- "categorical_predictions": formatted_categorical,
683
- "unary_predictions": formatted_unary,
684
- "binary_predictions": formatted_binary,
685
- "confidence_scores": {
686
- "categorical": max([max([p for p, _ in preds], default=0.0)
687
- for preds in formatted_categorical.values()], default=0.0),
688
- "unary": max([max([p for p, _ in preds], default=0.0)
689
- for preds in formatted_unary.values()], default=0.0),
690
- "binary": max([max([p for p, _ in preds], default=0.0)
691
- for preds in formatted_binary.values()], default=0.0)
692
- }
693
- }
694
-
695
- if "flattened_segments" in outputs:
696
- result["flattened_segments"] = outputs["flattened_segments"]
697
- if "valid_pairs" in outputs:
698
- result["valid_pairs"] = outputs["valid_pairs"]
699
- if "valid_pairs_metadata" in outputs:
700
- result["valid_pairs_metadata"] = outputs["valid_pairs_metadata"]
701
-
702
- return result
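
`VineModel.clip_sim` above L2-normalizes the image and text embeddings, takes their dot product, and applies CLIP's `logit_scale` before `forward()` softmaxes over the keywords. A standalone sketch of that computation with random tensors (the embedding size and scale value are illustrative):

```python
# Sketch: the similarity + softmax pattern used by VineModel.clip_sim / forward.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
img_feat = torch.randn(1, 512)     # one object-crop embedding (illustrative dim)
txt_feat = torch.randn(4, 512)     # four keyword embeddings
logit_scale = torch.tensor(100.0)  # stands in for model.logit_scale.exp()

img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
txt_feat = txt_feat / txt_feat.norm(p=2, dim=-1, keepdim=True)
logits = torch.matmul(img_feat, txt_feat.T) * logit_scale  # shape (1, 4)
probs = F.softmax(logits, dim=-1)                          # one distribution over keywords
print(probs)
```
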
 
 
 
src/vine_hf/vine_pipeline.py DELETED
@@ -1,691 +0,0 @@
1
- import torch
2
- import numpy as np
3
- import cv2
4
- import os
5
- from typing import Dict, List, Tuple, Optional, Any, Union
6
- from transformers import Pipeline
7
- import tempfile
8
- import uuid
9
-
10
- from .vine_config import VineConfig
11
- from .vine_model import VineModel
12
- from .vis_utils import render_dino_frames, render_sam_frames, render_vine_frame_sets
13
- from laser.loading import load_video
14
- from laser.preprocess.mask_generation_grounding_dino import generate_masks_grounding_dino
15
-
16
- class VinePipeline(Pipeline):
17
- """
18
- Pipeline for VINE model that handles end-to-end video understanding.
19
-
20
- This pipeline takes a video file or frames, along with segmentation method
21
- and keyword lists, and returns probability distributions over the keywords.
22
-
23
- Segmentation Model Configuration:
24
- The pipeline requires SAM2 and GroundingDINO models for mask generation.
25
- You can configure custom paths via constructor kwargs:
26
-
27
- - sam_config_path: Path to SAM2 config (e.g., "configs/sam2.1/sam2.1_hiera_b+.yaml")
28
- - sam_checkpoint_path: Path to SAM2 checkpoint (e.g., "checkpoints/sam2.1_hiera_base_plus.pt")
29
- - gd_config_path: Path to GroundingDINO config (e.g., "groundingdino/config/GroundingDINO_SwinT_OGC.py")
30
- - gd_checkpoint_path: Path to GroundingDINO checkpoint (e.g., "checkpoints/groundingdino_swint_ogc.pth")
31
-
32
- Old:
33
- - SAM2: ~/research/sam2/ or /home/asethi04/LASER_NEW/LASER/sam2/
34
- - GroundingDINO: /home/asethi04/LASER_NEW/LASER/GroundingDINO/
35
-
36
- Alternative: Use set_segmentation_models() to provide pre-initialized model instances.
37
- """
38
-
39
- def __init__(
40
- self,
41
- sam_config_path: Optional[str] = None,
42
- sam_checkpoint_path: Optional[str] = None,
43
- gd_config_path: Optional[str] = None,
44
- gd_checkpoint_path: Optional[str] = None,
45
- **kwargs
46
- ):
47
- self.grounding_model = None
48
- self.sam_predictor = None
49
- self.mask_generator = None
50
-
51
- self.sam_config_path = sam_config_path
52
- self.sam_checkpoint_path = sam_checkpoint_path
53
- self.gd_config_path = gd_config_path
54
- self.gd_checkpoint_path = gd_checkpoint_path
55
-
56
-
57
- super().__init__(**kwargs)
58
-
59
-
60
- # Set default parameters from config
61
- self.segmentation_method = getattr(self.model.config, 'segmentation_method', 'grounding_dino_sam2')
62
- self.box_threshold = getattr(self.model.config, 'box_threshold', 0.35)
63
- self.text_threshold = getattr(self.model.config, 'text_threshold', 0.25)
64
- self.target_fps = getattr(self.model.config, 'target_fps', 1)
65
- self.visualize = getattr(self.model.config, 'visualize', False)
66
- self.visualization_dir = getattr(self.model.config, 'visualization_dir', None)
67
- self.debug_visualizations = getattr(self.model.config, 'debug_visualizations', False)
68
- self._device = getattr(self.model.config, '_device', 'cuda' if torch.cuda.is_available() else 'cpu')
69
- if kwargs.get("device") is not None:
70
- self._device = kwargs.get("device")
71
-
72
- def set_segmentation_models(
73
- self,
74
- *,
75
- sam_predictor=None,
76
- mask_generator=None,
77
- grounding_model=None
78
- ):
79
- """
80
- Set pre-initialized segmentation models, bypassing automatic initialization and replacing any currently configured instances.
81
-
82
- Args:
83
- sam_predictor: Pre-built SAM2 video predictor
84
- mask_generator: Pre-built SAM2 automatic mask generator
85
- grounding_model: Pre-built GroundingDINO model
86
- """
87
- if sam_predictor is not None:
88
- self.sam_predictor = sam_predictor
89
- if mask_generator is not None:
90
- self.mask_generator = mask_generator
91
- if grounding_model is not None:
92
- self.grounding_model = grounding_model
93
-
94
- def _sanitize_parameters(self, **kwargs):
95
- """Sanitize parameters for different pipeline stages."""
96
- preprocess_kwargs = {}
97
- forward_kwargs = {}
98
- postprocess_kwargs = {}
99
-
100
- # Preprocess parameters
101
- if "segmentation_method" in kwargs:
102
- preprocess_kwargs["segmentation_method"] = kwargs["segmentation_method"]
103
- if "target_fps" in kwargs:
104
- preprocess_kwargs["target_fps"] = kwargs["target_fps"]
105
- if "box_threshold" in kwargs:
106
- preprocess_kwargs["box_threshold"] = kwargs["box_threshold"]
107
- if "text_threshold" in kwargs:
108
- preprocess_kwargs["text_threshold"] = kwargs["text_threshold"]
109
- if "categorical_keywords" in kwargs:
110
- preprocess_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
111
-
112
- # Forward parameters
113
- if "categorical_keywords" in kwargs:
114
- forward_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
115
- if "unary_keywords" in kwargs:
116
- forward_kwargs["unary_keywords"] = kwargs["unary_keywords"]
117
- if "binary_keywords" in kwargs:
118
- forward_kwargs["binary_keywords"] = kwargs["binary_keywords"]
119
- if "object_pairs" in kwargs:
120
- forward_kwargs["object_pairs"] = kwargs["object_pairs"]
121
- if "return_flattened_segments" in kwargs:
122
- forward_kwargs["return_flattened_segments"] = kwargs["return_flattened_segments"]
123
- if "return_valid_pairs" in kwargs:
124
- forward_kwargs["return_valid_pairs"] = kwargs["return_valid_pairs"]
125
- if "interested_object_pairs" in kwargs:
126
- forward_kwargs["interested_object_pairs"] = kwargs["interested_object_pairs"]
127
- if "debug_visualizations" in kwargs:
128
- forward_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
129
- postprocess_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
130
-
131
- # Postprocess parameters
132
- if "return_top_k" in kwargs:
133
- postprocess_kwargs["return_top_k"] = kwargs["return_top_k"]
134
- if "self.visualize" in kwargs:
135
- postprocess_kwargs["self.visualize"] = kwargs["self.visualize"]
136
-
137
- return preprocess_kwargs, forward_kwargs, postprocess_kwargs
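
As a quick reference for how these kwargs are routed, a hypothetical call site is sketched below; `vine_pipeline` is assumed to be an already-constructed `VinePipeline`, and the video path and keyword lists are placeholders.

```python
# Hypothetical call; _sanitize_parameters above routes each kwarg to its stage.
results = vine_pipeline(
    "path/to/video.mp4",                          # placeholder video path
    # routed to preprocess
    segmentation_method="grounding_dino_sam2",
    target_fps=1,
    categorical_keywords=["person", "dog"],       # also forwarded to the model
    # routed to _forward
    unary_keywords=["running", "sitting"],
    binary_keywords=["next to", "holding"],
    # routed to postprocess
    return_top_k=3,
)
```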
138
-
139
- def preprocess(
140
- self,
141
- video_input: Union[str, np.ndarray, torch.Tensor],
142
- segmentation_method: str = None,
143
- target_fps: int = None,
144
- box_threshold: float = None,
145
- text_threshold: float = None,
146
- categorical_keywords: List[str] = None,
147
- **kwargs
148
- ) -> Dict[str, Any]:
149
- """
150
- Preprocess video input and generate masks.
151
-
152
- Args:
153
- video_input: Path to video file, or video tensor/array
154
- segmentation_method: "sam2" or "grounding_dino_sam2"
155
- target_fps: Target FPS for video processing
156
- box_threshold: Box threshold for Grounding DINO
157
- text_threshold: Text threshold for Grounding DINO
158
- categorical_keywords: Keywords for Grounding DINO segmentation
159
-
160
- Returns:
161
- Dict containing video frames, masks, and bboxes
162
- """
163
- # Use defaults from config if not provided
164
- if segmentation_method is None:
165
- segmentation_method = self.segmentation_method
166
- if target_fps is None:
167
- target_fps = self.target_fps
168
- if box_threshold is None:
169
- box_threshold = self.box_threshold
170
- if text_threshold is None:
171
- text_threshold = self.text_threshold
172
- if categorical_keywords is None:
173
- categorical_keywords = ["object"] # Default generic category
174
-
175
- if isinstance(video_input, str):
176
- # Video file path
177
- video_tensor = load_video(video_input, target_fps=target_fps)
178
- if isinstance(video_tensor, list):
179
- video_tensor = np.array(video_tensor)
180
- elif isinstance(video_tensor, torch.Tensor):
181
- video_tensor = video_tensor.cpu().numpy()
182
-
183
- elif isinstance(video_input, (np.ndarray, torch.Tensor)):
184
- # Video tensor/array
185
- if isinstance(video_input, torch.Tensor):
186
- video_tensor = video_input.detach().cpu().numpy()
187
- else:
188
- video_tensor = video_input
189
- else:
190
- raise ValueError(f"Unsupported video input type: {type(video_input)}")
191
-
192
- # Ensure video tensor is numpy array
193
- if not isinstance(video_tensor, np.ndarray):
194
- video_tensor = np.array(video_tensor)
195
-
196
- # Ensure video tensor is in correct format
197
- if len(video_tensor.shape) != 4:
198
- raise ValueError(f"Expected video tensor shape (frames, height, width, channels), got {video_tensor.shape}")
199
-
200
- # Generate masks and bboxes based on segmentation method
201
- visualization_data: Dict[str, Any] = {}
202
- print(f"Segmentation method: {segmentation_method}")
203
- if segmentation_method == "sam2":
204
- masks, bboxes, vis_data = self._generate_sam2_masks(video_tensor)
205
- elif segmentation_method == "grounding_dino_sam2":
206
- masks, bboxes, vis_data = self._generate_grounding_dino_sam2_masks(
207
- video_tensor, categorical_keywords, box_threshold, text_threshold, video_input
208
- )
209
- else:
210
- raise ValueError(f"Unsupported segmentation method: {segmentation_method}")
211
- if vis_data:
212
- visualization_data.update(vis_data)
213
- visualization_data.setdefault("sam_masks", masks)
214
-
215
- return {
216
- "video_frames": torch.tensor(video_tensor),
217
- "masks": masks,
218
- "bboxes": bboxes,
219
- "num_frames": len(video_tensor),
220
- "visualization_data": visualization_data,
221
- }
222
-
223
- def _generate_sam2_masks(self, video_tensor: np.ndarray) -> Tuple[Dict, Dict, Dict[str, Any]]:
224
- """Generate masks using SAM2 automatic mask generation."""
225
- # Initialize SAM2 models if not already done
226
- print("Generating SAM2 masks...")
227
- if self.mask_generator is None:
228
- self._initialize_segmentation_models()
229
-
230
- if self.mask_generator is None:
231
- raise ValueError("SAM2 mask generator not available")
232
-
233
- masks: Dict[int, Dict[int, torch.Tensor]] = {}
234
- bboxes: Dict[int, Dict[int, List[int]]] = {}
235
-
236
- for frame_id, frame in enumerate(video_tensor):
237
- if isinstance(frame, np.ndarray) and frame.dtype != np.uint8:
238
- frame = (frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8)
239
-
240
- height, width, _ = frame.shape
241
- frame_masks = self.mask_generator.generate(frame)
242
-
243
- masks[frame_id] = {}
244
- bboxes[frame_id] = {}
245
-
246
- for obj_id, mask_data in enumerate(frame_masks):
247
- mask = mask_data["segmentation"]
248
- if isinstance(mask, np.ndarray):
249
- mask = torch.from_numpy(mask)
250
-
251
- if len(mask.shape) == 2:
252
- mask = mask.unsqueeze(-1)
253
- elif len(mask.shape) == 3 and mask.shape[0] == 1:
254
- mask = mask.permute(1, 2, 0)
255
-
256
- wrapped_id = obj_id + 1
257
- masks[frame_id][wrapped_id] = mask
258
-
259
- mask_np = mask.squeeze().numpy() if isinstance(mask, torch.Tensor) else mask.squeeze()
260
-
261
- coords = np.where(mask_np > 0)
262
- if len(coords[0]) > 0:
263
- y1, y2 = coords[0].min(), coords[0].max()
264
- x1, x2 = coords[1].min(), coords[1].max()
265
- bboxes[frame_id][wrapped_id] = [x1, y1, x2, y2]
266
-
267
- return masks, bboxes, {"sam_masks": masks}
268
-
269
- def _generate_grounding_dino_sam2_masks(
270
- self,
271
- video_tensor: np.ndarray,
272
- categorical_keywords: List[str],
273
- box_threshold: float,
274
- text_threshold: float,
275
- video_path: str,
276
- ) -> Tuple[Dict, Dict, Dict[str, Any]]:
277
- """Generate masks using Grounding DINO + SAM2."""
278
- # Initialize models if not already done
279
- print("Generating Grounding DINO + SAM2 masks...")
280
- if self.grounding_model is None or self.sam_predictor is None:
281
- self._initialize_segmentation_models()
282
-
283
- if self.grounding_model is None or self.sam_predictor is None:
284
- raise ValueError("GroundingDINO or SAM2 models not available")
285
-
286
- temp_video_path = None
287
- if video_path is None or not isinstance(video_path, str):
288
- temp_video_path = self._create_temp_video(video_tensor)
289
- video_path = temp_video_path
290
-
291
- CHUNK = 5
292
- classes_ls = [categorical_keywords[i:i + CHUNK] for i in range(0, len(categorical_keywords), CHUNK)]
293
- video_segments, oid_class_pred, _ = generate_masks_grounding_dino(
294
- self.grounding_model,
295
- box_threshold,
296
- text_threshold,
297
- self.sam_predictor,
298
- self.mask_generator,
299
- video_tensor,
300
- video_path,
301
- "temp_video",
302
- out_dir=tempfile.gettempdir(),
303
- classes_ls=classes_ls,
304
- target_fps=self.target_fps,
305
- visualize=self.debug_visualizations,
306
- frames=None,
307
- max_prop_time=10
308
- )
309
-
310
- masks: Dict[int, Dict[int, torch.Tensor]] = {}
311
- bboxes: Dict[int, Dict[int, List[int]]] = {}
312
-
313
-
314
- for frame_id, frame_masks in video_segments.items():
315
- masks[frame_id] = {}
316
- bboxes[frame_id] = {}
317
-
318
- for obj_id, mask in frame_masks.items():
319
- if not isinstance(mask, torch.Tensor):
320
- mask = torch.tensor(mask)
321
- masks[frame_id][obj_id] = mask
322
- mask_np = mask.numpy()
323
- if mask_np.ndim == 3 and mask_np.shape[0] == 1:
324
- mask_np = np.squeeze(mask_np, axis=0)
325
-
326
- coords = np.where(mask_np > 0)
327
- if len(coords[0]) > 0:
328
- y1, y2 = coords[0].min(), coords[0].max()
329
- x1, x2 = coords[1].min(), coords[1].max()
330
- bboxes[frame_id][obj_id] = [x1, y1, x2, y2]
331
-
332
-
333
- if temp_video_path and os.path.exists(temp_video_path):
334
- os.remove(temp_video_path)
335
-
336
- vis_data: Dict[str, Any] = {
337
- "sam_masks": masks,
338
- "dino_labels": oid_class_pred,
339
- }
340
- return masks, bboxes, vis_data
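
Both mask-generation paths derive bounding boxes from mask extents with `np.where`; the standalone helper below isolates that step (the function name is ours, used only for illustration).

```python
import numpy as np

def bbox_from_mask(mask_np: np.ndarray):
    """Return [x1, y1, x2, y2] covering all positive pixels, or None if empty."""
    coords = np.where(mask_np > 0)
    if len(coords[0]) == 0:
        return None
    y1, y2 = coords[0].min(), coords[0].max()
    x1, x2 = coords[1].min(), coords[1].max()
    return [int(x1), int(y1), int(x2), int(y2)]

# Toy mask with a blob in rows 2-4, columns 5-8 -> bbox [5, 2, 8, 4]
toy = np.zeros((10, 12), dtype=np.uint8)
toy[2:5, 5:9] = 1
assert bbox_from_mask(toy) == [5, 2, 8, 4]
```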
341
-
342
- def _initialize_segmentation_models(self):
343
- """Initialize segmentation models based on the requested method and configured paths."""
344
- if (self.sam_predictor is None or self.mask_generator is None):
345
- self._initialize_sam2_models()
346
-
347
- if self.grounding_model is None:
348
- self._initialize_grounding_dino_model()
349
-
350
- def _initialize_sam2_models(self):
351
- """Initialize SAM2 video predictor and mask generator."""
352
- try:
353
- from sam2.build_sam import build_sam2_video_predictor, build_sam2
354
- from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
355
- except ImportError as e:
356
- print(f"Warning: Could not import SAM2: {e}")
357
- return
358
-
359
- # Resolve SAM2 paths
360
- config_path, checkpoint_path = self._resolve_sam2_paths()
361
-
362
- # Validate paths if custom ones were provided
363
- if self.sam_config_path is not None and not os.path.exists(config_path):
364
- raise ValueError(f"SAM2 config path not found: {config_path}")
365
- if self.sam_checkpoint_path is not None and not os.path.exists(checkpoint_path):
366
- raise ValueError(f"SAM2 checkpoint path not found: {checkpoint_path}")
367
-
368
- # Only proceed if we have valid paths
369
- if not os.path.exists(checkpoint_path):
370
- print(f"Warning: SAM2 checkpoint not found at {checkpoint_path}")
371
- print("SAM2 functionality will be unavailable")
372
- return
373
-
374
- try:
375
- device = self._device
376
-
377
- print(f"Initializing SAM2 models on device: {device}")
378
- # Video predictor
379
- self.sam_predictor = build_sam2_video_predictor(
380
- config_path, checkpoint_path, device=device
381
- )
382
-
383
- # Mask generator
384
- sam2_model = build_sam2(config_path, checkpoint_path, device=device, apply_postprocessing=False)
385
- self.mask_generator = SAM2AutomaticMaskGenerator(
386
- model=sam2_model,
387
- points_per_side=32,
388
- points_per_batch=32,
389
- pred_iou_thresh=0.7,
390
- stability_score_thresh=0.8,
391
- crop_n_layers=2,
392
- box_nms_thresh=0.6,
393
- crop_n_points_downscale_factor=2,
394
- min_mask_region_area=100,
395
- use_m2m=True,
396
- )
397
- print("✓ SAM2 models initialized successfully")
398
-
399
- except Exception as e:
400
- raise ValueError(f"Failed to initialize SAM2 with custom paths: {e}")
401
-
402
- def _initialize_grounding_dino_model(self):
403
- """Initialize GroundingDINO model."""
404
- try:
405
- from groundingdino.util.inference import Model as gd_Model
406
- except ImportError as e:
407
- print(f"Warning: Could not import GroundingDINO: {e}")
408
- return
409
-
410
- # Resolve GroundingDINO paths
411
- config_path, checkpoint_path = self._resolve_grounding_dino_paths()
412
-
413
- # Validate paths if custom ones were provided
414
- if self.gd_config_path is not None and not os.path.exists(config_path):
415
- raise ValueError(f"GroundingDINO config path not found: {config_path}")
416
- if self.gd_checkpoint_path is not None and not os.path.exists(checkpoint_path):
417
- raise ValueError(f"GroundingDINO checkpoint path not found: {checkpoint_path}")
418
-
419
- # Only proceed if we have valid paths
420
- if not (os.path.exists(config_path) and os.path.exists(checkpoint_path)):
421
- print(f"Warning: GroundingDINO models not found at {config_path} / {checkpoint_path}")
422
- print("GroundingDINO functionality will be unavailable")
423
- return
424
-
425
- try:
426
- device = self._device
427
- print(f"Initializing GroundingDINO on device: {device}")
428
- self.grounding_model = gd_Model(
429
- model_config_path=config_path,
430
- model_checkpoint_path=checkpoint_path,
431
- device=device
432
- )
433
- print("✓ GroundingDINO model initialized successfully")
434
-
435
- except Exception as e:
436
- raise ValueError(f"Failed to initialize GroundingDINO with custom paths: {e}")
437
-
438
- def _resolve_sam2_paths(self):
439
- """Resolve SAM2 config and checkpoint paths."""
440
- # Use custom paths if provided
441
- if self.sam_config_path and self.sam_checkpoint_path:
442
- return self.sam_config_path, self.sam_checkpoint_path
- # No custom paths configured; return empty strings so callers warn instead of crashing
- return "", ""
443
-
444
- def _resolve_grounding_dino_paths(self):
445
- """Resolve GroundingDINO config and checkpoint paths."""
446
- # Use custom paths if provided
447
- if self.gd_config_path and self.gd_checkpoint_path:
448
- return self.gd_config_path, self.gd_checkpoint_path
- # No custom paths configured; return empty strings so callers warn instead of crashing
- return "", ""
449
-
450
-
451
- def _prepare_visualization_dir(self, name: str, enabled: bool) -> Optional[str]:
452
- """
453
- Ensure a directory exists for visualization artifacts and return it.
454
- If visualization is disabled, returns None.
455
- """
456
- if not enabled:
457
- return None
458
-
459
- if self.visualization_dir:
460
- target_dir = os.path.join(self.visualization_dir, name) if name else self.visualization_dir
461
- os.makedirs(target_dir, exist_ok=True)
462
- return target_dir
463
-
464
- return tempfile.mkdtemp(prefix=f"vine_{name}_")
465
-
466
- def _create_temp_video(self, video_tensor: np.ndarray, base_dir: Optional[str] = None, prefix: str = "temp_video") -> str:
467
- """Create a temporary video file from video tensor."""
468
- if base_dir is None:
469
- base_dir = tempfile.mkdtemp(prefix=f"vine_{prefix}_")
470
- else:
471
- os.makedirs(base_dir, exist_ok=True)
472
- file_name = f"{prefix}_{uuid.uuid4().hex}.mp4"
473
- temp_path = os.path.join(base_dir, file_name)
474
-
475
- # Use OpenCV to write video
476
- height, width = video_tensor.shape[1:3]
477
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
478
- out = cv2.VideoWriter(temp_path, fourcc, self.target_fps, (width, height))
479
-
480
- for frame in video_tensor:
481
- # Convert RGB to BGR for OpenCV
482
- if len(frame.shape) == 3 and frame.shape[2] == 3:
483
- frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
484
- else:
485
- frame_bgr = frame
486
- out.write(frame_bgr.astype(np.uint8))
487
-
488
- out.release()
489
- return temp_path
490
-
491
- def _forward(self, model_inputs: Dict[str, Any], **forward_kwargs) -> Dict[str, Any]:
492
- """Forward pass through the model."""
493
- outputs = self.model.predict(
494
- video_frames=model_inputs["video_frames"],
495
- masks=model_inputs["masks"],
496
- bboxes=model_inputs["bboxes"],
497
- **forward_kwargs
498
- )
499
- outputs.setdefault("video_frames", model_inputs.get("video_frames"))
500
- outputs.setdefault("bboxes", model_inputs.get("bboxes"))
501
- outputs.setdefault("masks", model_inputs.get("masks"))
502
- outputs.setdefault("visualization_data", model_inputs.get("visualization_data"))
503
- return outputs
504
-
505
- def postprocess(
506
- self,
507
- model_outputs: Dict[str, Any],
508
- return_top_k: int = 3,
509
- visualize: Optional[bool] = None,
510
- **kwargs
511
- ) -> Dict[str, Any]:
512
- """
513
- Postprocess model outputs into user-friendly format.
514
-
515
- Args:
516
- model_outputs: Raw model outputs
517
- return_top_k: Number of top predictions to return
518
- visualize: Whether to include rendered visualization videos in the results
519
-
520
- Returns:
521
- Formatted results
522
- """
523
- results = {
524
- "categorical_predictions": model_outputs.get("categorical_predictions", {}),
525
- "unary_predictions": model_outputs.get("unary_predictions", {}),
526
- "binary_predictions": model_outputs.get("binary_predictions", {}),
527
- "confidence_scores": model_outputs.get("confidence_scores", {}),
528
- "summary": self._generate_summary(model_outputs)
529
- }
530
- if "flattened_segments" in model_outputs:
531
- results["flattened_segments"] = model_outputs["flattened_segments"]
532
- if "valid_pairs" in model_outputs:
533
- results["valid_pairs"] = model_outputs["valid_pairs"]
534
- if "valid_pairs_metadata" in model_outputs:
535
- results["valid_pairs_metadata"] = model_outputs["valid_pairs_metadata"]
536
- if "visualization_data" in model_outputs:
537
- results["visualization_data"] = model_outputs["visualization_data"]
538
-
539
- if visualize is None:
- visualize = self.visualize
- if visualize and "video_frames" in model_outputs and "bboxes" in model_outputs:
540
- frames_tensor = model_outputs["video_frames"]
541
- if isinstance(frames_tensor, torch.Tensor):
542
- frames_np = frames_tensor.detach().cpu().numpy()
543
- else:
544
- frames_np = np.asarray(frames_tensor)
545
- if frames_np.dtype != np.uint8:
546
- if np.issubdtype(frames_np.dtype, np.floating):
547
- max_val = frames_np.max() if frames_np.size else 0.0
548
- scale = 255.0 if max_val <= 1.0 else 1.0
549
- frames_np = (frames_np * scale).clip(0, 255).astype(np.uint8)
550
- else:
551
- frames_np = frames_np.clip(0, 255).astype(np.uint8)
552
-
553
- cat_label_lookup: Dict[int, Tuple[str, float]] = {}
554
- for obj_id, preds in model_outputs.get("categorical_predictions", {}).items():
555
- if preds:
556
- prob, label = preds[0]
557
- cat_label_lookup[obj_id] = (label, prob)
558
-
559
- unary_preds = model_outputs.get("unary_predictions", {})
560
- unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]] = {}
561
- for (frame_id, obj_id), preds in unary_preds.items():
562
- if preds:
563
- unary_lookup.setdefault(frame_id, {})[obj_id] = preds
564
-
565
- binary_preds = model_outputs.get("binary_predictions", {})
566
- binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]] = {}
567
- for (frame_id, obj_pair), preds in binary_preds.items():
568
- if preds:
569
- binary_lookup.setdefault(frame_id, []).append((obj_pair, preds))
570
-
571
- bboxes = model_outputs["bboxes"]
572
- visualization_data = model_outputs.get("visualization_data", {})
573
- visualizations: Dict[str, Dict[str, Any]] = {}
574
- debug_visualizations = kwargs.get("debug_visualizations")
575
- if debug_visualizations is None:
576
- debug_visualizations = self.debug_visualizations
577
-
578
- vine_frame_sets = render_vine_frame_sets(
579
- frames_np,
580
- bboxes,
581
- cat_label_lookup,
582
- unary_lookup,
583
- binary_lookup,
584
- visualization_data.get("sam_masks"),
585
- )
586
-
587
- vine_visuals: Dict[str, Dict[str, Any]] = {}
588
- final_frames = vine_frame_sets.get("all", [])
589
- if final_frames:
590
- final_entry: Dict[str, Any] = {"frames": final_frames, "video_path": None}
591
- final_dir = self._prepare_visualization_dir("all", enabled=self.visualize)
592
- final_entry["video_path"] = self._create_temp_video(
593
- np.stack(final_frames, axis=0),
594
- base_dir=final_dir,
595
- prefix="all_visualization"
596
- )
597
- vine_visuals["all"] = final_entry
598
-
599
- if debug_visualizations:
600
- sam_masks = visualization_data.get("sam_masks")
601
- if sam_masks:
602
- sam_frames = render_sam_frames(frames_np, sam_masks, visualization_data.get("dino_labels"))
603
- sam_entry = {"frames": sam_frames, "video_path": None}
604
- if sam_frames:
605
- sam_dir = self._prepare_visualization_dir("sam", enabled=self.visualize)
606
- sam_entry["video_path"] = self._create_temp_video(
607
- np.stack(sam_frames, axis=0),
608
- base_dir=sam_dir,
609
- prefix="sam_visualization"
610
- )
611
- visualizations["sam"] = sam_entry
612
-
613
- dino_labels = visualization_data.get("dino_labels")
614
- if dino_labels:
615
- dino_frames = render_dino_frames(frames_np, bboxes, dino_labels)
616
- dino_entry = {"frames": dino_frames, "video_path": None}
617
- if dino_frames:
618
- dino_dir = self._prepare_visualization_dir("dino", enabled=self.visualize)
619
- dino_entry["video_path"] = self._create_temp_video(
620
- np.stack(dino_frames, axis=0),
621
- base_dir=dino_dir,
622
- prefix="dino_visualization"
623
- )
624
- visualizations["dino"] = dino_entry
625
-
626
- for name in ("object", "unary", "binary"):
627
- frames_list = vine_frame_sets.get(name, [])
628
- entry: Dict[str, Any] = {"frames": frames_list, "video_path": None}
629
- if frames_list:
630
- vine_dir = self._prepare_visualization_dir(name, enabled=self.visualize)
631
- entry["video_path"] = self._create_temp_video(
632
- np.stack(frames_list, axis=0),
633
- base_dir=vine_dir,
634
- prefix=f"{name}_visualization"
635
- )
636
- vine_visuals[name] = entry
637
-
638
- if vine_visuals:
639
- visualizations["vine"] = vine_visuals
640
-
641
- if visualizations:
642
- results["visualizations"] = visualizations
643
-
644
- return results
645
-
646
- def _generate_summary(self, model_outputs: Dict[str, Any]) -> Dict[str, Any]:
647
- """Generate a summary of the predictions."""
648
- categorical_preds = model_outputs.get("categorical_predictions", {})
649
- unary_preds = model_outputs.get("unary_predictions", {})
650
- binary_preds = model_outputs.get("binary_predictions", {})
651
-
652
- summary = {
653
- "num_objects_detected": len(categorical_preds),
654
- "num_unary_predictions": len(unary_preds),
655
- "num_binary_predictions": len(binary_preds),
656
- "top_categories": [],
657
- "top_actions": [],
658
- "top_relations": []
659
- }
660
-
661
- # Extract top categories
662
- all_categories = []
663
- for obj_preds in categorical_preds.values():
664
- if obj_preds:
665
- all_categories.extend(obj_preds)
666
-
667
- if all_categories:
668
- sorted_categories = sorted(all_categories, reverse=True)
669
- summary["top_categories"] = [(cat, prob) for prob, cat in sorted_categories[:3]]
670
-
671
- # Extract top actions
672
- all_actions = []
673
- for action_preds in unary_preds.values():
674
- if action_preds:
675
- all_actions.extend(action_preds)
676
-
677
- if all_actions:
678
- sorted_actions = sorted(all_actions, reverse=True)
679
- summary["top_actions"] = [(act, prob) for prob, act in sorted_actions[:3]]
680
-
681
- # Extract top relations
682
- all_relations = []
683
- for rel_preds in binary_preds.values():
684
- if rel_preds:
685
- all_relations.extend(rel_preds)
686
-
687
- if all_relations:
688
- sorted_relations = sorted(all_relations, reverse=True)
689
- summary["top_relations"] = [(rel, prob) for prob, rel in sorted_relations[:3]]
690
-
691
- return summary
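
Putting the pieces of this class together, a plausible end-to-end usage looks like the sketch below. The checkpoint id, the SAM2 / GroundingDINO paths, and the keyword lists are placeholders rather than values shipped with this repository, and the constructor arguments follow the code above rather than a published API.

```python
# Sketch of end-to-end usage, assuming VineModel / VinePipeline are importable
# from this package and that SAM2 / GroundingDINO weights exist at the given paths.
from vine_hf import VineModel, VinePipeline

model = VineModel.from_pretrained("path/or/hub-id-of-vine-checkpoint")  # placeholder
pipe = VinePipeline(
    model=model,
    sam_config_path="configs/sam2.1/sam2.1_hiera_b+.yaml",               # placeholder paths
    sam_checkpoint_path="checkpoints/sam2.1_hiera_base_plus.pt",
    gd_config_path="groundingdino/config/GroundingDINO_SwinT_OGC.py",
    gd_checkpoint_path="checkpoints/groundingdino_swint_ogc.pth",
)

out = pipe(
    "examples/video.mp4",                                                 # placeholder video
    categorical_keywords=["person", "dog"],
    unary_keywords=["running"],
    binary_keywords=["next to"],
    return_top_k=3,
)
print(out["summary"]["top_categories"])
```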
 
src/vine_hf/vis_utils.py DELETED
@@ -1,941 +0,0 @@
1
- import os
2
- import cv2
3
- import numpy as np
4
- import matplotlib.pyplot as plt
5
- import torch
6
- import random
7
- import math
8
- from matplotlib.patches import Rectangle
9
- import itertools
10
- from typing import Any, Dict, List, Tuple, Optional, Union
11
-
12
- from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
13
-
14
- ########################################################################################
15
- ########## Visualization Library ########
16
- ########################################################################################
17
- # This module renders SAM masks, GroundingDINO boxes, and VINE predictions.
18
- #
19
- # Conventions (RGB frames, pixel coords):
20
- # - Frames: list[np.ndarray] with shape (H, W, 3) in RGB, or np.ndarray with shape (T, H, W, 3).
21
- # - Masks: 2D boolean arrays (H, W) or tensors convertible to that; (H, W, 1) is also accepted.
22
- # - BBoxes: (x1, y1, x2, y2) integer pixel coordinates with x2 > x1 and y2 > y1.
23
- #
24
- # Per-frame stores use one of:
25
- # - Dict[int(frame_id) -> Dict[int(obj_id) -> value]]
26
- # - List indexed by frame_id (each item may be a dict of obj_id->value or a list in order)
27
- #
28
- # Renderer inputs/outputs:
29
- # 1) render_sam_frames(frames, sam_masks, dino_labels=None) -> List[np.ndarray]
30
- # - sam_masks: Dict[frame_id, Dict[obj_id, Mask]] or a list; Mask can be np.ndarray or torch.Tensor.
31
- # - dino_labels: Optional Dict[obj_id, str] to annotate boxes derived from masks.
32
- #
33
- # 2) render_dino_frames(frames, bboxes, dino_labels=None) -> List[np.ndarray]
34
- # - bboxes: Dict[frame_id, Dict[obj_id, Sequence[float]]] or a list; each bbox as [x1, y1, x2, y2].
35
- #
36
- # 3) render_vine_frames(frames, bboxes, cat_label_lookup, unary_lookup, binary_lookup, masks=None)
37
- # -> List[np.ndarray] (the "all" view)
38
- # - cat_label_lookup: Dict[obj_id, (label: str, prob: float)]
39
- # - unary_lookup: Dict[frame_id, Dict[obj_id, List[(prob: float, label: str)]]]
40
- # - binary_lookup: Dict[frame_id, List[((sub_id: int, obj_id: int), List[(prob: float, relation: str)])]]
41
- # - masks: Optional; same structure as sam_masks, used for translucent overlays when unary labels exist.
42
- #
43
- # Ground-truth helpers used by plotting utilities:
44
- # - For a single frame, gt_relations is represented as List[(subject_label, object_label, relation_label)].
45
- #
46
- # All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
47
- ########################################################################################
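
To make the conventions above concrete, here is a tiny per-frame store built by hand with toy values; the shapes and key layouts mirror the comment block, nothing more.

```python
import numpy as np

# One 64x64 RGB frame and one object (id 1) with a mask and matching bbox.
frames = [np.zeros((64, 64, 3), dtype=np.uint8)]   # list of (H, W, 3) RGB frames

mask = np.zeros((64, 64), dtype=bool)              # 2D boolean mask
mask[10:30, 20:40] = True

sam_masks = {0: {1: mask}}                         # frame_id -> obj_id -> mask
bboxes = {0: {1: [20, 10, 39, 29]}}                # (x1, y1, x2, y2) pixel coords
dino_labels = {1: "person"}                        # obj_id -> label
```

These are exactly the structures that `render_sam_frames` and `render_dino_frames` below consume.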
48
-
49
- def clean_label(label):
50
- """Replace underscores and slashes with spaces for uniformity."""
51
- return label.replace("_", " ").replace("/", " ")
52
-
53
- # Note: this grouping/cleaning step could arguably live in the model's output post-processing.
54
- def format_cate_preds(cate_preds):
55
- # Group object predictions from the model output.
56
- obj_pred_dict = {}
57
- for (oid, label), prob in cate_preds.items():
58
- # Clean the predicted label as well.
59
- clean_pred = clean_label(label)
60
- if oid not in obj_pred_dict:
61
- obj_pred_dict[oid] = []
62
- obj_pred_dict[oid].append((clean_pred, prob))
63
- for oid in obj_pred_dict:
64
- obj_pred_dict[oid].sort(key=lambda x: x[1], reverse=True)
65
- return obj_pred_dict
66
-
67
- def format_binary_cate_preds(binary_preds):
68
- frame_binary_preds = []
69
- for key, score in binary_preds.items():
70
- # Expect key format: (frame_id, (subject, object), predicted_relation)
71
- try:
72
- f_id, (subj, obj), pred_rel = key
73
- # Use the (subj, pred_rel, obj, score) layout expected by plot_binary_sg
- frame_binary_preds.append((subj, pred_rel, obj, score))
74
- except Exception:
75
- print("Skipping key with unexpected format:", key)
76
- continue
77
- # Sort predictions by confidence score, highest first
- frame_binary_preds.sort(key=lambda x: x[3], reverse=True)
78
- return frame_binary_preds
79
-
80
- _FONT = cv2.FONT_HERSHEY_SIMPLEX
81
-
82
-
83
- def _to_numpy_mask(mask: Union[np.ndarray, torch.Tensor, None]) -> Optional[np.ndarray]:
84
- if mask is None:
85
- return None
86
- if isinstance(mask, torch.Tensor):
87
- mask_np = mask.detach().cpu().numpy()
88
- else:
89
- mask_np = np.asarray(mask)
90
- if mask_np.ndim == 0:
91
- return None
92
- if mask_np.ndim == 3:
93
- mask_np = np.squeeze(mask_np)
94
- if mask_np.ndim != 2:
95
- return None
96
- if mask_np.dtype == bool:
97
- return mask_np
98
- return mask_np > 0
99
-
100
-
101
- def _sanitize_bbox(bbox: Union[List[float], Tuple[float, ...], None], width: int, height: int) -> Optional[Tuple[int, int, int, int]]:
102
- if bbox is None:
103
- return None
104
- if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
105
- x1, y1, x2, y2 = [float(b) for b in bbox[:4]]
106
- elif isinstance(bbox, np.ndarray) and bbox.size >= 4:
107
- x1, y1, x2, y2 = [float(b) for b in bbox.flat[:4]]
108
- else:
109
- return None
110
- x1 = int(np.clip(round(x1), 0, width - 1))
111
- y1 = int(np.clip(round(y1), 0, height - 1))
112
- x2 = int(np.clip(round(x2), 0, width - 1))
113
- y2 = int(np.clip(round(y2), 0, height - 1))
114
- if x2 <= x1 or y2 <= y1:
115
- return None
116
- return (x1, y1, x2, y2)
117
-
118
-
119
- def _object_color_bgr(obj_id: int) -> Tuple[int, int, int]:
120
- color = get_color(obj_id)
121
- rgb = [int(np.clip(c, 0.0, 1.0) * 255) for c in color[:3]]
122
- return (rgb[2], rgb[1], rgb[0])
123
-
124
-
125
- def _background_color(color: Tuple[int, int, int]) -> Tuple[int, int, int]:
126
- return tuple(int(0.25 * 255 + 0.75 * channel) for channel in color)
127
-
128
-
129
- def _draw_label_block(
130
- image: np.ndarray,
131
- lines: List[str],
132
- anchor: Tuple[int, int],
133
- color: Tuple[int, int, int],
134
- font_scale: float = 0.5,
135
- thickness: int = 1,
136
- direction: str = "up",
137
- ) -> None:
138
- if not lines:
139
- return
140
- img_h, img_w = image.shape[:2]
141
- x, y = anchor
142
- x = int(np.clip(x, 0, img_w - 1))
143
- y_cursor = int(np.clip(y, 0, img_h - 1))
144
- bg_color = _background_color(color)
145
-
146
- if direction == "down":
147
- for text in lines:
148
- text = str(text)
149
- (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
150
- left_x = x
151
- right_x = min(left_x + tw + 8, img_w - 1)
152
- top_y = int(np.clip(y_cursor + 6, 0, img_h - 1))
153
- bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
154
- if bottom_y <= top_y:
155
- break
156
- cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
157
- text_x = left_x + 4
158
- text_y = min(bottom_y - baseline - 2, img_h - 1)
159
- cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
160
- y_cursor = bottom_y
161
- else:
162
- for text in lines:
163
- text = str(text)
164
- (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
165
- top_y = max(y_cursor - th - baseline - 6, 0)
166
- left_x = x
167
- right_x = min(left_x + tw + 8, img_w - 1)
168
- bottom_y = min(top_y + th + baseline + 6, img_h - 1)
169
- cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
170
- text_x = left_x + 4
171
- text_y = min(bottom_y - baseline - 2, img_h - 1)
172
- cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
173
- y_cursor = top_y
174
-
175
-
176
- def _draw_centered_label(
177
- image: np.ndarray,
178
- text: str,
179
- center: Tuple[int, int],
180
- color: Tuple[int, int, int],
181
- font_scale: float = 0.5,
182
- thickness: int = 1,
183
- ) -> None:
184
- text = str(text)
185
- img_h, img_w = image.shape[:2]
186
- (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
187
- cx = int(np.clip(center[0], 0, img_w - 1))
188
- cy = int(np.clip(center[1], 0, img_h - 1))
189
- left_x = int(np.clip(cx - tw // 2 - 4, 0, img_w - 1))
190
- top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
191
- right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
192
- bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
193
- cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), _background_color(color), -1)
194
- text_x = left_x + 4
195
- text_y = min(bottom_y - baseline - 2, img_h - 1)
196
- cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
197
-
198
-
199
- def _extract_frame_entities(store: Union[Dict[int, Dict[int, Any]], List, None], frame_idx: int) -> Dict[int, Any]:
200
- if isinstance(store, dict):
201
- frame_entry = store.get(frame_idx, {})
202
- elif isinstance(store, list) and 0 <= frame_idx < len(store):
203
- frame_entry = store[frame_idx]
204
- else:
205
- frame_entry = {}
206
- if isinstance(frame_entry, dict):
207
- return frame_entry
208
- if isinstance(frame_entry, list):
209
- return {i: value for i, value in enumerate(frame_entry)}
210
- return {}
211
-
212
-
213
- def _label_anchor_and_direction(
214
- bbox: Tuple[int, int, int, int],
215
- position: str,
216
- ) -> Tuple[Tuple[int, int], str]:
217
- x1, y1, x2, y2 = bbox
218
- if position == "bottom":
219
- return (x1, y2), "down"
220
- return (x1, y1), "up"
221
-
222
-
223
- def _draw_bbox_with_label(
224
- image: np.ndarray,
225
- bbox: Tuple[int, int, int, int],
226
- obj_id: int,
227
- title: Optional[str] = None,
228
- sub_lines: Optional[List[str]] = None,
229
- label_position: str = "top",
230
- ) -> None:
231
- color = _object_color_bgr(obj_id)
232
- cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
233
- head = title if title else f"#{obj_id}"
234
- if not head.startswith("#"):
235
- head = f"#{obj_id} {head}"
236
- lines = [head]
237
- if sub_lines:
238
- lines.extend(sub_lines)
239
- anchor, direction = _label_anchor_and_direction(bbox, label_position)
240
- _draw_label_block(image, lines, anchor, color, direction=direction)
241
-
242
-
243
- def render_sam_frames(
244
- frames: Union[np.ndarray, List[np.ndarray]],
245
- sam_masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None],
246
- dino_labels: Optional[Dict[int, str]] = None,
247
- ) -> List[np.ndarray]:
248
- results: List[np.ndarray] = []
249
- frames_iterable = frames if isinstance(frames, list) else list(frames)
250
- dino_labels = dino_labels or {}
251
-
252
- for frame_idx, frame in enumerate(frames_iterable):
253
- if frame is None:
254
- continue
255
- frame_rgb = np.asarray(frame)
256
- frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
257
- overlay = frame_bgr.astype(np.float32)
258
- masks_for_frame = _extract_frame_entities(sam_masks, frame_idx)
259
-
260
- for obj_id, mask in masks_for_frame.items():
261
- mask_np = _to_numpy_mask(mask)
262
- if mask_np is None or not np.any(mask_np):
263
- continue
264
- color = _object_color_bgr(obj_id)
265
- alpha = 0.45
266
- overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(color, dtype=np.float32)
267
-
268
- annotated = np.clip(overlay, 0, 255).astype(np.uint8)
269
- frame_h, frame_w = annotated.shape[:2]
270
-
271
- for obj_id, mask in masks_for_frame.items():
272
- mask_np = _to_numpy_mask(mask)
273
- if mask_np is None or not np.any(mask_np):
274
- continue
275
- bbox = mask_to_bbox(mask_np)
276
- bbox = _sanitize_bbox(bbox, frame_w, frame_h)
277
- if not bbox:
278
- continue
279
- label = dino_labels.get(obj_id)
280
- title = f"{label}" if label else None
281
- _draw_bbox_with_label(annotated, bbox, obj_id, title=title)
282
-
283
- results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
284
-
285
- return results
286
-
287
-
288
- def render_dino_frames(
289
- frames: Union[np.ndarray, List[np.ndarray]],
290
- bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
291
- dino_labels: Optional[Dict[int, str]] = None,
292
- ) -> List[np.ndarray]:
293
- results: List[np.ndarray] = []
294
- frames_iterable = frames if isinstance(frames, list) else list(frames)
295
- dino_labels = dino_labels or {}
296
-
297
- for frame_idx, frame in enumerate(frames_iterable):
298
- if frame is None:
299
- continue
300
- frame_rgb = np.asarray(frame)
301
- annotated = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
302
- frame_h, frame_w = annotated.shape[:2]
303
- frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
304
-
305
- for obj_id, bbox_values in frame_bboxes.items():
306
- bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
307
- if not bbox:
308
- continue
309
- label = dino_labels.get(obj_id)
310
- title = f"{label}" if label else None
311
- _draw_bbox_with_label(annotated, bbox, obj_id, title=title)
312
-
313
- results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
314
-
315
- return results
316
-
317
-
318
- def render_vine_frame_sets(
319
- frames: Union[np.ndarray, List[np.ndarray]],
320
- bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
321
- cat_label_lookup: Dict[int, Tuple[str, float]],
322
- unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
323
- binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
324
- masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
325
- ) -> Dict[str, List[np.ndarray]]:
326
- frame_groups: Dict[str, List[np.ndarray]] = {
327
- "object": [],
328
- "unary": [],
329
- "binary": [],
330
- "all": [],
331
- }
332
- frames_iterable = frames if isinstance(frames, list) else list(frames)
333
-
334
- for frame_idx, frame in enumerate(frames_iterable):
335
- if frame is None:
336
- continue
337
- frame_rgb = np.asarray(frame)
338
- base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
339
- frame_h, frame_w = base_bgr.shape[:2]
340
- frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
341
- frame_masks = _extract_frame_entities(masks, frame_idx) if masks is not None else {}
342
-
343
- objects_bgr = base_bgr.copy()
344
- unary_bgr = base_bgr.copy()
345
- binary_bgr = base_bgr.copy()
346
- all_bgr = base_bgr.copy()
347
-
348
- bbox_lookup: Dict[int, Tuple[int, int, int, int]] = {}
349
- unary_lines_lookup: Dict[int, List[str]] = {}
350
- titles_lookup: Dict[int, Optional[str]] = {}
351
-
352
- for obj_id, bbox_values in frame_bboxes.items():
353
- bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
354
- if not bbox:
355
- continue
356
- bbox_lookup[obj_id] = bbox
357
- cat_label, cat_prob = cat_label_lookup.get(obj_id, (None, None))
358
- title_parts = []
359
- if cat_label:
360
- if cat_prob is not None:
361
- title_parts.append(f"{cat_label} {cat_prob:.2f}")
362
- else:
363
- title_parts.append(cat_label)
364
- titles_lookup[obj_id] = " ".join(title_parts) if title_parts else None
365
- unary_preds = unary_lookup.get(frame_idx, {}).get(obj_id, [])
366
- unary_lines = [f"{label} {prob:.2f}" for prob, label in unary_preds]
367
- unary_lines_lookup[obj_id] = unary_lines
368
-
369
- for obj_id, bbox in bbox_lookup.items():
370
- unary_lines = unary_lines_lookup.get(obj_id, [])
371
- if not unary_lines:
372
- continue
373
- mask_raw = frame_masks.get(obj_id)
374
- mask_np = _to_numpy_mask(mask_raw)
375
- if mask_np is None or not np.any(mask_np):
376
- continue
377
- color = np.array(_object_color_bgr(obj_id), dtype=np.float32)
378
- alpha = 0.45
379
- for target in (unary_bgr, all_bgr):
380
- target_vals = target[mask_np].astype(np.float32)
381
- blended = (1.0 - alpha) * target_vals + alpha * color
382
- target[mask_np] = np.clip(blended, 0, 255).astype(np.uint8)
383
-
384
- for obj_id, bbox in bbox_lookup.items():
385
- title = titles_lookup.get(obj_id)
386
- unary_lines = unary_lines_lookup.get(obj_id, [])
387
- _draw_bbox_with_label(objects_bgr, bbox, obj_id, title=title, label_position="top")
388
- _draw_bbox_with_label(unary_bgr, bbox, obj_id, title=title, label_position="top")
389
- if unary_lines:
390
- anchor, direction = _label_anchor_and_direction(bbox, "bottom")
391
- _draw_label_block(unary_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
392
- _draw_bbox_with_label(binary_bgr, bbox, obj_id, title=title, label_position="top")
393
- _draw_bbox_with_label(all_bgr, bbox, obj_id, title=title, label_position="top")
394
- if unary_lines:
395
- anchor, direction = _label_anchor_and_direction(bbox, "bottom")
396
- _draw_label_block(all_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
397
-
398
- for obj_pair, relation_preds in binary_lookup.get(frame_idx, []):
399
- if len(obj_pair) != 2 or not relation_preds:
400
- continue
401
- subj_id, obj_id = obj_pair
402
- subj_bbox = bbox_lookup.get(subj_id)
403
- obj_bbox = bbox_lookup.get(obj_id)
404
- if not subj_bbox or not obj_bbox:
405
- continue
406
- start, end = relation_line(subj_bbox, obj_bbox)
407
- color = tuple(int(c) for c in np.clip(
408
- (np.array(_object_color_bgr(subj_id), dtype=np.float32) +
409
- np.array(_object_color_bgr(obj_id), dtype=np.float32)) / 2.0,
410
- 0, 255
411
- ))
412
- prob, relation = relation_preds[0]
413
- label_text = f"{relation} {prob:.2f}"
414
- mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
415
- cv2.line(binary_bgr, start, end, color, 6, cv2.LINE_AA)
416
- cv2.line(all_bgr, start, end, color, 6, cv2.LINE_AA)
417
- _draw_centered_label(binary_bgr, label_text, mid_point, color)
418
- _draw_centered_label(all_bgr, label_text, mid_point, color)
419
-
420
- frame_groups["object"].append(cv2.cvtColor(objects_bgr, cv2.COLOR_BGR2RGB))
421
- frame_groups["unary"].append(cv2.cvtColor(unary_bgr, cv2.COLOR_BGR2RGB))
422
- frame_groups["binary"].append(cv2.cvtColor(binary_bgr, cv2.COLOR_BGR2RGB))
423
- frame_groups["all"].append(cv2.cvtColor(all_bgr, cv2.COLOR_BGR2RGB))
424
-
425
- return frame_groups
426
-
427
-
428
- def render_vine_frames(
429
- frames: Union[np.ndarray, List[np.ndarray]],
430
- bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
431
- cat_label_lookup: Dict[int, Tuple[str, float]],
432
- unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
433
- binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
434
- masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
435
- ) -> List[np.ndarray]:
436
- return render_vine_frame_sets(
437
- frames,
438
- bboxes,
439
- cat_label_lookup,
440
- unary_lookup,
441
- binary_lookup,
442
- masks,
443
- ).get("all", [])
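
For a self-contained illustration of the lookup formats, the toy call below exercises `render_vine_frame_sets`; it assumes this module (and its `laser` dependency) is importable as `vine_hf.vis_utils`, which depends on how the package is installed.

```python
import numpy as np
from vine_hf.vis_utils import render_vine_frame_sets  # assumed import path

frames = [np.zeros((64, 64, 3), dtype=np.uint8)]                 # one RGB frame
bboxes = {0: {1: [5, 5, 30, 30], 2: [35, 35, 60, 60]}}           # frame 0, objects 1 and 2
cat_label_lookup = {1: ("person", 0.93), 2: ("dog", 0.88)}       # obj_id -> (label, prob)
unary_lookup = {0: {1: [(0.71, "running")]}}                     # frame -> obj -> [(prob, label)]
binary_lookup = {0: [((1, 2), [(0.64, "next to")])]}             # frame -> [((subj, obj), preds)]

groups = render_vine_frame_sets(frames, bboxes, cat_label_lookup,
                                unary_lookup, binary_lookup)
print(sorted(groups))           # ['all', 'binary', 'object', 'unary']
print(groups["all"][0].shape)   # (64, 64, 3)
```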
444
-
445
- def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
446
- all_colors = []
447
- all_texts = []
448
- for (obj_id, bbox, gt_label) in gt_labels:
449
- preds = obj_pred_dict.get(obj_id, [])
450
- if len(preds) == 0:
451
- top1 = "N/A"
452
- box_color = (0, 0, 255) # bright red if no prediction
453
- else:
454
- top1, prob1 = preds[0]
455
- topk_labels = [p[0] for p in preds[:topk_object]]
456
- # Compare cleaned labels.
457
- if top1.lower() == gt_label.lower():
458
- box_color = (0, 255, 0) # bright green for correct
459
- elif gt_label.lower() in [p.lower() for p in topk_labels]:
460
- box_color = (0, 165, 255) # bright orange for partial match
461
- else:
462
- box_color = (0, 0, 255) # bright red for incorrect
463
-
464
- label_text = f"ID:{obj_id}/P:{top1}/GT:{gt_label}"
465
- all_colors.append(box_color)
466
- all_texts.append(label_text)
467
- return all_colors, all_texts
468
-
469
- def plot_unary(frame_img, gt_labels, all_colors, all_texts):
470
-
471
- for (obj_id, bbox, gt_label), box_color, label_text in zip(gt_labels, all_colors, all_texts):
472
- x1, y1, x2, y2 = map(int, bbox)
473
- cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
474
- (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
475
- cv2.rectangle(frame_img, (x1, y1 - th - baseline - 4), (x1 + tw, y1), box_color, -1)
476
- cv2.putText(frame_img, label_text, (x1, y1 - 2), cv2.FONT_HERSHEY_SIMPLEX,
477
- 0.5, (0, 0, 0), 1, cv2.LINE_AA)
478
-
479
- return frame_img
480
-
481
- def get_white_pane(pane_height,
482
- pane_width=600,
483
- header_height = 50,
484
- header_font = cv2.FONT_HERSHEY_SIMPLEX,
485
- header_font_scale = 0.7,
486
- header_thickness = 2,
487
- header_color = (0, 0, 0)):
488
- # Create an expanded white pane to display text info.
489
- white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)
490
-
491
- # --- Adjust pane split: make predictions column wider (60% vs. 40%) ---
492
- left_width = int(pane_width * 0.6)
493
- right_width = pane_width - left_width
494
- left_pane = white_pane[:, :left_width, :].copy()
495
- right_pane = white_pane[:, left_width:, :].copy()
496
-
497
- cv2.putText(left_pane, "Binary Predictions", (10, header_height - 30),
498
- header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
499
- cv2.putText(right_pane, "Ground Truth", (10, header_height - 30),
500
- header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
501
- # Write the annotated halves back so the headers persist in the returned pane
- white_pane[:, :left_width, :] = left_pane
- white_pane[:, left_width:, :] = right_pane
502
- return white_pane
503
-
504
- # This is for plotting binary prediction results with frame-based scene graphs
505
- def plot_binary_sg(frame_img,
506
- white_pane,
507
- bin_preds,
508
- gt_relations,
509
- topk_binary,
510
- header_height=50,
511
- indicator_size=20,
512
- pane_width=600):
513
- # Leave vertical space for the headers.
514
- line_height = 30 # vertical spacing per line
515
- x_text = 10 # left margin for text
516
- y_text_left = header_height + 10 # starting y for left pane text
517
- y_text_right = header_height + 10 # starting y for right pane text
518
-
519
- # Left section: top-k binary predictions.
520
- left_width = int(pane_width * 0.6)
521
- right_width = pane_width - left_width
522
- left_pane = white_pane[:, :left_width, :].copy()
523
- right_pane = white_pane[:, left_width:, :].copy()
524
-
525
- for (subj, pred_rel, obj, score) in bin_preds[:topk_binary]:
526
- correct = any((subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
527
- for gt in gt_relations)
528
- indicator_color = (0, 255, 0) if correct else (0, 0, 255)
529
- cv2.rectangle(left_pane, (x_text, y_text_left - indicator_size + 5),
530
- (x_text + indicator_size, y_text_left + 5), indicator_color, -1)
531
- text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
532
- cv2.putText(left_pane, text, (x_text + indicator_size + 5, y_text_left + 5),
533
- cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
534
- y_text_left += line_height
535
-
536
- # Right section: ground truth binary relations.
537
- for gt in gt_relations:
538
- if len(gt) != 3:
539
- continue
540
- text = f"{gt[0]} - {gt[2]} - {gt[1]}"
541
- cv2.putText(right_pane, text, (x_text, y_text_right + 5),
542
- cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
543
- y_text_right += line_height
544
-
545
- # Combine the two text panes and then with the frame image.
546
- combined_pane = np.hstack((left_pane, right_pane))
547
- combined_image = np.hstack((frame_img, combined_pane))
548
- return combined_image
549
-
550
- def visualized_frame(frame_img,
551
- bboxes,
552
- object_ids,
553
- gt_labels,
554
- cate_preds,
555
- binary_preds,
556
- gt_relations,
557
- topk_object,
558
- topk_binary,
559
- phase="unary"):
560
-
561
- """Return the combined annotated frame for frame index i as an image (in BGR)."""
562
- # Get the frame image (assuming batched_data['batched_reshaped_raw_videos'] is a list of frames)
563
-
564
- # --- Process Object Predictions (for overlaying bboxes) ---
565
- if phase == "unary":
566
- objs = []
567
- for ((_, f_id, obj_id), bbox, gt_label) in zip(object_ids, bboxes, gt_labels):
568
- gt_label = clean_label(gt_label)
569
- objs.append((obj_id, bbox, gt_label))
570
-
571
- formatted_cate_preds = format_cate_preds(cate_preds)
572
- all_colors, all_texts = color_for_cate_correctness(formatted_cate_preds, objs, topk_object)
573
- updated_frame_img = plot_unary(frame_img, objs, all_colors, all_texts)
574
- return updated_frame_img
575
-
576
- else:
577
- # --- Process Binary Predictions & Ground Truth for the Text Pane ---
578
- formatted_binary_preds = format_binary_cate_preds(binary_preds)
579
-
580
- # Ground truth binary relations for the frame.
581
- # Clean ground truth relations.
582
- gt_relations = [(clean_label(str(s)), clean_label(str(o)), clean_label(rel)) for s, o, rel in gt_relations]
583
-
584
- pane_width = 600 # increased pane width for more horizontal space
585
- pane_height = frame_img.shape[0]
586
-
587
- # --- Add header labels to each text pane with extra space ---
588
- header_height = 50 # increased header space
589
- white_pane = get_white_pane(pane_height, pane_width, header_height=header_height)
590
-
591
- combined_image = plot_binary_sg(frame_img, white_pane, formatted_binary_preds, gt_relations, topk_binary)
592
-
593
- return combined_image
594
-
595
- def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
596
- # Ensure mask is a numpy array
597
- mask = np.array(mask)
598
- # Handle different mask shapes
599
- if mask.ndim == 3:
600
- # (1, H, W) -> (H, W)
601
- if mask.shape[0] == 1:
602
- mask = mask.squeeze(0)
603
- # (H, W, 1) -> (H, W)
604
- elif mask.shape[2] == 1:
605
- mask = mask.squeeze(2)
606
- # Now mask should be (H, W)
607
- assert mask.ndim == 2, f"Mask must be 2D after squeezing, got shape {mask.shape}"
608
-
609
- if random_color:
610
- color = np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
611
- else:
612
- cmap = plt.get_cmap("gist_rainbow")
613
- cmap_idx = 0 if obj_id is None else obj_id
614
- color = list(cmap((cmap_idx * 47) % 256))
615
- color[3] = 0.5
616
- color = np.array(color)
617
-
618
- # Expand mask to (H, W, 1) for broadcasting
619
- mask_expanded = mask[..., None]
620
- mask_image = mask_expanded * color.reshape(1, 1, -1)
621
-
622
- # draw a box around the mask with the det_class as the label
623
- if not det_class is None:
624
- # Find the bounding box coordinates
625
- y_indices, x_indices = np.where(mask > 0)
626
- if y_indices.size > 0 and x_indices.size > 0:
627
- x_min, x_max = x_indices.min(), x_indices.max()
628
- y_min, y_max = y_indices.min(), y_indices.max()
629
- rect = Rectangle(
630
- (x_min, y_min),
631
- x_max - x_min,
632
- y_max - y_min,
633
- linewidth=1.5,
634
- edgecolor=color[:3],
635
- facecolor="none",
636
- alpha=color[3]
637
- )
638
- ax.add_patch(rect)
639
- ax.text(
640
- x_min,
641
- y_min - 5,
642
- f"{det_class}",
643
- color="white",
644
- fontsize=6,
645
- backgroundcolor=np.array(color),
646
- alpha=1
647
- )
648
- ax.imshow(mask_image)
649
-
650
- def save_mask_one_image(frame_image, masks, save_path):
651
- """Render masks on top of a frame and store the visualization on disk."""
652
- fig, ax = plt.subplots(1, figsize=(6, 6))
653
-
654
- frame_np = (
655
- frame_image.detach().cpu().numpy()
656
- if torch.is_tensor(frame_image)
657
- else np.asarray(frame_image)
658
- )
659
- frame_np = np.ascontiguousarray(frame_np)
660
-
661
- if isinstance(masks, dict):
662
- mask_iter = masks.items()
663
- else:
664
- mask_iter = enumerate(masks)
665
-
666
- prepared_masks = {
667
- obj_id: (
668
- mask.detach().cpu().numpy()
669
- if torch.is_tensor(mask)
670
- else np.asarray(mask)
671
- )
672
- for obj_id, mask in mask_iter
673
- }
674
-
675
- ax.imshow(frame_np)
676
- ax.axis("off")
677
-
678
- for obj_id, mask_np in prepared_masks.items():
679
- show_mask(mask_np, ax, obj_id=obj_id, det_class=None, random_color=False)
680
-
681
- fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
682
- plt.close(fig)
683
- return save_path
684
-
685
- def get_video_masks_visualization(video_tensor,
686
- video_masks,
687
- video_id,
688
- video_save_base_dir,
689
- oid_class_pred=None,
690
- sample_rate = 1):
691
-
692
- video_save_dir = os.path.join(video_save_base_dir, video_id)
693
- if not os.path.exists(video_save_dir):
694
- os.makedirs(video_save_dir, exist_ok=True)
695
-
696
- for frame_id, image in enumerate(video_tensor):
697
- if frame_id not in video_masks:
698
- print("No mask for Frame", frame_id)
699
- continue
700
-
701
- masks = video_masks[frame_id]
702
- save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
703
- fig, _ = get_mask_one_image(image, masks, oid_class_pred)
- fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
- plt.close(fig)
704
-
705
- def get_mask_one_image(frame_image, masks, oid_class_pred=None):
706
- # Create a figure and axis
707
- fig, ax = plt.subplots(1, figsize=(6, 6))
708
-
709
- # Display the frame image
710
- ax.imshow(frame_image)
711
- ax.axis('off')
712
-
713
- if type(masks) == list:
714
- masks = {i: m for i, m in enumerate(masks)}
715
-
716
- # Add the masks
717
- for obj_id, mask in masks.items():
718
- det_class = f"{obj_id}. {oid_class_pred[obj_id]}" if not oid_class_pred is None else None
719
- show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)
720
-
721
- # Show the plot
722
- return fig, ax
723
-
724
- def save_video(frames, output_filename, output_fps):
725
-
726
- # --- Create a video from all frames ---
727
- num_frames = len(frames)
728
- frame_h, frame_w = frames[0].shape[:2]
729
-
730
- # Use a codec supported by VS Code (H.264 via 'avc1').
731
- fourcc = cv2.VideoWriter_fourcc(*'avc1')
732
- out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))
733
-
734
- print(f"Processing {num_frames} frames...")
735
- for i in range(num_frames):
736
- vis_frame = frames[i]
737
- out.write(vis_frame)
738
- if i % 10 == 0:
739
- print(f"Processed frame {i+1}/{num_frames}")
740
-
741
- out.release()
742
- print(f"Video saved as {output_filename}")
743
-
744
-
745
- def list_depth(lst):
746
- """Calculates the depth of a nested list."""
747
- if not (isinstance(lst, list) or isinstance(lst, torch.Tensor)):
748
- return 0
749
- elif (isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])) or (isinstance(lst, list) and len(lst) == 0):
750
- return 1
751
- else:
752
- return 1 + max(list_depth(item) for item in lst)
753
-
754
- def normalize_prompt(points, labels):
755
- if list_depth(points) == 3:
756
- points = torch.stack([p.unsqueeze(0) for p in points])
757
- labels = torch.stack([l.unsqueeze(0) for l in labels])
758
- return points, labels
759
-
760
-
761
- def show_box(box, ax, object_id):
762
- if len(box) == 0:
763
- return
764
-
765
- cmap = plt.get_cmap("gist_rainbow")
766
- cmap_idx = 0 if object_id is None else object_id
767
- color = list(cmap((cmap_idx * 47) % 256))
768
-
769
- x0, y0 = box[0], box[1]
770
- w, h = box[2] - box[0], box[3] - box[1]
771
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0,0,0,0), lw=2))
772
-
773
- def show_points(coords, labels, ax, object_id=None, marker_size=375):
774
- if len(labels) == 0:
775
- return
776
-
777
- pos_points = coords[labels==1]
778
- neg_points = coords[labels==0]
779
-
780
- cmap = plt.get_cmap("gist_rainbow")
781
- cmap_idx = 0 if object_id is None else object_id
782
- color = list(cmap((cmap_idx * 47) % 256))
783
-
784
- ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='P', s=marker_size, edgecolor=color, linewidth=1.25)
785
- ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='s', s=marker_size, edgecolor=color, linewidth=1.25)
786
-
787
- def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
788
- # Create a figure and axis
789
- fig, ax = plt.subplots(1, figsize=(6, 6))
790
-
791
- # Display the frame image
792
- ax.imshow(frame_image)
793
- ax.axis('off')
794
-
795
- points, labels = normalize_prompt(points, labels)
796
- if type(boxes) == torch.Tensor:
797
- for object_id, box in enumerate(boxes):
798
- # Add the bounding boxes
799
- if not box is None:
800
- show_box(box.cpu(), ax, object_id=object_id)
801
- elif type(boxes) == dict:
802
- for object_id, box in boxes.items():
803
- # Add the bounding boxes
804
- if not box is None:
805
- show_box(box.cpu(), ax, object_id=object_id)
806
- elif type(boxes) == list and len(boxes) == 0:
807
- pass
808
- else:
809
- raise Exception()
810
-
811
- for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
812
- if not len(point_ls) == 0:
813
- show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)
814
-
815
- # Show the plot
816
- plt.savefig(save_path)
817
- plt.close()
818
-
819
- def save_video_prompts_visualization(video_tensor, video_boxes, video_points, video_labels, video_id, video_save_base_dir):
820
- video_save_dir = os.path.join(video_save_base_dir, video_id)
821
- if not os.path.exists(video_save_dir):
822
- os.makedirs(video_save_dir, exist_ok=True)
823
-
824
- for frame_id, image in enumerate(video_tensor):
825
- boxes, points, labels = [], [], []
826
-
827
- if frame_id in video_boxes:
828
- boxes = video_boxes[frame_id]
829
-
830
- if frame_id in video_points:
831
- points = video_points[frame_id]
832
- if frame_id in video_labels:
833
- labels = video_labels[frame_id]
834
-
835
- save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
836
- save_prompts_one_image(image, boxes, points, labels, save_path)
837
-
838
-
839
- def save_video_masks_visualization(video_tensor, video_masks, video_id, video_save_base_dir, oid_class_pred=None, sample_rate = 1):
840
- video_save_dir = os.path.join(video_save_base_dir, video_id)
841
- if not os.path.exists(video_save_dir):
842
- os.makedirs(video_save_dir, exist_ok=True)
843
-
844
- for frame_id, image in enumerate(video_tensor):
845
- if random.random() > sample_rate:
846
- continue
847
- if frame_id not in video_masks:
848
- print("No mask for Frame", frame_id)
849
- continue
850
- masks = video_masks[frame_id]
851
- save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
852
- save_mask_one_image(image, masks, save_path)
853
-
854
-
855
-
856
- def get_color(obj_id, cmap_name="gist_rainbow",alpha=0.5):
857
- cmap = plt.get_cmap(cmap_name)
858
- cmap_idx = 0 if obj_id is None else obj_id
859
- color = list(cmap((cmap_idx * 47) % 256))
860
- color[3] = 0.5
861
- color = np.array(color)
862
- return color
863
-
864
-
865
- def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
866
- return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)
867
-
868
-
869
- def relation_line(
870
- bbox1: Tuple[int, int, int, int],
871
- bbox2: Tuple[int, int, int, int],
872
- ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
873
- """
874
- Returns integer pixel centers suitable for drawing a relation line. For
875
- coincident boxes, nudges the target center to ensure the segment has span.
876
- """
877
- center1 = _bbox_center(bbox1)
878
- center2 = _bbox_center(bbox2)
879
- if math.isclose(center1[0], center2[0], abs_tol=1e-3) and math.isclose(center1[1], center2[1], abs_tol=1e-3):
880
- offset = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
881
- center2 = (center2[0] + offset, center2[1])
882
- start = (int(round(center1[0])), int(round(center1[1])))
883
- end = (int(round(center2[0])), int(round(center2[1])))
884
- if start == end:
885
- end = (end[0] + 1, end[1])
886
- return start, end
887
-
888
- def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
889
- # Create a figure and axis
890
- fig, ax = plt.subplots(1, figsize=(6, 6))
891
-
892
- # Display the frame image
893
- ax.imshow(frame_image)
894
- ax.axis('off')
895
-
896
- all_objs_to_show = set()
897
- all_lines_to_show = []
898
-
899
- # print(rel_pred_ls[0])
900
- for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
901
- all_objs_to_show.add(from_obj_id)
902
- all_objs_to_show.add(to_obj_id)
903
-
904
- from_mask = masks[from_obj_id]
905
- bbox1 = mask_to_bbox(from_mask)
906
- to_mask = masks[to_obj_id]
907
- bbox2 = mask_to_bbox(to_mask)
908
-
909
- c1, c2 = shortest_line_between_bboxes(bbox1, bbox2)
910
-
911
- line_color = get_color(from_obj_id)
912
- face_color = get_color(to_obj_id)
913
- line = c1, c2, face_color, line_color, rel_text
914
- all_lines_to_show.append(line)
915
-
916
- masks_to_show = {}
917
- for oid in all_objs_to_show:
918
- masks_to_show[oid] = masks[oid]
919
-
920
- # Add the masks
921
- for obj_id, mask in masks_to_show.items():
922
- show_mask(mask, ax, obj_id=obj_id, random_color=False)
923
-
924
- for (from_pt_x, from_pt_y), (to_pt_x, to_pt_y), face_color, line_color, rel_text in all_lines_to_show:
925
-
926
- plt.plot([from_pt_x, to_pt_x], [from_pt_y, to_pt_y], color=line_color, linestyle='-', linewidth=3)
927
- mid_pt_x = (from_pt_x + to_pt_x) / 2
928
- mid_pt_y = (from_pt_y + to_pt_y) / 2
929
- ax.text(
930
- mid_pt_x - 5,
931
- mid_pt_y,
932
- rel_text,
933
- color="white",
934
- fontsize=6,
935
- backgroundcolor=np.array(line_color),
936
- bbox=dict(facecolor=face_color, edgecolor=line_color, boxstyle='round,pad=1'),
937
- alpha=1
938
- )
939
-
940
- # Show the plot
941
- return fig, ax
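The `relation_line` helper removed above (and presumably retained in the top-level `vine_hf/vis_utils.py` that this commit keeps) is the one routine in this file with a documented contract: it returns integer pixel centers and guarantees a non-degenerate segment even when both boxes coincide. A minimal sketch exercising that behavior; the import path is an assumption, not something this diff confirms:

```python
# Hedged sketch (not part of the commit): checking relation_line's coincident-box
# handling, assuming the helper is importable from vine_hf.vis_utils.
from vine_hf.vis_utils import relation_line

# Distinct boxes: the endpoints are simply the rounded box centers.
start, end = relation_line((0, 0, 10, 10), (20, 0, 30, 10))
assert start == (5, 5) and end == (25, 5)

# Coincident boxes: the target center is nudged so the segment has nonzero length.
start, end = relation_line((0, 0, 10, 10), (0, 0, 10, 10))
assert start == (5, 5) and end == (6, 5)
```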
test_vine.py CHANGED
@@ -8,8 +8,11 @@ import torch
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-# Add src to path
-sys.path.insert(0, str(Path(__file__).parent / "src"))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 # Determine device
 device = "cuda" if torch.cuda.is_available() else "cpu"
vine_hf/__init__.py CHANGED
@@ -2,13 +2,27 @@
 VINE HuggingFace Interface
 
 VINE (Video Understanding with Natural Language) is a model that processes videos
-along with categorical, unary, and binary keywords to return probability
+along with categorical, unary, and binary keywords to return probability
 distributions over those keywords for detected objects and their relationships.
 
 This package provides a HuggingFace-compatible interface for the VINE model,
 including configuration, model, and pipeline classes.
 """
 
+import sys
+from pathlib import Path
+
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
+# Add LASER directory to sys.path (laser module is inside src/LASER/)
+laser_dir = src_dir / "LASER"
+if laser_dir.is_dir() and str(laser_dir) not in sys.path:
+    sys.path.insert(0, str(laser_dir))
+
 from .vine_config import VineConfig
 from .vine_model import VineModel
 from .vine_pipeline import VinePipeline
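The same five-line sys.path bootstrap recurs in every module this commit touches; only the relative location of `src/` differs (`parent / "src"` for scripts at the repository root, `parent.parent / "src"` for modules inside `vine_hf/`). A hedged sketch of how the pattern could be factored into one shared helper; the module name `_paths.py` and the function name are hypothetical, not part of this commit:

```python
# Hypothetical shared helper (e.g. vine_hf/_paths.py); a sketch, not the commit's code.
import sys
from pathlib import Path


def ensure_src_on_path(anchor_file: str, levels_up: int = 2) -> None:
    """Prepend <repo>/src and <repo>/src/LASER to sys.path when they exist."""
    repo_root = Path(anchor_file).resolve().parents[levels_up - 1]
    for candidate in (repo_root / "src", repo_root / "src" / "LASER"):
        if candidate.is_dir() and str(candidate) not in sys.path:
            sys.path.insert(0, str(candidate))


# A module inside vine_hf/ would then reduce its preamble to:
#   from vine_hf._paths import ensure_src_on_path
#   ensure_src_on_path(__file__)        # repo root is two levels above vine_hf/<module>.py
# while a script at the repository root (e.g. test_vine.py) would pass levels_up=1.
```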
vine_hf/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/__init__.cpython-310.pyc and b/vine_hf/__pycache__/__init__.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_config.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_config.cpython-310.pyc and b/vine_hf/__pycache__/vine_config.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_model.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_model.cpython-310.pyc and b/vine_hf/__pycache__/vine_model.cpython-310.pyc differ
 
vine_hf/convert_inference.py CHANGED
@@ -7,12 +7,16 @@ to the new HuggingFace-compatible interface.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from typing import Dict, List, Tuple, Any
 
-# Add paths for imports
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 from laser.loading import load_video
vine_hf/example_ensemble_weights.py CHANGED
@@ -7,14 +7,18 @@ and use them with the HuggingFace interface, based on the actual inference.py wo
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from transformers.pipelines import PIPELINE_REGISTRY
 
 #os.environ["OPENAI_API_KEY"]="dummy-key" # Set your OpenAI API key here or via environment variable
 
-# Add the parent directory to the path to import vine_hf
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 from laser.loading import load_video
vine_hf/example_sam2_masks.py CHANGED
@@ -7,13 +7,16 @@ segmentation methods with the VINE model.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import numpy as np
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add the parent directory to the path to import vine_hf
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-# Add the parent directory to the path to import vine_hf
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 #Either uncomment the below or set a environemental key, though it isn't needed to run.
 #os.environ['OPENAI_API_KEY'] = 'dummy-key'
vine_hf/example_usage.py CHANGED
@@ -7,12 +7,16 @@ for video understanding with categorical, unary, and binary keyword predictions.
 
 import os
 import sys
+from pathlib import Path
 import torch
 from transformers import pipeline, AutoModel
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add the parent directory to the path to import vine_hf
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 # Uncomment or set your own
 #os.environ['OPENAI_API_KEY'] = 'dummy-key'
vine_hf/example_visualization.py CHANGED
@@ -5,6 +5,7 @@
 
 import os
 import sys
+from pathlib import Path
 import argparse
 import cv2
 import numpy as np
@@ -16,8 +17,11 @@ from transformers import pipeline
 # Set your OpenAI API key here or via environment variable
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-# Local imports (workspace)
-sys.path.append(os.path.dirname(__file__))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
 from vine_hf.vine_model import VineModel
vine_hf/example_with_pretrained_vine.py CHANGED
@@ -7,6 +7,7 @@ from the ensemble format or from video-fm/vine_v0.
 
 import os
 import sys
+from pathlib import Path
 import torch
 from transformers import pipeline
 from transformers.pipelines import PIPELINE_REGISTRY
@@ -14,8 +15,11 @@ from transformers.pipelines import PIPELINE_REGISTRY
 # Set your OpenAI API key here or via environment variable
 #os.environ['OPENAI_API_KEY'] = "dummy-key"
 
-# Add the parent directory to the path to import vine_hf
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 from vine_hf import VineConfig, VineModel, VinePipeline
 
vine_hf/push_to_hub.py CHANGED
@@ -7,13 +7,17 @@ for easy sharing and distribution.
 
 import os
 import sys
+from pathlib import Path
 import torch
 import argparse
 from huggingface_hub import notebook_login
 from transformers.pipelines import PIPELINE_REGISTRY
 
-# Add the parent directory to the path to import vine_hf
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 from vine_hf import VineConfig, VineModel, VinePipeline
vine_hf/push_to_video_fm.py CHANGED
@@ -15,10 +15,11 @@ from transformers.pipelines import PIPELINE_REGISTRY
 from transformers import AutoModel
 from safetensors.torch import save_file
 
-# Add the parent directory to path to enable vine_hf imports
-current_dir = Path(__file__).parent
-parent_dir = current_dir.parent
-sys.path.insert(0, str(parent_dir))
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
 
 os.environ['OPENAI_API_KEY'] = "dummy-key"
 
vine_hf/vine_model.py CHANGED
@@ -1,7 +1,14 @@
 import os
 import sys
+from pathlib import Path
 from typing import Dict, List, Tuple, Optional, Any, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 import cv2
 import numpy as np
 import torch
vine_hf/vine_pipeline.py CHANGED
@@ -1,10 +1,17 @@
 import os
+import sys
 import uuid
 import hashlib
 import tempfile
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional, Any, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 import cv2
 import numpy as np
 import torch
vine_hf/vis_utils.py CHANGED
@@ -1,6 +1,8 @@
 import os
+import sys
+from pathlib import Path
 import cv2
-import numpy as np
+import numpy as np
 import matplotlib.pyplot as plt
 import torch
 import random
@@ -9,6 +11,12 @@ from matplotlib.patches import Rectangle
 import itertools
 from typing import Any, Dict, List, Tuple, Optional, Union
 
+# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+if src_dir.is_dir() and str(src_dir) not in sys.path:
+    sys.path.insert(0, str(src_dir))
+
 from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
 
 ########################################################################################
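One detail worth noting across these hunks: the old bootstraps were inconsistent; most of them appended the repository root (or, in one case, the package directory itself), while only a couple prepended a path. The new block uniformly targets `src/`, checks that it exists, deduplicates against `sys.path`, and uses `insert(0, ...)` so the vendored packages shadow any installed copies. A minimal, self-contained illustration of the append-versus-prepend difference; the path and package name below are placeholders, not values from this commit:

```python
# Hedged sketch: why the new bootstrap prepends instead of appending.
# '/repo/src' and 'somepkg' are placeholders.
import sys

sys.path.append("/repo/src")      # old style: searched after site-packages,
                                  # so an installed 'somepkg' would win
sys.path.insert(0, "/repo/src")   # new style: searched first, so the vendored
                                  # copy under /repo/src takes precedence
```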