Commit c84f1a7
Parent(s): a38b03f

feat: initial push for the training weight
gesturenet/config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.25.1",
+  "act_fn": "silu",
+  "addition_time_embed_dim": 256,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "DownBlockSpatioTemporal"
+  ],
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 8,
+  "layers_per_block": 2,
+  "mid_block_type": "UNetMidBlockSpatioTemporal",
+  "num_attention_heads": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "projection_class_embeddings_input_dim": 768,
+  "transformer_layers_per_block": 1
+}
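Note: although "_class_name" is "ControlNetModel", the block types are spatio-temporal, so this config describes a ControlNet variant for Stable Video Diffusion and the training code presumably defines its own model class. A minimal sketch of inspecting the config with the standard json module; the relative path is an assumption based on the file layout in this commit:

    import json

    # Read the gesturenet config added in this commit (path assumed relative to a local clone).
    with open("gesturenet/config.json") as f:
        cfg = json.load(f)

    # Spatio-temporal down blocks indicate a video (SVD-style) ControlNet rather than the image one.
    print(cfg["_class_name"], cfg["down_block_types"])
    print("conditioning channels:", cfg["conditioning_channels"])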
gesturenet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b922e2a38adb5a57495fd07aa87b2a5f0e49118814490655fd4d9a4f2d65679
+size 2723872604
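The committed text above is only a Git LFS pointer; the actual weights are fetched by LFS. A hedged, standard-library-only sketch for checking that a locally downloaded weight file matches the pointer's oid (the local path is an assumption, e.g. after git lfs pull):

    import hashlib

    path = "gesturenet/diffusion_pytorch_model.safetensors"  # assumed local path after LFS download
    expected_oid = "7b922e2a38adb5a57495fd07aa87b2a5f0e49118814490655fd4d9a4f2d65679"

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)

    print("sha256 matches pointer:", h.hexdigest() == expected_oid)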
gesturenet/train_image2video_gesturenet.yaml ADDED
@@ -0,0 +1,105 @@
+
+# Model Setting
+pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # stabilityai/pretrained
+load_unet_path: ../saved_weights/v3_VL_raw_prompt/checkpoint-99000 # None or a specific path; the path must contain a unet folder with the usual UNet files (e.g., config and weights)
+video_seq_length: 14
+process_fps: 7
+train_noise_aug_strength: 0.1
+scheduler: EDM
+conditioning_dropout_prob: 0.1 # 0.0 / 0.1
+
+
+# Dataset Setting
+data_loader_type: thisthat # flow / traj / thisthat
+dataset_name: Bridge # WebVid / Bridge
+dataset_path: [../datasets_rob/Bridge_v1_TT14, ../datasets_rob/Bridge_v2_TT14] # ../Bridge_filter_flow, ../Bridge_v2_filter_flow/]
+output_dir: checkpoints/img2video
+dataset_decode_fps: 2
+height: 256 # Ratios that are functional: 256:384 576:1024 320:448 320:576 512:640 448:640
+width: 384 # Height and width should be multiples of 64
+dataloader_num_workers: 4 # For debugging, 1 is enough
+flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+
+
+# Text setting
+use_text: True # If True, text prompts are used
+pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+empty_prompts_proportion: 0.0
+
+
+# Mask setting
+mask_unet_vae: False # Whether to mask latents to zero padding
+mask_controlnet_vae: True
+mask_proportion: 0.7
+
+
+# Condition Setting
+conditioning_channels: 3 # Usually it is 3
+num_points_left: # 1 # For flow: only one of flow_select_rate and num_points_left can be used; num_points_left takes priority
+flow_select_rate: 0.99 # For flow
+threshold_factor: 0.2 # For flow
+dilate: True # Traj must be True for dilate
+inner_conditioning_scale: 1.0 # Conditioning scale for the internal value; the default is 1.0
+outer_conditioning_scale: 1.0 # Outer conditioning scale for the whole trainable conditioning copy (interestingly, this was accidentally set to 2.0 at one point)
+
+
+# Motion setting
+motion_bucket_id: 200
+dataset_motion_mean: 25 # For 14 fps, it is N(25, 10)
+dataset_motion_std: 10 # For 25 fps, it is N(18, 7)
+svd_motion_mean: 180
+svd_motion_std: 30
+
+
+
+# Training setting
+resume_from_checkpoint: False # latest/False
+# /nfs/turbo/jjparkcv-turbo-large/boyangwa/saved_weights/formal_trial1/checkpoint-170000
+validation_img_folder: datasets/validation_TT14
+num_train_epochs: 10
+partial_finetune: False
+train_batch_size: 1 # This is the batch size per GPU
+checkpointing_steps: 3000
+validation_step: 250
+logging_name: logging
+enable_xformers_memory_efficient_attention: False # Keep this False; enabling it produces very low-quality images in the pipeline (possibly has no effect for this function)
+seed: 42
+validation_store_folder: validation_videos
+checkpoints_total_limit: 10
+train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance in training
+
+
+# Noise Strength
+noise_mean: 0.5
+noise_std: 1.4
+
+
+# Inference
+num_inference_steps: 25
+use_instructpix2pix: False # Whether to use the InstructPix2Pix mode, which involves 3 inputs; it may need tuning for better results
+inference_noise_aug_strength: 0.1
+inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+inference_guess_mode: False # Whether to use guess mode in the ControlNet
+image_guidance_scale: 2.5 # Empirically, 2.5 is the best value
+
+
+# Learning Rate and Optimizer
+learning_rate: 5e-6 # At most 1e-5 at the beginning
+scale_lr: False # TODO: Is it needed to scale the learning rate?
+adam_beta1: 0.9
+adam_beta2: 0.999
+use_8bit_adam: True # Needed to save memory
+adam_weight_decay: 1e-2
+adam_epsilon: 1e-08
+lr_scheduler_name: constant # None | constant | cosine | cosine_with_restarts (If it's None, the custom LR decay below is used)
+lr_warmup_steps: 500
+lr_decay_steps: []
+lr_double_steps: []
+lr_decay_scale: 0.5
+
+
+# Other Setting
+mixed_precision: fp16
+gradient_accumulation_steps: 1
+gradient_checkpointing: 1
+report_to: tensorboard
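A minimal sketch of how a training script might consume this YAML, assuming PyYAML; the keys come from the file above, but the loading convention itself is an assumption rather than this repo's actual entry point:

    import yaml

    with open("gesturenet/train_image2video_gesturenet.yaml") as f:
        config = yaml.safe_load(f)

    # A few of the settings defined above (inline comments are stripped by the YAML parser).
    print(config["pretrained_model_name_or_path"])  # stabilityai/stable-video-diffusion-img2vid
    print(config["video_seq_length"], config["height"], config["width"])
    print(config["learning_rate"], config["mixed_precision"])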
unet/config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "_class_name": "UNetSpatioTemporalConditionModel",
+  "_diffusers_version": "0.25.1",
+  "_name_or_path": "checkpoints/img2video/checkpoint-87000",
+  "addition_time_embed_dim": 256,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "DownBlockSpatioTemporal"
+  ],
+  "in_channels": 8,
+  "layers_per_block": 2,
+  "num_attention_heads": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "num_frames": 14,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 768,
+  "sample_size": 96,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal"
+  ]
+}
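Since unet/config.json declares UNetSpatioTemporalConditionModel from diffusers 0.25.1, the weights in this folder should load with the stock diffusers class. A hedged sketch; using "." as a local clone of this repo is an assumption:

    import torch
    from diffusers import UNetSpatioTemporalConditionModel

    # Load the fine-tuned SVD UNet pushed in this commit from a local clone of the repo.
    unet = UNetSpatioTemporalConditionModel.from_pretrained(
        ".", subfolder="unet", torch_dtype=torch.float16
    )
    print(unet.config.num_frames, unet.config.sample_size)  # 14, 96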
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c59f3d69e5c30e58fd9950d1dc4c0144b6afe30c5e43a31f10839f6237d7e4b
+size 6098682464
unet/train_image2video.yaml ADDED
@@ -0,0 +1,80 @@
+
+# Model Setting
+pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # the -xt variant is the 25-frame version
+load_unet_path:
+video_seq_length: 14 # Standardized to 14
+train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance
+inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+process_fps: 7
+train_noise_aug_strength: 0.1
+scheduler: EDM
+conditioning_dropout_prob: 0.1
+
+
+# Dataset Setting
+dataset_name: Bridge # WebVid / Bridge
+dataset_path: [../datasets_rob/Bridge_v1_raw, ../datasets_rob/Bridge_v2_raw]
+output_dir: checkpoints/img2video
+height: 256 # Ratios that are functional: 256:384 576:1024 320:512 320:576
+width: 384 # Height and width should be multiples of 64
+dataloader_num_workers: 2 # For debugging, fewer workers suffice
+flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+acceleration_tolerance: 5
+
+
+# Text setting
+use_text: True # If True, text prompts are used
+pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+empty_prompts_proportion: 0.0 # Unused now; CFG is already applied in training
+
+
+# Motion setting (be very careful about the 14 vs. 25 fps settings)
+motion_bucket_id: 200 # Set this for an exact value; if it is none, the settings below are used
+dataset_motion_mean: 35.3 # For 14 fps, it is N(35.3, 18.5)
+dataset_motion_std: 18.5 # For 25 fps, it is N(?, ?)
+svd_motion_mean: 165
+svd_motion_std: 22.5
+
+
+# Training setting
+resume_from_checkpoint: False # latest/False
+num_train_epochs: 40
+partial_finetune: False # Whether to tune only the central parameters to speed up training
+train_batch_size: 1 # This is the batch size per GPU
+use_ema: False
+checkpointing_steps: 3000
+validation_step: 300
+logging_name: logging
+enable_xformers_memory_efficient_attention: False # Keep this False; enabling it produces very low-quality images in the pipeline (possibly has no effect for this function)
+seed: 42
+validation_img_folder: datasets/validation_raw
+validation_store_folder: validation_videos
+checkpoints_total_limit: 20
+
+# Noise Strength
+noise_mean: 0.5 # Regular Img2Video: (0.7, 1.6); Text2Video: (0.5, 1.4)
+noise_std: 1.4
+
+
+# Inference
+num_inference_steps: 25
+inference_noise_aug_strength: 0.1
+
+
+# Learning Rate and Optimizer
+learning_rate: 1e-5 # Usually this is fine
+scale_lr: False # TODO: Is it needed to scale the learning rate?
+adam_beta1: 0.9
+adam_beta2: 0.999
+use_8bit_adam: True # Needed to save memory
+adam_weight_decay: 1e-2
+adam_epsilon: 1e-08
+lr_warmup_steps: 500
+lr_decay_steps: 7000 # UNet training usually uses a constant LR
+
+
+# Other Setting
+mixed_precision: fp16
+gradient_accumulation_steps: 1 # ????
+gradient_checkpointing: 1 # ????
+report_to: tensorboard
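The motion settings suggest that, when motion_bucket_id is not fixed, a per-clip motion score is standardized against the dataset statistics and rescaled onto SVD's motion-bucket distribution. The commit does not show that code, so the snippet below is only a guess at the mapping, reusing the mean/std values from the YAML above:

    # Hypothetical remapping of a per-clip motion score onto SVD's motion-bucket range.
    # This is an assumption about how dataset_motion_mean/std and svd_motion_mean/std are used;
    # the actual training code may differ.
    def motion_to_bucket(clip_motion: float,
                         dataset_mean: float = 35.3, dataset_std: float = 18.5,
                         svd_mean: float = 165.0, svd_std: float = 22.5) -> int:
        z = (clip_motion - dataset_mean) / dataset_std  # standardize against the dataset stats
        return round(svd_mean + z * svd_std)            # rescale into SVD's bucket distribution

    print(motion_to_bucket(35.3))  # a clip with average motion maps to ~165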