HikariDawn777 committed on
Commit c84f1a7 · 1 Parent(s): a38b03f

feat: initial push for the training weights

gesturenet/config.json ADDED
@@ -0,0 +1,42 @@
+ {
+ "_class_name": "ControlNetModel",
+ "_diffusers_version": "0.25.1",
+ "act_fn": "silu",
+ "addition_time_embed_dim": 256,
+ "block_out_channels": [
+ 320,
+ 640,
+ 1280,
+ 1280
+ ],
+ "conditioning_channels": 3,
+ "conditioning_embedding_out_channels": [
+ 16,
+ 32,
+ 96,
+ 256
+ ],
+ "controlnet_conditioning_channel_order": "rgb",
+ "cross_attention_dim": 1024,
+ "down_block_types": [
+ "CrossAttnDownBlockSpatioTemporal",
+ "CrossAttnDownBlockSpatioTemporal",
+ "CrossAttnDownBlockSpatioTemporal",
+ "DownBlockSpatioTemporal"
+ ],
+ "encoder_hid_dim": null,
+ "encoder_hid_dim_type": null,
+ "flip_sin_to_cos": true,
+ "freq_shift": 0,
+ "in_channels": 8,
+ "layers_per_block": 2,
+ "mid_block_type": "UNetMidBlockSpatioTemporal",
+ "num_attention_heads": [
+ 5,
+ 10,
+ 20,
+ 20
+ ],
+ "projection_class_embeddings_input_dim": 768,
+ "transformer_layers_per_block": 1
+ }
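For context, this config describes a ControlNet-style conditioning branch built from SVD's spatiotemporal blocks; the stock `diffusers` `ControlNetModel` targets 2D UNets, so loading it presumably requires the custom GestureNet class from the accompanying training code. A minimal sketch (the file path assumes a local clone of this repo) of inspecting the architecture-defining fields:

```python
import json

# Inspect the gesturenet config added in this commit
# (path assumed relative to a local clone of the repo).
with open("gesturenet/config.json") as f:
    cfg = json.load(f)

# Fields that define the conditioning branch: in_channels = 8 matches SVD's
# concatenated [noisy latent, conditioning-frame latent] input, and the
# spatiotemporal down blocks mirror the SVD UNet encoder.
print(cfg["_class_name"], cfg["_diffusers_version"])
print("input channels:", cfg["in_channels"])
print("conditioning channels:", cfg["conditioning_channels"])
print("down blocks:", cfg["down_block_types"])
print("attention heads per stage:", cfg["num_attention_heads"])
```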
gesturenet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b922e2a38adb5a57495fd07aa87b2a5f0e49118814490655fd4d9a4f2d65679
+ size 2723872604
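The weight file itself is stored via Git LFS, so only this pointer appears in the diff. A minimal sketch of fetching the actual ~2.7 GB checkpoint with `huggingface_hub` (the repo id below is a placeholder, not taken from this page):

```python
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the actual model repo this commit belongs to.
weight_path = hf_hub_download(
    repo_id="HikariDawn777/<model-repo>",
    filename="gesturenet/diffusion_pytorch_model.safetensors",
)
print(weight_path)  # local cache path of the downloaded checkpoint
```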
gesturenet/train_image2video_gesturenet.yaml ADDED
@@ -0,0 +1,105 @@
+
+ # Model Setting
+ pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # stabilityai/pretrained
+ load_unet_path: ../saved_weights/v3_VL_raw_prompt/checkpoint-99000 # None / a specific path. Under this path there should be a folder named unet containing the usual unet files (e.g., config and weights)
+ video_seq_length: 14
+ process_fps: 7
+ train_noise_aug_strength: 0.1
+ scheduler: EDM
+ conditioning_dropout_prob: 0.1 # 0.0 / 0.1
+
+
+ # Dataset Setting
+ data_loader_type: thisthat # flow / traj / thisthat
+ dataset_name: Bridge # WebVid / Bridge
+ dataset_path: [../datasets_rob/Bridge_v1_TT14, ../datasets_rob/Bridge_v2_TT14] # ../Bridge_filter_flow, ../Bridge_v2_filter_flow/]
+ output_dir: checkpoints/img2video
+ dataset_decode_fps: 2
+ height: 256 # Resolutions known to work: 256:384, 576:1024, 320:448, 320:576, 512:640, 448:640
+ width: 384 # Height and width should be multiples of 64
+ dataloader_num_workers: 4 # For debugging, 1 is enough
+ flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+
+
+ # Text setting
+ use_text: True # If True, text prompts are used
+ pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+ empty_prompts_proportion: 0.0
+
+
+ # Mask setting
+ mask_unet_vae: False # Whether we use a mask to zero-pad the latents
+ mask_controlnet_vae: True
+ mask_proportion: 0.7
+
+
+ # Condition Setting
+ conditioning_channels: 3 # Usually 3
+ num_points_left: # 1 # For flow: choose only one of flow_select_rate and num_points_left; num_points_left takes priority
+ flow_select_rate: 0.99 # For flow
+ threshold_factor: 0.2 # For flow
+ dilate: True # Traj mode requires dilate to be True
+ inner_conditioning_scale: 1.0 # Conditioning scale for the internal value; the default starts from 1.0
+ outer_conditioning_scale: 1.0 # Outer conditioning scale for the whole conditioning trainable copy. This is a bit interesting; it was accidentally set to 2.0.
+
+
+ # Motion setting
+ motion_bucket_id: 200
+ dataset_motion_mean: 25 # For 14 fps, it is N(25, 10)
+ dataset_motion_std: 10 # For 25 fps, it is N(18, 7)
+ svd_motion_mean: 180
+ svd_motion_std: 30
+
+
+
+ # Training setting
+ resume_from_checkpoint: False # latest/False
+ # /nfs/turbo/jjparkcv-turbo-large/boyangwa/saved_weights/formal_trial1/checkpoint-170000
+ validation_img_folder: datasets/validation_TT14
+ num_train_epochs: 10
+ partial_finetune: False
+ train_batch_size: 1 # This is the batch size per GPU
+ checkpointing_steps: 3000
+ validation_step: 250
+ logging_name: logging
+ enable_xformers_memory_efficient_attention: False # Please keep this False, since it produces very low-quality images in the pipeline (possibly unnecessary for this function)
+ seed: 42
+ validation_store_folder: validation_videos
+ checkpoints_total_limit: 10
+ train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance in training
+
+
+ # Noise Strength
+ noise_mean: 0.5
+ noise_std: 1.4
+
+
+ # Inference
+ num_inference_steps: 25
+ use_instructpix2pix: False # Whether to use the InstructPix2Pix mode, which involves 3 inputs; it may need tuning for better results.
+ inference_noise_aug_strength: 0.1
+ inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+ inference_guess_mode: False # Whether to use guess mode in the controlnet
+ image_guidance_scale: 2.5 # Empirically, 2.5 is the best value
+
+
+ # Learning Rate and Optimizer
+ learning_rate: 5e-6 # At most 1e-5 at the beginning
+ scale_lr: False # TODO: Is it needed to scale the learning rate?
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ use_8bit_adam: True # Needed to save memory
+ adam_weight_decay: 1e-2
+ adam_epsilon: 1e-08
+ lr_scheduler_name: constant # None | constant | cosine | cosine_with_restarts (if None, the custom LR decay below is used)
+ lr_warmup_steps: 500
+ lr_decay_steps: []
+ lr_double_steps: []
+ lr_decay_scale: 0.5
+
+
+ # Other Setting
+ mixed_precision: fp16
+ gradient_accumulation_steps: 1
+ gradient_checkpointing: 1
+ report_to: tensorboard
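A minimal sketch of how such a YAML config might be consumed in a training script (the loader below is a generic assumption, not code from this repo):

```python
import yaml  # pip install pyyaml

# Load the training config shipped alongside the gesturenet weights.
with open("gesturenet/train_image2video_gesturenet.yaml") as f:
    config = yaml.safe_load(f)

# The YAML maps to a plain dict; a few fields that drive the run:
print(config["pretrained_model_name_or_path"])              # SVD img2vid base
print(config["video_seq_length"], config["process_fps"])    # 14 frames at 7 fps
print(config["height"], config["width"])                     # 256 x 384, multiples of 64
print(config["scheduler"], config["conditioning_dropout_prob"])
```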
unet/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "_class_name": "UNetSpatioTemporalConditionModel",
+ "_diffusers_version": "0.25.1",
+ "_name_or_path": "checkpoints/img2video/checkpoint-87000",
+ "addition_time_embed_dim": 256,
+ "block_out_channels": [
+ 320,
+ 640,
+ 1280,
+ 1280
+ ],
+ "cross_attention_dim": 1024,
+ "down_block_types": [
+ "CrossAttnDownBlockSpatioTemporal",
+ "CrossAttnDownBlockSpatioTemporal",
+ "CrossAttnDownBlockSpatioTemporal",
+ "DownBlockSpatioTemporal"
+ ],
+ "in_channels": 8,
+ "layers_per_block": 2,
+ "num_attention_heads": [
+ 5,
+ 10,
+ 20,
+ 20
+ ],
+ "num_frames": 14,
+ "out_channels": 4,
+ "projection_class_embeddings_input_dim": 768,
+ "sample_size": 96,
+ "transformer_layers_per_block": 1,
+ "up_block_types": [
+ "UpBlockSpatioTemporal",
+ "CrossAttnUpBlockSpatioTemporal",
+ "CrossAttnUpBlockSpatioTemporal",
+ "CrossAttnUpBlockSpatioTemporal"
+ ]
+ }
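This is a standard `diffusers` `UNetSpatioTemporalConditionModel` config (SVD's video UNet, here fine-tuned from checkpoint-87000, 14 frames, sample size 96). A minimal sketch of loading it from a local clone of this repo; the path is a placeholder:

```python
import torch
from diffusers import UNetSpatioTemporalConditionModel

# Load the fine-tuned video UNet from the "unet" subfolder
# (the repo path below is a placeholder).
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "path/to/local/repo",
    subfolder="unet",
    torch_dtype=torch.float16,
)
print(unet.config.num_frames, unet.config.sample_size)  # 14, 96
```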
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c59f3d69e5c30e58fd9950d1dc4c0144b6afe30c5e43a31f10839f6237d7e4b
+ size 6098682464
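As with the gesturenet weights, only the LFS pointer is shown; the ~6.1 GB checkpoint itself lives in LFS storage. A minimal sketch of inspecting the downloaded file directly with `safetensors` (the local path is assumed, e.g. the result of `hf_hub_download`):

```python
from safetensors.torch import load_file

# Path to the downloaded checkpoint (assumed to exist locally).
state_dict = load_file("unet/diffusion_pytorch_model.safetensors")

# Quick sanity checks on the tensor inventory.
print(len(state_dict), "tensors")
print(next(iter(state_dict)))  # name of the first parameter
total_params = sum(t.numel() for t in state_dict.values())
print(f"{total_params / 1e9:.2f} B parameters")
```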
unet/train_image2video.yaml ADDED
@@ -0,0 +1,80 @@
+
+ # Model Setting
+ pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # -xt is the 25-frame version
+ load_unet_path:
+ video_seq_length: 14 # Standardized to 14
+ train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance
+ inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+ process_fps: 7
+ train_noise_aug_strength: 0.1
+ scheduler: EDM
+ conditioning_dropout_prob: 0.1
+
+
+ # Dataset Setting
+ dataset_name: Bridge # WebVid / Bridge
+ dataset_path: [../datasets_rob/Bridge_v1_raw, ../datasets_rob/Bridge_v2_raw]
+ output_dir: checkpoints/img2video
+ height: 256 # Resolutions known to work: 256:384, 576:1024, 320:512, 320:576
+ width: 384 # Height and width should be multiples of 64
+ dataloader_num_workers: 2 # For debugging, 1 is enough
+ flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+ acceleration_tolerance: 5
+
+
+ # Text setting
+ use_text: True # If True, text prompts are used
+ pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+ empty_prompts_proportion: 0.0 # Unused now; we already have CFG in training
+
+
+ # Motion setting: BE VERY CAREFUL about the 14 vs 25 fps settings
+ motion_bucket_id: 200 # Set this for an exact value; if None, the settings below are used
+ dataset_motion_mean: 35.3 # For 14 fps, it is N(35.3, 18.5)
+ dataset_motion_std: 18.5 # For 25 fps, it is N(?, ?)
+ svd_motion_mean: 165
+ svd_motion_std: 22.5
+
+
+ # Training setting
+ resume_from_checkpoint: False # latest/False
+ num_train_epochs: 40
+ partial_finetune: False # Whether to tune only the central params to speed up training
+ train_batch_size: 1 # This is the batch size per GPU
+ use_ema: False
+ checkpointing_steps: 3000
+ validation_step: 300
+ logging_name: logging
+ enable_xformers_memory_efficient_attention: False # Please keep this False, since it produces very low-quality images in the pipeline (possibly unnecessary for this function)
+ seed: 42
+ validation_img_folder: datasets/validation_raw
+ validation_store_folder: validation_videos
+ checkpoints_total_limit: 20
+
+ # Noise Strength
+ noise_mean: 0.5 # Regular Img2Video: (0.7, 1.6); Text2Video: (0.5, 1.4)
+ noise_std: 1.4
+
+
+ # Inference
+ num_inference_steps: 25
+ inference_noise_aug_strength: 0.1
+
+
+ # Learning Rate and Optimizer
+ learning_rate: 1e-5 # Usually this is OK
+ scale_lr: False # TODO: Is it needed to scale the learning rate?
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ use_8bit_adam: True # Needed to save memory
+ adam_weight_decay: 1e-2
+ adam_epsilon: 1e-08
+ lr_warmup_steps: 500
+ lr_decay_steps: 7000 # Usually UNet training is constant
+
+
+ # Other Setting
+ mixed_precision: fp16
+ gradient_accumulation_steps: 1 # ????
+ gradient_checkpointing: 1 # ????
+ report_to: tensorboard
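The `scheduler: EDM` setting together with the `noise_mean` / `noise_std` pairs above suggests the usual EDM-style log-normal sigma sampling used for SVD-style training (often called P_mean / P_std). A hedged sketch of what that sampling likely looks like; this is the standard formulation, not code taken from this repo:

```python
import torch

def sample_edm_sigmas(batch_size: int,
                      noise_mean: float = 0.5,
                      noise_std: float = 1.4) -> torch.Tensor:
    """Draw per-sample noise levels sigma = exp(N(noise_mean, noise_std)),
    the log-normal schedule used in EDM-style diffusion training."""
    return torch.exp(torch.randn(batch_size) * noise_std + noise_mean)

# Example with the defaults from this config (noise_mean=0.5, noise_std=1.4).
sigmas = sample_edm_sigmas(4)
print(sigmas)
```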