Commit c84f1a7
Parent(s): a38b03f

feat: initial push for the training weight
gesturenet/config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.25.1",
+  "act_fn": "silu",
+  "addition_time_embed_dim": 256,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "DownBlockSpatioTemporal"
+  ],
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 8,
+  "layers_per_block": 2,
+  "mid_block_type": "UNetMidBlockSpatioTemporal",
+  "num_attention_heads": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "projection_class_embeddings_input_dim": 768,
+  "transformer_layers_per_block": 1
+}
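Note: although "_class_name" is "ControlNetModel", the block types are spatio-temporal, so this config describes a ControlNet variant for Stable Video Diffusion and the training code presumably defines its own model class. A minimal sketch of inspecting the config with the standard json module; the relative path is an assumption based on the file layout in this commit:

    import json

    # Read the gesturenet config added in this commit (path assumed relative to a local clone).
    with open("gesturenet/config.json") as f:
        cfg = json.load(f)

    # Spatio-temporal down blocks indicate a video (SVD-style) ControlNet rather than the image one.
    print(cfg["_class_name"], cfg["down_block_types"])
    print("conditioning channels:", cfg["conditioning_channels"])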
gesturenet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b922e2a38adb5a57495fd07aa87b2a5f0e49118814490655fd4d9a4f2d65679
+size 2723872604
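The committed text above is only a Git LFS pointer; the actual weights are fetched by LFS. A hedged, standard-library-only sketch for checking that a locally downloaded weight file matches the pointer's oid (the local path is an assumption, e.g. after git lfs pull):

    import hashlib

    path = "gesturenet/diffusion_pytorch_model.safetensors"  # assumed local path after LFS download
    expected_oid = "7b922e2a38adb5a57495fd07aa87b2a5f0e49118814490655fd4d9a4f2d65679"

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)

    print("sha256 matches pointer:", h.hexdigest() == expected_oid)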
gesturenet/train_image2video_gesturenet.yaml ADDED
@@ -0,0 +1,105 @@
+
+# Model Setting
+pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # stabilityai/pretrained
+load_unet_path: ../saved_weights/v3_VL_raw_prompt/checkpoint-99000 # None or a specific path; the path must contain a unet folder with the usual UNet files (e.g., config and weights)
+video_seq_length: 14
+process_fps: 7
+train_noise_aug_strength: 0.1
+scheduler: EDM
+conditioning_dropout_prob: 0.1 # 0.0 / 0.1
+
+
+# Dataset Setting
+data_loader_type: thisthat # flow / traj / thisthat
+dataset_name: Bridge # WebVid / Bridge
+dataset_path: [../datasets_rob/Bridge_v1_TT14, ../datasets_rob/Bridge_v2_TT14] # ../Bridge_filter_flow, ../Bridge_v2_filter_flow/]
+output_dir: checkpoints/img2video
+dataset_decode_fps: 2
+height: 256 # Ratios that are functional: 256:384 576:1024 320:448 320:576 512:640 448:640
+width: 384 # Height and width should be multiples of 64
+dataloader_num_workers: 4 # For debugging, 1 is enough
+flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+
+
+# Text setting
+use_text: True # If True, text prompts are used
+pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+empty_prompts_proportion: 0.0
+
+
+# Mask setting
+mask_unet_vae: False # Whether to mask latents to zero padding
+mask_controlnet_vae: True
+mask_proportion: 0.7
+
+
+# Condition Setting
+conditioning_channels: 3 # Usually it is 3
+num_points_left: # 1 # For flow: only one of flow_select_rate and num_points_left can be used; num_points_left takes priority
+flow_select_rate: 0.99 # For flow
+threshold_factor: 0.2 # For flow
+dilate: True # Traj must be True for dilate
+inner_conditioning_scale: 1.0 # Conditioning scale for the internal value; the default is 1.0
+outer_conditioning_scale: 1.0 # Outer conditioning scale for the whole trainable conditioning copy (interestingly, this was accidentally set to 2.0 at one point)
+
+
+# Motion setting
+motion_bucket_id: 200
+dataset_motion_mean: 25 # For 14 fps, it is N(25, 10)
+dataset_motion_std: 10 # For 25 fps, it is N(18, 7)
+svd_motion_mean: 180
+svd_motion_std: 30
+
+
+
+# Training setting
+resume_from_checkpoint: False # latest/False
+# /nfs/turbo/jjparkcv-turbo-large/boyangwa/saved_weights/formal_trial1/checkpoint-170000
+validation_img_folder: datasets/validation_TT14
+num_train_epochs: 10
+partial_finetune: False
+train_batch_size: 1 # This is the batch size per GPU
+checkpointing_steps: 3000
+validation_step: 250
+logging_name: logging
+enable_xformers_memory_efficient_attention: False # Keep this False; enabling it produces very low-quality images in the pipeline (possibly has no effect for this function)
+seed: 42
+validation_store_folder: validation_videos
+checkpoints_total_limit: 10
+train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance in training
+
+
+# Noise Strength
+noise_mean: 0.5
+noise_std: 1.4
+
+
+# Inference
+num_inference_steps: 25
+use_instructpix2pix: False # Whether to use the InstructPix2Pix mode, which involves 3 inputs; it may need tuning for better results
+inference_noise_aug_strength: 0.1
+inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+inference_guess_mode: False # Whether to use guess mode in the ControlNet
+image_guidance_scale: 2.5 # Empirically, 2.5 is the best value
+
+
+# Learning Rate and Optimizer
+learning_rate: 5e-6 # At most 1e-5 at the beginning
+scale_lr: False # TODO: Is it needed to scale the learning rate?
+adam_beta1: 0.9
+adam_beta2: 0.999
+use_8bit_adam: True # Needed to save memory
+adam_weight_decay: 1e-2
+adam_epsilon: 1e-08
+lr_scheduler_name: constant # None | constant | cosine | cosine_with_restarts (If it's None, the custom LR decay below is used)
+lr_warmup_steps: 500
+lr_decay_steps: []
+lr_double_steps: []
+lr_decay_scale: 0.5
+
+
+# Other Setting
+mixed_precision: fp16
+gradient_accumulation_steps: 1
+gradient_checkpointing: 1
+report_to: tensorboard
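A minimal sketch of how a training script might consume this YAML, assuming PyYAML; the keys come from the file above, but the loading convention itself is an assumption rather than this repo's actual entry point:

    import yaml

    with open("gesturenet/train_image2video_gesturenet.yaml") as f:
        config = yaml.safe_load(f)

    # A few of the settings defined above (inline comments are stripped by the YAML parser).
    print(config["pretrained_model_name_or_path"])  # stabilityai/stable-video-diffusion-img2vid
    print(config["video_seq_length"], config["height"], config["width"])
    print(config["learning_rate"], config["mixed_precision"])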
unet/config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "_class_name": "UNetSpatioTemporalConditionModel",
+  "_diffusers_version": "0.25.1",
+  "_name_or_path": "checkpoints/img2video/checkpoint-87000",
+  "addition_time_embed_dim": 256,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "DownBlockSpatioTemporal"
+  ],
+  "in_channels": 8,
+  "layers_per_block": 2,
+  "num_attention_heads": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "num_frames": 14,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 768,
+  "sample_size": 96,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal"
+  ]
+}
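Since unet/config.json declares UNetSpatioTemporalConditionModel from diffusers 0.25.1, the weights in this folder should load with the stock diffusers class. A hedged sketch; using "." as a local clone of this repo is an assumption:

    import torch
    from diffusers import UNetSpatioTemporalConditionModel

    # Load the fine-tuned SVD UNet pushed in this commit from a local clone of the repo.
    unet = UNetSpatioTemporalConditionModel.from_pretrained(
        ".", subfolder="unet", torch_dtype=torch.float16
    )
    print(unet.config.num_frames, unet.config.sample_size)  # 14, 96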
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c59f3d69e5c30e58fd9950d1dc4c0144b6afe30c5e43a31f10839f6237d7e4b
+size 6098682464
unet/train_image2video.yaml ADDED
@@ -0,0 +1,80 @@
+
+# Model Setting
+pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # the -xt variant is the 25-frame version
+load_unet_path:
+video_seq_length: 14 # Standardized to 14
+train_max_guidance_scale: 1.0 # >1.0 to use do_classifier_free_guidance
+inference_max_guidance_scale: 3.0 # Training and inference use different guidance scales
+process_fps: 7
+train_noise_aug_strength: 0.1
+scheduler: EDM
+conditioning_dropout_prob: 0.1
+
+
+# Dataset Setting
+dataset_name: Bridge # WebVid / Bridge
+dataset_path: [../datasets_rob/Bridge_v1_raw, ../datasets_rob/Bridge_v2_raw]
+output_dir: checkpoints/img2video
+height: 256 # Ratios that are functional: 256:384 576:1024 320:512 320:576
+width: 384 # Height and width should be multiples of 64
+dataloader_num_workers: 2 # For debugging, fewer workers suffice
+flip_aug_prob: 0.45 # Probability of vertically flipping the GT and the condition
+acceleration_tolerance: 5
+
+
+# Text setting
+use_text: True # If True, text prompts are used
+pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
+empty_prompts_proportion: 0.0 # Unused now; CFG is already applied in training
+
+
+# Motion setting (be very careful about the 14 vs. 25 fps settings)
+motion_bucket_id: 200 # Set this for an exact value; if it is none, the settings below are used
+dataset_motion_mean: 35.3 # For 14 fps, it is N(35.3, 18.5)
+dataset_motion_std: 18.5 # For 25 fps, it is N(?, ?)
+svd_motion_mean: 165
+svd_motion_std: 22.5
+
+
+# Training setting
+resume_from_checkpoint: False # latest/False
+num_train_epochs: 40
+partial_finetune: False # Whether to tune only the central parameters to speed up training
+train_batch_size: 1 # This is the batch size per GPU
+use_ema: False
+checkpointing_steps: 3000
+validation_step: 300
+logging_name: logging
+enable_xformers_memory_efficient_attention: False # Keep this False; enabling it produces very low-quality images in the pipeline (possibly has no effect for this function)
+seed: 42
+validation_img_folder: datasets/validation_raw
+validation_store_folder: validation_videos
+checkpoints_total_limit: 20
+
+# Noise Strength
+noise_mean: 0.5 # Regular Img2Video: (0.7, 1.6); Text2Video: (0.5, 1.4)
+noise_std: 1.4
+
+
+# Inference
+num_inference_steps: 25
+inference_noise_aug_strength: 0.1
+
+
+# Learning Rate and Optimizer
+learning_rate: 1e-5 # Usually this is fine
+scale_lr: False # TODO: Is it needed to scale the learning rate?
+adam_beta1: 0.9
+adam_beta2: 0.999
+use_8bit_adam: True # Needed to save memory
+adam_weight_decay: 1e-2
+adam_epsilon: 1e-08
+lr_warmup_steps: 500
+lr_decay_steps: 7000 # UNet training usually uses a constant LR
+
+
+# Other Setting
+mixed_precision: fp16
+gradient_accumulation_steps: 1 # ????
+gradient_checkpointing: 1 # ????
+report_to: tensorboard
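The motion settings suggest that, when motion_bucket_id is not fixed, a per-clip motion score is standardized against the dataset statistics and rescaled onto SVD's motion-bucket distribution. The commit does not show that code, so the snippet below is only a guess at the mapping, reusing the mean/std values from the YAML above:

    # Hypothetical remapping of a per-clip motion score onto SVD's motion-bucket range.
    # This is an assumption about how dataset_motion_mean/std and svd_motion_mean/std are used;
    # the actual training code may differ.
    def motion_to_bucket(clip_motion: float,
                         dataset_mean: float = 35.3, dataset_std: float = 18.5,
                         svd_mean: float = 165.0, svd_std: float = 22.5) -> int:
        z = (clip_motion - dataset_mean) / dataset_std  # standardize against the dataset stats
        return round(svd_mean + z * svd_std)            # rescale into SVD's bucket distribution

    print(motion_to_bucket(35.3))  # a clip with average motion maps to ~165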