common:
  # The number of historical images
  img_history_size: 2
  # The number of future actions to predict
  action_chunk_size: 64
  # The number of cameras used by the model
  num_cameras: 3
  # Dimension of the state/action vector; we use the same space for both
  # state and action.
  # This MUST match the dimension defined in configs/state_vec.py
  state_dim: 128

dataset:
  # A producer process extracts data from the raw dataset and stores it
  # in a disk buffer; during training, a consumer reads samples randomly
  # from that buffer. The producer replaces data that the consumer has
  # already read with new data.
  # The path to the buffer (requires at least 400 GB of free space)
  buf_path: /home/jellyho/RDTBuffer
  # The number of chunks in the buffer
  buf_num_chunks: 128
  # The number of samples (steps rather than episodes) in each chunk
  buf_chunk_size: 128
  # Episodes shorter than `epsd_len_thresh_low` steps are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high`, we randomly sample
  # `epsd_len_thresh_high` steps each time the episode is loaded,
  # to better balance the training datasets
  epsd_len_thresh_high: 2048
  # How to fit images to the model's input size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024

model:
  # Config for the condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
  lang_token_dim: 4096
  img_token_dim: 1152
  # Dimension of an action or proprioception vector;
  # a `state` refers to either an action or a proprioception vector
  state_token_dim: 128
  # Config for the RDT structure
  rdt:
    # 1B model: num_heads 32, hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal
  # Config for the noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
    beta_schedule: squaredcos_cap_v2 # Critical choice
    prediction_type: sample
    clip_sample: False
  # Config for EMA (parameter averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999
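
# ---------------------------------------------------------------------
# Usage note (a minimal sketch, not part of the original config): the
# training code is assumed to load this file with PyYAML and to build
# its noise scheduler from the `model.noise_scheduler` block, e.g. via
# diffusers' DDPMScheduler. `yaml.safe_load`, `DDPMScheduler`, and
# `set_timesteps` are real APIs; the file path and variable names
# below are illustrative, not taken from the training code itself.
#
#   import yaml
#   from diffusers import DDPMScheduler
#
#   with open("configs/base.yaml") as f:
#       config = yaml.safe_load(f)
#
#   ns = config["model"]["noise_scheduler"]
#   scheduler = DDPMScheduler(
#       num_train_timesteps=ns["num_train_timesteps"],
#       beta_schedule=ns["beta_schedule"],       # squaredcos_cap_v2
#       prediction_type=ns["prediction_type"],   # predict the clean sample
#       clip_sample=ns["clip_sample"],
#   )
#   # At inference, denoise with far fewer steps than at training:
#   scheduler.set_timesteps(ns["num_inference_timesteps"])
# ---------------------------------------------------------------------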