Commit 8de3e36 by hqfang
Parent: a9c11d6

update weights from bf16 to fp32

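This commit re-saves the checkpoint with float32 weights in place of bfloat16. As a minimal sketch of what such a conversion looks like, assuming the checkpoint is stored as sharded safetensors files (the directory names below are placeholders, not paths from this repo):

```python
# Sketch only: cast every bfloat16 tensor in each safetensors shard to float32.
# Note the real conversion also re-sharded the checkpoint (4 -> 7 files), so it
# was more likely produced by re-saving the whole model (e.g. save_pretrained)
# rather than shard-by-shard like this.
import glob
import os

import torch
from safetensors.torch import load_file, save_file

src_dir = "checkpoint_bf16"  # placeholder input directory
dst_dir = "checkpoint_fp32"  # placeholder output directory
os.makedirs(dst_dir, exist_ok=True)

for shard in sorted(glob.glob(os.path.join(src_dir, "model-*.safetensors"))):
    tensors = load_file(shard)
    tensors = {
        name: t.to(torch.float32) if t.dtype == torch.bfloat16 else t
        for name, t in tensors.items()
    }
    save_file(tensors, os.path.join(dst_dir, os.path.basename(shard)))
```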
config.json CHANGED
@@ -117,7 +117,7 @@
    }
  },
  "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
+ "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "use_cache": true,
  "vit_config": {
model-00001-of-00004.safetensors → model-00001-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fc6cee860646e6245edd505fef7896a8ebc5fd47d6e260852699ffcb351d4119
- size 4975847688
+ oid sha256:22b149604bdd0e6d5a1eab6a7764e9bc001ba2eaefd1f6506761746069027893
+ size 4892921912
model-00002-of-00004.safetensors → model-00002-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b3b707bb662766ec4202e82fa9245ffefc1f9ec40e4aebe011426ea2459e463b
- size 4890972104
+ oid sha256:4308afe5a8c8c3e9e17cae20b9a4ef5ec81bf9f959511047855b2e9ae617784a
+ size 4857403152
model-00003-of-00004.safetensors → model-00003-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a772516b69576965758f7683d47a10ef81916bcf7c52e10836329e14524a6ff3
- size 4620250280
+ oid sha256:6564f9fdaaa424eaadfb33cf7e1a10ad9c6c7a0af1b90085fe8aae833ed52792
+ size 4857403200
model-00004-of-00004.safetensors → model-00004-of-00007.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:65446424dc3ae3bfe8675c254936da139406793f5329b1db40466a6f9ba02b17
- size 826278016
+ oid sha256:734aa8101f1689d7e41f8cbfaff539248712aa879d21a03578d37839b6107c9d
+ size 4857403200
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b024919352d9a1adddfe53bf0accf25a86ceb4168fd9c95d5ff434df4c4bad5
+ size 4857403200
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90ae62928101ae357c6c7eb3c89bf9c9919dc9e5056a4f8b4be9aa16b343f051
+ size 4651517480
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1656295cc35ed895cb50a7557fc13c5619718f01c1df02f32e80a4625011bf9e
+ size 1652555904
model.safetensors.index.json CHANGED
The diff for this file is too large to render.
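The index changes follow directly from the size change: the float32 checkpoint is roughly twice the size of the bfloat16 one, which is why the weights were re-sharded from 4 to 7 files, each staying under ~5 GB. Quick arithmetic on the shard sizes listed in this diff:

```python
# Sum the byte sizes from the LFS pointers above: the fp32 total is ~2x the bf16 total.
bf16_shards = [4975847688, 4890972104, 4620250280, 826278016]
fp32_shards = [4892921912, 4857403152, 4857403200, 4857403200,
               4857403200, 4651517480, 1652555904]

print(sum(bf16_shards) / 1e9)               # ~15.3 GB
print(sum(fp32_shards) / 1e9)               # ~30.6 GB
print(sum(fp32_shards) / sum(bf16_shards))  # ~2.0
```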
 
model.yaml ADDED
@@ -0,0 +1,195 @@
+ model_name: molmo
+ llm:
+   d_model: 4096
+   n_heads: 32
+   n_kv_heads: null
+   head_dim: null
+   qkv_bias: false
+   clip_qkv: null
+   n_layers: 32
+   mlp_ratio: 4
+   mlp_hidden_size: 22016
+   activation_type: swiglu
+   block_type: sequential
+   rope: true
+   rope_full_precision: true
+   rope_theta: 500000.0
+   rope_type: default
+   rope_factor: null
+   rope_high_freq_factor: null
+   rope_low_freq_factor: null
+   rope_original_max_position_embeddings: null
+   attention_type: sdpa
+   float32_attention: true
+   attention_dropout: 0.0
+   attention_layer_norm: true
+   attention_layer_norm_type: olmo
+   residual_dropout: 0.1
+   response_residual_dropout: 0.0
+   layer_norm_type: rms
+   layer_norm_with_affine: true
+   layer_norm_eps: 1.0e-06
+   attention_layer_norm_with_affine: true
+   max_sequence_length: 4096
+   max_position_embeddings: null
+   include_bias: false
+   bias_for_layer_norm: false
+   norm_after: true
+   moe_num_experts: 8
+   moe_top_k: 2
+   moe_mlp_impl: sparse
+   moe_log_expert_assignment: false
+   moe_shared_expert: false
+   moe_lbl_in_fp32: false
+   moe_interleave: false
+   moe_loss_weight: 0.1
+   moe_zloss_weight: null
+   moe_dropless: true
+   moe_capacity_factor: 1.25
+   embedding_dropout: 0.0
+   scale_logits: false
+   vocab_size: 100278
+   additional_vocab_size: 128
+   weight_tying: false
+   embedding_size: 100864
+   use_position_ids: true
+   tokenizer:
+     identifier: allenai/OLMo-2-1124-7B
+     tokenizer_dir: null
+     depth_tokens: true
+   init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/olmo2-1124-7b.pt
+   init_incremental: null
+   new_embedding_init_range: 0.02
+   initializer_range: 0.02
+   normalize_input_embeds: false
+   activation_checkpoint: whole_layer
+   compile: blocks
+   fix_pad_tokenizer: true
+   resize_vocab: true
+   init_std: 0.02
+   init_fn: normal
+   init_cutoff_factor: null
+ vision_backbone:
+   vit:
+     image_model_type: openai
+     image_default_input_size:
+     - 336
+     - 336
+     image_patch_size: 14
+     image_pos_patch_size: 14
+     image_emb_dim: 1024
+     image_num_heads: 16
+     image_num_key_value_heads: 16
+     image_num_layers: 23
+     image_head_dim: 64
+     image_mlp_dim: 4096
+     image_mlp_activations: quick_gelu
+     image_dropout_rate: 0.0
+     image_num_pos: 577
+     image_norm_eps: 1.0e-05
+     attention_dropout: 0.0
+     residual_dropout: 0.0
+     initializer_range: 0.02
+     float32_attention: true
+     attention_type: sdpa
+     activation_checkpointing: true
+     init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
+     resize_mode: default
+     pad_value: 0.0
+     normalize: openai
+   image_pooling_2d: attention_meanq
+   pooling_attention_mask: false
+   image_projector: mlp
+   image_padding_embed: pad_and_partial_pad
+   vit_layers:
+   - -2
+   - -9
+   skip_unused_layers: true
+   image_feature_dropout: 0.0
+   connector_activation_checkpointing: true
+   compile_vit: blocks
+ data_formatter:
+   prompt_templates: uber_model
+   message_format: role
+   system_prompt: demo_or_style
+   always_start_with_space: false
+   default_inference_len: 65
+   select_answer: best
+   debug: false
+   image_last: false
+   format_message_list: null
+   p_one_message: 0.0
+ mm_preprocessor:
+   crop_mode: overlap-and-resize-c2
+   max_crops: 8
+   max_images: 2
+   max_multi_image_crops: 8
+   pooling_w: 2
+   pooling_h: 2
+   overlap_margins:
+   - 4
+   - 4
+   use_col_tokens: true
+   loss_token_weighting: root_subsegments
+   legacy_image_mask: false
+   max_answer_len: null
+   img_aug: false
+ bi_directional_attn: null
+ lora_enable: false
+ lora_rank: 64
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_bias: none
+ n_action_bins: 256
+ norm_stats:
+   molmoact:
+     action:
+       mean:
+       - 0.0005706787342205644
+       - 0.0002448957529850304
+       - -3.5987635783385485e-05
+       - 0.00021597897284664214
+       - -0.0004896928439848125
+       - -0.000241481073317118
+       - 0.5570635199546814
+       std:
+       - 0.005207270849496126
+       - 0.007506529800593853
+       - 0.006415561307221651
+       - 0.013248044066131115
+       - 0.010928540490567684
+       - 0.014873150736093521
+       - 0.49715080857276917
+       min:
+       - -0.07434078305959702
+       - -0.07339745759963989
+       - -0.06539416313171387
+       - -0.1688285619020462
+       - -0.10289879888296127
+       - -0.2667275667190552
+       - 0.0
+       max:
+       - 0.06042003631591797
+       - 0.09417290985584259
+       - 0.07019275426864624
+       - 0.2616892158985138
+       - 0.11751057207584381
+       - 0.16968433558940887
+       - 1.0
+       q01:
+       - -0.01538565568625927
+       - -0.021047022193670273
+       - -0.01688069850206375
+       - -0.044314172118902206
+       - -0.03890235349535942
+       - -0.04788423702120781
+       - 0.0
+       q99:
+       - 0.014661382883787155
+       - 0.026515591889619827
+       - 0.021398313343524933
+       - 0.04216696694493294
+       - 0.03401297703385353
+       - 0.04957397282123566
+       - 1.0
+     num_entries: 1560068
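The `norm_stats` block carries per-dimension statistics for the 7-D action vector, computed over 1,560,068 entries. As an illustration only, here is how q01/q99 statistics like these are commonly used to un-normalize predicted actions; whether this exact convention matches this repo's training code is an assumption, and the helper below is not part of the repo:

```python
# Sketch only: map a normalized action in [-1, 1] back to the [q01, q99] range
# per dimension, a common convention for action de-normalization in VLA pipelines.
import numpy as np

q01 = np.array([-0.01538565568625927, -0.021047022193670273, -0.01688069850206375,
                -0.044314172118902206, -0.03890235349535942, -0.04788423702120781, 0.0])
q99 = np.array([0.014661382883787155, 0.026515591889619827, 0.021398313343524933,
                0.04216696694493294, 0.03401297703385353, 0.04957397282123566, 1.0])

def unnormalize(action_norm: np.ndarray) -> np.ndarray:
    """Map a normalized 7-D action from [-1, 1] to the original action range."""
    return 0.5 * (action_norm + 1.0) * (q99 - q01) + q01

print(unnormalize(np.zeros(7)))  # midpoint of each [q01, q99] interval
```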