{ "architectures": [ "VibeVoiceSemanticTokenizerModel" ], "bias": true, "channels": 1, "depths": [ 3, 3, 3, 3, 3, 3, 8 ], "downsampling_ratios": [ 2, 2, 4, 5, 5, 8 ], "dtype": "bfloat16", "ffn_expansion": 4, "hidden_act": "gelu", "hidden_size": 128, "kernel_size": 7, "layer_scale_init_value": 1e-06, "model_type": "vibevoice_semantic_tokenizer", "n_filters": 32, "rms_norm_eps": 1e-05, "transformers_version": "5.0.0.dev0", "weight_init_value": 0.01 }