from transformers import PretrainedConfig


class Talk2DINOConfig(PretrainedConfig):
    model_type = "talk2dino"

    def __init__(
        self,
        avg_self_attn_token=False,
        clip_model_name="ViT-B/16",
        disentangled_self_attn_token=True,
        is_eval=True,
        keep_cls=False,
        keep_end_seq=False,
        loss=None,
        model_name="dinov2_vitb14_reg",
        pre_trained=True,
        proj_class="vitb_mlp_infonce",
        proj_model="ProjectionLayer",
        proj_name="vitb_mlp_infonce",
        resize_dim=518,
        type="DINOText",
        unfreeze_last_image_layer=False,
        unfreeze_last_text_layer=False,
        use_avg_text_token=False,
        with_bg_clean=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Store all parameters
        self.avg_self_attn_token = avg_self_attn_token
        self.clip_model_name = clip_model_name
        self.disentangled_self_attn_token = disentangled_self_attn_token
        self.is_eval = is_eval
        self.keep_cls = keep_cls
        self.keep_end_seq = keep_end_seq
        self.loss = loss
        self.model_name = model_name
        self.pre_trained = pre_trained
        self.proj_class = proj_class
        self.proj_model = proj_model
        self.proj_name = proj_name
        self.resize_dim = resize_dim
        self.type = type
        self.unfreeze_last_image_layer = unfreeze_last_image_layer
        self.unfreeze_last_text_layer = unfreeze_last_text_layer
        self.use_avg_text_token = use_avg_text_token
        self.with_bg_clean = with_bg_clean
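

# Usage sketch (illustrative, not part of the original module): because the class
# subclasses PretrainedConfig, it can be registered with AutoConfig, serialized,
# and reloaded with the standard Hugging Face config API. The output directory
# "./talk2dino_config" below is a hypothetical example path.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Optional: expose the config under its model_type key so that
    # AutoConfig.from_pretrained can resolve "talk2dino" automatically.
    AutoConfig.register("talk2dino", Talk2DINOConfig)

    config = Talk2DINOConfig(resize_dim=518)
    config.save_pretrained("./talk2dino_config")  # writes config.json
    reloaded = Talk2DINOConfig.from_pretrained("./talk2dino_config")
    print(reloaded.model_name, reloaded.proj_class)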