|
|
|
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
class Talk2DINOConfig(PretrainedConfig):
    """Configuration for the Talk2DINO model.

    Bundles the settings needed to build a Talk2DINO model: the DINOv2
    image backbone, the CLIP text encoder, the text-to-image projection
    layer, and a handful of token-selection / training flags. Every
    keyword argument is stored verbatim as an attribute of the same name;
    unrecognized kwargs are forwarded to ``PretrainedConfig``.

    NOTE(review): the keyword ``type`` shadows the ``type`` builtin, but it
    is part of the public serialization interface and must keep its name.
    The semantics of most flags (e.g. ``with_bg_clean``,
    ``disentangled_self_attn_token``) are defined by the model code that
    consumes this config — not visible here; defaults below simply mirror
    the reference checkpoint's settings.
    """

    # Identifier used by the transformers AutoConfig registry.
    model_type = "talk2dino"

    def __init__(
        self,
        avg_self_attn_token=False,
        clip_model_name="ViT-B/16",
        disentangled_self_attn_token=True,
        is_eval=True,
        keep_cls=False,
        keep_end_seq=False,
        loss=None,
        model_name="dinov2_vitb14_reg",
        pre_trained=True,
        proj_class="vitb_mlp_infonce",
        proj_model="ProjectionLayer",
        proj_name="vitb_mlp_infonce",
        resize_dim=518,
        type="DINOText",
        unfreeze_last_image_layer=False,
        unfreeze_last_text_layer=False,
        use_avg_text_token=False,
        with_bg_clean=False,
        **kwargs,
    ):
        # Let the base class consume any extra/common kwargs first.
        super().__init__(**kwargs)

        # Record every explicit keyword as a same-named attribute so the
        # config round-trips cleanly through to_dict()/from_dict().
        explicit = {
            "avg_self_attn_token": avg_self_attn_token,
            "clip_model_name": clip_model_name,
            "disentangled_self_attn_token": disentangled_self_attn_token,
            "is_eval": is_eval,
            "keep_cls": keep_cls,
            "keep_end_seq": keep_end_seq,
            "loss": loss,
            "model_name": model_name,
            "pre_trained": pre_trained,
            "proj_class": proj_class,
            "proj_model": proj_model,
            "proj_name": proj_name,
            "resize_dim": resize_dim,
            "type": type,
            "unfreeze_last_image_layer": unfreeze_last_image_layer,
            "unfreeze_last_text_layer": unfreeze_last_text_layer,
            "use_avg_text_token": use_avg_text_token,
            "with_bg_clean": with_bg_clean,
        }
        for attr_name, attr_value in explicit.items():
            setattr(self, attr_name, attr_value)
|
|