File size: 1,600 Bytes
f335052
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

from transformers import PretrainedConfig

class Talk2DINOConfig(PretrainedConfig):
    """HuggingFace configuration for the Talk2DINO model.

    Bundles the settings for the DINOv2 vision backbone, the CLIP text
    encoder and the text-to-vision projection layer. Every explicit
    argument is stored verbatim as an instance attribute of the same
    name; any extra keyword arguments are forwarded to
    ``PretrainedConfig``.

    NOTE(review): the ``type`` argument shadows the ``type`` builtin,
    but it cannot be renamed without breaking keyword callers and
    serialized config files — confirm before ever changing it.
    """

    # Identifier under which this config is registered with the
    # HF auto-class machinery.
    model_type = "talk2dino"

    def __init__(
        self,
        avg_self_attn_token=False,
        clip_model_name="ViT-B/16",
        disentangled_self_attn_token=True,
        is_eval=True,
        keep_cls=False,
        keep_end_seq=False,
        loss=None,
        model_name="dinov2_vitb14_reg",
        pre_trained=True,
        proj_class="vitb_mlp_infonce",
        proj_model="ProjectionLayer",
        proj_name="vitb_mlp_infonce",
        resize_dim=518,
        type="DINOText",
        unfreeze_last_image_layer=False,
        unfreeze_last_text_layer=False,
        use_avg_text_token=False,
        with_bg_clean=False,
        **kwargs,
    ):
        # Base class consumes the remaining standard HF config kwargs.
        super().__init__(**kwargs)

        # Mirror each explicit argument onto the instance under the same
        # name. A single mapping + setattr loop keeps the list easy to
        # scan and extend; setattr routes through __setattr__ exactly
        # like direct assignment would.
        explicit_args = {
            "avg_self_attn_token": avg_self_attn_token,
            "clip_model_name": clip_model_name,
            "disentangled_self_attn_token": disentangled_self_attn_token,
            "is_eval": is_eval,
            "keep_cls": keep_cls,
            "keep_end_seq": keep_end_seq,
            "loss": loss,
            "model_name": model_name,
            "pre_trained": pre_trained,
            "proj_class": proj_class,
            "proj_model": proj_model,
            "proj_name": proj_name,
            "resize_dim": resize_dim,
            "type": type,
            "unfreeze_last_image_layer": unfreeze_last_image_layer,
            "unfreeze_last_text_layer": unfreeze_last_text_layer,
            "use_avg_text_token": use_avg_text_token,
            "with_bg_clean": with_bg_clean,
        }
        for attr_name, attr_value in explicit_args.items():
            setattr(self, attr_name, attr_value)