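"""Hugging Face-style configuration for an EVA-CLIP vision tower.

``DEFAULT_VISION_CFG`` describes the ``eva-clip-E-14-plus`` architecture;
individual keys can be overridden by passing ``vision_cfg`` to
``EvaClipVisionConfig``.
"""
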
from __future__ import annotations

from copy import deepcopy
from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig

DEFAULT_MODEL_NAME = "eva-clip-E-14-plus"

# Architecture hyperparameters for the default "eva-clip-E-14-plus" tower.
DEFAULT_VISION_CFG: Dict[str, Any] = {
    "eva_model_name": DEFAULT_MODEL_NAME,
    "image_size": 448,  # input resolution in pixels
    "patch_size": 14,  # 448 / 14 = 32 patches per side
    "width": 1792,  # transformer hidden size
    "layers": 64,  # number of transformer blocks
    "mlp_ratio": 8.571428571428571,  # 60/7, i.e. an MLP hidden size of 1792 * 60/7 = 15360
    "head_width": 112,  # dimension per head; 1792 / 112 = 16 attention heads
    "drop_path_rate": 0.0,  # stochastic depth (disabled)
    "qkv_bias": True,  # bias terms on the QKV projections
    "xattn": True,  # memory-efficient (xformers) attention
    "postnorm": True,  # post-LayerNorm transformer blocks
    "global_average_pool": False,  # pool via the class token, not mean pooling
    "patch_dropout": 0.0,  # fraction of patches dropped during training (disabled)
    "rope": False,  # rotary position embeddings (disabled)
    "pt_hw_seq_len": 32,  # pretraining grid side length, used when RoPE is enabled
    "intp_freq": False,  # interpolate RoPE frequencies for new resolutions
    "naiveswiglu": False,  # SwiGLU feed-forward instead of a plain MLP
    "subln": False,  # sub-LayerNorm inside blocks
    "fusedLN": False,  # fused (apex) LayerNorm kernels
}


class EvaClipVisionConfig(PretrainedConfig):
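    """Configuration for the EVA-CLIP vision encoder.

    Keys passed via ``vision_cfg`` override the matching entries of
    ``DEFAULT_VISION_CFG``; all other keys keep their defaults. When
    ``projection_dim`` is omitted it falls back to ``embed_dim``.
    """
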
    model_type = "eva-clip-vision"

    def __init__(
        self,
        *,
        vision_tower: str = DEFAULT_MODEL_NAME,
        embed_dim: int = 1024,
        vision_cfg: Optional[Dict[str, Any]] = None,
        vision_tower_pretrained: Optional[str] = None,
        projection_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

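        # Merge caller overrides into a deep copy so the module-level
        # defaults are never mutated.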
        cfg = deepcopy(DEFAULT_VISION_CFG)
        if vision_cfg is not None:
            cfg.update(vision_cfg)

        self.vision_tower = vision_tower
        self.embed_dim = embed_dim
        self.vision_cfg = cfg
        self.vision_tower_pretrained = vision_tower_pretrained
        self.projection_dim = projection_dim if projection_dim is not None else embed_dim


__all__ = ["EvaClipVisionConfig", "DEFAULT_VISION_CFG", "DEFAULT_MODEL_NAME"]
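
# A minimal usage sketch (assumes only this module plus `transformers` is
# available; the override values below are illustrative, not canonical):
if __name__ == "__main__":
    config = EvaClipVisionConfig(
        vision_cfg={"image_size": 336, "drop_path_rate": 0.1},
        projection_dim=768,
    )
    assert config.vision_cfg["image_size"] == 336  # override applied
    assert config.vision_cfg["patch_size"] == 14  # untouched keys keep their defaults
    assert config.projection_dim == 768
    assert EvaClipVisionConfig().projection_dim == 1024  # falls back to embed_dim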