from __future__ import annotations

from copy import deepcopy
from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig

DEFAULT_MODEL_NAME = "eva-clip-E-14-plus"
# Default vision-tower hyperparameters for the "eva-clip-E-14-plus" model.
DEFAULT_VISION_CFG: Dict[str, Any] = {
    "eva_model_name": DEFAULT_MODEL_NAME,
    "image_size": 448,
    "patch_size": 14,
    "width": 1792,
    "layers": 64,
    "mlp_ratio": 8.571428571428571,
    "head_width": 112,
    "drop_path_rate": 0.0,
    "qkv_bias": True,
    "xattn": True,
    "postnorm": True,
    "global_average_pool": False,
    "patch_dropout": 0.0,
    "rope": False,
    "pt_hw_seq_len": 32,
    "intp_freq": False,
    "naiveswiglu": False,
    "subln": False,
    "fusedLN": False,
}

class EvaClipVisionConfig(PretrainedConfig):
    """Configuration for the EVA-CLIP vision tower."""

    model_type = "eva-clip-vision"

    def __init__(
        self,
        *,
        vision_tower: str = DEFAULT_MODEL_NAME,
        embed_dim: int = 1024,
        vision_cfg: Optional[Dict[str, Any]] = None,
        vision_tower_pretrained: Optional[str] = None,
        projection_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Merge caller overrides into a copy of the defaults so the
        # module-level DEFAULT_VISION_CFG is never mutated.
        cfg = deepcopy(DEFAULT_VISION_CFG)
        if vision_cfg is not None:
            cfg.update(vision_cfg)
        self.vision_tower = vision_tower
        self.embed_dim = embed_dim
        self.vision_cfg = cfg
        self.vision_tower_pretrained = vision_tower_pretrained
        # Fall back to embed_dim when no explicit projection width is given.
        self.projection_dim = projection_dim if projection_dim is not None else embed_dim

__all__ = ["EvaClipVisionConfig", "DEFAULT_VISION_CFG", "DEFAULT_MODEL_NAME"]
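
# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal example of how the override-merge behaves; the override values
# below are arbitrary assumptions for demonstration, not recommended settings.
if __name__ == "__main__":
    config = EvaClipVisionConfig(
        vision_cfg={"image_size": 224, "drop_path_rate": 0.1},
    )
    print(config.vision_cfg["image_size"])   # 224 -- override applied
    print(config.vision_cfg["patch_size"])   # 14  -- default preserved
    print(config.projection_dim)             # 1024 -- falls back to embed_dim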