"""HF-style configuration for the EVA-CLIP vision tower."""
from __future__ import annotations

from copy import deepcopy
from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig

DEFAULT_MODEL_NAME = "eva-clip-E-14-plus"

# Default hyperparameters for the EVA-CLIP E/14+ vision tower.
DEFAULT_VISION_CFG: Dict[str, Any] = {
    "eva_model_name": DEFAULT_MODEL_NAME,
    "image_size": 448,
    "patch_size": 14,
    "width": 1792,
    "layers": 64,
    "mlp_ratio": 8.571428571428571,
    "head_width": 112,
    "drop_path_rate": 0.0,
    "qkv_bias": True,
    "xattn": True,
    "postnorm": True,
    "global_average_pool": False,
    "patch_dropout": 0.0,
    "rope": False,
    "pt_hw_seq_len": 32,
    "intp_freq": False,
    "naiveswiglu": False,
    "subln": False,
    "fusedLN": False,
}
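
# Sanity-check sketch, not part of the original module: assuming the tower
# derives head count and token count the conventional ViT way, the defaults
# give 1792 // 112 == 16 attention heads and a 32x32 == 1024-token patch grid.
assert DEFAULT_VISION_CFG["width"] // DEFAULT_VISION_CFG["head_width"] == 16
assert (DEFAULT_VISION_CFG["image_size"] // DEFAULT_VISION_CFG["patch_size"]) ** 2 == 1024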

class EvaClipVisionConfig(PretrainedConfig):
    """Configuration holding the EVA-CLIP vision tower hyperparameters.

    Keys missing from ``vision_cfg`` fall back to ``DEFAULT_VISION_CFG``;
    ``projection_dim`` falls back to ``embed_dim`` when not given.
    """

    model_type = "eva-clip-vision"

    def __init__(
        self,
        *,
        vision_tower: str = DEFAULT_MODEL_NAME,
        embed_dim: int = 1024,
        vision_cfg: Optional[Dict[str, Any]] = None,
        vision_tower_pretrained: Optional[str] = None,
        projection_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Start from the defaults and overlay any user-provided overrides;
        # deepcopy keeps the module-level default dict from being mutated.
        cfg = deepcopy(DEFAULT_VISION_CFG)
        if vision_cfg is not None:
            cfg.update(vision_cfg)
        self.vision_tower = vision_tower
        self.embed_dim = embed_dim
        self.vision_cfg = cfg
        self.vision_tower_pretrained = vision_tower_pretrained
        self.projection_dim = projection_dim if projection_dim is not None else embed_dim


__all__ = ["EvaClipVisionConfig", "DEFAULT_VISION_CFG", "DEFAULT_MODEL_NAME"]
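

# A minimal usage sketch, not part of the original module: overriding one
# vision_cfg key while keeping the other defaults. All names below are the
# module's own; nothing hypothetical is introduced.
if __name__ == "__main__":
    cfg = EvaClipVisionConfig(vision_cfg={"image_size": 336})
    assert cfg.vision_cfg["image_size"] == 336      # override applied
    assert cfg.vision_cfg["patch_size"] == 14       # unset keys keep their defaults
    assert DEFAULT_VISION_CFG["image_size"] == 448  # module-level defaults untouched
    print(cfg.projection_dim)                       # 1024: falls back to embed_dim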