from __future__ import annotations

from copy import deepcopy
from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig

DEFAULT_MODEL_NAME = "eva-clip-E-14-plus"

# Default architecture hyperparameters for the EVA-CLIP "E/14-plus" vision tower.
DEFAULT_VISION_CFG: Dict[str, Any] = {
    "eva_model_name": DEFAULT_MODEL_NAME,
    "image_size": 448,
    "patch_size": 14,
    "width": 1792,
    "layers": 64,
    "mlp_ratio": 8.571428571428571,
    "head_width": 112,
    "drop_path_rate": 0.0,
    "qkv_bias": True,
    "xattn": True,
    "postnorm": True,
    "global_average_pool": False,
    "patch_dropout": 0.0,
    "rope": False,
    "pt_hw_seq_len": 32,
    "intp_freq": False,
    "naiveswiglu": False,
    "subln": False,
    "fusedLN": False,
}


class EvaClipVisionConfig(PretrainedConfig):
    """Configuration for an EVA-CLIP vision tower.

    User-supplied ``vision_cfg`` entries are merged over ``DEFAULT_VISION_CFG``,
    so callers only need to pass the keys they want to override.
    """

    model_type = "eva-clip-vision"

    def __init__(
        self,
        *,
        vision_tower: str = DEFAULT_MODEL_NAME,
        embed_dim: int = 1024,
        vision_cfg: Optional[Dict[str, Any]] = None,
        vision_tower_pretrained: Optional[str] = None,
        projection_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Copy the defaults so per-instance overrides never mutate the module-level dict.
        cfg = deepcopy(DEFAULT_VISION_CFG)
        if vision_cfg is not None:
            cfg.update(vision_cfg)
        self.vision_tower = vision_tower
        self.embed_dim = embed_dim
        self.vision_cfg = cfg
        self.vision_tower_pretrained = vision_tower_pretrained
        # Default the projection width to the embedding width when not set explicitly.
        self.projection_dim = projection_dim if projection_dim is not None else embed_dim


__all__ = ["EvaClipVisionConfig", "DEFAULT_VISION_CFG", "DEFAULT_MODEL_NAME"]
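

# --- Usage sketch (illustrative only, not part of the public API) ---
# A minimal example of the override-and-merge behavior above. The checkpoint
# path below is a hypothetical placeholder, not a known published weight file.
if __name__ == "__main__":
    config = EvaClipVisionConfig(
        embed_dim=1024,
        vision_cfg={"image_size": 336, "drop_path_rate": 0.1},  # overrides only these keys
        vision_tower_pretrained="/path/to/eva_clip_e14_plus.pt",  # hypothetical path
    )
    assert config.vision_cfg["image_size"] == 336      # overridden by the caller
    assert config.vision_cfg["patch_size"] == 14       # inherited from DEFAULT_VISION_CFG
    assert config.projection_dim == config.embed_dim   # defaulted to embed_dim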