# EVA-CLIP-E14-Plus / configuration_eva_clip.py
from __future__ import annotations
from copy import deepcopy
from typing import Any, Dict, Optional
from transformers.configuration_utils import PretrainedConfig
DEFAULT_MODEL_NAME = "eva-clip-E-14-plus"

# Default hyperparameters for the EVA-CLIP-E/14+ vision tower.
DEFAULT_VISION_CFG: Dict[str, Any] = {
    "eva_model_name": DEFAULT_MODEL_NAME,
    "image_size": 448,  # input resolution (448 x 448 pixels)
    "patch_size": 14,  # ViT patch size; 448 / 14 = 32 patches per side
    "width": 1792,  # transformer hidden size
    "layers": 64,  # number of transformer blocks
    "mlp_ratio": 8.571428571428571,  # MLP hidden size = width * mlp_ratio = 15360
    "head_width": 112,  # per-head dimension (1792 / 112 = 16 heads)
    "drop_path_rate": 0.0,  # stochastic-depth drop rate
    "qkv_bias": True,  # learn biases on the QKV projections
    "xattn": True,  # use memory-efficient (xformers) attention
    "postnorm": True,  # post-LayerNorm block ordering
    "global_average_pool": False,  # False: pool with the [CLS] token
    "patch_dropout": 0.0,  # fraction of patch tokens dropped during training
    "rope": False,  # rotary position embeddings
    "pt_hw_seq_len": 32,  # pretraining patch-grid side length
    "intp_freq": False,  # interpolate RoPE frequencies at new resolutions
    "naiveswiglu": False,  # SwiGLU feed-forward variant
    "subln": False,  # sub-LayerNorm inside blocks
    "fusedLN": False,  # fused LayerNorm kernels
}


class EvaClipVisionConfig(PretrainedConfig):
    """Configuration for the EVA-CLIP vision tower.

    Entries passed via ``vision_cfg`` are merged over ``DEFAULT_VISION_CFG``,
    so callers only need to supply the fields they want to override.
    """

    model_type = "eva-clip-vision"

    def __init__(
        self,
        *,
        vision_tower: str = DEFAULT_MODEL_NAME,
        embed_dim: int = 1024,
        vision_cfg: Optional[Dict[str, Any]] = None,
        vision_tower_pretrained: Optional[str] = None,
        projection_dim: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Merge user overrides onto a copy of the defaults so the
        # module-level dict is never mutated.
        cfg = deepcopy(DEFAULT_VISION_CFG)
        if vision_cfg is not None:
            cfg.update(vision_cfg)
        self.vision_tower = vision_tower
        self.embed_dim = embed_dim
        self.vision_cfg = cfg
        self.vision_tower_pretrained = vision_tower_pretrained
        # Fall back to embed_dim when no explicit projection size is given.
        self.projection_dim = projection_dim if projection_dim is not None else embed_dim

__all__ = ["EvaClipVisionConfig", "DEFAULT_VISION_CFG", "DEFAULT_MODEL_NAME"]
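

# A minimal usage sketch (this demo and its override values are illustrative,
# not part of the original upload): overrides passed via `vision_cfg` are
# merged over the defaults, and the config round-trips through
# PretrainedConfig's dict serialization.
if __name__ == "__main__":
    config = EvaClipVisionConfig(vision_cfg={"image_size": 224})
    assert config.vision_cfg["image_size"] == 224  # override applied
    assert config.vision_cfg["layers"] == 64  # untouched defaults preserved
    assert config.projection_dim == config.embed_dim == 1024

    # to_dict()/from_dict() are inherited from PretrainedConfig.
    restored = EvaClipVisionConfig.from_dict(config.to_dict())
    assert restored.vision_cfg == config.vision_cfg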