"""DeepSeekV3.2 model configuration"""

from typing import Optional

from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config


class DeepseekV32Config(DeepseekV3Config):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV32Model`].

    It extends [`DeepseekV3Config`]: every argument that is not listed below is forwarded
    unchanged to `DeepseekV3Config.__init__`; see that class for its meaning.

    Args:
        rope_scaling (`dict`, *optional*):
            RoPE configuration. It is forwarded to the parent class under the
            `rope_parameters` keyword. When it is `None`, a `rope_parameters` entry found in
            `kwargs` is used instead.
        ep_size (`int`, *optional*, defaults to 1):
            Stored as `self.ep_size` (presumably the expert-parallel size -- confirm with
            the modeling code).
        n_dense_layers (`int`, *optional*, defaults to 3):
            Stored as `self.n_dense_layers`.
        index_head_dim (`int`, *optional*, defaults to 128):
            Stored as `self.index_head_dim`.
        index_n_heads (`int`, *optional*, defaults to 64):
            Stored as `self.index_n_heads`.
        index_topk (`int`, *optional*, defaults to 2048):
            Stored as `self.index_topk`.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            Stored as `self.moe_layer_freq`.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Stored as `self.num_nextn_predict_layers`.

    In addition, `self.dim` is set to `hidden_size`.
    """

    model_type = "deepseek_v32"

    # Class-level defaults shared by all instances (they are not constructor arguments).
    # `dim` used to be hard-coded here as 2048; it is now set per instance from
    # `hidden_size` in `__init__`.
    max_batch_size = 8
    max_seq_len = 16384

    def __init__(
        self,
        vocab_size: Optional[int] = 129280,
        hidden_size: Optional[int] = 7168,
        intermediate_size: Optional[int] = 18432,
        moe_intermediate_size: Optional[int] = 2048,
        num_hidden_layers: Optional[int] = 61,
        num_attention_heads: Optional[int] = 128,
        num_key_value_heads: Optional[int] = 128,
        n_shared_experts: Optional[int] = 1,
        n_routed_experts: Optional[int] = 256,
        routed_scaling_factor: Optional[float] = 2.5,
        kv_lora_rank: Optional[int] = 512,
        q_lora_rank: Optional[int] = 1536,
        qk_rope_head_dim: Optional[int] = 64,
        v_head_dim: Optional[int] = 128,
        qk_nope_head_dim: Optional[int] = 128,
        n_group: Optional[int] = 8,
        topk_group: Optional[int] = 4,
        num_experts_per_tok: Optional[int] = 8,
        first_k_dense_replace: Optional[int] = 3,
        norm_topk_prob: Optional[bool] = True,
        hidden_act: Optional[str] = "silu",
        max_position_embeddings: Optional[int] = 4096,
        initializer_range: Optional[float] = 0.02,
        rms_norm_eps: Optional[float] = 1e-6,
        use_cache: Optional[bool] = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = 0,
        eos_token_id: Optional[int] = 1,
        pretraining_tp: Optional[int] = 1,
        tie_word_embeddings: Optional[bool] = False,
        rope_scaling: Optional[dict] = None,
        rope_interleave: Optional[bool] = True,
        attention_bias: Optional[bool] = False,
        attention_dropout: Optional[float] = 0.0,
        ep_size: Optional[int] = 1,
        n_dense_layers: Optional[int] = 3,
        index_head_dim: Optional[int] = 128,
        index_n_heads: Optional[int] = 64,
        index_topk: Optional[int] = 2048,
        moe_layer_freq: Optional[int] = 1,
        num_nextn_predict_layers: Optional[int] = 1,
        **kwargs,
    ):
        # `rope_parameters` is passed explicitly to the parent below, so it has to be
        # removed from kwargs to avoid a duplicate-keyword TypeError. Keep its value as a
        # fallback instead of discarding it, so a config that was serialized with the
        # `rope_parameters` key does not silently lose its RoPE settings on reload.
        rope_parameters = kwargs.pop("rope_parameters", None)
        if rope_scaling is None:
            rope_scaling = rope_parameters

        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            moe_intermediate_size=moe_intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            n_shared_experts=n_shared_experts,
            n_routed_experts=n_routed_experts,
            routed_scaling_factor=routed_scaling_factor,
            kv_lora_rank=kv_lora_rank,
            q_lora_rank=q_lora_rank,
            qk_rope_head_dim=qk_rope_head_dim,
            v_head_dim=v_head_dim,
            qk_nope_head_dim=qk_nope_head_dim,
            n_group=n_group,
            topk_group=topk_group,
            num_experts_per_tok=num_experts_per_tok,
            first_k_dense_replace=first_k_dense_replace,
            norm_topk_prob=norm_topk_prob,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pretraining_tp=pretraining_tp,
            tie_word_embeddings=tie_word_embeddings,
            # The parent class takes the RoPE configuration under `rope_parameters`.
            rope_parameters=rope_scaling,
            rope_interleave=rope_interleave,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            **kwargs,
        )

        # Fields specific to this configuration, stored after the parent initialised
        # the shared ones.
        self.ep_size = ep_size
        self.n_dense_layers = n_dense_layers
        self.index_head_dim = index_head_dim
        self.index_n_heads = index_n_heads
        self.index_topk = index_topk
        self.moe_layer_freq = moe_layer_freq
        self.num_nextn_predict_layers = num_nextn_predict_layers
        # Alias of `hidden_size` under the name `dim`.
        self.dim = hidden_size


__all__ = ["DeepseekV32Config"]