{
  "model_architecture": {
    "backbone": "facebook/dinov2-large",
    "backbone_details": {
      "model_type": "Vision Transformer (ViT)",
      "variant": "Large",
      "patch_size": 14,
      "num_hidden_layers": 24,
      "num_attention_heads": 16,
      "hidden_size": 1024,
      "intermediate_size": 4096,
      "pretrained_image_size": 518,
      "finetuned_image_size": 224
    },
    "feature_dim": 1024,
    "encoder_parameters": 304367634,
    "head_trainable_parameters": 24598,
    "freeze_backbone": true,
    "encoder_output_shape": {
      "description": "Encoder outputs full sequence of tokens including CLS token",
      "raw_shape": ["batch_size", 257, 1024],
      "tokens_breakdown": {
        "cls_token": 1,
        "patch_tokens": 256,
        "total": 257
      },
      "usage": "CLS token (index 0) is extracted for feature representation"
    }
  },
  "input_specification": {
    "image_size": [
      224,
      224
    ],
    "channels": 3,
    "pixel_range": [
      0.0,
      1.0
    ],
    "normalization": {
      "mean": [
        0.485,
        0.456,
        0.406
      ],
      "std": [
        0.229,
        0.224,
        0.225
      ],
      "description": "ImageNet normalization for DINOv2"
    },
    "input_format": "RGB",
    "tensor_layout": "NCHW"
  },
  "output_specification": {
    "heads": {
      "scene": {
        "num_classes": 6,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          16000001,
          16000002,
          16000006,
          16000008,
          16000009,
          16000011
        ]
      },
      "concept": {
        "num_classes": 3,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          17000001,
          17000002,
          17000003
        ]
      },
      "object": {
        "num_classes": 13,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          18000001,
          18000002,
          18000004,
          18000005,
          18000006,
          18000007,
          18000008,
          18000009,
          18000010,
          18000012,
          18000014,
          18000016,
          "unclassified"
        ]
      }
    }
  },
  "class_mappings": {
    "scene": {
      "0": 16000001,
      "1": 16000002,
      "2": 16000006,
      "3": 16000008,
      "4": 16000009,
      "5": 16000011
    },
    "concept": {
      "0": 17000001,
      "1": 17000002,
      "2": 17000003
    },
    "object": {
      "0": 18000001,
      "1": 18000002,
      "2": 18000004,
      "3": 18000005,
      "4": 18000006,
      "5": 18000007,
      "6": 18000008,
      "7": 18000009,
      "8": 18000010,
      "9": 18000012,
      "10": 18000014,
      "11": 18000016,
      "12": "unclassified"
    }
  }
}