import pandas as pd
import os
import pprint as pp
# import requests
from datasets import DATASETS

HF_TOKEN = os.environ.get("HF_TOKEN")

BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
COLUMN_NAMES = BASE_COLS + TASKS_V1
DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
    ['number'] * len(TASKS_V1)

LEADERBOARD_INTRODUCTION = """
# 📊 **MMEB LEADERBOARD (VLM2Vec)**

## Introduction
We introduce a novel benchmark, **MMEB-V1 (Massive Multimodal Embedding Benchmark)**,
which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
and evaluating embedding models across various combinations of text and image modalities.
All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
or a combination of both. MMEB-V1 is divided into 20 in-distribution datasets, which can be used for
training, and 16 out-of-distribution datasets, reserved for evaluation.
Building upon **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include five new tasks: four video-based tasks
— Video Retrieval, Moment Retrieval, Video Classification, and Video Question Answering — and one task focused on visual documents, Visual Document Retrieval.
This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.
<details>
<summary><span style='font-weight:bold'>🔥 What's NEW: The leaderboard rankings can now be downloaded directly. Go to the Files tab, open the rankings/ folder, and select the leaderboard you want to download.</span></summary>
<ul>
  <li>[2025-11] The leaderboard rankings can now be downloaded directly in CSV/JSON format from the Files tab, under the rankings/ folder. A download button will be added to this page soon.</li>
  <li>[2025-06] MMEB-V2 released!</li>
</ul>
</details>
| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
| [**📖MMEB-V2/VLM2Vec-V2 Paper**](https://arxiv.org/abs/2507.04590)
| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2)
| [**Discord**](https://discord.gg/njyKubdtry) |
"""

LEADERBOARD_INFO = f"""
## Dataset Overview
<details>
<summary>Visual Overview</summary>
<img src='overview.png' alt='overview'/>
</details>

This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission. \n
```python
{pp.pformat(DATASETS)}
```
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}"""

SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction \n
## We have provided a detailed step-by-step guide on how to submit your model. Please read it carefully to submit successfully. \n
1. **Step 1️⃣:** Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions on evaluating your model. \n
2. **Step 2️⃣:** After running the evaluation pipelines, use the script we provide **(e.g., [report_score_v2.py](https://github.com/TIGER-AI-Lab/VLM2Vec/blob/main/experiments/report_score_v2.py))** to generate the unified score sheet.
    - Reminder: Adjust your model's configuration in the script, including the model name, URL, model size, and data source.
    - Note that the "model size" field is digits-only and is expressed in billions (B) by default, so please convert it accordingly if your model size is in different units or formats (e.g., "8" for 8 billion, "0.5" for 500 million; do not include non-digit characters).
    - If possible, please also add a contact method in case we need to reach you in the future.
3. **Step 3️⃣:** Finally, create a pull request and upload the generated JSON file to the ***scores*** folder.
    - You can use the Hugging Face Space's web UI to upload your files directly:
        - Go to the [scores folder](https://huggingface.co/spaces/TIGER-Lab/MMEB-Leaderboard/upload/main/scores)
        - Select "Upload file" and upload your JSON files.
    - Please inform us on [our Discord server](https://discord.gg/njyKubdtry), send us an email at [email protected], or leave a short message (@ us) in the PR to indicate that you are ready to merge it.
    - We will then review your submission and update the leaderboard accordingly. \n\n
## Please double check that your score sheet has the following JSON format ⬇️: \n
```json
{
    "metadata": {
        "model_name": "<Model Name>",
        "url": "<Model URL>" or null,
        "model_size": <Model Size> or null,
        "data_source": "Self-Reported",
        "contact": "[email protected]",
        ... ...
    },
    "metrics": {
        "image": {
            "ImageNet-1K": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            },
            "N24News": {
                ... ...
            },
            ... ...
        },
        "video": {
            ... ...
        },
        ... ...
    }
}
```
## ⚠️ Special Instructions for submitting to the MMEB Image (previously MMEB-V1) Leaderboard
We understand that some researchers want to submit exclusively to the Image leaderboard, but unfortunately our current leaderboard system cannot exclude a model from the other modalities' leaderboards.
If you only want to report Image results, please run the 36 image datasets and simply skip the other datasets.
The leaderboard will automatically assign a 0 to any missing dataset.
We may be able to hide your model from the other leaderboards in a future leaderboard update, but for now your model will appear on all leaderboards and may therefore rank lower. \n
We highly recommend joining our [Discord server](https://discord.gg/njyKubdtry), which is a convenient way to stay informed about the latest updates or to share feedback for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
"""

def create_hyperlinked_names(df):
    def convert_url(url, model_name):
        return f'<a href="{url}">{model_name}</a>' if url else model_name

    def add_link_to_model_name(row):
        row['Models'] = convert_url(row['URL'], row['Models'])
        return row

    df = df.copy()
    df = df.apply(add_link_to_model_name, axis=1)
    return df

# def fetch_data(file: str) -> pd.DataFrame:
#     # fetch the leaderboard data from remote
#     if file is None:
#         raise ValueError("URL Not Provided")
#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
#     print(f"Fetching data from {url}")
#     response = requests.get(url)
#     if response.status_code != 200:
#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
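
# get_df() below reads the local results.jsonl. Judging from the columns used in this
# module, each line is a JSON record along the lines of the hypothetical example
# below; the field values, and any fields beyond those referenced here, are
# assumptions rather than content taken from the actual file:
#   {"Models": "<model name>", "URL": "<model page url>", "Model Size(B)": 8,
#    "Data Source": "Self-Reported", "V1-Overall": 60.1, "I-CLS": 54.8, ...}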
def get_df(file="results.jsonl"):
    df = pd.read_json(file, orient='records', lines=True)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    for task in TASKS_V1:
        if df[task].isnull().any():
            df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
    df = df.sort_values(by=['V1-Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df

def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]

def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    size_mask = filtered_df['Model Size(B)'].apply(lambda x:
        (min_size <= 1000.0 <= max_size) if x == 'unknown'
        else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]

def search_models(df, query):
    if query:
        return df[df['Models'].str.contains(query, case=False, na=False)]
    return df

def get_size_range(df):
    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
    if (sizes == 0.0).all():
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())

def process_model_size(size):
    if pd.isna(size) or size == 'unk':
        return 'unknown'
    try:
        val = float(size)
        return round(val, 3)
    except (ValueError, TypeError):
        return 'unknown'
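
# Examples of process_model_size() behavior:
#   process_model_size("7.5") -> 7.5        (digits-only string, already in billions)
#   process_model_size("unk") -> 'unknown'
#   process_model_size(None)  -> 'unknown'  (NaN/None are treated as unknown sizes)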

def filter_columns_by_tasks(df, selected_tasks=None):
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
    # Keep the overall V1 score visible alongside whichever task columns are selected.
    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'V1-Overall']
    selected_columns = base_columns + selected_tasks
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
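
if __name__ == "__main__":
    # Local smoke test (not used by the Space UI): load the leaderboard table and
    # exercise the search / size-filter helpers. It assumes results.jsonl is present
    # alongside this module, exactly as get_df() expects.
    table = get_df()
    lo, hi = get_size_range(table)
    print(f"Loaded {len(table)} models; model sizes span {lo}B to {hi}B")
    # "vlm2vec" is only an example query; it may match nothing in your copy of results.jsonl.
    subset = search_and_filter_models(table, query="vlm2vec", min_size=lo, max_size=hi)
    print(subset.head())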