import os
import json
import math

import numpy as np
import pandas as pd
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download

OWNER = "inceptionai"
ARAGEN_REQUESTS_REPO_ID = f"{OWNER}/aragen-requests-dataset"
HINDIGEN_REQUESTS_REPO_ID = f"{OWNER}/hindigen-requests-dataset"
IFEVAL_REQUESTS_REPO_ID = f"{OWNER}/arabicifeval-requests-dataset"

HEADER = """
"""
CITATION_BUTTON_TEXT = """
@misc{leaderboards,
author = {El Filali, Ali and Albarri, Sarah and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav and Abouelseoud, Arwa},
title = {Multilingual Leaderboards: Generative Evaluation for Global South},
year = {2025},
publisher = {Inception},
howpublished = "url{https://huggingface.co/spaces/inceptionai/Leaderboards}"
}
"""
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite the results from the leaderboards in this Space.
"""
def extract_score_value(entry):
"""
Helper to extract (value, lower, upper) from both old v2 format (float)
and new v3/v1 formats (dict with "value"/"lower"/"upper").
All values are returned in [0, 1] space; caller can convert to percentages.
We use the "value" field as the point estimate.
"""
if entry is None:
return (math.nan, math.nan, math.nan)
# Old format: just a float
if isinstance(entry, (int, float)):
v = float(entry)
return (v, math.nan, math.nan)
# New format: dict with "value", "lower", "upper"
if isinstance(entry, dict):
v = float(entry.get("value", math.nan))
lower = entry.get("lower", math.nan)
upper = entry.get("upper", math.nan)
lower = float(lower) if isinstance(lower, (int, float)) else math.nan
upper = float(upper) if isinstance(upper, (int, float)) else math.nan
return (v, lower, upper)
return (math.nan, math.nan, math.nan)
def compute_leaderboard_3c3h(df_3c3h_base: pd.DataFrame) -> pd.DataFrame:
"""
Build the 3C3H leaderboard with:
- Rank (by 3C3H Score)
- Rank Spread (based on 3C3H Score CI)
- 95% CI (±) for 3C3H Score (only)
- Model Size Filter
All scores are in percentage space.
"""
df = df_3c3h_base.copy()
# Model size filter helper
max_model_size_value = 1000
df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
# Sort & rank by 3C3H Score (point estimate)
if "3C3H Score" in df.columns:
df = df.sort_values(by="3C3H Score", ascending=False)
df = df.reset_index(drop=True)
df.insert(0, "Rank", range(1, len(df) + 1))
# Rank Spread based on 3C3H Score CI
main_col = "3C3H Score"
lower_col = "3C3H Score Lower"
upper_col = "3C3H Score Upper"
# Effective lower/upper: if not present, fall back to point estimate
if lower_col in df.columns:
lower_eff = df[lower_col].copy()
else:
lower_eff = df[main_col].copy()
if upper_col in df.columns:
upper_eff = df[upper_col].copy()
else:
upper_eff = df[main_col].copy()
# order of base scenario: all models at their point estimates (value-based)
sort_desc = df.sort_values(by=main_col, ascending=False)
score_order = sort_desc[main_col].values # descending
def rank_position(x, order):
"""
Given a value x and a descending array 'order',
return the rank index where x would land
if all others stayed as in 'order'.
Rank = 1 + number of scores strictly greater than x.
"""
if np.isnan(x):
return math.nan
# Ignore NaNs in the score order
valid = order[~np.isnan(order)]
if valid.size == 0:
return math.nan
# 'valid' is descending; count how many scores are strictly greater than x
num_greater = np.sum(valid > x)
rank = num_greater + 1
# Clamp rank to [1, len(valid)] for numerical safety
if rank < 1:
rank = 1
elif rank > len(valid):
rank = len(valid)
return int(rank)
best_ranks = []
worst_ranks = []
for low, high in zip(lower_eff.values, upper_eff.values):
best = rank_position(high, score_order) # optimistic: use upper bound
worst = rank_position(low, score_order) # pessimistic: use lower bound
best_ranks.append(best)
worst_ranks.append(worst)
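# Render the spread as "best <--> worst" (e.g. "3 <--> 7"); "-" when a bound is missing.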
spread = []
for b, w in zip(best_ranks, worst_ranks):
if math.isnan(b) or math.isnan(w):
spread.append("-")
else:
spread.append(f"{int(b)} <--> {int(w)}")
df.insert(1, "Rank Spread", spread)
# 95% CI (±) for 3C3H Score only (in percentage space)
if lower_col in df.columns and upper_col in df.columns:
ci = (df[upper_col] - df[lower_col]) / 2.0
df["95% CI (±)"] = ci.round(4)
else:
df["95% CI (±)"] = np.nan
# Round score columns
score_columns_3c3h = [
"3C3H Score",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
]
for col in score_columns_3c3h:
if col in df.columns:
df[col] = df[col].round(4)
df["95% CI (±)"] = df["95% CI (±)"].round(4)
return df
def load_results(benchmark="aragen"):
"""
Loads results for the given benchmark.
benchmark:
- "aragen" -> uses aragen_v3_results.json (or v2 fallback)
- "hindigen" -> uses hindigen_v1_results.json
Supports:
- old v2 format (simple floats)
- new v3/v1 format (dict with value/lower/upper)
Returns:
df_3c3h : 3C3H leaderboard dataframe (with Rank, Rank Spread, 95% CI (±))
df_tasks : tasks leaderboard dataframe
task_columns: list of task score columns
"""
current_dir = os.path.dirname(os.path.abspath(__file__))
if benchmark == "hindigen":
results_file = os.path.join(current_dir, "assets", "results", "hindigen_v1_results.json")
else:
v3_file = os.path.join(current_dir, "assets", "results", "aragen_v3_results.json")
v2_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
if os.path.exists(v3_file):
results_file = v3_file
else:
results_file = v2_file
with open(results_file, "r", encoding="utf-8") as f:
data = json.load(f)
# Filter out entries that only contain "_last_sync_timestamp"
filtered_data = []
for entry in data:
if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
continue
filtered_data.append(entry)
data = filtered_data
data_3c3h = []
data_tasks = []
for model_data in data:
meta = model_data.get("Meta", {})
model_name = meta.get("Model Name", "UNK")
revision = meta.get("Revision", "UNK")
precision = meta.get("Precision", "UNK")
license_ = meta.get("License", "UNK")
params = meta.get("Params", "UNK")
# Parse model size
try:
model_size_numeric = float(params)
except Exception:
model_size_numeric = np.inf
# Find the key that holds the scores (e.g. "claude-3-7-sonnet-20250219 Scores", "claude-3.5-sonnet Scores")
scores_key = None
for k in model_data.keys():
if k.endswith("Scores"):
scores_key = k
break
scores_data = model_data.get(scores_key, {}) if scores_key else {}
scores_3c3h = scores_data.get("3C3H Scores", {})
scores_tasks = scores_data.get("Tasks Scores", {})
# --- 3C3H entry ---
entry3 = {
"Model Name": model_name,
"Revision": revision,
"License": license_,
"Precision": precision,
"Model Size": model_size_numeric,
}
for metric_name, metric_entry in scores_3c3h.items():
v, lower, upper = extract_score_value(metric_entry)
# Point estimate (percentage)
entry3[metric_name] = v * 100 if not math.isnan(v) else np.nan
# Only keep lower/upper for 3C3H Score (for CI & Rank Spread)
if metric_name == "3C3H Score":
entry3["3C3H Score Lower"] = (
lower * 100 if not math.isnan(lower) else np.nan
)
entry3["3C3H Score Upper"] = (
upper * 100 if not math.isnan(upper) else np.nan
)
data_3c3h.append(entry3)
# --- Tasks entry ---
entryt = {
"Model Name": model_name,
"Revision": revision,
"License": license_,
"Precision": precision,
"Model Size": model_size_numeric,
}
for task_name, task_entry in scores_tasks.items():
v, _, _ = extract_score_value(task_entry)
entryt[task_name] = v * 100 if not math.isnan(v) else np.nan
data_tasks.append(entryt)
df_3c3h_base = pd.DataFrame(data_3c3h)
df_tasks_base = pd.DataFrame(data_tasks)
# Build 3C3H leaderboard (rank, rank spread, CI, size filter)
df_3c3h = compute_leaderboard_3c3h(df_3c3h_base)
# Build tasks leaderboard (no weighted average, no rank spread, no CI)
if df_tasks_base.empty:
df_tasks = df_tasks_base.copy()
task_columns = []
else:
meta_cols_tasks = [
"Model Name",
"Revision",
"License",
"Precision",
"Model Size",
]
task_columns = [
col
for col in df_tasks_base.columns
if col not in meta_cols_tasks
]
df_tasks = df_tasks_base.copy()
# Round task scores
if task_columns:
df_tasks[task_columns] = df_tasks[task_columns].round(4)
# Model size filter
max_model_size_value = 1000
df_tasks["Model Size Filter"] = df_tasks["Model Size"].replace(
np.inf, max_model_size_value
)
# Sort & rank: based on the first task (typically Question Answering (QA))
if task_columns:
first_task = task_columns[0]
df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
else:
df_tasks = df_tasks.sort_values(by="Model Name", ascending=True)
df_tasks = df_tasks.reset_index(drop=True)
df_tasks.insert(0, "Rank", range(1, len(df_tasks) + 1))
return df_3c3h, df_tasks, task_columns
def load_if_data():
"""
Loads the instruction-following data from ifeval_results.jsonl
and returns a dataframe with relevant columns,
converting decimal values to percentage format.
"""
current_dir = os.path.dirname(os.path.abspath(__file__))
results_file = os.path.join(current_dir, "assets", "results", "ifeval_results.jsonl")
data = []
with open(results_file, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
data.append(json.loads(line))
df = pd.DataFrame(data)
# Convert numeric columns
numeric_cols = ["En Prompt-lvl", "En Instruction-lvl", "Ar Prompt-lvl", "Ar Instruction-lvl"]
for col in numeric_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
# Compute average accuracy for En and Ar
df["Average Accuracy (En)"] = (df["En Prompt-lvl"] + df["En Instruction-lvl"]) / 2
df["Average Accuracy (Ar)"] = (df["Ar Prompt-lvl"] + df["Ar Instruction-lvl"]) / 2
# Convert them to percentage format (e.g., 0.871 -> 87.1)
for col in numeric_cols:
df[col] = (df[col] * 100).round(1)
df["Average Accuracy (En)"] = (df["Average Accuracy (En)"] * 100).round(1)
df["Average Accuracy (Ar)"] = (df["Average Accuracy (Ar)"] * 100).round(1)
# Handle size as numeric
def parse_size(x):
try:
return float(x)
except (TypeError, ValueError):
return np.inf
df["Model Size"] = df["Size (B)"].apply(parse_size)
# Add a filter column for size
max_model_size_value = 1000
df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
# Sort by "Average Accuracy (Ar)" as an example
df = df.sort_values(by="Average Accuracy (Ar)", ascending=False)
df = df.reset_index(drop=True)
df.insert(0, "Rank", range(1, len(df) + 1))
return df
def submit_model(model_name, revision, precision, params, license, modality, leaderboards_selected):
"""
Submits a model to one or more leaderboards:
- AraGen -> inceptionai/aragen-requests-dataset
- HindiGen -> inceptionai/hindigen-requests-dataset
- IFEval -> inceptionai/arabicifeval-requests-dataset
User must choose at least one leaderboard.
"""
if not leaderboards_selected:
return "**Error:** You must choose at least one leaderboard (AraGen, HindiGen, and/or IFEval)."
# Normalize precision
if precision == "Missing":
precision_norm = None
else:
precision_norm = precision.strip().lower() if precision else None
repo_map = {
"AraGen": ARAGEN_REQUESTS_REPO_ID,
"HindiGen": HINDIGEN_REQUESTS_REPO_ID,
"IFEval": IFEVAL_REQUESTS_REPO_ID,
}
# Map leaderboards that use the 3C3H JSON result files (for dedup vs results)
results_benchmark_map = {
"AraGen": "aragen",
"HindiGen": "hindigen",
}
api = HfApi()
# Validate model exists on HuggingFace Hub once
try:
_ = api.model_info(model_name)
except Exception:
return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
org_model = model_name.split("/")
if len(org_model) != 2:
return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
org, model_id = org_model
hf_api_token = os.environ.get("HF_API_TOKEN", None)
# Dedup & upload per leaderboard
success_targets = []
skipped_targets = []
errors = []
for leaderboard in leaderboards_selected:
repo_id = repo_map.get(leaderboard)
if repo_id is None:
errors.append(f"- Unknown leaderboard: {leaderboard}")
continue
# Deduplicate against existing results (only for AraGen/HindiGen)
already_evaluated = False
if leaderboard in results_benchmark_map:
df_3c3h_lb, _, _ = load_results(results_benchmark_map[leaderboard])
if not df_3c3h_lb.empty:
existing_models_results = df_3c3h_lb[["Model Name", "Revision", "Precision"]]
model_exists_in_results = (
(existing_models_results["Model Name"] == model_name)
& (existing_models_results["Revision"] == revision)
& (existing_models_results["Precision"] == (precision_norm if precision_norm is not None else existing_models_results["Precision"]))
).any()
if model_exists_in_results:
skipped_targets.append(
f"- **{leaderboard}**: Model already appears in the leaderboard results."
)
already_evaluated = True
# Deduplicate against pending/finished requests in this repo
df_pending = load_requests(repo_id, "pending")
df_finished = load_requests(repo_id, "finished")
if not already_evaluated:
if not df_pending.empty:
existing_models_pending = df_pending[["model_name", "revision", "precision"]]
model_exists_in_pending = (
(existing_models_pending["model_name"] == model_name)
& (existing_models_pending["revision"] == revision)
& (existing_models_pending["precision"] == precision_norm)
).any()
if model_exists_in_pending:
skipped_targets.append(
f"- **{leaderboard}**: Model is already in pending evaluations."
)
already_evaluated = True
if not already_evaluated:
if not df_finished.empty:
existing_models_finished = df_finished[["model_name", "revision", "precision"]]
model_exists_in_finished = (
(existing_models_finished["model_name"] == model_name)
& (existing_models_finished["revision"] == revision)
& (existing_models_finished["precision"] == precision_norm)
).any()
if model_exists_in_finished:
skipped_targets.append(
f"- **{leaderboard}**: Model has already been evaluated (finished)."
)
already_evaluated = True
if already_evaluated:
continue
# Prepare submission JSON
status = "PENDING"
submission = {
"model_name": model_name,
"license": license,
"revision": revision,
"precision": precision_norm,
"params": params,
"status": status,
"modality": modality,
"leaderboard": leaderboard,
}
submission_json = json.dumps(submission, indent=2)
precision_str = precision_norm if precision_norm else "Missing"
file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
try:
api.upload_file(
path_or_fileobj=submission_json.encode("utf-8"),
path_in_repo=file_path_in_repo,
repo_id=repo_id,
repo_type="dataset",
token=hf_api_token,
)
success_targets.append(leaderboard)
except Exception as e:
errors.append(f"- **{leaderboard}**: Error while submitting – {str(e)}")
# Build user-facing message
messages = []
if success_targets:
messages.append(
f"✅ Model **'{model_name}'** has been submitted for evaluation to: "
+ ", ".join(f"**{lb}**" for lb in success_targets)
+ "."
)
if skipped_targets:
messages.append("⚠️ Skipped submissions:\n" + "\n".join(skipped_targets))
if errors:
messages.append("❌ Errors:\n" + "\n".join(errors))
if not messages:
return "**No submissions were made.** Please check if the model is already pending or evaluated."
return "\n\n".join(messages)
def load_requests(repo_id, status_folder):
"""
Loads request JSON files from a given dataset repo and status folder:
status_folder in {"pending", "finished", "failed"}
"""
api = HfApi()
requests_data = []
hf_api_token = os.environ.get("HF_API_TOKEN", None)
try:
files_info = api.list_repo_files(
repo_id=repo_id,
repo_type="dataset",
token=hf_api_token,
)
except Exception as e:
print(f"Error accessing dataset repository {repo_id}: {e}")
return pd.DataFrame()
files_in_folder = [
f for f in files_info if f.startswith(f"{status_folder}/") and f.endswith(".json")
]
for file_path in files_in_folder:
try:
local_file_path = hf_hub_download(
repo_id=repo_id,
filename=file_path,
repo_type="dataset",
token=hf_api_token,
)
with open(local_file_path, "r", encoding="utf-8") as f:
request = json.load(f)
requests_data.append(request)
except Exception as e:
print(f"Error loading file {file_path}: {e}")
continue
df = pd.DataFrame(requests_data)
return df
# ---------- FILTER HELPERS (AraGen) ----------
def filter_df_3c3h(
search_query,
selected_cols,
precision_filters,
license_filters,
min_size,
max_size,
):
# AraGen 3C3H
df_3c3h, _, _ = load_results("aragen")
df_ = df_3c3h.copy()
# Sanity check on size range
if min_size > max_size:
min_size, max_size = max_size, min_size
# Text search
if search_query:
df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
# Precision filtering
if precision_filters:
include_missing = "Missing" in precision_filters
selected_precisions = [p for p in precision_filters if p != "Missing"]
if include_missing:
df_ = df_[
(df_["Precision"].isin(selected_precisions))
| (df_["Precision"] == "UNK")
| (df_["Precision"].isna())
]
else:
df_ = df_[df_["Precision"].isin(selected_precisions)]
# License filtering
if license_filters:
include_missing = "Missing" in license_filters
selected_licenses = [l for l in license_filters if l != "Missing"]
if include_missing:
df_ = df_[
(df_["License"].isin(selected_licenses))
| (df_["License"] == "UNK")
| (df_["License"].isna())
]
else:
df_ = df_[df_["License"].isin(selected_licenses)]
# Model size filter
df_ = df_[
(df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
]
# Keep global Rank / Rank Spread; just reset the index
df_ = df_.reset_index(drop=True)
# Column ordering
fixed_column_order = [
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
"Revision",
"License",
"Precision",
"Model Size",
]
selected_cols = [
col
for col in fixed_column_order
if col in selected_cols and col in df_.columns
]
return df_[selected_cols]
def filter_df_tasks(
search_query,
selected_cols,
precision_filters,
license_filters,
min_size,
max_size,
task_columns,
):
# AraGen tasks
_, df_tasks, _ = load_results("aragen")
df_ = df_tasks.copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
if search_query:
df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
if precision_filters:
include_missing = "Missing" in precision_filters
selected_precisions = [p for p in precision_filters if p != "Missing"]
if include_missing:
df_ = df_[
(df_["Precision"].isin(selected_precisions))
| (df_["Precision"] == "UNK")
| (df_["Precision"].isna())
]
else:
df_ = df_[df_["Precision"].isin(selected_precisions)]
if license_filters:
include_missing = "Missing" in license_filters
selected_licenses = [l for l in license_filters if l != "Missing"]
if include_missing:
df_ = df_[
(df_["License"].isin(selected_licenses))
| (df_["License"] == "UNK")
| (df_["License"].isna())
]
else:
df_ = df_[df_["License"].isin(selected_licenses)]
df_ = df_[
(df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
]
# Re-rank within filtered subset using first task as sort key
if "Rank" in df_.columns:
df_ = df_.drop(columns=["Rank"])
if task_columns:
first_task = task_columns[0]
if first_task in df_.columns:
df_ = df_.sort_values(by=first_task, ascending=False)
else:
df_ = df_.sort_values(by="Model Name", ascending=True)
else:
df_ = df_.sort_values(by="Model Name", ascending=True)
df_ = df_.reset_index(drop=True)
df_.insert(0, "Rank", range(1, len(df_) + 1))
fixed_column_order = [
"Rank",
"Model Name",
"Question Answering (QA)",
"Orthographic and Grammatical Analysis",
"Safety",
"Reasoning",
"Revision",
"License",
"Precision",
"Model Size",
]
selected_cols = [
col for col in fixed_column_order if col in selected_cols and col in df_.columns
]
return df_[selected_cols]
# ---------- FILTER HELPERS (HindiGen) ----------
def filter_df_3c3h_hindigen(
search_query,
selected_cols,
precision_filters,
license_filters,
min_size,
max_size,
):
df_3c3h_hi, _, _ = load_results("hindigen")
df_ = df_3c3h_hi.copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
if search_query:
df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
if precision_filters:
include_missing = "Missing" in precision_filters
selected_precisions = [p for p in precision_filters if p != "Missing"]
if include_missing:
df_ = df_[
(df_["Precision"].isin(selected_precisions))
| (df_["Precision"] == "UNK")
| (df_["Precision"].isna())
]
else:
df_ = df_[df_["Precision"].isin(selected_precisions)]
if license_filters:
include_missing = "Missing" in license_filters
selected_licenses = [l for l in license_filters if l != "Missing"]
if include_missing:
df_ = df_[
(df_["License"].isin(selected_licenses))
| (df_["License"] == "UNK")
| (df_["License"].isna())
]
else:
df_ = df_[df_["License"].isin(selected_licenses)]
df_ = df_[
(df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
]
df_ = df_.reset_index(drop=True)
fixed_column_order = [
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
"Revision",
"License",
"Precision",
"Model Size",
]
selected_cols = [
col
for col in fixed_column_order
if col in selected_cols and col in df_.columns
]
return df_[selected_cols]
def filter_df_tasks_hindigen(
search_query,
selected_cols,
precision_filters,
license_filters,
min_size,
max_size,
task_columns,
):
_, df_tasks_hi, _ = load_results("hindigen")
df_ = df_tasks_hi.copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
if search_query:
df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
if precision_filters:
include_missing = "Missing" in precision_filters
selected_precisions = [p for p in precision_filters if p != "Missing"]
if include_missing:
df_ = df_[
(df_["Precision"].isin(selected_precisions))
| (df_["Precision"] == "UNK")
| (df_["Precision"].isna())
]
else:
df_ = df_[df_["Precision"].isin(selected_precisions)]
if license_filters:
include_missing = "Missing" in license_filters
selected_licenses = [l for l in license_filters if l != "Missing"]
if include_missing:
df_ = df_[
(df_["License"].isin(selected_licenses))
| (df_["License"] == "UNK")
| (df_["License"].isna())
]
else:
df_ = df_[df_["License"].isin(selected_licenses)]
df_ = df_[
(df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
]
if "Rank" in df_.columns:
df_ = df_.drop(columns=["Rank"])
if task_columns:
first_task = task_columns[0]
if first_task in df_.columns:
df_ = df_.sort_values(by=first_task, ascending=False)
else:
df_ = df_.sort_values(by="Model Name", ascending=True)
else:
df_ = df_.sort_values(by="Model Name", ascending=True)
df_ = df_.reset_index(drop=True)
df_.insert(0, "Rank", range(1, len(df_) + 1))
fixed_column_order = [
"Rank",
"Model Name",
"Question Answering (QA)",
"Grammar",
"Safety",
"Revision",
"License",
"Precision",
"Model Size",
]
selected_cols = [
col for col in fixed_column_order if col in selected_cols and col in df_.columns
]
return df_[selected_cols]
def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size):
"""
Filters the instruction-following dataframe based on various criteria.
We have removed 'Filter by Type' and 'Filter by Creator'.
"""
df_ = load_if_data().copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
# Search by model name
if search_query:
df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
# Filter by Family only (Creator and Type filters removed)
if family_filters:
df_ = df_[df_["Family"].isin(family_filters)]
# Filter by Model Size
df_ = df_[
(df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
]
# Re-rank within the filtered subset
if "Rank" in df_.columns:
df_ = df_.drop(columns=["Rank"])
df_ = df_.reset_index(drop=True)
df_.insert(0, "Rank", range(1, len(df_) + 1))
fixed_column_order = [
"Rank",
"Model Name",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)",
"En Prompt-lvl",
"En Instruction-lvl",
"Type",
"Creator",
"Family",
"Size (B)",
"Base Model",
"Context Window",
"Lang.",
]
selected_cols = [
col for col in fixed_column_order if col in selected_cols and col in df_.columns
]
return df_[selected_cols]
def main():
# Load AraGen, HindiGen, and IFEval data
df_3c3h_ar, df_tasks_ar, task_columns_ar = load_results("aragen")
df_3c3h_hi, df_tasks_hi, task_columns_hi = load_results("hindigen")
df_if = load_if_data() # Instruction Following DF
# ---------- AraGen options ----------
precision_options_3c3h = sorted(df_3c3h_ar["Precision"].dropna().unique().tolist())
precision_options_3c3h = [p for p in precision_options_3c3h if p != "UNK"]
precision_options_3c3h.append("Missing")
license_options_3c3h = sorted(df_3c3h_ar["License"].dropna().unique().tolist())
license_options_3c3h = [l for l in license_options_3c3h if l != "UNK"]
license_options_3c3h.append("Missing")
precision_options_tasks = sorted(df_tasks_ar["Precision"].dropna().unique().tolist())
precision_options_tasks = [p for p in precision_options_tasks if p != "UNK"]
precision_options_tasks.append("Missing")
license_options_tasks = sorted(df_tasks_ar["License"].dropna().unique().tolist())
license_options_tasks = [l for l in license_options_tasks if l != "UNK"]
license_options_tasks.append("Missing")
min_model_size_3c3h = int(df_3c3h_ar["Model Size Filter"].min())
max_model_size_3c3h = int(df_3c3h_ar["Model Size Filter"].max())
min_model_size_tasks = int(df_tasks_ar["Model Size Filter"].min())
max_model_size_tasks = int(df_tasks_ar["Model Size Filter"].max())
column_choices_3c3h = [
col
for col in df_3c3h_ar.columns.tolist()
if col
not in [
"Model Size Filter",
"3C3H Score Lower",
"3C3H Score Upper",
]
]
column_choices_tasks = [
col
for col in df_tasks_ar.columns.tolist()
if col != "Model Size Filter"
]
# ---------- HindiGen options ----------
precision_options_3c3h_hi = sorted(df_3c3h_hi["Precision"].dropna().unique().tolist())
precision_options_3c3h_hi = [p for p in precision_options_3c3h_hi if p != "UNK"]
precision_options_3c3h_hi.append("Missing")
license_options_3c3h_hi = sorted(df_3c3h_hi["License"].dropna().unique().tolist())
license_options_3c3h_hi = [l for l in license_options_3c3h_hi if l != "UNK"]
license_options_3c3h_hi.append("Missing")
precision_options_tasks_hi = sorted(df_tasks_hi["Precision"].dropna().unique().tolist())
precision_options_tasks_hi = [p for p in precision_options_tasks_hi if p != "UNK"]
precision_options_tasks_hi.append("Missing")
license_options_tasks_hi = sorted(df_tasks_hi["License"].dropna().unique().tolist())
license_options_tasks_hi = [l for l in license_options_tasks_hi if l != "UNK"]
license_options_tasks_hi.append("Missing")
min_model_size_3c3h_hi = int(df_3c3h_hi["Model Size Filter"].min())
max_model_size_3c3h_hi = int(df_3c3h_hi["Model Size Filter"].max())
min_model_size_tasks_hi = int(df_tasks_hi["Model Size Filter"].min())
max_model_size_tasks_hi = int(df_tasks_hi["Model Size Filter"].max())
column_choices_3c3h_hi = [
col
for col in df_3c3h_hi.columns.tolist()
if col
not in [
"Model Size Filter",
"3C3H Score Lower",
"3C3H Score Upper",
]
]
column_choices_tasks_hi = [
col
for col in df_tasks_hi.columns.tolist()
if col != "Model Size Filter"
]
# ---------- IFEval options ----------
family_options_if = sorted(df_if["Family"].dropna().unique().tolist())
min_model_size_if = int(df_if["Model Size Filter"].min())
max_model_size_if = int(df_if["Model Size Filter"].max())
all_if_columns = [
"Rank",
"Model Name",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)",
"En Prompt-lvl",
"En Instruction-lvl",
"Type",
"Creator",
"Family",
"Size (B)",
"Base Model",
"Context Window",
"Lang.",
]
default_if_columns = [
"Rank",
"Model Name",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)",
]
with gr.Blocks() as demo:
gr.HTML(HEADER)
with gr.Tabs():
#
# AL Leaderboards Tab (AraGen + IFEval)
#
with gr.Tab("AL Leaderboards 🏅"):
with gr.Tabs():
# -------------------------
# Sub-Tab: AraGen Leaderboards
# -------------------------
with gr.Tab("🐪 AraGen Leaderboards (v3)"):
with gr.Tabs():
# 3C3H Scores
with gr.Tab("3C3H Scores"):
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_3c3h = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True,
)
with gr.Row():
column_selector_3c3h = gr.CheckboxGroup(
choices=column_choices_3c3h,
value=[
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
],
label="Select columns to display",
)
with gr.Row():
license_filter_3c3h = gr.CheckboxGroup(
choices=license_options_3c3h,
value=license_options_3c3h.copy(),
label="Filter by License",
)
precision_filter_3c3h = gr.CheckboxGroup(
choices=precision_options_3c3h,
value=precision_options_3c3h.copy(),
label="Filter by Precision",
)
with gr.Row():
model_size_min_filter_3c3h = gr.Slider(
minimum=min_model_size_3c3h,
maximum=max_model_size_3c3h,
value=min_model_size_3c3h,
step=1,
label="Minimum Model Size",
interactive=True,
)
model_size_max_filter_3c3h = gr.Slider(
minimum=min_model_size_3c3h,
maximum=max_model_size_3c3h,
value=max_model_size_3c3h,
step=1,
label="Maximum Model Size",
interactive=True,
)
leaderboard_3c3h = gr.Dataframe(
df_3c3h_ar[
[
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
]
],
interactive=False,
)
filter_inputs_3c3h = [
search_box_3c3h,
column_selector_3c3h,
precision_filter_3c3h,
license_filter_3c3h,
model_size_min_filter_3c3h,
model_size_max_filter_3c3h,
]
search_box_3c3h.submit(
filter_df_3c3h,
inputs=filter_inputs_3c3h,
outputs=leaderboard_3c3h,
)
for component in filter_inputs_3c3h:
component.change(
filter_df_3c3h,
inputs=filter_inputs_3c3h,
outputs=leaderboard_3c3h,
)
# Tasks Scores
with gr.Tab("Tasks Scores"):
gr.Markdown(
"This table is sorted based on the **first task** "
"(e.g., Question Answering (QA))."
)
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_tasks = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True,
)
with gr.Row():
column_selector_tasks = gr.CheckboxGroup(
choices=column_choices_tasks,
value=["Rank", "Model Name"] + task_columns_ar,
label="Select columns to display",
)
with gr.Row():
license_filter_tasks = gr.CheckboxGroup(
choices=license_options_tasks,
value=license_options_tasks.copy(),
label="Filter by License",
)
precision_filter_tasks = gr.CheckboxGroup(
choices=precision_options_tasks,
value=precision_options_tasks.copy(),
label="Filter by Precision",
)
with gr.Row():
model_size_min_filter_tasks = gr.Slider(
minimum=min_model_size_tasks,
maximum=max_model_size_tasks,
value=min_model_size_tasks,
step=1,
label="Minimum Model Size",
interactive=True,
)
model_size_max_filter_tasks = gr.Slider(
minimum=min_model_size_tasks,
maximum=max_model_size_tasks,
value=max_model_size_tasks,
step=1,
label="Maximum Model Size",
interactive=True,
)
leaderboard_tasks = gr.Dataframe(
df_tasks_ar[["Rank", "Model Name"] + task_columns_ar],
interactive=False,
)
filter_inputs_tasks = [
search_box_tasks,
column_selector_tasks,
precision_filter_tasks,
license_filter_tasks,
model_size_min_filter_tasks,
model_size_max_filter_tasks,
]
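# filter_df_tasks needs the AraGen task columns as an extra argument, so the
# Gradio callbacks wrap it in a lambda that closes over task_columns_ar.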
search_box_tasks.submit(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(
sq, cols, pf, lf, min_val, max_val, task_columns_ar
),
inputs=filter_inputs_tasks,
outputs=leaderboard_tasks,
)
for component in filter_inputs_tasks:
component.change(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(
sq, cols, pf, lf, min_val, max_val, task_columns_ar
),
inputs=filter_inputs_tasks,
outputs=leaderboard_tasks,
)
# -------------------------
# Sub-Tab: Instruction Following Leaderboard
# -------------------------
with gr.Tab("🗡️ Instruction Following Leaderboard"):
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_if = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True,
)
with gr.Row():
column_selector_if = gr.CheckboxGroup(
choices=all_if_columns,
value=default_if_columns,
label="Select columns to display",
)
with gr.Row():
family_filter_if = gr.CheckboxGroup(
choices=family_options_if,
value=family_options_if.copy(),
label="Filter by Family",
)
with gr.Row():
model_size_min_filter_if = gr.Slider(
minimum=min_model_size_if,
maximum=max_model_size_if,
value=min_model_size_if,
step=1,
label="Minimum Model Size",
interactive=True,
)
model_size_max_filter_if = gr.Slider(
minimum=min_model_size_if,
maximum=max_model_size_if,
value=max_model_size_if,
step=1,
label="Maximum Model Size",
interactive=True,
)
leaderboard_if = gr.Dataframe(
df_if[default_if_columns],
interactive=False,
)
filter_inputs_if = [
search_box_if,
column_selector_if,
family_filter_if,
model_size_min_filter_if,
model_size_max_filter_if,
]
search_box_if.submit(
filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if
)
for component in filter_inputs_if:
component.change(
filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if
)
#
# HindiGen Leaderboards Tab
#
with gr.Tab("HindiGen Leaderboards 🇮🇳"):
with gr.Tabs():
# 3C3H Scores
with gr.Tab("3C3H Scores"):
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_3c3h_hi = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True,
)
with gr.Row():
column_selector_3c3h_hi = gr.CheckboxGroup(
choices=column_choices_3c3h_hi,
value=[
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
],
label="Select columns to display",
)
with gr.Row():
license_filter_3c3h_hi = gr.CheckboxGroup(
choices=license_options_3c3h_hi,
value=license_options_3c3h_hi.copy(),
label="Filter by License",
)
precision_filter_3c3h_hi = gr.CheckboxGroup(
choices=precision_options_3c3h_hi,
value=precision_options_3c3h_hi.copy(),
label="Filter by Precision",
)
with gr.Row():
model_size_min_filter_3c3h_hi = gr.Slider(
minimum=min_model_size_3c3h_hi,
maximum=max_model_size_3c3h_hi,
value=min_model_size_3c3h_hi,
step=1,
label="Minimum Model Size",
interactive=True,
)
model_size_max_filter_3c3h_hi = gr.Slider(
minimum=min_model_size_3c3h_hi,
maximum=max_model_size_3c3h_hi,
value=max_model_size_3c3h_hi,
step=1,
label="Maximum Model Size",
interactive=True,
)
leaderboard_3c3h_hi = gr.Dataframe(
df_3c3h_hi[
[
"Rank",
"Rank Spread",
"Model Name",
"3C3H Score",
"95% CI (±)",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
]
],
interactive=False,
)
filter_inputs_3c3h_hi = [
search_box_3c3h_hi,
column_selector_3c3h_hi,
precision_filter_3c3h_hi,
license_filter_3c3h_hi,
model_size_min_filter_3c3h_hi,
model_size_max_filter_3c3h_hi,
]
search_box_3c3h_hi.submit(
filter_df_3c3h_hindigen,
inputs=filter_inputs_3c3h_hi,
outputs=leaderboard_3c3h_hi,
)
for component in filter_inputs_3c3h_hi:
component.change(
filter_df_3c3h_hindigen,
inputs=filter_inputs_3c3h_hi,
outputs=leaderboard_3c3h_hi,
)
# Tasks Scores
with gr.Tab("Tasks Scores"):
gr.Markdown(
"This table is sorted based on the **first task** "
"(e.g., Question Answering (QA))."
)
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_tasks_hi = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True,
)
with gr.Row():
column_selector_tasks_hi = gr.CheckboxGroup(
choices=column_choices_tasks_hi,
value=["Rank", "Model Name"] + task_columns_hi,
label="Select columns to display",
)
with gr.Row():
license_filter_tasks_hi = gr.CheckboxGroup(
choices=license_options_tasks_hi,
value=license_options_tasks_hi.copy(),
label="Filter by License",
)
precision_filter_tasks_hi = gr.CheckboxGroup(
choices=precision_options_tasks_hi,
value=precision_options_tasks_hi.copy(),
label="Filter by Precision",
)
with gr.Row():
model_size_min_filter_tasks_hi = gr.Slider(
minimum=min_model_size_tasks_hi,
maximum=max_model_size_tasks_hi,
value=min_model_size_tasks_hi,
step=1,
label="Minimum Model Size",
interactive=True,
)
model_size_max_filter_tasks_hi = gr.Slider(
minimum=min_model_size_tasks_hi,
maximum=max_model_size_tasks_hi,
value=max_model_size_tasks_hi,
step=1,
label="Maximum Model Size",
interactive=True,
)
leaderboard_tasks_hi = gr.Dataframe(
df_tasks_hi[["Rank", "Model Name"] + task_columns_hi],
interactive=False,
)
filter_inputs_tasks_hi = [
search_box_tasks_hi,
column_selector_tasks_hi,
precision_filter_tasks_hi,
license_filter_tasks_hi,
model_size_min_filter_tasks_hi,
model_size_max_filter_tasks_hi,
]
search_box_tasks_hi.submit(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks_hindigen(
sq, cols, pf, lf, min_val, max_val, task_columns_hi
),
inputs=filter_inputs_tasks_hi,
outputs=leaderboard_tasks_hi,
)
for component in filter_inputs_tasks_hi:
component.change(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks_hindigen(
sq, cols, pf, lf, min_val, max_val, task_columns_hi
),
inputs=filter_inputs_tasks_hi,
outputs=leaderboard_tasks_hi,
)
#
# About & Submit Tab
#
with gr.Tab("About & Submit Page 📝"):
# Load request tables for all three request datasets
df_pending_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "pending")
df_finished_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "finished")
df_failed_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "failed")
df_pending_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "pending")
df_finished_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "finished")
df_failed_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "failed")
df_pending_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "pending")
df_finished_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "finished")
df_failed_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "failed")
gr.Markdown(ABOUT_SECTION)
gr.Markdown("## Submit Your Model for Evaluation")
with gr.Column():
model_name_input = gr.Textbox(
label="Model Name",
placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)",
)
revision_input = gr.Textbox(
label="Revision", placeholder="main", value="main"
)
precision_input = gr.Dropdown(
choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
label="Precision",
value="float16",
)
params_input = gr.Textbox(
label="Params",
placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)",
)
license_input = gr.Textbox(
label="License",
placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
value="Open",
)
modality_input = gr.Radio(
choices=["Text"],
label="Modality",
value="Text",
)
leaderboard_targets = gr.CheckboxGroup(
choices=["AraGen", "HindiGen", "IFEval"],
label="Choose which leaderboard(s) to submit to",
info="You must choose at least one.",
)
submit_button = gr.Button("Submit Model")
submission_result = gr.Markdown()
submit_button.click(
submit_model,
inputs=[
model_name_input,
revision_input,
precision_input,
params_input,
license_input,
modality_input,
leaderboard_targets,
],
outputs=submission_result,
)
gr.Markdown("## Evaluation Status")
gr.Markdown("### AraGen Requests")
with gr.Accordion(
f"AraGen – Pending Evaluations ({len(df_pending_ar)})", open=False
):
if not df_pending_ar.empty:
gr.Dataframe(df_pending_ar)
else:
gr.Markdown("No pending evaluations.")
with gr.Accordion(
f"AraGen – Finished Evaluations ({len(df_finished_ar)})", open=False
):
if not df_finished_ar.empty:
gr.Dataframe(df_finished_ar)
else:
gr.Markdown("No finished evaluations.")
with gr.Accordion(
f"AraGen – Failed Evaluations ({len(df_failed_ar)})", open=False
):
if not df_failed_ar.empty:
gr.Dataframe(df_failed_ar)
else:
gr.Markdown("No failed evaluations.")
gr.Markdown("### HindiGen Requests")
with gr.Accordion(
f"HindiGen – Pending Evaluations ({len(df_pending_hi)})", open=False
):
if not df_pending_hi.empty:
gr.Dataframe(df_pending_hi)
else:
gr.Markdown("No pending evaluations.")
with gr.Accordion(
f"HindiGen – Finished Evaluations ({len(df_finished_hi)})", open=False
):
if not df_finished_hi.empty:
gr.Dataframe(df_finished_hi)
else:
gr.Markdown("No finished evaluations.")
with gr.Accordion(
f"HindiGen – Failed Evaluations ({len(df_failed_hi)})", open=False
):
if not df_failed_hi.empty:
gr.Dataframe(df_failed_hi)
else:
gr.Markdown("No failed evaluations.")
gr.Markdown("### IFEval Requests")
with gr.Accordion(
f"IFEval – Pending Evaluations ({len(df_pending_if)})", open=False
):
if not df_pending_if.empty:
gr.Dataframe(df_pending_if)
else:
gr.Markdown("No pending evaluations.")
with gr.Accordion(
f"IFEval – Finished Evaluations ({len(df_finished_if)})", open=False
):
if not df_finished_if.empty:
gr.Dataframe(df_finished_if)
else:
gr.Markdown("No finished evaluations.")
with gr.Accordion(
f"IFEval – Failed Evaluations ({len(df_failed_if)})", open=False
):
if not df_failed_if.empty:
gr.Dataframe(df_failed_if)
else:
gr.Markdown("No failed evaluations.")
# Citation Section
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=8,
elem_id="citation-button",
show_copy_button=True,
)
gr.HTML(BOTTOM_LOGO)
demo.launch()
if __name__ == "__main__":
main()