alielfilali01 committed on
Commit f6b51b3 · verified · 1 Parent(s): ed3cae6

Update app.py


Leaderboard v3.1.4

Files changed (1)
  1. app.py +1309 -461
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import json
 
3
  import numpy as np
4
  import pandas as pd
5
  import gradio as gr
@@ -7,14 +8,17 @@ from huggingface_hub import HfApi, hf_hub_download
7
 
8
 
9
  OWNER = "inceptionai"
10
- DATASET_REPO_ID = f"{OWNER}/requests-dataset"
 
 
 
11
 
12
 
13
  HEADER = """
14
  <center>
15
  <br></br>
16
- <h1>Arabic Leaderboards</h1>
17
- <h2>Comprehensive Evaluation of Arabic Large Language Models</h2>
18
  <br></br>
19
  </center>
20
  """
@@ -22,15 +26,21 @@ HEADER = """
22
  ABOUT_SECTION = """
23
  ## About
24
 
25
- In our `12-24` release, we introduced the `AraGen Benchmark`, along with the `3C3H` evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H, [here](https://huggingface.co/blog/leaderboard-3c3h-aragen). And you can find the first version of the benchmark, `AraGen-12-24` [here](https://huggingface.co/datasets/inceptionai/AraGen). Building on that foundation, and as part of this new release, we have expanded this space to incorporate additional tasks and evaluation metrics.
 
 
 
 
26
 
27
- In this release, we present two leaderboards:
28
 
29
- **AraGen-03-25 (v2):**
30
 
31
- - The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region, history, politics, cuisine ... etc. By leveraging **3C3H** as an evaluation metric—which assesses a model's output across six dimensions: Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness—the leaderboard offers a comprehensive and holistic evaluation of a model’s chat capabilities and its ability to generate human-like and ethically responsible content.
32
 
33
- **Instruction Following:**
 
 
34
 
35
  - We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.
36
 
@@ -40,18 +50,18 @@ Our evaluations are conducted in a generative mode, meaning that we expect model
40
 
41
  ### Contact
42
 
43
- For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
44
  """
45
 
46
  BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
47
 
48
  CITATION_BUTTON_TEXT = """
49
- @misc{Arabic-Leaderboards,
50
- author = {El Filali, Ali and Albarri, Sarah and Abouelseoud, Arwa and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav},
51
- title = {Arabic-Leaderboards: Comprehensive Evaluation of Arabic Large Language Models},
52
  year = {2025},
53
  publisher = {Inception},
54
- howpublished = "url{https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards}"
55
  }
56
  """
57
 
@@ -60,104 +70,300 @@ Copy the following snippet to cite the results from all Arabic Leaderboards in t
60
  """
61
 
62
 
63
- def load_results():
 
 
64
  """
65
- Loads the AraGen v2 results from aragen_v2_results.json and returns two dataframes:
66
- 1) df_3c3h with columns for 3C3H scores
67
- 2) df_tasks with columns for tasks scores
 
 
 
 
 
 
 
 
 
 
 
68
  """
69
  current_dir = os.path.dirname(os.path.abspath(__file__))
70
- results_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
71
-
72
- with open(results_file, 'r') as f:
 
73
  data = json.load(f)
74
-
75
- # Filter out any entries that only contain '_last_sync_timestamp'
76
  filtered_data = []
77
  for entry in data:
78
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
79
  continue
80
  filtered_data.append(entry)
81
-
82
  data = filtered_data
83
-
84
  data_3c3h = []
85
  data_tasks = []
86
-
87
  for model_data in data:
88
- meta = model_data.get('Meta', {})
89
- model_name = meta.get('Model Name', 'UNK')
90
- revision = meta.get('Revision', 'UNK')
91
- precision = meta.get('Precision', 'UNK')
92
- params = meta.get('Params', 'UNK')
93
-
 
 
94
  try:
95
  model_size_numeric = float(params)
96
- except (ValueError, TypeError):
97
  model_size_numeric = np.inf
98
-
99
- scores_data = model_data.get('claude-3.5-sonnet Scores', {})
100
- scores_3c3h = scores_data.get('3C3H Scores', {})
101
- scores_tasks = scores_data.get('Tasks Scores', {})
102
-
103
- formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
104
- formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
105
-
106
- data_entry_3c3h = {
107
- 'Model Name': model_name,
108
- 'Revision': revision,
109
- 'License': meta.get('License', 'UNK'),
110
- 'Precision': precision,
111
- 'Model Size': model_size_numeric,
112
- '3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
113
- 'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
114
- 'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
115
- 'Conciseness': formatted_scores_3c3h.get("Conciseness", np.nan),
116
- 'Helpfulness': formatted_scores_3c3h.get("Helpfulness", np.nan),
117
- 'Honesty': formatted_scores_3c3h.get("Honesty", np.nan),
118
- 'Harmlessness': formatted_scores_3c3h.get("Harmlessness", np.nan),
119
  }
120
- data_3c3h.append(data_entry_3c3h)
121
-
122
- data_entry_tasks = {
123
- 'Model Name': model_name,
124
- 'Revision': revision,
125
- 'License': meta.get('License', 'UNK'),
126
- 'Precision': precision,
127
- 'Model Size': model_size_numeric,
128
- **formatted_scores_tasks
 
 
129
  }
130
- data_tasks.append(data_entry_tasks)
131
-
132
- df_3c3h = pd.DataFrame(data_3c3h)
133
- df_tasks = pd.DataFrame(data_tasks)
134
-
135
- score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
136
- df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
137
-
138
- max_model_size_value = 1000
139
- df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
140
-
141
- if '3C3H Score' in df_3c3h.columns:
142
- df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
143
- df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
144
- else:
145
- df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
146
-
147
- task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
148
- if task_columns:
149
- df_tasks[task_columns] = df_tasks[task_columns].round(4)
150
-
151
- df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
152
-
153
- if task_columns:
154
- first_task = task_columns[0]
155
- df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
156
- df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
157
  else:
158
- df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
159
- df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
160
-
 
 
161
  return df_3c3h, df_tasks, task_columns
162
 
163
 
@@ -216,114 +422,198 @@ def load_if_data():
216
  return df
217
 
218
 
219
- def submit_model(model_name, revision, precision, params, license, modality):
220
- df_3c3h, df_tasks, _ = load_results()
221
- existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
 
 
 
222
 
223
- if precision == 'Missing':
224
- precision = None
 
 
 
 
 
 
225
  else:
226
- precision = precision.strip().lower()
227
-
228
- df_pending = load_requests('pending')
229
- df_finished = load_requests('finished')
230
-
231
- model_exists_in_results = (
232
- (existing_models_results['Model Name'] == model_name) &
233
- (existing_models_results['Revision'] == revision) &
234
- (existing_models_results['Precision'] == precision)
235
- ).any()
236
- if model_exists_in_results:
237
- return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
238
-
239
- if not df_pending.empty:
240
- existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
241
- model_exists_in_pending = (
242
- (existing_models_pending['model_name'] == model_name) &
243
- (existing_models_pending['revision'] == revision) &
244
- (existing_models_pending['precision'] == precision)
245
- ).any()
246
- if model_exists_in_pending:
247
- return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
248
-
249
- if not df_finished.empty:
250
- existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
251
- model_exists_in_finished = (
252
- (existing_models_finished['model_name'] == model_name) &
253
- (existing_models_finished['revision'] == revision) &
254
- (existing_models_finished['precision'] == precision)
255
- ).any()
256
- if model_exists_in_finished:
257
- return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
258
 
259
  api = HfApi()
 
 
260
  try:
261
  _ = api.model_info(model_name)
262
  except Exception:
263
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
264
 
265
- status = "PENDING"
266
- submission = {
267
- "model_name": model_name,
268
- "license": license,
269
- "revision": revision,
270
- "precision": precision,
271
- "params": params,
272
- "status": status,
273
- "modality": modality
274
- }
275
- submission_json = json.dumps(submission, indent=2)
276
-
277
- org_model = model_name.split('/')
278
  if len(org_model) != 2:
279
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
280
  org, model_id = org_model
281
- precision_str = precision if precision else 'Missing'
282
- file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
283
 
284
- try:
285
- hf_api_token = os.environ.get('HF_API_TOKEN', None)
286
- api.upload_file(
287
- path_or_fileobj=submission_json.encode('utf-8'),
288
- path_in_repo=file_path_in_repo,
289
- repo_id=DATASET_REPO_ID,
290
- repo_type="dataset",
291
- token=hf_api_token
 
 
292
  )
293
- except Exception as e:
294
- return f"**Error: Could not submit the model. {str(e)}**"
 
 
 
 
 
295
 
296
- return f"**Model '{model_name}' has been submitted for evaluation.**"
297
 
298
 
299
- def load_requests(status_folder):
 
 
 
 
300
  api = HfApi()
301
  requests_data = []
302
- folder_path_in_repo = status_folder
303
 
304
- hf_api_token = os.environ.get('HF_API_TOKEN', None)
305
 
306
  try:
307
  files_info = api.list_repo_files(
308
- repo_id=DATASET_REPO_ID,
309
  repo_type="dataset",
310
- token=hf_api_token
311
  )
312
  except Exception as e:
313
- print(f"Error accessing dataset repository: {e}")
314
  return pd.DataFrame()
315
 
316
- files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
 
 
317
 
318
  for file_path in files_in_folder:
319
  try:
320
  local_file_path = hf_hub_download(
321
- repo_id=DATASET_REPO_ID,
322
  filename=file_path,
323
  repo_type="dataset",
324
- token=hf_api_token
325
  )
326
- with open(local_file_path, 'r') as f:
327
  request = json.load(f)
328
  requests_data.append(request)
329
  except Exception as e:
@@ -334,43 +624,69 @@ def load_requests(status_folder):
334
  return df
335
 
336
 
337
- def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
338
- df_ = load_results()[0].copy()
 
 
339
  if min_size > max_size:
340
  min_size, max_size = max_size, min_size
 
 
341
  if search_query:
342
- df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
 
 
343
  if precision_filters:
344
- include_missing = 'Missing' in precision_filters
345
- selected_precisions = [p for p in precision_filters if p != 'Missing']
346
  if include_missing:
347
  df_ = df_[
348
- (df_['Precision'].isin(selected_precisions)) |
349
- (df_['Precision'] == 'UNK') |
350
- (df_['Precision'].isna())
351
  ]
352
  else:
353
- df_ = df_[df_['Precision'].isin(selected_precisions)]
 
 
354
  if license_filters:
355
- include_missing = 'Missing' in license_filters
356
- selected_licenses = [l for l in license_filters if l != 'Missing']
357
  if include_missing:
358
  df_ = df_[
359
- (df_['License'].isin(selected_licenses)) |
360
- (df_['License'] == 'UNK') |
361
- (df_['License'].isna())
362
  ]
363
  else:
364
- df_ = df_[df_['License'].isin(selected_licenses)]
365
- df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
366
- if 'Rank' in df_.columns:
367
- df_ = df_.drop(columns=['Rank'])
 
 
 
 
368
  df_ = df_.reset_index(drop=True)
369
- df_.insert(0, 'Rank', range(1, len(df_)+1))
 
370
  fixed_column_order = [
371
  "Rank",
 
372
  "Model Name",
373
  "3C3H Score",
 
374
  "Correctness",
375
  "Completeness",
376
  "Conciseness",
@@ -380,52 +696,81 @@ def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filte
380
  "Revision",
381
  "License",
382
  "Precision",
383
- "Model Size"
384
  ]
385
 
386
- selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
 
 
 
 
387
 
388
  return df_[selected_cols]
389
 
390
 
391
- def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size, task_columns):
392
- df_ = load_results()[1].copy()
 
 
393
  if min_size > max_size:
394
  min_size, max_size = max_size, min_size
 
395
  if search_query:
396
- df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
 
397
  if precision_filters:
398
- include_missing = 'Missing' in precision_filters
399
- selected_precisions = [p for p in precision_filters if p != 'Missing']
400
  if include_missing:
401
  df_ = df_[
402
- (df_['Precision'].isin(selected_precisions)) |
403
- (df_['Precision'] == 'UNK') |
404
- (df_['Precision'].isna())
405
  ]
406
  else:
407
- df_ = df_[df_['Precision'].isin(selected_precisions)]
 
408
  if license_filters:
409
- include_missing = 'Missing' in license_filters
410
- selected_licenses = [l for l in license_filters if l != 'Missing']
411
  if include_missing:
412
  df_ = df_[
413
- (df_['License'].isin(selected_licenses)) |
414
- (df_['License'] == 'UNK') |
415
- (df_['License'].isna())
416
  ]
417
  else:
418
- df_ = df_[df_['License'].isin(selected_licenses)]
419
- df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
420
- if 'Rank' in df_.columns:
421
- df_ = df_.drop(columns=['Rank'])
 
 
 
 
 
 
422
  if task_columns:
423
  first_task = task_columns[0]
424
- df_ = df_.sort_values(by=first_task, ascending=False)
 
 
 
425
  else:
426
- df_ = df_.sort_values(by='Model Name', ascending=True)
 
427
  df_ = df_.reset_index(drop=True)
428
- df_.insert(0, 'Rank', range(1, len(df_)+1))
 
429
  fixed_column_order = [
430
  "Rank",
431
  "Model Name",
@@ -436,10 +781,167 @@ def filter_df_tasks(search_query, selected_cols, precision_filters, license_filt
436
  "Revision",
437
  "License",
438
  "Precision",
439
- "Model Size"
 
 
440
  ]
441
 
442
- selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
 
 
443
  return df_[selected_cols]
444
 
445
 
@@ -454,88 +956,138 @@ def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size
454
 
455
  # Search by model name
456
  if search_query:
457
- df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
458
 
459
  # Filter by Family only (Creator and Type filters removed)
460
  if family_filters:
461
- df_ = df_[df_['Family'].isin(family_filters)]
462
 
463
  # Filter by Model Size
464
- df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
 
 
465
 
466
- # Re-rank
467
- if 'Rank' in df_.columns:
468
- df_ = df_.drop(columns=['Rank'])
469
  df_ = df_.reset_index(drop=True)
470
- df_.insert(0, 'Rank', range(1, len(df_)+1))
471
 
472
  fixed_column_order = [
473
  "Rank",
474
  "Model Name",
475
- "Creator",
476
- "Family",
477
- "Type",
478
  "Average Accuracy (Ar)",
479
  "Ar Prompt-lvl",
480
  "Ar Instruction-lvl",
481
  "Average Accuracy (En)",
482
  "En Prompt-lvl",
483
  "En Instruction-lvl",
 
 
 
484
  "Size (B)",
485
  "Base Model",
486
  "Context Window",
487
- "Lang."
488
  ]
489
 
490
- selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
 
 
491
  return df_[selected_cols]
492
 
493
 
494
  def main():
495
- df_3c3h, df_tasks, task_columns = load_results()
 
 
496
  df_if = load_if_data() # Instruction Following DF
497
 
498
- # Setup precision/license options for the 3C3H scoreboard
499
- precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
500
- precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
501
- precision_options_3c3h.append('Missing')
502
-
503
- license_options_3c3h = sorted(df_3c3h['License'].dropna().unique().tolist())
504
- license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
505
- license_options_3c3h.append('Missing')
506
-
507
- # Setup precision/license options for tasks scoreboard
508
- precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
509
- precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
510
- precision_options_tasks.append('Missing')
511
-
512
- license_options_tasks = sorted(df_tasks['License'].dropna().unique().tolist())
513
- license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
514
- license_options_tasks.append('Missing')
515
-
516
- # Model size range for 3C3H scoreboard
517
- min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
518
- max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
 
 
519
 
520
- # Model size range for tasks scoreboard
521
- min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
522
- max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
 
 
523
 
524
- # Column choices for 3C3H
525
- column_choices_3c3h = [col for col in df_3c3h.columns.tolist() if col != 'Model Size Filter']
 
 
526
 
527
- # Column choices for tasks
528
- column_choices_tasks = [col for col in df_tasks.columns.tolist() if col != 'Model Size Filter']
 
 
 
529
 
530
- # Now for instruction-following
531
- family_options_if = sorted(df_if['Family'].dropna().unique().tolist())
532
- min_model_size_if = int(df_if['Model Size Filter'].min())
533
- max_model_size_if = int(df_if['Model Size Filter'].max())
534
 
535
- #
536
- # IMPORTANT: Reorder the columns for the Instruction-Following leaderboard
537
- # Define the full order and the default visible columns separately.
538
- #
539
  all_if_columns = [
540
  "Rank",
541
  "Model Name",
@@ -551,7 +1103,7 @@ def main():
551
  "Size (B)",
552
  "Base Model",
553
  "Context Window",
554
- "Lang."
555
  ]
556
  default_if_columns = [
557
  "Rank",
@@ -559,209 +1111,439 @@ def main():
559
  "Average Accuracy (Ar)",
560
  "Ar Prompt-lvl",
561
  "Ar Instruction-lvl",
562
- "Average Accuracy (En)"
563
  ]
564
-
565
  with gr.Blocks() as demo:
566
  gr.HTML(HEADER)
567
 
568
  with gr.Tabs():
569
  #
570
- # AL Leaderboards Tab
571
  #
572
  with gr.Tab("AL Leaderboards 🏅"):
573
- # -------------------------
574
- # Sub-Tab: AraGen Leaderboards
575
- # -------------------------
576
- with gr.Tab("🐪 AraGen Leaderboards"):
577
- with gr.Tabs():
578
- # 3C3H Scores
579
- with gr.Tab("3C3H Scores"):
580
- with gr.Accordion("⚙️ Filters", open=False):
581
- with gr.Row():
582
- search_box_3c3h = gr.Textbox(
583
- placeholder="Search for models...",
584
- label="Search",
585
- interactive=True
586
- )
587
- with gr.Row():
588
- column_selector_3c3h = gr.CheckboxGroup(
589
- choices=column_choices_3c3h,
590
- value=[
591
- 'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
592
- 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
593
- ],
594
- label="Select columns to display"
595
- )
596
- with gr.Row():
597
- license_filter_3c3h = gr.CheckboxGroup(
598
- choices=license_options_3c3h,
599
- value=license_options_3c3h.copy(),
600
- label="Filter by License"
601
- )
602
- precision_filter_3c3h = gr.CheckboxGroup(
603
- choices=precision_options_3c3h,
604
- value=precision_options_3c3h.copy(),
605
- label="Filter by Precision"
606
- )
607
- with gr.Row():
608
- model_size_min_filter_3c3h = gr.Slider(
609
- minimum=min_model_size_3c3h,
610
- maximum=max_model_size_3c3h,
611
- value=min_model_size_3c3h,
612
- step=1,
613
- label="Minimum Model Size",
614
- interactive=True
615
- )
616
- model_size_max_filter_3c3h = gr.Slider(
617
- minimum=min_model_size_3c3h,
618
- maximum=max_model_size_3c3h,
619
- value=max_model_size_3c3h,
620
- step=1,
621
- label="Maximum Model Size",
622
- interactive=True
623
- )
624
- leaderboard_3c3h = gr.Dataframe(
625
- df_3c3h[[
626
- 'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
627
- 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
628
- ]],
629
- interactive=False
630
- )
631
- filter_inputs_3c3h = [
632
- search_box_3c3h, column_selector_3c3h,
633
- precision_filter_3c3h, license_filter_3c3h,
634
- model_size_min_filter_3c3h, model_size_max_filter_3c3h
635
- ]
636
- search_box_3c3h.submit(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
637
- for component in filter_inputs_3c3h:
638
- component.change(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
639
-
640
- # Tasks Scores
641
- with gr.Tab("Tasks Scores"):
642
- gr.Markdown("This Table is sorted based on the First Task (Question Answering)")
643
- with gr.Accordion("⚙️ Filters", open=False):
644
- with gr.Row():
645
- search_box_tasks = gr.Textbox(
646
- placeholder="Search for models...",
647
- label="Search",
648
- interactive=True
649
- )
650
- with gr.Row():
651
- column_selector_tasks = gr.CheckboxGroup(
652
- choices=column_choices_tasks,
653
- value=['Rank', 'Model Name'] + task_columns,
654
- label="Select columns to display"
655
- )
656
- with gr.Row():
657
- license_filter_tasks = gr.CheckboxGroup(
658
- choices=license_options_tasks,
659
- value=license_options_tasks.copy(),
660
- label="Filter by License"
661
- )
662
- precision_filter_tasks = gr.CheckboxGroup(
663
- choices=precision_options_tasks,
664
- value=precision_options_tasks.copy(),
665
- label="Filter by Precision"
666
- )
667
- with gr.Row():
668
- model_size_min_filter_tasks = gr.Slider(
669
- minimum=min_model_size_tasks,
670
- maximum=max_model_size_tasks,
671
- value=min_model_size_tasks,
672
- step=1,
673
- label="Minimum Model Size",
674
- interactive=True
675
- )
676
- model_size_max_filter_tasks = gr.Slider(
677
- minimum=min_model_size_tasks,
678
- maximum=max_model_size_tasks,
679
- value=max_model_size_tasks,
680
- step=1,
681
- label="Maximum Model Size",
682
- interactive=True
683
  )
684
- leaderboard_tasks = gr.Dataframe(
685
- df_tasks[['Rank', 'Model Name'] + task_columns],
686
- interactive=False
687
- )
688
- filter_inputs_tasks = [
689
- search_box_tasks, column_selector_tasks,
690
- precision_filter_tasks, license_filter_tasks,
691
- model_size_min_filter_tasks, model_size_max_filter_tasks
692
- ]
693
- search_box_tasks.submit(
694
- lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
695
- inputs=filter_inputs_tasks,
696
- outputs=leaderboard_tasks
697
- )
698
- for component in filter_inputs_tasks:
699
- component.change(
700
- lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
 
 
701
  inputs=filter_inputs_tasks,
702
- outputs=leaderboard_tasks
703
  )
 
 
 
 
 
 
 
 
704
 
705
- # -------------------------
706
- # Sub-Tab: Instruction Following Leaderboard
707
- # -------------------------
708
- with gr.Tab("🗡️ Instruction Following Leaderboard"):
709
- with gr.Accordion("⚙️ Filters", open=False):
710
- with gr.Row():
711
- search_box_if = gr.Textbox(
712
- placeholder="Search for models...",
713
- label="Search",
714
- interactive=True
715
- )
716
- with gr.Row():
717
- column_selector_if = gr.CheckboxGroup(
718
- choices=all_if_columns,
719
- value=default_if_columns,
720
- label="Select columns to display"
721
- )
722
- with gr.Row():
723
- family_filter_if = gr.CheckboxGroup(
724
- choices=family_options_if,
725
- value=family_options_if.copy(),
726
- label="Filter by Family"
 
 
727
  )
728
- with gr.Row():
729
- model_size_min_filter_if = gr.Slider(
730
- minimum=min_model_size_if,
731
- maximum=max_model_size_if,
732
- value=min_model_size_if,
733
- step=1,
734
- label="Minimum Model Size",
735
- interactive=True
 
 
736
  )
737
- model_size_max_filter_if = gr.Slider(
738
- minimum=min_model_size_if,
739
- maximum=max_model_size_if,
740
- value=max_model_size_if,
741
- step=1,
742
- label="Maximum Model Size",
743
- interactive=True
 
 
744
  )
745
- leaderboard_if = gr.Dataframe(
746
- df_if[default_if_columns],
747
- interactive=False
748
- )
749
- filter_inputs_if = [
750
- search_box_if, column_selector_if,
751
- family_filter_if,
752
- model_size_min_filter_if, model_size_max_filter_if
753
- ]
754
- search_box_if.submit(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
755
- for component in filter_inputs_if:
756
- component.change(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
757
 
758
  #
759
  # Submit Tab
760
  #
761
  with gr.Tab("Submit Here 📝"):
762
- df_pending = load_requests('pending')
763
- df_finished = load_requests('finished')
764
- df_failed = load_requests('failed')
 
 
 
 
 
 
 
 
 
765
 
766
  gr.Markdown(ABOUT_SECTION)
767
 
@@ -769,53 +1551,119 @@ def main():
769
  with gr.Column():
770
  model_name_input = gr.Textbox(
771
  label="Model Name",
772
- placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
 
 
 
773
  )
774
- revision_input = gr.Textbox(label="Revision", placeholder="main", value="main")
775
  precision_input = gr.Dropdown(
776
  choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
777
  label="Precision",
778
- value="float16"
779
  )
780
  params_input = gr.Textbox(
781
  label="Params",
782
- placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
783
  )
784
  license_input = gr.Textbox(
785
  label="License",
786
  placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
787
- value="Open"
788
  )
789
  modality_input = gr.Radio(
790
  choices=["Text"],
791
  label="Modality",
792
- value="Text"
 
 
 
 
 
793
  )
794
  submit_button = gr.Button("Submit Model")
795
  submission_result = gr.Markdown()
796
  submit_button.click(
797
  submit_model,
798
  inputs=[
799
- model_name_input, revision_input, precision_input,
800
- params_input, license_input, modality_input
 
 
 
 
 
801
  ],
802
- outputs=submission_result
803
  )
804
 
805
  gr.Markdown("## Evaluation Status")
806
- with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
807
- if not df_pending.empty:
808
- gr.Dataframe(df_pending)
 
 
809
  else:
810
  gr.Markdown("No pending evaluations.")
811
- with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
812
- if not df_finished.empty:
813
- gr.Dataframe(df_finished)
 
 
814
  else:
815
  gr.Markdown("No finished evaluations.")
816
- with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
817
- if not df_failed.empty:
818
- gr.Dataframe(df_failed)
 
 
819
  else:
820
  gr.Markdown("No failed evaluations.")
821
 
@@ -827,7 +1675,7 @@ def main():
827
  label=CITATION_BUTTON_LABEL,
828
  lines=8,
829
  elem_id="citation-button",
830
- show_copy_button=True
831
  )
832
 
833
  gr.HTML(BOTTOM_LOGO)
 
1
  import os
2
  import json
3
+ import math
4
  import numpy as np
5
  import pandas as pd
6
  import gradio as gr
 
8
 
9
 
10
  OWNER = "inceptionai"
11
+
12
+ ARAGEN_REQUESTS_REPO_ID = f"{OWNER}/aragen-requests-dataset"
13
+ HINDIGEN_REQUESTS_REPO_ID = f"{OWNER}/hindigen-requests-dataset"
14
+ IFEVAL_REQUESTS_REPO_ID = f"{OWNER}/arabicifeval-requests-dataset"
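# Illustrative sketch (hypothetical values): submit_model() below uploads one
# pending-request JSON per selected leaderboard to the matching repo above, and
# load_requests() reads them back from the pending/, finished/ and failed/ folders.
# A request file for AraGen, using the Submit-tab defaults, would look roughly like:
#   {
#     "model_name": "inceptionai/jais-family-30b-8k",
#     "license": "Open",
#     "revision": "main",
#     "precision": "float16",
#     "params": "30",
#     "status": "PENDING",
#     "modality": "Text",
#     "leaderboard": "AraGen"
#   }
# stored under: pending/inceptionai/jais-family-30b-8k_eval_request_main_float16.json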
15
 
16
 
17
  HEADER = """
18
  <center>
19
  <br></br>
20
+ <h1>Multilingual Leaderboards 🌍</h1>
21
+ <h2>Generative Evaluation for the Global South</h2>
22
  <br></br>
23
  </center>
24
  """
 
26
  ABOUT_SECTION = """
27
  ## About
28
 
29
+ In our `12-24` release, we introduced the **AraGen Benchmark**, along with the **3C3H** evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H [here](https://huggingface.co/blog/leaderboard-3c3h-aragen). The first versions of the benchmark, **AraGen-12-24** and **AraGen-03-25 (v2)**, are publicly available in the [`inceptionai/AraGen`](https://huggingface.co/datasets/inceptionai/AraGen) dataset. The current AraGen leaderboard in this Space is powered by **AraGen-v3**.
30
+
31
+ Building on that foundation, we extend our evaluation beyond Arabic, introducing **HindiGen**, a generative benchmark for Hindi that will follow the same release philosophy as AraGen. The current **HindiGen-v1** powers the HindiGen leaderboards here; a future **HindiGen-v2** release will be publicly shared along with the v1 dataset.
32
+
33
+ In this release, we present three main leaderboards:
34
 
35
+ **AraGen-v3:**
36
 
37
+ - The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region, history, politics, cuisine, and more. By leveraging **3C3H** as an evaluation metric—which assesses a model's output across six dimensions: Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness—the leaderboard offers a comprehensive and holistic evaluation of a model’s chat capabilities and its ability to generate human-like and ethically responsible content.
38
 
39
+ **HindiGen-v1:**
40
 
41
+ - The HindiGen Benchmark evaluates Chat/Instruct LLMs on Hindi generative tasks such as question answering, grammar, and safety. It follows the same 3C3H evaluation methodology and bootstrapped confidence intervals, enabling statistically grounded comparisons between models on culturally and linguistically rich Hindi content.
42
+
43
+ **Instruction Following (IFEval – Arabic & English):**
44
 
45
  - We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.
46
 
 
50
 
51
  ### Contact
52
 
53
+ For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
54
  """
55
 
56
  BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
57
 
58
  CITATION_BUTTON_TEXT = """
59
+ @misc{leaderboards,
60
+ author = {El Filali, Ali and Albarri, Sarah and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav and Abouelseoud, Arwa},
61
+ title = {Multilingual Leaderboards: Generative Evaluation for the Global South},
62
  year = {2025},
63
  publisher = {Inception},
64
+ howpublished = "url{https://huggingface.co/spaces/inceptionai/Leaderboards}"
65
  }
66
  """
67
 
 
70
  """
71
 
72
 
73
+ def extract_score_value(entry):
74
+ """
75
+ Helper to extract (value, lower, upper) from both old v2 format (float)
76
+ and new v3/v1 formats (dict with "value"/"lower"/"upper").
77
+ All values are returned in [0, 1] space; caller can convert to percentages.
78
+
79
+ We use the "value" field as the point estimate.
80
+ """
81
+ if entry is None:
82
+ return (math.nan, math.nan, math.nan)
83
+
84
+ # Old format: just a float
85
+ if isinstance(entry, (int, float)):
86
+ v = float(entry)
87
+ return (v, math.nan, math.nan)
88
+
89
+ # New format: dict with "value", "lower", "upper"
90
+ if isinstance(entry, dict):
91
+ v = float(entry.get("value", math.nan))
92
+ lower = entry.get("lower", math.nan)
93
+ upper = entry.get("upper", math.nan)
94
+ lower = float(lower) if isinstance(lower, (int, float)) else math.nan
95
+ upper = float(upper) if isinstance(upper, (int, float)) else math.nan
96
+ return (v, lower, upper)
97
+
98
+ return (math.nan, math.nan, math.nan)
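# Illustrative sketch (made-up numbers): how the two result formats parse.
#   extract_score_value(0.4321)
#   -> (0.4321, nan, nan)                    # old v2 format: bare float, no CI bounds
#   extract_score_value({"value": 0.4321, "lower": 0.41, "upper": 0.45})
#   -> (0.4321, 0.41, 0.45)                  # new v3/v1 format with CI bounds
# load_results() below multiplies these [0, 1] values by 100 for display.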
99
+
100
+
101
+ def compute_leaderboard_3c3h(df_3c3h_base: pd.DataFrame) -> pd.DataFrame:
102
+ """
103
+ Build the 3C3H leaderboard with:
104
+ - Rank (by 3C3H Score)
105
+ - Rank Spread (based on 3C3H Score CI)
106
+ - 95% CI (±) for 3C3H Score (only)
107
+ - Model Size Filter
108
+
109
+ All scores are in percentage space.
110
+ """
111
+ df = df_3c3h_base.copy()
112
+
113
+ # Model size filter helper
114
+ max_model_size_value = 1000
115
+ df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
116
+
117
+ # Sort & rank by 3C3H Score (point estimate)
118
+ if "3C3H Score" in df.columns:
119
+ df = df.sort_values(by="3C3H Score", ascending=False)
120
+ df = df.reset_index(drop=True)
121
+ df.insert(0, "Rank", range(1, len(df) + 1))
122
+
123
+ # Rank Spread based on 3C3H Score CI
124
+ main_col = "3C3H Score"
125
+ lower_col = "3C3H Score Lower"
126
+ upper_col = "3C3H Score Upper"
127
+
128
+ # Effective lower/upper: if not present, fall back to point estimate
129
+ if lower_col in df.columns:
130
+ lower_eff = df[lower_col].copy()
131
+ else:
132
+ lower_eff = df[main_col].copy()
133
+
134
+ if upper_col in df.columns:
135
+ upper_eff = df[upper_col].copy()
136
+ else:
137
+ upper_eff = df[main_col].copy()
138
+
139
+ # order of base scenario: all models at their point estimates (value-based)
140
+ sort_desc = df.sort_values(by=main_col, ascending=False)
141
+ score_order = sort_desc[main_col].values # descending
142
+
143
+ def rank_position(x, order):
144
+ """
145
+ Given a value x and a descending array 'order',
146
+ return the rank index where x would land
147
+ if all others stayed as in 'order'.
148
+
149
+ Rank = 1 + number of scores strictly greater than x.
150
+ """
151
+ if np.isnan(x):
152
+ return math.nan
153
+
154
+ # Ignore NaNs in the score order
155
+ valid = order[~np.isnan(order)]
156
+ if valid.size == 0:
157
+ return math.nan
158
+
159
+ # 'valid' is descending; count how many scores are strictly greater than x
160
+ num_greater = np.sum(valid > x)
161
+ rank = num_greater + 1
162
+
163
+ # Clamp rank to [1, len(valid)] for numerical safety
164
+ if rank < 1:
165
+ rank = 1
166
+ elif rank > len(valid):
167
+ rank = len(valid)
168
+
169
+ return int(rank)
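# Illustrative sketch (hypothetical scores, already in percentage space):
#   score_order = np.array([70.0, 65.0, 60.0])   # descending point estimates
#   rank_position(68.0, score_order) -> 2        # only 70.0 is strictly greater
#   rank_position(58.0, score_order) -> 3        # all three greater -> 4, clamped to 3
# A model with a 3C3H Score CI of [58.0, 68.0] thus gets Rank Spread "2 <--> 3".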
170
+
171
+ best_ranks = []
172
+ worst_ranks = []
173
+ for low, high in zip(lower_eff.values, upper_eff.values):
174
+ best = rank_position(high, score_order) # optimistic: use upper bound
175
+ worst = rank_position(low, score_order) # pessimistic: use lower bound
176
+ best_ranks.append(best)
177
+ worst_ranks.append(worst)
178
+
179
+ spread = []
180
+ for b, w in zip(best_ranks, worst_ranks):
181
+ if math.isnan(b) or math.isnan(w):
182
+ spread.append("-")
183
+ else:
184
+ spread.append(f"{int(b)} <--> {int(w)}")
185
+ df.insert(1, "Rank Spread", spread)
186
+
187
+ # 95% CI (±) for 3C3H Score only (in percentage space)
188
+ if lower_col in df.columns and upper_col in df.columns:
189
+ ci = (df[upper_col] - df[lower_col]) / 2.0
190
+ df["95% CI (±)"] = ci.round(4)
191
+ else:
192
+ df["95% CI (±)"] = np.nan
193
+
194
+ # Round score columns
195
+ score_columns_3c3h = [
196
+ "3C3H Score",
197
+ "Correctness",
198
+ "Completeness",
199
+ "Conciseness",
200
+ "Helpfulness",
201
+ "Honesty",
202
+ "Harmlessness",
203
+ ]
204
+ for col in score_columns_3c3h:
205
+ if col in df.columns:
206
+ df[col] = df[col].round(4)
207
+
208
+ df["95% CI (±)"] = df["95% CI (±)"].round(4)
209
+
210
+ return df
211
+
212
+
213
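# Usage sketch (illustrative): the AraGen and HindiGen filter helpers further down
# call load_results() once per benchmark, e.g.
#   df_3c3h, df_tasks, task_columns = load_results("aragen")     # AraGen v3, with v2 fallback
#   df_3c3h, df_tasks, task_columns = load_results("hindigen")   # HindiGen v1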
+ def load_results(benchmark="aragen"):
214
  """
215
+ Loads results for the given benchmark.
216
+
217
+ benchmark:
218
+ - "aragen" -> uses aragen_v3_results.json (or v2 fallback)
219
+ - "hindigen" -> uses hindigen_v1_results.json
220
+
221
+ Supports:
222
+ - old v2 format (simple floats)
223
+ - new v3/v1 format (dict with value/lower/upper)
224
+
225
+ Returns:
226
+ df_3c3h : 3C3H leaderboard dataframe (with Rank, Rank Spread, 95% CI (±))
227
+ df_tasks : tasks leaderboard dataframe
228
+ task_columns: list of task score columns
229
  """
230
  current_dir = os.path.dirname(os.path.abspath(__file__))
231
+
232
+ if benchmark == "hindigen":
233
+ results_file = os.path.join(current_dir, "assets", "results", "hindigen_v1_results.json")
234
+ else:
235
+ v3_file = os.path.join(current_dir, "assets", "results", "aragen_v3_results.json")
236
+ v2_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
237
+ if os.path.exists(v3_file):
238
+ results_file = v3_file
239
+ else:
240
+ results_file = v2_file
241
+
242
+ with open(results_file, "r", encoding="utf-8") as f:
243
  data = json.load(f)
244
+
245
+ # Filter out entries that only contain "_last_sync_timestamp"
246
  filtered_data = []
247
  for entry in data:
248
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
249
  continue
250
  filtered_data.append(entry)
 
251
  data = filtered_data
252
+
253
  data_3c3h = []
254
  data_tasks = []
255
+
256
  for model_data in data:
257
+ meta = model_data.get("Meta", {})
258
+ model_name = meta.get("Model Name", "UNK")
259
+ revision = meta.get("Revision", "UNK")
260
+ precision = meta.get("Precision", "UNK")
261
+ license_ = meta.get("License", "UNK")
262
+ params = meta.get("Params", "UNK")
263
+
264
+ # Parse model size
265
  try:
266
  model_size_numeric = float(params)
267
+ except Exception:
268
  model_size_numeric = np.inf
269
+
270
+ # Find the key that holds the scores (e.g. "claude-3-7-sonnet-20250219 Scores", "claude-3.5-sonnet Scores")
271
+ scores_key = None
272
+ for k in model_data.keys():
273
+ if k.endswith("Scores"):
274
+ scores_key = k
275
+ break
276
+
277
+ scores_data = model_data.get(scores_key, {}) if scores_key else {}
278
+ scores_3c3h = scores_data.get("3C3H Scores", {})
279
+ scores_tasks = scores_data.get("Tasks Scores", {})
280
+
281
+ # --- 3C3H entry ---
282
+ entry3 = {
283
+ "Model Name": model_name,
284
+ "Revision": revision,
285
+ "License": license_,
286
+ "Precision": precision,
287
+ "Model Size": model_size_numeric,
 
 
288
  }
289
+
290
+ for metric_name, metric_entry in scores_3c3h.items():
291
+ v, lower, upper = extract_score_value(metric_entry)
292
+ # Point estimate (percentage)
293
+ entry3[metric_name] = v * 100 if not math.isnan(v) else np.nan
294
+
295
+ # Only keep lower/upper for 3C3H Score (for CI & Rank Spread)
296
+ if metric_name == "3C3H Score":
297
+ entry3["3C3H Score Lower"] = (
298
+ lower * 100 if not math.isnan(lower) else np.nan
299
+ )
300
+ entry3["3C3H Score Upper"] = (
301
+ upper * 100 if not math.isnan(upper) else np.nan
302
+ )
303
+
304
+ data_3c3h.append(entry3)
305
+
306
+ # --- Tasks entry ---
307
+ entryt = {
308
+ "Model Name": model_name,
309
+ "Revision": revision,
310
+ "License": license_,
311
+ "Precision": precision,
312
+ "Model Size": model_size_numeric,
313
  }
314
+
315
+ for task_name, task_entry in scores_tasks.items():
316
+ v, _, _ = extract_score_value(task_entry)
317
+ entryt[task_name] = v * 100 if not math.isnan(v) else np.nan
318
+
319
+ data_tasks.append(entryt)
320
+
321
+ df_3c3h_base = pd.DataFrame(data_3c3h)
322
+ df_tasks_base = pd.DataFrame(data_tasks)
323
+
324
+ # Build 3C3H leaderboard (rank, rank spread, CI, size filter)
325
+ df_3c3h = compute_leaderboard_3c3h(df_3c3h_base)
326
+
327
+ # Build tasks leaderboard (no weighted average, no rank spread, no CI)
328
+ if df_tasks_base.empty:
329
+ df_tasks = df_tasks_base.copy()
330
+ task_columns = []
 
 
331
  else:
332
+ meta_cols_tasks = [
333
+ "Model Name",
334
+ "Revision",
335
+ "License",
336
+ "Precision",
337
+ "Model Size",
338
+ ]
339
+ task_columns = [
340
+ col
341
+ for col in df_tasks_base.columns
342
+ if col not in meta_cols_tasks
343
+ ]
344
+
345
+ df_tasks = df_tasks_base.copy()
346
+
347
+ # Round task scores
348
+ if task_columns:
349
+ df_tasks[task_columns] = df_tasks[task_columns].round(4)
350
+
351
+ # Model size filter
352
+ max_model_size_value = 1000
353
+ df_tasks["Model Size Filter"] = df_tasks["Model Size"].replace(
354
+ np.inf, max_model_size_value
355
+ )
356
+
357
+ # Sort & rank: based on the first task (typically Question Answering (QA))
358
+ if task_columns:
359
+ first_task = task_columns[0]
360
+ df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
361
+ else:
362
+ df_tasks = df_tasks.sort_values(by="Model Name", ascending=True)
363
+
364
+ df_tasks = df_tasks.reset_index(drop=True)
365
+ df_tasks.insert(0, "Rank", range(1, len(df_tasks) + 1))
366
+
367
  return df_3c3h, df_tasks, task_columns
368
 
369
 
 
422
  return df
423
 
424
 
425
+ def submit_model(model_name, revision, precision, params, license, modality, leaderboards_selected):
426
+ """
427
+ Submits a model to one or more leaderboards:
428
+ - AraGen -> inceptionai/aragen-requests-dataset
429
+ - HindiGen -> inceptionai/hindigen-requests-dataset
430
+ - IFEval -> inceptionai/arabicifeval-requests-dataset
431
 
432
+ User must choose at least one leaderboard.
433
+ """
434
+ if not leaderboards_selected:
435
+ return "**Error:** You must choose at least one leaderboard (AraGen, HindiGen, and/or IFEval)."
436
+
437
+ # Normalize precision
438
+ if precision == "Missing":
439
+ precision_norm = None
440
  else:
441
+ precision_norm = precision.strip().lower() if precision else None
442
+
443
+ repo_map = {
444
+ "AraGen": ARAGEN_REQUESTS_REPO_ID,
445
+ "HindiGen": HINDIGEN_REQUESTS_REPO_ID,
446
+ "IFEval": IFEVAL_REQUESTS_REPO_ID,
447
+ }
448
+
449
+ # Map leaderboards that use the 3C3H JSON result files (for dedup vs results)
450
+ results_benchmark_map = {
451
+ "AraGen": "aragen",
452
+ "HindiGen": "hindigen",
453
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
  api = HfApi()
456
+
457
+ # Validate model exists on HuggingFace Hub once
458
  try:
459
  _ = api.model_info(model_name)
460
  except Exception:
461
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
462
 
463
+ org_model = model_name.split("/")
 
 
 
 
 
 
 
 
 
 
 
 
464
  if len(org_model) != 2:
465
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
466
  org, model_id = org_model
 
 
467
 
468
+ hf_api_token = os.environ.get("HF_API_TOKEN", None)
469
+
470
+ # Dedup & upload per leaderboard
471
+ success_targets = []
472
+ skipped_targets = []
473
+ errors = []
474
+
475
+ for leaderboard in leaderboards_selected:
476
+ repo_id = repo_map.get(leaderboard)
477
+ if repo_id is None:
478
+ errors.append(f"- Unknown leaderboard: {leaderboard}")
479
+ continue
480
+
481
+ # Deduplicate against existing results (only for AraGen/HindiGen)
482
+ already_evaluated = False
483
+ if leaderboard in results_benchmark_map:
484
+ df_3c3h_lb, _, _ = load_results(results_benchmark_map[leaderboard])
485
+ if not df_3c3h_lb.empty:
486
+ existing_models_results = df_3c3h_lb[["Model Name", "Revision", "Precision"]]
487
+ model_exists_in_results = (
488
+ (existing_models_results["Model Name"] == model_name)
489
+ & (existing_models_results["Revision"] == revision)
490
+ & (existing_models_results["Precision"] == (precision_norm if precision_norm is not None else existing_models_results["Precision"]))
491
+ ).any()
492
+ if model_exists_in_results:
493
+ skipped_targets.append(
494
+ f"- **{leaderboard}**: Model already appears in the leaderboard results."
495
+ )
496
+ already_evaluated = True
497
+
498
+ # Deduplicate against pending/finished requests in this repo
499
+ def load_req(status_folder):
500
+ return load_requests(repo_id, status_folder)
501
+
502
+ df_pending = load_req("pending")
503
+ df_finished = load_req("finished")
504
+
505
+ if not already_evaluated:
506
+ if not df_pending.empty:
507
+ existing_models_pending = df_pending[["model_name", "revision", "precision"]]
508
+ model_exists_in_pending = (
509
+ (existing_models_pending["model_name"] == model_name)
510
+ & (existing_models_pending["revision"] == revision)
511
+ & (existing_models_pending["precision"] == precision_norm)
512
+ ).any()
513
+ if model_exists_in_pending:
514
+ skipped_targets.append(
515
+ f"- **{leaderboard}**: Model is already in pending evaluations."
516
+ )
517
+ already_evaluated = True
518
+
519
+ if not already_evaluated:
520
+ if not df_finished.empty:
521
+ existing_models_finished = df_finished[["model_name", "revision", "precision"]]
522
+ model_exists_in_finished = (
523
+ (existing_models_finished["model_name"] == model_name)
524
+ & (existing_models_finished["revision"] == revision)
525
+ & (existing_models_finished["precision"] == precision_norm)
526
+ ).any()
527
+ if model_exists_in_finished:
528
+ skipped_targets.append(
529
+ f"- **{leaderboard}**: Model has already been evaluated (finished)."
530
+ )
531
+ already_evaluated = True
532
+
533
+ if already_evaluated:
534
+ continue
535
+
536
+ # Prepare submission JSON
537
+ status = "PENDING"
538
+ submission = {
539
+ "model_name": model_name,
540
+ "license": license,
541
+ "revision": revision,
542
+ "precision": precision_norm,
543
+ "params": params,
544
+ "status": status,
545
+ "modality": modality,
546
+ "leaderboard": leaderboard,
547
+ }
548
+ submission_json = json.dumps(submission, indent=2)
549
+
550
+ precision_str = precision_norm if precision_norm else "Missing"
551
+ file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
552
+
553
+ try:
554
+ api.upload_file(
555
+ path_or_fileobj=submission_json.encode("utf-8"),
556
+ path_in_repo=file_path_in_repo,
557
+ repo_id=repo_id,
558
+ repo_type="dataset",
559
+ token=hf_api_token,
560
+ )
561
+ success_targets.append(leaderboard)
562
+ except Exception as e:
563
+ errors.append(f"- **{leaderboard}**: Error while submitting – {str(e)}")
564
+
565
+ # Build user-facing message
566
+ messages = []
567
+ if success_targets:
568
+ messages.append(
569
+ f"✅ Model **'{model_name}'** has been submitted for evaluation to: "
570
+ + ", ".join(f"**{lb}**" for lb in success_targets)
571
+ + "."
572
  )
573
+ if skipped_targets:
574
+ messages.append("⚠️ Skipped submissions:\n" + "\n".join(skipped_targets))
575
+ if errors:
576
+ messages.append("❌ Errors:\n" + "\n".join(errors))
577
+
578
+ if not messages:
579
+ return "**No submissions were made.** Please check if the model is already pending or evaluated."
580
 
581
+ return "\n\n".join(messages)
582
 
583
 
584
+ def load_requests(repo_id, status_folder):
585
+ """
586
+ Loads request JSON files from a given dataset repo and status folder:
587
+ status_folder in {"pending", "finished", "failed"}
588
+ """
589
  api = HfApi()
590
  requests_data = []
 
591
 
592
+ hf_api_token = os.environ.get("HF_API_TOKEN", None)
593
 
594
  try:
595
  files_info = api.list_repo_files(
596
+ repo_id=repo_id,
597
  repo_type="dataset",
598
+ token=hf_api_token,
599
  )
600
  except Exception as e:
601
+ print(f"Error accessing dataset repository {repo_id}: {e}")
602
  return pd.DataFrame()
603
 
604
+ files_in_folder = [
605
+ f for f in files_info if f.startswith(f"{status_folder}/") and f.endswith(".json")
606
+ ]
607
 
608
  for file_path in files_in_folder:
609
  try:
610
  local_file_path = hf_hub_download(
611
+ repo_id=repo_id,
612
  filename=file_path,
613
  repo_type="dataset",
614
+ token=hf_api_token,
615
  )
616
+ with open(local_file_path, "r") as f:
617
  request = json.load(f)
618
  requests_data.append(request)
619
  except Exception as e:
 
624
  return df
625
 
626
 
627
+ # ---------- FILTER HELPERS (AraGen) ----------
628
+
629
+ def filter_df_3c3h(
630
+ search_query,
631
+ selected_cols,
632
+ precision_filters,
633
+ license_filters,
634
+ min_size,
635
+ max_size,
636
+ ):
637
+ # AraGen 3C3H
638
+ df_3c3h, _, _ = load_results("aragen")
639
+ df_ = df_3c3h.copy()
640
+
641
+ # Sanity check on size range
642
  if min_size > max_size:
643
  min_size, max_size = max_size, min_size
644
+
645
+ # Text search
646
  if search_query:
647
+ df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
648
+
649
+ # Precision filtering
650
  if precision_filters:
651
+ include_missing = "Missing" in precision_filters
652
+ selected_precisions = [p for p in precision_filters if p != "Missing"]
653
  if include_missing:
654
  df_ = df_[
655
+ (df_["Precision"].isin(selected_precisions))
656
+ | (df_["Precision"] == "UNK")
657
+ | (df_["Precision"].isna())
658
  ]
659
  else:
660
+ df_ = df_[df_["Precision"].isin(selected_precisions)]
661
+
662
+ # License filtering
663
  if license_filters:
664
+ include_missing = "Missing" in license_filters
665
+ selected_licenses = [l for l in license_filters if l != "Missing"]
666
  if include_missing:
667
  df_ = df_[
668
+ (df_["License"].isin(selected_licenses))
669
+ | (df_["License"] == "UNK")
670
+ | (df_["License"].isna())
671
  ]
672
  else:
673
+ df_ = df_[df_["License"].isin(selected_licenses)]
674
+
675
+ # Model size filter
676
+ df_ = df_[
677
+ (df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
678
+ ]
679
+
680
+ # Keep global Rank / Rank Spread; just reset the index
681
  df_ = df_.reset_index(drop=True)
682
+
683
+ # Column ordering
684
  fixed_column_order = [
685
  "Rank",
686
+ "Rank Spread",
687
  "Model Name",
688
  "3C3H Score",
689
+ "95% CI (±)",
690
  "Correctness",
691
  "Completeness",
692
  "Conciseness",
 
696
  "Revision",
697
  "License",
698
  "Precision",
699
+ "Model Size",
700
  ]
701
 
702
+ selected_cols = [
703
+ col
704
+ for col in fixed_column_order
705
+ if col in selected_cols and col in df_.columns
706
+ ]
707
 
708
  return df_[selected_cols]
709
 
710
 
711
+ def filter_df_tasks(
712
+ search_query,
713
+ selected_cols,
714
+ precision_filters,
715
+ license_filters,
716
+ min_size,
717
+ max_size,
718
+ task_columns,
719
+ ):
720
+ # AraGen tasks
721
+ _, df_tasks, _ = load_results("aragen")
722
+ df_ = df_tasks.copy()
723
+
724
  if min_size > max_size:
725
  min_size, max_size = max_size, min_size
726
+
727
  if search_query:
728
+ df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
729
+
730
  if precision_filters:
731
+ include_missing = "Missing" in precision_filters
732
+ selected_precisions = [p for p in precision_filters if p != "Missing"]
733
  if include_missing:
734
  df_ = df_[
735
+ (df_["Precision"].isin(selected_precisions))
736
+ | (df_["Precision"] == "UNK")
737
+ | (df_["Precision"].isna())
738
  ]
739
  else:
740
+ df_ = df_[df_["Precision"].isin(selected_precisions)]
741
+
742
  if license_filters:
743
+ include_missing = "Missing" in license_filters
744
+ selected_licenses = [l for l in license_filters if l != "Missing"]
745
  if include_missing:
746
  df_ = df_[
747
+ (df_["License"].isin(selected_licenses))
748
+ | (df_["License"] == "UNK")
749
+ | (df_["License"].isna())
750
  ]
751
  else:
752
+ df_ = df_[df_["License"].isin(selected_licenses)]
753
+
754
+ df_ = df_[
755
+ (df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
756
+ ]
757
+
758
+ # Re-rank within filtered subset using first task as sort key
759
+ if "Rank" in df_.columns:
760
+ df_ = df_.drop(columns=["Rank"])
761
+
762
  if task_columns:
763
  first_task = task_columns[0]
764
+ if first_task in df_.columns:
765
+ df_ = df_.sort_values(by=first_task, ascending=False)
766
+ else:
767
+ df_ = df_.sort_values(by="Model Name", ascending=True)
768
  else:
769
+ df_ = df_.sort_values(by="Model Name", ascending=True)
770
+
771
  df_ = df_.reset_index(drop=True)
772
+ df_.insert(0, "Rank", range(1, len(df_) + 1))
773
+
774
  fixed_column_order = [
775
  "Rank",
776
  "Model Name",
 
781
  "Revision",
782
  "License",
783
  "Precision",
784
+ "Model Size",
785
+ ]
786
+
787
+ selected_cols = [
788
+ col for col in fixed_column_order if col in selected_cols and col in df_.columns
789
+ ]
790
+ return df_[selected_cols]
791
+
792
+
793
+ # ---------- FILTER HELPERS (HindiGen) ----------
794
+
795
+ def filter_df_3c3h_hindigen(
796
+ search_query,
797
+ selected_cols,
798
+ precision_filters,
799
+ license_filters,
800
+ min_size,
801
+ max_size,
802
+ ):
803
+ df_3c3h_hi, _, _ = load_results("hindigen")
804
+ df_ = df_3c3h_hi.copy()
805
+
806
+ if min_size > max_size:
807
+ min_size, max_size = max_size, min_size
808
+
809
+ if search_query:
810
+ df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
811
+
812
+ if precision_filters:
813
+ include_missing = "Missing" in precision_filters
814
+ selected_precisions = [p for p in precision_filters if p != "Missing"]
815
+ if include_missing:
816
+ df_ = df_[
817
+ (df_["Precision"].isin(selected_precisions))
818
+ | (df_["Precision"] == "UNK")
819
+ | (df_["Precision"].isna())
820
+ ]
821
+ else:
822
+ df_ = df_[df_["Precision"].isin(selected_precisions)]
823
+
824
+ if license_filters:
825
+ include_missing = "Missing" in license_filters
826
+ selected_licenses = [l for l in license_filters if l != "Missing"]
827
+ if include_missing:
828
+ df_ = df_[
829
+ (df_["License"].isin(selected_licenses))
830
+ | (df_["License"] == "UNK")
831
+ | (df_["License"].isna())
832
+ ]
833
+ else:
834
+ df_ = df_[df_["License"].isin(selected_licenses)]
835
+
836
+ df_ = df_[
837
+ (df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
838
+ ]
839
+
840
+ df_ = df_.reset_index(drop=True)
841
+
842
+ fixed_column_order = [
843
+ "Rank",
844
+ "Rank Spread",
845
+ "Model Name",
846
+ "3C3H Score",
847
+ "95% CI (±)",
848
+ "Correctness",
849
+ "Completeness",
850
+ "Conciseness",
851
+ "Helpfulness",
852
+ "Honesty",
853
+ "Harmlessness",
854
+ "Revision",
855
+ "License",
856
+ "Precision",
857
+ "Model Size",
858
+ ]
859
+
860
+ selected_cols = [
861
+ col
862
+ for col in fixed_column_order
863
+ if col in selected_cols and col in df_.columns
864
+ ]
865
+
866
+ return df_[selected_cols]
867
+
868
+
869
+ def filter_df_tasks_hindigen(
870
+ search_query,
871
+ selected_cols,
872
+ precision_filters,
873
+ license_filters,
874
+ min_size,
875
+ max_size,
876
+ task_columns,
877
+ ):
878
+ _, df_tasks_hi, _ = load_results("hindigen")
879
+ df_ = df_tasks_hi.copy()
880
+
881
+ if min_size > max_size:
882
+ min_size, max_size = max_size, min_size
883
+
884
+ if search_query:
885
+ df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
886
+
887
+ if precision_filters:
888
+ include_missing = "Missing" in precision_filters
889
+ selected_precisions = [p for p in precision_filters if p != "Missing"]
890
+ if include_missing:
891
+ df_ = df_[
892
+ (df_["Precision"].isin(selected_precisions))
893
+ | (df_["Precision"] == "UNK")
894
+ | (df_["Precision"].isna())
895
+ ]
896
+ else:
897
+ df_ = df_[df_["Precision"].isin(selected_precisions)]
898
+
899
+ if license_filters:
900
+ include_missing = "Missing" in license_filters
901
+ selected_licenses = [l for l in license_filters if l != "Missing"]
902
+ if include_missing:
903
+ df_ = df_[
904
+ (df_["License"].isin(selected_licenses))
905
+ | (df_["License"] == "UNK")
906
+ | (df_["License"].isna())
907
+ ]
908
+ else:
909
+ df_ = df_[df_["License"].isin(selected_licenses)]
910
+
911
+ df_ = df_[
912
+ (df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
913
+ ]
914
+
915
+ if "Rank" in df_.columns:
916
+ df_ = df_.drop(columns=["Rank"])
917
+
918
+ if task_columns:
919
+ first_task = task_columns[0]
920
+ if first_task in df_.columns:
921
+ df_ = df_.sort_values(by=first_task, ascending=False)
922
+ else:
923
+ df_ = df_.sort_values(by="Model Name", ascending=True)
924
+ else:
925
+ df_ = df_.sort_values(by="Model Name", ascending=True)
926
+
927
+ df_ = df_.reset_index(drop=True)
928
+ df_.insert(0, "Rank", range(1, len(df_) + 1))
929
+
930
+ fixed_column_order = [
931
+ "Rank",
932
+ "Model Name",
933
+ "Question Answering (QA)",
934
+ "Grammar",
935
+ "Safety",
936
+ "Revision",
937
+ "License",
938
+ "Precision",
939
+ "Model Size",
940
  ]
941
 
942
+ selected_cols = [
943
+ col for col in fixed_column_order if col in selected_cols and col in df_.columns
944
+ ]
945
  return df_[selected_cols]
946
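All of the 3C3H/task filter helpers above treat the synthetic "Missing" checkbox entry as a request to also keep rows whose Precision or License is recorded as "UNK" or is genuinely empty. A small, self-contained illustration of that convention (toy data, column name as used in the leaderboards):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-b", "model-c", "model-d"],
        "Precision": ["float16", "UNK", None, "bfloat16"],
    }
)

precision_filters = ["float16", "Missing"]  # what the CheckboxGroup might hold
include_missing = "Missing" in precision_filters
selected_precisions = [p for p in precision_filters if p != "Missing"]

mask = df["Precision"].isin(selected_precisions)
if include_missing:
    # "Missing" folds in both the explicit "UNK" marker and true NaN/None values.
    mask = mask | df["Precision"].eq("UNK") | df["Precision"].isna()

print(df.loc[mask, "Model Name"].tolist())  # ['model-a', 'model-b', 'model-c']
```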
 
947
 
 
956
 
957
  # Search by model name
958
  if search_query:
959
+ df_ = df_[df_["Model Name"].str.contains(search_query, case=False, na=False)]
960
 
961
  # Filter by Family only (Creator and Type filters removed)
962
  if family_filters:
963
+ df_ = df_[df_["Family"].isin(family_filters)]
964
 
965
  # Filter by Model Size
966
+ df_ = df_[
967
+ (df_["Model Size Filter"] >= min_size) & (df_["Model Size Filter"] <= max_size)
968
+ ]
969
 
970
+ # Re-rank within the filtered subset
971
+ if "Rank" in df_.columns:
972
+ df_ = df_.drop(columns=["Rank"])
973
  df_ = df_.reset_index(drop=True)
974
+ df_.insert(0, "Rank", range(1, len(df_) + 1))
975
 
976
  fixed_column_order = [
977
  "Rank",
978
  "Model Name",
 
 
 
979
  "Average Accuracy (Ar)",
980
  "Ar Prompt-lvl",
981
  "Ar Instruction-lvl",
982
  "Average Accuracy (En)",
983
  "En Prompt-lvl",
984
  "En Instruction-lvl",
985
+ "Type",
986
+ "Creator",
987
+ "Family",
988
  "Size (B)",
989
  "Base Model",
990
  "Context Window",
991
+ "Lang.",
992
  ]
993
 
994
+ selected_cols = [
995
+ col for col in fixed_column_order if col in selected_cols and col in df_.columns
996
+ ]
997
  return df_[selected_cols]
998
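Every filter helper, including the instruction-following one above, finishes by intersecting the user's column selection with a fixed column order, so the table layout stays stable regardless of the order in which boxes were ticked or which columns a given frame actually has. A stand-alone illustration of that idiom (the lists below are just examples):

```python
fixed_column_order = ["Rank", "Model Name", "Average Accuracy (Ar)", "Size (B)"]

# Whatever order the selection arrives in, the fixed list re-imposes display
# order and silently drops anything the DataFrame does not actually contain.
selected_cols = ["Size (B)", "Rank", "Model Name", "Nonexistent Column"]
df_columns = ["Rank", "Model Name", "Average Accuracy (Ar)", "Size (B)", "Model Size Filter"]

ordered = [
    col
    for col in fixed_column_order
    if col in selected_cols and col in df_columns
]
print(ordered)  # ['Rank', 'Model Name', 'Size (B)']
```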
 
999
 
1000
  def main():
1001
+ # Load AraGen, HindiGen, and IFEval data
1002
+ df_3c3h_ar, df_tasks_ar, task_columns_ar = load_results("aragen")
1003
+ df_3c3h_hi, df_tasks_hi, task_columns_hi = load_results("hindigen")
1004
  df_if = load_if_data() # Instruction Following DF
1005
 
1006
+ # ---------- AraGen options ----------
1007
+ precision_options_3c3h = sorted(df_3c3h_ar["Precision"].dropna().unique().tolist())
1008
+ precision_options_3c3h = [p for p in precision_options_3c3h if p != "UNK"]
1009
+ precision_options_3c3h.append("Missing")
1010
+
1011
+ license_options_3c3h = sorted(df_3c3h_ar["License"].dropna().unique().tolist())
1012
+ license_options_3c3h = [l for l in license_options_3c3h if l != "UNK"]
1013
+ license_options_3c3h.append("Missing")
1014
+
1015
+ precision_options_tasks = sorted(df_tasks_ar["Precision"].dropna().unique().tolist())
1016
+ precision_options_tasks = [p for p in precision_options_tasks if p != "UNK"]
1017
+ precision_options_tasks.append("Missing")
1018
+
1019
+ license_options_tasks = sorted(df_tasks_ar["License"].dropna().unique().tolist())
1020
+ license_options_tasks = [l for l in license_options_tasks if l != "UNK"]
1021
+ license_options_tasks.append("Missing")
1022
+
1023
+ min_model_size_3c3h = int(df_3c3h_ar["Model Size Filter"].min())
1024
+ max_model_size_3c3h = int(df_3c3h_ar["Model Size Filter"].max())
1025
+
1026
+ min_model_size_tasks = int(df_tasks_ar["Model Size Filter"].min())
1027
+ max_model_size_tasks = int(df_tasks_ar["Model Size Filter"].max())
1028
+
1029
+ column_choices_3c3h = [
1030
+ col
1031
+ for col in df_3c3h_ar.columns.tolist()
1032
+ if col
1033
+ not in [
1034
+ "Model Size Filter",
1035
+ "3C3H Score Lower",
1036
+ "3C3H Score Upper",
1037
+ ]
1038
+ ]
1039
 
1040
+ column_choices_tasks = [
1041
+ col
1042
+ for col in df_tasks_ar.columns.tolist()
1043
+ if col != "Model Size Filter"
1044
+ ]
1045
 
1046
+ # ---------- HindiGen options ----------
1047
+ precision_options_3c3h_hi = sorted(df_3c3h_hi["Precision"].dropna().unique().tolist())
1048
+ precision_options_3c3h_hi = [p for p in precision_options_3c3h_hi if p != "UNK"]
1049
+ precision_options_3c3h_hi.append("Missing")
1050
+
1051
+ license_options_3c3h_hi = sorted(df_3c3h_hi["License"].dropna().unique().tolist())
1052
+ license_options_3c3h_hi = [l for l in license_options_3c3h_hi if l != "UNK"]
1053
+ license_options_3c3h_hi.append("Missing")
1054
+
1055
+ precision_options_tasks_hi = sorted(df_tasks_hi["Precision"].dropna().unique().tolist())
1056
+ precision_options_tasks_hi = [p for p in precision_options_tasks_hi if p != "UNK"]
1057
+ precision_options_tasks_hi.append("Missing")
1058
+
1059
+ license_options_tasks_hi = sorted(df_tasks_hi["License"].dropna().unique().tolist())
1060
+ license_options_tasks_hi = [l for l in license_options_tasks_hi if l != "UNK"]
1061
+ license_options_tasks_hi.append("Missing")
1062
+
1063
+ min_model_size_3c3h_hi = int(df_3c3h_hi["Model Size Filter"].min())
1064
+ max_model_size_3c3h_hi = int(df_3c3h_hi["Model Size Filter"].max())
1065
+
1066
+ min_model_size_tasks_hi = int(df_tasks_hi["Model Size Filter"].min())
1067
+ max_model_size_tasks_hi = int(df_tasks_hi["Model Size Filter"].max())
1068
+
1069
+ column_choices_3c3h_hi = [
1070
+ col
1071
+ for col in df_3c3h_hi.columns.tolist()
1072
+ if col
1073
+ not in [
1074
+ "Model Size Filter",
1075
+ "3C3H Score Lower",
1076
+ "3C3H Score Upper",
1077
+ ]
1078
+ ]
1079
 
1080
+ column_choices_tasks_hi = [
1081
+ col
1082
+ for col in df_tasks_hi.columns.tolist()
1083
+ if col != "Model Size Filter"
1084
+ ]
1085
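The Precision and License option lists for both AraGen and HindiGen are built with the same four steps: drop NaNs, take the unique values, remove the "UNK" marker, and append a synthetic "Missing" entry. A small helper of the kind that could fold those repeats together; the function name is hypothetical and the frame below is toy data:

```python
import pandas as pd

def build_filter_options(df: pd.DataFrame, column: str) -> list:
    """Unique, sorted values of `column`, minus 'UNK', plus a 'Missing' entry."""
    options = sorted(df[column].dropna().unique().tolist())
    options = [opt for opt in options if opt != "UNK"]
    options.append("Missing")
    return options

toy = pd.DataFrame({"Precision": ["float16", "bfloat16", "UNK", None, "float16"]})
print(build_filter_options(toy, "Precision"))  # ['bfloat16', 'float16', 'Missing']
```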
 
1086
+ # ---------- IFEval options ----------
1087
+ family_options_if = sorted(df_if["Family"].dropna().unique().tolist())
1088
+ min_model_size_if = int(df_if["Model Size Filter"].min())
1089
+ max_model_size_if = int(df_if["Model Size Filter"].max())
1090
 
1091
  all_if_columns = [
1092
  "Rank",
1093
  "Model Name",
 
1103
  "Size (B)",
1104
  "Base Model",
1105
  "Context Window",
1106
+ "Lang.",
1107
  ]
1108
  default_if_columns = [
1109
  "Rank",
 
1111
  "Average Accuracy (Ar)",
1112
  "Ar Prompt-lvl",
1113
  "Ar Instruction-lvl",
1114
+ "Average Accuracy (En)",
1115
  ]
1116
+
1117
  with gr.Blocks() as demo:
1118
  gr.HTML(HEADER)
1119
 
1120
  with gr.Tabs():
1121
  #
1122
+ # AL Leaderboards Tab (AraGen + IFEval)
1123
  #
1124
  with gr.Tab("AL Leaderboards 🏅"):
1125
+ with gr.Tabs():
1126
+ # -------------------------
1127
+ # Sub-Tab: AraGen Leaderboards
1128
+ # -------------------------
1129
+ with gr.Tab("🐪 AraGen Leaderboards"):
1130
+ with gr.Tabs():
1131
+ # 3C3H Scores
1132
+ with gr.Tab("3C3H Scores"):
1133
+ with gr.Accordion("⚙️ Filters", open=False):
1134
+ with gr.Row():
1135
+ search_box_3c3h = gr.Textbox(
1136
+ placeholder="Search for models...",
1137
+ label="Search",
1138
+ interactive=True,
1139
+ )
1140
+ with gr.Row():
1141
+ column_selector_3c3h = gr.CheckboxGroup(
1142
+ choices=column_choices_3c3h,
1143
+ value=[
1144
+ "Rank",
1145
+ "Rank Spread",
1146
+ "Model Name",
1147
+ "3C3H Score",
1148
+ "95% CI (±)",
1149
+ "Correctness",
1150
+ "Completeness",
1151
+ "Conciseness",
1152
+ "Helpfulness",
1153
+ "Honesty",
1154
+ "Harmlessness",
1155
+ ],
1156
+ label="Select columns to display",
1157
+ )
1158
+ with gr.Row():
1159
+ license_filter_3c3h = gr.CheckboxGroup(
1160
+ choices=license_options_3c3h,
1161
+ value=license_options_3c3h.copy(),
1162
+ label="Filter by License",
1163
+ )
1164
+ precision_filter_3c3h = gr.CheckboxGroup(
1165
+ choices=precision_options_3c3h,
1166
+ value=precision_options_3c3h.copy(),
1167
+ label="Filter by Precision",
1168
+ )
1169
+ with gr.Row():
1170
+ model_size_min_filter_3c3h = gr.Slider(
1171
+ minimum=min_model_size_3c3h,
1172
+ maximum=max_model_size_3c3h,
1173
+ value=min_model_size_3c3h,
1174
+ step=1,
1175
+ label="Minimum Model Size",
1176
+ interactive=True,
1177
+ )
1178
+ model_size_max_filter_3c3h = gr.Slider(
1179
+ minimum=min_model_size_3c3h,
1180
+ maximum=max_model_size_3c3h,
1181
+ value=max_model_size_3c3h,
1182
+ step=1,
1183
+ label="Maximum Model Size",
1184
+ interactive=True,
1185
+ )
1186
+ leaderboard_3c3h = gr.Dataframe(
1187
+ df_3c3h_ar[
1188
+ [
1189
+ "Rank",
1190
+ "Rank Spread",
1191
+ "Model Name",
1192
+ "3C3H Score",
1193
+ "95% CI (±)",
1194
+ "Correctness",
1195
+ "Completeness",
1196
+ "Conciseness",
1197
+ "Helpfulness",
1198
+ "Honesty",
1199
+ "Harmlessness",
1200
+ ]
1201
+ ],
1202
+ interactive=False,
1203
+ )
1204
+ filter_inputs_3c3h = [
1205
+ search_box_3c3h,
1206
+ column_selector_3c3h,
1207
+ precision_filter_3c3h,
1208
+ license_filter_3c3h,
1209
+ model_size_min_filter_3c3h,
1210
+ model_size_max_filter_3c3h,
1211
+ ]
1212
+ search_box_3c3h.submit(
1213
+ filter_df_3c3h,
1214
+ inputs=filter_inputs_3c3h,
1215
+ outputs=leaderboard_3c3h,
1216
+ )
1217
+ for component in filter_inputs_3c3h:
1218
+ component.change(
1219
+ filter_df_3c3h,
1220
+ inputs=filter_inputs_3c3h,
1221
+ outputs=leaderboard_3c3h,
 
1222
  )
1223
+
1224
+ # Tasks Scores
1225
+ with gr.Tab("Tasks Scores"):
1226
+ gr.Markdown(
1227
+ "This table is sorted based on the **first task** "
1228
+ "(e.g., Question Answering (QA))."
1229
+ )
1230
+ with gr.Accordion("⚙️ Filters", open=False):
1231
+ with gr.Row():
1232
+ search_box_tasks = gr.Textbox(
1233
+ placeholder="Search for models...",
1234
+ label="Search",
1235
+ interactive=True,
1236
+ )
1237
+ with gr.Row():
1238
+ column_selector_tasks = gr.CheckboxGroup(
1239
+ choices=column_choices_tasks,
1240
+ value=["Rank", "Model Name"] + task_columns_ar,
1241
+ label="Select columns to display",
1242
+ )
1243
+ with gr.Row():
1244
+ license_filter_tasks = gr.CheckboxGroup(
1245
+ choices=license_options_tasks,
1246
+ value=license_options_tasks.copy(),
1247
+ label="Filter by License",
1248
+ )
1249
+ precision_filter_tasks = gr.CheckboxGroup(
1250
+ choices=precision_options_tasks,
1251
+ value=precision_options_tasks.copy(),
1252
+ label="Filter by Precision",
1253
+ )
1254
+ with gr.Row():
1255
+ model_size_min_filter_tasks = gr.Slider(
1256
+ minimum=min_model_size_tasks,
1257
+ maximum=max_model_size_tasks,
1258
+ value=min_model_size_tasks,
1259
+ step=1,
1260
+ label="Minimum Model Size",
1261
+ interactive=True,
1262
+ )
1263
+ model_size_max_filter_tasks = gr.Slider(
1264
+ minimum=min_model_size_tasks,
1265
+ maximum=max_model_size_tasks,
1266
+ value=max_model_size_tasks,
1267
+ step=1,
1268
+ label="Maximum Model Size",
1269
+ interactive=True,
1270
+ )
1271
+ leaderboard_tasks = gr.Dataframe(
1272
+ df_tasks_ar[["Rank", "Model Name"] + task_columns_ar],
1273
+ interactive=False,
1274
+ )
1275
+ filter_inputs_tasks = [
1276
+ search_box_tasks,
1277
+ column_selector_tasks,
1278
+ precision_filter_tasks,
1279
+ license_filter_tasks,
1280
+ model_size_min_filter_tasks,
1281
+ model_size_max_filter_tasks,
1282
+ ]
1283
+ search_box_tasks.submit(
1284
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(
1285
+ sq, cols, pf, lf, min_val, max_val, task_columns_ar
1286
+ ),
1287
  inputs=filter_inputs_tasks,
1288
+ outputs=leaderboard_tasks,
1289
  )
1290
+ for component in filter_inputs_tasks:
1291
+ component.change(
1292
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(
1293
+ sq, cols, pf, lf, min_val, max_val, task_columns_ar
1294
+ ),
1295
+ inputs=filter_inputs_tasks,
1296
+ outputs=leaderboard_tasks,
1297
+ )
1298
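The extra `task_columns_ar` argument is threaded into `filter_df_tasks` by wrapping the call in a lambda for both the submit and change events. `functools.partial` is a common alternative that avoids repeating the six-parameter signature; below is a self-contained sketch with a stand-in function (an illustration of the pattern, not the app's actual wiring):

```python
from functools import partial

# Stand-in for filter_df_tasks: the trailing task_columns argument is fixed up front.
def filter_stub(search_query, selected_cols, precision_filters,
                license_filters, min_size, max_size, task_columns):
    return f"{len(task_columns)} task columns, query={search_query!r}"

task_columns_ar = ["Question Answering (QA)", "Grammar", "Safety"]
bound_filter = partial(filter_stub, task_columns=task_columns_ar)

# Callers (e.g. event handlers) now pass only the six event inputs.
print(bound_filter("jais", ["Rank", "Model Name"], [], [], 0, 100))
```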
 
1299
+ # -------------------------
1300
+ # Sub-Tab: Instruction Following Leaderboard
1301
+ # -------------------------
1302
+ with gr.Tab("🗡️ Instruction Following Leaderboard"):
1303
+ with gr.Accordion("⚙️ Filters", open=False):
1304
+ with gr.Row():
1305
+ search_box_if = gr.Textbox(
1306
+ placeholder="Search for models...",
1307
+ label="Search",
1308
+ interactive=True,
1309
+ )
1310
+ with gr.Row():
1311
+ column_selector_if = gr.CheckboxGroup(
1312
+ choices=all_if_columns,
1313
+ value=default_if_columns,
1314
+ label="Select columns to display",
1315
+ )
1316
+ with gr.Row():
1317
+ family_filter_if = gr.CheckboxGroup(
1318
+ choices=family_options_if,
1319
+ value=family_options_if.copy(),
1320
+ label="Filter by Family",
1321
+ )
1322
+ with gr.Row():
1323
+ model_size_min_filter_if = gr.Slider(
1324
+ minimum=min_model_size_if,
1325
+ maximum=max_model_size_if,
1326
+ value=min_model_size_if,
1327
+ step=1,
1328
+ label="Minimum Model Size",
1329
+ interactive=True,
1330
+ )
1331
+ model_size_max_filter_if = gr.Slider(
1332
+ minimum=min_model_size_if,
1333
+ maximum=max_model_size_if,
1334
+ value=max_model_size_if,
1335
+ step=1,
1336
+ label="Maximum Model Size",
1337
+ interactive=True,
1338
+ )
1339
+ leaderboard_if = gr.Dataframe(
1340
+ df_if[default_if_columns],
1341
+ interactive=False,
1342
+ )
1343
+ filter_inputs_if = [
1344
+ search_box_if,
1345
+ column_selector_if,
1346
+ family_filter_if,
1347
+ model_size_min_filter_if,
1348
+ model_size_max_filter_if,
1349
+ ]
1350
+ search_box_if.submit(
1351
+ filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if
1352
+ )
1353
+ for component in filter_inputs_if:
1354
+ component.change(
1355
+ filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if
1356
  )
1357
+
1358
+ #
1359
+ # HindiGen Leaderboards Tab
1360
+ #
1361
+ with gr.Tab("HindiGen Leaderboards 🇮🇳"):
1362
+ with gr.Tabs():
1363
+ # 3C3H Scores
1364
+ with gr.Tab("3C3H Scores"):
1365
+ with gr.Accordion("⚙️ Filters", open=False):
1366
+ with gr.Row():
1367
+ search_box_3c3h_hi = gr.Textbox(
1368
+ placeholder="Search for models...",
1369
+ label="Search",
1370
+ interactive=True,
1371
+ )
1372
+ with gr.Row():
1373
+ column_selector_3c3h_hi = gr.CheckboxGroup(
1374
+ choices=column_choices_3c3h_hi,
1375
+ value=[
1376
+ "Rank",
1377
+ "Rank Spread",
1378
+ "Model Name",
1379
+ "3C3H Score",
1380
+ "95% CI (±)",
1381
+ "Correctness",
1382
+ "Completeness",
1383
+ "Conciseness",
1384
+ "Helpfulness",
1385
+ "Honesty",
1386
+ "Harmlessness",
1387
+ ],
1388
+ label="Select columns to display",
1389
+ )
1390
+ with gr.Row():
1391
+ license_filter_3c3h_hi = gr.CheckboxGroup(
1392
+ choices=license_options_3c3h_hi,
1393
+ value=license_options_3c3h_hi.copy(),
1394
+ label="Filter by License",
1395
+ )
1396
+ precision_filter_3c3h_hi = gr.CheckboxGroup(
1397
+ choices=precision_options_3c3h_hi,
1398
+ value=precision_options_3c3h_hi.copy(),
1399
+ label="Filter by Precision",
1400
+ )
1401
+ with gr.Row():
1402
+ model_size_min_filter_3c3h_hi = gr.Slider(
1403
+ minimum=min_model_size_3c3h_hi,
1404
+ maximum=max_model_size_3c3h_hi,
1405
+ value=min_model_size_3c3h_hi,
1406
+ step=1,
1407
+ label="Minimum Model Size",
1408
+ interactive=True,
1409
+ )
1410
+ model_size_max_filter_3c3h_hi = gr.Slider(
1411
+ minimum=min_model_size_3c3h_hi,
1412
+ maximum=max_model_size_3c3h_hi,
1413
+ value=max_model_size_3c3h_hi,
1414
+ step=1,
1415
+ label="Maximum Model Size",
1416
+ interactive=True,
1417
+ )
1418
+ leaderboard_3c3h_hi = gr.Dataframe(
1419
+ df_3c3h_hi[
1420
+ [
1421
+ "Rank",
1422
+ "Rank Spread",
1423
+ "Model Name",
1424
+ "3C3H Score",
1425
+ "95% CI (±)",
1426
+ "Correctness",
1427
+ "Completeness",
1428
+ "Conciseness",
1429
+ "Helpfulness",
1430
+ "Honesty",
1431
+ "Harmlessness",
1432
+ ]
1433
+ ],
1434
+ interactive=False,
1435
+ )
1436
+ filter_inputs_3c3h_hi = [
1437
+ search_box_3c3h_hi,
1438
+ column_selector_3c3h_hi,
1439
+ precision_filter_3c3h_hi,
1440
+ license_filter_3c3h_hi,
1441
+ model_size_min_filter_3c3h_hi,
1442
+ model_size_max_filter_3c3h_hi,
1443
+ ]
1444
+ search_box_3c3h_hi.submit(
1445
+ filter_df_3c3h_hindigen,
1446
+ inputs=filter_inputs_3c3h_hi,
1447
+ outputs=leaderboard_3c3h_hi,
1448
+ )
1449
+ for component in filter_inputs_3c3h_hi:
1450
+ component.change(
1451
+ filter_df_3c3h_hindigen,
1452
+ inputs=filter_inputs_3c3h_hi,
1453
+ outputs=leaderboard_3c3h_hi,
1454
  )
1455
+
1456
+ # Tasks Scores
1457
+ with gr.Tab("Tasks Scores"):
1458
+ gr.Markdown(
1459
+ "This table is sorted based on the **first task** "
1460
+ "(e.g., Question Answering (QA))."
1461
+ )
1462
+ with gr.Accordion("⚙️ Filters", open=False):
1463
+ with gr.Row():
1464
+ search_box_tasks_hi = gr.Textbox(
1465
+ placeholder="Search for models...",
1466
+ label="Search",
1467
+ interactive=True,
1468
+ )
1469
+ with gr.Row():
1470
+ column_selector_tasks_hi = gr.CheckboxGroup(
1471
+ choices=column_choices_tasks_hi,
1472
+ value=["Rank", "Model Name"] + task_columns_hi,
1473
+ label="Select columns to display",
1474
+ )
1475
+ with gr.Row():
1476
+ license_filter_tasks_hi = gr.CheckboxGroup(
1477
+ choices=license_options_tasks_hi,
1478
+ value=license_options_tasks_hi.copy(),
1479
+ label="Filter by License",
1480
+ )
1481
+ precision_filter_tasks_hi = gr.CheckboxGroup(
1482
+ choices=precision_options_tasks_hi,
1483
+ value=precision_options_tasks_hi.copy(),
1484
+ label="Filter by Precision",
1485
+ )
1486
+ with gr.Row():
1487
+ model_size_min_filter_tasks_hi = gr.Slider(
1488
+ minimum=min_model_size_tasks_hi,
1489
+ maximum=max_model_size_tasks_hi,
1490
+ value=min_model_size_tasks_hi,
1491
+ step=1,
1492
+ label="Minimum Model Size",
1493
+ interactive=True,
1494
+ )
1495
+ model_size_max_filter_tasks_hi = gr.Slider(
1496
+ minimum=min_model_size_tasks_hi,
1497
+ maximum=max_model_size_tasks_hi,
1498
+ value=max_model_size_tasks_hi,
1499
+ step=1,
1500
+ label="Maximum Model Size",
1501
+ interactive=True,
1502
+ )
1503
+ leaderboard_tasks_hi = gr.Dataframe(
1504
+ df_tasks_hi[["Rank", "Model Name"] + task_columns_hi],
1505
+ interactive=False,
1506
+ )
1507
+ filter_inputs_tasks_hi = [
1508
+ search_box_tasks_hi,
1509
+ column_selector_tasks_hi,
1510
+ precision_filter_tasks_hi,
1511
+ license_filter_tasks_hi,
1512
+ model_size_min_filter_tasks_hi,
1513
+ model_size_max_filter_tasks_hi,
1514
+ ]
1515
+ search_box_tasks_hi.submit(
1516
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks_hindigen(
1517
+ sq, cols, pf, lf, min_val, max_val, task_columns_hi
1518
+ ),
1519
+ inputs=filter_inputs_tasks_hi,
1520
+ outputs=leaderboard_tasks_hi,
1521
+ )
1522
+ for component in filter_inputs_tasks_hi:
1523
+ component.change(
1524
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks_hindigen(
1525
+ sq, cols, pf, lf, min_val, max_val, task_columns_hi
1526
+ ),
1527
+ inputs=filter_inputs_tasks_hi,
1528
+ outputs=leaderboard_tasks_hi,
1529
  )
 
1530
 
1531
  #
1532
  # Submit Tab
1533
  #
1534
  with gr.Tab("Submit Here 📝"):
1535
+ # Load request tables for all three request datasets
1536
+ df_pending_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "pending")
1537
+ df_finished_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "finished")
1538
+ df_failed_ar = load_requests(ARAGEN_REQUESTS_REPO_ID, "failed")
1539
+
1540
+ df_pending_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "pending")
1541
+ df_finished_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "finished")
1542
+ df_failed_hi = load_requests(HINDIGEN_REQUESTS_REPO_ID, "failed")
1543
+
1544
+ df_pending_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "pending")
1545
+ df_finished_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "finished")
1546
+ df_failed_if = load_requests(IFEVAL_REQUESTS_REPO_ID, "failed")
1547
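The nine `load_requests` calls above differ only in which requests repo and which status they read. A self-contained sketch of how they could be gathered in one pass; `load_requests` is stubbed out here and the repo IDs are placeholders, not the real dataset names:

```python
import pandas as pd

# Stub standing in for this file's load_requests(repo_id, status); the real
# function reads the request tables from the three *_REQUESTS_REPO_ID datasets.
def load_requests(repo_id: str, status: str) -> pd.DataFrame:
    return pd.DataFrame({"model_name": [f"{repo_id}:{status}-example"]})

request_repos = {
    "AraGen": "aragen-requests",      # placeholder IDs, not the real repo names
    "HindiGen": "hindigen-requests",
    "IFEval": "ifeval-requests",
}
statuses = ("pending", "finished", "failed")

request_tables = {
    (name, status): load_requests(repo_id, status)
    for name, repo_id in request_repos.items()
    for status in statuses
}
print(len(request_tables))  # 9 tables instead of nine separate variables
```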
 
1548
  gr.Markdown(ABOUT_SECTION)
1549
 
 
1551
  with gr.Column():
1552
  model_name_input = gr.Textbox(
1553
  label="Model Name",
1554
+ placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)",
1555
+ )
1556
+ revision_input = gr.Textbox(
1557
+ label="Revision", placeholder="main", value="main"
1558
  )
 
1559
  precision_input = gr.Dropdown(
1560
  choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
1561
  label="Precision",
1562
+ value="float16",
1563
  )
1564
  params_input = gr.Textbox(
1565
  label="Params",
1566
+ placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)",
1567
  )
1568
  license_input = gr.Textbox(
1569
  label="License",
1570
  placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
1571
+ value="Open",
1572
  )
1573
  modality_input = gr.Radio(
1574
  choices=["Text"],
1575
  label="Modality",
1576
+ value="Text",
1577
+ )
1578
+ leaderboard_targets = gr.CheckboxGroup(
1579
+ choices=["AraGen", "HindiGen", "IFEval"],
1580
+ label="Choose which leaderboard(s) to submit to",
1581
+ info="You must choose at least one.",
1582
  )
1583
  submit_button = gr.Button("Submit Model")
1584
  submission_result = gr.Markdown()
1585
  submit_button.click(
1586
  submit_model,
1587
  inputs=[
1588
+ model_name_input,
1589
+ revision_input,
1590
+ precision_input,
1591
+ params_input,
1592
+ license_input,
1593
+ modality_input,
1594
+ leaderboard_targets,
1595
  ],
1596
+ outputs=submission_result,
1597
  )
1598
 
1599
  gr.Markdown("## Evaluation Status")
1600
+
1601
+ gr.Markdown("### AraGen Requests")
1602
+ with gr.Accordion(
1603
+ f"AraGen – Pending Evaluations ({len(df_pending_ar)})", open=False
1604
+ ):
1605
+ if not df_pending_ar.empty:
1606
+ gr.Dataframe(df_pending_ar)
1607
+ else:
1608
+ gr.Markdown("No pending evaluations.")
1609
+ with gr.Accordion(
1610
+ f"AraGen – Finished Evaluations ({len(df_finished_ar)})", open=False
1611
+ ):
1612
+ if not df_finished_ar.empty:
1613
+ gr.Dataframe(df_finished_ar)
1614
+ else:
1615
+ gr.Markdown("No finished evaluations.")
1616
+ with gr.Accordion(
1617
+ f"AraGen – Failed Evaluations ({len(df_failed_ar)})", open=False
1618
+ ):
1619
+ if not df_failed_ar.empty:
1620
+ gr.Dataframe(df_failed_ar)
1621
+ else:
1622
+ gr.Markdown("No failed evaluations.")
1623
+
1624
+ gr.Markdown("### HindiGen Requests")
1625
+ with gr.Accordion(
1626
+ f"HindiGen – Pending Evaluations ({len(df_pending_hi)})", open=False
1627
+ ):
1628
+ if not df_pending_hi.empty:
1629
+ gr.Dataframe(df_pending_hi)
1630
+ else:
1631
+ gr.Markdown("No pending evaluations.")
1632
+ with gr.Accordion(
1633
+ f"HindiGen – Finished Evaluations ({len(df_finished_hi)})", open=False
1634
+ ):
1635
+ if not df_finished_hi.empty:
1636
+ gr.Dataframe(df_finished_hi)
1637
+ else:
1638
+ gr.Markdown("No finished evaluations.")
1639
+ with gr.Accordion(
1640
+ f"HindiGen – Failed Evaluations ({len(df_failed_hi)})", open=False
1641
+ ):
1642
+ if not df_failed_hi.empty:
1643
+ gr.Dataframe(df_failed_hi)
1644
+ else:
1645
+ gr.Markdown("No failed evaluations.")
1646
+
1647
+ gr.Markdown("### IFEval Requests")
1648
+ with gr.Accordion(
1649
+ f"IFEval – Pending Evaluations ({len(df_pending_if)})", open=False
1650
+ ):
1651
+ if not df_pending_if.empty:
1652
+ gr.Dataframe(df_pending_if)
1653
  else:
1654
  gr.Markdown("No pending evaluations.")
1655
+ with gr.Accordion(
1656
+ f"IFEval Finished Evaluations ({len(df_finished_if)})", open=False
1657
+ ):
1658
+ if not df_finished_if.empty:
1659
+ gr.Dataframe(df_finished_if)
1660
  else:
1661
  gr.Markdown("No finished evaluations.")
1662
+ with gr.Accordion(
1663
+ f"IFEval Failed Evaluations ({len(df_failed_if)})", open=False
1664
+ ):
1665
+ if not df_failed_if.empty:
1666
+ gr.Dataframe(df_failed_if)
1667
  else:
1668
  gr.Markdown("No failed evaluations.")
1669
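Each of the nine status accordions above follows the same shape: a header carrying the row count, the table when it is non-empty, otherwise a placeholder message. A hedged sketch of a helper that could render one such block; it only uses components already used in this file and must be called inside the same `gr.Blocks`/`gr.Tab` context as the originals:

```python
import gradio as gr
import pandas as pd

def render_status_accordion(title: str, df: pd.DataFrame, empty_message: str) -> None:
    """Render one 'X Evaluations (N)' accordion, or a placeholder when empty.

    Mirrors the pattern used for the request-status accordions above; must be
    called while a gr.Blocks()/gr.Tab() context is open, like the originals.
    """
    with gr.Accordion(f"{title} ({len(df)})", open=False):
        if not df.empty:
            gr.Dataframe(df)
        else:
            gr.Markdown(empty_message)

# e.g. render_status_accordion("AraGen – Pending Evaluations", df_pending_ar,
#                              "No pending evaluations.")
```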
 
 
1675
  label=CITATION_BUTTON_LABEL,
1676
  lines=8,
1677
  elem_id="citation-button",
1678
+ show_copy_button=True,
1679
  )
1680
 
1681
  gr.HTML(BOTTOM_LOGO)