TraceMind / sample_data /leaderboard.json
Mandark-droid
Add leaderboard components and enhanced data loader
24b4390
raw
history blame
2.67 kB
[
{
"run_id": "run_001_gpt4",
"model": "openai/gpt-4",
"agent_type": "both",
"provider": "litellm",
"success_rate": 95.8,
"total_tests": 100,
"successful_tests": 96,
"failed_tests": 4,
"avg_steps": 2.5,
"avg_duration_ms": 3200.0,
"total_duration_ms": 320000.0,
"total_tokens": 15000,
"avg_tokens_per_test": 150,
"total_cost_usd": 0.05,
"avg_cost_per_test_usd": 0.0005,
"co2_emissions_g": 0.22,
"gpu_utilization_avg": null,
"gpu_memory_max_mib": null,
"results_dataset": "test/results_gpt4",
"traces_dataset": "test/traces_gpt4",
"metrics_dataset": "test/metrics_gpt4",
"timestamp": "2025-01-16T14:23:00Z",
"submitted_by": "test_user",
"hf_job_id": "job_12345",
"job_type": "cpu",
"dataset_used": "huggingface/smolagents/tasks",
"smoltrace_version": "0.1.0"
},
{
"run_id": "run_002_llama31",
"model": "meta-llama/Llama-3.1-8B",
"agent_type": "both",
"provider": "transformers",
"success_rate": 93.4,
"total_tests": 100,
"successful_tests": 93,
"failed_tests": 7,
"avg_steps": 2.8,
"avg_duration_ms": 2100.0,
"total_duration_ms": 210000.0,
"total_tokens": 12500,
"avg_tokens_per_test": 125,
"total_cost_usd": 0.002,
"avg_cost_per_test_usd": 0.00002,
"co2_emissions_g": 1.45,
"gpu_utilization_avg": 67.5,
"gpu_memory_max_mib": 512.34,
"results_dataset": "test/results_llama31",
"traces_dataset": "test/traces_llama31",
"metrics_dataset": "test/metrics_llama31",
"timestamp": "2025-01-16T15:10:00Z",
"submitted_by": "test_user",
"hf_job_id": "job_12346",
"job_type": "gpu_h200",
"dataset_used": "huggingface/smolagents/tasks",
"smoltrace_version": "0.1.0"
},
{
"run_id": "run_003_claude",
"model": "anthropic/claude-3-haiku",
"agent_type": "tool",
"provider": "litellm",
"success_rate": 92.1,
"total_tests": 100,
"successful_tests": 92,
"failed_tests": 8,
"avg_steps": 2.2,
"avg_duration_ms": 2800.0,
"total_duration_ms": 280000.0,
"total_tokens": 11200,
"avg_tokens_per_test": 112,
"total_cost_usd": 0.012,
"avg_cost_per_test_usd": 0.00012,
"co2_emissions_g": 0.15,
"gpu_utilization_avg": null,
"gpu_memory_max_mib": null,
"results_dataset": "test/results_claude",
"traces_dataset": "test/traces_claude",
"metrics_dataset": "test/metrics_claude",
"timestamp": "2025-01-16T16:45:00Z",
"submitted_by": "test_user",
"hf_job_id": "job_12347",
"job_type": "cpu",
"dataset_used": "huggingface/smolagents/tasks",
"smoltrace_version": "0.1.0"
}
]