diff --git a/leaderboard/index.html b/leaderboard/index.html
index 40ded5f..34f6196 100644
--- a/leaderboard/index.html
+++ b/leaderboard/index.html
@@ -71,9 +71,14 @@
LLM Game Benchmark Leaderboard
-
-
+
+
+ Aggregate by 2nd Player LLM, Prompt Type, and Game Type
+ Aggregate by Prompt Type and Game Type
+ Aggregate by Game Type
+
+
diff --git a/leaderboard/leaderboard-data-agg-gametype-prompttype-llm2.json b/leaderboard/leaderboard-data-agg-gametype-prompttype-llm2.json
new file mode 100644
index 0000000..c303d31
--- /dev/null
+++ b/leaderboard/leaderboard-data-agg-gametype-prompttype-llm2.json
@@ -0,0 +1,90 @@
+[
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 185,
+ "Disqualifications-1st": 30,
+ "Disqualifications-2nd": 54,
+ "Draws": 5,
+ "InvalidMoves-1st": 827,
+ "InvalidMoves-2nd": 990,
+ "TotalMoves-1st": 2587,
+ "TotalMoves-2nd": 2506
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 126,
+ "Disqualifications-1st": 43,
+ "Disqualifications-2nd": 56,
+ "Draws": 8,
+ "InvalidMoves-1st": 960,
+ "InvalidMoves-2nd": 1143,
+ "TotalMoves-1st": 3149,
+ "TotalMoves-2nd": 3125
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "Wins-1st": 129,
+ "Disqualifications-1st": 55,
+ "Disqualifications-2nd": 44,
+ "Draws": 6,
+ "InvalidMoves-1st": 896,
+ "InvalidMoves-2nd": 872,
+ "TotalMoves-1st": 2700,
+ "TotalMoves-2nd": 2497
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "Wins-1st": 138,
+ "Disqualifications-1st": 75,
+ "Disqualifications-2nd": 33,
+ "Draws": 9,
+ "InvalidMoves-1st": 1257,
+ "InvalidMoves-2nd": 816,
+ "TotalMoves-1st": 3065,
+ "TotalMoves-2nd": 2457
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "Wins-1st": 137,
+ "Disqualifications-1st": 91,
+ "Disqualifications-2nd": 22,
+ "Draws": 4,
+ "InvalidMoves-1st": 1504,
+ "InvalidMoves-2nd": 598,
+ "TotalMoves-1st": 3099,
+ "TotalMoves-2nd": 2023
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "Wins-1st": 160,
+ "Disqualifications-1st": 23,
+ "Disqualifications-2nd": 64,
+ "Draws": 1,
+ "InvalidMoves-1st": 858,
+ "InvalidMoves-2nd": 1172,
+ "TotalMoves-1st": 2845,
+ "TotalMoves-2nd": 2934
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 116,
+ "Disqualifications-1st": 15,
+ "Disqualifications-2nd": 23,
+ "Draws": 4,
+ "InvalidMoves-1st": 497,
+ "InvalidMoves-2nd": 486,
+ "TotalMoves-1st": 1816,
+ "TotalMoves-2nd": 1662
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "Wins-1st": 53,
+ "Disqualifications-1st": 44,
+ "Disqualifications-2nd": 47,
+ "Draws": 0,
+ "InvalidMoves-1st": 354,
+ "InvalidMoves-2nd": 988,
+ "TotalMoves-1st": 2670,
+ "TotalMoves-2nd": 3204
+ }
+]
\ No newline at end of file
diff --git a/leaderboard/leaderboard-data-agg-gametype-prompttype.json b/leaderboard/leaderboard-data-agg-gametype-prompttype.json
new file mode 100644
index 0000000..d2bc654
--- /dev/null
+++ b/leaderboard/leaderboard-data-agg-gametype-prompttype.json
@@ -0,0 +1,730 @@
+[
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 30,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 164,
+ "InvalidMoves-2nd": 124,
+ "TotalMoves-1st": 442,
+ "TotalMoves-2nd": 366
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 25,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 9,
+ "Draws": 0,
+ "InvalidMoves-1st": 120,
+ "InvalidMoves-2nd": 179,
+ "TotalMoves-1st": 375,
+ "TotalMoves-2nd": 400
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 20,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 14,
+ "Draws": 0,
+ "InvalidMoves-1st": 114,
+ "InvalidMoves-2nd": 221,
+ "TotalMoves-1st": 383,
+ "TotalMoves-2nd": 456
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 25,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 14,
+ "Draws": 0,
+ "InvalidMoves-1st": 93,
+ "InvalidMoves-2nd": 213,
+ "TotalMoves-1st": 328,
+ "TotalMoves-2nd": 409
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 31,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 3,
+ "Draws": 1,
+ "InvalidMoves-1st": 187,
+ "InvalidMoves-2nd": 140,
+ "TotalMoves-1st": 510,
+ "TotalMoves-2nd": 428
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 20,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 5,
+ "Draws": 3,
+ "InvalidMoves-1st": 12,
+ "InvalidMoves-2nd": 83,
+ "TotalMoves-1st": 155,
+ "TotalMoves-2nd": 198
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 34,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 3,
+ "Draws": 1,
+ "InvalidMoves-1st": 137,
+ "InvalidMoves-2nd": 30,
+ "TotalMoves-1st": 394,
+ "TotalMoves-2nd": 249
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 20,
+ "Wins-2nd": 11,
+ "Disqualifications-1st": 10,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 193,
+ "InvalidMoves-2nd": 174,
+ "TotalMoves-1st": 530,
+ "TotalMoves-2nd": 485
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 14,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 12,
+ "Draws": 0,
+ "InvalidMoves-1st": 167,
+ "InvalidMoves-2nd": 205,
+ "TotalMoves-1st": 479,
+ "TotalMoves-2nd": 491
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 18,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 12,
+ "Draws": 1,
+ "InvalidMoves-1st": 124,
+ "InvalidMoves-2nd": 194,
+ "TotalMoves-1st": 378,
+ "TotalMoves-2nd": 417
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 13,
+ "Wins-2nd": 11,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 16,
+ "Draws": 2,
+ "InvalidMoves-1st": 113,
+ "InvalidMoves-2nd": 289,
+ "TotalMoves-1st": 424,
+ "TotalMoves-2nd": 569
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 18,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 11,
+ "Disqualifications-2nd": 3,
+ "Draws": 3,
+ "InvalidMoves-1st": 223,
+ "InvalidMoves-2nd": 129,
+ "TotalMoves-1st": 606,
+ "TotalMoves-2nd": 485
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 13,
+ "Wins-2nd": 12,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 3,
+ "Draws": 1,
+ "InvalidMoves-1st": 48,
+ "InvalidMoves-2nd": 90,
+ "TotalMoves-1st": 282,
+ "TotalMoves-2nd": 295
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 30,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 6,
+ "Draws": 1,
+ "InvalidMoves-1st": 92,
+ "InvalidMoves-2nd": 62,
+ "TotalMoves-1st": 450,
+ "TotalMoves-2nd": 383
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 20,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 7,
+ "Draws": 0,
+ "InvalidMoves-1st": 158,
+ "InvalidMoves-2nd": 153,
+ "TotalMoves-1st": 443,
+ "TotalMoves-2nd": 411
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 17,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 15,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 204,
+ "InvalidMoves-2nd": 101,
+ "TotalMoves-1st": 489,
+ "TotalMoves-2nd": 365
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 19,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 10,
+ "Draws": 0,
+ "InvalidMoves-1st": 86,
+ "InvalidMoves-2nd": 182,
+ "TotalMoves-1st": 309,
+ "TotalMoves-2nd": 376
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 20,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 11,
+ "Draws": 0,
+ "InvalidMoves-1st": 124,
+ "InvalidMoves-2nd": 219,
+ "TotalMoves-1st": 391,
+ "TotalMoves-2nd": 455
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 13,
+ "Wins-2nd": 17,
+ "Disqualifications-1st": 12,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 182,
+ "InvalidMoves-2nd": 99,
+ "TotalMoves-1st": 483,
+ "TotalMoves-2nd": 384
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 10,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 4,
+ "Draws": 5,
+ "InvalidMoves-1st": 24,
+ "InvalidMoves-2nd": 75,
+ "TotalMoves-1st": 190,
+ "TotalMoves-2nd": 222
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 30,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 5,
+ "Draws": 1,
+ "InvalidMoves-1st": 118,
+ "InvalidMoves-2nd": 43,
+ "TotalMoves-1st": 395,
+ "TotalMoves-2nd": 284
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 20,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 11,
+ "Disqualifications-2nd": 5,
+ "Draws": 4,
+ "InvalidMoves-1st": 182,
+ "InvalidMoves-2nd": 138,
+ "TotalMoves-1st": 465,
+ "TotalMoves-2nd": 392
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 21,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 12,
+ "Disqualifications-2nd": 3,
+ "Draws": 1,
+ "InvalidMoves-1st": 205,
+ "InvalidMoves-2nd": 105,
+ "TotalMoves-1st": 507,
+ "TotalMoves-2nd": 388
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 22,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 15,
+ "Disqualifications-2nd": 1,
+ "Draws": 3,
+ "InvalidMoves-1st": 202,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 427,
+ "TotalMoves-2nd": 288
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 16,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 10,
+ "Disqualifications-2nd": 10,
+ "Draws": 0,
+ "InvalidMoves-1st": 212,
+ "InvalidMoves-2nd": 209,
+ "TotalMoves-1st": 468,
+ "TotalMoves-2nd": 439
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 17,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 16,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 240,
+ "InvalidMoves-2nd": 149,
+ "TotalMoves-1st": 532,
+ "TotalMoves-2nd": 426
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 15,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 53,
+ "InvalidMoves-2nd": 77,
+ "TotalMoves-1st": 187,
+ "TotalMoves-2nd": 192
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 27,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 163,
+ "InvalidMoves-2nd": 49,
+ "TotalMoves-1st": 479,
+ "TotalMoves-2nd": 332
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 16,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 22,
+ "Disqualifications-2nd": 1,
+ "Draws": 1,
+ "InvalidMoves-1st": 313,
+ "InvalidMoves-2nd": 93,
+ "TotalMoves-1st": 558,
+ "TotalMoves-2nd": 320
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 23,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 14,
+ "Disqualifications-2nd": 0,
+ "Draws": 2,
+ "InvalidMoves-1st": 258,
+ "InvalidMoves-2nd": 46,
+ "TotalMoves-1st": 518,
+ "TotalMoves-2nd": 281
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 22,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 5,
+ "Draws": 1,
+ "InvalidMoves-1st": 173,
+ "InvalidMoves-2nd": 95,
+ "TotalMoves-1st": 401,
+ "TotalMoves-2nd": 295
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 16,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 13,
+ "Disqualifications-2nd": 9,
+ "Draws": 0,
+ "InvalidMoves-1st": 245,
+ "InvalidMoves-2nd": 185,
+ "TotalMoves-1st": 490,
+ "TotalMoves-2nd": 398
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 13,
+ "Wins-2nd": 12,
+ "Disqualifications-1st": 18,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 271,
+ "InvalidMoves-2nd": 108,
+ "TotalMoves-1st": 511,
+ "TotalMoves-2nd": 333
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 19,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 89,
+ "InvalidMoves-2nd": 33,
+ "TotalMoves-1st": 219,
+ "TotalMoves-2nd": 144
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 28,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 155,
+ "InvalidMoves-2nd": 38,
+ "TotalMoves-1st": 402,
+ "TotalMoves-2nd": 252
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 16,
+ "Wins-2nd": 12,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 8,
+ "Draws": 0,
+ "InvalidMoves-1st": 192,
+ "InvalidMoves-2nd": 176,
+ "TotalMoves-1st": 507,
+ "TotalMoves-2nd": 467
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 27,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 7,
+ "Draws": 0,
+ "InvalidMoves-1st": 176,
+ "InvalidMoves-2nd": 149,
+ "TotalMoves-1st": 537,
+ "TotalMoves-2nd": 476
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 29,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 70,
+ "InvalidMoves-2nd": 154,
+ "TotalMoves-1st": 325,
+ "TotalMoves-2nd": 374
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 20,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 18,
+ "Draws": 0,
+ "InvalidMoves-1st": 148,
+ "InvalidMoves-2nd": 259,
+ "TotalMoves-1st": 449,
+ "TotalMoves-2nd": 522
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 11,
+ "Wins-2nd": 14,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 17,
+ "Draws": 0,
+ "InvalidMoves-1st": 153,
+ "InvalidMoves-2nd": 303,
+ "TotalMoves-1st": 432,
+ "TotalMoves-2nd": 554
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 19,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 27,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 185,
+ "TotalMoves-2nd": 223
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 38,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 92,
+ "InvalidMoves-2nd": 42,
+ "TotalMoves-1st": 410,
+ "TotalMoves-2nd": 318
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 17,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 85,
+ "InvalidMoves-2nd": 72,
+ "TotalMoves-1st": 273,
+ "TotalMoves-2nd": 241
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 19,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 81,
+ "InvalidMoves-2nd": 29,
+ "TotalMoves-1st": 315,
+ "TotalMoves-2nd": 243
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 16,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 4,
+ "Draws": 2,
+ "InvalidMoves-1st": 36,
+ "InvalidMoves-2nd": 85,
+ "TotalMoves-1st": 202,
+ "TotalMoves-2nd": 229
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 10,
+ "Wins-2nd": 12,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 73,
+ "InvalidMoves-2nd": 106,
+ "TotalMoves-1st": 248,
+ "TotalMoves-2nd": 265
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 16,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 55,
+ "InvalidMoves-2nd": 116,
+ "TotalMoves-1st": 224,
+ "TotalMoves-2nd": 263
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 18,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 86,
+ "InvalidMoves-2nd": 51,
+ "TotalMoves-1st": 300,
+ "TotalMoves-2nd": 246
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "Wins-1st": 20,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 81,
+ "InvalidMoves-2nd": 27,
+ "TotalMoves-1st": 254,
+ "TotalMoves-2nd": 175
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "Wins-1st": 9,
+ "Wins-2nd": 19,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 9,
+ "Draws": 0,
+ "InvalidMoves-1st": 56,
+ "InvalidMoves-2nd": 161,
+ "TotalMoves-1st": 390,
+ "TotalMoves-2nd": 477
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "Wins-1st": 7,
+ "Wins-2nd": 25,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 91,
+ "InvalidMoves-2nd": 122,
+ "TotalMoves-1st": 559,
+ "TotalMoves-2nd": 579
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "Wins-1st": 8,
+ "Wins-2nd": 23,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 8,
+ "Draws": 0,
+ "InvalidMoves-1st": 39,
+ "InvalidMoves-2nd": 139,
+ "TotalMoves-1st": 352,
+ "TotalMoves-2nd": 436
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "Wins-1st": 6,
+ "Wins-2nd": 26,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 42,
+ "InvalidMoves-2nd": 139,
+ "TotalMoves-1st": 331,
+ "TotalMoves-2nd": 416
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "Wins-1st": 9,
+ "Wins-2nd": 18,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 12,
+ "Draws": 0,
+ "InvalidMoves-1st": 41,
+ "InvalidMoves-2nd": 203,
+ "TotalMoves-1st": 282,
+ "TotalMoves-2nd": 423
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4o",
+ "Wins-1st": 8,
+ "Wins-2nd": 27,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 53,
+ "InvalidMoves-2nd": 132,
+ "TotalMoves-1st": 435,
+ "TotalMoves-2nd": 502
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "Wins-1st": 6,
+ "Wins-2nd": 18,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 32,
+ "InvalidMoves-2nd": 92,
+ "TotalMoves-1st": 321,
+ "TotalMoves-2nd": 371
+ }
+]
\ No newline at end of file
diff --git a/leaderboard/leaderboard-data-agg-gametype.json b/leaderboard/leaderboard-data-agg-gametype.json
new file mode 100644
index 0000000..8cba8bf
--- /dev/null
+++ b/leaderboard/leaderboard-data-agg-gametype.json
@@ -0,0 +1,2158 @@
+[
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 12,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 48,
+ "InvalidMoves-2nd": 31,
+ "TotalMoves-1st": 141,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 102,
+ "InvalidMoves-2nd": 93,
+ "TotalMoves-1st": 204,
+ "TotalMoves-2nd": 185
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 97,
+ "TotalMoves-2nd": 69
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 8,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 28,
+ "InvalidMoves-2nd": 90,
+ "TotalMoves-1st": 116,
+ "TotalMoves-2nd": 165
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 87,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 184,
+ "TotalMoves-2nd": 180
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 15,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 5,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 75,
+ "TotalMoves-2nd": 55
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 7,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 39,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 115,
+ "TotalMoves-2nd": 153
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 8,
+ "Draws": 0,
+ "InvalidMoves-1st": 51,
+ "InvalidMoves-2nd": 113,
+ "TotalMoves-1st": 144,
+ "TotalMoves-2nd": 195
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 24,
+ "InvalidMoves-2nd": 19,
+ "TotalMoves-1st": 124,
+ "TotalMoves-2nd": 108
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 7,
+ "Draws": 0,
+ "InvalidMoves-1st": 37,
+ "InvalidMoves-2nd": 98,
+ "TotalMoves-1st": 105,
+ "TotalMoves-2nd": 154
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 6,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 7,
+ "Draws": 0,
+ "InvalidMoves-1st": 36,
+ "InvalidMoves-2nd": 105,
+ "TotalMoves-1st": 123,
+ "TotalMoves-2nd": 179
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 20,
+ "InvalidMoves-2nd": 10,
+ "TotalMoves-1st": 100,
+ "TotalMoves-2nd": 76
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 3,
+ "Draws": 1,
+ "InvalidMoves-1st": 67,
+ "InvalidMoves-2nd": 69,
+ "TotalMoves-1st": 175,
+ "TotalMoves-2nd": 164
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 7,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 109,
+ "InvalidMoves-2nd": 66,
+ "TotalMoves-1st": 228,
+ "TotalMoves-2nd": 178
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 15,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 11,
+ "InvalidMoves-2nd": 5,
+ "TotalMoves-1st": 107,
+ "TotalMoves-2nd": 86
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 7,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 5,
+ "Draws": 1,
+ "InvalidMoves-1st": 11,
+ "InvalidMoves-2nd": 83,
+ "TotalMoves-1st": 83,
+ "TotalMoves-2nd": 142
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 13,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 2,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 72,
+ "TotalMoves-2nd": 56
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 10,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 1,
+ "InvalidMoves-1st": 46,
+ "InvalidMoves-2nd": 9,
+ "TotalMoves-1st": 127,
+ "TotalMoves-2nd": 77
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 10,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 84,
+ "InvalidMoves-2nd": 13,
+ "TotalMoves-1st": 193,
+ "TotalMoves-2nd": 111
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 7,
+ "InvalidMoves-2nd": 8,
+ "TotalMoves-1st": 74,
+ "TotalMoves-2nd": 61
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 76,
+ "InvalidMoves-2nd": 92,
+ "TotalMoves-1st": 214,
+ "TotalMoves-2nd": 223
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 116,
+ "InvalidMoves-2nd": 76,
+ "TotalMoves-1st": 200,
+ "TotalMoves-2nd": 152
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 6,
+ "TotalMoves-1st": 116,
+ "TotalMoves-2nd": 110
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 61,
+ "InvalidMoves-2nd": 99,
+ "TotalMoves-1st": 156,
+ "TotalMoves-2nd": 185
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 104,
+ "InvalidMoves-2nd": 106,
+ "TotalMoves-1st": 203,
+ "TotalMoves-2nd": 197
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 2,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 120,
+ "TotalMoves-2nd": 109
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 37,
+ "InvalidMoves-2nd": 99,
+ "TotalMoves-1st": 127,
+ "TotalMoves-2nd": 179
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 86,
+ "InvalidMoves-2nd": 91,
+ "TotalMoves-1st": 160,
+ "TotalMoves-2nd": 156
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 4,
+ "TotalMoves-1st": 91,
+ "TotalMoves-2nd": 82
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 1,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 10,
+ "Draws": 1,
+ "InvalidMoves-1st": 32,
+ "InvalidMoves-2nd": 123,
+ "TotalMoves-1st": 114,
+ "TotalMoves-2nd": 193
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 70,
+ "InvalidMoves-2nd": 108,
+ "TotalMoves-1st": 154,
+ "TotalMoves-2nd": 184
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 1,
+ "Draws": 1,
+ "InvalidMoves-1st": 11,
+ "InvalidMoves-2nd": 58,
+ "TotalMoves-1st": 156,
+ "TotalMoves-2nd": 192
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 8,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 65,
+ "InvalidMoves-2nd": 63,
+ "TotalMoves-1st": 195,
+ "TotalMoves-2nd": 185
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 1,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 10,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 154,
+ "InvalidMoves-2nd": 58,
+ "TotalMoves-1st": 290,
+ "TotalMoves-2nd": 187
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 3,
+ "InvalidMoves-1st": 4,
+ "InvalidMoves-2nd": 8,
+ "TotalMoves-1st": 121,
+ "TotalMoves-2nd": 113
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 47,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 159,
+ "TotalMoves-2nd": 183
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 1,
+ "TotalMoves-1st": 123,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 10,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 2,
+ "Draws": 1,
+ "InvalidMoves-1st": 34,
+ "InvalidMoves-2nd": 35,
+ "TotalMoves-1st": 177,
+ "TotalMoves-2nd": 165
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 11,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 53,
+ "InvalidMoves-2nd": 16,
+ "TotalMoves-1st": 167,
+ "TotalMoves-2nd": 117
+ },
+ {
+ "LLM1stPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 5,
+ "InvalidMoves-2nd": 11,
+ "TotalMoves-1st": 106,
+ "TotalMoves-2nd": 101
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 2,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 78,
+ "InvalidMoves-2nd": 61,
+ "TotalMoves-1st": 168,
+ "TotalMoves-2nd": 148
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 5,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 80,
+ "InvalidMoves-2nd": 92,
+ "TotalMoves-1st": 184,
+ "TotalMoves-2nd": 185
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 13,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 91,
+ "TotalMoves-2nd": 78
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 97,
+ "InvalidMoves-2nd": 15,
+ "TotalMoves-1st": 177,
+ "TotalMoves-2nd": 92
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 107,
+ "InvalidMoves-2nd": 86,
+ "TotalMoves-1st": 207,
+ "TotalMoves-2nd": 178
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 105,
+ "TotalMoves-2nd": 95
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 8,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 11,
+ "InvalidMoves-2nd": 91,
+ "TotalMoves-1st": 85,
+ "TotalMoves-2nd": 152
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 1,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 70,
+ "InvalidMoves-2nd": 90,
+ "TotalMoves-1st": 142,
+ "TotalMoves-2nd": 156
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 5,
+ "InvalidMoves-2nd": 1,
+ "TotalMoves-1st": 82,
+ "TotalMoves-2nd": 68
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 77,
+ "InvalidMoves-2nd": 71,
+ "TotalMoves-1st": 146,
+ "TotalMoves-2nd": 134
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 6,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 8,
+ "Draws": 0,
+ "InvalidMoves-1st": 45,
+ "InvalidMoves-2nd": 109,
+ "TotalMoves-1st": 126,
+ "TotalMoves-2nd": 176
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 2,
+ "InvalidMoves-2nd": 39,
+ "TotalMoves-1st": 119,
+ "TotalMoves-2nd": 145
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 66,
+ "InvalidMoves-2nd": 34,
+ "TotalMoves-1st": 151,
+ "TotalMoves-2nd": 115
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 110,
+ "InvalidMoves-2nd": 64,
+ "TotalMoves-1st": 212,
+ "TotalMoves-2nd": 160
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 6,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 6,
+ "InvalidMoves-2nd": 1,
+ "TotalMoves-1st": 120,
+ "TotalMoves-2nd": 109
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 24,
+ "InvalidMoves-2nd": 75,
+ "TotalMoves-1st": 99,
+ "TotalMoves-2nd": 143
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 7,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 5,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 91,
+ "TotalMoves-2nd": 79
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 10,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 48,
+ "InvalidMoves-2nd": 10,
+ "TotalMoves-1st": 122,
+ "TotalMoves-2nd": 73
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 9,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 70,
+ "InvalidMoves-2nd": 19,
+ "TotalMoves-1st": 188,
+ "TotalMoves-2nd": 125
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-flash",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 1,
+ "Draws": 1,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 14,
+ "TotalMoves-1st": 85,
+ "TotalMoves-2nd": 86
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 7,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 25,
+ "InvalidMoves-2nd": 74,
+ "TotalMoves-1st": 94,
+ "TotalMoves-2nd": 131
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 109,
+ "InvalidMoves-2nd": 39,
+ "TotalMoves-1st": 200,
+ "TotalMoves-2nd": 126
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 0,
+ "Draws": 3,
+ "InvalidMoves-1st": 48,
+ "InvalidMoves-2nd": 25,
+ "TotalMoves-1st": 171,
+ "TotalMoves-2nd": 135
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 94,
+ "InvalidMoves-2nd": 24,
+ "TotalMoves-1st": 188,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 109,
+ "InvalidMoves-2nd": 79,
+ "TotalMoves-1st": 217,
+ "TotalMoves-2nd": 188
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 2,
+ "InvalidMoves-2nd": 2,
+ "TotalMoves-1st": 102,
+ "TotalMoves-2nd": 88
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 1,
+ "Draws": 3,
+ "InvalidMoves-1st": 94,
+ "InvalidMoves-2nd": 51,
+ "TotalMoves-1st": 178,
+ "TotalMoves-2nd": 128
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 11,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 108,
+ "InvalidMoves-2nd": 38,
+ "TotalMoves-1st": 184,
+ "TotalMoves-2nd": 110
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 15,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 65,
+ "TotalMoves-2nd": 50
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 95,
+ "InvalidMoves-2nd": 67,
+ "TotalMoves-1st": 163,
+ "TotalMoves-2nd": 126
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 89,
+ "InvalidMoves-2nd": 88,
+ "TotalMoves-1st": 183,
+ "TotalMoves-2nd": 175
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 8,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 28,
+ "InvalidMoves-2nd": 54,
+ "TotalMoves-1st": 122,
+ "TotalMoves-2nd": 138
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 6,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 90,
+ "InvalidMoves-2nd": 28,
+ "TotalMoves-1st": 174,
+ "TotalMoves-2nd": 113
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 11,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 122,
+ "InvalidMoves-2nd": 33,
+ "TotalMoves-1st": 226,
+ "TotalMoves-2nd": 135
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 28,
+ "InvalidMoves-2nd": 88,
+ "TotalMoves-1st": 132,
+ "TotalMoves-2nd": 178
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 1,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 53,
+ "InvalidMoves-2nd": 77,
+ "TotalMoves-1st": 119,
+ "TotalMoves-2nd": 138
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 68,
+ "TotalMoves-2nd": 54
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 69,
+ "InvalidMoves-2nd": 18,
+ "TotalMoves-1st": 160,
+ "TotalMoves-2nd": 98
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 6,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 90,
+ "InvalidMoves-2nd": 18,
+ "TotalMoves-1st": 223,
+ "TotalMoves-2nd": 143
+ },
+ {
+ "LLM1stPlayer": "gemini-1.5-pro",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 12,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 4,
+ "InvalidMoves-2nd": 13,
+ "TotalMoves-1st": 96,
+ "TotalMoves-2nd": 91
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 120,
+ "InvalidMoves-2nd": 43,
+ "TotalMoves-1st": 190,
+ "TotalMoves-2nd": 109
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 8,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 106,
+ "InvalidMoves-2nd": 36,
+ "TotalMoves-1st": 174,
+ "TotalMoves-2nd": 99
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 8,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 87,
+ "InvalidMoves-2nd": 14,
+ "TotalMoves-1st": 194,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 8,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 92,
+ "InvalidMoves-2nd": 12,
+ "TotalMoves-1st": 162,
+ "TotalMoves-2nd": 74
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 6,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 115,
+ "InvalidMoves-2nd": 31,
+ "TotalMoves-1st": 194,
+ "TotalMoves-2nd": 104
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 2,
+ "InvalidMoves-1st": 51,
+ "InvalidMoves-2nd": 3,
+ "TotalMoves-1st": 162,
+ "TotalMoves-2nd": 103
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 52,
+ "InvalidMoves-2nd": 54,
+ "TotalMoves-1st": 120,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 7,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 99,
+ "InvalidMoves-2nd": 41,
+ "TotalMoves-1st": 178,
+ "TotalMoves-2nd": 112
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 22,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 103,
+ "TotalMoves-2nd": 71
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 1,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 7,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 83,
+ "InvalidMoves-2nd": 78,
+ "TotalMoves-1st": 150,
+ "TotalMoves-2nd": 133
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 5,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 102,
+ "InvalidMoves-2nd": 78,
+ "TotalMoves-1st": 184,
+ "TotalMoves-2nd": 151
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 60,
+ "InvalidMoves-2nd": 29,
+ "TotalMoves-1st": 156,
+ "TotalMoves-2nd": 114
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 113,
+ "InvalidMoves-2nd": 27,
+ "TotalMoves-1st": 180,
+ "TotalMoves-2nd": 91
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 9,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 116,
+ "InvalidMoves-2nd": 36,
+ "TotalMoves-1st": 188,
+ "TotalMoves-2nd": 104
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 6,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 42,
+ "InvalidMoves-2nd": 45,
+ "TotalMoves-1st": 143,
+ "TotalMoves-2nd": 138
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 81,
+ "InvalidMoves-2nd": 33,
+ "TotalMoves-1st": 140,
+ "TotalMoves-2nd": 88
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 15,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 8,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 79,
+ "TotalMoves-2nd": 56
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 11,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 36,
+ "InvalidMoves-2nd": 11,
+ "TotalMoves-1st": 112,
+ "TotalMoves-2nd": 74
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 7,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 6,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 98,
+ "InvalidMoves-2nd": 9,
+ "TotalMoves-1st": 186,
+ "TotalMoves-2nd": 89
+ },
+ {
+ "LLM1stPlayer": "gpt-4-turbo",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 21,
+ "InvalidMoves-2nd": 18,
+ "TotalMoves-1st": 104,
+ "TotalMoves-2nd": 89
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 85,
+ "InvalidMoves-2nd": 73,
+ "TotalMoves-1st": 191,
+ "TotalMoves-2nd": 171
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 46,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 154,
+ "TotalMoves-2nd": 190
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 61,
+ "InvalidMoves-2nd": 14,
+ "TotalMoves-1st": 162,
+ "TotalMoves-2nd": 106
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 63,
+ "InvalidMoves-2nd": 60,
+ "TotalMoves-1st": 197,
+ "TotalMoves-2nd": 183
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 6,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 70,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 211,
+ "TotalMoves-2nd": 219
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 12,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 43,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 129,
+ "TotalMoves-2nd": 74
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 10,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 19,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 104,
+ "TotalMoves-2nd": 160
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 9,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 43,
+ "InvalidMoves-2nd": 65,
+ "TotalMoves-1st": 140,
+ "TotalMoves-2nd": 151
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 8,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 81,
+ "TotalMoves-2nd": 63
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 6,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 27,
+ "InvalidMoves-2nd": 105,
+ "TotalMoves-1st": 114,
+ "TotalMoves-2nd": 180
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 4,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 10,
+ "Draws": 0,
+ "InvalidMoves-1st": 40,
+ "InvalidMoves-2nd": 114,
+ "TotalMoves-1st": 152,
+ "TotalMoves-2nd": 212
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 81,
+ "InvalidMoves-2nd": 40,
+ "TotalMoves-1st": 183,
+ "TotalMoves-2nd": 130
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 8,
+ "Draws": 0,
+ "InvalidMoves-1st": 41,
+ "InvalidMoves-2nd": 122,
+ "TotalMoves-1st": 117,
+ "TotalMoves-2nd": 186
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 56,
+ "InvalidMoves-2nd": 103,
+ "TotalMoves-1st": 152,
+ "TotalMoves-2nd": 191
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 5,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 56,
+ "InvalidMoves-2nd": 78,
+ "TotalMoves-1st": 163,
+ "TotalMoves-2nd": 177
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 4,
+ "Draws": 1,
+ "InvalidMoves-1st": 17,
+ "InvalidMoves-2nd": 89,
+ "TotalMoves-1st": 108,
+ "TotalMoves-2nd": 170
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 10,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 77,
+ "TotalMoves-2nd": 53
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 12,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 40,
+ "InvalidMoves-2nd": 17,
+ "TotalMoves-1st": 156,
+ "TotalMoves-2nd": 119
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "image",
+ "Wins-1st": 11,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 50,
+ "InvalidMoves-2nd": 16,
+ "TotalMoves-1st": 176,
+ "TotalMoves-2nd": 129
+ },
+ {
+ "LLM1stPlayer": "gpt-4o",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 15,
+ "Wins-2nd": 0,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 2,
+ "InvalidMoves-2nd": 9,
+ "TotalMoves-1st": 78,
+ "TotalMoves-2nd": 70
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 5,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 85,
+ "InvalidMoves-2nd": 72,
+ "TotalMoves-1st": 185,
+ "TotalMoves-2nd": 167
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 14,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 88,
+ "TotalMoves-2nd": 74
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 80,
+ "InvalidMoves-2nd": 28,
+ "TotalMoves-1st": 198,
+ "TotalMoves-2nd": 137
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 10,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 1,
+ "TotalMoves-1st": 117,
+ "TotalMoves-2nd": 106
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 5,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 4,
+ "Draws": 2,
+ "InvalidMoves-1st": 36,
+ "InvalidMoves-2nd": 85,
+ "TotalMoves-1st": 119,
+ "TotalMoves-2nd": 157
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 83,
+ "TotalMoves-2nd": 72
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 2,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 72,
+ "InvalidMoves-2nd": 106,
+ "TotalMoves-1st": 159,
+ "TotalMoves-2nd": 185
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 8,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 0,
+ "TotalMoves-1st": 89,
+ "TotalMoves-2nd": 80
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 4,
+ "Disqualifications-1st": 1,
+ "Disqualifications-2nd": 6,
+ "Draws": 0,
+ "InvalidMoves-1st": 53,
+ "InvalidMoves-2nd": 101,
+ "TotalMoves-1st": 133,
+ "TotalMoves-2nd": 171
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 12,
+ "Wins-2nd": 3,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 2,
+ "InvalidMoves-2nd": 15,
+ "TotalMoves-1st": 91,
+ "TotalMoves-2nd": 92
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 2,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 1,
+ "InvalidMoves-1st": 85,
+ "InvalidMoves-2nd": 46,
+ "TotalMoves-1st": 202,
+ "TotalMoves-2nd": 153
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 9,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 1,
+ "InvalidMoves-2nd": 5,
+ "TotalMoves-1st": 98,
+ "TotalMoves-2nd": 93
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "illustration",
+ "Wins-1st": 9,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 81,
+ "InvalidMoves-2nd": 13,
+ "TotalMoves-1st": 183,
+ "TotalMoves-2nd": 104
+ },
+ {
+ "LLM1stPlayer": "meta.llama3-70b-instruct-v1:0",
+ "LLM2ndPlayer": "random-play",
+ "PromptType": "list",
+ "Wins-1st": 11,
+ "Wins-2nd": 1,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 3,
+ "Draws": 0,
+ "InvalidMoves-1st": 0,
+ "InvalidMoves-2nd": 14,
+ "TotalMoves-1st": 71,
+ "TotalMoves-2nd": 71
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 4,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 10,
+ "InvalidMoves-2nd": 74,
+ "TotalMoves-1st": 114,
+ "TotalMoves-2nd": 170
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 24,
+ "InvalidMoves-2nd": 83,
+ "TotalMoves-1st": 146,
+ "TotalMoves-2nd": 198
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 3,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 22,
+ "InvalidMoves-2nd": 4,
+ "TotalMoves-1st": 130,
+ "TotalMoves-2nd": 109
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 50,
+ "InvalidMoves-2nd": 65,
+ "TotalMoves-1st": 268,
+ "TotalMoves-2nd": 278
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "image",
+ "Wins-1st": 1,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 22,
+ "InvalidMoves-2nd": 55,
+ "TotalMoves-1st": 143,
+ "TotalMoves-2nd": 173
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "anthropic.claude-3-sonnet-20240229-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 3,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 19,
+ "InvalidMoves-2nd": 2,
+ "TotalMoves-1st": 148,
+ "TotalMoves-2nd": 128
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "illustration",
+ "Wins-1st": 2,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 5,
+ "Draws": 0,
+ "InvalidMoves-1st": 13,
+ "InvalidMoves-2nd": 79,
+ "TotalMoves-1st": 112,
+ "TotalMoves-2nd": 171
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "image",
+ "Wins-1st": 3,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 12,
+ "InvalidMoves-2nd": 52,
+ "TotalMoves-1st": 111,
+ "TotalMoves-2nd": 146
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-flash",
+ "PromptType": "list",
+ "Wins-1st": 3,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 8,
+ "TotalMoves-1st": 129,
+ "TotalMoves-2nd": 119
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "illustration",
+ "Wins-1st": 2,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 76,
+ "TotalMoves-1st": 88,
+ "TotalMoves-2nd": 144
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "image",
+ "Wins-1st": 1,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 61,
+ "TotalMoves-1st": 132,
+ "TotalMoves-2nd": 176
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gemini-1.5-pro",
+ "PromptType": "list",
+ "Wins-1st": 3,
+ "Wins-2nd": 9,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 2,
+ "TotalMoves-1st": 111,
+ "TotalMoves-2nd": 96
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 5,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 84,
+ "TotalMoves-1st": 77,
+ "TotalMoves-2nd": 140
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 6,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 7,
+ "Draws": 0,
+ "InvalidMoves-1st": 10,
+ "InvalidMoves-2nd": 88,
+ "TotalMoves-1st": 104,
+ "TotalMoves-2nd": 173
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4-turbo",
+ "PromptType": "list",
+ "Wins-1st": 4,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 3,
+ "Disqualifications-2nd": 1,
+ "Draws": 0,
+ "InvalidMoves-1st": 17,
+ "InvalidMoves-2nd": 31,
+ "TotalMoves-1st": 101,
+ "TotalMoves-2nd": 110
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "illustration",
+ "Wins-1st": 3,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 16,
+ "InvalidMoves-2nd": 67,
+ "TotalMoves-1st": 170,
+ "TotalMoves-2nd": 216
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "image",
+ "Wins-1st": 2,
+ "Wins-2nd": 7,
+ "Disqualifications-1st": 4,
+ "Disqualifications-2nd": 2,
+ "Draws": 0,
+ "InvalidMoves-1st": 26,
+ "InvalidMoves-2nd": 61,
+ "TotalMoves-1st": 172,
+ "TotalMoves-2nd": 203
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "gpt-4o",
+ "PromptType": "list",
+ "Wins-1st": 3,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 11,
+ "InvalidMoves-2nd": 4,
+ "TotalMoves-1st": 93,
+ "TotalMoves-2nd": 83
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "illustration",
+ "Wins-1st": 1,
+ "Wins-2nd": 8,
+ "Disqualifications-1st": 2,
+ "Disqualifications-2nd": 4,
+ "Draws": 0,
+ "InvalidMoves-1st": 14,
+ "InvalidMoves-2nd": 91,
+ "TotalMoves-1st": 131,
+ "TotalMoves-2nd": 203
+ },
+ {
+ "LLM1stPlayer": "random-play",
+ "LLM2ndPlayer": "meta.llama3-70b-instruct-v1:0",
+ "PromptType": "list",
+ "Wins-1st": 5,
+ "Wins-2nd": 10,
+ "Disqualifications-1st": 0,
+ "Disqualifications-2nd": 0,
+ "Draws": 0,
+ "InvalidMoves-1st": 18,
+ "InvalidMoves-2nd": 1,
+ "TotalMoves-1st": 190,
+ "TotalMoves-2nd": 168
+ }
+]
\ No newline at end of file
diff --git a/leaderboard/leaderboard-script.js b/leaderboard/leaderboard-script.js
index b5f0874..67cab28 100644
--- a/leaderboard/leaderboard-script.js
+++ b/leaderboard/leaderboard-script.js
@@ -1,13 +1,86 @@
-$(document).ready(function() {
- const jsonURL = './leaderboard-data.json'; //'https://raw.githubusercontent.com/jackson-harper/JSONLLM/main/newLeaderboard.json';
+function formatDecimal(value) { // Integers come back as numbers; any other parsed value becomes a 2-decimal string.
+    const parsed = parseFloat(value);
+    return !Number.isInteger(parsed) ? parsed.toFixed(2) : parsed;
+}
- function formatDecimal(value) {
- const number = parseFloat(value);
- return Number.isInteger(number) ? number : number.toFixed(2);
- }
+// Turn a column title into a consistent ID fragment: strip whitespace and parentheses, then lowercase.
+function sanitizeColumnName(name) {
+    return name.replace(/[\s()]+/g, '').toLowerCase();
+}
+
+/**
+ * Rebuild the dropdown at `listId` from the unique, sorted values of column `columnIndex` of
+ * the DataTables instance `table`; clicking an entry re-applies the filters via filterTable(table).
+ */
+function populateDropdown(columnIndex, listId, table) {
+    const columnData = table.column(columnIndex).data().unique().sort();
+    const list = $(listId);
+    list.empty();
+    // "Select All" entry: checks or unchecks every entry at once, then re-applies the filters.
+    const selectAllItem = $('')
+        .append(' ')
+        .append('Select All ')
+        .click(function(event) {
+            event.stopPropagation(); // Prevent dropdown from closing
+            const shouldCheck = !$(this).hasClass('checked');
+            list.find('.item, .checkbox').toggleClass('checked', shouldCheck);
+            // filterTable() dereferences its argument, so the table must be passed explicitly.
+            filterTable(table);
+        });
+    list.append(selectAllItem);
+
+    // One entry per distinct value; clicking toggles that entry and re-applies the filters.
+    columnData.each(function(value) {
+        const item = $(' ')
+            .append(' ')
+            .append('' + value + ' ')
+            .click(function(event) {
+                event.stopPropagation(); // Prevent dropdown from closing
+                $(this).toggleClass('checked');
+                $(this).find('.checkbox').toggleClass('checked');
+                filterTable(table);
+            });
+        list.append(item);
+    });
+}
+
+// Apply each dropdown's checked entries as an exact-match regex filter on its column, then redraw once.
+function filterTable(table) {
+    table.columns().every(function(index) {
+        // The dropdown list ID is derived from the column header text (e.g. "LLM (1st)" -> "#llm1stList").
+        const headerTitle = sanitizeColumnName(this.header().textContent);
+        const selectedFilters = [];
+        $(`#${headerTitle}List .checked .item-text`).each(function() {
+            selectedFilters.push($.fn.dataTable.util.escapeRegex($(this).text()));
+        });
+        const regex = selectedFilters.length ? selectedFilters.join('|') : '';
+        console.log(`Filtering column ${index} with regex: ${regex}`);
+        this.search(regex ? '^(' + regex + ')$' : '', true, false); // only queues the search; an empty string clears it
+    });
+    table.draw(); // a single redraw applies every queued column search
+}
+
+function closeAllDropdowns() { // Hide every dropdown list (elements with class "list-items").
+    const dropdownLists = $('.list-items');
+    dropdownLists.hide();
+}
+
+// Reset #mytable's innerHTML to a fixed placeholder string; callers invoke this right before constructing a new DataTable on #mytable.
+function emptyTableHTML() {
+ // NOTE(review): declared without parameters, so the formattedData argument that callers pass is ignored.
+ document.getElementById("mytable").innerHTML = "\n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ " \n" +
+ "
\n";
+}
+
+function showOriginalTable() {
+ let jsonURL = './leaderboard-data.json'; //'https://raw.githubusercontent.com/jackson-harper/JSONLLM/main/newLeaderboard.json';
$.getJSON(jsonURL, function(data) {
- const formattedData = data.map(item => [
+ let formattedData = data.map(item => [
item.LLM1stPlayer,
item.LLM2ndPlayer,
item.PromptType,
@@ -29,7 +102,9 @@ $(document).ready(function() {
item.UUID
]);
- const table = $('#mytable').DataTable({
+ emptyTableHTML(formattedData);
+
+ let table = $('#mytable').DataTable({
data: formattedData,
columns: [
{ title: "LLM (1st)" },
@@ -38,86 +113,39 @@ $(document).ready(function() {
{ title: "Prompt Version" },
{ title: "Game Type"},
{ title: "Win Ratio (1st)" },
- { title: "Win Ratio (2nd)"} ,
+ { title: "Win Ratio (2nd)"},
{ title: "Wins (1st)" },
{ title: "Wins (2nd)" },
{ title: "DQ (1st)" },
{ title: "DQ (2nd)" },
{ title: "Draws" },
- { title: "Invalid Moves Ratio (1st)" },
- { title: "Invalid Moves Ratio (2nd)" },
+ { title: "Invalid Moves (1st)" },
+ { title: "Invalid Moves (2nd)" },
{ title: "Total Moves (1st)" },
{ title: "Total Moves (2nd)" },
{ title: "Provider Email" },
{ title: "Date-Time" },
{ title: "UUID" }
],
- // Adjust positioning of dom to move search bar and
+ // The dom option sets the order of the table controls: filter input (f), processing (r), table (t), length menu (l), pagination (p), info (i).
dom: 'frtlpi',
columnDefs: [
- { targets: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,17], className: 'dt-body-right' },
+ { targets: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17], className: 'dt-body-right' },
{ targets: [0, 1, 2, 3, 4, 16], className: 'dt-body-center' },
{ targets: [3, 16, 17, 18], visible: false}
]
});
- // Sanitize the column names to create valid and consistent IDs
- function sanitizeColumnName(name) {
- return name.replace(/\s+/g, '').replace(/[()]/g, '').toLowerCase();
- }
-
- function populateDropdown(columnIndex, listId) {
- const columnData = table.column(columnIndex).data().unique().sort();
- const list = $(listId);
- list.empty();
- // Add Select All option
- const selectAllItem = $(' ')
- .append(' ')
- .append('Select All ')
- .click(function(event) {
- const isChecked = $(this).hasClass('checked');
- event.stopPropagation(); // Prevent dropdown from closing
- if (isChecked) {
- list.find('.item').removeClass('checked');
- list.find('.checkbox').removeClass('checked');
- } else {
- list.find('.item').addClass('checked');
- list.find('.checkbox').addClass('checked');
- }
- filterTable();
- });
- list.append(selectAllItem);
-
- columnData.each(function(value) {
- const item = $(' ')
- .append(' ')
- .append('' + value + ' ')
- .click(function(event) {
- event.stopPropagation(); // Prevent dropdown from closing
- $(this).toggleClass('checked');
- $(this).find('.checkbox').toggleClass('checked');
- filterTable();
- });
- list.append(item);
- });
- }
-
- populateDropdown(0, '#llm1stList');
- populateDropdown(1, '#llm2ndList');
- populateDropdown(2, '#prompttypeList');
- populateDropdown(3, '#promptversionList');
- populateDropdown(4, '#gametypeList');
-
- // Function to close all dropdowns
- function closeAllDropdowns() {
- $('.list-items').hide();
- }
-
+ populateDropdown(0, '#llm1stList', table);
+ populateDropdown(1, '#llm2ndList', table);
+ populateDropdown(2, '#prompttypeList', table);
+ populateDropdown(3, '#promptversionList', table);
+ populateDropdown(4, '#gametypeList', table);
+
// Toggle dropdown visibility on select button click
- $('.select-btn').click(function(event) {
- event.stopPropagation();
- const list = $(this).next('.list-items');
- list.toggle();
+ $('.select-btn').off('click').click(function(event) { // unbind first: this code runs on every table rebuild, and stacked handlers would toggle a list open and shut on one click
+ event.stopPropagation();
+ const list = $(this).next('.list-items').toggle(); // toggle() returns the jQuery set; keep it so the next line can exclude the list just toggled
$('.list-items').not(list).hide(); // Close other dropdowns
});
@@ -125,35 +153,23 @@ $(document).ready(function() {
$(document).click(function() {
closeAllDropdowns();
});
-
- function filterTable() {
- table.columns().every(function(index) {
- const column = this;
- // Create the appropriate ID for the dropdown list based on the column header text
- // This ensures that the dropdown list ID matches the column header it is filtering
- const headerTitle = sanitizeColumnName(column.header().textContent);
- const selectedFilters = [];
- $(`#${headerTitle}List .checked .item-text`).each(function() {
- selectedFilters.push($.fn.dataTable.util.escapeRegex($(this).text()));
- });
- const regex = selectedFilters.length ? selectedFilters.join('|') : '';
- console.log(`Filtering column ${index} with regex: ${regex}`);
- column.search(regex ? '^(' + regex + ')$' : '', true, false).draw();
- });
- }
}).fail(function() {
console.error("An error occurred while fetching the JSON data.");
});
+}
+
+$(document).ready(function() {
+ showOriginalTable(); // Initial page load: build the table from the full ./leaderboard-data.json data set.
});
document.getElementById('downloadBtn').addEventListener('click', function() {
// URL of the file to be downloaded
- const fileUrl = './leaderboard-data.json';
+ let fileUrl = './leaderboard-data.json';
// Name of the file to be saved as
- const fileName = 'leaderboard-data.json';
+ let fileName = 'leaderboard-data.json';
// Create an anchor element
- const a = document.createElement('a');
+ let a = document.createElement('a');
a.href = fileUrl;
a.download = fileName;
document.body.appendChild(a);
@@ -161,3 +177,280 @@ document.getElementById('downloadBtn').addEventListener('click', function() {
document.body.removeChild(a);
});
+document.getElementById("aggregate-gametype-prompttype-llm2-btn").addEventListener('click', function() {
+    $("#mytable").DataTable().destroy(); // Delete the existing table.
+
+    // The button is a toggle: while its label does not start with "De-" a click aggregates the data; once it reads "De-Aggregate ..." a click restores the original table.
+    if (!document.getElementById("aggregate-gametype-prompttype-llm2-btn").innerText.startsWith("De-")) {
+        // Update button name and disable other aggregation buttons.
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").innerText = "De-" + document.getElementById("aggregate-gametype-prompttype-llm2-btn").innerText;
+        document.getElementById("aggregate-gametype-btn").disabled = true;
+        document.getElementById("aggregate-gametype-prompttype-btn").disabled = true;
+
+        const jsonURL = './leaderboard-data-agg-gametype-prompttype-llm2.json'; // One row per 1st-player LLM.
+
+        $.getJSON(jsonURL, function(data) {
+            const formattedData = data.map(item => [
+                item.LLM1stPlayer,
+                item["Wins-1st"],
+                item["Disqualifications-1st"],
+                item["Disqualifications-2nd"],
+                item.Draws,
+                item["InvalidMoves-1st"],
+                item["InvalidMoves-2nd"],
+                item["TotalMoves-1st"],
+                item["TotalMoves-2nd"],
+            ]);
+
+            emptyTableHTML(formattedData);
+
+            // Create a new table with the columns available in the aggregated data JSON file.
+            const table = $('#mytable').DataTable({
+                data: formattedData,
+                columns: [
+                    { title: "LLM (1st)" },
+                    { title: "Wins (1st)" },
+                    { title: "DQ (1st)" },
+                    { title: "DQ (2nd)" },
+                    { title: "Draws" },
+                    { title: "Invalid Moves (1st)" },
+                    { title: "Invalid Moves (2nd)" },
+                    { title: "Total Moves (1st)" },
+                    { title: "Total Moves (2nd)" },
+                ],
+                // Order of the table controls: filter input (f), processing (r), table (t), length menu (l), pagination (p), info (i).
+                dom: 'frtlpi',
+                columnDefs: [
+                    { targets: [1, 2, 3, 4, 5, 6, 7, 8], className: 'dt-body-right' },
+                    { targets: [0], className: 'dt-body-center' },
+                ]
+            });
+
+            populateDropdown(0, '#llm1stList', table);
+            $('.container').each(function () { // Hide every .container whose text names a column this view lacks.
+                if ($(this).text().includes('LLM (2nd)') || $(this).text().includes('Prompt Type') || $(this).text().includes('Game Type')) {
+                    $(this).hide();
+                }
+            });
+
+            // Toggle dropdown visibility on select button click. Unbind first: handlers are registered on every table rebuild, and stacked handlers would toggle a list open and shut on one click.
+            $('.select-btn').off('click').click(function(event) {
+                event.stopPropagation();
+                const list = $(this).next('.list-items').toggle(); // toggle() returns the jQuery set; keep it so the next line can exclude it
+                $('.list-items').not(list).hide(); // Close other dropdowns
+            });
+
+            // Close dropdowns when clicking outside
+            $(document).click(function() {
+                closeAllDropdowns();
+            });
+        }).fail(function() {
+            console.error("An error occurred while fetching the JSON data.");
+        });
+    }
+    else {
+        // If we are de-aggregating the data, update the button's title, re-enable other aggregation buttons, un-hide the applicable select dropdowns, and re-show the original table.
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").innerText = document.getElementById("aggregate-gametype-prompttype-llm2-btn").innerText.replace("De-", "");
+
+        document.getElementById("aggregate-gametype-btn").disabled = false;
+        document.getElementById("aggregate-gametype-prompttype-btn").disabled = false;
+
+        $('.container').each(function () {
+            if ($(this).text().includes('LLM (2nd)') || $(this).text().includes('Prompt Type') || $(this).text().includes('Game Type')) {
+                $(this).show();
+            }
+        });
+
+        showOriginalTable();
+    }
+});
+
+document.getElementById("aggregate-gametype-prompttype-btn").addEventListener('click', function() {
+    $("#mytable").DataTable().destroy(); // Delete the existing table.
+
+    // The button is a toggle: while its label does not start with "De-" a click aggregates the data; once it reads "De-Aggregate ..." a click restores the original table.
+    if (!document.getElementById("aggregate-gametype-prompttype-btn").innerText.startsWith("De-")) {
+        // Update button name and disable other aggregation buttons.
+        document.getElementById("aggregate-gametype-prompttype-btn").innerText = "De-" + document.getElementById("aggregate-gametype-prompttype-btn").innerText;
+        document.getElementById("aggregate-gametype-btn").disabled = true;
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").disabled = true;
+
+        const jsonURL = './leaderboard-data-agg-gametype-prompttype.json'; // Rows are read as LLM (1st) / LLM (2nd) pairs.
+
+        $.getJSON(jsonURL, function(data) {
+            const formattedData = data.map(item => [
+                item.LLM1stPlayer,
+                item.LLM2ndPlayer,
+                item["Wins-1st"],
+                item["Wins-2nd"],
+                item["Disqualifications-1st"],
+                item["Disqualifications-2nd"],
+                item.Draws,
+                item["InvalidMoves-1st"],
+                item["InvalidMoves-2nd"],
+                item["TotalMoves-1st"],
+                item["TotalMoves-2nd"],
+            ]);
+
+            emptyTableHTML(formattedData);
+
+            // Create a new table with the columns available in the aggregated data JSON file.
+            const table = $('#mytable').DataTable({
+                data: formattedData,
+                columns: [
+                    { title: "LLM (1st)" },
+                    { title: "LLM (2nd)" },
+                    { title: "Wins (1st)" },
+                    { title: "Wins (2nd)" },
+                    { title: "DQ (1st)" },
+                    { title: "DQ (2nd)" },
+                    { title: "Draws" },
+                    { title: "Invalid Moves (1st)" },
+                    { title: "Invalid Moves (2nd)" },
+                    { title: "Total Moves (1st)" },
+                    { title: "Total Moves (2nd)" },
+                ],
+                // Order of the table controls: filter input (f), processing (r), table (t), length menu (l), pagination (p), info (i).
+                dom: 'frtlpi',
+                columnDefs: [
+                    { targets: [2, 3, 4, 5, 6, 7, 8, 9, 10], className: 'dt-body-right' },
+                    { targets: [0, 1], className: 'dt-body-center' },
+                ]
+            });
+
+            populateDropdown(0, '#llm1stList', table);
+            populateDropdown(1, '#llm2ndList', table);
+            $('.container').each(function () { // Hide every .container whose text names a column this view lacks.
+                if ($(this).text().includes('Game Type') || $(this).text().includes('Prompt Type')) {
+                    $(this).hide();
+                }
+            });
+
+            // Toggle dropdown visibility on select button click. Unbind first: handlers are registered on every table rebuild, and stacked handlers would toggle a list open and shut on one click.
+            $('.select-btn').off('click').click(function(event) {
+                event.stopPropagation();
+                const list = $(this).next('.list-items').toggle(); // toggle() returns the jQuery set; keep it so the next line can exclude it
+                $('.list-items').not(list).hide(); // Close other dropdowns
+            });
+
+            // Close dropdowns when clicking outside
+            $(document).click(function() {
+                closeAllDropdowns();
+            });
+        }).fail(function() {
+            console.error("An error occurred while fetching the JSON data.");
+        });
+    }
+    else {
+        // If we are de-aggregating the data, update the button's title, re-enable other aggregation buttons, un-hide the applicable select dropdowns, and re-show the original table.
+        document.getElementById("aggregate-gametype-prompttype-btn").innerText = document.getElementById("aggregate-gametype-prompttype-btn").innerText.replace("De-", "");
+
+        document.getElementById("aggregate-gametype-btn").disabled = false;
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").disabled = false;
+
+        $('.container').each(function () {
+            if ($(this).text().includes('Game Type') || $(this).text().includes('Prompt Type')) {
+                $(this).show();
+            }
+        });
+
+        showOriginalTable();
+    }
+});
+
+document.getElementById("aggregate-gametype-btn").addEventListener('click', function() {
+    $("#mytable").DataTable().destroy(); // Delete the existing table.
+
+    // The button is a toggle: while its label does not start with "De-" a click aggregates the data; once it reads "De-Aggregate ..." a click restores the original table.
+    if (!document.getElementById("aggregate-gametype-btn").innerText.startsWith("De-")) {
+        // Update button name and disable other aggregation buttons.
+        document.getElementById("aggregate-gametype-btn").innerText = "De-" + document.getElementById("aggregate-gametype-btn").innerText;
+        document.getElementById("aggregate-gametype-prompttype-btn").disabled = true;
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").disabled = true;
+
+        const jsonURL = './leaderboard-data-agg-gametype.json'; // Rows are read as LLM (1st) / LLM (2nd) / prompt type.
+
+        $.getJSON(jsonURL, function(data) {
+            const formattedData = data.map(item => [
+                item.LLM1stPlayer,
+                item.LLM2ndPlayer,
+                item.PromptType,
+                item["Wins-1st"],
+                item["Wins-2nd"],
+                item["Disqualifications-1st"],
+                item["Disqualifications-2nd"],
+                item.Draws,
+                item["InvalidMoves-1st"],
+                item["InvalidMoves-2nd"],
+                item["TotalMoves-1st"],
+                item["TotalMoves-2nd"],
+            ]);
+
+            emptyTableHTML(formattedData);
+
+            // Create a new table with the columns available in the aggregated data JSON file.
+            const table = $('#mytable').DataTable({
+                data: formattedData,
+                columns: [
+                    { title: "LLM (1st)" },
+                    { title: "LLM (2nd)" },
+                    { title: "Prompt Type" },
+                    { title: "Wins (1st)" },
+                    { title: "Wins (2nd)" },
+                    { title: "DQ (1st)" },
+                    { title: "DQ (2nd)" },
+                    { title: "Draws" },
+                    { title: "Invalid Moves (1st)" },
+                    { title: "Invalid Moves (2nd)" },
+                    { title: "Total Moves (1st)" },
+                    { title: "Total Moves (2nd)" },
+                ],
+                // Order of the table controls: filter input (f), processing (r), table (t), length menu (l), pagination (p), info (i).
+                dom: 'frtlpi',
+                columnDefs: [
+                    { targets: [3, 4, 5, 6, 7, 8, 9, 10, 11], className: 'dt-body-right' },
+                    { targets: [0, 1, 2], className: 'dt-body-center' },
+                ]
+            });
+
+            populateDropdown(0, '#llm1stList', table);
+            populateDropdown(1, '#llm2ndList', table);
+            populateDropdown(2, '#prompttypeList', table);
+            $('.container').each(function () { // Hide every .container whose text names a column this view lacks.
+                if ($(this).text().includes('Game Type')) {
+                    $(this).hide();
+                }
+            });
+
+            // Toggle dropdown visibility on select button click. Unbind first: handlers are registered on every table rebuild, and stacked handlers would toggle a list open and shut on one click.
+            $('.select-btn').off('click').click(function(event) {
+                event.stopPropagation();
+                const list = $(this).next('.list-items').toggle(); // toggle() returns the jQuery set; keep it so the next line can exclude it
+                $('.list-items').not(list).hide(); // Close other dropdowns
+            });
+
+            // Close dropdowns when clicking outside
+            $(document).click(function() {
+                closeAllDropdowns();
+            });
+        }).fail(function() {
+            console.error("An error occurred while fetching the JSON data.");
+        });
+    }
+    else {
+        // If we are de-aggregating the data, update the button's title, re-enable other aggregation buttons, un-hide the applicable select dropdowns, and re-show the original table.
+        document.getElementById("aggregate-gametype-btn").innerText = document.getElementById("aggregate-gametype-btn").innerText.replace("De-", "");
+
+        document.getElementById("aggregate-gametype-prompttype-btn").disabled = false;
+        document.getElementById("aggregate-gametype-prompttype-llm2-btn").disabled = false;
+
+        $('.container').each(function () {
+            if ($(this).text().includes('Game Type')) {
+                $(this).show();
+            }
+        });
+
+        showOriginalTable();
+    }
+});
+
diff --git a/leaderboard/leaderboard-styles.css b/leaderboard/leaderboard-styles.css
index 411bc0e..f1adc10 100644
--- a/leaderboard/leaderboard-styles.css
+++ b/leaderboard/leaderboard-styles.css
@@ -127,7 +127,7 @@ h1 {
float: left !important;
}
- h2 {
+h2 {
margin-left: 10px;
font-family: 'Tahoma', sans-serif;
font-style: normal;
@@ -142,7 +142,7 @@ h3 {
font-size:medium;
}
- .table-container {
+.table-container {
width: fit-content; /* Adjust the width */
margin: 0; /* Center the container */
box-sizing: border-box;
@@ -160,6 +160,18 @@ h3 {
font-size: medium;
}
+#aggregate-btn-container {
+ display: inline; /* NOTE(review): on a non-replaced inline box, padding-top does not push surrounding content down; confirm inline-block was not intended */
+ padding-top: 20px;
+ padding-left: 210px;
+}
+
+.aggregate-btn {
+ max-width: 195px; /* presumably applied to the aggregation buttons (markup not visible here); verify */
+ display: inline; /* NOTE(review): max-width is ignored on non-replaced inline boxes; confirm the target elements still honor it */
+ margin-bottom: 10px;
+}
+
/* Style the submit button
.submit-btn {
padding: 10px 20px;