forked from EleutherAI/lm-evaluation-harness
-
Notifications
You must be signed in to change notification settings - Fork 2
1787 lines (1778 loc) · 71.9 KB
/
gh-task-runner-Single.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
name: GH Task Runner (Single)
on:
workflow_dispatch:
inputs:
run_task:
description: 'Task to run'
required: true
default: 'anli'
type: choice
options:
- advanced_ai_risk
- advanced_ai_risk_fewshot-coordinate-itself
- advanced_ai_risk_fewshot-coordinate-other-ais
- advanced_ai_risk_fewshot-coordinate-other-versions
- advanced_ai_risk_fewshot-corrigible-less-HHH
- advanced_ai_risk_fewshot-corrigible-more-HHH
- advanced_ai_risk_fewshot-corrigible-neutral-HHH
- advanced_ai_risk_fewshot-myopic-reward
- advanced_ai_risk_fewshot-one-box-tendency
- advanced_ai_risk_fewshot-power-seeking-inclination
- advanced_ai_risk_fewshot-self-awareness-general-ai
- advanced_ai_risk_fewshot-self-awareness-good-text-model
- advanced_ai_risk_fewshot-self-awareness-text-model
- advanced_ai_risk_fewshot-self-awareness-training-architecture
- advanced_ai_risk_fewshot-self-awareness-training-web-gpt
- advanced_ai_risk_fewshot-survival-instinct
- advanced_ai_risk_fewshot-wealth-seeking-inclination
- advanced_ai_risk_human-coordinate-itself
- advanced_ai_risk_human-coordinate-other-ais
- advanced_ai_risk_human-coordinate-other-versions
- advanced_ai_risk_human-corrigible-less-HHH
- advanced_ai_risk_human-corrigible-more-HHH
- advanced_ai_risk_human-corrigible-neutral-HHH
- advanced_ai_risk_human-myopic-reward
- advanced_ai_risk_human-one-box-tendency
- advanced_ai_risk_human-power-seeking-inclination
- advanced_ai_risk_human-self-awareness-general-ai
- advanced_ai_risk_human-self-awareness-good-text-model
- advanced_ai_risk_human-self-awareness-text-model
- advanced_ai_risk_human-self-awareness-training-architecture
- advanced_ai_risk_human-self-awareness-web-gpt
- advanced_ai_risk_human-survival-instinct
- advanced_ai_risk_human-wealth-seeking-inclination
- advanced_ai_risk_lm-coordinate-itself
- advanced_ai_risk_lm-coordinate-other-ais
- advanced_ai_risk_lm-coordinate-other-versions
- advanced_ai_risk_lm-corrigible-less-HHH
- advanced_ai_risk_lm-corrigible-more-HHH
- advanced_ai_risk_lm-corrigible-neutral-HHH
- advanced_ai_risk_lm-myopic-reward
- advanced_ai_risk_lm-one-box-tendency
- advanced_ai_risk_lm-power-seeking-inclination
- advanced_ai_risk_lm-self-awareness-general-ai
- advanced_ai_risk_lm-self-awareness-good-text-model
- advanced_ai_risk_lm-self-awareness-text-model
- advanced_ai_risk_lm-self-awareness-training-architecture
- advanced_ai_risk_lm-self-awareness-training-nn-architecture
- advanced_ai_risk_lm-self-awareness-training-web-gpt
- advanced_ai_risk_lm-survival-instinct
- advanced_ai_risk_lm-wealth-seeking-inclination
- ai2_arc
- anagrams1
- anagrams2
- anli
- anli_r1
- anli_r2
- anli_r3
- arc_challenge
- arc_easy
- arithmetic
- arithmetic_1dc
- arithmetic_2da
- arithmetic_2dm
- arithmetic_2ds
- arithmetic_3da
- arithmetic_3ds
- arithmetic_4da
- arithmetic_4ds
- arithmetic_5da
- arithmetic_5ds
- asdiv
- babi
- bbh
- bbh_cot_fewshot
- bbh_cot_fewshot_boolean_expressions
- bbh_cot_fewshot_causal_judgement
- bbh_cot_fewshot_date_understanding
- bbh_cot_fewshot_disambiguation_qa
- bbh_cot_fewshot_dyck_languages
- bbh_cot_fewshot_formal_fallacies
- bbh_cot_fewshot_geometric_shapes
- bbh_cot_fewshot_hyperbaton
- bbh_cot_fewshot_logical_deduction_five_objects
- bbh_cot_fewshot_logical_deduction_seven_objects
- bbh_cot_fewshot_logical_deduction_three_objects
- bbh_cot_fewshot_movie_recommendation
- bbh_cot_fewshot_multistep_arithmetic_two
- bbh_cot_fewshot_navigate
- bbh_cot_fewshot_object_counting
- bbh_cot_fewshot_penguins_in_a_table
- bbh_cot_fewshot_reasoning_about_colored_objects
- bbh_cot_fewshot_ruin_names
- bbh_cot_fewshot_salient_translation_error_detection
- bbh_cot_fewshot_snarks
- bbh_cot_fewshot_sports_understanding
- bbh_cot_fewshot_temporal_sequences
- bbh_cot_fewshot_tracking_shuffled_objects_five_objects
- bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
- bbh_cot_fewshot_tracking_shuffled_objects_three_objects
- bbh_cot_fewshot_web_of_lies
- bbh_cot_fewshot_word_sorting
- bbh_cot_zeroshot
- bbh_cot_zeroshot_boolean_expressions
- bbh_cot_zeroshot_causal_judgement
- bbh_cot_zeroshot_date_understanding
- bbh_cot_zeroshot_disambiguation_qa
- bbh_cot_zeroshot_dyck_languages
- bbh_cot_zeroshot_formal_fallacies
- bbh_cot_zeroshot_geometric_shapes
- bbh_cot_zeroshot_hyperbaton
- bbh_cot_zeroshot_logical_deduction_five_objects
- bbh_cot_zeroshot_logical_deduction_seven_objects
- bbh_cot_zeroshot_logical_deduction_three_objects
- bbh_cot_zeroshot_movie_recommendation
- bbh_cot_zeroshot_multistep_arithmetic_two
- bbh_cot_zeroshot_navigate
- bbh_cot_zeroshot_object_counting
- bbh_cot_zeroshot_penguins_in_a_table
- bbh_cot_zeroshot_reasoning_about_colored_objects
- bbh_cot_zeroshot_ruin_names
- bbh_cot_zeroshot_salient_translation_error_detection
- bbh_cot_zeroshot_snarks
- bbh_cot_zeroshot_sports_understanding
- bbh_cot_zeroshot_temporal_sequences
- bbh_cot_zeroshot_tracking_shuffled_objects_five_objects
- bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects
- bbh_cot_zeroshot_tracking_shuffled_objects_three_objects
- bbh_cot_zeroshot_web_of_lies
- bbh_cot_zeroshot_word_sorting
- bbh_fewshot
- bbh_fewshot_boolean_expressions
- bbh_fewshot_causal_judgement
- bbh_fewshot_date_understanding
- bbh_fewshot_disambiguation_qa
- bbh_fewshot_dyck_languages
- bbh_fewshot_formal_fallacies
- bbh_fewshot_geometric_shapes
- bbh_fewshot_hyperbaton
- bbh_fewshot_logical_deduction_five_objects
- bbh_fewshot_logical_deduction_seven_objects
- bbh_fewshot_logical_deduction_three_objects
- bbh_fewshot_movie_recommendation
- bbh_fewshot_multistep_arithmetic_two
- bbh_fewshot_navigate
- bbh_fewshot_object_counting
- bbh_fewshot_penguins_in_a_table
- bbh_fewshot_reasoning_about_colored_objects
- bbh_fewshot_ruin_names
- bbh_fewshot_salient_translation_error_detection
- bbh_fewshot_snarks
- bbh_fewshot_sports_understanding
- bbh_fewshot_temporal_sequences
- bbh_fewshot_tracking_shuffled_objects_five_objects
- bbh_fewshot_tracking_shuffled_objects_seven_objects
- bbh_fewshot_tracking_shuffled_objects_three_objects
- bbh_fewshot_web_of_lies
- bbh_fewshot_word_sorting
- bbh_zeroshot
- bbh_zeroshot_boolean_expressions
- bbh_zeroshot_causal_judgement
- bbh_zeroshot_date_understanding
- bbh_zeroshot_disambiguation_qa
- bbh_zeroshot_dyck_languages
- bbh_zeroshot_formal_fallacies
- bbh_zeroshot_geometric_shapes
- bbh_zeroshot_hyperbaton
- bbh_zeroshot_logical_deduction_five_objects
- bbh_zeroshot_logical_deduction_seven_objects
- bbh_zeroshot_logical_deduction_three_objects
- bbh_zeroshot_movie_recommendation
- bbh_zeroshot_multistep_arithmetic_two
- bbh_zeroshot_navigate
- bbh_zeroshot_object_counting
- bbh_zeroshot_penguins_in_a_table
- bbh_zeroshot_reasoning_about_colored_objects
- bbh_zeroshot_ruin_names
- bbh_zeroshot_salient_translation_error_detection
- bbh_zeroshot_snarks
- bbh_zeroshot_sports_understanding
- bbh_zeroshot_temporal_sequences
- bbh_zeroshot_tracking_shuffled_objects_five_objects
- bbh_zeroshot_tracking_shuffled_objects_seven_objects
- bbh_zeroshot_tracking_shuffled_objects_three_objects
- bbh_zeroshot_web_of_lies
- bbh_zeroshot_word_sorting
- belebele
- belebele_acm_Arab
- belebele_afr_Latn
- belebele_als_Latn
- belebele_amh_Ethi
- belebele_apc_Arab
- belebele_arb_Arab
- belebele_arb_Latn
- belebele_ars_Arab
- belebele_ary_Arab
- belebele_arz_Arab
- belebele_asm_Beng
- belebele_azj_Latn
- belebele_bam_Latn
- belebele_ben_Beng
- belebele_ben_Latn
- belebele_bod_Tibt
- belebele_bul_Cyrl
- belebele_cat_Latn
- belebele_ceb_Latn
- belebele_ces_Latn
- belebele_ckb_Arab
- belebele_dan_Latn
- belebele_deu_Latn
- belebele_ell_Grek
- belebele_eng_Latn
- belebele_est_Latn
- belebele_eus_Latn
- belebele_fin_Latn
- belebele_fra_Latn
- belebele_fuv_Latn
- belebele_gaz_Latn
- belebele_grn_Latn
- belebele_guj_Gujr
- belebele_hat_Latn
- belebele_hau_Latn
- belebele_heb_Hebr
- belebele_hin_Deva
- belebele_hin_Latn
- belebele_hrv_Latn
- belebele_hun_Latn
- belebele_hye_Armn
- belebele_ibo_Latn
- belebele_ilo_Latn
- belebele_ind_Latn
- belebele_isl_Latn
- belebele_ita_Latn
- belebele_jav_Latn
- belebele_jpn_Jpan
- belebele_kac_Latn
- belebele_kan_Knda
- belebele_kat_Geor
- belebele_kaz_Cyrl
- belebele_kea_Latn
- belebele_khk_Cyrl
- belebele_khm_Khmr
- belebele_kin_Latn
- belebele_kir_Cyrl
- belebele_kor_Hang
- belebele_lao_Laoo
- belebele_lin_Latn
- belebele_lit_Latn
- belebele_lug_Latn
- belebele_luo_Latn
- belebele_lvs_Latn
- belebele_mal_Mlym
- belebele_mar_Deva
- belebele_mkd_Cyrl
- belebele_mlt_Latn
- belebele_mri_Latn
- belebele_mya_Mymr
- belebele_nld_Latn
- belebele_nob_Latn
- belebele_npi_Deva
- belebele_npi_Latn
- belebele_nso_Latn
- belebele_nya_Latn
- belebele_ory_Orya
- belebele_pan_Guru
- belebele_pbt_Arab
- belebele_pes_Arab
- belebele_plt_Latn
- belebele_pol_Latn
- belebele_por_Latn
- belebele_ron_Latn
- belebele_rus_Cyrl
- belebele_shn_Mymr
- belebele_sin_Latn
- belebele_sin_Sinh
- belebele_slk_Latn
- belebele_slv_Latn
- belebele_sna_Latn
- belebele_snd_Arab
- belebele_som_Latn
- belebele_sot_Latn
- belebele_spa_Latn
- belebele_srp_Cyrl
- belebele_ssw_Latn
- belebele_sun_Latn
- belebele_swe_Latn
- belebele_swh_Latn
- belebele_tam_Taml
- belebele_tel_Telu
- belebele_tgk_Cyrl
- belebele_tgl_Latn
- belebele_tha_Thai
- belebele_tir_Ethi
- belebele_tsn_Latn
- belebele_tso_Latn
- belebele_tur_Latn
- belebele_ukr_Cyrl
- belebele_urd_Arab
- belebele_urd_Latn
- belebele_uzn_Latn
- belebele_vie_Latn
- belebele_war_Latn
- belebele_wol_Latn
- belebele_xho_Latn
- belebele_yor_Latn
- belebele_zho_Hans
- belebele_zho_Hant
- belebele_zsm_Latn
- belebele_zul_Latn
- bigbench_abstract_narrative_understanding_generate_until
- bigbench_abstract_narrative_understanding_multiple_choice
- bigbench_anachronisms_generate_until
- bigbench_anachronisms_multiple_choice
- bigbench_analogical_similarity_generate_until
- bigbench_analogical_similarity_multiple_choice
- bigbench_analytic_entailment_generate_until
- bigbench_analytic_entailment_multiple_choice
- bigbench_arithmetic_generate_until
- bigbench_arithmetic_multiple_choice
- bigbench_ascii_word_recognition_generate_until
- bigbench_ascii_word_recognition_multiple_choice
- bigbench_authorship_verification_generate_until
- bigbench_authorship_verification_multiple_choice
- bigbench_auto_categorization_generate_until
- bigbench_auto_categorization_multiple_choice
- bigbench_auto_debugging_generate_until
- bigbench_auto_debugging_multiple_choice
- bigbench_bbq_lite_json_generate_until
- bigbench_bbq_lite_json_multiple_choice
- bigbench_bridging_anaphora_resolution_barqa_generate_until
- bigbench_bridging_anaphora_resolution_barqa_multiple_choice
- bigbench_causal_judgement_multiple_choice
- bigbench_causal_judgment_generate_until
- bigbench_causal_judgment_multiple_choice
- bigbench_cause_and_effect_generate_until
- bigbench_cause_and_effect_multiple_choice
- bigbench_checkmate_in_one_generate_until
- bigbench_checkmate_in_one_multiple_choice
- bigbench_chess_state_tracking_generate_until
- bigbench_chess_state_tracking_multiple_choice
- bigbench_chinese_remainder_theorem_generate_until
- bigbench_chinese_remainder_theorem_multiple_choice
- bigbench_cifar10_classification_generate_until
- bigbench_cifar10_classification_multiple_choice
- bigbench_code_line_description_generate_until
- bigbench_code_line_description_multiple_choice
- bigbench_codenames_generate_until
- bigbench_codenames_multiple_choice
- bigbench_color_generate_until
- bigbench_color_multiple_choice
- bigbench_common_morpheme_generate_until
- bigbench_common_morpheme_multiple_choice
- bigbench_conceptual_combinations_generate_until
- bigbench_conceptual_combinations_multiple_choice
- bigbench_conlang_translation_generate_until
- bigbench_conlang_translation_multiple_choice
- bigbench_contextual_parametric_knowledge_conflicts_generate_until
- bigbench_contextual_parametric_knowledge_conflicts_multiple_choice
- bigbench_crash_blossom_generate_until
- bigbench_crash_blossom_multiple_choice
- bigbench_crass_ai_generate_until
- bigbench_crass_ai_multiple_choice
- bigbench_cryobiology_spanish_generate_until
- bigbench_cryobiology_spanish_multiple_choice
- bigbench_cryptonite_generate_until
- bigbench_cryptonite_multiple_choice
- bigbench_cs_algorithms_generate_until
- bigbench_cs_algorithms_multiple_choice
- bigbench_dark_humor_detection_generate_until
- bigbench_dark_humor_detection_multiple_choice
- bigbench_date_understanding_generate_until
- bigbench_date_understanding_multiple_choice
- bigbench_disambiguation_qa_generate_until
- bigbench_disambiguation_qa_multiple_choice
- bigbench_discourse_marker_prediction_generate_until
- bigbench_discourse_marker_prediction_multiple_choice
- bigbench_disfl_qa_generate_until
- bigbench_disfl_qa_multiple_choice
- bigbench_dyck_languages_generate_until
- bigbench_dyck_languages_multiple_choice
- bigbench_elementary_math_qa_generate_until
- bigbench_elementary_math_qa_multiple_choice
- bigbench_emoji_movie_generate_until
- bigbench_emoji_movie_multiple_choice
- bigbench_emojis_emotion_prediction_generate_until
- bigbench_emojis_emotion_prediction_multiple_choice
- bigbench_empirical_judgments_generate_until
- bigbench_empirical_judgments_multiple_choice
- bigbench_english_proverbs_generate_until
- bigbench_english_proverbs_multiple_choice
- bigbench_english_russian_proverbs_generate_until
- bigbench_english_russian_proverbs_multiple_choice
- bigbench_entailed_polarity_generate_until
- bigbench_entailed_polarity_hindi_generate_until
- bigbench_entailed_polarity_hindi_multiple_choice
- bigbench_entailed_polarity_multiple_choice
- bigbench_epistemic_reasoning_generate_until
- bigbench_epistemic_reasoning_multiple_choice
- bigbench_evaluating_information_essentiality_generate_until
- bigbench_evaluating_information_essentiality_multiple_choice
- bigbench_fact_checker_generate_until
- bigbench_fact_checker_multiple_choice
- bigbench_fantasy_reasoning_generate_until
- bigbench_fantasy_reasoning_multiple_choice
- bigbench_few_shot_nlg_generate_until
- bigbench_few_shot_nlg_multiple_choice
- bigbench_figure_of_speech_detection_generate_until
- bigbench_figure_of_speech_detection_multiple_choice
- bigbench_formal_fallacies_syllogisms_negation_generate_until
- bigbench_formal_fallacies_syllogisms_negation_multiple_choice
- bigbench_gem_generate_until
- bigbench_gem_multiple_choice
- bigbench_gender_inclusive_sentences_german_generate_until
- bigbench_gender_inclusive_sentences_german_multiple_choice
- bigbench_general_knowledge_generate_until
- bigbench_general_knowledge_multiple_choice
- bigbench_generate_until
- bigbench_geometric_shapes_generate_until
- bigbench_geometric_shapes_multiple_choice
- bigbench_goal_step_wikihow_generate_until
- bigbench_goal_step_wikihow_multiple_choice
- bigbench_gre_reading_comprehension_generate_until
- bigbench_gre_reading_comprehension_multiple_choice
- bigbench_hhh_alignment_generate_until
- bigbench_hhh_alignment_multiple_choice
- bigbench_hindi_question_answering_generate_until
- bigbench_hindi_question_answering_multiple_choice
- bigbench_hindu_knowledge_generate_until
- bigbench_hindu_knowledge_multiple_choice
- bigbench_hinglish_toxicity_generate_until
- bigbench_hinglish_toxicity_multiple_choice
- bigbench_human_organs_senses_generate_until
- bigbench_human_organs_senses_multiple_choice
- bigbench_hyperbaton_generate_until
- bigbench_hyperbaton_multiple_choice
- bigbench_identify_math_theorems_generate_until
- bigbench_identify_math_theorems_multiple_choice
- bigbench_identify_odd_metaphor_generate_until
- bigbench_identify_odd_metaphor_multiple_choice
- bigbench_implicatures_generate_until
- bigbench_implicatures_multiple_choice
- bigbench_implicit_relations_generate_until
- bigbench_implicit_relations_multiple_choice
- bigbench_intent_recognition_generate_until
- bigbench_intent_recognition_multiple_choice
- bigbench_international_phonetic_alphabet_nli_generate_until
- bigbench_international_phonetic_alphabet_nli_multiple_choice
- bigbench_international_phonetic_alphabet_transliterate_generate_until
- bigbench_international_phonetic_alphabet_transliterate_multiple_choice
- bigbench_intersect_geometry_generate_until
- bigbench_intersect_geometry_multiple_choice
- bigbench_irony_identification_generate_until
- bigbench_irony_identification_multiple_choice
- bigbench_kanji_ascii_generate_until
- bigbench_kanji_ascii_multiple_choice
- bigbench_kannada_generate_until
- bigbench_kannada_multiple_choice
- bigbench_key_value_maps_generate_until
- bigbench_key_value_maps_multiple_choice
- bigbench_known_unknowns_generate_until
- bigbench_known_unknowns_multiple_choice
- bigbench_language_games_generate_until
- bigbench_language_games_multiple_choice
- bigbench_language_identification_generate_until
- bigbench_language_identification_multiple_choice
- bigbench_linguistic_mappings_generate_until
- bigbench_linguistic_mappings_multiple_choice
- bigbench_linguistics_puzzles_generate_until
- bigbench_linguistics_puzzles_multiple_choice
- bigbench_list_functions_generate_until
- bigbench_list_functions_multiple_choice
- bigbench_logic_grid_puzzle_generate_until
- bigbench_logic_grid_puzzle_multiple_choice
- bigbench_logical_args_generate_until
- bigbench_logical_args_multiple_choice
- bigbench_logical_deduction_generate_until
- bigbench_logical_deduction_multiple_choice
- bigbench_logical_fallacy_detection_generate_until
- bigbench_logical_fallacy_detection_multiple_choice
- bigbench_logical_sequence_generate_until
- bigbench_logical_sequence_multiple_choice
- bigbench_mathematical_induction_generate_until
- bigbench_mathematical_induction_multiple_choice
- bigbench_matrixshapes_generate_until
- bigbench_matrixshapes_multiple_choice
- bigbench_metaphor_boolean_generate_until
- bigbench_metaphor_boolean_multiple_choice
- bigbench_metaphor_understanding_generate_until
- bigbench_metaphor_understanding_multiple_choice
- bigbench_minute_mysteries_qa_generate_until
- bigbench_minute_mysteries_qa_multiple_choice
- bigbench_misconceptions_generate_until
- bigbench_misconceptions_multiple_choice
- bigbench_misconceptions_russian_generate_until
- bigbench_misconceptions_russian_multiple_choice
- bigbench_mnist_ascii_generate_until
- bigbench_mnist_ascii_multiple_choice
- bigbench_modified_arithmetic_generate_until
- bigbench_modified_arithmetic_multiple_choice
- bigbench_moral_permissibility_generate_until
- bigbench_moral_permissibility_multiple_choice
- bigbench_movie_dialog_same_or_different_generate_until
- bigbench_movie_dialog_same_or_different_multiple_choice
- bigbench_movie_recommendation_generate_until
- bigbench_movie_recommendation_multiple_choice
- bigbench_mult_data_wrangling_generate_until
- bigbench_mult_data_wrangling_multiple_choice
- bigbench_multiemo_generate_until
- bigbench_multiemo_multiple_choice
- bigbench_multiple_choice
- bigbench_natural_instructions_generate_until
- bigbench_natural_instructions_multiple_choice
- bigbench_navigate_generate_until
- bigbench_navigate_multiple_choice
- bigbench_nonsense_words_grammar_generate_until
- bigbench_nonsense_words_grammar_multiple_choice
- bigbench_novel_concepts_generate_until
- bigbench_novel_concepts_multiple_choice
- bigbench_object_counting_generate_until
- bigbench_object_counting_multiple_choice
- bigbench_odd_one_out_generate_until
- bigbench_odd_one_out_multiple_choice
- bigbench_operators_generate_until
- bigbench_operators_multiple_choice
- bigbench_paragraph_segmentation_generate_until
- bigbench_paragraph_segmentation_multiple_choice
- bigbench_parsinlu_qa_generate_until
- bigbench_parsinlu_qa_multiple_choice
- bigbench_parsinlu_reading_comprehension_generate_until
- bigbench_parsinlu_reading_comprehension_multiple_choice
- bigbench_penguins_in_a_table_generate_until
- bigbench_penguins_in_a_table_multiple_choice
- bigbench_periodic_elements_generate_until
- bigbench_periodic_elements_multiple_choice
- bigbench_persian_idioms_generate_until
- bigbench_persian_idioms_multiple_choice
- bigbench_phrase_relatedness_generate_until
- bigbench_phrase_relatedness_multiple_choice
- bigbench_physical_intuition_generate_until
- bigbench_physical_intuition_multiple_choice
- bigbench_physics_generate_until
- bigbench_physics_multiple_choice
- bigbench_physics_questions_generate_until
- bigbench_physics_questions_multiple_choice
- bigbench_play_dialog_same_or_different_generate_until
- bigbench_play_dialog_same_or_different_multiple_choice
- bigbench_polish_sequence_labeling_generate_until
- bigbench_polish_sequence_labeling_multiple_choice
- bigbench_presuppositions_as_nli_generate_until
- bigbench_presuppositions_as_nli_multiple_choice
- bigbench_qa_wikidata_generate_until
- bigbench_qa_wikidata_multiple_choice
- bigbench_question_selection_generate_until
- bigbench_question_selection_multiple_choice
- bigbench_real_or_fake_text_generate_until
- bigbench_real_or_fake_text_multiple_choice
- bigbench_reasoning_about_colored_objects_generate_until
- bigbench_reasoning_about_colored_objects_multiple_choice
- bigbench_repeat_copy_logic_generate_until
- bigbench_repeat_copy_logic_multiple_choice
- bigbench_rephrase_generate_until
- bigbench_rephrase_multiple_choice
- bigbench_riddle_sense_generate_until
- bigbench_riddle_sense_multiple_choice
- bigbench_ruin_names_generate_until
- bigbench_ruin_names_multiple_choice
- bigbench_salient_translation_error_detection_generate_until
- bigbench_salient_translation_error_detection_multiple_choice
- bigbench_scientific_press_release_generate_until
- bigbench_scientific_press_release_multiple_choice
- bigbench_semantic_parsing_in_context_sparc_generate_until
- bigbench_semantic_parsing_in_context_sparc_multiple_choice
- bigbench_semantic_parsing_spider_generate_until
- bigbench_semantic_parsing_spider_multiple_choice
- bigbench_sentence_ambiguity_generate_until
- bigbench_sentence_ambiguity_multiple_choice
- bigbench_similarities_abstraction_generate_until
- bigbench_similarities_abstraction_multiple_choice
- bigbench_simp_turing_concept_generate_until
- bigbench_simp_turing_concept_multiple_choice
- bigbench_simple_arithmetic_json_generate_until
- bigbench_simple_arithmetic_json_multiple_choice
- bigbench_simple_arithmetic_json_multiple_choice_generate_until
- bigbench_simple_arithmetic_json_multiple_choice_multiple_choice
- bigbench_simple_arithmetic_json_subtasks_generate_until
- bigbench_simple_arithmetic_json_subtasks_multiple_choice
- bigbench_simple_arithmetic_multiple_targets_json_generate_until
- bigbench_simple_arithmetic_multiple_targets_json_multiple_choice
- bigbench_simple_ethical_questions_generate_until
- bigbench_simple_ethical_questions_multiple_choice
- bigbench_simple_text_editing_generate_until
- bigbench_simple_text_editing_multiple_choice
- bigbench_snarks_generate_until
- bigbench_snarks_multiple_choice
- bigbench_social_iqa_generate_until
- bigbench_social_iqa_multiple_choice
- bigbench_social_support_generate_until
- bigbench_social_support_multiple_choice
- bigbench_sports_understanding_generate_until
- bigbench_sports_understanding_multiple_choice
- bigbench_strange_stories_generate_until
- bigbench_strange_stories_multiple_choice
- bigbench_strategyqa_generate_until
- bigbench_strategyqa_multiple_choice
- bigbench_sufficient_information_generate_until
- bigbench_sufficient_information_multiple_choice
- bigbench_suicide_risk_generate_until
- bigbench_suicide_risk_multiple_choice
- bigbench_swahili_english_proverbs_generate_until
- bigbench_swahili_english_proverbs_multiple_choice
- bigbench_swedish_to_german_proverbs_generate_until
- bigbench_swedish_to_german_proverbs_multiple_choice
- bigbench_symbol_interpretation_generate_until
- bigbench_symbol_interpretation_multiple_choice
- bigbench_temporal_sequences_generate_until
- bigbench_temporal_sequences_multiple_choice
- bigbench_tense_generate_until
- bigbench_tense_multiple_choice
- bigbench_timedial_generate_until
- bigbench_timedial_multiple_choice
- bigbench_topical_chat_generate_until
- bigbench_topical_chat_multiple_choice
- bigbench_tracking_shuffled_objects_generate_until
- bigbench_tracking_shuffled_objects_multiple_choice
- bigbench_understanding_fables_generate_until
- bigbench_understanding_fables_multiple_choice
- bigbench_undo_permutation_generate_until
- bigbench_undo_permutation_multiple_choice
- bigbench_unit_conversion_generate_until
- bigbench_unit_conversion_multiple_choice
- bigbench_unit_interpretation_generate_until
- bigbench_unit_interpretation_multiple_choice
- bigbench_unnatural_in_context_learning_generate_until
- bigbench_unnatural_in_context_learning_multiple_choice
- bigbench_vitaminc_fact_verification_generate_until
- bigbench_vitaminc_fact_verification_multiple_choice
- bigbench_what_is_the_tao_generate_until
- bigbench_what_is_the_tao_multiple_choice
- bigbench_which_wiki_edit_generate_until
- bigbench_which_wiki_edit_multiple_choice
- bigbench_winowhy_generate_until
- bigbench_winowhy_multiple_choice
- bigbench_word_sorting_generate_until
- bigbench_word_sorting_multiple_choice
- bigbench_word_unscrambling_generate_until
- bigbench_word_unscrambling_multiple_choice
- blimp
- blimp_adjunct_island
- blimp_anaphor_gender_agreement
- blimp_anaphor_number_agreement
- blimp_animate_subject_passive
- blimp_animate_subject_trans
- blimp_causative
- blimp_complex_NP_island
- blimp_coordinate_structure_constraint_complex_left_branch
- blimp_coordinate_structure_constraint_object_extraction
- blimp_determiner_noun_agreement_1
- blimp_determiner_noun_agreement_2
- blimp_determiner_noun_agreement_irregular_1
- blimp_determiner_noun_agreement_irregular_2
- blimp_determiner_noun_agreement_with_adj_2
- blimp_determiner_noun_agreement_with_adj_irregular_1
- blimp_determiner_noun_agreement_with_adj_irregular_2
- blimp_determiner_noun_agreement_with_adjective_1
- blimp_distractor_agreement_relational_noun
- blimp_distractor_agreement_relative_clause
- blimp_drop_argument
- blimp_ellipsis_n_bar_1
- blimp_ellipsis_n_bar_2
- blimp_existential_there_object_raising
- blimp_existential_there_quantifiers_1
- blimp_existential_there_quantifiers_2
- blimp_existential_there_subject_raising
- blimp_expletive_it_object_raising
- blimp_inchoative
- blimp_intransitive
- blimp_irregular_past_participle_adjectives
- blimp_irregular_past_participle_verbs
- blimp_irregular_plural_subject_verb_agreement_1
- blimp_irregular_plural_subject_verb_agreement_2
- blimp_left_branch_island_echo_question
- blimp_left_branch_island_simple_question
- blimp_matrix_question_npi_licensor_present
- blimp_npi_present_1
- blimp_npi_present_2
- blimp_only_npi_licensor_present
- blimp_only_npi_scope
- blimp_passive_1
- blimp_passive_2
- blimp_principle_A_c_command
- blimp_principle_A_case_1
- blimp_principle_A_case_2
- blimp_principle_A_domain_1
- blimp_principle_A_domain_2
- blimp_principle_A_domain_3
- blimp_principle_A_reconstruction
- blimp_regular_plural_subject_verb_agreement_1
- blimp_regular_plural_subject_verb_agreement_2
- blimp_sentential_negation_npi_licensor_present
- blimp_sentential_negation_npi_scope
- blimp_sentential_subject_island
- blimp_superlative_quantifiers_1
- blimp_superlative_quantifiers_2
- blimp_tough_vs_raising_1
- blimp_tough_vs_raising_2
- blimp_transitive
- blimp_wh_island
- blimp_wh_questions_object_gap
- blimp_wh_questions_subject_gap
- blimp_wh_questions_subject_gap_long_distance
- blimp_wh_vs_that_no_gap
- blimp_wh_vs_that_no_gap_long_distance
- blimp_wh_vs_that_with_gap
- blimp_wh_vs_that_with_gap_long_distance
- boolq
- boolq-seq2seq
- cb
- ceval-valid
- ceval-valid_accountant
- ceval-valid_advanced_mathematics
- ceval-valid_art_studies
- ceval-valid_basic_medicine
- ceval-valid_business_administration
- ceval-valid_chinese_language_and_literature
- ceval-valid_civil_servant
- ceval-valid_clinical_medicine
- ceval-valid_college_chemistry
- ceval-valid_college_economics
- ceval-valid_college_physics
- ceval-valid_college_programming
- ceval-valid_computer_architecture
- ceval-valid_computer_network
- ceval-valid_discrete_mathematics
- ceval-valid_education_science
- ceval-valid_electrical_engineer
- ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_fire_engineer
- ceval-valid_high_school_biology
- ceval-valid_high_school_chemistry
- ceval-valid_high_school_chinese
- ceval-valid_high_school_geography
- ceval-valid_high_school_history
- ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics
- ceval-valid_high_school_politics
- ceval-valid_ideological_and_moral_cultivation
- ceval-valid_law
- ceval-valid_legal_professional
- ceval-valid_logic
- ceval-valid_mao_zedong_thought
- ceval-valid_marxism
- ceval-valid_metrology_engineer
- ceval-valid_middle_school_biology
- ceval-valid_middle_school_chemistry
- ceval-valid_middle_school_geography
- ceval-valid_middle_school_history
- ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_physics
- ceval-valid_middle_school_politics
- ceval-valid_modern_chinese_history
- ceval-valid_operating_system
- ceval-valid_physician
- ceval-valid_plant_protection
- ceval-valid_probability_and_statistics
- ceval-valid_professional_tour_guide
- ceval-valid_sports_science
- ceval-valid_tax_accountant
- ceval-valid_teacher_qualification
- ceval-valid_urban_and_rural_planner
- ceval-valid_veterinary_medicine
- chain_of_thought
- cmmlu
- cmmlu_agronomy
- cmmlu_anatomy
- cmmlu_ancient_chinese
- cmmlu_arts
- cmmlu_astronomy
- cmmlu_business_ethics
- cmmlu_chinese_civil_service_exam
- cmmlu_chinese_driving_rule
- cmmlu_chinese_food_culture
- cmmlu_chinese_foreign_policy
- cmmlu_chinese_history
- cmmlu_chinese_literature
- cmmlu_chinese_teacher_qualification
- cmmlu_clinical_knowledge
- cmmlu_college_actuarial_science
- cmmlu_college_education
- cmmlu_college_engineering_hydrology
- cmmlu_college_law
- cmmlu_college_mathematics
- cmmlu_college_medical_statistics
- cmmlu_college_medicine
- cmmlu_computer_science
- cmmlu_computer_security
- cmmlu_conceptual_physics
- cmmlu_construction_project_management
- cmmlu_economics
- cmmlu_education
- cmmlu_electrical_engineering
- cmmlu_elementary_chinese
- cmmlu_elementary_commonsense
- cmmlu_elementary_information_and_technology
- cmmlu_elementary_mathematics
- cmmlu_ethnology
- cmmlu_food_science
- cmmlu_genetics
- cmmlu_global_facts
- cmmlu_high_school_biology
- cmmlu_high_school_chemistry
- cmmlu_high_school_geography
- cmmlu_high_school_mathematics
- cmmlu_high_school_physics
- cmmlu_high_school_politics
- cmmlu_human_sexuality
- cmmlu_international_law
- cmmlu_journalism
- cmmlu_jurisprudence
- cmmlu_legal_and_moral_basis
- cmmlu_logical
- cmmlu_machine_learning
- cmmlu_management
- cmmlu_marketing
- cmmlu_marxist_theory
- cmmlu_modern_chinese
- cmmlu_nutrition
- cmmlu_philosophy
- cmmlu_professional_accounting
- cmmlu_professional_law
- cmmlu_professional_medicine
- cmmlu_professional_psychology
- cmmlu_public_relations
- cmmlu_security_study
- cmmlu_sociology
- cmmlu_sports_science
- cmmlu_traditional_chinese_medicine
- cmmlu_virology
- cmmlu_world_history
- cmmlu_world_religions
- code2text_go
- code2text_java
- code2text_javascript
- code2text_php
- code2text_python
- code2text_ruby
- codexglue_code2text
- cola
- copa
- coqa
- crows_pairs
- crows_pairs_english
- crows_pairs_english_age
- crows_pairs_english_autre
- crows_pairs_english_disability
- crows_pairs_english_gender
- crows_pairs_english_nationality
- crows_pairs_english_physical_appearance
- crows_pairs_english_race_color
- crows_pairs_english_religion
- crows_pairs_english_sexual_orientation
- crows_pairs_english_socioeconomic
- crows_pairs_french
- crows_pairs_french_age
- crows_pairs_french_autre
- crows_pairs_french_disability
- crows_pairs_french_gender
- crows_pairs_french_nationality
- crows_pairs_french_physical_appearance
- crows_pairs_french_race_color
- crows_pairs_french_religion
- crows_pairs_french_sexual_orientation
- crows_pairs_french_socioeconomic
- csatqa
- csatqa_gr
- csatqa_li
- csatqa_rch
- csatqa_rcs
- csatqa_rcss
- csatqa_wr
- cycle_letters
- drop
- ethics_cm
- ethics_deontology
- ethics_justice
- ethics_utilitarianism
- ethics_virtue
- flan_held_in
- flan_held_out
- fld
- fld_default
- fld_star
- freebase
- generate_until
- glue
- gpt3_translation_benchmarks
- gsm8k
- gsm8k_cot
- gsm8k_cot_self_consistency
- headqa
- headqa_en
- headqa_es
- hellaswag
- hellaswag_ar
- hellaswag_bn
- hellaswag_ca
- hellaswag_da
- hellaswag_de
- hellaswag_es
- hellaswag_eu
- hellaswag_fr
- hellaswag_gu
- hellaswag_hi
- hellaswag_hr
- hellaswag_hu
- hellaswag_hy
- hellaswag_id
- hellaswag_it
- hellaswag_kn
- hellaswag_ml
- hellaswag_mr
- hellaswag_multilingual
- hellaswag_ne
- hellaswag_nl
- hellaswag_pt
- hellaswag_ro
- hellaswag_ru
- hellaswag_sk
- hellaswag_sr
- hellaswag_sv
- hellaswag_ta
- hellaswag_te
- hellaswag_uk
- hellaswag_vi
- hendrycks_ethics
- ifeval
- iwslt2017
- iwslt2017-ar-en
- iwslt2017-en-ar
- kmmlu
- kmmlu_accounting
- kmmlu_agricultural_sciences
- kmmlu_aviation_engineering_and_maintenance
- kmmlu_biology
- kmmlu_chemical_engineering
- kmmlu_chemistry
- kmmlu_civil_engineering
- kmmlu_computer_science
- kmmlu_construction
- kmmlu_criminal_law
- kmmlu_ecology
- kmmlu_economics
- kmmlu_education
- kmmlu_electrical_engineering
- kmmlu_electronics_engineering
- kmmlu_energy_management
- kmmlu_environmental_science
- kmmlu_fashion
- kmmlu_food_processing
- kmmlu_gas_technology_and_engineering
- kmmlu_geomatics
- kmmlu_health
- kmmlu_industrial_engineer
- kmmlu_information_technology
- kmmlu_interior_architecture_and_design
- kmmlu_law
- kmmlu_machine_design_and_manufacturing
- kmmlu_management
- kmmlu_maritime_engineering
- kmmlu_marketing
- kmmlu_materials_engineering
- kmmlu_mechanical_engineering
- kmmlu_nondestructive_testing
- kmmlu_patent
- kmmlu_political_science_and_sociology
- kmmlu_psychology
- kmmlu_public_safety
- kmmlu_railway_and_automotive_engineering
- kmmlu_real_estate
- kmmlu_refrigerating_machinery
- kmmlu_social_welfare
- kmmlu_taxation
- kmmlu_telecommunications_and_wireless_technology
- kobest
- kobest_boolq
- kobest_copa
- kobest_hellaswag
- kobest_sentineg
- kobest_wic
- lambada
- lambada_cloze
- lambada_multilingual