-
Notifications
You must be signed in to change notification settings - Fork 38
/
duck.py
218 lines (188 loc) · 8.03 KB
/
duck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python3
"""
This script converts MultiPL-E results into a DuckDB database.
It provides two subcommands:
1. init - creates a new database with the required schema
2. insert - inserts data from a directory of results
"""
import argparse
from pathlib import Path
import duckdb
import gzip
import json
from typing import Optional, List
from collections import defaultdict
import datetime
from tqdm.auto import tqdm
# Fallback timestamp used when an execution result has no "timestamp" field:
# the Unix epoch as a *naive* datetime (no timezone attached — consistent with
# the naive datetimes produced by datetime.fromtimestamp below).
time_1970 = datetime.datetime(1970, 1, 1)
def gunzip_json(path: Path) -> Optional[dict]:
    """Load and parse a gzip-compressed JSON file.

    Returns the decoded object, or None when anything goes wrong
    (missing file, invalid gzip stream, malformed JSON, ...). Callers
    use the None to skip unreadable result files.
    """
    try:
        with gzip.open(path, mode="rt") as stream:
            raw_text = stream.read()
        return json.loads(raw_text)
    except Exception:
        return None
def init_db(args):
    """Create the experiment/completions schema in a fresh DuckDB database.

    Idempotent: every object is created with IF NOT EXISTS, so re-running
    against an existing database is harmless.
    """
    # Trailing comments inside the DDL give example values for each column.
    with duckdb.connect(str(args.db_path)) as conn:
        conn.execute("""
    CREATE SEQUENCE IF NOT EXISTS experiment_id START 1;
    CREATE TABLE IF NOT EXISTS experiment (
        id INTEGER PRIMARY KEY DEFAULT NEXTVAL('experiment_id'),
        base_dataset VARCHAR NOT NULL, -- humaneval, mbpp, ...
        language VARCHAR NOT NULL, -- python, cpp, ...
        model VARCHAR NOT NULL, -- gpt-4o, claude-3-5-sonnet, ...
        variation VARCHAR NOT NULL, -- reworded, ...
        problem VARCHAR NOT NULL, -- HumanEval_53_add, ...
        temperature DOUBLE NOT NULL,
        top_p DOUBLE NOT NULL,
        max_tokens INTEGER NOT NULL,
        stop_tokens VARCHAR[] NOT NULL
    );
    CREATE TABLE IF NOT EXISTS completions (
        experiment_id INTEGER NOT NULL REFERENCES experiment(id),
        prompt VARCHAR NOT NULL,
        completion VARCHAR NOT NULL,
        tests VARCHAR NOT NULL,
        program VARCHAR NOT NULL, -- the complete program, including tests
        stdout VARCHAR NOT NULL,
        stderr VARCHAR NOT NULL,
        exit_code INTEGER, -- NULL exit_code on timeouts.
        status VARCHAR NOT NULL,
        timestamp TIMESTAMP NOT NULL,
        count INTEGER NOT NULL,
        PRIMARY KEY (experiment_id, prompt,completion)
    )
    """)
    print(f"Initialized database at {args.db_path}")
def process_experiment(conn: duckdb.DuckDBPyConnection, experiment_path: Path):
    """Insert one experiment directory's completions and executions.

    The directory name must have exactly five dash-separated fields:
    "<base_dataset>-<language>-<model>-<temperature>-<variation>".
    Inside, each problem has a pair of files: "<problem>.json.gz"
    (prompt + completions) and "<problem>.results.json.gz" (executions).
    NOTE(review): `temperature` stays a string here; DuckDB presumably
    casts it when inserting into / comparing against the DOUBLE column —
    confirm.
    """
    tqdm.write(f"Processing experiment {experiment_path}")
    base_dataset, language, model, temperature, variation = experiment_path.name.split("-")
    # NOTE(arjun): This highlights the bad design of the MultiPL-E results
    # format.
    created_experiment_record = False
    for executions_path in experiment_path.glob("*.results.json.gz"):
        executions = gunzip_json(executions_path)
        # Strip the 16-character ".results.json.gz" suffix to locate the
        # sibling completions file "<problem>.json.gz".
        completions_path = executions_path.parent / (
            executions_path.name[:-16] + ".json.gz"
        )
        completions = gunzip_json(completions_path)
        # Skip problems whose files are missing or unreadable.
        if completions is None or executions is None:
            continue
        # On the first completion/execution, we create the experiment record.
        if not created_experiment_record:
            # Check if an experiment record already exists with these values.
            # If so, we assume that the experiment has already been processed
            # and skip the whole directory.
            existing_experiment = conn.execute("""
                SELECT id FROM experiment
                WHERE base_dataset = ? AND language = ? AND model = ? AND variation = ? AND problem = ? AND temperature = ?
            """, (base_dataset, language, model, variation, completions["name"], temperature)).fetchone()
            if existing_experiment:
                tqdm.write(f"Experiment record already exists with id {existing_experiment[0]}")
                return
            # Create the experiment record. `problem` is taken from the first
            # readable completions file in the directory; top_p/max_tokens fall
            # back to 0.95/512 when absent from the executions file.
            conn.execute("""
                INSERT INTO experiment (
                    base_dataset, language, model, variation, problem, temperature, top_p, max_tokens, stop_tokens
                ) VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?
                )
                RETURNING id
            """, (
                base_dataset, language, model, variation, completions["name"],
                temperature, executions.get("top_p", 0.95), executions.get("max_tokens", 512),
                str(completions["stop_tokens"])
            ))
            # Fetch the id produced by the RETURNING clause of the INSERT.
            experiment_id = conn.fetchone()[0]
            tqdm.write(f"Created experiment record {experiment_id}")
            created_experiment_record = True
        # Completions and their execution results are parallel lists; a
        # duplicate (experiment, prompt, completion) triple bumps `count`
        # instead of inserting a second row.
        for completion, result in zip(completions["completions"], executions["results"]):
            conn.execute("""
                INSERT INTO completions (
                    experiment_id, prompt,completion, tests, program, stdout, stderr, exit_code, status, timestamp, count
                ) VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1
                )
                ON CONFLICT(experiment_id, prompt,completion) DO UPDATE SET count = count + 1
            """, (
                experiment_id,
                completions["prompt"],
                completion,
                completions["tests"],
                result["program"],
                result["stdout"],
                result["stderr"],
                result["exit_code"],
                result["status"],
                # Old result files have no timestamp; fall back to the epoch.
                datetime.datetime.fromtimestamp(result["timestamp"]) if "timestamp" in result else time_1970,
            ))
def pass1(conn: duckdb.DuckDBPyConnection):
    """Compute and print pass@1 for every experiment in the database.

    The CTE computes, per (experiment, prompt), the count-weighted fraction
    of completions whose status is 'OK'; the outer query averages those
    per-prompt rates within each experiment and prints one tuple per
    experiment. max/min_completions expose uneven sampling across prompts.
    """
    # NOTE(review): the trailing commas before FROM and at the end of the
    # GROUP BY list are tolerated by DuckDB's SQL dialect.
    conn.execute("""
    WITH prompt_pass_rates AS (
        SELECT
            e.id AS experiment_id,
            c.prompt,
            SUM(CASE WHEN c.status = 'OK' THEN c.count ELSE 0 END) * 1.0 / SUM(c.count) AS pass_rate,
            SUM(c.count) as count,
        FROM
            main.experiment e
        JOIN
            main.completions c
        ON
            e.id = c.experiment_id
        GROUP BY
            e.id, c.prompt,
    )
    SELECT
        p.experiment_id,
        ANY_VALUE(e.base_dataset) as base_dataset,
        ANY_VALUE(e.model) as model,
        ANY_VALUE(e.language) as language,
        ANY_VALUE(e.temperature) as temperature,
        MAX(p.count) AS max_completions,
        MIN(p.count) AS min_completions,
        ROUND(AVG(p.pass_rate), 2) AS pass1,
    FROM
        prompt_pass_rates p
    JOIN
        main.experiment e
    ON e.id = p.experiment_id
    GROUP BY
        experiment_id;
    """)
    # Print each experiment's summary row as a raw tuple.
    for item in conn.fetchall():
        print(item)
def process_experiments(conn: duckdb.DuckDBPyConnection, experiment_paths: List[Path]):
    """Insert each experiment directory, one transaction per directory.

    Fix: the original left the transaction open if process_experiment
    raised, poisoning the connection for subsequent work. We now roll
    back the failed directory's transaction and re-raise, so the database
    never retains a half-inserted experiment.
    """
    for experiment_path in tqdm(experiment_paths):
        conn.sql("BEGIN TRANSACTION")
        # BaseException so a Ctrl-C mid-directory also rolls back.
        try:
            process_experiment(conn, experiment_path)
        except BaseException:
            conn.sql("ROLLBACK")
            raise
        else:
            conn.sql("COMMIT")
def main():
    """CLI entry point: parse arguments and dispatch to init/insert/pass1."""
    parser = argparse.ArgumentParser(description="Convert MultiPL-E results to DuckDB")
    parser.add_argument(
        "--db",
        dest="db_path",
        type=Path,
        default="completions.db",
        help="Path to the DuckDB database file",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)
    # "init" and "pass1" take no extra arguments of their own.
    subparsers.add_parser("init", help="Initialize a new database")
    insert_cmd = subparsers.add_parser("insert", help="Insert data from a directory of experiments")
    insert_cmd.add_argument("experiments", nargs="+", type=Path,
                            help="Path to the experiment directory")
    subparsers.add_parser("pass1", help="Compute pass1")
    args = parser.parse_args()

    if args.command == "init":
        # init_db opens (and closes) its own connection.
        init_db(args)
        return
    # Both remaining subcommands share a single connection.
    with duckdb.connect(str(args.db_path)) as conn:
        if args.command == "insert":
            process_experiments(conn, args.experiments)
        else:
            pass1(conn)


if __name__ == "__main__":
    main()