-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprepare_data.py
50 lines (37 loc) · 1.52 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
OLD - not used anymore.
The result of this script is already downloadable (see the README.md).
So this file is really not important, I just kept it because it was used to generate the data I provide.
Convert all the pickle files (from the original Othello dataset) in {pickles_dir} and put them in {data_dir}:
- pad all the games to length 60 (with -1's)
- ready to open with np.memmap() as np.int8
A .bin file is thus a flat int8 array of N*60 values, i.e. N rows of 60 moves once reshaped to (N, 60) (N is approx. 100K, the number of games per file).
Each game consists of a sequence of moves, each encoded from 0 to 63.
"""
import os
import pickle
import numpy as np
# Input and output locations, resolved relative to the current working directory.
pickles_dir = "pickles/"
data_dir = "data/"
train_dir = os.path.join(data_dir, 'train')
val_dir = os.path.join(data_dir, 'val')

# Width of every row written to disk; shorter games are right-padded with -1.
max_game_len = 60

# exist_ok=True keeps re-runs safe and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting makes both the
# train/val split and the games_{i}.bin numbering reproducible across runs.
pickles_files = sorted(os.listdir(pickles_dir))

# Files whose index is below this threshold (the first 80%) go to train_dir,
# the remaining ones go to val_dir.
train_threshold = int(len(pickles_files) * 0.8)

for i, filename in enumerate(pickles_files):
    # SECURITY NOTE: pickle.load can execute arbitrary code while loading.
    # Only run this script on dataset files obtained from a trusted source.
    with open(os.path.join(pickles_dir, filename), 'rb') as handle:
        games = pickle.load(handle)

    # One row per game, pre-filled with the padding value -1.
    np_games = np.full((len(games), max_game_len), -1, dtype=np.int8)
    for k, game in enumerate(games):
        # Copy the moves into the start of the row; the tail keeps its -1 padding.
        # A game longer than max_game_len makes this assignment raise ValueError.
        np_games[k, :len(game)] = game

    output_dir = train_dir if i < train_threshold else val_dir

    # Raw int8 dump with no header: readers need to know the row width
    # (max_game_len) to reshape the flat buffer.
    np_games.tofile(os.path.join(output_dir, f"games_{i}.bin"))

    # Report progress once every 10 files.
    if i % 10 == 0:
        print(f"Processing file {i}/{len(pickles_files)}")