forked from glouppe/kaggle-marinexplore
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
64 lines (47 loc) · 1.46 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import aifc
import numpy as np
import os
import os.path
import struct
def load_aiff(filename):
    """Read an AIFF file and return its samples as a 1-D ``int16`` array.

    Parameters
    ----------
    filename : str
        Path of the AIFF file to read.

    Returns
    -------
    samples : numpy.ndarray
        Native-endian ``int16`` samples in file order. Frames are returned
        exactly as ``readframes`` yields them, so multi-channel data is
        not split per channel.

    Raises
    ------
    ValueError
        If the file does not store 16-bit samples.
    """
    # Load data
    fd = aifc.open(filename, "r")
    try:
        sample_width = fd.getsampwidth()
        data = fd.readframes(fd.getnframes())
    finally:
        # Release the file handle even when reading the frames fails.
        fd.close()

    if sample_width != 2:
        # The decoding below is only valid for 16-bit samples; fail loudly
        # instead of silently reinterpreting other widths as garbage.
        raise ValueError(
            "%s: expected 16-bit samples, got %d-byte samples"
            % (filename, sample_width)
        )

    # Convert bytes into Numpy array. AIFF sample data is big-endian, so
    # decode it as such explicitly (correct on any host byte order) and
    # copy into a writable native-endian array.
    return np.frombuffer(data, dtype=">i2").astype(np.int16)
def load_training_data(file_labels, dir_aiff):
    """Load the labelled training clips listed in a CSV file.

    The first line of ``file_labels`` is treated as a header and skipped.
    Every following non-blank line must have the form ``filename,label``,
    where ``filename`` is relative to ``dir_aiff`` and ``label`` is an
    integer.

    Parameters
    ----------
    file_labels : str
        Path of the CSV file listing clip file names and labels.
    dir_aiff : str
        Directory containing the AIFF clips named in ``file_labels``.

    Returns
    -------
    X : numpy.ndarray
        ``float32`` array with one row of samples per clip.
        NOTE(review): stacking assumes all clips have the same number of
        samples -- confirm against the data set.
    y : numpy.ndarray
        Integer array of labels, in the same order as ``X``.
    """
    X = []
    y = []

    # The context manager closes the labels file even if a clip fails to load.
    with open(file_labels, "r") as fd_labels:
        lines = iter(fd_labels)

        # skip header (tolerating a completely empty file)
        next(lines, None)

        for line in lines:
            line = line.strip()
            if not line:
                # Ignore blank lines, e.g. a trailing newline at end of file.
                continue

            filename, label = line.split(",")
            X.append(load_aiff(os.path.join(dir_aiff, filename)))
            y.append(int(label))

    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy, so use the builtin directly.
    return np.array(X, dtype=np.float32), np.array(y, dtype=int)
def load_test_data(dir_aiff, n=54503):
    """Load the test clips ``test1.aiff`` .. ``test<n>.aiff`` from a directory.

    Parameters
    ----------
    dir_aiff : str
        Directory containing the numbered test clips.
    n : int, optional
        Number of clips to load; files are numbered from 1 to ``n``
        inclusive.

    Returns
    -------
    X : numpy.ndarray
        ``float32`` array with one row of samples per clip, ordered by
        clip number.
        NOTE(review): stacking assumes all clips have the same number of
        samples -- confirm against the data set.
    """
    # ``range`` instead of ``xrange`` so the loader also runs on Python 3.
    X = [
        load_aiff(os.path.join(dir_aiff, "test%d.aiff" % i))
        for i in range(1, n + 1)
    ]

    return np.array(X, dtype=np.float32)
if __name__ == "__main__":
    # Convert the raw AIFF clips once and cache them as compressed-free
    # ``.npz`` archives so later runs can skip the slow per-file decoding.
    X_train, y_train = load_training_data("data/train.csv", "data/train")
    X_test = load_test_data("data/test")

    # Save for later as numpy arrays. The context managers guarantee the
    # output files are closed even if ``np.savez`` fails part-way through.
    with open("data/train.npz", "wb") as fd:
        np.savez(fd, X_train=X_train, y_train=y_train)

    with open("data/test.npz", "wb") as fd:
        np.savez(fd, X_test=X_test)