datasets.py
"""File to import StarPlus data
"""
import os
import urllib2
import tarfile
import numpy as np
from scipy import io
from sklearn.datasets.base import Bunch
import nibabel as ni
def fetch_star_plus_data():
"""Function returning the starplus data, downloading them if needed
Returns
-------
data : Bunch
Dictionary-like object, the interest attributes are :
'datas' : a list of 6 numpy arrays representing the data to learn
'targets' : list
targets of the datas
'masks' : the masks for the datas
Note
----
Each element will be of the form :
PATH/*.npy
The star plus datasets is composed of n_trials trials.
Each trial is composed of 13 time units.
We decided here to average on the time
/!\ y is not binarized !
Reference
---------
Documentation :
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-81/www/\
README-data-documentation.txt
Data :
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-81/www/
"""
    # Create the data directory if it does not exist
data_dir = os.path.join(os.getcwd(), 'nisl_data')
if not os.path.exists(data_dir):
os.makedirs(data_dir)
file_names = ['data-starplus-0%d-v7.mat' % i for i in [4847,
4799, 5710, 4820, 5675, 5680]]
url1 = 'http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-81/www/'
url2 = 'http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-83/www/'
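    # The first three subjects' files are served from the theo-81 project
    # page, the remaining ones from theo-83 (hence the index-based URL choice
    # in the download loop below).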
full_names = [os.path.join(data_dir, name) for name in file_names]
success_indices = []
for indice, full_name in enumerate(full_names):
print "Fetching file : %s" % full_name
if (os.path.exists(os.path.join(data_dir,
"data-starplus-%d-X.npy" % indice))
and os.path.exists(os.path.join(data_dir,
"data-starplus-%d-y.npy" % indice))):
success_indices.append(indice)
else:
# Retrieving the .mat data and saving it if needed
if not os.path.exists(full_name):
if indice >= 3:
url = url2
else:
url = url1
                # Plain string concatenation: os.path.join is meant for
                # filesystem paths, not URLs
                data_url = url + file_names[indice]
try:
print 'Downloading data from %s ...' % data_url
req = urllib2.Request(data_url)
data = urllib2.urlopen(req)
local_file = open(full_name, "wb")
local_file.write(data.read())
local_file.close()
except urllib2.HTTPError, e:
print "HTTP Error: %s, %s" % (e, data_url)
except urllib2.URLError, e:
print "URL Error: %s, %s" % (e, data_url)
print '...done.'
# Converting data to a more readable format
print "Converting file %d on 6..." % (indice + 1)
# General information
try:
data = io.loadmat(full_name)
n_voxels = data['meta']['nvoxels'].flat[0].squeeze()
n_trials = data['meta']['ntrials'].flat[0].squeeze()
dim_x = data['meta']['dimx'].flat[0].squeeze()
dim_y = data['meta']['dimy'].flat[0].squeeze()
dim_z = data['meta']['dimz'].flat[0].squeeze()
# Loading X
X_temp = data['data'][:, 0]
# Loading y
y = data['info']
y = y[0, :]
# y = np.array([y[i].flat[0]['actionRT'].flat[0]
y = np.array([y[i].flat[0]['cond'].flat[0]
for i in range(n_trials)])
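                # Keep only the actual sentence/picture trials: conditions 0
                # and 1 mark unusable and rest trials in the StarPlus coding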
good_trials = np.where(y > 1)[0]
n_good_trials = len(good_trials)
n_times = 16 # 8 seconds
# sentences
XS = np.zeros((n_good_trials, dim_x, dim_y, dim_z))
# pictures
XP = np.zeros((n_good_trials, dim_x, dim_y, dim_z))
first_stim = data['info']['firstStimulus']
                # Average each stimulus period over time: the first n_times
                # volumes belong to the first stimulus, the next n_times to
                # the second
for k, i_trial in enumerate(good_trials):
i_first_stim = str(first_stim.flat[i_trial][0])
XSk = XS[k]
XPk = XP[k]
for j in range(n_voxels):
                        # Convert the voxel's column index to 0-based
                        # (x, y, z) coordinates (colToCoord is 1-based,
                        # hence the -1)
x, y, z = data['meta']['colToCoord'].flat[0][j, :] - 1
Xkxyz = X_temp[i_trial][:, j]
# Xkxyz -= Xkxyz.mean() # remove drifts
if i_first_stim == 'S': # sentence
XSk[x, y, z] = Xkxyz[:n_times].mean()
XPk[x, y, z] = Xkxyz[n_times:2 * n_times].mean()
elif i_first_stim == 'P': # picture
XPk[x, y, z] = Xkxyz[:n_times].mean()
XSk[x, y, z] = Xkxyz[n_times:2 * n_times].mean()
                        else:
                            raise ValueError('Unknown first_stim: %s'
                                             % i_first_stim)
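                # Stack picture trials first, then sentence trials; targets
                # are set to 0 for pictures and 1 for sentences below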
X = np.r_[XP, XS]
y = np.ones(2 * n_good_trials)
y[:n_good_trials] = 0
X = X.astype(np.float)
y = y.astype(np.float)
name = "data-starplus-%d-X.npy" % indice
name = os.path.join(data_dir, name)
np.save(name, X)
name = "data-starplus-%d-y.npy" % indice
name = os.path.join(data_dir, name)
np.save(name, y)
name = "data-starplus-%d-mask.npy" % indice
name = os.path.join(data_dir, name)
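                # The saved mask simply flags voxels that are non-zero in the
                # first (picture) trial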
mask = X[0, ...]
mask = mask.astype(np.bool)
np.save(name, mask)
print "...done."
# Removing the unused data
os.remove(full_name)
            except Exception, e:
                print "Impossible to convert the file %s:\n %s" % (full_name, e)
success_indices.append(indice)
print "...done."
all_subject = []
for i in success_indices:
X = np.load(os.path.join(data_dir, 'data-starplus-%d-X.npy' % i))
y = np.load(os.path.join(data_dir, 'data-starplus-%d-y.npy' % i))
mask = np.load(os.path.join(data_dir, 'data-starplus-%d-mask.npy' % i))
all_subject.append(Bunch(data=X, target=y, mask=mask))
return all_subject
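

# Illustrative sketch, not part of the original module: turn one subject's
# 4-D StarPlus data into the 2-D (n_samples, n_features) layout usually
# expected by scikit-learn estimators, using the subject's boolean mask.
def _starplus_to_2d(subject):
    """Apply the subject's mask and return a (n_trials, n_voxels) array."""
    return subject.data[:, subject.mask]
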
def fetch_haxby_data():
"""Returns the haxby datas
Returns
-------
data : Bunch
Dictionary-like object, the interest attributes are :
'data' : numpy array : the data to learn
'target' : numpy array
target of the data
'mask' : the masks for the data
'session' : the labels for LeaveOneLabelOut cross validation
"""
data_dir = os.path.join(os.getcwd(), 'nisl_data')
if not os.path.exists(data_dir):
os.makedirs(data_dir)
url = 'http://www.pymvpa.org/files/pymvpa_exampledata.tar.bz2'
file_names = ['attributes.txt', 'bold.nii.gz', 'mask.nii.gz']
file_names = [os.path.join('pymvpa-exampledata', i) for i in file_names]
download = False
for name in file_names:
        # If any of these files is missing, download the whole archive
if not os.path.exists(os.path.join(data_dir, name)):
download = True
if download:
try:
print 'Downloading data from %s ...' % url
data = urllib2.urlopen(url)
temp_name = os.path.join(data_dir, 'temp.tar.bz2')
if not os.path.exists(temp_name):
local_file = open(temp_name, "wb")
local_file.write(data.read())
local_file.close()
except urllib2.HTTPError, e:
print "HTTP Error:", e, url
except urllib2.URLError, e:
print "URL Error:", e, url
print '...done.'
print 'extracting data from %s...' % temp_name
tar = tarfile.open(temp_name, "r:bz2")
for name in file_names:
print ' extracting %s...' % name
tar.extract(name, path=data_dir)
print ' ...done.'
os.remove(temp_name)
file_names = [os.path.join(data_dir, i) for i in file_names]
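    # attributes.txt has one row per acquired volume: the first column is the
    # stimulus label and the second the session (run) number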
y, session = np.loadtxt(file_names[0]).astype("int").T
X = ni.load(file_names[1]).get_data()
mask = ni.load(file_names[2]).get_data()
return Bunch(data=X, target=y, mask=mask, session=session)
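

# Minimal usage sketch, illustrative only (not part of the original module):
# fetch the Haxby example data and print basic shape information. Assumes a
# working network connection and enough disk space for the archive.
if __name__ == '__main__':
    haxby = fetch_haxby_data()
    print "BOLD data shape:", haxby.data.shape
    print "Number of volumes:", haxby.target.shape[0]
    print "Voxels in mask:", int(haxby.mask.sum())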