-
Notifications
You must be signed in to change notification settings - Fork 1
/
sampleListGen.py
31 lines (24 loc) · 996 Bytes
/
sampleListGen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"Basic queries of GEO studies --> sample lists"
import csv, re, requests
import pandas as pd
# Base GEO accession-query address; main() appends one study accession to it
# to form the page URL it downloads.
geoURL = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
# Folder handed to main(): the study list is read from it and the sample CSV
# is written back into it.
refDirectory = 'refFiles'
# Series accession as written in the study list, e.g. "Accession: GSE12345".
# Matching on the raw line (instead of deleting every space first) keeps a
# number that follows the accession from being glued onto it.
_STUDY_ACCESSION = re.compile(r'Accession: *(GSE\d+)')
# Sample accession at the end of a quoted link target, e.g. acc=GSM12345".
_SAMPLE_ACCESSION = re.compile(r'acc=(GSM\d+)"')


def _read_study_accessions(path):
    """Return every GSE accession found in the text file at *path*, in file order."""
    studies = []
    with open(path) as fin:
        for line in fin:
            studies.extend(_STUDY_ACCESSION.findall(line))
    return studies


def _download(session, page_url, timeout):
    """Return the body of *page_url*, or '' (after a warning) on an HTTP error status."""
    response = session.get(page_url, timeout=timeout)
    if not response.ok:
        # An error page contains no sample links; say so instead of silently
        # recording "zero samples" for this study.
        print('warning: HTTP {0} for {1}; study skipped'.format(
            response.status_code, page_url))
        return ''
    return response.text


def _collect_samples(url, studies, fetch):
    """Fetch each study page and return its GSM accessions, de-duplicated per study."""
    samples = []
    for done, geo in enumerate(studies, start=1):
        page_text = fetch(url + geo)
        # dict.fromkeys drops repeats but, unlike set(), keeps first-seen
        # order, so the output is identical from one run to the next.
        samples.extend(dict.fromkeys(_SAMPLE_ACCESSION.findall(page_text)))
        if done % 1000 == 0:
            print('{0} studies done'.format(done))
    return samples


def main(url, directory, *, timeout=30, fetch=None):
    """Expand the GEO studies listed in *directory* into a CSV of their samples.

    Reads ``<directory>/GEO_MusmusculusStudies.txt``, collects every ``GSE``
    accession in it, fetches ``url + accession`` for each one, extracts the
    ``GSM`` accessions linked from that page and writes them all to
    ``<directory>/GEO_MusmusculusSamples.csv``.

    Samples are de-duplicated within a study only; a sample linked from
    several studies is written once per study.

    Args:
        url: Prefix to which each study accession is appended to form the
            address of the page to fetch.
        directory: Folder holding the input study list; the output CSV is
            written there as well.
        timeout: Seconds to wait on each HTTP request before giving up
            (``None`` waits forever). Ignored when *fetch* is supplied.
        fetch: Optional callable ``fetch(page_url) -> str`` used in place of
            the built-in HTTP download, e.g. to read cached pages.

    Raises:
        OSError: If the study list cannot be read or the CSV cannot be written.
        requests.RequestException: If a download fails at the network level
            (connection error or timeout) when using the built-in download.
    """
    studies = _read_study_accessions(
        '{0}/GEO_MusmusculusStudies.txt'.format(directory))

    if fetch is not None:
        samples = _collect_samples(url, studies, fetch)
    else:
        # A single session is shared by every request so connections to the
        # server can be reused instead of reopened per study.
        with requests.Session() as session:
            samples = _collect_samples(
                url, studies,
                lambda page_url: _download(session, page_url, timeout))

    pd.DataFrame(samples).to_csv(
        '{0}/GEO_MusmusculusSamples.csv'.format(directory))
if __name__ == "__main__":
    # Executed as a script: run the pipeline with the module-level defaults.
    main(url=geoURL, directory=refDirectory)