from Bio import Entrez
from Bio import Medline
import numpy as np
from sys import argv
import csv
import os
Entrez.email = "jdushoff@gmail.com"
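# Usage: python search_makecsv.py <searchterm-file>
# where <searchterm-file> is a plain-text file containing the PubMed search term.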
script, filename = argv
with open(filename) as txt:  # the file holds the search term
    SearchTerm = txt.read().strip()
# Get Pubmed IDs matching SearchTerm
retmax = 1000  # maximum number of IDs esearch will return
handle = Entrez.esearch(db="pubmed", term=SearchTerm, retmax=retmax)
record = Entrez.read(handle)
idlist = record["IdList"]
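# Warn if the search matched more articles than retmax allows. A minimal
# sketch, using the "Count" field that an esearch result carries (the total
# number of matching records):
if int(record["Count"]) > retmax:
    print("Warning: {0} records matched the search, but only the first {1} were retrieved.".format(record["Count"], retmax))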
print(idlist)
# This returns a Python list containing the PubMed IDs of all articles matching
# the search term; it will display as ['18680603', '18665331', '18661158', '18627489', ...]
# =========== =======================
# Now that we've got the PubMed IDs, the following commands fetch the
# corresponding Medline records and extract information from them. Here we
# download the records in the Medline flat-file format and parse them with
# the Bio.Medline module:
handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
records = Medline.parse(handle)
records = list(records)
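# Each parsed record behaves like a dictionary mapping Medline field tags to
# values (e.g. rec['TI'] is the title; multi-valued tags such as 'FAU' map to lists).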
#print records
np.savetxt('SystematicSearch.txt', records, fmt="%s")  # dump the raw records (one per line) as a plain-text backup
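# numpy is only used for this dump; a plain-Python equivalent (a sketch) would be:
# with open('SystematicSearch.txt', 'w') as out:
#     for rec in records:
#         out.write("%s\n" % rec)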
# Restricting each record to a fixed set of fields
HeaderOutPut = ['PMID', 'FAU', 'JT', 'TI', 'AB']  # selected keys from the search results
# We keep only these keys for two reasons: 1) to retain only the items of
# interest, and 2) articles differ in which fields they carry -- one article may
# have 20 fields while another has more or fewer -- so fixing the key set keeps
# the number of items per record constant. These five fields appear in nearly
# every article.
# PMID: is the unique identifier number used in PubMed
# FAU: Full names of the Author(s)
# JT: Journal Title (the full journal title)
# TI: Title of the article
# AB: Abstract of the article
print(len(records))  # number of records fetched
FinalList = [None] * len(records)  # pre-allocate the output list
for n in range(len(records)):
    FinalList[n] = {}  # one dictionary per article
    for key in HeaderOutPut:
        if key in records[n]:
            FinalList[n][key] = records[n][key]
        else:
            FinalList[n][key] = 'None'  # field missing from this record
# Some articles lack certain fields (e.g. 'FAU' is missing from a few records),
# so indexing records[n][key] directly would raise a KeyError; the if/else
# above substitutes the string 'None' for missing fields.
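# An equivalent, more compact construction (a sketch using dict.get, which
# falls back to 'None' when a key is absent):
# FinalList = [{key: rec.get(key, 'None') for key in HeaderOutPut} for rec in records]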
print(FinalList)
# WriteDictToCSV writes the records (now dictionaries) out as a CSV file
def WriteDictToCSV(csv_file, csv_columns, dict_data):
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in dict_data:
                writer.writerow(data)
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
    return
# Using the WriteDictToCSV function defined above
dict_data = FinalList
csv_columns = HeaderOutPut  # the header row of the table
currentPath = os.getcwd()
csv_file = os.path.join(currentPath, "SystematicSearch.csv")
WriteDictToCSV(csv_file, csv_columns, dict_data)  # the output is saved as SystematicSearch.csv in the current working directory