#!/usr/bin/python
name = "uniprot_scraper.py"
version = "0.5.2"
updated = "2022-05-24"
usage = f"""\n
NAME        {name}
VERSION     {version}
UPDATED     {updated}
SYNOPSIS    Navigates UniProt and acquires the metadata of desired proteins, and
            optionally downloads the corresponding 3D structures and FASTA files
COMMAND     {name} \\
              -v \\
              -s \\
              -ds \\
              -df \\
              -m X-ray \\
              -c 'toll database:(type:pdb) AND reviewed:yes AND organism:"Drosophila melanogaster (Fruit fly) [7227]"'
OPTIONS
    -g  (--go_annotation)          Gene ontology keyword to search
    -v  (--verified_only)          Restrict results to entries reviewed by UniProt
    -s  (--structures)             Find accessions with 3D structures
    -ds (--download_structures)    Download 3D structures
    -df (--download_fasta)         Download FASTA files
    -m  (--method)                 Method used to obtain structure [Default = All] (i.e., X-ray, NMR)
    -o  (--outdir)                 Output directory for downloaded files [Default = ./UNIPROT_SCRAP_RESULTS]
    -c  (--custom)                 Custom UniProt search (overrides all other search options)
"""
from sys import exit,argv
if (len(argv) == 1):
    print(f"{usage}")
    exit()
import re
import os
import time
import argparse
from os import system, mkdir, path
pipeline_location = os.path.dirname(argv[0])
## Setup GetOptions
parser = argparse.ArgumentParser(usage=usage)
parser.add_argument("-g","--go_annotation")
parser.add_argument("-v","--verified_only",action='store_true')
parser.add_argument("-s","--structures",action='store_true')
parser.add_argument("-ds","--download_structures",action='store_true')
parser.add_argument("-df","--download_fasta",action='store_true')
parser.add_argument("-f","--fasta",action='store_true') ## Parsed but currently unused
parser.add_argument("-m","--method")
parser.add_argument("-o","--outdir",default="./UNIPROT_SCRAP_RESULTS")
parser.add_argument("-c","--custom")
args = parser.parse_args()
## Acquire argparse options
go_annotation = args.go_annotation
reviewed = args.verified_only
structures = args.structures
download_structures = args.download_structures
download_fasta = args.download_fasta
method = args.method
outdir = args.outdir
custom = args.custom
fastadir = outdir + "/FASTA"
pdbdir = outdir + "/PDBs"
## Prepare keywords for searching
terms = []
if (go_annotation):
    terms.append(f"goa:({go_annotation})")
if (reviewed):
    terms.append("reviewed:yes")
if (method):
    terms.append(f"method:({method})")
if (structures):
    terms.append("database:(type:pdb)")
## Join with AND so the query never begins with a dangling " AND "
keywords = " AND ".join(terms)
## If a custom search phrase is provided, overwrite keywords
if (custom):
    keywords = custom
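## e.g. -g "defense response" -v -s -m X-ray (hypothetical keyword) assembles:
##   goa:(defense response) AND reviewed:yes AND method:(X-ray) AND database:(type:pdb)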
## Create the output directories before opening any files inside them
if not (os.path.exists(outdir)):
    os.mkdir(outdir)
if not (os.path.exists(fastadir)):
    os.mkdir(fastadir)
if not (os.path.exists(pdbdir)):
    os.mkdir(pdbdir)
## Open the search log (the output directory is guaranteed to exist now)
LOG = open(f"{outdir}/search.log", "w")
LOG.write(f"{argv[0]}\n")
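## Output layout: OUTDIR/search.log plus OUTDIR/FASTA/ (sequences) and OUTDIR/PDBs/ (structures)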
## Loading all the necessary packages for web scraping
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
start = time.strftime("%Y-%m-%d, %H:%M:%S",time.localtime())
## Creating web scraper
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options,service_log_path=os.path.devnull)
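## NOTE: this script targets the selenium 3.x API; the find_element_by_* helpers and
## Options.headless used throughout were deprecated and later removed in selenium 4.x.
## geckodriver must be on the PATH for Firefox to start.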
url = "https://www.uniprot.org/"
LOG.write(f"Source from {url}\n")
LOG.write(f"Keywords\n\t{keywords}\n")
print(f"\nConnecting to {url}")
## Connect to UniProt
driver.get(url)
####################################################
##### All waiting times are set to 10 mins max #####
####################################################
## Wait for page load
WebDriverWait(driver,600).until(EC.presence_of_element_located((By.ID, "query")))
## Find search bar
search_bar = driver.find_element_by_id("query")
## Enter query into search bar
search_bar.send_keys(keywords)
## Find the search button
search_button = driver.find_element_by_id("search-button")
## Click the search button
search_button.click()
print(f"Searching for keywords '{keywords}'")
## Wait for dropdown (tells us the results are in)
WebDriverWait(driver,600).until(EC.presence_of_element_located((By.CLASS_NAME,"limitdropDown")))
## Find the number of results drop down box
item_limit = driver.find_element_by_class_name("limitdropDown")
## Get the page-size options (a distinct name avoids shadowing the Firefox Options object above)
page_sizes = item_limit.find_elements_by_css_selector("option")
## Select the max result for SPEED
page_sizes[4].click()
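## The fifth entry is assumed to be the largest page size offered by the legacy
## uniprot.org results page; adjust the index if the dropdown changes.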
print("Acquiring matches")
LOG.write("Accession Found\n")
accession_numbers = []
## Visit next page of results if there is one
loop = True
while loop:
    ## Wait for results to show up
    WebDriverWait(driver,600).until(EC.presence_of_element_located((By.ID, "results")))
    ## HTML hunting for accession numbers and corresponding links
    content_table = driver.find_element_by_id("results")
    table = content_table.find_element_by_css_selector("tbody")
    results = table.find_elements_by_css_selector("tr")
    ## For each accession number, get its corresponding link
    for i in results:
        accession = i.get_attribute("id")
        print(f"Found Query\t{accession}")
        LOG.write(f"\t{accession}\n")
        ## ".entryID > a" is a CSS selector, so it cannot be passed to find_element_by_class_name
        href = i.find_element_by_css_selector(".entryID > a").get_attribute("href")
        accession_numbers.append([accession,href])
    ## Try to move to the next page of results; otherwise start accession number surfing
    try:
        nextPage = driver.find_element_by_class_name("nextPageLink").get_attribute("href")
        driver.get(nextPage)
        print("Moving to next page of results")
    except:
        loop = False
print()
metadata = {}
Downloads = {}
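## Per-accession layouts used below:
##   metadata[accession]  = [organism_name, protein_name, [], [recorded PDB files]]
##   Downloads[accession] = [organism_name, [FASTA links], [structure links]]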
for pages in accession_numbers:
    ## Store ["METADATA",[FASTA_LINKS],[STRUCTURE_LINKS]]
    info = ["",[],[]]
    ## Get accession number and go to its page
    accession = pages[0]
    driver.get(pages[1])
    ## Wait for content to load
    WebDriverWait(driver,600).until(EC.presence_of_element_located((By.ID,"content-organism")))
    ## HTML hunting for the name of the organism the query belongs to
    organism_title = driver.find_element_by_id("content-organism")
    organism_name = organism_title.find_element_by_tag_name("em").text
    info[0] = organism_name
    ## Wait for content to load
    WebDriverWait(driver,600).until(EC.presence_of_element_located((By.ID,"content-protein")))
    ## HTML hunting for the protein name
    protein_title = driver.find_element_by_id("content-protein")
    protein_name = protein_title.find_element_by_tag_name("h1").text
    print(f"Visiting page for {protein_name}")
    metadata[accession] = [organism_name,protein_name,[],[]]
    ## If downloading FASTA files, go HTML hunting
    if (download_fasta):
        print(f"\tSearching for FASTA links for {accession} found in {organism_name}")
        try:
            ## Wait to load content
            WebDriverWait(driver,600).until(EC.presence_of_element_located((By.ID,"sequences-section")))
            sequences_section = driver.find_element_by_id("sequences-section")
            download_button = sequences_section.find_elements_by_tag_name("a")
            ## Search all the HTML for the FASTA link
            for a in download_button:
                ## Get the first fasta link, and break out
                if (a.get_attribute("class") == "tooltipped icon icon-functional button inlineDisplayThis"):
                    fasta_link = a.get_attribute("href")
                    info[1].append(fasta_link)
                    print(f"\t\tFASTA link\n\t\t\t{fasta_link}")
                    break
        except:
            LOG.write(f"Unable to acquire fasta download link for {accession}\n")
            continue
    ## If downloading PDB files, go HTML hunting
    if (download_structures):
        print(f"\tSearching for 3D structure links for {accession} ({protein_name})")
        try:
            ## Wait for content to load
            WebDriverWait(driver,600).until(EC.presence_of_element_located((By.TAG_NAME, "protvista-datatable")))
            ## HTML hunting for PDB file links
            datatable = driver.find_element_by_tag_name("protvista-datatable")
            table = datatable.find_element_by_tag_name("table")
            tbody = datatable.find_element_by_tag_name("tbody")
            rows = tbody.find_elements_by_tag_name("tr")
            print("\t\tPDB Links")
            ## Iterate over all the HTML garbage for the download links
            for row in rows:
                ## Get the PDB link (9th column of the structures table)
                item = row.find_elements_by_tag_name("td")[8]
                download_link = item.find_element_by_tag_name("a").get_attribute("href")
                info[2].append(download_link)
                print(f"\t\t\t{download_link}\n")
        except:
            LOG.write(f"Unable to acquire structure download links for {accession}\n")
            continue
    if not download_fasta and not download_structures:
        print("Acquiring metadata only!")
    Downloads[accession] = info
## Close the scraper, we are done surfing
driver.quit() ## quit() (rather than close()) also terminates the geckodriver process
downloaded = [] ## currently unused
sources = ""    ## currently unused
SOURCES = open(f"{outdir}/download.log", "w")
METADATA = open(f"{outdir}/metadata.log","w")
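## download.log records the source URLs fetched per accession; metadata.log maps each
## accession to its organism, protein name, and the FASTA/PDB files recorded for it.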
## Iterate over all UniProt accession found
### For each accession there can be a FASTA file
print(f"Downloading data for {len(Downloads)} accessions!")
for accession in Downloads:
    ## Split the data into workable chunks
    org_name,fasta_links,structure_links = Downloads[accession]
    SOURCES.write(f"{accession}\t{org_name}\n")
    SOURCES.write("FASTA:\n")
    METADATA.write(f"{accession}\t{metadata[accession][0]}\t{metadata[accession][1]}\n")
    METADATA.write(f"\tFASTA\n")
    print(f"Getting data for {accession}")
    ## Only set once a FASTA file has been found; the BLAST step below depends on it
    fasta_name = ""
    ## Iterate over all FASTA links (redundant for now, since only one FASTA link is collected above)
    for link in fasta_links:
        print(f"Acquiring FASTA file {link}")
        fasta_name = re.search(r"/(\w+\.fasta)$",link).groups(0)[0]
        SOURCES.write(f"\t{link}\n")
        METADATA.write(f"\t\t{fasta_name}\n")
        ## Download the FASTA file if not already present
        if not path.exists(f"{fastadir}/{fasta_name}"):
            system(f"wget {link} -O {fastadir}/{fasta_name} 1>/dev/null 2>>{outdir}/download.errors")
    SOURCES.write("STRUCTURE:\n")
    METADATA.write(f"\tSTRUCTURE\n")
    for link in structure_links:
        ## PDB prefix
        pdb_prefix = ""
        ## Determine PDB source
        db = ""
        ### AlphaFold
        if (re.search(r"AF-(\w+)-\w\d-model",link)):
            pdb_prefix = re.search(r"AF-(\w+)-\w\d-model",link).groups(0)[0]
            db = "AF"
        ### RCSB
        if (re.search(r"pdb(\w+)\.ent",link)):
            pdb_prefix = re.search(r"pdb(\w+)\.ent",link).groups(0)[0]
            db = "RCSB"
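        ## Illustrative link shapes matched above (IDs hypothetical):
        ##   AlphaFold: .../AF-P12345-F1-model_v2.pdb  -> pdb_prefix "P12345", db "AF"
        ##   RCSB:      .../pdb1abc.ent                -> pdb_prefix "1abc",   db "RCSB"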
        SOURCES.write(f"\t{link}\n")
        ## Download the PDB file if it hasn't been downloaded/split yet
        if not path.exists(f"{pdbdir}/{pdb_prefix}.pdb") and not path.isdir(f"{pdbdir}/{pdb_prefix}"):
            system(f"wget {link} -O {pdbdir}/{pdb_prefix}.pdb 1>/dev/null 2>>{outdir}/download.errors")
        ## Split RCSB PDB files by chain and perform a BLAST search to identify wanted chains;
        ## this step needs the query FASTA file downloaded above
        if (db == "RCSB" and fasta_name):
            ## Split the PDB file into different chains if not yet done
            if not path.exists(f"{pdbdir}/{pdb_prefix}") and path.exists(f"{pdbdir}/{pdb_prefix}.pdb"):
                system(f"{pipeline_location}/split_PDB.pl -p {pdbdir}/{pdb_prefix}.pdb -o {pdbdir}/{pdb_prefix}")
            ## Extract the FASTA sequences from the split PDB files
            if not path.exists(f"{pdbdir}/{pdb_prefix}/FASTA/ALL.fasta") and any(File.endswith(".pdb") for File in os.listdir(f"{pdbdir}/{pdb_prefix}")):
                ## Extract one sequence per chain
                system(f"{pipeline_location}/extract_pdb_sequence.pl -p {pdbdir}/{pdb_prefix}/*.pdb -o {pdbdir}/{pdb_prefix}/FASTA")
                ## Concatenate the FASTA files
                system(f"cat {pdbdir}/{pdb_prefix}/FASTA/*.fasta > {pdbdir}/{pdb_prefix}/FASTA/ALL.fasta")
            if path.exists(f"{pdbdir}/{pdb_prefix}/FASTA/ALL.fasta"):
                ## If the sequences have been extracted, perform a BLAST search
                if path.getsize(f"{pdbdir}/{pdb_prefix}/FASTA/ALL.fasta") > 0:
                    ## Make BLAST db
                    system(f"diamond makedb --in {pdbdir}/{pdb_prefix}/FASTA/ALL.fasta --db {pdbdir}/{pdb_prefix}/FASTA/3Database 1>/dev/null 2>&1")
                    ## Perform BLAST search
                    system(f"diamond blastp -q {fastadir}/{fasta_name} --db {pdbdir}/{pdb_prefix}/FASTA/3Database --out {pdbdir}/{pdb_prefix}/FASTA/results.diamond.6 1>/dev/null 2>&1")
                    ## Parse the BLAST file for the best hit (DIAMOND tabular output: column 2 holds the subject/chain ID)
                    with open(f"{pdbdir}/{pdb_prefix}/FASTA/results.diamond.6","r") as BLAST:
                        for line in BLAST:
                            file = line.split("\t")[1]
                            ## Move the best PDB chain from its parent into home
                            system(f"cp {pdbdir}/{pdb_prefix}/{file}.pdb {pdbdir}/{file}.pdb")
                            ## Add metadata to log
                            if f"{file}.pdb" not in metadata[accession][3]:
                                metadata[accession][3].append(f"{file}.pdb")
                                METADATA.write(f"\t\t{file}.pdb\n")
                            ## Keep only the best (first) hit
                            break
        ## Non-RCSB (e.g. AlphaFold) structures are kept as downloaded; record them directly
        elif pdb_prefix and f"{pdb_prefix}.pdb" not in metadata[accession][3]:
            metadata[accession][3].append(f"{pdb_prefix}.pdb")
            METADATA.write(f"\t\t{pdb_prefix}.pdb\n")
## Remove unneeded PDBs to prevent unwanted hits being returned
for item in os.listdir(f"{pdbdir}/"):
    if (os.path.isdir(f"{pdbdir}/{item}")):
        system(f"rm -r {pdbdir}/{item}")
stop = time.strftime("%Y-%m-%d, %H:%M:%S",time.localtime())
LOG.write(f"Start\t{start}\nStop\t{stop}\n")
## Close the log files so buffered writes are flushed before archiving
LOG.close()
SOURCES.close()
METADATA.close()
## Create an archive for later use
system(f"tar -czf {outdir}.tar.gz {outdir}")