forked from EBI-Metagenomics/genome_uploader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenome_upload.py
1400 lines (1174 loc) · 64.2 KB
/
genome_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
import os
import sys
import logging
import argparse
import re
import json
import pandas as pd
from datetime import date, datetime as dt
from time import sleep
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
import requests
## NOTE: Script is a work in progress and NOT intended for public use yet.
# Annotates output manifests generated by genomes_uploader.py to allow upload of
# MAGs generated from co-assemblies of multiple runs.
# Make the repository root importable so sibling modules resolve.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
# ENA's controlled vocabulary of approved metagenome taxonomy names.
# The "metagenome" column of the input metadata file must use one of these values
# (validated in read_and_cleanse_metadata_tsv).
metagenomes = ["activated carbon metagenome", "activated sludge metagenome",
    "aerosol metagenome", "air metagenome", "algae metagenome", "alkali sediment metagenome",
    "amphibian metagenome", "anaerobic digester metagenome", "anchialine metagenome",
    "annelid metagenome", "ant fungus garden metagenome", "ant metagenome",
    "aquaculture metagenome", "aquatic eukaryotic metagenome", "aquatic metagenome",
    "aquatic viral metagenome", "aquifer metagenome", "ballast water metagenome",
    "bat gut metagenome", "bat metagenome", "beach sand metagenome", "beetle metagenome",
    "bentonite metagenome", "bioanode metagenome", "biocathode metagenome",
    "biofilm metagenome", "biofilter metagenome", "biofloc metagenome",
    "biogas fermenter metagenome", "bioleaching metagenome", "bioreactor metagenome",
    "bioreactor sludge metagenome", "bioretention column metagenome", "biosolids metagenome",
    "bird metagenome", "blood metagenome", "bog metagenome", "book metagenome",
    "bovine gut metagenome", "bovine metagenome", "brine metagenome", "canine metagenome",
    "cave metagenome", "cetacean metagenome", "chemical production metagenome",
    "chicken gut metagenome", "ciliate metagenome", "clay metagenome", "clinical metagenome",
    "cloud metagenome", "coal metagenome", "cold seep metagenome", "cold spring metagenome",
    "compost metagenome", "concrete metagenome", "coral metagenome", "coral reef metagenome",
    "cow dung metagenome", "crab metagenome", "crude oil metagenome",
    "Crustacea gut metagenome", "crustacean metagenome", "ctenophore metagenome",
    "decomposition metagenome", "desalination cell metagenome", "dietary supplements metagenome",
    "dinoflagellate metagenome", "drinking water metagenome", "dust metagenome",
    "ear metagenome", "echinoderm metagenome", "egg metagenome", "electrolysis cell metagenome",
    "endophyte metagenome", "epibiont metagenome", "estuary metagenome", "eukaryotic metagenome",
    "eukaryotic plankton metagenome", "eye metagenome", "factory metagenome", "feces metagenome",
    "feline metagenome", "fermentation metagenome", "fertilizer metagenome",
    "fish gut metagenome", "fishing equipment metagenome", "fish metagenome",
    "floral nectar metagenome", "flotsam metagenome", "flower metagenome",
    "food contamination metagenome", "food fermentation metagenome", "food metagenome",
    "food production metagenome", "fossil metagenome", "freshwater metagenome",
    "freshwater sediment metagenome", "frog metagenome", "fuel tank metagenome",
    "fungus metagenome", "gas well metagenome", "gill metagenome", "glacier lake metagenome",
    "glacier metagenome", "gonad metagenome", "grain metagenome", "granuloma metagenome",
    "groundwater metagenome", "gut metagenome", "halite metagenome",
    "herbal medicine metagenome", "honeybee metagenome", "honey metagenome", "horse metagenome",
    "hospital metagenome", "hot springs metagenome", "human bile metagenome",
    "human blood metagenome", "human brain metagenome", "human eye metagenome",
    "human feces metagenome", "human gut metagenome", "human hair metagenome",
    "human lung metagenome", "human metagenome", "human milk metagenome",
    "human nasopharyngeal metagenome", "human oral metagenome",
    "human reproductive system metagenome", "human saliva metagenome",
    "human semen metagenome", "human skeleton metagenome", "human skin metagenome",
    "human sputum metagenome", "human tracheal metagenome", "human urinary tract metagenome",
    "human vaginal metagenome", "human viral metagenome", "HVAC metagenome",
    "hydrocarbon metagenome", "hydrothermal vent metagenome", "hydrozoan metagenome",
    "hypersaline lake metagenome", "hyphosphere metagenome", "hypolithon metagenome",
    "ice metagenome", "indoor metagenome", "industrial waste metagenome",
    "insect gut metagenome", "insect metagenome", "insect nest metagenome",
    "internal organ metagenome", "interstitial water metagenome", "invertebrate gut metagenome",
    "invertebrate metagenome", "jellyfish metagenome", "karst metagenome", "koala metagenome",
    "lagoon metagenome", "lake water metagenome", "landfill metagenome", "leaf litter metagenome",
    "leaf metagenome", "lichen crust metagenome", "lichen metagenome", "liver metagenome",
    "lung metagenome", "macroalgae metagenome", "mangrove metagenome", "manure metagenome",
    "marine metagenome", "marine plankton metagenome", "marine sediment metagenome",
    "marsh metagenome", "marsupial metagenome", "medical device metagenome", "metagenome",
    "microbial eukaryotic metagenome", "microbial fuel cell metagenome",
    "microbial mat metagenome", "microeukaryotic metagenome", "milk metagenome",
    "mine drainage metagenome", "mine metagenome", "mine tailings metagenome",
    "mite metagenome", "mixed culture metagenome", "mollusc metagenome", "money metagenome",
    "moonmilk metagenome", "mosquito metagenome", "moss metagenome", "mouse gut metagenome",
    "mouse metagenome", "mouse skin metagenome", "mud metagenome", "museum specimen metagenome",
    "musk metagenome", "nematode metagenome", "neuston metagenome", "nutrient bag metagenome",
    "oasis metagenome", "oil field metagenome", "oil metagenome",
    "oil production facility metagenome", "oil sands metagenome", "oral metagenome",
    "oral-nasopharyngeal metagenome", "oral viral metagenome", "outdoor metagenome",
    "ovine metagenome", "oyster metagenome", "painting metagenome", "paper pulp metagenome",
    "parasite metagenome", "parchment metagenome", "peat metagenome", "periphyton metagenome",
    "permafrost metagenome", "photosynthetic picoeukaryotic metagenome", "phycosphere metagenome",
    "phyllosphere metagenome", "phytotelma metagenome", "pig gut metagenome", "pig metagenome",
    "pipeline metagenome", "pitcher plant inquiline metagenome", "placenta metagenome",
    "plant metagenome", "plastic metagenome", "plastisphere metagenome", "pollen metagenome",
    "pond metagenome", "poultry litter metagenome", "power plant metagenome", "primate metagenome",
    "probiotic metagenome", "protist metagenome", "psyllid metagenome", "rat gut metagenome",
    "rat metagenome", "reproductive system metagenome", "respiratory tract metagenome",
    "retting metagenome", "rhizoplane metagenome", "rhizosphere metagenome",
    "rice paddy metagenome", "riverine metagenome", "rock metagenome",
    "rock porewater metagenome", "rodent metagenome", "root associated fungus metagenome",
    "root metagenome", "runoff metagenome", "saline spring metagenome", "saltern metagenome",
    "salt lake metagenome", "salt marsh metagenome", "salt mine metagenome",
    "salt pan metagenome", "sand metagenome", "scorpion gut metagenome",
    "sea anemone metagenome", "seagrass metagenome", "sea squirt metagenome",
    "sea urchin metagenome", "seawater metagenome", "sediment metagenome", "seed metagenome",
    "semen metagenome", "shale gas metagenome", "sheep gut metagenome", "sheep metagenome",
    "shoot metagenome", "shrew metagenome", "shrimp gut metagenome", "silage metagenome",
    "skin metagenome", "slag metagenome", "sludge metagenome", "snake metagenome",
    "snow metagenome", "soda lake metagenome", "soda lime metagenome", "soil crust metagenome",
    "soil metagenome", "solid waste metagenome", "spider metagenome", "sponge metagenome",
    "starfish metagenome", "steel metagenome", "stomach metagenome", "stromatolite metagenome",
    "subsurface metagenome", "surface metagenome", "symbiont metagenome", "synthetic metagenome",
    "tannin metagenome", "tar pit metagenome", "termitarium metagenome",
    "termite fungus garden metagenome", "termite gut metagenome", "termite metagenome",
    "terrestrial metagenome", "tick metagenome", "tidal flat metagenome", "tin mine metagenome",
    "tobacco metagenome", "tomb wall metagenome", "tree metagenome",
    "upper respiratory tract metagenome", "urban metagenome", "urinary tract metagenome",
    "urine metagenome", "urogenital metagenome", "vaginal metagenome", "viral metagenome",
    "volcano metagenome", "wallaby gut metagenome", "wasp metagenome", "wastewater metagenome",
    "wetland metagenome", "whale fall metagenome", "whole organism metagenome", "wine metagenome",
    "Winogradsky column metagenome", "wood decay metagenome", "zebrafish metagenome"]
# ENA's controlled vocabulary for the geographic location (country/sea) attribute.
# BUGFIX: three entries were corrupted by an over-eager "ena" -> "ENA" replacement
# ("GrENAda", "Saint HelENA", "Saint Vincent and the GrENAdines"); restored to the
# correct INSDC spellings "Grenada", "Saint Helena", "Saint Vincent and the Grenadines".
geographicLocations = ["Afghanistan", "Albania", "Algeria", "American Samoa", "Andorra",
    "Angola", "Anguilla", "Antarctica", "Antigua and Barbuda", "Arctic Ocean", "Argentina",
    "Armenia", "Aruba", "Ashmore and Cartier Islands", "Atlantic Ocean", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bahrain", "Baker Island", "Baltic Sea", "Bangladesh",
    "Barbados", "Bassas da India", "Belarus", "Belgium", "Belize", "Benin", "Bermuda",
    "Bhutan", "Bolivia", "Borneo", "Bosnia and Herzegovina", "Botswana", "Bouvet Island",
    "Brazil", "British Virgin Islands", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
    "Cambodia", "Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic",
    "Chad", "Chile", "China", "Christmas Island", "Clipperton Island", "Cocos Islands",
    "Colombia", "Comoros", "Cook Islands", "Coral Sea Islands", "Costa Rica", "Cote d'Ivoire",
    "Croatia", "Cuba", "Curacao", "Cyprus", "Czech Republic", "Democratic Republic of the Congo",
    "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt",
    "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Europa Island",
    "Falkland Islands (Islas Malvinas)", "Faroe Islands", "Fiji", "Finland", "France",
    "French Guiana", "French Polynesia", "French Southern and Antarctic Lands", "Gabon",
    "Gambia", "Gaza Strip", "Georgia", "Germany", "Ghana", "Gibraltar", "Glorioso Islands",
    "Greece", "Greenland", "Grenada", "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea",
    "Guinea-Bissau", "Guyana", "Haiti", "Heard Island and McDonald Islands", "Honduras",
    "Hong Kong", "Howland Island", "Hungary", "Iceland", "India", "Indian Ocean", "Indonesia",
    "Iran", "Iraq", "Ireland", "Isle of Man", "Israel", "Italy", "Jamaica", "Jan Mayen", "Japan",
    "Jarvis Island", "Jersey", "Johnston Atoll", "Jordan", "Juan de Nova Island", "Kazakhstan",
    "Kenya", "Kerguelen Archipelago", "Kingman Reef", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan",
    "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
    "Luxembourg", "Macau", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali",
    "Malta", "Marshall Islands", "Martinique", "Mauritania", "Mauritius", "Mayotte",
    "Mediterranean Sea", "Mexico", "Micronesia", "Midway Islands", "Moldova", "Monaco",
    "Mongolia", "Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar", "Namibia",
    "Nauru", "Navassa Island", "Nepal", "Netherlands", "New Caledonia", "New Zealand",
    "Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands",
    "North Korea", "North Sea", "Norway", "not applicable", "not collected", "not provided",
    "Oman", "Pacific Ocean", "Pakistan", "Palau", "Palmyra Atoll", "Panama", "Papua New Guinea",
    "Paracel Islands", "Paraguay", "Peru", "Philippines", "Pitcairn Islands", "Poland",
    "Portugal", "Puerto Rico", "Qatar", "Republic of the Congo", "restricted access", "Reunion",
    "Romania", "Ross Sea", "Russia", "Rwanda", "Saint Helena", "Saint Kitts and Nevis",
    "Saint Lucia", "Saint Pierre and Miquelon", "Saint Vincent and the Grenadines", "Samoa",
    "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles",
    "Sierra Leone", "Singapore", "Sint Maarten", "Slovakia", "Slovenia", "Solomon Islands",
    "Somalia", "South Africa", "Southern Ocean", "South Georgia and the South Sandwich Islands",
    "South Korea", "Spain", "Spratly Islands", "Sri Lanka", "Sudan", "Suriname", "Svalbard",
    "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania",
    "Tasman Sea", "Thailand", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago",
    "Tromelin Island", "Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Islands",
    "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "Uruguay",
    "USA", "Uzbekistan", "Vanuatu", "Venezuela", "Viet Nam", "Virgin Islands", "Wake Island",
    "Wallis and Futuna", "West Bank", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"]
# Number of attempts reported in "could not find run" error messages.
RETRY_COUNT = 5
# MIMAG assembly-quality descriptions used in sample XMLs:
# HQ = high-quality draft, MQ = medium-quality draft.
HQ = ("Multiple fragments where gaps span repetitive regions. Presence of the "
    "23S, 16S, and 5S rRNA genes and at least 18 tRNAs.")
MQ = ("Many fragments with little to no review of assembly other than reporting "
    "of standard assembly statistics.")
class NoDataException(ValueError):
    """Raised when an ENA query returns no content (e.g. HTTP 204)."""
def parse_args(argv):
    """Parse and validate the command-line arguments for genome upload.

    :param argv: list of argument strings (e.g. sys.argv[1:])
    :return: the parsed argparse.Namespace
    :raises ValueError: when no upload study is provided
    :raises FileNotFoundError: when the genome metadata file is missing
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=("Allows to create xmls and manifest files for genome upload to ENA. "
                     "--xmls and --manifests are needed to determine the action the script "
                     "should perform. The use of more than one option is encouraged. To spare time, "
                     "-xmls and -manifests should be called only if respective xml or manifest files "
                     "do not already exist."))
    parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload")
    parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file")
    # MAGs and bins are mutually exclusive upload modes
    upload_mode = parser.add_mutually_exclusive_group(required=True)
    upload_mode.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload")
    upload_mode.add_argument('-b', '--bins', action='store_true', help="Select for bin upload")
    parser.add_argument('--out', type=str, help="Output folder. Default: working directory")
    parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups")
    parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " +
                        "option allows to validate samples beforehand")
    parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes")
    parser.add_argument('--webin', required=True, help="Webin id")
    parser.add_argument('--password', required=True, help="Webin password")
    parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes")

    args = parser.parse_args(argv)

    if not args.upload_study:
        raise ValueError("No project selected for genome upload [-u, --upload_study].")
    if not os.path.exists(args.genome_info):
        raise FileNotFoundError(f'Genome metadata file "{args.genome_info}" does not exist')
    return args
'''
Input metadata table — expects the following columns:
genome_name: genome file name
accessions: run(s) or assembly(ies) the genome was generated from
assembly_software: assembler_vX.X
binning_software: binner_vX.X
binning_parameters: binning parameters
stats_generation_software: software_vX.X
completeness: float
contamination: float
rRNA_presence: True/False if 5S, 16S, and 23S genes have been detected in the genome
NCBI_lineage: full NCBI lineage, either as tax ids or strings
broad_environment: string
local_environment: string
environmental_medium: string
metagenome: string
co-assembly: True/False, whether the genome was generated from a co-assembly
genome_coverage: genome coverage
genome_path: path to genome to upload
'''
def read_and_cleanse_metadata_tsv(inputFile, genomeType):
    """Read the genome metadata TSV and validate its content.

    Validates: no missing mandatory values, batch size < 5000, accession
    format, numeric completeness/contamination/coverage, co-assembly vs
    number of runs consistency, approved metagenome names, existing
    genome paths, genome-name length and uniqueness.

    :param inputFile: path to the tab-separated metadata file
    :param genomeType: "MAGs" or "bins"; MAGs additionally require
        rRNA_presence, completeness and contamination values
    :return: dict keyed by genome_name, each value a dict of the other columns
    :raises ValueError: on any failed validation
    :raises FileNotFoundError: if any genome_path does not exist
    :raises IndexError: if duplicate genome names are present
    """
    print('\tRetrieving info for genomes to submit...')
    binMandatoryFields = ["genome_name", "accessions",
        "assembly_software", "binning_software",
        "binning_parameters", "stats_generation_software", "NCBI_lineage",
        "broad_environment", "local_environment", "environmental_medium", "metagenome",
        "co-assembly", "genome_coverage", "genome_path"]
    MAGMandatoryFields = ["rRNA_presence", "completeness", "contamination"]
    allFields = MAGMandatoryFields + binMandatoryFields
    metadata = pd.read_csv(inputFile, sep='\t', usecols=allFields)

    # make sure there are no empty cells in the mandatory columns
    cleanColumns = list(metadata.dropna(axis=1))
    if genomeType == "MAGs":
        missingValues = [item for item in allFields if item not in cleanColumns]
    else:
        missingValues = [item for item in binMandatoryFields if item not in cleanColumns]
    if missingValues:
        raise ValueError("The following mandatory fields have missing values in " +
            "the input file: {}".format(", ".join(missingValues)))

    # check amount of genomes to register at the same time
    if len(metadata) >= 5000:
        raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.")

    # check whether accessions follow the right format
    # BUGFIX: was "[E|S|D]R[R|Z]" - inside a character class '|' is a literal,
    # so the old pattern wrongly accepted accessions such as "|RR000001".
    accessions_regExp = re.compile(r"([ESD]R[RZ]\d{6,})")
    accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions",
        "correct", "mismatching", "co-assembly"])
    accessionComparison["genome_name"] = metadata["genome_name"]
    accessionComparison["co-assembly"] = metadata["co-assembly"]
    accessionComparison["attemptive_accessions"] = metadata["accessions"].map(
        lambda a: len(a.split(',')))
    accessionComparison["correct"] = metadata["accessions"].map(
        lambda a: len(accessions_regExp.findall(a)))
    accessionComparison["mismatching"] = accessionComparison.apply(lambda row:
        True if row["attemptive_accessions"] == row["correct"]
        else None, axis=1).isna()
    mismatchingAccessions = accessionComparison[accessionComparison["mismatching"]]["genome_name"]
    if not mismatchingAccessions.empty:
        raise ValueError("Run accessions are not correctly formatted for the following " +
            "genomes: " + ','.join(mismatchingAccessions.values))

    # check whether completeness, contamination and coverage are numeric
    try:
        pd.to_numeric(metadata["completeness"])
        pd.to_numeric(metadata["contamination"])
        pd.to_numeric(metadata["genome_coverage"])
    except (ValueError, TypeError):  # narrowed from bare except
        raise ValueError("Completeness, contamination or coverage values should be formatted as floats")

    # check whether all co-assemblies have more than one run associated and vice versa
    coassemblyDiscrepancy = metadata[(
        (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) |
        ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"])
        )]["genome_name"]
    if not coassemblyDiscrepancy.empty:
        raise ValueError("The following genomes show discrepancy between number of runs "
            "involved and co-assembly status: " + ','.join(coassemblyDiscrepancy.values))

    # are provided metagenomes part of the accepted metagenome list?
    if not metadata["metagenome"].isin(metagenomes).all():
        raise ValueError("Metagenomes associated with each genome need to belong to ENA's " +
            "approved metagenomes list.")

    # do provided file paths exist?
    if not metadata["genome_path"].map(os.path.exists).all():
        raise FileNotFoundError("Some genome paths do not exist.")

    # check genome name lengths
    if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()):
        raise ValueError("Genome names must be shorter than 20 characters.")

    # create dictionary while checking genome name uniqueness
    # BUGFIX: explicit duplicate check instead of relying on a bare except
    # around to_dict(), which also masked unrelated errors.
    if metadata["genome_name"].duplicated().any():
        raise IndexError("Duplicate genome names were found in the input table.")
    return metadata.set_index("genome_name").transpose().to_dict()
def round_stats(stats):
    """Round a completeness/contamination value to two decimal places.

    Exactly 100 is returned as the int 100 (renders without decimals),
    while 0 is normalised to the float 0.0.
    """
    value = round(float(stats), 2)
    if value == 100.0:
        return 100
    if value == 0:
        return 0.0
    return value
def compute_MAG_quality(completeness, contamination, RNApresence):
    """Classify a MAG as high or medium quality following MIMAG criteria.

    High quality requires completeness >= 90, contamination <= 5 and
    detected rRNA genes; anything else is medium quality.

    :return: (quality description, completeness as str, contamination as str)
    """
    has_rna = str(RNApresence).lower() in ["true", "yes", "y"]
    if completeness >= 90 and contamination <= 5 and has_rna:
        quality = HQ
    else:
        quality = MQ
    return quality, str(round_stats(completeness)), str(round_stats(contamination))
def extract_tax_info(taxInfo):
    # Resolve an NCBI lineage string (";"-separated names or tax ids) into an
    # ENA-submittable (taxid, scientific name) pair, walking the lineage from
    # the most specific rank upwards until a submittable taxon is found.
    kingdoms = ["Archaea", "Bacteria", "Eukaryota"]
    kingdomTaxa = ["2157", "2", "2759"]  # NCBI tax ids parallel to `kingdoms`
    lineage, position, digitAnnotation = taxInfo.split(';'), 0, False
    selectedKingdom, finalKingdom = kingdoms, ""
    # A numeric last element means the whole lineage is given as tax ids.
    if lineage[-1].isdigit():
        selectedKingdom = kingdomTaxa
        position = 1  # NOTE(review): kingdom id expected at index 1 for tax-id lineages - confirm input format
        digitAnnotation = True
    # Determine which kingdom this lineage belongs to.
    for index, k in enumerate(selectedKingdom):
        if k in lineage[position]:
            finalKingdom = kingdoms[index]
    # Walk from the lowest (most specific) rank towards the kingdom.
    iterator = len(lineage)-1
    submittable = False
    while iterator != -1 and not submittable:
        scientificName = lineage[iterator].strip()
        if digitAnnotation:
            # translate the tax id into a scientific name via the ENA API
            scientificName = query_taxid(scientificName)
        elif "__" in scientificName:
            # GTDB-style prefix such as "g__Genus": keep the name part
            scientificName = scientificName.split("__")[1]
        else:
            raise ValueError("Unrecognised taxonomy format.")
        submittable, taxid, rank = query_scientific_name(scientificName, searchRank=True)
        if not submittable:
            # Fall back to kingdom-specific renaming heuristics.
            if finalKingdom == "Archaea":
                submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank)
            elif finalKingdom == "Bacteria":
                submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank)
            elif finalKingdom == "Eukaryota":
                submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank)
        iterator -= 1
    # NOTE(review): if the lineage is empty, `taxid`/`scientificName` would be
    # unbound here - assumes at least one lineage element; confirm with callers.
    return taxid, scientificName
def query_taxid(taxid):
    """Look up the scientific name for an NCBI tax id via the ENA taxonomy API.

    :return: the scientific name, "" when the response lacks one,
        or False when the HTTP request fails.
    """
    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
    response = requests.get(url)
    try:
        # non-2xx status raises HTTPError
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print("Request failed {} with error {}".format(url, e))
        return False
    return json.loads(response.text).get("scientificName", "")
def query_scientific_name(scientificName, searchRank=False):
    """Query the ENA taxonomy API by scientific name.

    :return: (submittable, taxid) or, with searchRank=True,
        (submittable, taxid, rank). Failed lookups return False with
        empty strings for the remaining fields.
    """
    failure = (False, "", "") if searchRank else (False, "")
    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName)
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        return failure
    try:
        res = json.loads(response.text)[0]
    except IndexError:
        # empty JSON array: the name is unknown to ENA
        return failure
    submittable = res.get("submittable", "").lower() == "true"
    taxid = res.get("taxId", "")
    if searchRank:
        return submittable, taxid, res.get("rank", "")
    return submittable, taxid
def extract_Eukaryota_info(name, rank):
    """Derive an ENA-submittable name/taxid for a eukaryotic taxon.

    :return: (submittable, name, taxid), or (False, "", 0) when no
        submittable name could be derived.
    """
    failure = (False, "", 0)
    # Asterisks in the given taxonomy suggest a low-confidence classification.
    if '*' in name:
        return failure
    if rank == "super kingdom":
        name = "uncultured eukaryote"
        submittable, taxid = query_scientific_name(name)
        return submittable, name, taxid
    # Try progressively more generic name variants until one is submittable.
    species_name = name.capitalize() + " sp."
    uncultured_name = "uncultured " + species_name
    for candidate in (species_name, uncultured_name, uncultured_name.replace(" sp.", '')):
        submittable, taxid = query_scientific_name(candidate)
        if submittable:
            return submittable, candidate, taxid
    return failure
def extract_Bacteria_info(name, rank):
    """Derive an ENA-submittable name/taxid for a bacterial taxon.

    :return: (submittable, name, taxid)
    """
    # Rewrite the name by rank; "species" keeps the name unchanged.
    templates = {
        "superkingdom": "uncultured bacterium",
        "family": "uncultured {} bacterium",
        "order": "uncultured {} bacterium",
        "class": "uncultured {} bacterium",
        "phylum": "uncultured {} bacterium",
        "genus": "uncultured {} sp.",
    }
    if rank in templates:
        name = templates[rank].format(name)
    submittable, taxid, rank = query_scientific_name(name, searchRank=True)
    if not submittable:
        # Apply common renaming fixes and retry once.
        if rank in ["species", "genus"] and name.lower().endswith("bacteria"):
            name = "uncultured {}".format(name.lower().replace("bacteria", "bacterium"))
        elif rank == "family":
            if name.lower() == "deltaproteobacteria":
                name = "uncultured delta proteobacterium"
        submittable, taxid = query_scientific_name(name)
    return submittable, name, taxid
def extract_Archaea_info(name, rank):
    """Derive an ENA-submittable name/taxid for an archaeal taxon.

    :return: (submittable, name, taxid)
    """
    # Rewrite the name by rank; "species" keeps the name unchanged.
    if rank == "superkingdom":
        name = "uncultured archaeon"
    elif rank == "phylum":
        if "Euryarchaeota" in name:
            name = "uncultured euryarchaeote"
        elif "Candidatus" in name:
            name = "{} archaeon".format(name)
        else:
            name = "uncultured {} archaeon".format(name)
    elif rank in ["family", "order", "class"]:
        name = "uncultured {} archaeon".format(name)
    elif rank == "genus":
        name = "uncultured {} sp.".format(name)
    submittable, taxid, rank = query_scientific_name(name, searchRank=True)
    if not submittable:
        # Drop problematic prefixes from Candidatus names and retry once.
        if "Candidatus" in name:
            if rank == "phylum":
                name = name.replace("Candidatus ", '')
            elif rank == "family":
                name = name.replace("uncultured ", '')
        submittable, taxid = query_scientific_name(name)
    return submittable, name, taxid
def extract_genomes_info(inputFile, genomeType):
    """Read the metadata file and enrich every genome record for submission.

    Splits accessions, detects run vs assembly accessions, computes MAG
    quality when stats are available, normalises the co-assembly flag to a
    boolean, generates a timestamped submission alias and resolves the NCBI
    lineage into a (taxid, scientific name) pair.

    :param inputFile: path to the genome metadata TSV
    :param genomeType: "MAGs" or "bins"
    :return: dict keyed by genome name with enriched per-genome dicts
    """
    genomeInfo = read_and_cleanse_metadata_tsv(inputFile, genomeType)
    # BUGFIX: was "[E|S|D]RZ" - '|' inside a character class is a literal, so
    # the old pattern wrongly matched accessions starting with '|'.
    # Compiled once outside the loop.
    assembly_regExp = re.compile(r"([ESD]RZ\d{6,})")
    for gen in genomeInfo:
        genomeInfo[gen]["accessions"] = genomeInfo[gen]["accessions"].split(',')
        accessionType = "run"
        if assembly_regExp.findall(genomeInfo[gen]["accessions"][0]):
            accessionType = "assembly"
        genomeInfo[gen]["accessionType"] = accessionType
        genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"]
        try:
            quality, compl, cont = compute_MAG_quality(genomeInfo[gen]["completeness"],
                genomeInfo[gen]["contamination"], genomeInfo[gen]["rRNA_presence"])
            genomeInfo[gen]["MAG_quality"] = quality
            genomeInfo[gen]["completeness"] = compl
            genomeInfo[gen]["contamination"] = cont
        except (KeyError, IndexError):
            # BUGFIX: a missing dict key raises KeyError, which the original
            # "except IndexError" did not catch; bins may lack these stats.
            pass
        # normalise the co-assembly flag to a proper boolean
        genomeInfo[gen]["co-assembly"] = \
            str(genomeInfo[gen]["co-assembly"]).lower() in ["yes", "y", "true"]
        # timestamped alias keeps re-submissions unique
        genomeInfo[gen]["alias"] = gen + '_' + str(int(dt.timestamp(dt.now())))
        taxID, scientificName = extract_tax_info(genomeInfo[gen]["NCBI_lineage"])
        genomeInfo[gen]["taxID"] = taxID
        genomeInfo[gen]["scientific_name"] = scientificName
    return genomeInfo
# ------------------- ENA API HANDLER -------------------
# TODO: organise this into a class
# Comma-separated field lists requested from the ENA portal API per result type.
RUN_DEFAULT_FIELDS = 'study_accession,secondary_study_accession,instrument_model,' \
    'run_accession,sample_accession'
ASSEMBLY_DEFAULT_FIELDS = 'sample_accession'
SAMPLE_DEFAULT_FIELDS = 'sample_accession,secondary_sample_accession,' \
    'collection_date,country,location'
STUDY_DEFAULT_FIELDS = 'study_accession,secondary_study_accession,description,study_title'
def get_default_params():
    """Return the base query parameters shared by all ENA portal API searches."""
    return dict(format='json', includeMetagenomes=True, dataPortal='ena')
def post_request(data, webin, password):
    """POST a search query to the ENA portal API using Webin credentials.

    :param data: form-encoded query parameters
    :param webin: Webin account id
    :param password: Webin account password
    :return: the requests.Response object
    """
    url = "https://www.ebi.ac.uk/ena/portal/api/search"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "*/*",
    }
    return requests.post(url, data=data, auth=(webin, password), headers=headers)
def get_run(run_accession, webin, password, attempt=0, search_params=None):
    """Fetch read_run metadata for a run accession from the ENA portal API.

    Retries up to 3 times, one second apart, on empty (204) or failed
    responses before giving up.

    :param run_accession: ENA run accession (e.g. ERR000001)
    :param attempt: internal retry counter
    :param search_params: extra query parameters merged into the request
    :return: dict with the run's metadata fields
    :raises ValueError: when the run cannot be retrieved or parsed
    """
    data = get_default_params()
    data['result'] = 'read_run'
    data['fields'] = RUN_DEFAULT_FIELDS
    data['query'] = 'run_accession=\"{}\"'.format(run_accession)
    if search_params:
        data.update(search_params)
    response = post_request(data, webin, password)
    if response.status_code == 204:
        # no content: the run may not be indexed yet, retry a few times
        if attempt < 3:
            sleep(1)
            # BUGFIX: search_params used to be dropped on retries
            return get_run(run_accession, webin, password, attempt + 1, search_params)
        raise ValueError("Could not find run {} in ENA after {}"
            " attempts".format(run_accession, RETRY_COUNT))
    if not str(response.status_code).startswith('2'):
        # transient failures (e.g. 401 auth hiccups): retry before failing hard
        if attempt < 3:
            sleep(1)
            return get_run(run_accession, webin, password, attempt + 1, search_params)
        raise ValueError("Could not retrieve run with accession {}, returned "
            "message: {}".format(run_accession, response.text))
    try:
        return json.loads(response.text)[0]
    except (IndexError, TypeError, ValueError):
        # BUGFIX: the old bare-except fallback indexed response.text (a str)
        # with a string key and referenced an unbound variable; removed.
        raise ValueError("Could not find run {} in ENA.".format(run_accession))
def get_run_from_assembly(assembly_name):
    """Return the first RUN_REF accession listed in an assembly's ENA XML record.

    :param assembly_name: ENA assembly accession
    :returns: accession string of the run the assembly was derived from
    """
    xml_url = "https://www.ebi.ac.uk" + "/ena/browser/api/xml/" + assembly_name
    manifest = minidom.parseString(requests.get(xml_url).text)
    first_ref = manifest.getElementsByTagName("RUN_REF")[0]
    return first_ref.attributes["accession"].value
def get_study(webin, password, primary_accession=None, secondary_accession=None):
    """Fetch study metadata from the ENA portal API.

    Tries result types 'study', 'read_study' and 'analysis_study' across both
    the 'ena' and 'metagenome' data portals until one combination returns data.

    :param webin: ENA Webin account name
    :param password: ENA Webin password
    :param primary_accession: primary study accession (PRJ...)
    :param secondary_accession: secondary study accession (ERP...)
    :returns: dict with STUDY_DEFAULT_FIELDS for the study
    :raises ValueError: when no portal/result-type combination finds the study
    """
    data = get_default_params()
    data['result'] = 'read_study'
    data['fields'] = STUDY_DEFAULT_FIELDS

    if primary_accession and not secondary_accession:
        data['query'] = 'study_accession="{}"'.format(primary_accession)
    elif not primary_accession and secondary_accession:
        data['query'] = 'secondary_study_accession="{}"'.format(secondary_accession)
    else:
        data['query'] = 'study_accession="{}" AND secondary_study_accession="{}"' \
            .format(primary_accession, secondary_accession)

    query_params = []
    for result_type in ['study', 'read_study', 'analysis_study']:
        for data_portal in ['ena', 'metagenome']:
            param = data.copy()
            param['result'] = result_type
            param['dataPortal'] = data_portal
            # The 'study' result type names the description field differently.
            if result_type == 'study' and 'description' in param['fields']:
                param['fields'] = param['fields'].replace('description', 'study_description')
            query_params.append(param)

    for param in query_params:
        try:
            # BUGFIX: query with the per-combination `param`; the base `data`
            # was posted before, so all six attempts were identical.
            response = post_request(param, webin, password)
            if response.status_code == 204:
                raise NoDataException()
            study = json.loads(response.text)[0]
            # BUGFIX: inspect `param`; `data['result']` was never 'study'.
            if param['result'] == 'study' and 'study_description' in study:
                study['description'] = study.pop('study_description')
            return study
        except NoDataException:
            print("No info found to fetch study with params {}".format(param))
        except (IndexError, TypeError, ValueError, KeyError):
            print("Failed to fetch study with params {}, returned error: {}".format(param, response.text))
    raise ValueError('Could not find study {} {} in ENA.'.format(primary_accession, secondary_accession))
def get_study_runs(study_acc, webin, password, fields=None, search_params=None):
    """Fetch all runs belonging to a study from the ENA portal API.

    :param study_acc: primary or secondary study accession
    :param webin: ENA Webin account name
    :param password: ENA Webin password
    :param fields: comma-separated fields to request (defaults to RUN_DEFAULT_FIELDS)
    :param search_params: optional extra query parameters merged into the search
    :returns: list of run dicts; empty list when the study has no runs (204)
    :raises ValueError: on an API error or unparsable response
    """
    data = get_default_params()
    data['result'] = 'read_run'
    data['fields'] = fields or RUN_DEFAULT_FIELDS
    data['query'] = '(study_accession="{}" OR secondary_study_accession="{}")'.format(study_acc, study_acc)
    if search_params:
        data.update(search_params)

    response = post_request(data, webin, password)
    if response.status_code == 204:
        # No content: the study exists but has no runs.
        return []
    if str(response.status_code)[0] != '2':
        # BUGFIX: message previously used printf-style "%s" with a tuple
        # constructor argument, producing a mangled error message.
        raise ValueError("Could not retrieve runs for study {}.".format(study_acc))
    try:
        return json.loads(response.text)
    except ValueError:
        # BUGFIX: narrowed from a bare `except`.
        raise ValueError("Query against ENA API did not work. Returned "
                         "message: {}".format(response.text))
def get_sample(sample_accession, webin, password, fields=None, search_params=None, attempt=0):
    """Fetch sample metadata from the ENA portal API.

    On 204 (no content) the query is retried with the other data portal
    ('ena' <-> 'metagenome'); on 401 it is retried unchanged after 1 s.
    At most 3 retries are performed.

    :param sample_accession: primary or secondary sample accession
    :param webin: ENA Webin account name
    :param password: ENA Webin password
    :param fields: comma-separated fields to request (defaults to SAMPLE_DEFAULT_FIELDS)
    :param search_params: optional extra query parameters merged into the search
    :param attempt: internal retry counter
    :returns: dict with the requested fields for the sample
    :raises ValueError: on an API error or when the sample is not found
    """
    data = get_default_params()
    data['result'] = 'sample'
    data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
    data['query'] = ('(sample_accession="{acc}" OR secondary_sample_accession'
                     '="{acc}") ').format(acc=sample_accession)
    if search_params:
        data.update(search_params)

    response = post_request(data, webin, password)
    if response.status_code == 200:
        return json.loads(response.text)[0]
    if response.status_code == 401:
        # BUGFIX: this branch was unreachable (the generic non-2xx raise fired
        # first) and referenced `new_params` before assignment; retry with the
        # caller's own search_params instead, and raise once retries run out.
        if attempt < 3:
            sleep(1)
            return get_sample(sample_accession, webin, password, fields=fields,
                              search_params=search_params, attempt=attempt + 1)
        raise ValueError("Could not find sample {} in ENA after "
                         "{} attempts.".format(sample_accession, RETRY_COUNT))
    if response.status_code == 204:
        # Empty result: flip the data portal and retry.
        if attempt < 3:
            new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'}
            return get_sample(sample_accession, webin, password, fields=fields,
                              search_params=new_params, attempt=attempt + 1)
        raise ValueError("Could not find sample {} in ENA after "
                         "{} attempts.".format(sample_accession, RETRY_COUNT))
    raise ValueError("Could not retrieve sample with accession {}. "
                     "Returned message: {}".format(sample_accession, response.text))
# -------------------------------------------------------
def extract_ENA_info(genomeInfo, uploadDir, webin, password, genomeType):
    """Collect run/study/sample metadata from ENA for every genome and merge it in.

    Assembly accessions are first resolved to run accessions, then run, study
    and sample metadata are fetched from ENA.  Per-run results are checkpointed
    to ``<uploadDir>/ENA_backup.json`` so an interrupted session can resume,
    and finally merged into ``genomeInfo`` via ``combine_ENA_info``.

    :param genomeInfo: dict of genome name -> genome metadata (mutated in place)
    :param uploadDir: directory holding the ENA_backup.json cache
    :param webin: ENA Webin account name
    :param password: ENA Webin password
    :param genomeType: "MAGs" or "bins" (forwarded to combine_ENA_info)
    :raises ValueError: when no study can be derived from the runs
    :raises IOError: when a study has no runs on ENA
    """
    print('\tRetrieving project and run info from ENA (this might take a while)...')
    # Resolve assembly accessions to their underlying runs and collect all runs.
    allRuns = []
    for g in genomeInfo:
        if genomeInfo[g]["accessionType"] == "assembly":
            derivedRuns = []
            for acc in genomeInfo[g]["accessions"]:
                derivedRuns.append(get_run_from_assembly(acc))
            genomeInfo[g]["accessions"] = derivedRuns
        allRuns.extend(genomeInfo[g]["accessions"])
    counter = 5  # NOTE(review): overwritten below before use — looks vestigial
    runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
    # Fetch study and sample accessions for every unique run.
    for i, r in enumerate(runsSet):
        run_info = get_run(r, webin, password)
        studySet.add(run_info["secondary_study_accession"])
        samplesDict[r] = run_info["sample_accession"]
        print (i, r)
        # NOTE(review): sleeps after EVERY run, not every 5 as the old comment
        # claimed — presumably rate-limiting; confirm before changing.
        sleep(1)
    if not studySet:
        raise ValueError("No study corresponding to runs found.")

    # Per-run metadata is cached in a JSON backup file so a crashed upload can
    # resume without re-querying ENA.
    backupFile = os.path.join(uploadDir, "ENA_backup.json")
    counter = 0
    if not os.path.exists(backupFile):
        with open(backupFile, 'w') as file:
            pass  # create an empty backup file on first use
    with open(backupFile, "r+") as file:
        try:
            backupDict = json.load(file)
            tempDict = dict(backupDict)
            print("\tA backup file for ENA sample metadata has been found.")
        except json.decoder.JSONDecodeError:
            # Empty or corrupt backup: start from scratch.
            backupDict = {}
        for s in studySet:
            studyInfo = get_study(webin, password, "", s)
            projectDescription = studyInfo["description"]
            ENA_info = get_study_runs(s, webin, password)
            if ENA_info == []:
                raise IOError("No runs found on ENA for project {}.".format(s))
            for run, item in enumerate(ENA_info):
                runAccession = ENA_info[run]["run_accession"]
                # Skip runs already cached and runs not requested by any genome.
                if runAccession not in backupDict:
                    if runAccession in runsSet:
                        sampleAccession = ENA_info[run]["sample_accession"]
                        sampleInfo = get_sample(sampleAccession, webin, password)
                        # Parse "lat N|S lon E|W" location strings into signed
                        # decimal latitude/longitude strings.
                        location = sampleInfo["location"]
                        if 'N' in location:
                            latitude = str(float(location.split('N')[0].strip()))
                            longitude = location.split('N')[1].strip()
                        elif 'S' in location:
                            latitude = '-' + str(float(location.split('S')[0].strip()))
                            longitude = location.split('S')[1].strip()
                        else:
                            latitude = "not provided"
                            longitude = "not provided"
                        if 'W' in longitude:
                            longitude = '-' + str(float(longitude.split('W')[0].strip()))
                        elif longitude.endswith('E'):
                            longitude = str(float(longitude.split('E')[0].strip()))
                        # Keep only countries from the ENA controlled vocabulary.
                        country = sampleInfo["country"].split(':')[0]
                        if not country in geographicLocations:
                            country = "not provided"
                        collectionDate = sampleInfo["collection_date"]
                        if collectionDate == "":
                            collectionDate = "not provided"
                        tempDict[runAccession] = {
                            "instrumentModel" : ENA_info[run]["instrument_model"],
                            "collectionDate" : collectionDate,
                            "country" : country,
                            "latitude" : latitude,
                            "longitude" : longitude,
                            "projectDescription" : projectDescription,
                            "study" : s,
                            "sampleAccession" : samplesDict[runAccession]
                        }
                        counter += 1
                        # Checkpoint the cache every 10 new runs and on the last one.
                        if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter):
                            file.seek(0)
                            file.write(json.dumps(tempDict))
                            file.truncate()
    tempDict = {**tempDict, **backupDict}
    combine_ENA_info(genomeInfo, tempDict, genomeType)
def multipleElementSet(metadataList):
    """Return True when the list contains more than one distinct value."""
    distinct_values = set(metadataList)
    return len(distinct_values) > 1
def multi_study_description(studyList, runList):
    """Group runs by their study and render one description fragment per study.

    :param studyList: study accession for each run (parallel to runList)
    :param runList: run accessions
    :returns: string like "ERP1 (run1,run2); ERP2 (run3)"
    """
    # Map each study to the runs it contributed, preserving first-seen order.
    runs_by_study = {}
    for run_acc, study_acc in zip(runList, studyList):
        runs_by_study.setdefault(study_acc, []).append(run_acc)
    # Render "STUDY (run1,run2,...)" per study and join with "; ".
    fragments = [
        "{} ({})".format(study_acc, ",".join(runs))
        for study_acc, runs in runs_by_study.items()
    ]
    return "; ".join(fragments)
def combine_ENA_info(genomeInfo, ENADict, genomeType):
    """Merge per-run ENA metadata into each genome's entry of ``genomeInfo``.

    Mutates ``genomeInfo`` in place: fills sequencing method, collection date,
    country, latitude/longitude, study, description and sample accessions.
    For co-assemblies, fields that differ between the constituent runs are
    either joined (instrument, samples, studies) or blanked to
    "not provided"/"not applicable" (date, country, coordinates).

    :param genomeInfo: dict of genome name -> genome metadata (mutated in place)
    :param ENADict: dict of run accession -> run metadata fetched from ENA
    :param genomeType: "MAGs" or "bins"
    :raises ValueError: for an unrecognised genomeType (previously this left
        ``assemblyType`` unbound and failed later with a NameError)
    """
    if genomeType == "MAGs":
        # checklist documents the matching ENA sample checklist for this type
        checklist, assemblyType = "ERC000047", "Metagenome-assembled genome"
    elif genomeType == "bins":
        checklist, assemblyType = "ERC000050", "binned metagenome"
    else:
        # BUGFIX: fail fast instead of a NameError further down.
        raise ValueError("Unknown genome type: {}".format(genomeType))

    for g in genomeInfo:
        if genomeInfo[g]["co-assembly"]:
            # Gather one value per constituent run.
            instrumentList, collectionList, countryList = [], [], []
            studyList, descriptionList, samplesList = [], [], []
            longList, latitList = [], []
            for run in genomeInfo[g]["accessions"]:
                instrumentList.append(ENADict[run]["instrumentModel"])
                collectionList.append(ENADict[run]["collectionDate"])
                countryList.append(ENADict[run]["country"])
                studyList.append(ENADict[run]["study"])
                descriptionList.append(ENADict[run]["projectDescription"])
                samplesList.append(ENADict[run]["sampleAccession"])
                longList.append(ENADict[run]["longitude"])
                latitList.append(ENADict[run]["latitude"])

            if multipleElementSet(studyList):
                # Runs come from several studies: warn and list them in the description.
                studyList2 = multi_study_description(studyList, genomeInfo[g]["accessions"])
                print("CAUTION: The co-assembly your {} {} was generated from was co-assembled from {} different studies. ".format(assemblyType, g, str(len(set(studyList)))) +
                      "Please ensure your genomes have been checked for cross-contamination and chimerism prior to submission to ENA")
                genomeInfo[g]["study"] = ','.join(studyList)
                # Description to be written to samples.xml.
                genomeInfo[g]["description"] = "This sample represents a {} assembled from runs of multiple projects: {}.".format(assemblyType, studyList2)
            else:
                # Single study: reuse its original description.
                genomeInfo[g]["study"] = studyList[0]
                genomeInfo[g]["description"] = descriptionList[0]

            # Collapse remaining fields: keep the single value, otherwise join
            # (instrument/samples) or declare unavailable (date, geography).
            genomeInfo[g]["sequencingMethod"] = ','.join(instrumentList) if multipleElementSet(instrumentList) else instrumentList[0]
            genomeInfo[g]["collectionDate"] = "not provided" if multipleElementSet(collectionList) else collectionList[0]
            genomeInfo[g]["country"] = "not applicable" if multipleElementSet(countryList) else countryList[0]
            genomeInfo[g]["latitude"] = "not provided" if multipleElementSet(latitList) else latitList[0]
            genomeInfo[g]["longitude"] = "not provided" if multipleElementSet(longList) else longList[0]
            genomeInfo[g]["sample_accessions"] = ','.join(samplesList) if multipleElementSet(samplesList) else samplesList[0]
        else:
            # Single-run genome: copy the run's metadata directly.
            run = genomeInfo[g]["accessions"][0]
            genomeInfo[g]["sequencingMethod"] = ENADict[run]["instrumentModel"]
            genomeInfo[g]["collectionDate"] = ENADict[run]["collectionDate"]
            genomeInfo[g]["study"] = ENADict[run]["study"]
            genomeInfo[g]["description"] = ENADict[run]["projectDescription"]
            genomeInfo[g]["sample_accessions"] = ENADict[run]["sampleAccession"]
            genomeInfo[g]["country"] = ENADict[run]["country"]
            genomeInfo[g]["longitude"] = ENADict[run]["longitude"]
            genomeInfo[g]["latitude"] = ENADict[run]["latitude"]

        # All accessions are serialized into one comma-separated string.
        genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"])
def handle_genomes_registration(sample_xml, submission_xml, webin, password, live=False):
    """Register genome samples with ENA by POSTing submission and sample XMLs.

    :param sample_xml: path to the sample XML file
    :param submission_xml: path to the submission XML file
    :param webin: ENA Webin account name
    :param password: ENA Webin password
    :param live: submit to the production service when True, dev/test otherwise
    :returns: dict mapping sample alias -> assigned SRA accession
    :raises ConnectionError: when the HTTP request itself fails
    :raises Exception: when ENA rejects the submission (non-200 response)
    """
    liveSub, mode = "", "live"
    if not live:
        liveSub = "dev"
        mode = "test"
    url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub)
    print('\tRegistering sample xml in {} mode.'.format(mode))

    # BUGFIX: file handles are now closed via `with`; the old bare `except`
    # wrapped the whole body, swallowed the SystemExit from sys.exit(1) and
    # masked every failure as a generic ConnectionError.
    with open(submission_xml, 'r') as sub_handle, open(sample_xml, 'r') as sample_handle:
        files = {
            'SUBMISSION': sub_handle,
            'SAMPLE': sample_handle
        }
        try:
            submissionResponse = requests.post(url, files=files, auth=(webin, password))
        except requests.RequestException:
            raise ConnectionError("Genomes could not be registered to ENA.")

    if submissionResponse.status_code != 200:
        if str(submissionResponse.status_code).startswith('5'):
            raise Exception("Genomes could not be submitted to ENA as the server " +
                            "does not respond. Please again try later.")
        raise Exception("Genomes could not be submitted to ENA. HTTP response: " +
                        submissionResponse.reason)

    receiptXml = minidom.parseString(submissionResponse.content.decode("utf-8"))
    receipt = receiptXml.getElementsByTagName("RECEIPT")
    success = receipt[0].attributes["success"].value
    aliasDict = {}  # BUGFIX: initialized up front so it is never unbound
    if success == "true":
        for s in receiptXml.getElementsByTagName("SAMPLE"):
            aliasDict[s.attributes["alias"].value] = s.attributes["accession"].value
    else:
        print("\tGenomes could not be submitted to ENA. Please, check the errors below.")
        for error in receiptXml.getElementsByTagName("ERROR"):
            print("\t" + error.firstChild.data)
        sys.exit(1)
    print('\t{} genome samples successfully registered.'.format(str(len(aliasDict))))
    return aliasDict
def getAccessions(accessionsFile):
    """Load an alias<TAB>accession file into a dict of alias -> accession."""
    mapping = {}
    with open(accessionsFile, 'r') as handle:
        for raw_line in handle:
            columns = raw_line.split('\t')
            mapping[columns[0]] = columns[1].rstrip('\n')
    return mapping
def saveAccessions(aliasAccessionDict, accessionsFile):
    """Append alias<TAB>accession lines to the accessions file."""
    with open(accessionsFile, 'a') as handle:
        handle.writelines(
            "{}\t{}\n".format(alias, accession)
            for alias, accession in aliasAccessionDict.items()
        )