#!/usr/bin/env python
import argparse
import itertools
# Explicit imports for modules used directly below; some of these may also be
# re-exported by the lib_files wildcard imports, but listing them here keeps
# this script's dependencies visible.
import datetime
import gc
import json
import os
import pipes
import sys
import textwrap
from collections import defaultdict
import networkx as nx
from lib_files.HaploFunct import *
from lib_files.AGP_lib import *
from lib_files.FASTA_lib import *
from lib_files.GFF_lib import *
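# Keep a reference to the original stdout object so it is not garbage-collected
# (and closed) when sys.stdout is rebound, then reopen the same file descriptor
# unbuffered (Python 2 idiom, buffering=0) so log messages are flushed live.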
gc.garbage.append(sys.stdout)
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
def main() :
###### Options and help ######
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
#### Input sequences
parser.add_argument("-i", "--input" , dest="query",
help="FASTA file of the input sequences", metavar="query.fasta [Required]")
parser.add_argument("--GFF3", dest="gff3",
help="Annotation in GFF3 format", metavar="genes.gff3")
parser.add_argument("-a" , "--input_agp" , dest="input_agp",
help="AGP file of query sequences composition" , metavar="query.agp" )
parser.add_argument("--input_groups" , dest="input_groups",
help="Tab separated values file of group association of input sequences. Two columns reporting the sequences ID and the association groups id. Sequence can be associated to multiple groups with multiple rows. [Required with --haplodup]" , metavar="input_groups.tsv")
parser.add_argument("--legacy_groups" , dest="legacy_groups",
help="Tab separated values file of group association of components present the input sequences. Two columns reporting the component ID and the association group it belongs to. Sequence can be associated to multiple groups with multiple rows. [Required with --haplodup and -a|--input_agp]", metavar="legacy_groups.tsv")
#### Markers
parser.add_argument("-m", "--markers", dest="marker_map",
help="Table of markers order when sorted genomic position. Tab separated file with 3 columns: 1) Chromosome ID, 2) Marker sorting position, 3) Marker ID. If a guide genome is used for collinearity search (-g/--guide), chromosomes ID must match the the files", metavar="markers_list")
parser.add_argument("-n" , "--map", dest="markers_hits",
help="Map of markers position on input sequences. BED-like tab separated file with 3 columns 1) Sequence ID, 2) Match start (0-based), 3) Match start (1-based), 4) Marker ID", metavar="map.tsv")
parser.add_argument("-f" , "--filter", dest="filter_hits", default=False , action="store_true",
help="If set, remove from the analysis the sequences with intra-sequence markers duplication. [Default: filter markers only, keep all sequences]")
parser.add_argument("--extended", dest="extended_region", default=False , action="store_true",
help="Extend the association of a sequence to all markers belonging to the same chromosome regardless of sequence orientation. [Default: only the markers in the expected orientation are considered for tiling analysis]")
#### Guide genome mapping
parser.add_argument("-g", "--guide", dest="reference",
help="FASTA file with reference sequence", metavar="guide.fasta")
parser.add_argument("-l" , "--local_alignment", dest="hits", default="local.paf",
help="PAF mapping file with local alignment hits of query sequences on guide sequences. If --align is set, the file will be created or overwritten, otherwise it will be used as input", metavar="local.paf")
# Output generation
parser.add_argument("-o", "--out", dest="out", default="out",
help="Output files name prefix [default: out]", metavar="NAME")
parser.add_argument("-p", "--prefix", dest="prefix", default="NEW",
help="Prefix for output sequences IDs [default: NEW]", metavar="PREFIX")
parser.add_argument("--agp", dest="agp", default=True, action="store_false",
help="Do not export structures in AGP format")
parser.add_argument("--fasta", dest="sequence", default=True, action="store_false",
help="Do not export sequences in FASTA format")
parser.add_argument("--gapsize", dest="gap", default="1000",
help="Minimum gap size placeholder for AGP and FASTA (in bp) [default: 1,000bp]" , metavar="N")
parser.add_argument("--concatenate", dest="conc", default="",
help="Set the gap size used to concatenate unplaced sequences (in bp) [default: do not concatenated]", metavar="N")
# Hit tiling control
parser.add_argument("--distance1", dest="distance1", default="2000000",
help="Set the maximum distance (bp) between sequence projections on the guide genome to be considered as adjacent of the same tiling path alignment for Hap1 [default: 2,000,000bp]", metavar="N")
parser.add_argument("--distance2", dest="distance2", default="4000000",
help="Set the maximum distance (bp) between sequence projections on the guide genome to be considered as adjacent of the same tiling path alignment for Hap2 [default: 4,000,000bp]",metavar="N" )
# Mapping control
parser.add_argument("--align" , dest="map", action="store_true",
help="Do run mapping [overwrite input file.paf if existing]")
parser.add_argument("-t", "--tool", dest="mapping", default=" --cs -x asm20 -r 1000",
help="Mapping command for minimap2", metavar="\" --cs -x asm20 -r 1000\"")
parser.add_argument("-c", "--cores", dest="cores", default=4,
help="Cores used in mapping process [default: 4]", metavar="N")
parser.add_argument("--hitgap" , dest="hitgap", default="100000",
help="Allowed maximum distance (in bp) between local alignment hits to be merged as part of the same global alignment. [default: 100,000bp]", metavar="N")
# Update previous chromosome structure control
parser.add_argument("-u", "--upgrade", dest="upgrade",
help="Previously computed structure to upgrade", metavar="structure.agp")
parser.add_argument("--upgrade_format", dest="upgrade_format", default="agp" ,
help="File format of the structure to upgrade [Default: agp] ", metavar="agp | AGP | block | BLOCK")
parser.add_argument("--upgradeQC", "--upgrade_QC" , "--upgrade_qc" , dest="upgrade_QC", default=False , action="store_true",
help="Stop after performing the QC of the markers before upgrading [Default: continue after QC]" )
parser.add_argument("--structure2map", dest="map_ids",
help="ID conversion between map and structure sequences [Default: identical]" )
parser.add_argument("--conflict" , dest="conflict_resolution", default="exit" ,
help=textwrap.dedent('''Resolution policy for structure vs. map conflicts in contig usage [Default: exit]. Choice of:
"exit": report and quit on conflicts
"ignore" : ignore the conflict
"release" : sequence is released from the given location and allowed to relocate according to the new map'''),
metavar = "exit"
)
# Tiling path control
parser.add_argument("-e", "--exclusion", dest="exclusion",
help="Tab separated file of sequences pairs to prevent being placed in the same haplotype", metavar="exclusion.tsv")
parser.add_argument("-k", "--known", dest="known",
help="Tab separated file of sequences known to be in the same haplotype. Two columns reporting the sequence ID and the associated group ID", metavar="known.tsv")
parser.add_argument("--allow_rearrangements", dest="rearrangements", default=False, action="store_true",
#help="Restrict constrains on sequence to a the given chromosome, allowing rearrangements on different chromosomes [default: Do not allow]")
help=argparse.SUPPRESS )
parser.add_argument("--alternative_groups", dest="alternative",
help="Tab separated file with two columns reporting ids (comma separated lists) of sequences that should be used in alternative haplotypes.", metavar="alternative_groups.tsv")
parser.add_argument("-1", "--path1", dest="use1",
#help="Use ONLY the following list of (stranded) contigs to create the first haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1|+,contig2|-,[...],contigN|+)", metavar="1st.txt")
help=argparse.SUPPRESS )
parser.add_argument("-2", "--path2", dest="use2",
#help="Use ONLY the following list of (stranded) contigs to create the second haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1|+,contig2|-,[...],contigN|+)", metavar="2nd.txt")
help=argparse.SUPPRESS )
parser.add_argument("--required_as_path", dest="forced_as_path" , default=False, action="store_true",
help="Manage lists in required sequences form \"--R1\" and \"--R2\" as tiling paths")
parser.add_argument("--R1", dest="Require1",
help="Require to use the following list of (stranded) contigs in the first haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1|+,contig2|-,[...],contigN|+)", metavar="1st.txt")
parser.add_argument("--F1" , "--forcemarkers1" , dest="force_direction1", default=False, action="store_true",
help="Force the use of markers according to chromosome and direction reported in --R1")
parser.add_argument("--min1", dest="minR1", default="0" ,
help="Minimum length of sequence allowed to be a requirement for the first haplotype (default: 0)", metavar="N")
parser.add_argument("--R2", dest="Require2",
help="Require to use the following list of (stranded) contigs in the second haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1|+,contig2|-,[...],contigN|+)", metavar="2nd.txt")
parser.add_argument("--F2" , "--forcemarkers2" , dest="force_direction2", default=False, action="store_true",
help="Force the use of markers according to chromosome and direction reported in --R2")
parser.add_argument("--min2", dest="minR2", default="0" ,
help="Minimum length of sequence allowed to be a requirement for the second haplotype (default: 0)", metavar="N")
parser.add_argument("--conflict_resolution", dest="conflict_resolution" , default=2,
help=argparse.SUPPRESS )
parser.add_argument("--B1", "--blacklist1", dest="Blacklist1",
help="Blacklisted (stranded) contigs NOT to be used in the first haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1,contig2,[...],contigN)", metavar="2nd.txt")
parser.add_argument("--B2", "--blacklist2", dest="Blacklist2",
help="Blacklisted (stranded) contigs NOT to be used in the second haplotype. Tab separated file with Target_ID followed by comma separated list of contigs (contig1,contig2,[...],contigN)", metavar="2nd.txt")
# processing steps control
parser.add_argument("--N2", dest="No2", action="store_true",
help="Don't run the search for the 2nd path")
parser.add_argument("--reuse_intermediate" , dest="reuse_intermediate", default=False, action="store_true",
help="Reuse the sequences and the intermediate results of a previous analysis and rerun just the plot" )
parser.add_argument("-v", "--dry", dest="dry", action="store_true",
help="Dry run: check uniqueness of marker in input sequences and quit")
# QC control
parser.add_argument("--skip_chimeric_qc", dest="skip_qc", default=False , action="store_true",
help="Do not perform QC report on potentially chimeric input sequences")
parser.add_argument("--avoid_rejected_qc" , dest="avoidrejectedqc", default=False, action="store_true",
help="Avoid running the quality control of the sequences assigned to a chromosome but not placed in the reconstructed pseudomolecule [Default: run]")
parser.add_argument("--only_markers" , dest="only_markers", default=False, action="store_true",
help="Limit the quality control of the rejected sequences only to those with markers [Default: Do all]")
parser.add_argument("--haplodup" , dest="haplodup", default=False, action="store_true",
help="Run HaploDup on assembly results. If --GFF3 is set, GMAP will be used to generate interactive plots for deduplication analysis, otherwise only dotplots will be delivered")
parser.add_argument("--debug" , dest="debug", default=False, action="store_true",
help=argparse.SUPPRESS )
parser.add_argument("--disable_marker_ploidy_check", dest="disable_marker_ploidy_check" , default=False, action="store_true",
help=argparse.SUPPRESS )
scriptDirectory = os.path.dirname(os.path.realpath(__file__)) + "/support_scripts"
print >> sys.stdout, "Running HaploSplit tool from HaploSync version " + get_version()
print >> sys.stdout, "To reproduce this run use the following command: " + " ".join( pipes.quote(x) for x in sys.argv)
print >> sys.stdout, "----"
# Sanity Check
if len(sys.argv) < 2:
parser.print_help()
sys.exit(1)
options = parser.parse_args()
if not options.reference and not options.marker_map and not options.markers_hits:
print >> sys.stderr , "[ERROR] Information for phasing pseudomolecules was given. -g/--guide and/or -m/--markers with -n/--map must be given"
parser.print_help()
sys.exit(1)
if (not options.marker_map and options.markers_hits) or (options.marker_map and not options.markers_hits):
print >> sys.stderr , "[ERROR] Marker information is partial. -m/--markers and -n/--map must be given concurrently"
parser.print_help()
sys.exit(1)
if not options.query :
print >> sys.stderr , "[ERROR] Query genome FASTA file missing"
parser.print_help()
sys.exit(1)
if options.haplodup :
if not options.input_groups :
print >> sys.stderr , "[ERROR] --haplodup set but --input_groups was not provided"
parser.print_help()
sys.exit(1)
if options.input_agp and (not options.legacy_groups) :
print >> sys.stderr , "[ERROR] --haplodup and -a|--input_agp set but --input_groups was not provided"
parser.print_help()
sys.exit(1)
###### Main ######
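# External tool paths (nucmer, show-coords) are read from the HaploSync.conf.toml
# configuration file that sits alongside this script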
paths = set_paths(os.path.join(sys.path[0], 'HaploSync.conf.toml'))
nucmer_path = paths["nucmer"]
showcoords_path = paths["show-coords"]
ref_to_hap1 = {}
hap1_to_ref = {}
hap1_ids = []
ref_ids = []
all_seq_length = {}
if not options.No2 :
hap2_ids = []
ref_to_hap2 = {}
hap2_to_ref = {}
hap2_to_hap1 = {}
hap1_to_hap2 = {}
### Read query and compute lengths
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading query sequences"
query_fasta_db = read_fasta(options.query)
query_len = {}
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Calculating query sequences lengths"
for query_name in query_fasta_db :
query_len[query_name] = int(len(query_fasta_db[query_name]))
all_seq_length[query_name] = int(len(query_fasta_db[query_name]))
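# Also index the length under the oriented ID variants ("|+", "|-", "|.") so
# downstream code can look lengths up by either plain or oriented sequence IDs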
all_seq_length[query_name + "|+"] = int(len(query_fasta_db[query_name]))
all_seq_length[query_name + "|-"] = int(len(query_fasta_db[query_name]))
all_seq_length[query_name + "|."] = int(len(query_fasta_db[query_name]))
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Number of query sequences: " + str(len(query_len.keys()))
### Read input AGP if given
if options.input_agp :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading query structure from AGP file"
query_agp_db = read_agp(options.input_agp)
else :
query_agp_db = {}
if options.gff3 :
### Read GFF3 input
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading annotation from GFF3"
annotation_gff3, mRNA_db = read_gff3(options.gff3)
else :
annotation_gff3 = {}
mRNA_db = {}
reference_sequence_list = []
### Read marker map and markers hits
marker_hits_by_seq = {}
marker_hits_by_id = {}
unknown_markers = []
if options.marker_map and options.markers_hits :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading markers map"
marker_map_by_seq = {}
marker_map_by_id = {}
# Read map
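# Expected line format (tab-separated): Chromosome_ID\tSorting_position\tMarker_ID
# e.g. "chr01\t1\tmarker_0001" (illustrative values); empty lines and "#" comment lines are skipped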
for line in open( options.marker_map ) :
if line == "" :
continue
if line[0] == "#" :
continue
try :
seq_id , pos , marker_id = line.rstrip().split("\t")
except :
print >> sys.stdout, "Error in markers map file " + options.marker_map + ", line do not match file format: " + line.rstrip()
print >> sys.stdout, "Error in markers map file " + options.marker_map + ", line do not match file format: " + line.rstrip()
sys.exit(1)
else :
if seq_id not in marker_map_by_seq :
reference_sequence_list.append(seq_id)
marker_map_by_seq[seq_id] = []
marker_map_by_seq[seq_id].append([ int(pos) , marker_id ] )
marker_map_by_id[marker_id] = [ seq_id , int(pos) , marker_id ]
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading markers hits"
# Read marker position
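# Expected line format (tab-separated): Seq_ID\tStart\tStop\tMarker_ID
# e.g. "contig_12\t1037\t1102\tmarker_0001" (illustrative values); start/stop are re-sorted below, so their order does not matter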
for line in open( options.markers_hits ) :
if line == "" :
continue
if line[0] == "#" :
continue
try :
seq_id , pos_1 , pos_2 , marker_id = line.rstrip().split("\t")
except :
print >> sys.stdout, "Error in markers hits file " + options.markers_hits + ", line do not match file format."
print >> sys.stderr, "Error in markers hits file " + options.markers_hits + ", line do not match file format."
print >> sys.stderr, "Line format Expected: Seq_ID\tStart\tStop\tMarker_ID"
print >> sys.stderr, "Line format found: " + line.rstrip()
sys.exit(1)
else :
if marker_id in marker_map_by_id :
if seq_id not in marker_hits_by_seq :
marker_hits_by_seq[seq_id] = []
marker_pos = marker_map_by_id[marker_id][1]
marker_chr = marker_map_by_id[marker_id][0]
start = min( int(pos_1) , int(pos_2) )
stop = max( int(pos_1) , int(pos_2) )
marker_hits_by_seq[seq_id].append([ int(start) , int(stop) , marker_id , marker_chr , int(marker_pos) ] )
if marker_id not in marker_hits_by_id :
marker_hits_by_id[marker_id] = []
marker_hits_by_id[marker_id].append( [ seq_id , int(start) , int(stop) ] )
else :
unknown_markers.append(marker_id)
if not unknown_markers == [] :
unknown_markers_file_name = options.out + ".unknown_markers.txt"
unknown_markers_file = open(unknown_markers_file_name ,'w')
unknown_markers = list(set(unknown_markers))
for unknown_marker_id in sorted(unknown_markers) :
print >> unknown_markers_file , unknown_marker_id
unknown_markers_file.close()
print >> sys.stderr, "[WARNING] Markers alignment on the query sequences reports " + str(len(unknown_markers)) + " unknown marker ids from the genetic map. Unknown markers will be exclude, their ids are reported in " + unknown_markers_file_name + " file."
### Read reference, if given, and compute lengths
if options.reference:
add_seq_id=False
if reference_sequence_list == [] :
add_seq_id=True
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading reference sequences"
reference = read_fasta(options.reference)
reference_len = {}
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Calculating reference sequences lengths"
# check if map sequences id are all represented -> issue warning
for ref_name in reference_sequence_list :
if ref_name not in reference :
print >> sys.stderr , "[WARNING] Reference " + ref_name + " is present in the markers map but missing from guide genome fasta"
for ref_name in reference :
if add_seq_id :
reference_sequence_list.append(ref_name)
else :
if ref_name not in reference_sequence_list :
# Sequence name in the fasta file does not match any map entry -> issue warning
print >> sys.stderr , "[WARNING] Guide sequence " + ref_name + " is present in the guide genome but absent from markers map. Sequence will be excluded from the analysis"
reference_len[ref_name] = int(len(reference[ref_name]))
all_seq_length[ref_name] = int(len(reference[ref_name]))
all_seq_length[ref_name + "|+"] = int(len(reference[ref_name]))
all_seq_length[ref_name + "|-"] = int(len(reference[ref_name]))
all_seq_length[ref_name + "|."] = int(len(reference[ref_name]))
ref_ids.append(ref_name)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Number of reference sequences: " + str(len(reference_len.keys()))
### For each chromosome, Create graph
best_1_paths = {}
best_1_paths_edges = {}
best_2_paths = {}
best_2_paths_edges ={}
all_used = []
used_by_chr_hap1 = {}
used_by_chr_hap2 = {}
unused_by_chr = {}
# Read mutually exclusive pairs
# To avoid tiling path with mutually exclusive sequences:
# 1) remove direct edges between all unwanted pairs
# 2) Iterate:
# a) get the longest tiling path
# b) find connected pairs (check if both are in tiling path)
# c) remove the shortest sequence of each offending pair from the network
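# Expected exclusion.tsv line (tab-separated): Seq_1_id\tSeq_2_id
# e.g. "contig_12\tcontig_47" (illustrative IDs); empty lines and "#" comment lines are skipped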
unwanted_pairs = {}
if options.exclusion :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading mutually exclusive sequences pairs"
for line in open(options.exclusion) :
if ( line == "" ) or ( line[0] == "#") :
continue
else:
try :
seq_1_id , seq_2_id = line.rstrip().split("\t")
except :
print >> sys.stdout , "[ERROR] Line in exclude pair file (" + options.exclusion + ") does not contain two ids"
print >> sys.stderr , "[ERROR] Line in exclude pair file (" + options.exclusion + ") does not contain two ids"
print >> sys.stderr , "[ERROR] Line expected: Seq_1_id\tSeq_2_id"
print >> sys.stderr , "[ERROR] Line found: " + line.rstrip()
sys.exit(1)
if (seq_1_id + "|+") not in unwanted_pairs :
unwanted_pairs[seq_1_id + "|+"] = []
unwanted_pairs[seq_1_id + "|-"] = []
unwanted_pairs[seq_1_id + "|."] = []
unwanted_pairs[seq_1_id + "|+"] += three_orientation_list(seq_2_id , False)
unwanted_pairs[seq_1_id + "|-"] += three_orientation_list(seq_2_id , False)
unwanted_pairs[seq_1_id + "|."] += three_orientation_list(seq_2_id , False)
if (seq_2_id + "|+") not in unwanted_pairs :
unwanted_pairs[seq_2_id + "|+"] = []
unwanted_pairs[seq_2_id + "|-"] = []
unwanted_pairs[seq_2_id + "|."] = []
unwanted_pairs[seq_2_id + "|+"] += three_orientation_list(seq_1_id , False)
unwanted_pairs[seq_2_id + "|-"] += three_orientation_list(seq_1_id , False)
unwanted_pairs[seq_2_id + "|."] += three_orientation_list(seq_1_id , False)
#unwanted_pairs_file = open(options.out + ".unwanted_pairs.txt", 'w')
#json.dump(unwanted_pairs , unwanted_pairs_file , indent=4)
#unwanted_pairs_file.close()
# Known relationships
known_groups = {}
known_groups_by_seqid = {}
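# Expected known.tsv line (tab-separated): Seq_id\tGroup_id ; all sequences sharing
# a group ID are treated as belonging to the same haplotype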
if options.known :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading known groups of sequences in the same haplotype"
for line in open(options.known) :
if ( line == "" ) or ( line[0] == "#") :
continue
else:
try :
seq_id , group_id = line.rstrip().split("\t")
except :
print >> sys.stdout , "[ERROR] Line in exclude pair file (" + options.known + ") does not contain two ids"
print >> sys.stderr , "[ERROR] Line in exclude pair file (" + options.known + ") does not contain two ids"
print >> sys.stderr , "[ERROR] Line expected: Seq_id\tGroup_id"
print >> sys.stderr , "[ERROR] Line found: " + line.rstrip()
sys.exit(1)
if group_id not in known_groups :
known_groups[group_id] = []
known_groups[group_id].append(seq_id)
#print >> sys.stderr , known_groups.keys()
for group_id in known_groups.keys() :
for seq_id in known_groups[group_id] :
if seq_id not in known_groups_by_seqid :
known_groups_by_seqid[seq_id] = []
known_groups_by_seqid[seq_id + "|+" ] = []
known_groups_by_seqid[seq_id + "|-" ] = []
known_groups_by_seqid[seq_id + "|." ] = []
for seq_2_id in known_groups[group_id] :
known_groups_by_seqid[seq_id].append(seq_2_id)
known_groups_by_seqid[seq_id] += three_orientation_list(seq_2_id , False)
known_groups_by_seqid[seq_id + "|+" ].append(seq_2_id)
known_groups_by_seqid[seq_id + "|+" ] += three_orientation_list(seq_2_id , False)
known_groups_by_seqid[seq_id + "|-" ].append(seq_2_id)
known_groups_by_seqid[seq_id + "|-" ] += three_orientation_list(seq_2_id , False)
known_groups_by_seqid[seq_id + "|." ].append(seq_2_id)
known_groups_by_seqid[seq_id + "|." ] += three_orientation_list(seq_2_id , False)
#json.dump(known_groups_by_seqid , open("known_groups.by_seq_id.txt", 'w') , indent=4)
#json.dump(known_groups , open("known_groups.by_cluster.txt", 'w') , indent=4)
alternative_sequences = {}
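# Expected alternative_groups.tsv line: two comma-separated ID lists separated by a tab
# (e.g. "seqA,seqB\tseqC,seqD" , illustrative IDs); every sequence in one list is
# registered as alternative to every sequence in the other list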
if options.alternative :
for line in open( options.alternative ) :
if line[0] == "#" or line.rstrip() == "" :
continue
else:
group_1 , group_2 = line.rstrip().split("\t")
#print >> sys.stderr , "group_1 and group_2"
group_1_list = group_1.split(",")
group_2_list = group_2.split(",")
#print >> sys.stderr , group_1_list
#print >> sys.stderr , group_2_list
for seq_id in group_1_list :
#print >> sys.stderr , seq_id
if seq_id not in alternative_sequences :
alternative_sequences[seq_id] = []
alternative_sequences[seq_id] += group_2_list
alternative_sequences[seq_id] = list(set(alternative_sequences[seq_id]))
for seq_id in group_2_list :
if seq_id not in alternative_sequences :
alternative_sequences[seq_id] = []
alternative_sequences[seq_id] += group_1_list
alternative_sequences[seq_id] = list(set(alternative_sequences[seq_id]))
# Force path if necessary
fixed_path_1 = defaultdict(list)
fixed_path_2 = defaultdict(list)
if options.use1 or options.use2 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading given paths"
if options.use1 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Sequences used in 1st path"
for line in open(options.use1) :
try :
Target_sequence , loaded_path = line.rstrip().split("\t")
prompted_list = loaded_path.split(",")
best_1_paths_edges[Target_sequence] = []
best_1_paths[Target_sequence] = []
fixed_path_1[Target_sequence] = []
for id in prompted_list :
best_1_paths[Target_sequence].append([id,0,0,0,0])
fixed_path_1[Target_sequence].append(id)
best_1_paths_edges[Target_sequence].append(id)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Target_sequence) + " : " + ",".join(best_1_paths_edges[Target_sequence])
except:
pass
if options.use2 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Sequences used in 2nd path"
for line in open(options.use2) :
try :
Target_sequence , loaded_path = line.rstrip().split("\t")
prompted_list = loaded_path.split(",")
best_2_paths_edges[Target_sequence] = []
best_2_paths[Target_sequence] = []
fixed_path_2[Target_sequence] = []
for id in prompted_list :
best_2_paths[Target_sequence].append([id,0,0,0,0])
fixed_path_2[Target_sequence].append(id)
best_2_paths_edges[Target_sequence].append(id)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Target_sequence) + " : " + ",".join(best_2_paths_edges[Target_sequence])
except :
pass
# Create required lists
forced_list_1 = defaultdict(list)
forced_list_2 = defaultdict(list)
discarded=[]
forced_list_1_id_oriented=[]
forced_list_1_id=[]
forced_list_2_id_oriented=[]
forced_list_2_id=[]
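# forced_list_X maps a target chromosome ID to the oriented sequence IDs ("name|+" / "name|-")
# required in haplotype X; the flat *_id / *_id_oriented lists are used below for
# cross-haplotype conflict checks, and "discarded" collects requirements rejected for short length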
# Create Blacklists
blacklist_1 = defaultdict(list)
blacklist_2 = defaultdict(list)
conflict_resolution = options.conflict_resolution.lower()
if options.upgrade and options.marker_map and options.markers_hits :
# TODO: To Test
options.force_direction1 = True
options.force_direction2 = True
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Importing structure to upgrade"
print >> sys.stdout, '# Importing structure to upgrade'
structure_db = read_known_structure( options.upgrade, options.upgrade_format , options.map_ids )
#json.dump( structure_db , open( options.out + ".structure_to_update.json" , "w" ) , indent=4 , sort_keys=True )
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = QC of structure and map consitency"
print >> sys.stdout, '# QC of structure and map consitency'
forced_list_1 , forced_list_2 , conflicts_db = upgrade_qc( structure_db , marker_hits_by_seq , conflict_resolution )
conflicts_file_name = options.out + ".map2structure_conflicts.txt"
conflicts_file_name = print_conflicts( conflicts_db , conflicts_file_name )
json.dump(forced_list_1, open(options.out + ".forced_list_1.json", "w"), indent=4, sort_keys=True)
json.dump(forced_list_2, open(options.out + ".forced_list_2.json", "w"), indent=4, sort_keys=True)
# Check if continue or quit
if options.upgrade_QC :
print >> sys.stdout, '[EXIT] QC of the comparison between structure and map completed'
print >> sys.stderr, '[EXIT] QC of the comparison between structure and map completed'
exit(0)
elif ( conflicts_db != {} and conflict_resolution == "exit" ) :
print >> sys.stdout, '[EXIT] Structure vs map QC completed. Conflicts found'
print >> sys.stderr, '[EXIT] Structure vs map QC completed. Conflicts found'
exit(1)
elif options.Require1 or options.Require2 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Reading required sequence lists"
if options.Require1 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Sequence required in 1st path"
for line in open(options.Require1,"r") :
try :
Target_sequence , loaded_path = line.rstrip().split("\t")
prompted_list = loaded_path.split(",")
forced_list_1[Target_sequence] = []
for id in prompted_list :
id_name = id[:-2]
id_length = query_len[id_name]
if id_length >= int(options.minR1) :
forced_list_1[Target_sequence].append(id)
forced_list_1_id_oriented.append(id)
forced_list_1_id.append(id_name)
else :
discarded.append(id_name)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Target_sequence) + ": " + ",".join(forced_list_1[Target_sequence])
except :
pass
if options.Require2 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Sequence required in 2nd path"
for line in open(options.Require2,"r") :
try :
Target_sequence , loaded_path = line.rstrip().split("\t")
prompted_list = loaded_path.split(",")
forced_list_2[Target_sequence] = []
for id in prompted_list :
id_name = id[:-2]
id_length = query_len[id_name]
if id_length >= int(options.minR2) :
forced_list_2[Target_sequence].append(id)
forced_list_2_id_oriented.append(id)
forced_list_2_id.append(id_name)
else :
discarded.append(id_name)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Target_sequence) + ": " + ",".join(forced_list_2[Target_sequence])
except :
pass
doubled = []
for seq_id in forced_list_2_id :
if seq_id in forced_list_1_id :
doubled.append(seq_id)
if not doubled == [] :
print >> sys.stdout, '[ERROR] Input error. Same sequence(s) required for both haplotypes'
print >> sys.stderr, '[ERROR] Input error. Same sequence(s) required for both haplotypes'
print >> sys.stderr, '[ERROR] duplicated sequences IDs:'
print >> sys.stderr, ", ".join([str(x) for x in doubled])
sys.exit(1)
if not discarded == [] :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Discarded required sequences because of short length (hap1 <" + str(options.minR1) +" or hap2 <" + str(options.minR2) + ")"
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === " + ",".join(discarded)
# Generate blacklist
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Blacklists"
for ref_name in reference_sequence_list :
blacklist_1[ref_name]=[]
blacklist_2[ref_name]=[]
if options.Blacklist1 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Rejected for 1st path"
for line in open(options.Blacklist1,"r") :
try :
Tid , Qids = line.rstrip().split("\t")
if Tid in blacklist_1 :
for name in Qids.split(",") :
blacklist_1[Tid] += three_orientation_list(name)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Tid) + " : " + ",".join(blacklist_1[Tid])
except:
pass
for key_in in sorted(blacklist_1.keys()) :
# Blacklist sequences required somewhere else
Required_somewhere_else = []
for key_out in sorted(blacklist_1.keys()) :
if not key_in == key_out :
Required_somewhere_else += forced_list_1[key_out]
Required_somewhere_else += fixed_path_1[key_out]
Required_somewhere_else += forced_list_2[key_out]
Required_somewhere_else += fixed_path_2[key_out]
for name in list(set(Required_somewhere_else)) :
blacklist_1[key_in] += three_orientation_list(name)
print >> sys.stderr , "### To reject for Path1@" + str(key_in) + " : " + ",".join(blacklist_1[key_in])
if options.Blacklist2 :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Rejected for 2nd path"
for line in open(options.Blacklist2,"r") :
try :
Tid , Qids = line.rstrip().split("\t")
if Tid in blacklist_2 :
for name in Qids.split(",") :
blacklist_2[Tid] += three_orientation_list(name)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Seq " + str(Tid) + " : " + ",".join(blacklist_2[Tid])
except:
pass
for key_in in sorted(blacklist_2.keys()) :
# Blacklist sequences required somewhere else
Required_somewhere_else = []
for key_out in sorted(blacklist_2.keys()) :
if not key_in == key_out :
Required_somewhere_else += forced_list_2[key_out]
Required_somewhere_else += fixed_path_2[key_out]
Required_somewhere_else += forced_list_1[key_out]
Required_somewhere_else += fixed_path_1[key_out]
for name in list(set(Required_somewhere_else)) :
blacklist_2[key_in] += three_orientation_list(name)
print >> sys.stderr , "### To reject for Path2@" + str(key_in) + " : " + ",".join(blacklist_2[key_in])
# If markers are given, run consistency check, duplication control and reporting
if options.marker_map :
if not options.markers_hits :
print >> sys.stdout , "[ERROR] Marker map present but no marker hits given"
print >> sys.stderr , "[ERROR] Marker map present but no marker hits given"
sys.exit(1)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Testing markers uniqueness"
if not options.disable_marker_ploidy_check :
if not options.No2 :
ploidy = 2
else :
ploidy = 1
else :
ploidy = 100
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Checking copy number"
multi_copy_markers , unique_marker_hits_by_id = check_marker_copies( marker_hits_by_id , ploidy )
#print >> sys.stderr , "multi_copy_markers len:" + str(len(multi_copy_markers))
# multi_copy_markers: list of markers IDs in more copies than ploidy
# unique_marker_hits_by_id: dict of unique markers
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Checking duplications within sequence"
chimera_list , markers_itradup , unique_distinct_marker_hits_by_id = check_in_sequence_duplications(marker_hits_by_seq , unique_marker_hits_by_id , multi_copy_markers , query_fasta_db , annotation_gff3 , query_agp_db , options.out , int(options.cores) , paths , options.skip_qc , ploidy )
#json.dump(unique_distinct_marker_hits_by_id, sys.stderr , indent=4)
print >> sys.stderr , "chimera_list len:" + str(len(chimera_list))
print >> sys.stderr , "markers_itradup len:" + str(len(markers_itradup))
#sys.exit(1)
# chimera_list = list of chimeric sequences
# markers_itradup = list of duplicated markers in chimeric sequences
# unique_distinct_marker_hits_by_id = db of markers in #ploidy copies in #ploidy distinct sequences
# In dry mode, stop after marker QC
if options.dry :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Dry run completed"
print >> sys.stderr, "# Dry run completed"
print >> sys.stdout , "------------------------------"
print >> sys.stdout, "- Done"
print >> sys.stdout , "------------------------------"
print >> sys.stderr , "##############################"
print >> sys.stderr , "# Done"
print >> sys.stderr , "##############################"
sys.exit(0)
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Cleaning markers from duplications"
print >> sys.stderr , "# Cleaning markers from duplications"
clean_marker_set_by_seq = clean_markers( unique_distinct_marker_hits_by_id , chimera_list , marker_map_by_seq , marker_map_by_id , options.out , query_fasta_db , {"show-coords" : paths["show-coords"] , "nucmer" : paths["nucmer"] } , int(options.cores) , options.filter_hits , options.extended_region , forced_list_1 , forced_list_2 , options.force_direction1 , options.force_direction2)
#json.dump(clean_marker_set_by_seq, open( "clean_marker_set_by_seq.txt",'w'), indent=4)
# Format:
# clean_marker_set_by_seq[seq_id] = {
# clean_marker_set_by_seq[seq_id]["id"] : seq_id
# clean_marker_set_by_seq[seq_id]["chr"] : chr_id
# clean_marker_set_by_seq[seq_id]["markers"] : [ ... , [chr_id , chr_pos , marker_id , seq_id, start , stop] , ... ]
# clean_marker_set_by_seq[seq_id]["markers"]["+"|"-"] with orientaion=="." : [ ... , [chr_id , chr_pos , marker_id , seq_id, start , stop] , ... ]
# clean_marker_set_by_seq[seq_id]["range"] : [marker_pos_min , marker_pos_max] ,
# clean_marker_set_by_seq[seq_id]["orientation"] : ["+" or "-" or "."] }
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Building tiling paths on markers"
print >> sys.stderr , "# Building tiling paths on markers"
# 1 - Split clean_marker_set_by_seq by chromosome
clean_marker_set_by_chr = {}
for seq_id in clean_marker_set_by_seq.keys() :
chr_id = clean_marker_set_by_seq[seq_id]["chr"][0]
if chr_id not in clean_marker_set_by_chr:
clean_marker_set_by_chr[chr_id] = []
clean_marker_set_by_chr[chr_id].append(clean_marker_set_by_seq[seq_id])
#json.dump(clean_marker_set_by_seq, open("clean_marker_set_by_seq.txt" , 'w') , indent=4)
unwanted_sequences_alternative_to_forced = {}
for Target_sequence in forced_list_1.keys() :
for seq_id in forced_list_1[Target_sequence] :
if seq_id in alternative_sequences :
alternative_to_seq_id = alternative_sequences[seq_id]
for Target_sequence_2 in forced_list_1.keys() :
if Target_sequence_2 not in unwanted_sequences_alternative_to_forced :
unwanted_sequences_alternative_to_forced[Target_sequence_2] = []
unwanted_sequences_alternative_to_forced[Target_sequence_2]+=alternative_to_seq_id
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Hap1 "
print >> sys.stderr , "# Hap1 "
for chr_id in sorted(clean_marker_set_by_chr.keys()) :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Processing " + str(chr_id)
print >> sys.stderr , "## Processing " + str(chr_id)
# Add start and stop to markers list
#print >> sys.stderr , "clean_marker_set_by_chr:"
#print >> sys.stderr, clean_marker_set_by_chr[chr_id]
stop_pos = max([ int(x[0]) for x in marker_map_by_seq[chr_id] ]) + 1
oriented_marker_set = add_orientation_to_ID(clean_marker_set_by_chr[chr_id])
#print >> sys.stderr , "oriented_marker_set:"
#print >> sys.stderr, oriented_marker_set
## QC of concordance of the forced list orientation with marker_set
marker_set , validated_forced_list_1 , validated_forced_list_2 , validated_blacklist_1 , validated_blacklist_2 = validate_marker_set( oriented_marker_set , forced_list_1[chr_id] , forced_list_2[chr_id] , blacklist_1[chr_id] , blacklist_2[chr_id] , options.conflict_resolution )
# validated_forced_list_X does not contain any sequence ID that has no marker
# sequences previously missing orientation info are oriented according to the forced list, if present
#print >> sys.stderr , "marker_set:"
#print >> sys.stderr, marker_set
#json.dump(marker_set, open("validated_marker_set."+ chr_id +".txt" , 'w') , indent=4)
if not options.use1 :
preferred_db = {}
# No given path for first tiling, search for first best tiling with blacklist and forced_list
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Searching best tiling path"
print >> sys.stderr , "### Searching 1st best tiling path"
# Check forced list against unwanted_pairs (both in the same haplotype -> raise error)
unwanted_to_remove_list = []
for constrained_seq_id in (unwanted_pairs.keys()) :
if constrained_seq_id in validated_forced_list_1 :
# sequence in an unwanted pair is in the forced set, check the mate
mate_ids = unwanted_pairs[constrained_seq_id]
if not is_list_overlapping_list( mate_ids , validated_forced_list_1 ):
unwanted_to_remove_list += mate_ids
else :
# Both sequences requested to be in the same haplotype -> error!
print >> sys.stdout , "[ERROR] Conflicting sequences constrains: Requested sequences pair present in exclusion information file"
print >> sys.stderr , "[ERROR] Conflicting sequences constrains: Requested sequences pair present in exclusion information file"
print >> sys.stderr , "[ERROR] Sequence pair present in both requested tiling path " + options.Require1 + " and excluded pairs from the same haplotype in file " + options.exclusion
print >> sys.stderr , "[ERROR] Requested sequence: " + constrained_seq_id
sys.exit(1)
if chr_id in unwanted_sequences_alternative_to_forced :
unwanted_to_remove_list += unwanted_sequences_alternative_to_forced[chr_id]
best_found = False
unwanted_to_remove_list = list(set(unwanted_to_remove_list))
unwanted_to_reuse_list = []
new_forced = validated_forced_list_1[:]
while not best_found :
#print >> sys.stderr , "unwanted_to_remove_list: "
#print >> sys.stderr, unwanted_to_remove_list
marker_graph_1 = remove_sequence_from_graph( unwanted_to_remove_list , chr_id , stop_pos , marker_set , new_forced , validated_blacklist_1 )
#print >> sys.stderr, marker_graph_1
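# The best tiling path is the longest path in the marker DAG, using the number of
# supporting markers on each edge ('marker_num') as the weight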
best_1_nodes = nx.dag_longest_path(marker_graph_1, weight='marker_num')
#print >> sys.stderr , best_1_nodes
best_1_edges , best_1_edges_names = get_marker_subgraph_from_path( marker_graph_1 , best_1_nodes )
#print >> sys.stderr , best_1_edges_names
# Run unwanted pair search, eventually update graph and rerun search
#print >> sys.stderr, "searching unwanted"
unwanted_to_remove = search_unwanted( best_1_edges_names , unwanted_pairs , all_seq_length , validated_forced_list_1)
if unwanted_to_remove == "" :
all_preferred_used = True
to_reintegrate = []
preferred_to_remove = []
# Check if the best path contains all preferred sequences >> otherwise remove them and try to improve by reintegrating the incompatible sequences
for seq_id in preferred_db.keys() :
if not seq_id in best_1_edges_names :
print >> sys.stderr, "##### Found equal optimal path without " + seq_id + ", trying to improve by reintegrating the incompatible sequences"
#print >> sys.stderr, "###### To reintegrate: " + str(preferred_db[seq_id])
all_preferred_used = False
unwanted_to_reuse_list += three_orientation_list(seq_id , True)
preferred_to_remove.append(seq_id)
to_reintegrate += preferred_db[seq_id]
#print >> sys.stderr, "to_reintegrate: " + str(to_reintegrate)
if all_preferred_used :
# Cannot be improved, as the best path uses all preferred sequences >> best found
best_found = True
else :
# The path does not use all preferred sequences: it scores the same as one that does, but may improve if it can reuse the sequences rejected because of the preferred ones
# Unused preferred >> unwanted_to_remove_list (dropping that sequence does not reduce the path score)
# Incompatible with unused preferred >> reintegrate to try to improve the path
new_forced = best_1_edges_names
to_reintegrate = list(set(to_reintegrate))
for reintegrate_id in to_reintegrate :
if (reintegrate_id in unwanted_to_remove_list) and (not reintegrate_id in unwanted_to_reuse_list) :
unwanted_to_remove_list.remove(reintegrate_id)
print >> sys.stderr, "###### Reintegrating: " + str(reintegrate_id)
for seq_id in preferred_to_remove :
del preferred_db[seq_id]
unwanted_to_remove_list += three_orientation_list(seq_id , True)
unwanted_to_remove_list = list(set(unwanted_to_remove_list))
else :
# unwanted_to_remove["keep"] = keep_seq_id|+
# unwanted_to_remove["remove"] = [ reject_seq_id|+ , reject_seq_id|- , reject_seq_id|. ]
print >> sys.stderr, "#### Found pair of sequences in the path that should not be placed in the same haplotype: " + str(unwanted_to_remove["keep"]) + " (to keep) | " + str(unwanted_to_remove["remove"]) + " (to remove)"
# Check if the one we are going to remove was a preferred before, eventually reconsider the removal decision
to_reintegrate = []
if unwanted_to_remove["remove"][0] in preferred_db :
to_reintegrate += preferred_db[unwanted_to_remove["remove"][0]]
del preferred_db[unwanted_to_remove["remove"][0]]
if unwanted_to_remove["remove"][1] in preferred_db :
to_reintegrate += preferred_db[unwanted_to_remove["remove"][1]]
del preferred_db[unwanted_to_remove["remove"][1]]
if unwanted_to_remove["remove"][2] in preferred_db :
to_reintegrate += preferred_db[unwanted_to_remove["remove"][2]]
del preferred_db[unwanted_to_remove["remove"][2]]
# The sequence to be removed was previously preferred over other sequence(s) in the path.
# Reintegrate the deleted one(s), if they are compatible
# Block - Start - Note - This should solve the issue of sequence IDs stuck in the forced bucket even after incompatibility resolution
updated_forced = []
for element in new_forced :
if not element in unwanted_to_remove["remove"] :
updated_forced.append(element)
# Block - End
new_forced = updated_forced
if to_reintegrate == [] :
unwanted_to_remove_list += unwanted_to_remove["remove"]
else :
unwanted_to_remove_list = list(set(unwanted_to_remove_list))
# Reintegrate the old sequences by removing them from the unwanted list
for reintegrate_id in to_reintegrate :
if (reintegrate_id in unwanted_to_remove_list) and (reintegrate_id not in unwanted_to_reuse_list) :
unwanted_to_remove_list.remove(reintegrate_id)
new_forced.append(reintegrate_id)
print >> sys.stderr, "###### Reintegrating: " + reintegrate_id
unwanted_to_remove_list += unwanted_to_remove["remove"]
unwanted_to_remove_list = list(set(unwanted_to_remove_list))
if unwanted_to_remove["keep"] not in preferred_db :
preferred_db[unwanted_to_remove["keep"]] = []
preferred_db[unwanted_to_remove["keep"]] += unwanted_to_remove["remove"]
best_1_paths[chr_id] = best_1_edges
used , best_1_paths_edges[chr_id] = make_list_from_marker_path( best_1_edges )
print >> sys.stderr , "#### Used sequence (in order): " + ",".join([str(x) for x in best_1_paths_edges[chr_id]])
all_used += used
used_by_chr_hap1[chr_id] = used
# Find all sequences paired to the sequences used in the best_1_paths
paired_to_used = []
if not options.rearrangements :
print >> sys.stderr , "## Setting constrains based on sequences used for Hap 1"
for Target_sequence in best_1_paths_edges.keys() :
for seq_id in best_1_paths_edges[Target_sequence] :
if seq_id in known_groups_by_seqid :
paired_to_used += known_groups_by_seqid[seq_id]
#print >> sys.stderr , "#### To discard: " + str(known_groups_by_seqid[seq_id])
paired_to_used = list(set(paired_to_used))
# Blacklist all sequences alternate to best_1_paths_edges[Target_sequence]
unwanted_sequences_alternative_to_used = {}
for Target_sequence in best_1_paths_edges.keys() :
unwanted_sequences_alternative_to_used[Target_sequence] = []
for seq_id in best_1_paths_edges[Target_sequence] :
if seq_id in alternative_sequences :
alternative_to_seq_id = alternative_sequences[seq_id]
for Target_sequence_2 in best_1_paths_edges.keys() :
if not Target_sequence_2 == Target_sequence :
if Target_sequence_2 not in unwanted_sequences_alternative_to_used :
unwanted_sequences_alternative_to_used[Target_sequence_2] = []
unwanted_sequences_alternative_to_used[Target_sequence_2]+=alternative_to_seq_id
unwanted_sequences_alternative_to_forced = {}
for Target_sequence in forced_list_2.keys() :
for seq_id in forced_list_2[Target_sequence] :
if seq_id in alternative_sequences :
alternative_to_seq_id = alternative_sequences[seq_id]
for Target_sequence_2 in forced_list_2.keys() :
if Target_sequence_2 not in unwanted_sequences_alternative_to_forced :
unwanted_sequences_alternative_to_forced[Target_sequence_2] = []
unwanted_sequences_alternative_to_forced[Target_sequence_2]+=alternative_to_seq_id
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] = Hap2 "
print >> sys.stderr , "# Hap2 "
for chr_id in sorted(clean_marker_set_by_chr.keys()) :
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] == Processing " + str(chr_id)
print >> sys.stderr , "## Processing " + str(chr_id)
stop_pos = max([ int(x[0]) for x in marker_map_by_seq[chr_id] ]) + 1
oriented_marker_set = add_orientation_to_ID(clean_marker_set_by_chr[chr_id])
marker_set , validated_forced_list_1 , validated_forced_list_2 , validated_blacklist_1 , validated_blacklist_2 = validate_marker_set( oriented_marker_set , forced_list_1[chr_id] , forced_list_2[chr_id] , blacklist_1[chr_id] , blacklist_2[chr_id] , options.conflict_resolution )
if not options.use2 :
if not options.No2 :
preferred_db = {}
# No given path for second tiling, search for second best tiling with blacklist and forced_list
print >> sys.stdout, '[' + str(datetime.datetime.now()) + "] === Searching best tiling path"
print >> sys.stderr , "### Searching 2nd best tiling path"
unwanted_to_remove_list = paired_to_used
unwanted_to_remove_list += unwanted_sequences_alternative_to_used[chr_id]
if chr_id in unwanted_sequences_alternative_to_forced :
unwanted_to_remove_list += unwanted_sequences_alternative_to_forced[chr_id]
for seq_id in best_1_paths_edges[chr_id] :
unwanted_to_remove_list += three_orientation_list(seq_id , True)
if seq_id in known_groups_by_seqid :
print >> sys.stderr , "#### To discard because of " + seq_id +" in hap1: " + str(known_groups_by_seqid[seq_id])
unwanted_to_remove_list += known_groups_by_seqid[seq_id]
# Check that no forced sequence is in unwanted_to_remove_list -> raise error and quit (should never happen)
for forced_seq_id in validated_forced_list_2 :
if forced_seq_id in unwanted_to_remove_list :
print >> sys.stdout , "[ERROR] Conflicting sequences constrains: Requested sequences should belong to Haplotype 1"
print >> sys.stderr , "[ERROR] Requested sequence " + forced_seq_id + " is in conflict with Haplotype 1 reconstruction. From sequences known to be in the same haplotype reported in file " + options.known + ", the sequence should be along with others in Haplotype 1"
sys.exit(1)
# Check forced list against unwanted_pairs (both in the same haplotype -> raise error)
for constrained_seq_id in (unwanted_pairs.keys()) :
if constrained_seq_id in validated_forced_list_2 :
# sequence in an unwanted pair is in the forced set, check the mate(s)
mate_id = unwanted_pairs[constrained_seq_id]
if not is_list_overlapping_list(mate_id, validated_forced_list_2 ):
unwanted_to_remove_list += mate_id
else :
# Both sequences requested to be in the same haplotype -> error!
print >> sys.stdout , "[ERROR] Conflicting sequences constrains: Requested sequences pair present in exclusion information file"
print >> sys.stderr , "[ERROR] Requested sequence " + constrained_seq_id + " present in both requested tiling path " + options.Require2 + " and excluded pairs from the same haplotype in file " + options.exclusion
sys.exit(1)
unwanted_to_remove_list = list(set(unwanted_to_remove_list))
unwanted_to_reuse_list = []
best_found = False
new_forced = validated_forced_list_2[:]
while not best_found :
#print >> sys.stderr , "unwanted_to_remove_list: "
#print >> sys.stderr, unwanted_to_remove_list
#print >> sys.stderr , "validated_blacklist_2: "
#print >> sys.stderr, validated_blacklist_2
marker_graph_2 = remove_sequence_from_graph( unwanted_to_remove_list , chr_id , stop_pos, marker_set , new_forced , validated_blacklist_2 )
best_2_nodes = nx.dag_longest_path(marker_graph_2, weight='marker_num')
#print >> sys.stderr , best_2_nodes
best_2_edges , best_2_edges_names = get_marker_subgraph_from_path( marker_graph_2 , best_2_nodes )
# Run unwanted pair search, eventually update graph and rerun search
#print >> sys.stderr , best_2_edges
#print >> sys.stderr , best_2_edges_names
unwanted_to_remove = search_unwanted( best_2_edges_names , unwanted_pairs , all_seq_length , validated_forced_list_2)
if unwanted_to_remove == "" :
all_preferred_used = True
to_reintegrate = []
preferred_to_remove = []
# Check if the best path contains all preferred sequences >> otherwise remove them and try to improve by reintegrating the incompatible sequences
for seq_id in preferred_db.keys() :
if not seq_id in best_2_edges_names :
print >> sys.stderr, "##### Found equal optimal path without " + seq_id + ", trying to improve by reintegrating the incompatible sequences"
#print >> sys.stderr, "###### To reintegrate: " + str(preferred_db[seq_id])
all_preferred_used = False
unwanted_to_reuse_list += three_orientation_list(seq_id , True)
preferred_to_remove.append(seq_id)
to_reintegrate += preferred_db[seq_id]
#print >> sys.stderr, "to_reintegrate: " + str(to_reintegrate)
if all_preferred_used :
# Cannot be improved, as the best path uses all preferred sequences >> best found
best_found = True
else :
# The path does not use all preferred sequences: it scores the same as one that does.