-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmsTools.py
174 lines (148 loc) · 6.74 KB
/
msTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import sys, gzip, bisect
def sortedFlankingPositionsByDistToTargSite(targetPos, flankingPositionsToExamine, desiredNumPositions, physLen):
    """Return flanking positions ordered by increasing distance from targetPos.

    Walks outward from targetPos one step at a time.  At each distance the
    position to the left is examined first, then the one to the right, and
    each is collected if it is a member of flankingPositionsToExamine.
    targetPos itself is never examined.

    Args:
        targetPos: position distances are measured from.
        flankingPositionsToExamine: container of candidate positions; only
            membership tests (``in``) are performed on it.
        desiredNumPositions: maximum number of positions to collect.
        physLen: a left position is considered only if it is >= 0 and a
            right position only if it is < physLen.

    Returns:
        A list of at most desiredNumPositions positions, nearest first (left
        before right at equal distance).  The list is shorter than requested
        when the candidates inside the valid range run out.
    """
    sortedFlankingPositions = []
    i = 1
    while len(sortedFlankingPositions) < desiredNumPositions:
        lPos = targetPos - i
        rPos = targetPos + i
        lInRange = lPos >= 0
        rInRange = rPos < physLen
        if not (lInRange or rInRange):
            # Both probes have left the valid range, so nothing further can
            # ever be found; without this exit the loop would spin forever
            # whenever fewer candidates exist than were requested.
            break
        if lInRange and lPos in flankingPositionsToExamine:
            sortedFlankingPositions.append(lPos)
        # Re-check the quota: the left probe may just have filled it.
        if (rInRange and rPos in flankingPositionsToExamine
                and len(sortedFlankingPositions) < desiredNumPositions):
            sortedFlankingPositions.append(rPos)
        i += 1
    return sortedFlankingPositions
def getNearestEmptyPositions(donorPos, snpCountAtPos, physLen):
    """Find the empty sites closest to an over-occupied site.

    The number of slots requested is ``snpCountAtPos[donorPos] - 1``, i.e.
    one for every SNP at donorPos beyond the first.  The search walks outward
    from donorPos, looking one step further left and then one step further
    right on every pass, and collects sites whose count is 0.

    Args:
        donorPos: site whose surplus SNPs need new homes.
        snpCountAtPos: mapping from site index to the number of SNPs there;
            it is read but not modified.
        physLen: valid site indices are 0 .. physLen-1.

    Returns:
        A list of exactly ``snpCountAtPos[donorPos] - 1`` empty site indices,
        in the order they were found (empty list if there is no surplus).

    Raises:
        ValueError: if the search runs off both ends of the region before
            enough empty sites have been found.
    """
    numColliders = snpCountAtPos[donorPos] - 1
    freeSlots = []
    lPos = donorPos - 1
    rPos = donorPos + 1
    while len(freeSlots) < numColliders:
        if lPos < 0 and rPos > physLen - 1:
            # Nothing left to scan on either side; previously this case
            # looped forever.
            raise ValueError(
                "only %d of %d required empty positions found around site %s"
                % (len(freeSlots), numColliders, donorPos))
        if lPos >= 0:
            if snpCountAtPos[lPos] == 0:
                freeSlots.append(lPos)
            lPos -= 1
        if rPos <= physLen - 1:
            # The left probe may just have satisfied the quota, so re-check
            # it here; otherwise one slot too many is handed back whenever
            # numColliders is odd.
            if snpCountAtPos[rPos] == 0 and len(freeSlots) < numColliders:
                freeSlots.append(rPos)
            rPos += 1
    return freeSlots
def resolveCollision(donorPos, snpCountAtPos, physLen):
    """Spread the SNPs stacked on donorPos onto nearby empty sites.

    For every slot returned by getNearestEmptyPositions, one SNP is moved
    from donorPos to that slot.  snpCountAtPos is updated in place and
    nothing is returned.

    Args:
        donorPos: site index holding the colliding SNPs.
        snpCountAtPos: mapping from site index to SNP count (mutated).
        physLen: passed through to getNearestEmptyPositions.
    """
    emptySlots = getNearestEmptyPositions(donorPos, snpCountAtPos, physLen)
    for slot in emptySlots:
        snpCountAtPos[slot] = snpCountAtPos[slot] + 1
        # A recipient must have been empty before receiving its SNP.
        assert snpCountAtPos[slot] == 1
        snpCountAtPos[donorPos] = snpCountAtPos[donorPos] - 1
def msPositionsToIntegerPositions(positions, physLen):
    """Map fractional ms positions onto distinct integer sites.

    Each position is scaled by physLen and truncated to an integer site.
    Sites that end up holding more than one SNP are then resolved, starting
    with the site at the middle of the region and working outward, by moving
    the surplus SNPs to nearby empty sites (see resolveCollision).

    Args:
        positions: sequence of fractional positions, expected to lie in
            [0, 1]; a value that scales to exactly physLen is placed on the
            last site.
        physLen: number of integer sites available; must be at least
            len(positions) so that every SNP can get its own site.

    Returns:
        Sorted list of occupied integer sites, one per input position.  An
        empty input yields an empty list.
    """
    assert physLen >= len(positions)
    if len(positions) == 0:
        # Nothing to place; the sanity checks below assume at least one SNP.
        return []

    snpCountAtPos = dict.fromkeys(range(physLen), 0)
    for position in positions:
        intPos = int(physLen * position)
        if intPos == physLen:
            intPos = physLen - 1
        snpCountAtPos[intPos] += 1

    collisions = {pos for pos, count in snpCountAtPos.items() if count > 1}
    if collisions:
        # Floor division keeps the midpoint an integer site.  With true
        # division an odd physLen gives a fractional midpoint, and stepping
        # out from it never lands on any of the integer collision sites.
        midPos = physLen // 2
        collisionPositions = []
        if midPos in collisions:
            collisionPositions.append(midPos)
        collisionPositions += sortedFlankingPositionsByDistToTargSite(
            midPos, collisions, len(collisions) - len(collisionPositions), physLen)
        for pos in collisionPositions:
            resolveCollision(pos, snpCountAtPos, physLen)

    # Every collision must be gone by now.
    assert max(snpCountAtPos.values()) == 1
    newPositions = [site for site in sorted(snpCountAtPos) if snpCountAtPos[site] > 0]
    assert newPositions[0] >= 0 and newPositions[-1] < physLen
    return newPositions
def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, positionsFirst=True):
    """Convert one ms replicate into a nested-list haplotype array.

    Args:
        samples: sequence of per-sample rows, each indexable by site and as
            long as positions (msOutToHaplotypeArrayIn passes strings).
        positions: fractional ms positions for this replicate.
        totalPhysLen: region length handed to msPositionsToIntegerPositions.
        positionsFirst: if true the result is indexed [site][sample],
            otherwise [sample][site].

    Returns:
        (hapArrayIn, positions) where positions is the list produced by
        msPositionsToIntegerPositions.
    """
    for row in samples:
        assert len(row) == len(positions)

    positions = msPositionsToIntegerPositions(positions, totalPhysLen)
    numSites = len(positions)

    if positionsFirst:
        # One inner list per site, holding that site's allele in each sample.
        hapArrayIn = [[row[site] for row in samples] for site in range(numSites)]
    else:
        # One inner list per sample, holding its allele at each site.
        hapArrayIn = [[row[site] for site in range(numSites)] for row in samples]
    return hapArrayIn, positions
def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, positionsFirst=True):
    """Parse ms-style simulation output into haplotype arrays.

    The first line is split on whitespace and its second and third tokens
    are read as the number of samples and the number of replicates.  Each
    replicate must start with a ``//`` line, followed by a ``segsites:``
    line, a ``positions:`` line, and then one line per sample whose length
    equals the segsites value.

    Args:
        msOutputFileName: path to the file to read; a name ending in ``.gz``
            is read through gzip, and the literal string ``"stdin"`` reads
            from standard input instead.
        totalPhysLen: region length passed to msRepToHaplotypeArrayIn.
        positionsFirst: passed to msRepToHaplotypeArrayIn.

    Returns:
        (hapArraysIn, positionArrays): two parallel lists with one entry per
        replicate, as returned by msRepToHaplotypeArrayIn.

    Raises:
        SystemExit: (via sys.exit) when the input is malformed or the number
            of replicates read differs from the header.
    """
    if msOutputFileName == "stdin":
        isFile = False
        msStream = sys.stdin
    else:
        isFile = True
        if msOutputFileName.endswith(".gz"):
            # gzip.open defaults to binary mode; ask for text so lines are
            # str and can be compared with "//" etc. below.
            msStream = gzip.open(msOutputFileName, "rt")
        else:
            msStream = open(msOutputFileName)

    try:
        header = msStream.readline()
        _program, numSamples, numSims = header.strip().split()[:3]
        numSamples, numSims = int(numSamples), int(numSims)

        hapArraysIn = []
        positionArrays = []
        # advance to first simulation; stop at EOF (empty string) so a file
        # without any "//" line cannot hang the parser
        line = msStream.readline()
        while line and not line.strip().startswith("//"):
            line = msStream.readline()
        while line:
            if not line.strip().startswith("//"):
                sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip()))
            segsitesBlah, segsites = msStream.readline().strip().split()
            segsites = int(segsites)
            if segsitesBlah != "segsites:":
                sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")

            positionsLine = msStream.readline().strip().split()
            if not positionsLine[0] == "positions:":
                sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
            positions = [float(x) for x in positionsLine[1:]]

            samples = []
            for i in range(numSamples):
                sampleLine = msStream.readline().strip()
                if len(sampleLine) != segsites:
                    sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites,len(sampleLine),sampleLine,i,numSamples))
                samples.append(sampleLine)

            hapArrayIn, positions = msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, positionsFirst=positionsFirst)
            hapArraysIn.append(hapArrayIn)
            positionArrays.append(positions)

            line = msStream.readline()
            # advance to the next non-empty line or EOF
            while line and line.strip() == "":
                line = msStream.readline()

        if len(hapArraysIn) != numSims:
            sys.exit("Malformed ms-style output file: %s of %s sims processed. AAAARRRRGGHHH!!!!!\n" %(len(hapArraysIn), numSims))
    finally:
        # Close the file on every exit path, including sys.exit; stdin is
        # left open because this function did not open it.
        if isFile:
            msStream.close()
    return hapArraysIn, positionArrays
def msOutToHaplotypeMatrices(msOutputFileName, totalPhysLen):
    """Parse ms-style output with each replicate indexed [sample][site].

    Thin wrapper around msOutToHaplotypeArrayIn with positionsFirst=False;
    its return value is passed back unchanged.
    """
    parsed = msOutToHaplotypeArrayIn(
        msOutputFileName,
        totalPhysLen,
        positionsFirst=False,
    )
    return parsed
def windowHaps(hapArraySamplesFirst, positionArray, winStart, winEnd):
    """Cut a sample-major haplotype array down to a window of positions.

    Args:
        hapArraySamplesFirst: sequence of per-sample rows, each indexable by
            site index.
        positionArray: position of each site, in the same order as the
            columns of the rows.
        winStart: first position kept (inclusive).
        winEnd: last position kept (inclusive).

    Returns:
        (windowHapArray, windowPositions): a list of per-sample lists holding
        only the columns whose position lies in the window, and the list of
        those positions.
    """
    keptColumns = [
        col for col, pos in enumerate(positionArray)
        if pos >= winStart and pos <= winEnd
    ]
    windowHapArray = [[row[col] for col in keptColumns] for row in hapArraySamplesFirst]
    windowPositions = [positionArray[col] for col in keptColumns]
    return windowHapArray, windowPositions
def msWinStr(hapArraysSamplesFirst, positionArrays, winStart, winEnd):
    """Render a window of every replicate as ms-style text.

    Each replicate is cut down to [winStart, winEnd] with windowHaps, and the
    kept positions are rescaled to (pos - winStart) / (winEnd - winStart + 1).

    Args:
        hapArraysSamplesFirst: one sample-major haplotype array per
            replicate; the sample count in the header is taken from the
            first one.  Alleles must be strings, since they are joined.
        positionArrays: site positions for each replicate, parallel to
            hapArraysSamplesFirst.
        winStart: first position of the window (inclusive).
        winEnd: last position of the window (inclusive).

    Returns:
        A single string: a header line, a filler line, and then for every
        replicate a ``//`` block with segsites, positions and one line per
        sample.
    """
    numSamples = len(hapArraysSamplesFirst[0])
    numReps = len(hapArraysSamplesFirst)
    pieces = ["./windowedMSOutput %s %s\nblah\n" % (numSamples, numReps)]
    winLen = winEnd - winStart + 1.0

    for repIndex, hapArray in enumerate(hapArraysSamplesFirst):
        winHaps, winPositions = windowHaps(hapArray, positionArrays[repIndex], winStart, winEnd)
        relPositions = [(pos - winStart) / winLen for pos in winPositions]
        pieces.append("\n//\nsegsites: %s\n" % (len(relPositions)))
        pieces.append("positions: " + " ".join([str(x) for x in relPositions]) + "\n")
        for sample in winHaps:
            pieces.append("".join(sample) + "\n")
    return "".join(pieces)