-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrixProcessing.py
36 lines (29 loc) · 1.16 KB
/
matrixProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""
This part of the BLAST heuristic algorithm will be
used to process scoring matrices and prepare them for
use in the algorithm.
Ambiguity codes that appear in BLOSUM matrices:
X - any amino acid.
B - aspartic acid or asparagine.
Z - glutamic acid or glutamine.
"""
import re

import pandas as pd
import regex
# This function reads the scoring matrix and places it into a dataframe.
def readScoringMatrixFile(scoringMatrixFile):
    """Parse the text of a scoring matrix into a square, label-indexed DataFrame.

    A line counts as a matrix row when it starts with an uppercase ASCII
    letter (the row label) that is followed by a run of whitespace-separated
    integers. Every other line is skipped: blank lines, '#' comments, a
    header of column letters, and rows labelled with a non-letter such as '*'.

    The header line is not parsed; columns are assumed to appear in the same
    order as the rows, so the row labels are reused as the column labels.

    Args:
        scoringMatrixFile: The full contents of the matrix file as a single
            string (not a path and not an open file object).

    Returns:
        A pandas DataFrame (object dtype) whose index and columns are both
        the row labels in file order. Cell [i, j] holds the j-th score of the
        i-th row as the text read from the file (for example "-1"); convert
        with int() before doing arithmetic. Scores beyond the number of rows,
        such as a trailing '*' column, are dropped. Input without any matrix
        row yields an empty DataFrame.

    Raises:
        ValueError: If a matrix row holds fewer scores than there are rows.
    """
    # Signed integers of any length, so scores such as -10 or 100 stay in one
    # piece instead of being split into several columns.
    scoreRunPattern = re.compile(r"\s*[-+]?\d+(?:\s+[-+]?\d+)*")

    rowLabels = []
    rowScores = []
    for line in scoringMatrixFile.splitlines():
        label = line[:1]
        if not ("A" <= label <= "Z"):
            continue
        # Match right after the label; a letter-led line without scores
        # (header or title text) is not a matrix row.
        scoreRun = scoreRunPattern.match(line, 1)
        if scoreRun is None:
            continue
        rowLabels.append(label)
        rowScores.append(scoreRun.group().split())

    size = len(rowLabels)
    for label, scores in zip(rowLabels, rowScores):
        if len(scores) < size:
            raise ValueError(
                f"Row '{label}' has {len(scores)} scores but the matrix "
                f"has {size} rows"
            )

    # dtype=object keeps the cells as plain Python strings, as they were when
    # the frame was filled one cell at a time.
    return pd.DataFrame(
        [scores[:size] for scores in rowScores],
        index=rowLabels,
        columns=rowLabels,
        dtype=object,
    )