Skip to content

Commit

Permalink
Merge pull request #8 from joshuailevy/unit-test-adding
Browse files Browse the repository at this point in the history
flake8 passing
  • Loading branch information
joshuailevy authored Nov 1, 2021
2 parents dd87082 + acfba92 commit 0a15c8a
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 713 deletions.
697 changes: 23 additions & 674 deletions LICENSE

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ test-install: all

test-cov: all
py.test --cov=freyja
coveralls

install: all
$(PYTHON) setup.py install
Expand Down
1 change: 1 addition & 0 deletions ci/conda_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ tqdm
cvxpy
pytest
flake8
coveralls
1 change: 0 additions & 1 deletion freyja.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
click
numpy
pandas
tqdm
cvxpy
32 changes: 22 additions & 10 deletions freyja/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def barcode(filename):
df = parse_tree_paths(df)
df_barcodes = convert_to_barcodes(df)
df_barcodes = reversion_checking(df_barcodes)
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
os.pardir))
df_barcodes.to_csv(os.path.join(locDir, 'data/usher_barcodes.csv'))


Expand All @@ -31,33 +32,44 @@ def barcode(filename):
@click.option('--output', default='demixing_result.csv', help='Output file',
type=click.Path(exists=False))
def demix(variants, depths, output):
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
df_barcodes = pd.read_csv(os.path.join(locDir, 'data/usher_barcodes.csv'), index_col=0)
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
os.pardir))
df_barcodes = pd.read_csv(os.path.join(locDir,
'data/usher_barcodes.csv'), index_col=0)
muts = list(df_barcodes.columns)
mapDict = buildLineageMap()
print('building mix/depth matrices')
# assemble data from of (possibly) mixed samples
mix, depths_ = build_mix_and_depth_arrays(variants, depths, muts)
print('demixing')
df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_)
sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths_)
sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
mix,
depths_)
localDict = map_to_constellation(sample_strains, abundances, mapDict)
# assemble into series and write.
sols_df = pd.Series(data=(localDict, sample_strains, abundances, error),
index=['summarized', 'lineages', 'abundances', 'resid'],
index=['summarized', 'lineages',
'abundances', 'resid'],
name=mix.name)
sols_df.to_csv(output, sep='\t')


@cli.command()
def update():
print('Downloading a new global tree')
url = 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/public-latest.all.masked.pb.gz'
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
urllib.request.urlretrieve(url, os.path.join(locDir, "data/public-latest.all.masked.pb.gz"))
url = 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/\
UShER_SARS-CoV-2/public-latest.all.masked.pb.gz'
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
os.pardir))
urllib.request.urlretrieve(url, os.path.join(locDir,
"data/public-latest.all.masked.pb.gz"))
print('Downloading an updated curated lineage set from outbreak.info')
url2 = 'https://raw.githubusercontent.com/outbreak-info/outbreak.info/master/web/src/assets/genomics/curated_lineages.json'
urllib.request.urlretrieve(url2, os.path.join(locDir, "data/curated_lineages.json"))
url2 = 'https://raw.githubusercontent.com/outbreak-info/outbreak.info/\
master/web/src/assets/genomics/curated_lineages.json'
urllib.request.urlretrieve(url2,
os.path.join(locDir,
"data/curated_lineages.json"))


if __name__ == '__main__':
Expand Down
15 changes: 10 additions & 5 deletions freyja/convert_paths2barcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def parse_tree_paths(df):
# Make sure to check with new tree versions, lineages could get trimmed.
df = df.drop_duplicates(keep='last')
df['from_tree_root'] = df['from_tree_root'].fillna('')
df['from_tree_root'] = df['from_tree_root'].apply(lambda x: x.replace(' ', '').strip('>').split('>'))
df['from_tree_root'] = df['from_tree_root']\
.apply(lambda x: x.replace(' ', '').strip('>').split('>'))
return df


Expand All @@ -22,7 +23,8 @@ def convert_to_barcodes(df):
df_barcodes = pd.DataFrame()
for clade in df.index:
# sparse,binary encoding
cladeSeries = pd.Series({c: 1 for c in df.loc[clade, 'from_tree_root']}, name=clade)
cladeSeries = pd.Series({c: 1 for c in
df.loc[clade, 'from_tree_root']}, name=clade)
df_barcodes = df_barcodes.append(cladeSeries)

print('separating combined splits')
Expand All @@ -49,13 +51,16 @@ def reversion_checking(df_barcodes):
flips = [d[-1] + d[1:len(d)-1]+d[0] for d in df_barcodes.columns]
flipPairs = [d for d in df_barcodes.columns if d in flips]
flipPairs.sort(key=sortFun)
flipPairs = [[flipPairs[j], flipPairs[j+1]] for j in np.arange(0, len(flipPairs), 2)]
flipPairs = [[flipPairs[j], flipPairs[j+1]]
for j in np.arange(0, len(flipPairs), 2)]
# subtract lower of two pair counts to get the lineage defining mutations
for fp in flipPairs:
df_barcodes[fp] = df_barcodes[fp].subtract(df_barcodes[fp].min(axis=1), axis=0)
df_barcodes[fp] = df_barcodes[fp].subtract(df_barcodes[fp].min(axis=1),
axis=0)

# drop all unused mutations (i.e. paired mutations with reversions)
df_barcodes = df_barcodes.drop(columns=df_barcodes.columns[df_barcodes.sum(axis=0) == 0])
df_barcodes = df_barcodes.drop(
columns=df_barcodes.columns[df_barcodes.sum(axis=0) == 0])
return df_barcodes


Expand Down
27 changes: 17 additions & 10 deletions freyja/sample_deconv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ def buildLineageMap():
f0.close()

mapDict = {}
for l in range(len(dat)):
if 'who_name' in dat[l].keys():
for d0 in dat[l]['pango_descendants']:
if dat[l]['who_name'] is not None:
mapDict[d0] = dat[l]['who_name']
for ind in range(len(dat)):
if 'who_name' in dat[ind].keys():
for d0 in dat[ind]['pango_descendants']:
if dat[ind]['who_name'] is not None:
mapDict[d0] = dat[ind]['who_name']
return mapDict


Expand All @@ -31,20 +31,24 @@ def build_mix_and_depth_arrays(fn, depthFn, muts):
keptInds = set(muts) & set(df.index)
mix = df.loc[keptInds, 'ALT_FREQ'].astype(float)
mix.name = fn
depths = pd.Series({kI: df_depth.loc[int(re.findall(r'\d+', kI)[0]), 3].astype(float) for kI in muts}, name=fn)
depths = pd.Series({kI: df_depth.loc[int(re.findall(r'\d+', kI)[0]), 3]
.astype(float) for kI in muts}, name=fn)
return mix, depths


def reindex_dfs(df_barcodes, mix, depths):
# first, drop Nextstrain clade names.
df_barcodes = df_barcodes.drop(index=['20A', '20C', '20D', '20H(Beta,V2)', '21C(Epsilon)'])
df_barcodes = df_barcodes.drop(index=['20A', '20C',
'20D', '20H(Beta,V2)',
'21C(Epsilon)'])
# reindex everything to match across the dfs
df_barcodes = df_barcodes.reindex(sorted(df_barcodes.columns), axis=1)
mix = mix.reindex(df_barcodes.columns).fillna(0.)

mix_as_set = set(mix.index)
# dropping extra sequencing depth info we don't need
depths = depths.drop(labels=[m0 for m0 in df_barcodes.columns if m0 not in mix_as_set])
depths = depths.drop(labels=[m0 for m0 in df_barcodes.columns
if m0 not in mix_as_set])
depths = depths.reindex(df_barcodes.columns).fillna(0.)
return df_barcodes, mix, depths

Expand Down Expand Up @@ -118,8 +122,11 @@ def solve_demixing_problem(df_barcodes, mix, depths):
mix, depths_ = build_mix_and_depth_arrays(variants, depths, muts)
print('demixing')
df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_)
sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths_)
sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
mix, depths_)
localDict = map_to_constellation(sample_strains, abundances, mapDict)
# assemble into series and write.
sols_df = pd.Series(data=(localDict, sample_strains, abundances, error),
index=['summarized', 'lineages', 'abundances', 'resid'], name=mix.name)
index=['summarized', 'lineages',
'abundances', 'resid'],
name=mix.name)
24 changes: 18 additions & 6 deletions freyja/tests/test_barcoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
class BarcodeTests(unittest.TestCase):

def test_parse_tree_paths(self):
df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': ['> T1234G > A122C', None, '> A15T > A122C']})
df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
'from_tree_root': ['> T1234G > A122C',
None, '> A15T > A122C']})
df_test = parse_tree_paths(df_test)
df_ideal = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': [['T1234G', 'A122C'], [''], ['A15T', 'A122C']]})
df_ideal = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
'from_tree_root': [['T1234G', 'A122C'],
[''], ['A15T', 'A122C']]})
df_ideal = df_ideal.set_index('clade')
pdt.assert_frame_equal(df_test, df_ideal)

Expand All @@ -19,15 +23,23 @@ def test_sortFun(self):
self.assertTrue(1234 == sortFun(test_string))

def test_convert_to_barcodes(self):
df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': [['T1234G', 'A122C'], [''], ['A15T', 'A122C']]})
df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
'from_tree_root': [['T1234G', 'A122C'],
[''], ['A15T', 'A122C']]})
df_barcode_test = convert_to_barcodes(df_test)
df_barcode_ideal = pd.DataFrame({'A122C': [1., 0., 1.], 'A15T': [0., 0., 1.], 'T1234G': [1., 0., 0.]})
df_barcode_ideal = pd.DataFrame({'A122C': [1., 0., 1.],
'A15T': [0., 0., 1.],
'T1234G': [1., 0., 0.]})
pdt.assert_frame_equal(df_barcode_test, df_barcode_ideal)

def test_reversion_checking(self):
df_barcode = pd.DataFrame({'A122C': [1., 0., 1.], 'C122A': [0., 0., 1.], 'T1234G': [1., 0., 0.], 'T12346G': [0., 0., 0.]})
df_barcode = pd.DataFrame({'A122C': [1., 0., 1.],
'C122A': [0., 0., 1.],
'T1234G': [1., 0., 0.],
'T12346G': [0., 0., 0.]})
df_barcode_checked = reversion_checking(df_barcode)
df_barcode_checked_ideal = pd.DataFrame({'A122C': [1., 0., 0.], 'T1234G': [1., 0., 0.]})
df_barcode_checked_ideal = pd.DataFrame({'A122C': [1., 0., 0.],
'T1234G': [1., 0., 0.]})
pdt.assert_frame_equal(df_barcode_checked, df_barcode_checked_ideal)


Expand Down
21 changes: 14 additions & 7 deletions freyja/tests/test_deconv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ def test_buildLineageMap(self):
self.assertTrue('Delta' == mapDict['AY.4'])

def test_build_mix_and_depth_plus_reindex(self):
df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv', index_col=0)
df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv',
index_col=0)
muts = list(df_barcodes.columns)
varFn = 'freyja/data/mixture.tsv'
depthFn = 'freyja/data/mixture.depth'
Expand All @@ -34,21 +35,27 @@ def test_constellation_mapping(self):
vals = [0.1, 0.5, 0.31, 0.01, 0.02, 0.01]
strains = ['B.1.617.2', 'Q.3', 'B.1.427', 'A.2.5', 'B.1.1', 'B.1.1.7']
locDict = map_to_constellation(strains, vals, mapDict)
locDict_ideal = {'Alpha': 0.51, 'Epsilon': 0.31, 'Delta': 0.1, 'Other': 0.02, 'Aaron': 0.01}
locDict_ideal = {'Alpha': 0.51, 'Epsilon': 0.31, 'Delta': 0.1,
'Other': 0.02, 'Aaron': 0.01}
self.assertTrue(locDict == list(locDict_ideal.items()))

def test_demixing(self):
df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv', index_col=0)
df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv',
index_col=0)
# build crude in silico mixture from barcodes, perform recovery
strain1 = 'B.1.1.7'
strain2 = 'B.1.427'
mixFracs = [0.4, 0.6]
mix = mixFracs[0]*df_barcodes.loc[strain1, ]+mixFracs[1]*df_barcodes.loc[strain2, ]
mix = mixFracs[0]*df_barcodes.loc[strain1, ]\
+ mixFracs[1]*df_barcodes.loc[strain2, ]
# generate random sequencing depth at each position
depths = negative_binomial(50, 0.25, size=len(mix))
sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths)
self.assertAlmostEqual(abundances[sample_strains.tolist().index(strain1)], mixFracs[0])
self.assertAlmostEqual(abundances[sample_strains.tolist().index(strain2)], mixFracs[1])
sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
mix, depths)
self.assertAlmostEqual(
abundances[sample_strains.tolist().index(strain1)], mixFracs[0])
self.assertAlmostEqual(
abundances[sample_strains.tolist().index(strain2)], mixFracs[1])


if __name__ == '__main__':
Expand Down

0 comments on commit 0a15c8a

Please sign in to comment.