Merge pull request #8 from joshuailevy/unit-test-adding

flake8 passing
andersen-lab · Nov 1, 2021 · 0a15c8a · 0a15c8a
2 parents dd87082 + acfba92
commit 0a15c8a
Show file tree

Hide file tree

Showing 9 changed files with 106 additions and 713 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/Makefile b/Makefile
@@ -20,6 +20,7 @@ test-install: all
 
 test-cov: all
 	py.test --cov=freyja
+	coveralls
 
 install: all
 	$(PYTHON) setup.py install

diff --git a/ci/conda_requirements.txt b/ci/conda_requirements.txt
@@ -5,3 +5,4 @@ tqdm
 cvxpy
 pytest
 flake8
+coveralls
diff --git a/freyja.egg-info/requires.txt b/freyja.egg-info/requires.txt
@@ -1,5 +1,4 @@
 click
 numpy
 pandas
-tqdm
 cvxpy
diff --git a/freyja/_cli.py b/freyja/_cli.py
@@ -21,7 +21,8 @@ def barcode(filename):
     df = parse_tree_paths(df)
     df_barcodes = convert_to_barcodes(df)
     df_barcodes = reversion_checking(df_barcodes)
-    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
+    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
+                                          os.pardir))
     df_barcodes.to_csv(os.path.join(locDir, 'data/usher_barcodes.csv'))
 
 
@@ -31,33 +32,44 @@ def barcode(filename):
 @click.option('--output', default='demixing_result.csv', help='Output file',
               type=click.Path(exists=False))
 def demix(variants, depths, output):
-    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
-    df_barcodes = pd.read_csv(os.path.join(locDir, 'data/usher_barcodes.csv'), index_col=0)
+    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
+                             os.pardir))
+    df_barcodes = pd.read_csv(os.path.join(locDir,
+                              'data/usher_barcodes.csv'), index_col=0)
     muts = list(df_barcodes.columns)
     mapDict = buildLineageMap()
     print('building mix/depth matrices')
     # assemble data from of (possibly) mixed samples
     mix, depths_ = build_mix_and_depth_arrays(variants, depths, muts)
     print('demixing')
     df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_)
-    sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths_)
+    sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
+                                                               mix,
+                                                               depths_)
     localDict = map_to_constellation(sample_strains, abundances, mapDict)
     # assemble into series and write.
     sols_df = pd.Series(data=(localDict, sample_strains, abundances, error),
-                        index=['summarized', 'lineages', 'abundances', 'resid'],
+                        index=['summarized', 'lineages',
+                        'abundances', 'resid'],
                         name=mix.name)
     sols_df.to_csv(output, sep='\t')
 
 
 @cli.command()
 def update():
     print('Downloading a new global tree')
-    url = 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/public-latest.all.masked.pb.gz'
-    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir))
-    urllib.request.urlretrieve(url, os.path.join(locDir, "data/public-latest.all.masked.pb.gz"))
+    url = 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/\
+    UShER_SARS-CoV-2/public-latest.all.masked.pb.gz'
+    locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
+                                          os.pardir))
+    urllib.request.urlretrieve(url, os.path.join(locDir,
+                               "data/public-latest.all.masked.pb.gz"))
     print('Downloading an updated curated lineage set from outbreak.info')
-    url2 = 'https://raw.githubusercontent.com/outbreak-info/outbreak.info/master/web/src/assets/genomics/curated_lineages.json'
-    urllib.request.urlretrieve(url2, os.path.join(locDir, "data/curated_lineages.json"))
+    url2 = 'https://raw.githubusercontent.com/outbreak-info/outbreak.info/\
+    master/web/src/assets/genomics/curated_lineages.json'
+    urllib.request.urlretrieve(url2,
+                               os.path.join(locDir,
+                                            "data/curated_lineages.json"))
 
 
 if __name__ == '__main__':

diff --git a/freyja/convert_paths2barcodes.py b/freyja/convert_paths2barcodes.py
@@ -8,7 +8,8 @@ def parse_tree_paths(df):
     # Make sure to check with new tree versions, lineages could get trimmed.
     df = df.drop_duplicates(keep='last')
     df['from_tree_root'] = df['from_tree_root'].fillna('')
-    df['from_tree_root'] = df['from_tree_root'].apply(lambda x: x.replace(' ', '').strip('>').split('>'))
+    df['from_tree_root'] = df['from_tree_root']\
+        .apply(lambda x: x.replace(' ', '').strip('>').split('>'))
     return df
 
 
@@ -22,7 +23,8 @@ def convert_to_barcodes(df):
     df_barcodes = pd.DataFrame()
     for clade in df.index:
         # sparse,binary encoding
-        cladeSeries = pd.Series({c: 1 for c in df.loc[clade, 'from_tree_root']}, name=clade)
+        cladeSeries = pd.Series({c: 1 for c in
+                                 df.loc[clade, 'from_tree_root']}, name=clade)
         df_barcodes = df_barcodes.append(cladeSeries)
 
     print('separating combined splits')
@@ -49,13 +51,16 @@ def reversion_checking(df_barcodes):
     flips = [d[-1] + d[1:len(d)-1]+d[0] for d in df_barcodes.columns]
     flipPairs = [d for d in df_barcodes.columns if d in flips]
     flipPairs.sort(key=sortFun)
-    flipPairs = [[flipPairs[j], flipPairs[j+1]] for j in np.arange(0, len(flipPairs), 2)]
+    flipPairs = [[flipPairs[j], flipPairs[j+1]]
+                 for j in np.arange(0, len(flipPairs), 2)]
     # subtract lower of two pair counts to get the lineage defining mutations
     for fp in flipPairs:
-        df_barcodes[fp] = df_barcodes[fp].subtract(df_barcodes[fp].min(axis=1), axis=0)
+        df_barcodes[fp] = df_barcodes[fp].subtract(df_barcodes[fp].min(axis=1),
+                                                   axis=0)
 
     # drop all unused mutations (i.e. paired mutations with reversions)
-    df_barcodes = df_barcodes.drop(columns=df_barcodes.columns[df_barcodes.sum(axis=0) == 0])
+    df_barcodes = df_barcodes.drop(
+                columns=df_barcodes.columns[df_barcodes.sum(axis=0) == 0])
     return df_barcodes
 
 

diff --git a/freyja/sample_deconv.py b/freyja/sample_deconv.py
@@ -13,11 +13,11 @@ def buildLineageMap():
     f0.close()
 
     mapDict = {}
-    for l in range(len(dat)):
-        if 'who_name' in dat[l].keys():
-            for d0 in dat[l]['pango_descendants']:
-                if dat[l]['who_name'] is not None:
-                    mapDict[d0] = dat[l]['who_name']
+    for ind in range(len(dat)):
+        if 'who_name' in dat[ind].keys():
+            for d0 in dat[ind]['pango_descendants']:
+                if dat[ind]['who_name'] is not None:
+                    mapDict[d0] = dat[ind]['who_name']
     return mapDict
 
 
@@ -31,20 +31,24 @@ def build_mix_and_depth_arrays(fn, depthFn, muts):
     keptInds = set(muts) & set(df.index)
     mix = df.loc[keptInds, 'ALT_FREQ'].astype(float)
     mix.name = fn
-    depths = pd.Series({kI: df_depth.loc[int(re.findall(r'\d+', kI)[0]), 3].astype(float) for kI in muts}, name=fn)
+    depths = pd.Series({kI: df_depth.loc[int(re.findall(r'\d+', kI)[0]), 3]
+                        .astype(float) for kI in muts}, name=fn)
     return mix, depths
 
 
 def reindex_dfs(df_barcodes, mix, depths):
     # first, drop Nextstrain clade names.
-    df_barcodes = df_barcodes.drop(index=['20A', '20C', '20D', '20H(Beta,V2)', '21C(Epsilon)'])
+    df_barcodes = df_barcodes.drop(index=['20A', '20C',
+                                          '20D', '20H(Beta,V2)',
+                                          '21C(Epsilon)'])
     # reindex everything to match across the dfs
     df_barcodes = df_barcodes.reindex(sorted(df_barcodes.columns), axis=1)
     mix = mix.reindex(df_barcodes.columns).fillna(0.)
 
     mix_as_set = set(mix.index)
     # dropping extra sequencing depth info we don't need
-    depths = depths.drop(labels=[m0 for m0 in df_barcodes.columns if m0 not in mix_as_set])
+    depths = depths.drop(labels=[m0 for m0 in df_barcodes.columns
+                                 if m0 not in mix_as_set])
     depths = depths.reindex(df_barcodes.columns).fillna(0.)
     return df_barcodes, mix, depths
 
@@ -118,8 +122,11 @@ def solve_demixing_problem(df_barcodes, mix, depths):
     mix, depths_ = build_mix_and_depth_arrays(variants, depths, muts)
     print('demixing')
     df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_)
-    sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths_)
+    sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
+                                                               mix, depths_)
     localDict = map_to_constellation(sample_strains, abundances, mapDict)
     # assemble into series and write.
     sols_df = pd.Series(data=(localDict, sample_strains, abundances, error),
-    index=['summarized', 'lineages', 'abundances', 'resid'], name=mix.name)
+                        index=['summarized', 'lineages',
+                               'abundances', 'resid'],
+                        name=mix.name)
diff --git a/freyja/tests/test_barcoding.py b/freyja/tests/test_barcoding.py
@@ -8,9 +8,13 @@
 class BarcodeTests(unittest.TestCase):
 
     def test_parse_tree_paths(self):
-        df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': ['> T1234G > A122C', None, '> A15T > A122C']})
+        df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
+                                'from_tree_root': ['> T1234G > A122C',
+                                                   None, '> A15T > A122C']})
         df_test = parse_tree_paths(df_test)
-        df_ideal = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': [['T1234G', 'A122C'], [''], ['A15T', 'A122C']]})
+        df_ideal = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
+                                 'from_tree_root': [['T1234G', 'A122C'],
+                                                    [''], ['A15T', 'A122C']]})
         df_ideal = df_ideal.set_index('clade')
         pdt.assert_frame_equal(df_test, df_ideal)
 
@@ -19,15 +23,23 @@ def test_sortFun(self):
         self.assertTrue(1234 == sortFun(test_string))
 
     def test_convert_to_barcodes(self):
-        df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'], 'from_tree_root': [['T1234G', 'A122C'], [''], ['A15T', 'A122C']]})
+        df_test = pd.DataFrame({'clade': ['T0', 'T1', 'T2'],
+                                'from_tree_root': [['T1234G', 'A122C'],
+                                                   [''], ['A15T', 'A122C']]})
         df_barcode_test = convert_to_barcodes(df_test)
-        df_barcode_ideal = pd.DataFrame({'A122C': [1., 0., 1.], 'A15T': [0., 0., 1.], 'T1234G': [1., 0., 0.]})
+        df_barcode_ideal = pd.DataFrame({'A122C': [1., 0., 1.],
+                                         'A15T': [0., 0., 1.],
+                                        'T1234G': [1., 0., 0.]})
         pdt.assert_frame_equal(df_barcode_test, df_barcode_ideal)
 
     def test_reversion_checking(self):
-        df_barcode = pd.DataFrame({'A122C': [1., 0., 1.], 'C122A': [0., 0., 1.], 'T1234G': [1., 0., 0.], 'T12346G': [0., 0., 0.]})
+        df_barcode = pd.DataFrame({'A122C': [1., 0., 1.],
+                                   'C122A': [0., 0., 1.],
+                                   'T1234G': [1., 0., 0.],
+                                   'T12346G': [0., 0., 0.]})
         df_barcode_checked = reversion_checking(df_barcode)
-        df_barcode_checked_ideal = pd.DataFrame({'A122C': [1., 0., 0.], 'T1234G': [1., 0., 0.]})
+        df_barcode_checked_ideal = pd.DataFrame({'A122C': [1., 0., 0.],
+                                                 'T1234G': [1., 0., 0.]})
         pdt.assert_frame_equal(df_barcode_checked, df_barcode_checked_ideal)
 
 

diff --git a/freyja/tests/test_deconv.py b/freyja/tests/test_deconv.py
@@ -14,7 +14,8 @@ def test_buildLineageMap(self):
         self.assertTrue('Delta' == mapDict['AY.4'])
 
     def test_build_mix_and_depth_plus_reindex(self):
-        df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv', index_col=0)
+        df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv',
+                                  index_col=0)
         muts = list(df_barcodes.columns)
         varFn = 'freyja/data/mixture.tsv'
         depthFn = 'freyja/data/mixture.depth'
@@ -34,21 +35,27 @@ def test_constellation_mapping(self):
         vals = [0.1, 0.5, 0.31, 0.01, 0.02, 0.01]
         strains = ['B.1.617.2', 'Q.3', 'B.1.427', 'A.2.5', 'B.1.1', 'B.1.1.7']
         locDict = map_to_constellation(strains, vals, mapDict)
-        locDict_ideal = {'Alpha': 0.51, 'Epsilon': 0.31, 'Delta': 0.1, 'Other': 0.02, 'Aaron': 0.01}
+        locDict_ideal = {'Alpha': 0.51, 'Epsilon': 0.31, 'Delta': 0.1,
+                         'Other': 0.02, 'Aaron': 0.01}
         self.assertTrue(locDict == list(locDict_ideal.items()))
 
     def test_demixing(self):
-        df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv', index_col=0)
+        df_barcodes = pd.read_csv('freyja/data/usher_barcodes.csv',
+                                  index_col=0)
         # build crude in silico mixture from barcodes, perform recovery
         strain1 = 'B.1.1.7'
         strain2 = 'B.1.427'
         mixFracs = [0.4, 0.6]
-        mix = mixFracs[0]*df_barcodes.loc[strain1, ]+mixFracs[1]*df_barcodes.loc[strain2, ]
+        mix = mixFracs[0]*df_barcodes.loc[strain1, ]\
+            + mixFracs[1]*df_barcodes.loc[strain2, ]
         # generate random sequencing depth at each position
         depths = negative_binomial(50, 0.25, size=len(mix))
-        sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths)
-        self.assertAlmostEqual(abundances[sample_strains.tolist().index(strain1)], mixFracs[0])
-        self.assertAlmostEqual(abundances[sample_strains.tolist().index(strain2)], mixFracs[1])
+        sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
+                                                                   mix, depths)
+        self.assertAlmostEqual(
+            abundances[sample_strains.tolist().index(strain1)], mixFracs[0])
+        self.assertAlmostEqual(
+            abundances[sample_strains.tolist().index(strain2)], mixFracs[1])
 
 
 if __name__ == '__main__':
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ tqdm @@
     cvxpy
     pytest
     flake8
+    coveralls