Skip to content

Commit

Permalink
Merge pull request #13 from andersen-lab/dev
Browse files Browse the repository at this point in the history
New helper functions, fixed context-dependent update issue
  • Loading branch information
joshuailevy authored Nov 23, 2021
2 parents 1b1407d + c27ca79 commit 372db37
Show file tree
Hide file tree
Showing 20 changed files with 1,551 additions and 32,297 deletions.
29 changes: 26 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ We can then run Freyja on the output files using the commmand:
```
freyja demix [variants-file] [depth-file] --output [output-file]
```
This outputs to a tsv file that includes the lineages present, their corresponding abundances, and summarization by constellation.
This outputs to a tsv file that includes the lineages present, their corresponding abundances, and summarization by constellation. This method also includes a `--eps` option, which enables the user to define the minimum lineage abundance returned to the user (e.g. `--eps 0.0001`).

---
### Additional options
Expand All @@ -48,7 +48,30 @@ freyja update
```
which downloads new versions of the curated lineage file as well as the UShER global phylogenetic [tree](http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/), which is subsequently converted into barcodes and saved in "data/usher_barcodes.csv".

---
For rapid visualization of results, we also offer two utility methods for manipulating the "demixed" output files. The first is an aggregation method

```
freyja aggregate [directory-of-output-files] --output [aggregated-filename.tsv]
```
This resulting aggregated data can analyzed directly as a tsv file, or can be visualized using

```
freyja plot [aggregated-filename-tsv] --output [plot-filename(.pdf,.png,etc.)]
```
which provides a fractional abundance estimate for all aggregated samples. To modify the provide a lineage specific breakdown, the `--lineages` flag can be used. Example outputs:

|**Summarized** | **Lineage-Specific**|
| :---: | :---: |
|![Summarized](freyja/data/testSummary.png) | ![Lineage-Specific](freyja/data/test0.png)|

If users wish to include sample collection time information, this can be done using

```
freyja plot [aggregated-filename-tsv] --output [plot-filename(.pdf,.png,etc.)] --times [times_metadata.csv(note csv!)] --interval [MS or D (month/day bins)]
```

Acknowledgements
When using the `--interval D` option, the `--windowsize NN` should also be specified, where `NN` is the width of the rolling average window. See `freyja/data/times_metadata.csv` for an example collection time metadata file. Example outputs:

|**Month binning** | **Daily binning (with smoothing)**|
| :---: | :---: |
|![Monthly](freyja/data/test2.png) | ![Daily-Smoothed](freyja/data/test.png)|
1 change: 1 addition & 0 deletions ci/conda_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ coveralls
ivar
samtools
usher
matplotlib
1 change: 1 addition & 0 deletions freyja.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ freyja/data/lineagePaths.txt
freyja/data/mixture.depth
freyja/data/mixture.tsv
freyja/data/public-latest.all.masked.pb.gz
freyja/data/test
freyja/data/test.bam
freyja/data/test.depth
freyja/data/test.tsv
Expand Down
41 changes: 38 additions & 3 deletions freyja/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
reindex_dfs, map_to_constellation, solve_demixing_problem
from freyja.updates import download_tree, convert_tree,\
get_curated_lineage_data
from freyja.utils import agg, makePlot_simple, makePlot_time
import os
import subprocess
import sys
Expand All @@ -21,9 +22,10 @@ def cli():
@cli.command()
@click.argument('variants', type=click.Path(exists=True))
@click.argument('depths', type=click.Path(exists=True))
@click.option('--eps', default=1e-3, help='minimum abundance to include')
@click.option('--output', default='demixing_result.csv', help='Output file',
type=click.Path(exists=False))
def demix(variants, depths, output):
def demix(variants, depths, output, eps):
locDir = os.path.abspath(os.path.join(os.path.realpath(__file__),
os.pardir))
df_barcodes = pd.read_csv(os.path.join(locDir,
Expand All @@ -37,7 +39,8 @@ def demix(variants, depths, output):
df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_)
sample_strains, abundances, error = solve_demixing_problem(df_barcodes,
mix,
depths_)
depths_,
eps)
localDict = map_to_constellation(sample_strains, abundances, mapDict)
# assemble into series and write.
sols_df = pd.Series(data=(localDict, sample_strains, abundances, error),
Expand All @@ -57,14 +60,15 @@ def update():
print("Converting tree info to barcodes")
convert_tree() # returns paths for each lineage
# Now parse into barcode form
lineagePath = os.path.join(locDir, "data/lineagePaths.txt")
lineagePath = os.path.join(os.curdir, "lineagePaths.txt")
print('Building barcodes from global phylogenetic tree')
df = pd.read_csv(lineagePath, sep='\t')
df = parse_tree_paths(df)
df_barcodes = convert_to_barcodes(df)
df_barcodes = reversion_checking(df_barcodes)
df_barcodes.to_csv(os.path.join(locDir, 'data/usher_barcodes.csv'))
# delete files generated along the way that aren't needed anymore
print('Cleaning up')
os.remove(lineagePath)
os.remove(os.path.join(locDir, "data/public-latest.all.masked.pb.gz"))

Expand All @@ -88,5 +92,36 @@ def variants(bamfile, ref, variants, depths):
sys.exit(completed.returncode)


@cli.command()
@click.argument('results', type=click.Path(exists=True))
@click.option('--output', default='aggregated_result.tsv', help='Output file',
type=click.Path(exists=False))
def aggregate(results, output):
df_demix = agg(results)
df_demix.to_csv(output, sep='\t')


@cli.command()
@click.argument('agg_results', type=click.Path(exists=True))
@click.option('--lineages', is_flag=True)
@click.option('--times', default='-1')
@click.option('--interval', default='MS')
@click.option('--output', default='mix_plot.pdf', help='Output file')
@click.option('--windowsize', default=14)
def plot(agg_results, lineages, times, interval, output, windowsize):
agg_df = pd.read_csv(agg_results, skipinitialspace=True, sep='\t',
index_col=0)
if times == '-1':
# make basic plot, without time info
makePlot_simple(agg_df, lineages, output)
else:
# make time aware plot
times_df = pd.read_csv(times, skipinitialspace=True,
index_col=0)
times_df['sample_collection_datetime'] = \
pd.to_datetime(times_df['sample_collection_datetime'])
makePlot_time(agg_df, lineages, times_df, interval, output, windowsize)


if __name__ == '__main__':
cli()
11 changes: 11 additions & 0 deletions freyja/data/agg_outputs.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
summarized lineages abundances resid
mixture.tsv [('A', 0.36663599999788443), ('Alpha', 0.3331250000063474), ('Delta', 0.14550199999830338), ('Gamma', 0.07740219999875769), ('Beta', 0.07423700000338719), ('Other', 0.0030978002759573125)] "['A' 'Q.3' 'AY.48' 'P.1.7' 'B.1.351.2' 'B.1.1.7' 'B.1.617.2' 'B.47'
'B.1.351' 'P.1']" "[0.366636 0.308375 0.133008 0.0763636 0.0721154 0.02475 0.012494
0.0030978 0.0021216 0.0010386]" 6.117163565290815
test.tsv [('A', 0.6035859999995311), ('Delta', 0.3760700299983552), ('Other', 0.020343969998721814)] "['A' 'AY.48' 'B.1.143' 'B.1.1.208' 'A.4' 'B.1.399' 'B.1.187' 'B.1.545'
'AY.56' 'B.1.401' 'B.1.151' 'B.1.1.305' 'AY.92' 'B.1.81' 'AQ.1' 'B.1.214'
'B.1.78' 'AY.119']" "[6.00451200e-01 3.74631000e-01 8.56531000e-03 3.62976000e-03
3.13480000e-03 1.48445800e-03 1.29199000e-03 1.27389000e-03
8.24402000e-04 7.58150000e-04 6.35728000e-04 6.26173999e-04
6.14627999e-04 5.34830016e-04 5.10465000e-04 4.34931999e-04
4.33462998e-04 1.64819996e-04]" 4.797486607629358
Loading

0 comments on commit 372db37

Please sign in to comment.