From 8cb5281d2a8c8cf8ae7d6feba376601744160fd4 Mon Sep 17 00:00:00 2001 From: Yu Wan Date: Wed, 11 Nov 2020 21:23:13 +0000 Subject: [PATCH 1/4] Create subdirectory 'ariba' --- ariba/README.md | 11 ++++++++++ ariba/pool_seqs.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 ariba/README.md create mode 100644 ariba/pool_seqs.py diff --git a/ariba/README.md b/ariba/README.md new file mode 100644 index 0000000..192c1b0 --- /dev/null +++ b/ariba/README.md @@ -0,0 +1,11 @@ +# Converting ARIBA outputs to an allelic PAM + +Yu Wan + + + +## Steps + +### Modifying summary table + +Replace both "\_report.tsv" and ".match" with "". \ No newline at end of file diff --git a/ariba/pool_seqs.py b/ariba/pool_seqs.py new file mode 100644 index 0000000..b1231c3 --- /dev/null +++ b/ariba/pool_seqs.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +""" +Pool FASTA files of ARIBA's output allele sequences into one file and append sample names to sequence IDs (in +the same format as that for SRST2's output consensus sequences). + +Command demonstration: + python -i *_genes.fna -o alleles.fna -e '_genes.fna' + +Copyright (C) 2020 Yu Wan +Licensed under the GNU General Public Licence version 3 (GPLv3) . +Publication: 11 Nov 2020; the latest modification: 11 Nov 2020 +""" + +import os +from argparse import ArgumentParser + +def parse_arguments(): + parser = ArgumentParser(description = "Pool ARIBA's output allele sequences into one FASTA file and append sample names to sequence IDs",\ + epilog = "This is a helper script of R package GeneMates.") + parser.add_argument("-i", nargs = "+", dest = "i", type = str, required = True, help = "Input FASTA files") + parser.add_argument("-o", dest = "o", type = str, required = False, default = "alleles.fna", help = "Output FASTA file of pooled allele sequences") + parser.add_argument("-e", dest = "e", type = str, required = False, default = "_genes.fna", help = "Filename extension to be removed for sample names") + return parser.parse_args() + +def main(): + args = parse_arguments() + fasta_out = open(args.o, "w") + sample_count = 0 + for fasta_in in args.i: + f_in = open(fasta_in, "r") + fasta_in = os.path.basename(fasta_in) + sample = fasta_in.replace(args.e, "") # No change applies if args.e is not found in the filename + line = f_in.readline() + while line: + if line.startswith(">"): # A header is encountered + fields = line.split(".") # ARIBA uses full stops as delimiters in the header line + new_id = fields[0] + "|" + sample # Example value: ">cluster_1|sample_1" + seq_descr = ".".join(fields[1 : ]) + fasta_out.write(new_id + " " + seq_descr) # Note that the newline character of this line is not stripped off by the readline method. + else: + fasta_out.write(line) + line = f_in.readline() # Till the end of the file, a None value is returned. + f_in.close() + sample_count += 1 + fasta_out.close() + print("%d samples have been processed." 
% sample_count) + return + +if __name__ == "__main__": + main() From 1ed2660bc0ba2bf2d56cf29b006fb4a815e674bf Mon Sep 17 00:00:00 2001 From: Yu Wan Date: Thu, 12 Nov 2020 10:39:41 +0000 Subject: [PATCH 2/4] Completed clusters2pam.py --- ariba/clusters2pam.py | 104 ++++++++++++++++++++++++++++++++++++++++++ ariba/pool_seqs.py | 2 +- 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 ariba/clusters2pam.py diff --git a/ariba/clusters2pam.py b/ariba/clusters2pam.py new file mode 100644 index 0000000..f38823b --- /dev/null +++ b/ariba/clusters2pam.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +""" +Converts output TSV of utility/tabulate_cdhit.py to an allelic presence-absence matrix (PAM) and saves +it in TSV format. + +Command demonstration: + python clusters2pam.py -i alleles.clstr.tsv -om alleles_pam.tsv -ot alleles_clstr_updated.tsv + +Dependency: module pandas, Python v3 + +Copyright (C) 2020 Yu Wan +Licensed under the GNU General Public Licence version 3 (GPLv3) . +Publication: 11 Nov 2020; the latest modification: 12 Nov 2020 +""" + +import os +import sys +import pandas +from argparse import ArgumentParser + +def parse_arguments(): + parser = ArgumentParser(description = "Creating an allelic presence-absence matrix from a table of sequence clusters",\ + epilog = "This is a helper script of R package GeneMates.") + parser.add_argument("-i", dest = "i", type = str, required = True, help = "Input FASTA files") + parser.add_argument("-om", dest = "om", type = str, required = False, default = "allelic_PAM.tsv", help = "Output presence-absence matrix in TSV format") + parser.add_argument("-ot", dest = "ot", type = str, required = False, default = "clusters.tsv", help = "Output table about sequence clusters") + return parser.parse_args() + +def main(): + args = parse_arguments() + + # Import the input TSV file as a data frame + input_tsv = args.i + if os.path.exists(input_tsv): + clusters = parse_seqid(pandas.read_csv(input_tsv, sep = "\t", index_col = None)) # Import the table as a data frame of six columns and parse column 'seqid' + else: + print("Error: the input file is not accessible.", file = sys.stderr) + sys.exit(1) + + # Assign allele IDs and create an allelic PAM from data frame "clusters" + pam = create_allelic_pam(clusters) + pam.to_csv(args.om, header = True, index = False, sep = "\t", float_format = None) + clusters.to_csv(args.ot, header = True, index = False, sep = "\t", float_format = None) + return + +def parse_seqid(df): + """ + Parses column 'seqid' into two new columns and returns a data frame of eight columns. + """ + seqids = df["seqid"].tolist() # Convert a column into a list + genes = list() + samples = list() + + # Create two lists from one column + for item in seqids: + g, s = item.split("|") + genes.append(g) + samples.append(s) + + # Append lists to the data frame as columns + df["gene"] = genes + df["sample"] = samples + return df + +def create_allelic_pam(df): + """ + Assign allele IDs and create an allelic PAM based on clustering results. + """ + samples = get_unique_ids(df, "sample") + cluster_ids = get_unique_ids(df, "cluster") # 0, 1, 2, ... + genes_visited = dict() # A dictionary of genes in processed clusters. 
{gene : number of clusters} + pam = pandas.DataFrame(samples, columns = ["sample"]) # Initalise the output PAM + + # Create a list about presence-absence of each allele and combine it to the output PAM as a column + for c in cluster_ids: # Type of elements: numpy.int64 + df_c = df[df["cluster"] == c] # Select rows of the current cluster + genes_c = df_c["gene"].tolist() # All gene names should be the same when alleles are clustered under 100% nucleotide identity + gene = genes_c[0] + if gene in genes_visited.keys(): + genes_visited[gene] += 1 + allele = gene + "." + str(genes_visited[gene]) # Adding a suffix for making an allele name. Example result: sul1.1, sul1.2. + else: + genes_visited[gene] = 0 # Record a new gene encountered + allele = gene + pa_vec = list() # A binary vector about presence-absence of the current allele across samples + samples_c = df_c["sample"].tolist() # Samples in which the current allele is detected + for s in samples: + pa = 1 if s in samples_c else 0 + pa_vec.append(pa) + pam[allele] = pa_vec + return pam + +def get_unique_ids(df, col_name): + """ + A subordinate function of create_allelic_pam for getting a list of unique and sorted values from a + given column of input data frame df. + """ + ids = list(df[col_name].unique()) + ids.sort(reverse = False) + return ids + +if __name__ == "__main__": + main() diff --git a/ariba/pool_seqs.py b/ariba/pool_seqs.py index b1231c3..5fdd275 100644 --- a/ariba/pool_seqs.py +++ b/ariba/pool_seqs.py @@ -5,7 +5,7 @@ the same format as that for SRST2's output consensus sequences). Command demonstration: - python -i *_genes.fna -o alleles.fna -e '_genes.fna' + python pool_seqs.py -i *_genes.fna -o alleles.fna -e '_genes.fna' Copyright (C) 2020 Yu Wan Licensed under the GNU General Public Licence version 3 (GPLv3) . From 8d871570a0d5872f527485a0dd35c6604f6fc31a Mon Sep 17 00:00:00 2001 From: Yu Wan Date: Sun, 22 Nov 2020 13:11:09 +0000 Subject: [PATCH 3/4] Update code documentation --- README.md | 151 +++++++++++++++++++++++++++++++++++++++++++++++- ariba/README.md | 10 +--- 2 files changed, 149 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f0f82d7..32e01ae 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ - [Dependencies](#Dependencies) - [Subdirectories of code](#Subdirectories) -- [A step-by-step guide for creating a PAM from SRST2 outputs](#guide_srst2) +- [Creating PAMs from SRST2 outputs](#guide_srst2) - [Targeted gene detection with SRST2](#srst2) - [Reliability assessment of allele calls](#uncertainty) - [Creating the allelic PAM](#make_PAM) +- [Creating PAMs from ARIBA outputs](#ariba)
@@ -35,6 +36,8 @@ Wan, Y., Wick, R.R., Zobel, J., Ingle, D.J., Inouye, M., Holt, K.E. GeneMates: a git clone https://github.com/wanyuac/PAMmaker.git ``` + + ### Dependencies PAMmaker requires three code interpreters and a Linux-compatible operating system: @@ -43,6 +46,8 @@ PAMmaker requires three code interpreters and a Linux-compatible operating syste * [Rscript](https://www.r-project.org) (>=3.0) * [Python](https://www.python.org/) (versions 2 and 3 compatible) + + ### Subdirectories of code Subdirectory `utility` stores scripts used by pipeline `mk_allele_matrix.sh`. In addition, there are two subdirectories offering code for evaluating two characteristics of allele calls from SRST2, respectively: @@ -52,7 +57,7 @@ Subdirectory `utility` stores scripts used by pipeline `mk_allele_matrix.sh`. In
-## 2. A step-by-step guide for creating a PAM from SRST2 outputs
+## 2. Creating PAMs from SRST2 outputs This section demonstrates a typical procedure for processing SRST2-formatted results. In this demonstration, we create an allelic PAM and a genetic PAM from detected antimicrobial resistance genes (ARGs). Particularly, we use the SRST2-compatible ARG-ANNOT database version 2 ([ARGannot_r2.fasta](https://github.com/katholt/srst2/blob/master/data/ARGannot_r2.fasta)) as a reference database (hence the gene detection process to be launched is a targeted analysis). @@ -171,4 +176,144 @@ Outputs: - `modified_gene_table.txt`, gene table with allele names updated for cluster information (namely, the table comprised of extended allele names); - `cluster_table.txt`, tabulated cluster information from the result of `cd-hit-est`; - `allele_db.fna`, a FASTA file of alleles in the allelic PAM; -- `allele_name_replacement.txt`, a table linking original allele names to extended allele names based on sequence clustering. \ No newline at end of file +- `allele_name_replacement.txt`, a table linking original allele names to extended allele names based on sequence clustering. + +
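+For orientation, the allelic PAM produced by either workflow is a tab-delimited table with one row per sample and one presence/absence (1/0) column per allele. The snippet below is purely illustrative: the sample and allele names are hypothetical, and the columns are aligned here only for readability.
+
+```text
+sample       aadA2   dfrA12   sul1   sul1.1
+isolate_1    1       1        1      0
+isolate_2    0       0        0      1
+isolate_3    1       1        0      0
+```
+
+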
+
+## 3. Creating PAMs from ARIBA outputs
+
+PAMmaker currently offers scripts for converting ARIBA outputs when a [ResFinder](https://cge.cbs.dtu.dk/services/ResFinder/) database is used. These scripts are stored in the subdirectory `ariba`.
+
+Dependencies:
+
+- Python 3
+- Python module `pandas`
+- [CD-HIT-EST](http://weizhongli-lab.org/cd-hit/)
+
+Desirable software/code:
+
+- [Nextflow](https://www.nextflow.io/)
+- [Singularity](https://sylabs.io/docs/)
+- [portable batch system](https://en.wikipedia.org/wiki/Portable_Batch_System) (PBS)
+- [ARIBA\_toolkit](https://github.com/wanyuac/ARIBA_toolkit)
+
+
+
+### 3.1. Preparing a ResFinder reference database
+
+This step is performed in accordance with a [wiki page](https://github.com/sanger-pathogens/ariba/wiki/Task:-prepareref) of ARIBA.
+
+```bash
+# Take a note of the commit hash number for reproducibility
+# Outputs: resfinder.fa, resfinder.tsv
+ariba getref resfinder resfinder
+
+# Cluster sequences at 80% nucleotide identity
+ariba prepareref -f resfinder.fa -m resfinder.tsv --cdhit_min_id 0.8 resfinder
+```
+
+Essential outputs for our downstream analysis are `02.cdhit.gene.fa` and `02.cdhit.clusters.tsv`. Since the ResFinder database only covers acquired antimicrobial resistance (AMR) genes, the output files `02.cdhit.gene.varonly.fa`, `02.cdhit.noncoding.fa`, and `02.cdhit.noncoding.varonly.fa` are empty.
+
+
+
+### 3.2. Gene detection from short reads
+
+ARIBA takes as input short reads (e.g., those from Illumina sequencers) for gene detection. Assuming Singularity and a PBS have been installed on users' computer systems, a [Nextflow pipeline](https://github.com/wanyuac/ARIBA_toolkit) (`ariba.nf` and its configuration file `ariba.config`) has been developed for users to run ARIBA (installed as a Singularity-compatible Docker image) for detecting AMR genes in multiple samples. This pipeline is particularly useful when a high-performance computing cluster has a restriction on queue sizes. Nonetheless, users may use their preferred method but need to rename the output files to match those listed below.
+
+```bash
+# Install ARIBA's Singularity image through Docker
+singularity pull docker://staphb/ariba # Rename the image file to ariba.sif
+
+# Run the ARIBA Nextflow pipeline for gene detection
+nextflow -Djava.io.tmpdir=$PWD run ariba.nf --fastq "$PWD/reads/*_{1,2}.fastq.gz" --db_dir $HOME/db/resfinder --output_dir output -c ariba.config -profile pbs --queue_size 15 -with-singularity $HOME/software/docker/ariba.sif
+```
+
+An example of information printed on the terminal by Nextflow:
+
+```bash
+N E X T F L O W ~ version 20.07.1
+Launching `ariba.nf` [suspicious_pasteur] - revision: 916cbaaedc
+Successfully created directory output
+Successfully created directory output/report
+Successfully created directory output/gene
+Successfully created directory output/stats
+Successfully created directory output/log
+Successfully created directory output/contig
+executor > pbspro (73)
+[c1/4c3501] process > gene_detection (68) [100%] 72 of 72 ✔
+[b6/427724] process > summarise_reports [100%] 1 of 1 ✔
+Completed at: 11-Nov-2020 11:38:39
+Duration : 34m 56s
+CPU hours : 6.1
+Succeeded : 73
+```
+
+Output files required for downstream steps are:
+
+- `[sample]_genes.fna`: renamed from `assembled_genes.fa.gz`, which comprises the assembled allele sequences of each sample. Users not using this Nextflow pipeline need to decompress and rename `assembled_genes.fa.gz` for each sample accordingly and ensure that all these FASTA files are accessible from a single directory.
+
+Users may also refer to `ariba_summary.csv` for compiled genetic profiles of all samples. This file can be converted into a genetic PAM by substituting 1 and 0 for "yes" and "no" in its content.
+
+
+
+### 3.3. Pooling allele sequences from all samples
+
+This step uses script `pool_seqs.py`.
+
+```bash
+python PAMmaker/ariba/pool_seqs.py -i ./gene/*_genes.fna -o alleles.fna -e '_genes.fna'
+
+# Parameters of the script
+python pool_seqs.py -h
+usage: pool_seqs.py [-h] -i I [I ...] [-o O] [-e E]
+
+Pool ARIBA's output allele sequences into one FASTA file and append sample names to sequence IDs
+
+optional arguments:
+  -h, --help    show this help message and exit
+  -i I [I ...]  Input FASTA files
+  -o O          Output FASTA file of pooled allele sequences
+  -e E          Filename extension to be removed for sample names
+```
+
+Output file: `alleles.fna`, a multi-FASTA file of pooled allele sequences. In this file, a sample name is appended to each sequence ID.
+
+
+
+### 3.4. Clustering pooled allele sequences
+
+Despite the demonstration below, it is not necessary to cluster alleles based on complete sequence identity. Users may want to tolerate a few mismatches for their study. This differs from the SRST2-based pipeline described in Section [2](#guide_srst2).
+
+```bash
+# Clustering based on complete sequence identity
+cd-hit-est -i alleles.fna -o alleles_rep.fna -c 1.0 -d 0 -s 0.0 -aL 1.0 -aS 1.0 -g 1
+
+# Tabulate clustering results from cd-hit-est
+python PAMmaker/utility/tabulate_cdhit.py alleles_rep.fna.clstr > alleles_rep.fna.clstr.tsv
+```
+
+
+
+### 3.5. Creating an allelic PAM from tabulated CD-HIT-EST output
+
+```bash
+python PAMmaker/ariba/clusters2pam.py -i alleles_rep.fna.clstr.tsv -om allelic_PAM.tsv -ot alleles_rep.fna.clstr_updated.tsv
+
+# Script parameters
+python clusters2pam.py -h
+usage: clusters2pam.py [-h] -i I [-om OM] [-ot OT]
+
+Creating an allelic presence-absence matrix from a table of sequence clusters
+
+optional arguments:
+  -h, --help  show this help message and exit
+  -i I        Input FASTA files
+  -om OM      Output presence-absence matrix in TSV format
+  -ot OT      Output table about sequence clusters
+```
+
+Outputs of this example:
+
+- `allelic_PAM.tsv`: The target allelic PAM;
+- `alleles_rep.fna.clstr_updated.tsv`: A table of clustering information, including gene names (`gene`) and sample names (`sample`).
+
diff --git a/ariba/README.md b/ariba/README.md
index 192c1b0..a00799e 100644
--- a/ariba/README.md
+++ b/ariba/README.md
@@ -1,11 +1,3 @@
 # Converting ARIBA outputs to an allelic PAM
 
-Yu Wan
-
-
-
-## Steps
-
-### Modifying summary table
-
-Replace both "\_report.tsv" and ".match" with "".
\ No newline at end of file
+This subdirectory offers scripts for converting ARIBA outputs to an allelic PAM and a genetic PAM. Please see the README in the parent directory for a step-by-step guide.
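+
+## Genetic PAM from `ariba_summary.csv`
+
+As noted in Section 3.2 of the main README, `ariba_summary.csv` can be turned into a genetic PAM by substituting 1 and 0 for "yes" and "no". The sketch below is a minimal, hypothetical helper rather than one of the scripts shipped here; it assumes the default summary layout, in which the first column lists per-sample `*_report.tsv` paths and each cluster column is named `<cluster>.match` with "yes"/"no" values, and it treats any value other than "yes" as absence.
+
+```python
+#!/usr/bin/env python
+# summary2pam.py (illustrative sketch; file and column names are assumptions)
+import pandas
+
+summary = pandas.read_csv("ariba_summary.csv")
+
+# The first column holds per-sample report paths such as "sample_1_report.tsv"
+summary.rename(columns = {summary.columns[0]: "sample"}, inplace = True)
+summary["sample"] = summary["sample"].str.replace("_report.tsv", "", regex = False)
+
+# Drop the ".match" suffix from cluster columns and recode "yes"/"no" as 1/0
+summary.columns = [col.replace(".match", "") for col in summary.columns]
+pam = summary.set_index("sample").applymap(lambda v: 1 if v == "yes" else 0)
+
+pam.to_csv("genetic_PAM.tsv", sep = "\t")
+```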
From 375a28ff6feb22583f923ffbe768f803e1f5bcf9 Mon Sep 17 00:00:00 2001 From: Yu Wan Date: Sun, 22 Nov 2020 13:15:12 +0000 Subject: [PATCH 4/4] Update table of content --- README.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 32e01ae..cbdb939 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,16 @@ - [Installation](#Installation) - [Dependencies](#Dependencies) - [Subdirectories of code](#Subdirectories) - - [Creating PAMs from SRST2 outputs](#guide_srst2) - [Targeted gene detection with SRST2](#srst2) - [Reliability assessment of allele calls](#uncertainty) - [Creating the allelic PAM](#make_PAM) - [Creating PAMs from ARIBA outputs](#ariba) + - [Preparing a ResFinder reference database](#resfinder) + - [Gene detection from short reads](#run_ariba) + - [Pooling allele sequences from all samples](#pool_seqs) + - [Clustering pooled allele sequences](#seq_clustering) + - [Creating an allelic PAM from tabulated CD-HIT-EST output](#makePAM)
@@ -180,7 +184,7 @@ Outputs:
-## 3. Creating PAMs from ARIBA outputs
+## 3. Creating PAMs from ARIBA outputs <a name = "ariba" />
 
 PAMmaker currently offers scripts for converting ARIBA outputs when a [ResFinder](https://cge.cbs.dtu.dk/services/ResFinder/) database is used. These scripts are stored in the subdirectory `ariba`.
 
@@ -199,7 +203,7 @@ Desirable software/code:
 
 
 
-### 3.1. Preparing a ResFinder reference database
+### 3.1. Preparing a ResFinder reference database <a name = "resfinder" />
 
 This step is performed in accordance with a [wiki page](https://github.com/sanger-pathogens/ariba/wiki/Task:-prepareref) of ARIBA.
 
@@ -216,7 +220,7 @@
 
 
 
-### 3.2. Gene detection from short reads
+### 3.2. Gene detection from short reads <a name = "run_ariba" />
 
 ARIBA takes as input short reads (e.g., those from Illumina sequencers) for gene detection. Assuming Singularity and a PBS have been installed on users' computer systems, a [Nextflow pipeline](https://github.com/wanyuac/ARIBA_toolkit) (`ariba.nf` and its configuration file `ariba.config`) has been developed for users to run ARIBA (installed as a Singularity-compatible Docker image) for detecting AMR genes in multiple samples. This pipeline is particularly useful when a high-performance computing cluster has a restriction on queue sizes. Nonetheless, users may use their preferred method but need to rename the output files to match those listed below.
 
@@ -256,7 +260,7 @@ Users may also refer to `ariba_summary.csv` for compiled genetic profiles of all
 
 
 
-### 3.3. Pooling allele sequences from all samples
+### 3.3. Pooling allele sequences from all samples <a name = "pool_seqs" />
 
 This step uses script `pool_seqs.py`.
 
@@ -280,7 +284,7 @@ Output file: `alleles.fna`, a multi-FASTA file of pooled allele sequences. In th
 
 
 
-### 3.4. Clustering pooled allele sequences
+### 3.4. Clustering pooled allele sequences <a name = "seq_clustering" />
 
 Despite the demonstration below, it is not necessary to cluster alleles based on complete sequence identity. Users may want to tolerate a few mismatches for their study. This differs from the SRST2-based pipeline described in Section [2](#guide_srst2).
 
@@ -294,7 +298,7 @@ python PAMmaker/utility/tabulate_cdhit.py alleles_rep.fna.clstr > alleles_rep.fn
 
 
 
-### 3.5. Creating an allelic PAM from tabulated CD-HIT-EST output
+### 3.5. Creating an allelic PAM from tabulated CD-HIT-EST output <a name = "makePAM" />
 
 ```bash
 python PAMmaker/ariba/clusters2pam.py -i alleles_rep.fna.clstr.tsv -om allelic_PAM.tsv -ot alleles_rep.fna.clstr_updated.tsv