v0.0.2

A profound update of algorithms, which corrects allele names when the same consensus sequence is aligned to multiple reference alleles in different strains. This problem caused split of allele calls when they actually reveal the same allele. This update also enables PAMmaker to take as input the clustering outcome when consensus sequences are clustered at a nucleotide identity of <100%.
wanyuac · Aug 6, 2018 · 2d2fdcf · 2d2fdcf
1 parent f5afcdf
commit 2d2fdcf
Show file tree

Hide file tree

Showing 4 changed files with 376 additions and 136 deletions.
diff --git a/clustering_allele_variants.py b/clustering_allele_variants.py
@@ -3,94 +3,180 @@
 It works as a downstream process after tabulate_cd-hit.py. Specifically, it takes as input from stdin and prints results to stdout.
 In particular, this program assumes question marks have been removed from the compiled result using a proper method to solve ambiguity in allele calls.
 
+In a few cases, there is no unique allele identifiers (such as intI1, not intI1_1). This may happen when the reference database is small.
+
 Usage:
-	python tabulate_cdhit.py clusters.fna.clstr | python clustering_allele_variants.py srst2_compiledResults.txt > compiledResults_clusterd.txt
-	cat cluster_table.txt | python clustering_allele_variants.py srst2_compiledResults.txt > compiledResults_clusterd.txt  # if you already have a tabulated cluster file
-	cat cluster_table.txt | python clustering_allele_variants.py srst2_compiledResults.txt 1 > compiledResults_clusterd.txt  # when allele calls do not have unique allele identifiers
+    python tabulate_cdhit.py clusters.fna.clstr > cluster_table.txt
+    python clustering_allele_variants.py -i srst2_compiledResults.txt -c cluster_table.txt -r -o compiledResults_clusterd.txt -m allele_name_replacement.txt
+    python clustering_allele_variants.py -i srst2_compiledResults.txt -c cluster_table.txt -r -n -o compiledResults_clusterd.txt  # when allele calls do not have unique allele identifiers
 
 Input:
-	1. A compiled tab-deliminated file generated by SRST2 using the command: python srst2.py --prev_output *__genes*results.txt --output <prefix>
-	2. A cluster file created by either cd-hit or cd-hit-est.
-	
+    1. A compiled tab-deliminated file generated by SRST2 using the command: python srst2.py --prev_output *__genes*results.txt --output <prefix>
+    2. A cluster file created by either cd-hit or cd-hit-est.
+    
 Output: A new file of compiled results, where each allele variant are assigned a unique cluster identifier.
 
-Python version: 3.4.3
-Author: Yu Wan (wanyuac@gmail.com, GitHub: https://github.com/wanyuac)
-Editions: 29 July 2015, 5 Sep 2016, 27 Jun 2017
 This script is derived from clustering_allele_variants.py in my repository SRST2_toolkit (https://github.com/wanyuac/SRST2_toolkit).
 
-License: Apache License, Version 2.0
+Python 2 and 3 compatible
+Copyright 2017 Yu Wan <wanyuac@gmail.com>
+Licensed under the Apache License, Version 2.0
+First and the latest editions: 29 July 2015, 6 August 2018 
 """
 
+from __future__ import print_function
 import sys
 import re
-import collections
-
+from collections import defaultdict, namedtuple
+from argparse import ArgumentParser
+
+
+def parse_arguments():
+    parser = ArgumentParser(description = "Assigning extended identifiers to allele names")
+    parser.add_argument("-i", "--input", dest = "input", type = str, required = True,
+                        help = "Input gene-content table whose allele names are subject to changes.")
+    parser.add_argument("-c", "--clusters", dest = "clusters", type = str, required = True,
+                        help = "A tab-delimited file containing clustering information from CD-HIT-EST")
+    parser.add_argument("-n", "--noext", dest = "noext", action = "store_true", required = False,
+                        help = "Whether allele names do contain unique identifiers. Default: False.")
+    parser.add_argument("-r", "--rename", dest = "rename", action = "store_true", required = False,
+                        help = "Whether to rename alleles when consensus sequences of different alleles belong to the same cluster")
+    parser.add_argument("-o", "--out", dest = "out", type = str, required = False, default = "modified_allele_matrix.txt",
+                        help = "The output gene-content table with extended allele names")
+    parser.add_argument("-m", "--mapping", dest = "mapping", type = str, required = False, default = "allele_name_replacement.txt",
+                        help = "The output gene-content table with extended allele names")
+
+    return parser.parse_args()
+
+
 def extract_alleleID(id, ext):
-	"""
-	This function changes sequence names in the cluster file into allele names used in SRST2. For example, 
-	if ext == True:
-		input = "66__FloR_Phe__FloR__1212", output = "FloR_1212"
-	"""
-	fields = id.split("__")[-2:]  # only take the last two elements: allele name and sequence identifier
-	if ext:
-		new_id = "_".join(fields)  # shorten the double underscores into a single one in order to match the format in SRST2 gene profiles.
-	else:
-		new_id = fields[0]  # only take the first element in the ID list, eg. returns FloR for the example above
-
-	return new_id
-
-def read_clusters(lines, id_ext = True):
-	"""
-	This function reads a tabulated cluster file produced by cd-hit-est and my script tabulate_cdhit.py, and then
-	returns a dictionary of clusters following the structure: {allele_name: {cluster_id : [sample1, sample2, ...]}},
-	where two keys are applied hierarchically.
-	"""
-	clusters = collections.defaultdict(dict)
-	for line in lines[1 : ]:  # skip the header line
-		fields = line.split("\t")
-		cluster_id = fields[0]  # an integer
-		allele_info = fields[2].split("|")  # "0__OqxB_Flq__OqxB__49.variant|NTUH-K2044" -> ["0__OqxB_Flq__OqxB__49.variant", "NTUH-K2044"]
-
-		# Extract the sequence ID, such as 52__TetA_Tet__TetA__1545, with a regular expression; then convert it into an allele name
-		# For instance, the regular expression extracts "0__OqxB_Flq__OqxB__49" out of "0__OqxB_Flq__OqxB__49.consensus".
-		allele = extract_alleleID(re.findall("(.+?)\.", allele_info[0])[0], ext = id_ext)
-		sample = allele_info[1]  # extract the sample ID from the line with a regular expression
-		clusters[sample][allele] = cluster_id  # save this cluster_id in the dictionary {sample : {allele : cluster_id}}
-
-	return clusters
-
-def assign_clusterIDs(gene_table, clusters):
-	"""
-	This function assigns an extended allele identifier to every allele call if it is a variant of its reference.
-	The additional identifier is ID of each cluster.
-	For example, "Aac6-Iaa_760*" becomes "aac6-Iaa_760.10" in the output, where 10 is the cluster where this allele belongs to in an output from cd-hit-est.
-	"""
-	f = open(gene_table, "rU")
-	content = f.read().splitlines()
-	print(content[0])  # print the header line first
-	for row in content[1:]:  # for every row of this table
-		fields = row.split("\t")
-		sample = fields[0]
-		new_line = [sample]
-		for allele in fields[1:]:
-			if allele[-1] == "*":  # This is a variant; otherwise, no change is needed.
-				allele = allele.replace("*", "." + clusters[sample][allele[:-1]])  # replace the asterisk with a cluster ID
-			new_line.append(allele)  # non-variants remain intact
-		print("\t".join(new_line))  # print a new line to the stdout
-	f.close()
-
-	return
+    """
+    This function changes sequence names in the cluster file into allele names used in SRST2. For example, 
+    if ext == True:
+        input = "66__FloR_Phe__FloR__1212", output = "FloR_1212"
+    """
+    fields = id.split("__")[-2:]  # only take the last two elements: allele name and sequence identifier
+    if ext:
+        new_id = "_".join(fields)  # shorten the double underscores into a single one in order to match the format in SRST2 gene profiles.
+    else:
+        new_id = fields[0]  # only take the first element in the ID list, eg. returns FloR for the example above
+
+    return new_id
+
+
+def read_clusters(cluster_file, id_ext = True):
+    """
+    This function reads a tabulated cluster file produced by cd-hit-est and my script tabulate_cdhit.py, and then
+    returns a dictionary of clusters following the structure: {allele_name: {cluster_id : [sample1, sample2, ...]}},
+    where two keys are applied hierarchically.
+    """    
+    with open(cluster_file, "rU") as f:
+        lines = f.read().splitlines()
+
+    # Initialise the return value
+    clusters = defaultdict(dict)
+    Allele = namedtuple("Allele", ["cid", "rep"])  # cid: cluster index; rep: is this allele in the current sample is chosen as the representative sequence
+
+    # Retrieve allele and cluster information
+    for line in lines[1 : ]:  # skip the header line
+        fields = line.split("\t")
+        cluster_id = fields[0]
+        is_rep = fields[5] == "Y"
+        allele_info = fields[2].split("|")  # "0__OqxB_Flq__OqxB__49.variant|NTUH-K2044" -> ["0__OqxB_Flq__OqxB__49.variant", "NTUH-K2044"]
+
+        # Extract the sequence ID, such as 52__TetA_Tet__TetA__1545, with a regular expression; then convert it into an allele name
+        # For instance, the regular expression extracts "0__OqxB_Flq__OqxB__49" out of "0__OqxB_Flq__OqxB__49.consensus".
+        allele = extract_alleleID(re.findall("(.+?)\.", allele_info[0])[0], ext = id_ext)
+        sample = allele_info[1]  # extract the sample ID from the line with a regular expression
+        clusters[sample][allele] = Allele(cid = cluster_id, rep = is_rep)  # save this cluster_id (a string) in the dictionary {sample : {allele : cluster_id}}
+
+    return clusters
+
+
+def get_rep_allele_per_cluster(clusters):
+    """
+    Identify allele names in the same cluster and replace them with the representative allele name
+    Return a dictionary {cid : allele}
+    """
+    reps = {}  # representative allele of each cluster
+
+    for sample, alleles in clusters.items():
+        for allele, record in alleles.items():
+            if record.rep:
+                """
+                Add this allele name into the dictionary cluster_rep. CD-HIT-EST ensures that there is
+                only a single representative allele per cluster. Therefore, we will not lose any
+                representative allele in this algorithm.
+                """
+                reps[record.cid] = allele
+
+    return reps
+
+
+def assign_clusterIDs(gene_table, clusters, reps, rename, out_tab, out_rep):
+    """
+    This function assigns an extended allele identifier to every allele call if it is a variant of its reference.
+    The additional identifier is ID of each cluster.
+    For example, "Aac6-Iaa_760*" becomes "aac6-Iaa_760.10" in the output, where 10 is the cluster where this allele belongs to in an output from cd-hit-est.
+    """
+    # Import the gene content table
+    with open(gene_table, "rU") as f:
+        content = f.read().splitlines()
+
+    # Establish output files
+    new_tab = open(out_tab, "w")
+    rep_out = open(out_rep, "w")
+
+    # Convert allele IDs in the gene content table
+    new_tab.write(content[0] + "\n")  # print the header line first
+    rep_out.write("Sample\tPrev_allele\tVar\tNew_allele\n")
+    for row in content[1:]:  # for every row of this table
+        fields = row.split("\t")
+        sample = fields[0]
+        new_line = [sample]
+
+        for allele in fields[1:]:
+            """
+            Although alleles are different to each other in every reference database, exact hits to different alleles
+            may be clustered together when the threshold of nucleotide identity is sufficiently low. Therefore, an exact
+            hit may be renamed when the rename argument is True and multiple reference alleles are clustered together.
+            Evidently, for clusters having only a single allele, there is no difference between the output allele name
+            and the original name.
+            """
+            if allele != "-":
+                is_var = allele[-1] == "*"
+                if is_var:  # This is a variant of a reference allele.
+                    allele = allele[:-1]  # drop the asterisk
+                cid = clusters[sample][allele].cid
+                if rename:
+                    rep_allele = reps[cid]
+                    if allele != rep_allele:
+                        rep_out.write("\t".join([sample, allele, str(int(is_var)), rep_allele]) + "\n")
+                    if is_var:
+                        allele = rep_allele + "." + cid  # replace the asterisk with a cluster ID
+                    else:
+                        allele = rep_allele   
+                elif is_var:
+                    allele = allele + "." + cid
+                # else: do nothing    
+            new_line.append(allele)  # non-variants remain intact
+
+        new_tab.write("\t".join(new_line) + "\n")  # print a new line to the stdout
+
+    new_tab.close()
+    rep_out.close()
+
+    return
+
 
 def main():
-	argv_len = len(sys.argv)  # number of arguments (>= 1)
-	if argv_len == 2:  # Allele calls have unique allele identifiers, which is the most common case of results.
-		assign_clusterIDs(gene_table = sys.argv[1], clusters = read_clusters(sys.stdin.read().splitlines(), True))  # a table showing allele calls of every gene over samples
-	elif argv_len > 2:  # In a few cases, there is no unique allele identifiers (such as intI1, not intI1_1). This may happen when the reference database is small.
-		assign_clusterIDs(gene_table = sys.argv[1], clusters = read_clusters(sys.stdin.read().splitlines(), False))
-	else:
-		exit("Error: a file showing allele calls of every gene over samples is required.")
-
+    args = parse_arguments()
+    clusters = read_clusters(args.clusters, not args.noext)  # import cluster information: the tablated CD-HIT-EST output
+    reps = get_rep_allele_per_cluster(clusters)  # one representative allele per cluster
+    assign_clusterIDs(gene_table = args.input, clusters = clusters, reps = reps, rename = args.rename, \
+                      out_tab = args.out, out_rep = args.mapping)  # a table showing allele calls of every gene over samples
+
+
 # The main program
 if __name__ == "__main__":
     main()