From 51c4e1251a54c4199baea04b163372ef1cd21d25 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Fri, 20 Sep 2024 16:41:53 +1000 Subject: [PATCH 1/8] starting on genomic variants --- countess/plugins/variant.py | 11 +++++++++-- countess/utils/variant.py | 24 ++++++++++++++++-------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/countess/plugins/variant.py b/countess/plugins/variant.py index 45597c5..c3d68e3 100644 --- a/countess/plugins/variant.py +++ b/countess/plugins/variant.py @@ -28,8 +28,15 @@ class VariantPlugin(PandasTransformDictToDictPlugin): column = ColumnChoiceParam("Input Column", "sequence") reference = ColumnOrStringParam("Reference Sequence") - offset = ColumnOrIntegerParam("Reference Offset", 0) - output = StringParam("Output Column", "variant") + offset = ColumnOrIntegerParam("Genomic Offset (negative for reverse)", 0) + + prefix = StringParam("Genomic Prefix", "") + output = StringParam("Output Column (Genomic)", "variant") + + coding_prefix = ColumnOrStringParam("Coding Prefix", "") + coding_offset = ColumnOrIntegerParam("Coding Offset", "") + coding_output = StringParam("Coding Output Column", "coding") + max_mutations = IntegerParam("Max Mutations", 10) protein = StringParam("Protein Column", "protein") max_protein = IntegerParam("Max Protein Variations", 10) diff --git a/countess/utils/variant.py b/countess/utils/variant.py index 0567d56..1e93924 100644 --- a/countess/utils/variant.py +++ b/countess/utils/variant.py @@ -205,6 +205,14 @@ def find_variant_dna(ref_seq: str, var_seq: str, offset: int = 0) -> Iterable[st >>> list(find_variant_dna("AAACCCTTT", "AAAGGGTTT")) ['4_6inv'] + OFFSETS + + >>> list(find_variant_dna("AGAAGTAGAGG", "ATAAGAAGAGG", 100)) + ['102G>T', '106T>A'] + + >>> list(find_variant_dna("AGAAGTAGAGG", "ATAAGAAGAGG", -200)) + ['198G>T', '194T>A'] + """ ref_seq = ref_seq.strip().upper() @@ -273,9 +281,9 @@ def find_variant_dna(ref_seq: str, var_seq: str, offset: int = 0) -> Iterable[st assert dest_seq == "" # 
'delete' opcode maps to HGVS 'del' operation if len(src_seq) == 1: - yield f"{start+1}del" + yield f"{abs(start+1)}del" else: - yield f"{start+1}_{end}del" + yield f"{abs(start+1)}_{abs(end)}del" elif opcode.tag == "insert": assert src_seq == "" @@ -285,12 +293,12 @@ def find_variant_dna(ref_seq: str, var_seq: str, offset: int = 0) -> Iterable[st # This is a duplication of one or more symbols immediately # preceding this point. if len(dest_seq) == 1: - yield f"{start}dup" + yield f"{abs(start)}dup" else: - yield f"{start - len(dest_seq) + 1}_{start}dup" + yield f"{abs(start - len(dest_seq) + 1)}_{abs(start)}dup" else: inserted_sequence = search_for_sequence(ref_seq, dest_seq) - yield f"{start}_{start+1}ins{inserted_sequence}" + yield f"{abs(start)}_{abs(start+1)}ins{inserted_sequence}" elif opcode.tag == "replace": # 'replace' opcode maps to either an HGVS '>' (single substitution) or @@ -301,12 +309,12 @@ def find_variant_dna(ref_seq: str, var_seq: str, offset: int = 0) -> Iterable[st # as this code has no concept of amino acid alignment. 
if len(src_seq) == 1 and len(dest_seq) == 1: - yield f"{start+1}{src_seq}>{dest_seq}" + yield f"{abs(start+1)}{src_seq}>{dest_seq}" elif len(src_seq) == len(dest_seq) and dest_seq == reverse_complement(src_seq): - yield f"{start+1}_{end}inv" + yield f"{abs(start+1)}_{abs(end)}inv" else: inserted_sequence = search_for_sequence(ref_seq, dest_seq) - yield f"{start+1}_{end}delins{inserted_sequence}" + yield f"{abs(start+1)}_{abs(end)}delins{inserted_sequence}" def find_variant_protein(ref_seq: str, var_seq: str, offset: int = 0): From 3d097ecef855fb0936b9ca5fd3512675cf992202 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Fri, 20 Sep 2024 18:22:39 +1000 Subject: [PATCH 2/8] attempt #1 --- countess/plugins/variant.py | 53 ++++++++++++++++++++++--------------- countess/utils/variant.py | 15 ++++++++--- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/countess/plugins/variant.py b/countess/plugins/variant.py index c3d68e3..fd3c64b 100644 --- a/countess/plugins/variant.py +++ b/countess/plugins/variant.py @@ -28,14 +28,14 @@ class VariantPlugin(PandasTransformDictToDictPlugin): column = ColumnChoiceParam("Input Column", "sequence") reference = ColumnOrStringParam("Reference Sequence") - offset = ColumnOrIntegerParam("Genomic Offset (negative for reverse)", 0) + minus_strand = BooleanParam("Reference is on Minus Strand", False) + offset = ColumnOrIntegerParam("Reference Offset", 0) + prefix = StringParam("Output Prefix", "") + output = StringParam("Output Column", "variant") - prefix = StringParam("Genomic Prefix", "") - output = StringParam("Output Column (Genomic)", "variant") - - coding_prefix = ColumnOrStringParam("Coding Prefix", "") - coding_offset = ColumnOrIntegerParam("Coding Offset", "") - coding_output = StringParam("Coding Output Column", "coding") + coding_prefix = StringParam("Coding Prefix", "c.") + coding_offset = ColumnOrIntegerParam("Coding Offset", 0) + coding_output = StringParam("Coding Output", "coding") max_mutations = IntegerParam("Max 
Mutations", 10) protein = StringParam("Protein Column", "protein") @@ -50,28 +50,39 @@ def process_dict(self, data) -> dict: reference = self.reference.get_value_from_dict(data) offset = int(self.offset.get_value_from_dict(data) or 0) + coding_offset = int(self.coding_offset.get_value_from_dict(data) or 0) r: dict[str, str] = {} - if self.output: - try: + try: + if self.output: r[self.output.value] = find_variant_string( - "g.", reference, sequence, int(self.max_mutations), offset=offset + self.prefix + ":g." if self.prefix else "g.", + reference, + sequence, + max_mutations=self.max_mutations.value, + offset=offset, + minus_strand=self.minus_strand.value, + ) + + if self.coding_output: + r[self.coding_output.value] = find_variant_string( + self.coding_prefix + ":c." if self.coding_prefix else "c.", + reference, + sequence, + int(self.max_mutations), + offset=coding_offset, ) - except ValueError: - pass - except (TypeError, KeyError, IndexError) as exc: - logger.warning("Exception", exc_info=exc) - if self.protein: - try: + if self.protein: r[str(self.protein)] = find_variant_string( - "p.", reference, sequence, int(self.max_protein), offset=offset + "p.", reference, sequence, int(self.max_protein), offset=coding_offset ) - except ValueError: - pass - except (TypeError, KeyError, IndexError) as exc: - logger.warning("Exception", exc_info=exc) + + #except ValueError: + # pass + except (TypeError, KeyError, IndexError) as exc: + logger.warning("Exception", exc_info=exc) return r diff --git a/countess/utils/variant.py b/countess/utils/variant.py index 1e93924..0df981b 100644 --- a/countess/utils/variant.py +++ b/countess/utils/variant.py @@ -480,7 +480,12 @@ def _ref(pos): def find_variant_string( - prefix: str, ref_seq: str, var_seq: str, max_mutations: Optional[int] = None, offset: int = 0 + prefix: str, + ref_seq: str, + var_seq: str, + max_mutations: Optional[int] = None, + offset: int = 0, + minus_strand: bool = False, ) -> str: """As above, but returns a single 
string instead of a generator @@ -544,12 +549,16 @@ def find_variant_string( ValueError: Too many variations (2) in GATTACA """ - if prefix.endswith("g.") and not prefix.endswith("n."): + if minus_strand: + ref_seq = reverse_complement(ref_seq) + var_seq = reverse_complement(var_seq) + + if prefix.endswith("g.") or prefix.endswith("c."): variations = list(find_variant_dna(ref_seq, var_seq, offset)) elif prefix.endswith("p."): variations = list(find_variant_protein(ref_seq, var_seq, offset)) else: - raise ValueError("Only prefix types 'g.', 'n.' and 'p.' accepted at this time") + raise ValueError("Only prefix types 'g.', 'c.' and 'p.' accepted at this time") if len(variations) == 0: return prefix + "=" From 7e5f09adf424ff972eca4d265ddab584f849d0d7 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Mon, 23 Sep 2024 11:50:52 +1000 Subject: [PATCH 3/8] Add a FramedMultiParam to help organize options --- countess/core/parameters.py | 5 +++++ countess/gui/config.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/countess/core/parameters.py b/countess/core/parameters.py index ee1081a..34e3273 100644 --- a/countess/core/parameters.py +++ b/countess/core/parameters.py @@ -917,3 +917,8 @@ def get_hash_value(self): class TabularMultiParam(MultiParam): """This is just used to drop a hint to the GUI as to how the MultiParam is to be presented ... 
as a hierarchy or as a table ...""" + +class FramedMultiParam(MultiParam): + """This is just used to drop a hint to the GUI to display the MultiParam + in its own frame""" + diff --git a/countess/gui/config.py b/countess/gui/config.py index 6205d18..fe882f0 100644 --- a/countess/gui/config.py +++ b/countess/gui/config.py @@ -16,6 +16,7 @@ FileArrayParam, FileParam, FileSaveParam, + FramedMultiParam, MultiParam, ScalarParam, TabularMultiParam, @@ -66,7 +67,7 @@ def __init__( # pylint: disable=R0912,R0915 self.label: Optional[tk.Widget] = None self.row_labels: list[tk.Widget] = [] - if isinstance(parameter, ArrayParam): + if isinstance(parameter, (ArrayParam, FramedMultiParam)): self.label = None else: self.label = tk.Label(tk_parent, text=parameter.label) @@ -139,6 +140,15 @@ def __init__( # pylint: disable=R0912,R0915 self.update_subwrappers(parameter.params, drc) + elif isinstance(parameter, FramedMultiParam): + label_frame_label = tk.Frame(tk_parent) + tk.Label(label_frame_label, text=parameter.label).grid(row=0, column=0, padx=5) + self.entry = tk.LabelFrame(tk_parent, labelwidget=label_frame_label, padx=10, pady=5) + self.entry.columnconfigure(0, weight=0) + self.entry.columnconfigure(1, weight=0) + self.entry.columnconfigure(2, weight=1) + self.update_subwrappers(parameter.params.values(), None) + elif isinstance(parameter, (ArrayParam, MultiParam)): self.entry = tk.Frame(tk_parent) self.entry.columnconfigure(0, weight=0) From a71fb409777c66c5aef61171e37376b628922b23 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Mon, 23 Sep 2024 13:35:24 +1000 Subject: [PATCH 4/8] new variant caller gets it right! 
--- countess/core/parameters.py | 2 +- countess/plugins/variant.py | 91 ++++++++++--------------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/countess/core/parameters.py b/countess/core/parameters.py index 34e3273..5f0ef96 100644 --- a/countess/core/parameters.py +++ b/countess/core/parameters.py @@ -918,7 +918,7 @@ class TabularMultiParam(MultiParam): """This is just used to drop a hint to the GUI as to how the MultiParam is to be presented ... as a hierarchy or as a table ...""" + class FramedMultiParam(MultiParam): """This is just used to drop a hint to the GUI to display the MultiParam in its own frame""" - diff --git a/countess/plugins/variant.py b/countess/plugins/variant.py index fd3c64b..95de3be 100644 --- a/countess/plugins/variant.py +++ b/countess/plugins/variant.py @@ -1,15 +1,13 @@ import logging -from typing import Optional - -import pandas as pd from countess import VERSION from countess.core.parameters import ( - BooleanParam, + ArrayParam, ColumnChoiceParam, ColumnOrIntegerParam, ColumnOrStringParam, IntegerParam, + MultiParam, StringParam, ) from countess.core.plugins import PandasTransformDictToDictPlugin @@ -18,89 +16,48 @@ logger = logging.getLogger(__name__) +class VariantOutputMultiParam(MultiParam): + prefix = StringParam("Prefix", "g.") + offset = ColumnOrIntegerParam("Offset", 0) + maxlen = IntegerParam("Max Variations", 10) + output = StringParam("Output Column", "variant") + + class VariantPlugin(PandasTransformDictToDictPlugin): """Turns a DNA sequence into a HGVS variant code""" - name = "Variant Translator" + name = "Variant Caller" description = "Turns a DNA sequence into a HGVS variant code" version = VERSION link = "https://countess-project.github.io/CountESS/included-plugins/#variant-caller" + additional = """Prefixes should end with 'g.', 'c.' or 'p.'. 
+ Use a negative offset to call genomic references to minus-strand genes.""" column = ColumnChoiceParam("Input Column", "sequence") reference = ColumnOrStringParam("Reference Sequence") - minus_strand = BooleanParam("Reference is on Minus Strand", False) - offset = ColumnOrIntegerParam("Reference Offset", 0) - prefix = StringParam("Output Prefix", "") - output = StringParam("Output Column", "variant") - - coding_prefix = StringParam("Coding Prefix", "c.") - coding_offset = ColumnOrIntegerParam("Coding Offset", 0) - coding_output = StringParam("Coding Output", "coding") - - max_mutations = IntegerParam("Max Mutations", 10) - protein = StringParam("Protein Column", "protein") - max_protein = IntegerParam("Max Protein Variations", 10) - drop = BooleanParam("Drop unidentified variants", False) - drop_columns = BooleanParam("Drop Input Column(s)", False) + outputs = ArrayParam("Outputs", VariantOutputMultiParam("Output")) def process_dict(self, data) -> dict: sequence = data[str(self.column)] + reference = self.reference.get_value_from_dict(data) if not sequence: return {} - reference = self.reference.get_value_from_dict(data) - offset = int(self.offset.get_value_from_dict(data) or 0) - coding_offset = int(self.coding_offset.get_value_from_dict(data) or 0) - r: dict[str, str] = {} + for output in self.outputs: + offset = int(output.offset.get_value_from_dict(data) or 0) - try: - if self.output: - r[self.output.value] = find_variant_string( - self.prefix + ":g." if self.prefix else "g.", + try: + r[output.output.value] = find_variant_string( + str(output.prefix) if output.prefix else "g.", reference, sequence, - max_mutations=self.max_mutations.value, - offset=offset, - minus_strand=self.minus_strand.value, + max_mutations=output.maxlen.value, + offset=abs(offset), + minus_strand=offset < 0, ) - if self.coding_output: - r[self.coding_output.value] = find_variant_string( - self.coding_prefix + ":c." 
if self.coding_prefix else "c.", - reference, - sequence, - int(self.max_mutations), - offset=coding_offset, - ) - - if self.protein: - r[str(self.protein)] = find_variant_string( - "p.", reference, sequence, int(self.max_protein), offset=coding_offset - ) - - #except ValueError: - # pass - except (TypeError, KeyError, IndexError) as exc: - logger.warning("Exception", exc_info=exc) + except (TypeError, KeyError, IndexError) as exc: + logger.warning("Exception", exc_info=exc) return r - - def process_dataframe(self, dataframe: pd.DataFrame) -> Optional[pd.DataFrame]: - df_out = super().process_dataframe(dataframe) - - if df_out is not None: - if self.drop: - if self.output: - df_out.dropna(subset=str(self.output), inplace=True) - if self.protein: - df_out.dropna(subset=str(self.protein), inplace=True) - if self.drop_columns: - try: - df_out.drop(columns=str(self.column), inplace=True) - if self.reference.get_column_name(): - df_out.drop(columns=self.reference.get_column_name(), inplace=True) - except KeyError: - pass - - return df_out From 1274d6d2d44ee175eba5f86e66ddb68efab38dfa Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Tue, 24 Sep 2024 17:02:00 +1000 Subject: [PATCH 5/8] add in a DictChoiceParam which is slightly nicer to use than the ChoiceParam --- countess/core/parameters.py | 64 +++++++++++++++++++++++++++++++++++++ countess/gui/config.py | 17 ++++++---- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/countess/core/parameters.py b/countess/core/parameters.py index 5f0ef96..796bb91 100644 --- a/countess/core/parameters.py +++ b/countess/core/parameters.py @@ -339,6 +339,67 @@ def __init__(self, label: str, value=None, file_types=None): self.file_types = file_types +class DictChoiceParam(ScalarWithOperatorsParam): + """A drop-down menu parameter choosing between options. 
+ Takes a mapping of choices where the key is the choice + and the value is the displayed value.""" + + _value: str = "" + _choice: str = "" + choices: dict[str, str] + reverse: dict[str, str] + + def __init__(self, label: str, value: Optional[str] = None, choices: Optional[dict[str, str]] = None): + super().__init__(label) + self.set_choices(choices or {}) + self.set_value(value) + + def clean_value(self, value): + return value + + def set_value(self, value): + if value in self.reverse: + self._choice = self.reverse[value] + self._value = value + else: + self.set_default() + + def get_choice(self): + return self._choice + + def set_choice(self, choice): + if choice in self.choices: + self._choice = choice + self._value = self.choices[choice] + else: + self.set_default() + + choice = property(get_choice, set_choice) + + def set_choices(self, choices: dict[str, str]): + self.choices = dict(choices) + self.reverse = {v: k for k, v in choices.items()} + if self._choice in self.choices: + self._value = self.choices[self._choice] + elif self._value in self.reverse: + self._choice = self.reverse[self._value] + else: + self.set_default() + + def get_values(self): + return list(self.choices.values()) + + def set_default(self): + if self.choices: + self._choice, self._value = list(self.choices.items())[0] + else: + self._choice = "" + self._value = "" + + def copy(self) -> "DictChoiceParam": + return self.__class__(self.label, self.value, self.choices) + + class ChoiceParam(ScalarWithOperatorsParam): """A drop-down menu parameter choosing between options. 
Defaults to 'None'""" @@ -395,6 +456,9 @@ def set_choices(self, choices: Iterable[str]): self._value = self.DEFAULT_VALUE self._choice = None + def get_values(self): + return self.choices + def copy(self) -> "ChoiceParam": return self.__class__(self.label, self.value, self.choices) diff --git a/countess/gui/config.py b/countess/gui/config.py index fe882f0..b721f5c 100644 --- a/countess/gui/config.py +++ b/countess/gui/config.py @@ -13,6 +13,7 @@ BooleanParam, ChoiceParam, ColumnOrStringParam, + DictChoiceParam, FileArrayParam, FileParam, FileSaveParam, @@ -72,10 +73,10 @@ def __init__( # pylint: disable=R0912,R0915 else: self.label = tk.Label(tk_parent, text=parameter.label) - if isinstance(parameter, ChoiceParam): + if isinstance(parameter, (ChoiceParam, DictChoiceParam)): self.var = tk.StringVar(tk_parent, value=parameter.value) self.entry = ttk.Combobox(tk_parent, textvariable=self.var) - self.entry["values"] = parameter.choices or [""] + self.entry["values"] = parameter.get_values() or [""] if isinstance(parameter, ColumnOrStringParam): self.entry.bind("", self.combobox_set) self.entry["state"] = "normal" @@ -203,9 +204,8 @@ def update(self): ) elif isinstance(self.parameter, MultiParam): self.update_subwrappers(self.parameter.params.values(), None) - elif isinstance(self.parameter, ChoiceParam): - choices = self.parameter.choices or [""] - self.entry["values"] = choices + elif isinstance(self.parameter, (ChoiceParam, DictChoiceParam)): + self.entry["values"] = self.parameter.get_values() or [""] self.var.set(self.parameter.value) elif isinstance(self.parameter, BooleanParam): self.set_checkbox_value() @@ -393,8 +393,11 @@ def set_choice(self, choice): self.callback(self.parameter) def value_changed_callback(self, *_): - if isinstance(self.parameter, ChoiceParam) and self.entry.current() != -1: - self.set_choice(self.entry.current()) + if isinstance(self.parameter, (ChoiceParam, DictChoiceParam)) and self.entry.current() != -1: + value = 
self.parameter.get_values()[self.entry.current()] + if value != self.parameter.value: + self.parameter.value = value + self.callback(self.parameter) else: self.var.set(self.set_value(self.var.get())) From fb34b6fed70fad4030515571204ffee7e98646cb Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Tue, 24 Sep 2024 17:05:06 +1000 Subject: [PATCH 6/8] new variant caller plugin, fixed tests --- countess/plugins/variant.py | 37 +++++++++++++++++++++++++++++------ countess/utils/variant.py | 2 +- tests/plugins/test_variant.py | 11 +++++++---- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/countess/plugins/variant.py b/countess/plugins/variant.py index 95de3be..e859d93 100644 --- a/countess/plugins/variant.py +++ b/countess/plugins/variant.py @@ -1,4 +1,5 @@ import logging +import string from countess import VERSION from countess.core.parameters import ( @@ -6,8 +7,10 @@ ColumnChoiceParam, ColumnOrIntegerParam, ColumnOrStringParam, + DictChoiceParam, IntegerParam, MultiParam, + StringCharacterSetParam, StringParam, ) from countess.core.plugins import PandasTransformDictToDictPlugin @@ -15,9 +18,31 @@ logger = logging.getLogger(__name__) +REFERENCE_CHAR_SET = set(string.ascii_uppercase + string.digits + "_") + +# XXX Should probably support these other types as well but I don't +# know what I don't know ... +# XXX Supporting protein calls on mitochondrial (or other organisms) +# DNA will require expansion of the variant caller routine to handle +# different codon tables. This opens up a can of worms of course. +# XXX There should probably also be a warning generated if you ask for a +# non-MT DNA call with an MT protein call or vice versa. 
+ +SEQUENCE_TYPE_CHOICES = { + "g": "Linear Genomic", + "g-": "Linear Genomic (Minus Strand)", + # "o": "Circular Genomic", + # "m": "Mitochondrial", + "c": "Coding DNA", + # "n": "Non-Coding DNA", + "p": "Protein", + # "pm": "Protein (MT)", +} + class VariantOutputMultiParam(MultiParam): - prefix = StringParam("Prefix", "g.") + prefix = StringCharacterSetParam("Prefix", "", character_set=REFERENCE_CHAR_SET) + seq_type = DictChoiceParam("Type", choices=SEQUENCE_TYPE_CHOICES) offset = ColumnOrIntegerParam("Offset", 0) maxlen = IntegerParam("Max Variations", 10) output = StringParam("Output Column", "variant") @@ -30,8 +55,6 @@ class VariantPlugin(PandasTransformDictToDictPlugin): description = "Turns a DNA sequence into a HGVS variant code" version = VERSION link = "https://countess-project.github.io/CountESS/included-plugins/#variant-caller" - additional = """Prefixes should end with 'g.', 'c.' or 'p.'. - Use a negative offset to call genomic references to minus-strand genes.""" column = ColumnChoiceParam("Input Column", "sequence") reference = ColumnOrStringParam("Reference Sequence") @@ -45,16 +68,18 @@ def process_dict(self, data) -> dict: r: dict[str, str] = {} for output in self.outputs: + seq_type = output.seq_type.get_choice() or "g" + prefix = f"{output.prefix + ':' if output.prefix else ''}{seq_type[0]}." 
offset = int(output.offset.get_value_from_dict(data) or 0) try: r[output.output.value] = find_variant_string( - str(output.prefix) if output.prefix else "g.", + prefix, reference, sequence, max_mutations=output.maxlen.value, - offset=abs(offset), - minus_strand=offset < 0, + offset=offset, + minus_strand=seq_type.endswith("-"), ) except (TypeError, KeyError, IndexError) as exc: diff --git a/countess/utils/variant.py b/countess/utils/variant.py index 0df981b..bd674b9 100644 --- a/countess/utils/variant.py +++ b/countess/utils/variant.py @@ -529,7 +529,7 @@ def find_variant_string( >>> find_variant_string("x.", "CAT", "CAT") Traceback (most recent call last): ... - ValueError: Only prefix types 'g.', 'n.' and 'p.' accepted at this time + ValueError: Only prefix types 'g.', 'c.' and 'p.' accepted at this time >>> find_variant_string("g.", "HELLO", "CAT") Traceback (most recent call last): diff --git a/tests/plugins/test_variant.py b/tests/plugins/test_variant.py index 65fea8e..332b298 100644 --- a/tests/plugins/test_variant.py +++ b/tests/plugins/test_variant.py @@ -9,7 +9,8 @@ def test_variant_ref_value(): plugin = VariantPlugin() plugin.set_parameter("column", "seq") plugin.set_parameter("reference", "AGAAGTAGAGG") - plugin.set_parameter("output", "out") + plugin.set_parameter("outputs.0.seq_type", "g") + plugin.set_parameter("outputs.0.output", "out") plugin.prepare(["test"], None) @@ -30,7 +31,8 @@ def test_variant_ref_column(): plugin = VariantPlugin() plugin.set_parameter("column", "seq") plugin.set_parameter("reference", "— ref") - plugin.set_parameter("output", "out") + plugin.set_parameter("outputs.0.seq_type", "g") + plugin.set_parameter("outputs.0.output", "out") plugin.prepare(["test"], None) @@ -54,8 +56,9 @@ def test_variant_ref_offset(): plugin = VariantPlugin() plugin.set_parameter("column", "seq") plugin.set_parameter("reference", "AGAAGTAGAGG") - plugin.set_parameter("offset", "— offs") - plugin.set_parameter("output", "out") + 
plugin.set_parameter("outputs.0.offset", "— offs") + plugin.set_parameter("outputs.0.seq_type", "g") + plugin.set_parameter("outputs.0.output", "out") plugin.prepare(["test"], None) From 0bba8149298d3e5a2b4ad4d75cceaa8d8b67c7f9 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Tue, 24 Sep 2024 17:39:00 +1000 Subject: [PATCH 7/8] fixups --- countess/core/parameters.py | 6 ++++++ countess/plugins/data_table.py | 4 ++-- countess/plugins/variant.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/countess/core/parameters.py b/countess/core/parameters.py index 796bb91..1d32e77 100644 --- a/countess/core/parameters.py +++ b/countess/core/parameters.py @@ -361,6 +361,9 @@ def set_value(self, value): if value in self.reverse: self._choice = self.reverse[value] self._value = value + elif value in self.choices: + self._choice = value + self._value = self.choices[value] else: self.set_default() @@ -396,6 +399,9 @@ def set_default(self): self._choice = "" self._value = "" + def get_parameters(self, key, base_dir="."): + return ((key, self._choice),) + def copy(self) -> "DictChoiceParam": return self.__class__(self.label, self.value, self.choices) diff --git a/countess/plugins/data_table.py b/countess/plugins/data_table.py index 916491e..0b284a7 100644 --- a/countess/plugins/data_table.py +++ b/countess/plugins/data_table.py @@ -11,7 +11,7 @@ StringParam, TabularMultiParam, ) -from countess.core.plugins import BasePlugin +from countess.core.plugins import PandasInputPlugin class _ColumnsMultiParam(MultiParam): @@ -20,7 +20,7 @@ class _ColumnsMultiParam(MultiParam): index = BooleanParam("Index?") -class DataTablePlugin(BasePlugin): +class DataTablePlugin(PandasInputPlugin): """DataTable""" name = "DataTable" diff --git a/countess/plugins/variant.py b/countess/plugins/variant.py index e859d93..e549b29 100644 --- a/countess/plugins/variant.py +++ b/countess/plugins/variant.py @@ -29,8 +29,8 @@ # non-MT DNA call with an MT protein call or vice versa. 
SEQUENCE_TYPE_CHOICES = { - "g": "Linear Genomic", - "g-": "Linear Genomic (Minus Strand)", + "g": "Genomic", + "g-": "Genomic (Minus Strand)", # "o": "Circular Genomic", # "m": "Mitochondrial", "c": "Coding DNA", @@ -58,7 +58,7 @@ class VariantPlugin(PandasTransformDictToDictPlugin): column = ColumnChoiceParam("Input Column", "sequence") reference = ColumnOrStringParam("Reference Sequence") - outputs = ArrayParam("Outputs", VariantOutputMultiParam("Output")) + outputs = ArrayParam("Outputs", VariantOutputMultiParam("Output"), min_size=1) def process_dict(self, data) -> dict: sequence = data[str(self.column)] From 4bbf073a63e49d9d85c006c21808cdcaa2fadb99 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Tue, 24 Sep 2024 19:50:55 +1000 Subject: [PATCH 8/8] add some tests for minus strand variant calling --- countess/utils/variant.py | 8 ++++++- tests/plugins/test_variant.py | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/countess/utils/variant.py b/countess/utils/variant.py index bd674b9..a501c56 100644 --- a/countess/utils/variant.py +++ b/countess/utils/variant.py @@ -212,7 +212,6 @@ def find_variant_dna(ref_seq: str, var_seq: str, offset: int = 0) -> Iterable[st >>> list(find_variant_dna("AGAAGTAGAGG", "ATAAGAAGAGG", -200)) ['198G>T', '194T>A'] - """ ref_seq = ref_seq.strip().upper() @@ -524,6 +523,13 @@ def find_variant_string( >>> find_variant_string("p.", "ATGGTTGGTTCA", "ATGGCTGCTTCA") 'p.Val2_Gly3delinsAlaAla' + MINUS STRAND + + this example is actually comparing TGTAATC and TCTGAAC ... 
+ + >>> find_variant_string("g.", "GATTACA", "GTTCAGA", minus_strand=True) + 'g.[2G>C;3_4insG;6del]' + CHECK FOR INVALID INPUTS >>> find_variant_string("x.", "CAT", "CAT") diff --git a/tests/plugins/test_variant.py b/tests/plugins/test_variant.py index 332b298..e773e9e 100644 --- a/tests/plugins/test_variant.py +++ b/tests/plugins/test_variant.py @@ -69,3 +69,42 @@ def test_variant_ref_offset(): assert output[0]["out"] == "g.1A>T" assert output[1]["out"] == "g.[17A>T;19A>T]" assert output[2]["out"] == "g.[102G>T;106T>A]" + +def test_variant_ref_offset_minus(): + """check that the reverse-complement behaviour works on the minus strand.""" + # genes on the minus strand are reverse-complemented, so what we're actually + # comparing is the reverse-complemented sequences: + # + # 00000000011 + # num 12345678901 + # ref CCTCTACTTCT + # seq1 CCTCTACTTCA => 11T>A + # seq2 CCACAACTTCT => 3T>A;5T>A + # seq3 CCTCTTCTTAT => 6A>T;10C>A + # + # plus the offset + + input_df = pd.DataFrame( + [ + {"seq": "TGAAGTAGAGG" }, + {"seq": "AGAAGTTGTGG" }, + {"seq": "ATAAGAAGAGG" }, + ] + ) + + plugin = VariantPlugin() + plugin.set_parameter("column", "seq") + plugin.set_parameter("reference", "AGAAGTAGAGG") + plugin.set_parameter("outputs.0.offset", "1000") + plugin.set_parameter("outputs.0.seq_type", "g-") + plugin.set_parameter("outputs.0.output", "out") + + plugin.prepare(["test"], None) + + output_df = plugin.process_dataframe(input_df) + + output = output_df.to_records() + + assert output[0]["out"] == "g.1011T>A" + assert output[1]["out"] == "g.[1003T>A;1005T>A]" + assert output[2]["out"] == "g.[1006A>T;1010C>A]"