Merge pull request #27 from sanjaynagi/af1-support-anoprimer-19-12-22

af1 support
sanjaynagi · Aug 18, 2023 · 60d4593 · 60d4593
2 parents 18ec124 + d182462
commit 60d4593
Show file tree

Hide file tree

Showing 19 changed files with 491 additions and 3,153 deletions.
diff --git a/.github/workflows/github-action-AgamPrimer.yml b/.github/workflows/github-action-AgamPrimer.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-          python-version: ['3.8', '3.9', '3.10', '3.11']
+          python-version: ['3.8', '3.10', '3.11']
 
     steps:
 
@@ -39,8 +39,12 @@ jobs:
 
     - name: Run notebooks
       run: |
+        poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR_run.ipynb -k AgamPrimer -f tests/cDNA_Params_fun.json
         poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR_run.ipynb -k AgamPrimer -f tests/cDNA_Params.json
+        poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR2_run.ipynb -k AgamPrimer -f tests/cDNA_Params_2_fun.json
         poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR2_run.ipynb -k AgamPrimer -f tests/cDNA_Params_2.json
+        poetry run papermill notebooks/AgamPrimer-long.ipynb gDNA_run.ipynb -k AgamPrimer -f tests/gDNA_probe_Params_fun.json
         poetry run papermill notebooks/AgamPrimer-long.ipynb gDNA_run.ipynb -k AgamPrimer -f tests/gDNA_probe_Params.json
         poetry run papermill notebooks/AgamPrimer-long.ipynb probe_run.ipynb -k AgamPrimer -f tests/probe_Params.json
+        poetry run papermill notebooks/AgamPrimer-long.ipynb probe_run.ipynb -k AgamPrimer -f tests/probe_Params_fun.json
         poetry run papermill notebooks/AgamPrimer-short.ipynb short_run.ipynb -k AgamPrimer
diff --git a/AgamPrimer/AgamPrimer.py b/AgamPrimer/AgamPrimer.py
@@ -11,6 +11,19 @@
 from plotly.subplots import make_subplots
 
 ag3 = malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True)
+af1 = malariagen_data.Af1(url="gs://vo_afun_release/", pre=True)
+
+
+def retrieve_data_resource(species):
+    assert species in [
+        "gambiae_sl",
+        "funestus",
+    ], f"species {species} not recognised, please use 'gambiae_sl' or 'funestus'"
+    if species == "gambiae_sl":
+        data_resource = ag3
+    elif species == "funestus":
+        data_resource = af1
+    return data_resource
 
 
 def prepare_gDNA_sequence(
@@ -75,12 +88,16 @@ def prepare_gDNA_sequence(
     return (target_sequence, gdna_pos, seq_parameters)
 
 
-def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction):
+def prepare_cDNA_sequence(
+    species, transcript, genome_seq, assay_name, cDNA_exon_junction
+):
     """
     Extract exonic sequence for our transcript and record exon-exon junctions
 
     PARAMETERS
     ----------
+    species: str
+        The species to design primers for. Either 'gambiae_sl' or 'funestus'
     transcript : str
         The AGAP identifier of the transcript to design primers for
     genome_seq : dask.array.core.Array
@@ -92,7 +109,9 @@ def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction
     """
 
     # subset gff to your gene
-    gff = ag3.geneset()
+    data_resource = retrieve_data_resource(species)
+
+    gff = data_resource.geneset()
     gff = gff.query(f"type == 'exon' & Parent == '{transcript}'")
     # Get fasta sequence for each of our exons, and remember gDNA position
     seq = dict()
@@ -124,6 +143,7 @@ def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction
 
 
 def prepare_sequence(
+    species,
     target,
     assay_type,
     assay_name,
@@ -136,6 +156,8 @@ def prepare_sequence(
 
     PARAMETERS
     ----------
+    species: str
+        The species to design primers for. Either 'gambiae_sl' or 'funestus'
     target : str
         The target to design primers for. For gDNA primers, this should be a contig:position string,
         for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier.
@@ -164,6 +186,7 @@ def prepare_sequence(
     elif assay_type == "cDNA primers":
         # quantitative PCR
         target_sequence, gdna_pos, seq_parameters = prepare_cDNA_sequence(
+            species=species,
             transcript=target,
             genome_seq=genome_seq,
             assay_name=assay_name,
@@ -323,7 +346,8 @@ def primer3_to_pandas(primer_dict, assay_type):
     return primer_df
 
 
-def plot_primer_ag3_frequencies(
+def plot_primer_snp_frequencies(
+    species,
     primer_df,
     gdna_pos,
     contig,
@@ -338,6 +362,8 @@ def plot_primer_ag3_frequencies(
 
     PARAMETERS
     ----------
+    species: str
+        The species to design primers for. Either 'gambiae_sl' or 'funestus'
     primer_df : pandas.DataFrame
         A pandas DataFrame containing the primer sequences and their information, returned by primer3_to_pandas()
     gdna_pos : numpy.ndarray
@@ -371,7 +397,14 @@ def plot_primer_ag3_frequencies(
     # Loop through each primer pair and get the frequencies of alternate alleles, storing in dict
     for i in range(len(primer_df.columns)):
         res_dict[i] = _get_primer_alt_frequencies(
-            primer_df, gdna_pos, i, sample_sets, assay_type, contig, sample_query
+            species=species,
+            primer_df=primer_df,
+            gdna_pos=gdna_pos,
+            pair=i,
+            sample_sets=sample_sets,
+            assay_type=assay_type,
+            contig=contig,
+            sample_query=sample_query,
         )
 
     # Plot data with plotly
@@ -389,6 +422,7 @@ def plot_primer_ag3_frequencies(
 
 
 def plot_primer_locs(
+    species,
     primer_res_dict,
     primer_df,
     contig,
@@ -402,6 +436,8 @@ def plot_primer_locs(
 
     PARAMETERS
     ----------
+    species: str
+        The species to design primers for. Either 'gambiae_sl' or 'funestus'
     primer_res_dict : dict
         A dictionary containing the primer3 results, returned by primer3.designPrimers()
     primer_df : pandas.DataFrame
@@ -418,10 +454,15 @@ def plot_primer_locs(
     out_dir : str, optional
         The directory to write output files to. If not specified, outputs will not be written to file.
     """
+    data_resource = retrieve_data_resource(species=species)
     oligos, _ = _return_oligo_list(assay_type)
     assay_name = seq_parameters["SEQUENCE_ID"]
+    exon_id_col = (
+        "Name" if species == "gambiae_sl" else "ID"
+    )  # funestus gff3 uses ID for exon name, gambiae uses Name
+
     # Load geneset (gff)
-    gff = ag3.geneset()
+    gff = data_resource.geneset()
     if any(item in assay_type for item in ["gDNA", "probe"]):
         start = seq_parameters["GENOMIC_TARGET"] - 500
         end = seq_parameters["GENOMIC_TARGET"] + 500
@@ -459,7 +500,7 @@ def plot_primer_locs(
     # Add rectangles for exons one at a time
     for _, exon in locgff.iterrows():
         ex_start, ex_end = exon[["start", "end"]]
-        e_name = exon["Name"][-2:]
+        e_name = exon[exon_id_col][-2:]
         strand = exon["strand"]
         if strand == "+":
             rect = patches.Rectangle(
@@ -606,42 +647,52 @@ def gget_blat_genome(primer_df, assay_type, assembly="anoGam3"):
         print("No HITs found for these primer pairs")
 
 
-def check_and_split_target(target, assay_type):
+def check_and_split_target(species, target, assay_type):
+    data_resource = retrieve_data_resource(species=species)
+
     # split contig from target
-    if target.startswith("AGAP"):
+    if target.startswith("AGAP") or target.startswith("LOC"):
         assert (
             assay_type == "cDNA primers"
-        ), "an AGAP identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers."
-        gff = ag3.geneset()
+        ), "an AGAP/AFUN identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers."
+        gff = data_resource.geneset()
         assert (
             target in gff["ID"].to_list()
-        ), f"requested target {target} not in ag3 transcript set"
+        ), f"requested target {target} not in ag3/af1 transcript set"
         contig = gff.query(f"ID == '{target}'")["contig"].unique()[0]
         return (contig, target)
     else:
         assert isinstance(
             target, str
         ), "For genomic DNA the target should be a string, such as '2L:28545767'"
         contig, target = target.split(":")
-        assert contig in [
-            "2L",
-            "2R",
-            "3L",
-            "3R",
-            "X",
-        ], "target contig not recognised, should be 2L, 2R, 3L, 3R or X"
+        if species == "gambiae_sl":
+            assert contig in [
+                "2L",
+                "2R",
+                "3L",
+                "3R",
+                "X",
+            ], "target contig not recognised, should be 2L, 2R, 3L, 3R or X"
+        elif species == "funestus":
+            assert contig in [
+                "2RL",
+                "3RL",
+                "X",
+            ], "target contig not recognised, should be 2RL, 3RL or X"
         return (contig, int(target))
 
 
 def designPrimers(
+    species,
     assay_type,
     assay_name,
     min_amplicon_size,
     max_amplicon_size,
     n_primer_pairs,
     target,
     primer_parameters,
-    sample_sets,
+    sample_sets=None,
     sample_query=None,
     out_dir=None,
     cDNA_exon_junction=True,
@@ -651,6 +702,8 @@ def designPrimers(
 
     Parameters
     ----------
+    species: str
+        The species to design primers for. Either 'gambiae_sl' or 'funestus'
     assay_type: {'gDNA primers', 'cDNA primers', 'gDNA primers + probe', 'probe}, str
         The type of oligos we wish to design. If 'gDNA primers' or 'cDNA primers' are specified,
         then only primers will be designed. If 'gDNA primers + probe' is specified, then primers
@@ -692,9 +745,13 @@ def designPrimers(
             A pandas DataFrame containing the BLAT alignment information for the primers.
     """
 
-    # check target is valid for assay type and find contig
+    data_resource = retrieve_data_resource(species=species)
     oligos, _ = _return_oligo_list(assay_type)
-    contig, target = check_and_split_target(target=target, assay_type=assay_type)
+
+    # check target is valid for assay type and find contig
+    contig, target = check_and_split_target(
+        species=species, target=target, assay_type=assay_type
+    )
     amplicon_size_range = [[min_amplicon_size, max_amplicon_size]]
 
     # adds some necessary parameters depending on assay type
@@ -715,10 +772,11 @@ def designPrimers(
             generate_defaults=False,
         )
     # load genome sequence
-    genome_seq = ag3.genome_sequence(region=contig)
+    genome_seq = data_resource.genome_sequence(region=contig)
     print(f"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long")
 
     target_sequence, gdna_pos, seq_parameters = prepare_sequence(
+        species=species,
         target=target,
         assay_type=assay_type,
         assay_name=assay_name,
@@ -745,7 +803,8 @@ def designPrimers(
     primer_df = primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type)
 
     # plot frequencies of alleles in primer pairs
-    results_dict = plot_primer_ag3_frequencies(
+    results_dict = plot_primer_snp_frequencies(
+        species=species,
         primer_df=primer_df,
         gdna_pos=gdna_pos,
         contig=contig,
@@ -758,6 +817,7 @@ def designPrimers(
 
     # plot primer locations on genome
     plot_primer_locs(
+        species=species,
         primer_res_dict=results_dict,
         primer_df=primer_df,
         assay_type=assay_type,
@@ -784,16 +844,19 @@ def designPrimers(
         index=[f"primer_{o}" for o in oligos]
     )
 
-    # check primers for specificity against the genome and write to file
-    blat_df = gget_blat_genome(primer_df, assay_type, assembly="anoGam3")
-
-    # write primer3 and blat output to file
     if out_dir:
         primer_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.tsv", sep="\t")
         primer_df.to_excel(f"{out_dir}/{assay_name}.{assay_type}.xlsx")
-        blat_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.blat.tsv", sep="\t")
 
-    return (primer_df, blat_df)
+    if species == "gambiae_sl":
+        # check primers for specificity against the genome and write to file
+        blat_df = gget_blat_genome(primer_df, assay_type, assembly="anoGam3")
+        # write primer3 and blat output to file
+        if out_dir:
+            blat_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.blat.tsv", sep="\t")
+        return primer_df, blat_df
+    else:
+        return primer_df, "Cannot BLAT against funestus, gambiae_sl only"
 
 
 def extract_trailing_digits(string):
@@ -806,10 +869,14 @@ def extract_trailing_digits(string):
         return None
 
 
-def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=None):
+def _get_primer_arrays(
+    species, contig, gdna_pos, sample_sets, assay_type, sample_query=None
+):
+    data_resource = retrieve_data_resource(species=species)
+
     if any(item in assay_type for item in ["gDNA", "probe"]):
         span_str = f"{contig}:{gdna_pos.min()}-{gdna_pos.max()}"
-        snps = ag3.snp_calls(
+        snps = data_resource.snp_calls(
             region=span_str, sample_sets=sample_sets, sample_query=sample_query
         )  # get genotypes
         ref_alt_arr = snps["variant_allele"].compute().values
@@ -823,7 +890,7 @@ def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=N
         exon_spans = np.array(_consecutive(gdna_pos)) + 1
         for span in exon_spans:
             span_str = f"{contig}:{span[0]}-{span[1]}"
-            snps = ag3.snp_calls(
+            snps = data_resource.snp_calls(
                 region=span_str, sample_sets=sample_sets, sample_query=sample_query
             )  # get genotypes
             ref_alts = snps["variant_allele"]
@@ -842,7 +909,7 @@ def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=N
 
 
 def _get_primer_alt_frequencies(
-    primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query
+    species, primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query
 ):
     """
     Find the genomic locations of pairs of primers, and runs span_to_freq
@@ -851,6 +918,7 @@ def _get_primer_alt_frequencies(
 
     oligos, _ = _return_oligo_list(assay_type)
     base_freqs, ref_alt_arr, pos_arr = _get_primer_arrays(
+        species=species,
         contig=contig,
         gdna_pos=gdna_pos,
         sample_sets=sample_sets,

diff --git a/AgamPrimer/__pycache__/agamPrimer.cpython-37.pyc b/AgamPrimer/__pycache__/agamPrimer.cpython-37.pyc
diff --git a/AgamPrimer/agamPrimer.egg-info/PKG-INFO b/AgamPrimer/agamPrimer.egg-info/PKG-INFO
diff --git a/AgamPrimer/agamPrimer.egg-info/SOURCES.txt b/AgamPrimer/agamPrimer.egg-info/SOURCES.txt
diff --git a/AgamPrimer/agamPrimer.egg-info/dependency_links.txt b/AgamPrimer/agamPrimer.egg-info/dependency_links.txt
diff --git a/AgamPrimer/agamPrimer.egg-info/top_level.txt b/AgamPrimer/agamPrimer.egg-info/top_level.txt