From fa3fd505d69e7e2603f8d4c797d78e8114602e29 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Thu, 26 Sep 2024 15:51:36 +0100 Subject: [PATCH] precommit --- AnoPrimer/AnoPrimer.py | 1371 ---------------- AnoPrimer/__init__.py | 5 +- AnoPrimer/design.py | 488 ++++++ AnoPrimer/evaluate.py | 445 +++++ AnoPrimer/utils.py | 554 +++++++ notebooks/AnoPrimer-long.ipynb | 2702 +++++++++++++++---------------- notebooks/AnoPrimer-short.ipynb | 634 +------- pyproject.toml | 2 +- 8 files changed, 2859 insertions(+), 3342 deletions(-) delete mode 100644 AnoPrimer/AnoPrimer.py create mode 100644 AnoPrimer/design.py create mode 100644 AnoPrimer/evaluate.py create mode 100644 AnoPrimer/utils.py diff --git a/AnoPrimer/AnoPrimer.py b/AnoPrimer/AnoPrimer.py deleted file mode 100644 index 1f76574..0000000 --- a/AnoPrimer/AnoPrimer.py +++ /dev/null @@ -1,1371 +0,0 @@ -import allel -import gget -import malariagen_data -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import plotly.graph_objects as go -import primer3 -import seaborn as sns -from matplotlib import patches -from plotly.subplots import make_subplots - - -def retrieve_data_resource(species): - assert species in [ - "gambiae_sl", - "funestus", - ], f"species {species} not recognised, please use 'gambiae_sl' or 'funestus'" - if species == "gambiae_sl": - data_resource = malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True) - elif species == "funestus": - data_resource = malariagen_data.Af1(url="gs://vo_afun_release/", pre=True) - return data_resource - - -def prepare_gDNA_sequence( - target_loc, - amplicon_size_range, - genome_seq, - assay_name, - assay_type, - probe_exclude_region_size=20, -): - """ - Extracts sequence of interest from genome sequence - - PARAMETERS - ---------- - target_loc : int - The target location of the SNP in the genome sequence - amplicon_size_range : list - The minimum and maximum size of the amplicon to design primers for - genome_seq : dask.array.core.Array - The genome sequence from ag3.genome_sequence() - assay_name : str - The name of the assay - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - """ - target = int(target_loc) - # Set up range for the input sequence, we'll take the max range - # of the amplicon size and add that either side of the target SNP - amp_size_range = int(np.max(amplicon_size_range)) - start = target - amp_size_range - end = target + amp_size_range - # join array into be one character string, and store the positions - # of these sequences for later - target_sequence = "".join(genome_seq[start : end - 1].compute().astype(str)) - gdna_pos = np.arange(start, end).astype(int) + 1 - print(f"The target sequence is {len(target_sequence)} bases long") - - # We need the target snp indices within the region of interest - target_loc_primer3 = int(np.where(gdna_pos == target)[0]) - target_loc_primer3 = [target_loc_primer3, 10] - print(f"the target snp is {target_loc_primer3[0]} bp into our target sequence") - - seq_parameters = { - "SEQUENCE_ID": assay_name, - "SEQUENCE_TEMPLATE": target_sequence, - "SEQUENCE_TARGET": target_loc_primer3, - "GENOMIC_TARGET": target, - } - - if "probe" in assay_type: - seq_parameters["SEQUENCE_INTERNAL_EXCLUDED_REGION"] = [ - [1, target_loc_primer3[0] - probe_exclude_region_size], - [ - target_loc_primer3[0] + probe_exclude_region_size, - len(target_sequence) - - (target_loc_primer3[0] + probe_exclude_region_size), - ], - ] - - return (target_sequence, gdna_pos, seq_parameters) - - -def prepare_cDNA_sequence( - species, transcript, genome_seq, assay_name, cDNA_exon_junction -): - """ - Extract exonic sequence for our transcript and record exon-exon junctions - - PARAMETERS - ---------- - species: str - The species to design primers for. Either 'gambiae_sl' or 'funestus' - transcript : str - The AGAP identifier of the transcript to design primers for - genome_seq : dask.array.core.Array - The genome sequence from ag3.genome_sequence() - assay_name : str - The name of the assay - cDNA_exon_junction : bool - If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend for qPCR purposes. - """ - - # subset gff to your gene - data_resource = retrieve_data_resource(species) - - gff = data_resource.geneset() - gff = gff.query(f"type == 'exon' & Parent == '{transcript}'") - # Get fasta sequence for each of our exons, and remember gDNA position - seq = dict() - gdna_pos = dict() - for i, exon in enumerate(zip(gff.start, gff.end)): - seq[i] = "".join(np.array(genome_seq)[exon[0] - 1 : exon[1]].astype(str)) - gdna_pos[i] = np.arange(exon[0] - 1, exon[1]) - # concatenate exon FASTAs into one big sequence - gdna_pos = np.concatenate(list(gdna_pos.values())) - target_mRNA_seq = "".join(seq.values()) - - # Get list of exon junction positions - exon_junctions = np.array(np.cumsum(gff.end - gff.start))[:-1] - exon_sizes = np.array(gff.end - gff.start)[:-1] - exon_junctions_pos = [ex + gff.iloc[i, 3] for i, ex in enumerate(exon_sizes)] - print(f"Exon junctions for {transcript}:", exon_junctions, exon_junctions_pos, "\n") - seq_parameters = { - "SEQUENCE_ID": assay_name, - "SEQUENCE_TEMPLATE": target_mRNA_seq, - "GENOMIC_TARGET": transcript, - } - - if cDNA_exon_junction: - seq_parameters["SEQUENCE_OVERLAP_JUNCTION_LIST"] = list( - map(int, exon_junctions) - ) - - return (target_mRNA_seq, gdna_pos, seq_parameters) - - -def prepare_sequence( - species, - target, - assay_type, - assay_name, - genome_seq, - amplicon_size_range, - cDNA_exon_junction=True, -): - """ - Prepare the sequence for primer3, depending on cDNA or gDNA input type - - PARAMETERS - ---------- - species: str - The species to design primers for. Either 'gambiae_sl' or 'funestus' - target : str - The target to design primers for. For gDNA primers, this should be a contig:position string, - for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier. - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - assay_name : str - The name of the assay - genome_seq : dask.array.core.Array - The genome sequence from ag3.genome_sequence() - amplicon_size_range : list - The minimum and maximum size of the amplicon to design primers for - cDNA_exon_junction : bool - If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend for qPCR purposes. - """ - - if any(item in assay_type for item in ["gDNA", "probe"]): - # genomic DNA - target_sequence, gdna_pos, seq_parameters = prepare_gDNA_sequence( - target_loc=target, - amplicon_size_range=amplicon_size_range, - genome_seq=genome_seq, - assay_name=assay_name, - assay_type=assay_type, - ) - elif assay_type == "cDNA primers": - # quantitative PCR - target_sequence, gdna_pos, seq_parameters = prepare_cDNA_sequence( - species=species, - transcript=target, - genome_seq=genome_seq, - assay_name=assay_name, - cDNA_exon_junction=cDNA_exon_junction, - ) - - return (target_sequence, gdna_pos, seq_parameters) - - -def primer_params( - assay_type, - primer_parameters=None, - n_primer_pairs=None, - amplicon_size_range=None, - generate_defaults=False, -): - """ - adds necessary parameters depending on assay_type, or can - generate the default parameters - - PARAMETERS - ---------- - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - primer_parameters : dict - A dictionary of primer3 parameters to use for primer design. If 'default' is specified, default - primer3 parameters will be generated. - n_primer_pairs : int - The number of primer pairs to design. - amplicon_size_range : list - The minimum and maximum size of the amplicon to design primers for - generate_defaults : bool - If True, default primer3 parameters will be generated. - """ - - if generate_defaults: - primer_parameters = { - "PRIMER_OPT_SIZE": 20, - "PRIMER_TASK": "generic", - "PRIMER_MIN_SIZE": 17, - "PRIMER_MAX_SIZE": 24, - "PRIMER_OPT_TM": 60.0, - "PRIMER_MIN_TM": 57.0, - "PRIMER_MAX_TM": 63.0, - "PRIMER_MIN_GC": 30.0, - "PRIMER_MAX_GC": 70.0, - # this parameter is the minimum distance between successive pairs. - # If 1, it means successive primer pairs could be identical bar one base shift - "PRIMER_MIN_THREE_PRIME_DISTANCE": 3, - # Probe size preferences if selected, otherwise ignored - "PRIMER_INTERNAL_OPT_SIZE": 16, - "PRIMER_INTERNAL_MIN_SIZE": 10, - "PRIMER_INTERNAL_MAX_SIZE": 22, - # Probe Tm considerations are quite relaxed, assumed that LNAs will be used - # later to affect TM - "PRIMER_INTERNAL_MIN_TM": 45, - "PRIMER_INTERNAL_MAX_TM": 65, - # Extra primer3 parameters can go here - # In the same format as above - } - - primer_parameters["PRIMER_NUM_RETURN"] = n_primer_pairs - primer_parameters["PRIMER_PRODUCT_SIZE_RANGE"] = amplicon_size_range - - if assay_type == "gDNA primers + probe": - primer_parameters["PRIMER_PICK_INTERNAL_OLIGO"] = 1 - primer_parameters["PRIMER_PICK_RIGHT_PRIMER"] = 1 - primer_parameters["PRIMER_PICK_LEFT_PRIMER"] = 1 - elif assay_type == "probe": - primer_parameters["PRIMER_PICK_INTERNAL_OLIGO"] = 1 - primer_parameters["PRIMER_PICK_RIGHT_PRIMER"] = 0 - primer_parameters["PRIMER_PICK_LEFT_PRIMER"] = 0 - return primer_parameters - - -def primer3_run_statistics(primer_dict, assay_type): - """ - Prints out primer3 run statistics from the primer3 results dictionary. - - PARAMETERS - ---------- - primer_dict : dict - The primer3 results dictionary returned by primer3.designPrimers() - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - """ - _, row_start = _return_oligo_list(assay_type) - primer_dict = _convert_results_dict_naming(primer_dict) - # Convert the dict into a pandas dataframe - primer_df = pd.DataFrame.from_dict(primer_dict.items()) - # Rename the columns - primer_df = primer_df.rename(columns={0: "parameter", 1: "value"}) - explanations_df = primer_df.iloc[ - :row_start, : - ] # Take the first 7 rows which are general - # Loop through each row and print information - for ( - idx, - row, - ) in explanations_df.iterrows(): - print(row["parameter"], " : ", row["value"], "\n") - - -def primer3_to_pandas(primer_dict, assay_type): - """ - Convert primer3 results to pandas dataframe - - PARAMETERS - ---------- - primer_dict : dict - The primer3 results dictionary returned by primer3.designPrimers() - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - - RETURNS - ------- - primer_df : pandas.DataFrame - A pandas DataFrame containing the primer sequences and their information. - """ - oligos, row_start = _return_oligo_list(assay_type) - # Convert the dict into a pandas dataframe - primer_dict = _convert_results_dict_naming(primer_dict) - primer_df = pd.DataFrame.from_dict(primer_dict.items()) - # Rename the columns - primer_df = primer_df.rename(columns={0: "parameter", 1: "value"}) - # Create a column which is primer pair #, and a column for primer - # parameter which does not contain primer pair # - primer_df = primer_df.iloc[row_start:, :].copy() - primer_df["primer_pair"] = primer_df["parameter"].str.extract("([0-9][0-9]|[0-9])") - primer_df["parameter"] = primer_df["parameter"].str.replace( - "(_[0-9][0-9]|_[0-9])", "", regex=True - ) - - # Put the different primer pairs in different columns - primer_df = primer_df.pivot( - index="parameter", columns="primer_pair", values="value" - ) - - # Get a list of the rows we need - primer_span = [f"primer_{oligo}" for oligo in oligos] - required_info = ["sequence", "TM", "GC_PERCENT"] - required_info = [ - p + "_" + y for y in required_info for p in primer_span - ] + primer_span - required_info = ( - required_info + ["primer_PAIR_PRODUCT_SIZE"] - if assay_type != "probe" - else required_info - ) - required_info = [string.lower() for string in required_info] - - # Subset data frame - primer_df = primer_df.loc[required_info, np.arange(primer_df.shape[1]).astype(str)] - return primer_df - - -def plot_primer_snp_frequencies( - species, - primer_df, - gdna_pos, - contig, - sample_sets, - assay_type, - seq_parameters, - out_dir=None, - sample_query=None, -): - """ - Loop through n primer pairs, retrieving frequency data and plot allele frequencies - - PARAMETERS - ---------- - species: str - The species to design primers for. Either 'gambiae_sl' or 'funestus' - primer_df : pandas.DataFrame - A pandas DataFrame containing the primer sequences and their information, returned by primer3_to_pandas() - gdna_pos : numpy.ndarray - The genomic positions of the target sequence - contig : str - The contig of the target sequence, e.g. '2L' - sample_sets : str or list of str, optional - Can be a sample set identifier (e.g., "AG1000G-AO") or a list of - sample set identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a - release identifier (e.g., "3.0") or a list of release identifiers. - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - seq_parameters : dict - A dictionary of parameters for primer3, returned by prepare_sequence() - out_dir : str, optional - The directory to write output files to. If not specified, outputs will not be written to file. - sample_query: str, optional - A pandas query string which will be evaluated against the sample - metadata e.g., "taxon == 'coluzzii' and country == 'Burkina Faso'" to subset - the genotypes to a specific set of samples. - """ - - if sample_query is not None: - print(f"Subsetting allele frequencies to {sample_query}") - - name = seq_parameters["SEQUENCE_ID"] - target = seq_parameters["GENOMIC_TARGET"] - - res_dict = {} - # Loop through each primer pair and get the frequencies of alternate alleles, storing in dict - for i in range(len(primer_df.columns)): - res_dict[i] = _get_primer_alt_frequencies( - species=species, - primer_df=primer_df, - gdna_pos=gdna_pos, - pair=i, - sample_sets=sample_sets, - assay_type=assay_type, - contig=contig, - sample_query=sample_query, - ) - - # Plot data with plotly - _plotly_primers( - primer_df=primer_df, - res_dict=res_dict, - name=name, - assay_type=assay_type, - sample_sets=sample_sets, - target=target, - out_dir=out_dir, - ) - - return res_dict - - -def plot_primer_locs( - species, - primer_res_dict, - primer_df, - contig, - seq_parameters, - assay_type, - legend_loc="best", - out_dir=None, -): - """ - Plot the position of the primer sets in relation to any nearby exons - - PARAMETERS - ---------- - species: str - The species to design primers for. Either 'gambiae_sl' or 'funestus' - primer_res_dict : dict - A dictionary containing the primer3 results, returned by primer3.designPrimers() - primer_df : pandas.DataFrame - A pandas DataFrame containing the primer sequences and their information, returned by primer3_to_pandas() - contig : str - The contig of the target sequence, e.g. '2L' - seq_parameters : dict - A dictionary of parameters for primer3, returned by prepare_sequence() - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - legend_loc : str, optional - The location of the legend in the plot. Can be 'best', 'upper right', 'upper left', 'lower left', 'lower right', 'right', 'center left', 'center right', 'lower center', 'upper center' or 'center'. - out_dir : str, optional - The directory to write output files to. If not specified, outputs will not be written to file. - """ - data_resource = retrieve_data_resource(species=species) - oligos, _ = _return_oligo_list(assay_type) - assay_name = seq_parameters["SEQUENCE_ID"] - exon_id_col = ( - "Name" if species == "gambiae_sl" else "ID" - ) # funestus gff3 uses ID for exon name, gambiae uses Name - - # Load geneset (gff) - gff = data_resource.geneset() - if any(item in assay_type for item in ["gDNA", "probe"]): - start = seq_parameters["GENOMIC_TARGET"] - 500 - end = seq_parameters["GENOMIC_TARGET"] + 500 - locgff, min_, max_, genegff = _get_gDNA_locs(gff, contig, start, end) - min_ = np.min([min_, start]) - max_ = np.max([max_, end]) - elif assay_type == "cDNA primers": - locgff, min_, max_, genegff = _get_qPCR_locs( - gff, contig, seq_parameters["GENOMIC_TARGET"] - ) - - if locgff.empty: - print("No exons in close proximity for loc plot") - return - - fig, ax = plt.subplots(1, 1, figsize=[16, 4]) - # configure axes - if min_ in ["inf", "NaN"]: - min_ = start - if max_ in ["inf", "NaN"]: - max_ = end - - ax.set_xlim(min_, max_) - ax.set_ylim(-0.5, 1.5) - ax.ticklabel_format(useOffset=False) - ax.axhline(0.5, color="k", linewidth=0.7, linestyle="--") - sns.despine(ax=ax, left=True, bottom=False) - # ax.set_yticks(ticks=[0.2,1.2], size=20)#labels=['- ', '+'] - ax.tick_params( - top=False, left=False, right=False, labelleft=False, labelbottom=True - ) - ax.tick_params(axis="x", which="major", labelsize=13) - ax.set_ylabel("Exons") - ax.set_xlabel(f"Chromosome {contig} position", fontdict={"fontsize": 14}) - # Add rectangles for exons one at a time - for _, exon in locgff.iterrows(): - ex_start, ex_end = exon[["start", "end"]] - e_name = exon[exon_id_col][-2:] - strand = exon["strand"] - if strand == "+": - rect = patches.Rectangle( - (ex_start, 0.55), - ex_end - ex_start, - 0.3, - linewidth=3, - edgecolor="none", - facecolor="grey", - alpha=0.9, - ) - ax.text((ex_start + ex_end) / 2, 0.65, e_name) - else: - rect = patches.Rectangle( - (ex_start, 0.45), - ex_end - ex_start, - -0.3, - linewidth=3, - edgecolor="none", - facecolor="grey", - alpha=0.9, - ) - ax.text((ex_start + ex_end) / 2, 0.3, e_name) - ax.add_patch(rect) - - tot_genes = genegff.shape[0] - for i, gene in genegff.reset_index(drop=True).iterrows(): - start, end = gene[["start", "end"]] - diff = np.diff([min_, max_]) - interval = diff / tot_genes + 1 - name_point = min_ + (interval * i + 1) - strand = gene["strand"] - if strand == "+": - rect = patches.Rectangle( - (start, 0.55), - end - start, - 0.3, - linewidth=3, - edgecolor="black", - facecolor="none", - ) - ax.text( - name_point, 0.95, s=gene["ID"], fontdict={"fontsize": 12}, weight="bold" - ) - else: - rect = patches.Rectangle( - (start, 0.45), - end - start, - -0.3, - linewidth=3, - edgecolor="black", - facecolor="none", - ) - ax.text( - name_point, -0.3, s=gene["ID"], fontdict={"fontsize": 12}, weight="bold" - ) - ax.add_patch(rect) - - pal = sns.color_palette("Set2", len(primer_df.columns)) - handles, labels = ax.get_legend_handles_labels() - for pair in primer_df: - pair = int(pair) - for oligo in oligos: - lower, upper = ( - primer_res_dict[pair][oligo]["position"].min(), - primer_res_dict[pair][oligo]["position"].max(), - ) - - if oligo == "forward": - plt.arrow( - lower, - 0.8 + (2 / (10 - (pair))), - upper - lower, - 0, - width=0.03, - length_includes_head=True, - color=pal[pair], - ) - elif oligo == "reverse": - plt.arrow( - upper, - 0.8 + (2 / (10 - (pair))), - lower - upper, - 0, - width=0.03, - length_includes_head=True, - color=pal[pair], - ) - elif oligo == "probe": - ax.axhline(y=0.8 + (2 / (10 - (pair))), xmin=lower, xmax=upper) - line = plt.Line2D( - (lower, upper), - (0.8 + (2 / (10 - (pair))), 0.8 + (2 / (10 - (pair)))), - lw=2.5, - color=pal[pair], - ) - ax.add_line(line) - # manually define a new patch - patch = patches.Patch(color=pal[pair], label=f"pair {pair}") - # handles is a list, so append manual patch - handles.append(patch) - # plot the legend - plt.legend(handles=handles, loc=legend_loc) - if out_dir: - fig.savefig( - f"{out_dir}/{assay_name}_primer_locs.png", dpi=300, bbox_inches="tight" - ) - - -def gget_blat_genome(primer_df, assay_type, assembly="anoGam3"): - """ - Aligns primers to the AgamP3 genome with BLAT. - - PARAMETERS - ---------- - primer_df : pandas.DataFrame - A pandas DataFrame containing the primer sequences and their information, returned by primer3_to_pandas() - assay_type : str - The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' - or 'probe' - assembly : str, optional - The genome assembly to use with Blat. 'anoGam3' is Anopheles gambiae. Please see the gget documentation for more information. - """ - oligos, _ = _return_oligo_list(assay_type=assay_type) - - pair_dict = {} - for primer_pair in primer_df: - oligo_list = [] - for oligo in oligos: - seq = primer_df[primer_pair].loc[f"primer_{oligo}_sequence"] - blat_df = gget.blat(sequence=seq, seqtype="DNA", assembly=assembly) - if blat_df is None: - print(f"No hit for {oligo} - pair {primer_pair}") - continue - blat_df.loc[:, "primer"] = f"{oligo}_{primer_pair}" - oligo_list.append(blat_df.set_index("primer")) - - if oligo_list: - pair_dict[primer_pair] = pd.concat(oligo_list) - elif not oligo_list: - continue - - if pair_dict: - return pd.concat(pair_dict) - else: - print("No HITs found for these primer pairs") - - -def check_and_split_target(species, target, assay_type): - data_resource = retrieve_data_resource(species=species) - - # split contig from target - if target.startswith("AGAP") or target.startswith("LOC"): - assert ( - assay_type == "cDNA primers" - ), "an AGAP/AFUN identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers." - gff = data_resource.geneset() - assert ( - target in gff["ID"].to_list() - ), f"requested target {target} not in ag3/af1 transcript set" - contig = gff.query(f"ID == '{target}'")["contig"].unique()[0] - return (contig, target) - else: - assert isinstance( - target, str - ), "For genomic DNA the target should be a string, such as '2L:28545767'" - contig, target = target.split(":") - if species == "gambiae_sl": - assert contig in [ - "2L", - "2R", - "3L", - "3R", - "X", - "2RL", - "3RL", - ], "target contig not recognised, should be 2L, 2R, 3L, 3R, 2RL, 3RL, X" - elif species == "funestus": - assert contig in [ - "2RL", - "3RL", - "X", - ], "target contig not recognised, should be 2RL, 3RL or X" - return (contig, int(target)) - - -def designPrimers( - species, - assay_type, - assay_name, - min_amplicon_size, - max_amplicon_size, - n_primer_pairs, - target, - primer_parameters, - sample_sets=None, - sample_query=None, - out_dir=None, - cDNA_exon_junction=True, -): - """ - Run whole AnoPrimer workflow to design primers/probes with in one function - - Parameters - ---------- - species: str - The species to design primers for. Either 'gambiae_sl' or 'funestus' - assay_type: {'gDNA primers', 'cDNA primers', 'gDNA primers + probe', 'probe}, str - The type of oligos we wish to design. If 'gDNA primers' or 'cDNA primers' are specified, - then only primers will be designed. If 'gDNA primers + probe' is specified, then primers - and a probe will be designed. If 'probe' is specified, then only an internal probe will - be designed. - assay_name : str - A name to give the assay, used for naming output files. - min_amplicon_size : int - The minimum size of the amplicon we wish to design primers for. - max_amplicon_size : int - The maximum size of the amplicon we wish to design primers for. cDNA primers for gene - expression assays should be designed with a max amplicon size of ~120. - n_primer_pairs : int - The number of primer pairs to design. - target : str - The target to design primers for. For gDNA primers, this should be a contig:position string, - for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier. - primer_parameters : dict or 'default' - A dictionary of primer3 parameters to use for primer design. If 'default' is specified, default - primer3 parameters will be generated. - sample_sets : str or list of str, optional - Can be a sample set identifier (e.g., "AG1000G-AO") or a list of - sample set identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a - release identifier (e.g., "3.0") or a list of release identifiers. - sample_query: str, optional - A pandas query string which will be evaluated against the sample - metadata e.g., "taxon == 'coluzzii' and country == 'Burkina Faso'". - out_dir : str, optional - The directory to write output files to. If not specified, outputs will not be written to file. - cDNA_exon_junction : bool, optional - If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend - that this is set to True. In the case of gDNA primers, this parameter is ignored. - - Returns - ------- - primer_df : pandas.DataFrame - A pandas DataFrame containing the primer sequences and their information. - blat_df : pandas.DataFrame - A pandas DataFrame containing the BLAT alignment information for the primers. - """ - - data_resource = retrieve_data_resource(species=species) - oligos, _ = _return_oligo_list(assay_type) - - # check target is valid for assay type and find contig - contig, target = check_and_split_target( - species=species, target=target, assay_type=assay_type - ) - amplicon_size_range = [[min_amplicon_size, max_amplicon_size]] - - # adds some necessary parameters depending on assay type - if primer_parameters == "default": - primer_parameters = primer_params( - primer_parameters=None, - assay_type=assay_type, - n_primer_pairs=n_primer_pairs, - amplicon_size_range=amplicon_size_range, - generate_defaults=True, - ) - else: - primer_parameters = primer_params( - primer_parameters=primer_parameters, - assay_type=assay_type, - n_primer_pairs=n_primer_pairs, - amplicon_size_range=amplicon_size_range, - generate_defaults=False, - ) - # load genome sequence - genome_seq = data_resource.genome_sequence(region=contig) - print(f"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long") - - target_sequence, gdna_pos, seq_parameters = prepare_sequence( - species=species, - target=target, - assay_type=assay_type, - assay_name=assay_name, - genome_seq=genome_seq, - amplicon_size_range=amplicon_size_range, - cDNA_exon_junction=cDNA_exon_junction, - ) - - # run primer3 - primer_dict = primer3.designPrimers( - seq_args=seq_parameters, global_args=primer_parameters - ) - - if assay_type != "probe": - # check if primer3 has returned any primers - if int(extract_trailing_digits(primer_dict["PRIMER_PAIR_EXPLAIN"])) == 0: - print( - f"No primers found for {assay_name}. For cDNA primers, this is more likely to occur if the target contains only one exon-exon junction. see troubleshooting below for more information. We suggest relaxing the primer parameters \n" - ) - print(primer3_run_statistics(primer_dict, assay_type)) - return (None, None) - - # AnoPrimer.primer3_run_statistics(primer_dict, assay_type) - primer_df = primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type) - - # plot frequencies of alleles in primer pairs - results_dict = plot_primer_snp_frequencies( - species=species, - primer_df=primer_df, - gdna_pos=gdna_pos, - contig=contig, - sample_sets=sample_sets, - sample_query=sample_query, - assay_type=assay_type, - seq_parameters=seq_parameters, - out_dir=out_dir, - ) - - # plot primer locations on genome - plot_primer_locs( - species=species, - primer_res_dict=results_dict, - primer_df=primer_df, - assay_type=assay_type, - contig=contig, - seq_parameters=seq_parameters, - legend_loc="lower left", - out_dir=out_dir, - ) - - # add genomic span to primer_df - spans = [] - for oligo in oligos: - for i in range(len(primer_df.columns)): - start = int(results_dict[i][oligo]["position"].min()) - end = int(results_dict[i][oligo]["position"].max()) - span = f"{contig}:{start}-{end}" - spans.append(span) - - spans = np.array(spans).reshape(len(oligos), len(primer_df.columns)) - spans_df = pd.DataFrame(spans) - spans_df.index = [f"{o}_genomic_span" for o in oligos] - spans_df.columns = primer_df.columns - primer_df = pd.concat([primer_df, spans_df], axis=0).drop( - index=[f"primer_{o}" for o in oligos] - ) - - if out_dir: - primer_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.tsv", sep="\t") - primer_df.to_excel(f"{out_dir}/{assay_name}.{assay_type}.xlsx") - - if species == "gambiae_sl": - # check primers for specificity against the genome and write to file - blat_df = gget_blat_genome(primer_df, assay_type, assembly="anoGam3") - # write primer3 and blat output to file - if out_dir: - blat_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.blat.tsv", sep="\t") - return primer_df, blat_df - else: - return primer_df, "Cannot BLAT against funestus, gambiae_sl only" - - -def extract_trailing_digits(string): - import re - - match = re.search(r"\d+$", string) - if match: - return match.group(0) - else: - return None - - -def _get_primer_arrays( - species, contig, gdna_pos, sample_sets, assay_type, sample_query=None -): - data_resource = retrieve_data_resource(species=species) - - if any(item in assay_type for item in ["gDNA", "probe"]): - span_str = f"{contig}:{gdna_pos.min()}-{gdna_pos.max()}" - snps = data_resource.snp_calls( - region=span_str, sample_sets=sample_sets, sample_query=sample_query - ) # get genotypes - ref_alt_arr = snps["variant_allele"].compute().values - geno = snps["call_genotype"] - freq_arr = allel.GenotypeArray(geno).count_alleles().to_frequencies() - pos_arr = gdna_pos - elif assay_type == "cDNA primers": - freq_arr = [] - ref_alt_arr = [] - pos_arr = np.array([]) - exon_spans = np.array(_consecutive(gdna_pos)) + 1 - for span in exon_spans: - span_str = f"{contig}:{span[0]}-{span[1]}" - snps = data_resource.snp_calls( - region=span_str, sample_sets=sample_sets, sample_query=sample_query - ) # get genotypes - ref_alts = snps["variant_allele"] - geno = snps["call_genotype"] - freqs = ( - allel.GenotypeArray(geno).count_alleles().to_frequencies() - ) # calculate allele frequencies - freqs = _addZeroCols(freqs) - freq_arr.append(freqs) - ref_alt_arr.append(ref_alts) - pos_arr = np.append(pos_arr, np.arange(span[0], span[1] + 1).astype(int)) - freq_arr = np.concatenate(freq_arr) - ref_alt_arr = np.concatenate(ref_alt_arr) - - return (freq_arr, ref_alt_arr.astype("U13"), pos_arr) - - -def _get_primer_alt_frequencies( - species, primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query -): - """ - Find the genomic locations of pairs of primers, and runs span_to_freq - to get allele frequencies at those locations - """ - - oligos, _ = _return_oligo_list(assay_type) - base_freqs, ref_alt_arr, pos_arr = _get_primer_arrays( - species=species, - contig=contig, - gdna_pos=gdna_pos, - sample_sets=sample_sets, - assay_type=assay_type, - sample_query=sample_query, - ) - - freq_arr = base_freqs[:, 1:].sum(axis=1) - - di = {} - for oligo in oligos: - primer_loc = primer_df.loc[f"primer_{oligo}", str(pair)][0] - primer_loc = primer_loc + 1 if oligo == "reverse" else primer_loc - primer_size = primer_df.loc[f"primer_{oligo}", str(pair)][1] - if oligo in ["forward", "probe"]: - freq = freq_arr[primer_loc : primer_loc + primer_size] - base_freqs_arr = base_freqs[primer_loc : primer_loc + primer_size, :] - ref = ref_alt_arr[primer_loc : primer_loc + primer_size, 0] - ref_alt = ref_alt_arr[primer_loc : primer_loc + primer_size, :] - pos = pos_arr[primer_loc : primer_loc + primer_size] - elif oligo == "reverse": - freq = np.flip(freq_arr[primer_loc - primer_size : primer_loc]) - base_freqs_arr = base_freqs[primer_loc - primer_size : primer_loc, :] - base_freqs_arr = np.flip(base_freqs_arr, axis=0) - ref = ref_alt_arr[primer_loc - primer_size : primer_loc, 0] - ref = np.array(list(_rev_complement("".join(ref))), dtype=str) - ref_alt = ref_alt_arr[primer_loc - primer_size : primer_loc, :] - ref_alt = _complement(np.flip(ref_alt, axis=0)) - pos = np.flip(pos_arr[primer_loc - primer_size : primer_loc]) - - df = pd.DataFrame( - {"position": pos, "base": ref, "alt_frequency": freq} - ) # Make dataframe for plotting - df["base_pos"] = df["base"] + "_" + df["position"].astype(str) - assert df.shape[0] == primer_size, "Wrong size primers" - - freq_df = _get_base_freqs(_addZeroCols(base_freqs_arr), ref_alt).filter( - like="freq" - ) - df = pd.concat([df, freq_df], axis=1) - di[oligo] = df - return di - - -def _plotly_primers( - primer_df, - res_dict, - name, - assay_type, - sample_sets, - target, - out_dir=None, -): - oligos, _ = _return_oligo_list(assay_type) - if len(oligos) == 2: - plt_title = ["Forward primer", "Reverse primer"] - elif len(oligos) == 3: - plt_title = ["Forward primer", "Reverse primer", "Probe"] - elif len(oligos) == 1: - plt_title = ["Probe"] - - title_list = [] - for pair in primer_df: - for oligo in plt_title: - title_list.append(f"{oligo} {pair}") - - hover_template = "
".join( - [ - "Base / Position: %{customdata[4]}", - "Total Alternate freq: %{y}", - "A_freq: %{customdata[0]}", - "C_freq: %{customdata[1]}", - "G_freq: %{customdata[2]}", - "T_freq: %{customdata[3]}", - ] - ) - - fig = make_subplots( - rows=len(primer_df.columns), - cols=len(oligos), - subplot_titles=title_list, - horizontal_spacing=0.03, - vertical_spacing=0.08, - ) - fig.update_annotations(font_size=13) - for idx, oligo in enumerate(oligos): - idx = idx + 1 - for i in primer_df: - i = int(i) - row_i = i + 1 - - color = [ - -1 if v == 0 else 1 if v > 0 else 0 - for v in res_dict[i][oligo]["alt_frequency"] - ] - colorscale = [[0, "lightgray"], [0.5, "lightgray"], [1, "dodgerblue"]] - - tm = np.round(primer_df.loc[f"primer_{oligo}_tm", str(i)], 2) - gc = np.round(primer_df.loc[f"primer_{oligo}_gc_percent", str(i)], 2) - span = f"{int(res_dict[i][oligo]['position'].min())}-{int(res_dict[i][oligo]['position'].max())}" - # Write text to plot for Tm, GC, span, and 3/5' - - for col in res_dict[i][oligo].columns: - if col.endswith("freq"): - res_dict[i][oligo][col] = np.round(res_dict[i][oligo][col], 2) - - fig.add_trace( - go.Scatter( - x=res_dict[i][oligo]["base_pos"], - y=res_dict[i][oligo]["alt_frequency"], - customdata=res_dict[i][oligo][ - ["A_freq", "C_freq", "G_freq", "T_freq", "base_pos"] - ], - hovertemplate=hover_template, - mode="markers", - marker=dict( - size=14, - color=color, - colorscale=colorscale, - line=dict(width=2, color="black"), - ), - marker_symbol="circle", - ), - row=row_i, - col=idx, - ) - fig.add_annotation( - row=row_i, - col=idx, - x=res_dict[i][oligo]["base_pos"][0], - y=0.8, - text="5'", - showarrow=False, - ) - fig.add_annotation( - row=row_i, - col=idx, - x=res_dict[i][oligo]["base_pos"].to_numpy()[-1], - y=0.8, - text="3'", - showarrow=False, - ) - fig.add_annotation( - row=row_i, - col=idx, - x=res_dict[i][oligo]["base_pos"].to_numpy()[4], - y=0.92, - text=span, - showarrow=False, - ) - fig.add_annotation( - row=row_i, - col=idx, - x=res_dict[i][oligo]["base_pos"].to_numpy()[-7], - y=0.92, - text=f"GC={gc}", - showarrow=False, - ) - fig.add_annotation( - row=row_i, - col=idx, - x=res_dict[i][oligo]["base_pos"].to_numpy()[-3], - y=0.92, - text=f"TM={tm}", - showarrow=False, - ) - - fig.update_xaxes( - row=row_i, - col=idx, - tickmode="array", - tickvals=res_dict[i][oligo]["base_pos"], - ticktext=res_dict[i][oligo]["base"], - tickangle=0, - mirror=True, - ) - if idx > 1: - fig.update_yaxes( - row=row_i, - col=idx, - range=[0, 1], - tickvals=np.arange(0, 1, 0.2), - showticklabels=False, - mirror=True, - ) - else: - fig.update_yaxes( - row=row_i, - col=idx, - tickvals=np.arange(0, 1, 0.2), - range=[0, 1], - mirror=True, - ) - - if ((row_i % 2) == 0) & (idx == 1): - fig.update_yaxes(row=row_i, col=idx, title="Alternate allele frequency") - - if any(item in assay_type for item in ["gDNA"]): - title_text = f"{name} primer pairs | {sample_sets} | target {target} bp" - elif assay_type == "probe": - title_text = f"{name} probe | {sample_sets} | target {target} bp" - elif assay_type == "cDNA primers": - title_text = f"{name} primer pairs | {sample_sets} | target {target}" - - # fig.update_traces(customdata=customdata, hovertemplate=hovertemplate) - fig.update_layout( - height=200 * len(primer_df.columns), - width=500 * len(oligos), - title_text=title_text, - title_x=0.5, - template="simple_white", - showlegend=False, - ) - if out_dir: - fig.write_html(f"{name}_{assay_type}.html") - fig.write_image(f"{name}_{assay_type}.pdf") - fig.show() - - -def _get_gDNA_locs(gff, contig, start, end): - locgff = gff.query( - f"contig == '{contig}' & type == 'exon' & start < {end} & end > {start}" - ) - min_ = locgff.start.min() - 100 - max_ = locgff.end.max() + 100 - genegff = gff.query( - f"contig == '{contig}' & type == 'gene' & start < {end} & end > {start}" - ) - return (locgff, min_, max_, genegff) - - -def _get_qPCR_locs(gff, contig, transcript): - # Load geneset (gff) - locgff = gff.query(f"Parent == '{transcript}' & type == 'exon'") - min_ = locgff.start.min() - 200 - max_ = locgff.end.max() + 200 - genegff = gff.query( - f"contig == '{contig}' & type == 'gene' & start > {min_} & end < {max_}" - ) - return (locgff, min_, max_, genegff) - - -def _return_oligo_list(assay_type): - if assay_type == "probe": - oligos = ["probe"] - row_start = 9 - elif any(item == assay_type for item in ["gDNA primers", "cDNA primers"]): - oligos = ["forward", "reverse"] - row_start = 11 - elif assay_type == "gDNA primers + probe": - oligos = ["forward", "reverse", "probe"] - row_start = 12 - return (oligos, row_start) - - -def _convert_results_dict_naming(primer_dict): - k = {} - for key in primer_dict.keys(): - if "LEFT" in key: - nkey = key.replace("LEFT", "forward") - elif "RIGHT" in key: - nkey = key.replace("RIGHT", "reverse") - elif "INTERNAL" in key: - nkey = key.replace("INTERNAL", "probe") - else: - nkey = key - k[nkey.lower()] = primer_dict[key] - return k - - -def _complement(x): - if x == "A": - return "T" - elif x == "C": - return "G" - elif x == "G": - return "C" - elif x == "T": - return "A" - - -_complement = np.vectorize(_complement) - - -def _rev_complement(seq): - BASES = "NRWSMBDACGTHVKSWY" - return "".join([BASES[-j] for j in [BASES.find(i) for i in seq][::-1]]) - - -def _consecutive(data, stepsize=1): - arr = np.split(data, np.where(np.diff(data) != stepsize)[0] + 1) - arr = [[a.min(), a.max()] for a in arr] - return arr - - -def _addZeroCols(freqs): - freqlength = freqs.shape[1] - needed = 4 - freqlength - if needed > 0: - for i in range(needed): - freqs = np.column_stack([freqs, np.repeat(0, freqs.shape[0])]) - return freqs - - -def _get_base_freqs(freqs, ref_alt_array): - assert freqs.shape == ref_alt_array.shape, "Shape of arrays is different" - freq_df = pd.DataFrame(ref_alt_array) - for i_base in range(4): - for i in range(freqs.shape[0]): - base = ref_alt_array[i, i_base] - freq_df.loc[i, f"{base}_freq"] = freqs[i, i_base] - return freq_df - - -### check my oligos ### - - -def check_my_oligo( - sequence, sample_sets="3.0", sample_query=None, width=700, height=400 -): - """ - Align a sequence to AgamP3, retrieve ag3 frequencies in this region and plot. - Only works with An.gambiae_sl for now. - """ - - print("Aligning sequence to AgamP3 genome with BLAT") - blat_df = gget.blat(sequence=sequence, seqtype="DNA", assembly="anoGam3") - if blat_df is None: - print(f"No hit for {sequence}") - return - - contig, start, end = blat_df.loc[0, ["chromosome", "start", "end"]] - contig = contig.replace("chr", "") - region_span = f"{contig}:{start}-{end}" - print("plotting frequencies in ag3 data") - - fig = plot_sequence_frequencies( - data_resource=malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True), - region=region_span, - sample_sets=sample_sets, - sample_query=sample_query, - width=width, - height=height, - ) - return fig - - -def plot_sequence_frequencies( - data_resource, region, sample_sets=None, sample_query=None, width=700, height=400 -): - """Retrieve frequencies""" - - snps = data_resource.snp_calls( - region=region, sample_sets=sample_sets, sample_query=sample_query - ) - ref_alt_arr = snps["variant_allele"].compute().values.astype(str) - freq_arr = ( - allel.GenotypeArray(snps["call_genotype"]).count_alleles().to_frequencies() - ) - pos = snps["variant_position"].compute().values - df = pd.DataFrame( - { - "position": pos, - "base": ref_alt_arr[:, 0], - "alt_frequency": freq_arr[:, 1:].sum(axis=1), - } - ) # Make dataframe for plotting - df["base_pos"] = df["base"] + "_" + df["position"].astype(str) - # Get the frequency of each base and store as data frame - freq_df = _get_base_freqs(_addZeroCols(freq_arr), ref_alt_arr).filter(like="freq") - - data = pd.concat([df, freq_df], axis=1) - - fig = _plotly_frequencies( - data=data, - region=region, - sample_sets=sample_sets, - sample_query=sample_query, - width=width, - height=height, - ) - return fig - - -def _plotly_frequencies( - data, region, sample_sets, sample_query=None, width=700, height=400, save=False -): - import plotly.graph_objects as go - from plotly.subplots import make_subplots - - hover_template = "
".join( - [ - "Base / Position: %{customdata[4]}", - "Total Alternate freq: %{y}", - "A_freq: %{customdata[0]}", - "C_freq: %{customdata[1]}", - "G_freq: %{customdata[2]}", - "T_freq: %{customdata[3]}", - ] - ) - # Color scatterpoints blue if segregating SNP - color = [-1 if v == 0 else 1 if v > 0 else 0 for v in data["alt_frequency"]] - colorscale = [[0, "lightgray"], [0.5, "lightgray"], [1, "dodgerblue"]] - - fig = go.Figure( - go.Scatter( - x=data["position"], - y=data["alt_frequency"], - customdata=data[["A_freq", "C_freq", "G_freq", "T_freq", "base_pos"]], - hovertemplate=hover_template, - mode="markers", - marker=dict( - size=14, - color=color, - colorscale=colorscale, - line=dict(width=2, color="black"), - ), - marker_symbol="circle", - ) - ) - # Set xticks to be the REF allele - fig.update_xaxes( - tickmode="array", - tickangle=0, - tickvals=data["position"].to_list(), - ticktext=data["base"].to_list(), - ) - fig.update_yaxes( - tickmode="array", - tickvals=np.arange(0, 1, 0.2), - range=[0, 1], - title="Alternate allele frequency", - ) - # Set plot title - if sample_query is not None: - title_text = f"{region} | {sample_sets} | {sample_query} | allele frequencies" - else: - title_text = f"{region} | {sample_sets} | allele frequencies" - - fig.update_layout( - height=height, - width=width, - title_text=title_text, - title_x=0.5, - template="simple_white", - showlegend=False, - ) - fig.show() - return fig diff --git a/AnoPrimer/__init__.py b/AnoPrimer/__init__.py index 0b41e90..564f361 100644 --- a/AnoPrimer/__init__.py +++ b/AnoPrimer/__init__.py @@ -1,3 +1,4 @@ # flake8: noqa -from . import AnoPrimer -from .AnoPrimer import * +from .design import * +from .evaluate import AnoPrimerResults +from .utils import * diff --git a/AnoPrimer/design.py b/AnoPrimer/design.py new file mode 100644 index 0000000..53e8b7e --- /dev/null +++ b/AnoPrimer/design.py @@ -0,0 +1,488 @@ +import numpy as np +import pandas as pd +import primer3 + +from .evaluate import AnoPrimerResults +from .utils import ( + _convert_results_dict_naming, + _return_oligo_list, + extract_trailing_digits, + retrieve_data_resource, +) + + +def design_primers( + species, + assay_type, + assay_name, + target, + min_amplicon_size, + max_amplicon_size, + n_primer_pairs, + primer_parameters, + cDNA_exon_junction=True, +): + """ + Run whole AnoPrimer workflow to design primers/probes + + Parameters + ---------- + species: str + The species to design primers for. Either 'gambiae_sl' or 'funestus' + assay_type: {'gDNA primers', 'cDNA primers', 'gDNA primers + probe', 'probe}, str + The type of oligos we wish to design. If 'gDNA primers' or 'cDNA primers' are specified, + then only primers will be designed. If 'gDNA primers + probe' is specified, then primers + and a probe will be designed. If 'probe' is specified, then only an internal probe will + be designed. + assay_name : str + A name to give the assay, used for naming output files. + min_amplicon_size : int + The minimum size of the amplicon we wish to design primers for. + max_amplicon_size : int + The maximum size of the amplicon we wish to design primers for. cDNA primers for gene + expression assays should be designed with a max amplicon size of ~120. + n_primer_pairs : int + The number of primer pairs to design. + target : str + The target to design primers for. For gDNA primers, this should be a contig:position string, + for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier. + primer_parameters : dict or 'default' + A dictionary of primer3 parameters to use for primer design. If 'default' is specified, default + primer3 parameters will be generated. + cDNA_exon_junction : bool, optional + If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend + that this is set to True. In the case of gDNA primers, this parameter is ignored. + + Returns + ------- + AnoPrimerResults + An AnoPrimerResults object containing the results of the primer design. + """ + + data_resource = retrieve_data_resource(species=species) + + # check target is valid for assay type and find contig + contig, target = check_and_split_target( + species=species, target=target, assay_type=assay_type + ) + amplicon_size_range = [[min_amplicon_size, max_amplicon_size]] + + # adds some necessary parameters depending on assay type + if primer_parameters == "default": + primer_parameters = primer_params( + primer_parameters=None, + assay_type=assay_type, + n_primer_pairs=n_primer_pairs, + amplicon_size_range=amplicon_size_range, + generate_defaults=True, + ) + else: + primer_parameters = primer_params( + primer_parameters=primer_parameters, + assay_type=assay_type, + n_primer_pairs=n_primer_pairs, + amplicon_size_range=amplicon_size_range, + generate_defaults=False, + ) + # load genome sequence + genome_seq = data_resource.genome_sequence(region=contig) + print(f"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long") + + seq_parameters = prepare_sequence( + species=species, + target=target, + assay_type=assay_type, + assay_name=assay_name, + genome_seq=genome_seq, + amplicon_size_range=amplicon_size_range, + cDNA_exon_junction=cDNA_exon_junction, + ) + + # run primer3 + primer_dict = primer3.designPrimers( + seq_args=seq_parameters, global_args=primer_parameters + ) + + if assay_type != "probe": + # check if primer3 has returned any primers + if int(extract_trailing_digits(primer_dict["PRIMER_PAIR_EXPLAIN"])) == 0: + print( + f"No primers found for {assay_name}. For cDNA primers, this is more likely to occur if the target contains only one exon-exon junction. see troubleshooting below for more information. We suggest relaxing the primer parameters \n" + ) + print(primer3_run_statistics(primer_dict, assay_type)) + return (None, None) + + # AnoPrimer.primer3_run_statistics(primer_dict, assay_type) + primer_df = primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type) + + return AnoPrimerResults( + species=species, + data_resource=data_resource, + contig=contig, + assay_type=assay_type, + assay_name=assay_name, + target=target, + primer_df=primer_df, + seq_parameters=seq_parameters, + primer_parameters=primer_parameters, + ) + + +def prepare_gDNA_sequence( + target_loc, + amplicon_size_range, + genome_seq, + assay_name, + assay_type, + probe_exclude_region_size=20, +): + """ + Extracts sequence of interest from genome sequence + + PARAMETERS + ---------- + target_loc : int + The target location of the SNP in the genome sequence + amplicon_size_range : list + The minimum and maximum size of the amplicon to design primers for + genome_seq : dask.array.core.Array + The genome sequence from ag3.genome_sequence() + assay_name : str + The name of the assay + assay_type : str + The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' + or 'probe' + """ + target = int(target_loc) + # Set up range for the input sequence, we'll take the max range + # of the amplicon size and add that either side of the target SNP + amp_size_range = int(np.max(amplicon_size_range)) + start = target - amp_size_range + end = target + amp_size_range + # join array into be one character string, and store the positions + # of these sequences for later + target_sequence = "".join(genome_seq[start : end - 1].compute().astype(str)) + gdna_pos = np.arange(start, end).astype(int) + 1 + print(f"The target sequence is {len(target_sequence)} bases long") + + # We need the target snp indices within the region of interest + target_loc_primer3 = int(np.where(gdna_pos == target)[0]) + target_loc_primer3 = [target_loc_primer3, 10] + print(f"the target snp is {target_loc_primer3[0]} bp into our target sequence") + + seq_parameters = { + "SEQUENCE_ID": assay_name, + "SEQUENCE_TEMPLATE": target_sequence, + "SEQUENCE_TARGET": target_loc_primer3, + "GENOMIC_TARGET": target, + "GENOMIC_DNA_POSITIONS": gdna_pos, + } + + if "probe" in assay_type: + seq_parameters["SEQUENCE_INTERNAL_EXCLUDED_REGION"] = [ + [1, target_loc_primer3[0] - probe_exclude_region_size], + [ + target_loc_primer3[0] + probe_exclude_region_size, + len(target_sequence) + - (target_loc_primer3[0] + probe_exclude_region_size), + ], + ] + + return seq_parameters + + +def prepare_cDNA_sequence( + species, transcript, genome_seq, assay_name, cDNA_exon_junction +): + """ + Extract exonic sequence for our transcript and record exon-exon junctions + + PARAMETERS + ---------- + species: str + The species to design primers for. Either 'gambiae_sl' or 'funestus' + transcript : str + The AGAP identifier of the transcript to design primers for + genome_seq : dask.array.core.Array + The genome sequence from ag3.genome_sequence() + assay_name : str + The name of the assay + cDNA_exon_junction : bool + If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend for qPCR purposes. + """ + + # subset gff to your gene + data_resource = retrieve_data_resource(species) + + gff = data_resource.geneset() + gff = gff.query(f"type == 'exon' & Parent == '{transcript}'") + # Get fasta sequence for each of our exons, and remember gDNA position + seq = dict() + gdna_pos = dict() + for i, exon in enumerate(zip(gff.start, gff.end)): + seq[i] = "".join(np.array(genome_seq)[exon[0] - 1 : exon[1]].astype(str)) + gdna_pos[i] = np.arange(exon[0] - 1, exon[1]) + # concatenate exon FASTAs into one big sequence + gdna_pos = np.concatenate(list(gdna_pos.values())) + target_mRNA_seq = "".join(seq.values()) + + # Get list of exon junction positions + exon_junctions = np.array(np.cumsum(gff.end - gff.start))[:-1] + exon_sizes = np.array(gff.end - gff.start)[:-1] + exon_junctions_pos = [ex + gff.iloc[i, 3] for i, ex in enumerate(exon_sizes)] + print(f"Exon junctions for {transcript}:", exon_junctions, exon_junctions_pos, "\n") + seq_parameters = { + "SEQUENCE_ID": assay_name, + "SEQUENCE_TEMPLATE": target_mRNA_seq, + "GENOMIC_TARGET": transcript, + "GENOMIC_DNA_POSITIONS": gdna_pos, + } + + if cDNA_exon_junction: + seq_parameters["SEQUENCE_OVERLAP_JUNCTION_LIST"] = list( + map(int, exon_junctions) + ) + + return seq_parameters + + +def prepare_sequence( + species, + target, + assay_type, + assay_name, + genome_seq, + amplicon_size_range, + cDNA_exon_junction=True, +): + """ + Prepare the sequence for primer3, depending on cDNA or gDNA input type + + PARAMETERS + ---------- + species: str + The species to design primers for. Either 'gambiae_sl' or 'funestus' + target : str + The target to design primers for. For gDNA primers, this should be a contig:position string, + for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier. + assay_type : str + The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' + or 'probe' + assay_name : str + The name of the assay + genome_seq : dask.array.core.Array + The genome sequence from ag3.genome_sequence() + amplicon_size_range : list + The minimum and maximum size of the amplicon to design primers for + cDNA_exon_junction : bool + If True, cDNA primers will be designed to span an exon-exon junction. We strongly recommend for qPCR purposes. + """ + + if any(item in assay_type for item in ["gDNA", "probe"]): + # genomic DNA + seq_parameters = prepare_gDNA_sequence( + target_loc=target, + amplicon_size_range=amplicon_size_range, + genome_seq=genome_seq, + assay_name=assay_name, + assay_type=assay_type, + ) + elif assay_type == "cDNA primers": + # quantitative PCR + seq_parameters = prepare_cDNA_sequence( + species=species, + transcript=target, + genome_seq=genome_seq, + assay_name=assay_name, + cDNA_exon_junction=cDNA_exon_junction, + ) + + return seq_parameters + + +def primer_params( + assay_type, + primer_parameters=None, + n_primer_pairs=None, + amplicon_size_range=None, + generate_defaults=False, +): + """ + adds necessary parameters depending on assay_type, or can + generate the default parameters + + PARAMETERS + ---------- + assay_type : str + The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' + or 'probe' + primer_parameters : dict + A dictionary of primer3 parameters to use for primer design. If 'default' is specified, default + primer3 parameters will be generated. + n_primer_pairs : int + The number of primer pairs to design. + amplicon_size_range : list + The minimum and maximum size of the amplicon to design primers for + generate_defaults : bool + If True, default primer3 parameters will be generated. + """ + + if generate_defaults: + primer_parameters = { + "PRIMER_OPT_SIZE": 20, + "PRIMER_TASK": "generic", + "PRIMER_MIN_SIZE": 17, + "PRIMER_MAX_SIZE": 24, + "PRIMER_OPT_TM": 60.0, + "PRIMER_MIN_TM": 57.0, + "PRIMER_MAX_TM": 63.0, + "PRIMER_MIN_GC": 30.0, + "PRIMER_MAX_GC": 70.0, + # this parameter is the minimum distance between successive pairs. + # If 1, it means successive primer pairs could be identical bar one base shift + "PRIMER_MIN_THREE_PRIME_DISTANCE": 3, + # Probe size preferences if selected, otherwise ignored + "PRIMER_INTERNAL_OPT_SIZE": 16, + "PRIMER_INTERNAL_MIN_SIZE": 10, + "PRIMER_INTERNAL_MAX_SIZE": 22, + # Probe Tm considerations are quite relaxed, assumed that LNAs will be used + # later to affect TM + "PRIMER_INTERNAL_MIN_TM": 45, + "PRIMER_INTERNAL_MAX_TM": 65, + # Extra primer3 parameters can go here + # In the same format as above + } + + primer_parameters["PRIMER_NUM_RETURN"] = n_primer_pairs + primer_parameters["PRIMER_PRODUCT_SIZE_RANGE"] = amplicon_size_range + + if assay_type == "gDNA primers + probe": + primer_parameters["PRIMER_PICK_INTERNAL_OLIGO"] = 1 + primer_parameters["PRIMER_PICK_RIGHT_PRIMER"] = 1 + primer_parameters["PRIMER_PICK_LEFT_PRIMER"] = 1 + elif assay_type == "probe": + primer_parameters["PRIMER_PICK_INTERNAL_OLIGO"] = 1 + primer_parameters["PRIMER_PICK_RIGHT_PRIMER"] = 0 + primer_parameters["PRIMER_PICK_LEFT_PRIMER"] = 0 + return primer_parameters + + +def primer3_run_statistics(primer_dict, assay_type): + """ + Prints out primer3 run statistics from the primer3 results dictionary. + + PARAMETERS + ---------- + primer_dict : dict + The primer3 results dictionary returned by primer3.designPrimers() + assay_type : str + The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' + or 'probe' + """ + _, row_start = _return_oligo_list(assay_type) + primer_dict = _convert_results_dict_naming(primer_dict) + # Convert the dict into a pandas dataframe + primer_df = pd.DataFrame.from_dict(primer_dict.items()) + # Rename the columns + primer_df = primer_df.rename(columns={0: "parameter", 1: "value"}) + explanations_df = primer_df.iloc[ + :row_start, : + ] # Take the first 7 rows which are general + # Loop through each row and print information + for ( + idx, + row, + ) in explanations_df.iterrows(): + print(row["parameter"], " : ", row["value"], "\n") + + +def primer3_to_pandas(primer_dict, assay_type): + """ + Convert primer3 results to pandas dataframe + + PARAMETERS + ---------- + primer_dict : dict + The primer3 results dictionary returned by primer3.designPrimers() + assay_type : str + The type of assay, either 'gDNA primers' or 'gDNA primers + probe', 'cDNA primers' + or 'probe' + + RETURNS + ------- + primer_df : pandas.DataFrame + A pandas DataFrame containing the primer sequences and their information. + """ + oligos, row_start = _return_oligo_list(assay_type) + # Convert the dict into a pandas dataframe + primer_dict = _convert_results_dict_naming(primer_dict) + primer_df = pd.DataFrame.from_dict(primer_dict.items()) + # Rename the columns + primer_df = primer_df.rename(columns={0: "parameter", 1: "value"}) + # Create a column which is primer pair #, and a column for primer + # parameter which does not contain primer pair # + primer_df = primer_df.iloc[row_start:, :].copy() + primer_df["primer_pair"] = primer_df["parameter"].str.extract("([0-9][0-9]|[0-9])") + primer_df["parameter"] = primer_df["parameter"].str.replace( + "(_[0-9][0-9]|_[0-9])", "", regex=True + ) + + # Put the different primer pairs in different columns + primer_df = primer_df.pivot( + index="parameter", columns="primer_pair", values="value" + ) + + # Get a list of the rows we need + primer_span = [f"primer_{oligo}" for oligo in oligos] + required_info = ["sequence", "TM", "GC_PERCENT"] + required_info = [ + p + "_" + y for y in required_info for p in primer_span + ] + primer_span + required_info = ( + required_info + ["primer_PAIR_PRODUCT_SIZE"] + if assay_type != "probe" + else required_info + ) + required_info = [string.lower() for string in required_info] + + # Subset data frame + primer_df = primer_df.loc[required_info, np.arange(primer_df.shape[1]).astype(str)] + return primer_df + + +def check_and_split_target(species, target, assay_type): + data_resource = retrieve_data_resource(species=species) + + # split contig from target + if target.startswith("AGAP") or target.startswith("LOC"): + assert ( + assay_type == "cDNA primers" + ), "an AGAP/AFUN identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers." + gff = data_resource.geneset() + assert ( + target in gff["ID"].to_list() + ), f"requested target {target} not in ag3/af1 transcript set" + contig = gff.query(f"ID == '{target}'")["contig"].unique()[0] + return (contig, target) + else: + assert isinstance( + target, str + ), "For genomic DNA the target should be a string, such as '2L:28545767'" + contig, target = target.split(":") + if species == "gambiae_sl": + assert contig in [ + "2L", + "2R", + "3L", + "3R", + "X", + "2RL", + "3RL", + ], "target contig not recognised, should be 2L, 2R, 3L, 3R, 2RL, 3RL, X" + elif species == "funestus": + assert contig in [ + "2RL", + "3RL", + "X", + ], "target contig not recognised, should be 2RL, 3RL or X" + return contig, int(target) diff --git a/AnoPrimer/evaluate.py b/AnoPrimer/evaluate.py new file mode 100644 index 0000000..3083b15 --- /dev/null +++ b/AnoPrimer/evaluate.py @@ -0,0 +1,445 @@ +import gget +import matplotlib.patches as patches +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +from .utils import ( + _get_gDNA_locs, + _get_primer_alt_frequencies, + _get_qPCR_locs, + _plotly_primers, + _retrieve_span, + _return_oligo_list, + retrieve_data_resource, +) + + +class AnoPrimerResults: + """ + A class to represent the results of primer design and provide methods for analysis and visualization. + + This class encapsulates the results of primer design for a specific assay and provides + methods to analyze and visualize these results, including SNP frequencies in ag3/af1, primer locations, + and genome alignment with gget (BLAT). + + Attributes: + species (str): The species for which primers were designed. + data_resource: The data resource object used for retrieving genomic data. + contig (str): The contig or chromosome on which the primers are located. + target (str): The target sequence or location for primer design. + assay_type (str): The type of assay (e.g., 'gDNA primers', 'cDNA primers'). + assay_name (str): The name of the assay. + df (pd.DataFrame): DataFrame containing the designed primers and their properties. + seq_parameters (dict): Parameters used for sequence preparation. + primer_parameters (dict): Parameters used for primer design with primer3. + target_sequence (str): The target DNA sequence. + gdna_pos (np.array): Array of genomic DNA positions of the target input sequence. + """ + + def __init__( + self, + species, + data_resource, + assay_type, + assay_name, + contig, + target, + primer_df, + seq_parameters, + primer_parameters, + ): + """ + Initialize the AnoPrimerResults object with the results of primer design. + + Args: + species (str): The species for which primers were designed. + data_resource: The data resource object used for retrieving genomic data. + assay_type (str): The type of assay (e.g., 'gDNA primers', 'cDNA primers'). + assay_name (str): The name of the assay. + contig (str): The contig or chromosome on which the primers are located. + target (str): The target sequence or location for primer design. + primer_df (pd.DataFrame): DataFrame containing the designed primers and their properties. + seq_parameters (dict): Parameters used for sequence preparation. + primer_parameters (dict): Parameters used for primer design. + """ + self.species = species + self.data_resource = data_resource + self.contig = contig + self.target = target + self.assay_type = assay_type + self.assay_name = assay_name + + self.df = primer_df + self.seq_parameters = seq_parameters + self.primer_parameters = primer_parameters + + # Extract additional attributes from seq_parameters + self.target_sequence = seq_parameters.get("SEQUENCE_TEMPLATE") + self.gdna_pos = seq_parameters.get("GENOMIC_DNA_POSITIONS") + + def evaluate_primers( + self, + sample_sets, + sample_query=None, + out_dir=None, + legend_loc="best", + assembly="anoGam3", + ): + """ + Evaluate the designed primers by plotting SNP frequencies and primer locations. + + This method generates plots of SNP frequencies for each primer pair and the locations + of the primers in relation to nearby exons on the genome. It also performs a BLAT alignment + of the primers to the genome using the gget library. + + Args: + sample_sets (str or list): Sample set identifier(s) to use for frequency calculation. + sample_query (str): A pandas query string to filter the samples. + out_dir (str): Directory to save the output files. + legend_loc (str, optional): Location of the legend in the plot. Default is "best". + assembly (str, optional): The genome assembly to use with BLAT. Default is "anoGam3". + """ + # Plot SNP frequencies for each primer pair + self.plot_primer_snp_frequencies( + sample_sets=sample_sets, + sample_query=sample_query, + out_dir=out_dir, + ) + + # Plot primer locations in relation to nearby exons + self.plot_primer_locs( + legend_loc=legend_loc, + out_dir=out_dir, + ) + + # Perform BLAT alignment of primers to the genome + if self.species == "gambiae_sl": + blat_df = self.gget_blat_genome(assembly=assembly) + if out_dir is not None and blat_df is not None: + blat_df.to_csv(f"{out_dir}/{self.assay_name}_blat_results.csv") + + def summarise_metadata(self, sample_sets=None, sample_query=None): + """ + Retrieve a summary of metadata for samples in the ag3/af1 resource. + + This method creates a pivot table summarizing the sample counts by sample set, + year, country, and taxon. + + Args: + sample_sets (str or list, optional): Sample set identifier(s) to filter the data. + sample_query (str, optional): A pandas query string to filter the samples. + + Returns: + pd.DataFrame: A pivot table summarizing the sample metadata. + """ + # Retrieve sample metadata based on the provided filters + df_samples = self.data_resource.sample_metadata( + sample_sets=sample_sets, sample_query=sample_query + ) + + # Create a pivot table to summarize the data + pivot_country_year_taxon = df_samples.pivot_table( + index=["sample_set", "year", "country"], + columns=["taxon"], + values="sample_id", + aggfunc="count", + fill_value=0, + ) + + return pivot_country_year_taxon + + def plot_primer_snp_frequencies( + self, + sample_sets, + sample_query=None, + out_dir=None, + ): + """ + Plot SNP frequencies for each primer pair. + + This method retrieves allele frequency data for each primer pair and generates + a plot using the Plotly library. + + Args: + sample_sets (str or list): Sample set identifier(s) to use for frequency calculation. + sample_query (str, optional): A pandas query string to filter the samples. + out_dir (str, optional): Directory to save the output files. If None, files are not saved. + + Returns: + dict: A dictionary containing the frequency data for each primer pair. + """ + name = self.assay_name + target = self.seq_parameters["GENOMIC_TARGET"] + + if sample_query is not None: + print(f"Subsetting allele frequencies to {sample_query}") + + # Loop through each primer pair and get the frequencies of alternate alleles + res_dict = {} + for i in range(len(self.df.columns)): + res_dict[i] = _get_primer_alt_frequencies( + species=self.species, + primer_df=self.df, + gdna_pos=self.gdna_pos, + pair=i, + assay_type=self.assay_type, + contig=self.contig, + sample_sets=sample_sets, + sample_query=sample_query, + ) + + # Generate the plot using Plotly + _plotly_primers( + primer_df=self.df, + res_dict=res_dict, + name=name, + assay_type=self.assay_type, + sample_sets=sample_sets, + target=target, + out_dir=out_dir, + ) + + return res_dict + + def plot_primer_locs( + self, + legend_loc="best", + out_dir=None, + ): + """ + Plot the positions of primer sets in relation to nearby exons. + + This method generates a matplotlib figure showing the locations of primers + and nearby exons on the genome. + + Args: + legend_loc (str, optional): Location of the legend in the plot. Default is "best". + out_dir (str, optional): Directory to save the output files. If None, files are not saved. + """ + # Retrieve necessary data and parameters + data_resource = retrieve_data_resource(species=self.species) + oligos, _ = _return_oligo_list(self.assay_type) + assay_name = self.assay_name + exon_id_col = "Name" if self.species == "gambiae_sl" else "ID" + + # Load geneset (gff) and get relevant locations + gff = data_resource.geneset() + if any(item in self.assay_type for item in ["gDNA", "probe"]): + start = self.seq_parameters["GENOMIC_TARGET"] - 500 + end = self.seq_parameters["GENOMIC_TARGET"] + 500 + locgff, min_, max_, genegff = _get_gDNA_locs(gff, self.contig, start, end) + min_ = np.min([min_, start]) + max_ = np.max([max_, end]) + elif self.assay_type == "cDNA primers": + locgff, min_, max_, genegff = _get_qPCR_locs( + gff, self.contig, self.seq_parameters["GENOMIC_TARGET"] + ) + start, end = min_, max_ + + if locgff.empty: + print("No exons in close proximity for loc plot") + return + + # Set up the plot + fig, ax = plt.subplots(1, 1, figsize=[16, 4]) + self._configure_plot_axes(ax, min_, max_, start, end) + # Plot exons, genes, primer spans + self._plot_exons(ax, locgff, exon_id_col) + self._plot_genes(ax, genegff, min_, max_) + self._plot_primers(ax, oligos) + # Add legend and save if out_dir is provided + plt.legend(handles=ax.get_legend_handles_labels()[0], loc=legend_loc) + if out_dir: + fig.savefig( + f"{out_dir}/{assay_name}_primer_locs.png", dpi=300, bbox_inches="tight" + ) + + def _configure_plot_axes(self, ax, min_, max_, start, end): + """Helper method to configure plot axes.""" + if min_ in ["inf", "NaN"]: + min_ = start + if max_ in ["inf", "NaN"]: + max_ = end + ax.set_xlim(min_, max_) + ax.set_ylim(-0.5, 1.5) + ax.ticklabel_format(useOffset=False) + ax.axhline(0.5, color="k", linewidth=0.7, linestyle="--") + sns.despine(ax=ax, left=True, bottom=False) + ax.tick_params( + top=False, left=False, right=False, labelleft=False, labelbottom=True + ) + ax.tick_params(axis="x", which="major", labelsize=13) + ax.set_ylabel("Exons") + ax.set_xlabel(f"Chromosome {self.contig} position", fontdict={"fontsize": 14}) + + def _plot_exons(self, ax, locgff, exon_id_col): + """Helper method to plot exons.""" + for _, exon in locgff.iterrows(): + ex_start, ex_end = exon[["start", "end"]] + e_name = exon[exon_id_col][-2:] + strand = exon["strand"] + if strand == "+": + rect = patches.Rectangle( + (ex_start, 0.55), + ex_end - ex_start, + 0.3, + linewidth=3, + edgecolor="none", + facecolor="grey", + alpha=0.9, + ) + ax.text((ex_start + ex_end) / 2, 0.65, e_name) + else: + rect = patches.Rectangle( + (ex_start, 0.45), + ex_end - ex_start, + -0.3, + linewidth=3, + edgecolor="none", + facecolor="grey", + alpha=0.9, + ) + ax.text((ex_start + ex_end) / 2, 0.3, e_name) + ax.add_patch(rect) + + def _plot_genes(self, ax, genegff, min_, max_): + """Helper method to plot genes.""" + tot_genes = genegff.shape[0] + for i, gene in genegff.reset_index(drop=True).iterrows(): + start, end = gene[["start", "end"]] + diff = np.diff([min_, max_]) + interval = diff / tot_genes + 1 + name_point = min_ + (interval * i + 1) + strand = gene["strand"] + if strand == "+": + rect = patches.Rectangle( + (start, 0.55), + end - start, + 0.3, + linewidth=3, + edgecolor="black", + facecolor="none", + ) + ax.text( + name_point, + 0.95, + s=gene["ID"], + fontdict={"fontsize": 12}, + weight="bold", + ) + else: + rect = patches.Rectangle( + (start, 0.45), + end - start, + -0.3, + linewidth=3, + edgecolor="black", + facecolor="none", + ) + ax.text( + name_point, + -0.3, + s=gene["ID"], + fontdict={"fontsize": 12}, + weight="bold", + ) + ax.add_patch(rect) + + def _plot_primers(self, ax, oligos): + """Helper method to plot primers.""" + pal = sns.color_palette("Set2", len(self.df.columns)) + handles, labels = ax.get_legend_handles_labels() + for pair in self.df: + pair = int(pair) + for oligo in oligos: + oligo_pos = _retrieve_span( + primer_df=self.df, + gdna_pos=self.gdna_pos, + pair=pair, + oligo=oligo, + assay_type=self.assay_type, + ) + lower, upper = oligo_pos.min(), oligo_pos.max() + + if oligo == "forward": + plt.arrow( + lower, + 0.8 + (2 / (10 - (pair))), + upper - lower, + 0, + width=0.03, + length_includes_head=True, + color=pal[pair], + ) + elif oligo == "reverse": + plt.arrow( + upper, + 0.8 + (2 / (10 - (pair))), + lower - upper, + 0, + width=0.03, + length_includes_head=True, + color=pal[pair], + ) + elif oligo == "probe": + ax.axhline(y=0.8 + (2 / (10 - (pair))), xmin=lower, xmax=upper) + line = plt.Line2D( + (lower, upper), + (0.8 + (2 / (10 - (pair))), 0.8 + (2 / (10 - (pair)))), + lw=2.5, + color=pal[pair], + ) + ax.add_line(line) + + patch = patches.Patch(color=pal[pair], label=f"pair {pair}") + handles.append(patch) + + def gget_blat_genome(self, assembly="anoGam3"): + """ + Align primers to the genome using BLAT. + + This method uses the gget library to perform BLAT alignment of the designed primers + against the specified genome assembly. + + Args: + assembly (str, optional): The genome assembly to use with BLAT. Default is "anoGam3". + + Returns: + pd.DataFrame or None: A DataFrame containing BLAT alignment results, or None if no hits found. + """ + oligos, _ = _return_oligo_list(assay_type=self.assay_type) + + pair_dict = {} + for primer_pair in self.df: + oligo_list = [] + for oligo in oligos: + seq = self.df[primer_pair].loc[f"primer_{oligo}_sequence"] + blat_df = gget.blat(sequence=seq, seqtype="DNA", assembly=assembly) + if blat_df is None: + print(f"No hit for {oligo} - pair {primer_pair}") + continue + blat_df.loc[:, "primer"] = f"{oligo}_{primer_pair}" + oligo_list.append(blat_df.set_index("primer")) + + if oligo_list: + pair_dict[primer_pair] = pd.concat(oligo_list) + + if pair_dict: + return pd.concat(pair_dict) + else: + print("No HITs found for these primer pairs") + return None + + def to_csv(self, file_path, **kwargs): + self.df.to_csv(file_path, **kwargs) + + def to_excel(self, file_path, **kwargs): + self.df.to_excel(file_path, **kwargs) + + def __str__(self): + return f"AnoPrimerResults for {self.assay_name} ({self.assay_type})" + + def __repr__(self): + return self.__str__() diff --git a/AnoPrimer/utils.py b/AnoPrimer/utils.py new file mode 100644 index 0000000..8f55e77 --- /dev/null +++ b/AnoPrimer/utils.py @@ -0,0 +1,554 @@ +import allel +import gget +import malariagen_data +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from plotly.subplots import make_subplots + + +def retrieve_data_resource(species): + assert species in [ + "gambiae_sl", + "funestus", + ], f"species {species} not recognised, please use 'gambiae_sl' or 'funestus'" + if species == "gambiae_sl": + data_resource = malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True) + elif species == "funestus": + data_resource = malariagen_data.Af1(url="gs://vo_afun_release/", pre=True) + return data_resource + + +def check_my_oligo( + sequence, sample_sets="3.0", sample_query=None, width=700, height=400 +): + """ + Align a sequence to AgamP3, retrieve ag3 frequencies in this region and plot. + Only works with An.gambiae_sl for now. + """ + + print("Aligning sequence to AgamP3 genome with BLAT") + blat_df = gget.blat(sequence=sequence, seqtype="DNA", assembly="anoGam3") + if blat_df is None: + print(f"No hit for {sequence}") + return + + contig, start, end = blat_df.loc[0, ["chromosome", "start", "end"]] + contig = contig.replace("chr", "") + region_span = f"{contig}:{start}-{end}" + print("plotting frequencies in ag3 data") + + fig = plot_sequence_frequencies( + data_resource=malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True), + region=region_span, + sample_sets=sample_sets, + sample_query=sample_query, + width=width, + height=height, + ) + return fig + + +def plot_sequence_frequencies( + data_resource, region, sample_sets=None, sample_query=None, width=700, height=400 +): + """Retrieve frequencies""" + + snps = data_resource.snp_calls( + region=region, sample_sets=sample_sets, sample_query=sample_query + ) + ref_alt_arr = snps["variant_allele"].compute().values.astype(str) + freq_arr = ( + allel.GenotypeArray(snps["call_genotype"]).count_alleles().to_frequencies() + ) + pos = snps["variant_position"].compute().values + df = pd.DataFrame( + { + "position": pos, + "base": ref_alt_arr[:, 0], + "alt_frequency": freq_arr[:, 1:].sum(axis=1), + } + ) # Make dataframe for plotting + df["base_pos"] = df["base"] + "_" + df["position"].astype(str) + # Get the frequency of each base and store as data frame + freq_df = _get_base_freqs(_addZeroCols(freq_arr), ref_alt_arr).filter(like="freq") + + data = pd.concat([df, freq_df], axis=1) + + fig = _plotly_frequencies( + data=data, + region=region, + sample_sets=sample_sets, + sample_query=sample_query, + width=width, + height=height, + ) + return fig + + +def _plotly_frequencies( + data, region, sample_sets, sample_query=None, width=700, height=400, save=False +): + import plotly.graph_objects as go + + hover_template = "
".join( + [ + "Base / Position: %{customdata[4]}", + "Total Alternate freq: %{y}", + "A_freq: %{customdata[0]}", + "C_freq: %{customdata[1]}", + "G_freq: %{customdata[2]}", + "T_freq: %{customdata[3]}", + ] + ) + # Color scatterpoints blue if segregating SNP + color = [-1 if v == 0 else 1 if v > 0 else 0 for v in data["alt_frequency"]] + colorscale = [[0, "lightgray"], [0.5, "lightgray"], [1, "dodgerblue"]] + + fig = go.Figure( + go.Scatter( + x=data["position"], + y=data["alt_frequency"], + customdata=data[["A_freq", "C_freq", "G_freq", "T_freq", "base_pos"]], + hovertemplate=hover_template, + mode="markers", + marker=dict( + size=14, + color=color, + colorscale=colorscale, + line=dict(width=2, color="black"), + ), + marker_symbol="circle", + ) + ) + # Set xticks to be the REF allele + fig.update_xaxes( + tickmode="array", + tickangle=0, + tickvals=data["position"].to_list(), + ticktext=data["base"].to_list(), + ) + fig.update_yaxes( + tickmode="array", + tickvals=np.arange(0, 1, 0.2), + range=[0, 1], + title="Alternate allele frequency", + ) + # Set plot title + if sample_query is not None: + title_text = f"{region} | {sample_sets} | {sample_query} | allele frequencies" + else: + title_text = f"{region} | {sample_sets} | allele frequencies" + + fig.update_layout( + height=height, + width=width, + title_text=title_text, + title_x=0.5, + template="simple_white", + showlegend=False, + ) + fig.show() + return fig + + +#### utility functions #### + + +def extract_trailing_digits(string): + import re + + match = re.search(r"\d+$", string) + if match: + return match.group(0) + else: + return None + + +def _retrieve_span(primer_df, gdna_pos, oligo, assay_type, pair): + primer_loc = primer_df.loc[f"primer_{oligo}", str(pair)][0] + primer_loc = primer_loc + 1 if oligo == "reverse" else primer_loc + primer_size = primer_df.loc[f"primer_{oligo}", str(pair)][1] + + if any(item in assay_type for item in ["gDNA", "probe"]): + pos_arr = gdna_pos + elif assay_type == "cDNA primers": + pos_arr = np.array([]) + exon_spans = np.array(_consecutive(gdna_pos)) + 1 + for span in exon_spans: + pos_arr = np.append(pos_arr, np.arange(span[0], span[1] + 1)).astype(int) + + if oligo in ["forward", "probe"]: + pos = pos_arr[primer_loc : primer_loc + primer_size] + elif oligo == "reverse": + pos = np.flip(pos_arr[primer_loc - primer_size : primer_loc]) + + return pos + + +def _get_primer_arrays( + species, contig, gdna_pos, sample_sets, assay_type, sample_query=None +): + """ + Load genotype data from Ag3/Af1 resource and return allele frequencies + at entire input sequence region + """ + data_resource = retrieve_data_resource(species=species) + + if any(item in assay_type for item in ["gDNA", "probe"]): + span_str = f"{contig}:{gdna_pos.min()}-{gdna_pos.max()}" + ds_snps = data_resource.snp_calls( + region=span_str, sample_sets=sample_sets, sample_query=sample_query + ) # get genotypes + ref_alt_arr = ds_snps["variant_allele"].compute().values + geno = ds_snps["call_genotype"] + freq_arr = allel.GenotypeArray(geno).count_alleles().to_frequencies() + pos_arr = gdna_pos + elif assay_type == "cDNA primers": + freq_arr = [] + ref_alt_arr = [] + pos_arr = np.array([]) + exon_spans = np.array(_consecutive(gdna_pos)) + 1 + for span in exon_spans: + span_str = f"{contig}:{span[0]}-{span[1]}" + ds_snps = data_resource.snp_calls( + region=span_str, sample_sets=sample_sets, sample_query=sample_query + ) # get genotypes + ref_alts = ds_snps["variant_allele"] + geno = ds_snps["call_genotype"] + freqs = ( + allel.GenotypeArray(geno).count_alleles().to_frequencies() + ) # calculate allele frequencies + freqs = _addZeroCols(freqs) + freq_arr.append(freqs) + ref_alt_arr.append(ref_alts) + pos_arr = np.append(pos_arr, np.arange(span[0], span[1] + 1)).astype(int) + freq_arr = np.concatenate(freq_arr) + ref_alt_arr = np.concatenate(ref_alt_arr) + + return (freq_arr, ref_alt_arr.astype("U13"), pos_arr) + + +def _get_primer_alt_frequencies( + species, primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query +): + """ + Find the genomic locations of pairs of primers, and runs span_to_freq + to get allele frequencies at those locations + """ + + oligos, _ = _return_oligo_list(assay_type) + base_freqs, ref_alt_arr, pos_arr = _get_primer_arrays( + species=species, + contig=contig, + gdna_pos=gdna_pos, + sample_sets=sample_sets, + assay_type=assay_type, + sample_query=sample_query, + ) + + freq_arr = base_freqs[:, 1:].sum(axis=1) + + di = {} + for oligo in oligos: + primer_loc = primer_df.loc[f"primer_{oligo}", str(pair)][0] + primer_loc = primer_loc + 1 if oligo == "reverse" else primer_loc + primer_size = primer_df.loc[f"primer_{oligo}", str(pair)][1] + if oligo in ["forward", "probe"]: + freq = freq_arr[primer_loc : primer_loc + primer_size] + base_freqs_arr = base_freqs[primer_loc : primer_loc + primer_size, :] + ref = ref_alt_arr[primer_loc : primer_loc + primer_size, 0] + ref_alt = ref_alt_arr[primer_loc : primer_loc + primer_size, :] + pos = pos_arr[primer_loc : primer_loc + primer_size] + elif oligo == "reverse": + freq = np.flip(freq_arr[primer_loc - primer_size : primer_loc]) + base_freqs_arr = base_freqs[primer_loc - primer_size : primer_loc, :] + base_freqs_arr = np.flip(base_freqs_arr, axis=0) + ref = ref_alt_arr[primer_loc - primer_size : primer_loc, 0] + ref = np.array(list(_rev_complement("".join(ref))), dtype=str) + ref_alt = ref_alt_arr[primer_loc - primer_size : primer_loc, :] + ref_alt = _complement(np.flip(ref_alt, axis=0)) + pos = np.flip(pos_arr[primer_loc - primer_size : primer_loc]) + + df = pd.DataFrame( + {"position": pos, "base": ref, "alt_frequency": freq} + ) # Make dataframe for plotting + df["base_pos"] = df["base"] + "_" + df["position"].astype(str) + assert df.shape[0] == primer_size, "Wrong size primers" + + freq_df = _get_base_freqs(_addZeroCols(base_freqs_arr), ref_alt).filter( + like="freq" + ) + df = pd.concat([df, freq_df], axis=1) + di[oligo] = df + return di + + +def _plotly_primers( + primer_df, + res_dict, + name, + assay_type, + sample_sets, + target, + out_dir=None, +): + oligos, _ = _return_oligo_list(assay_type) + if len(oligos) == 2: + plt_title = ["Forward primer", "Reverse primer"] + elif len(oligos) == 3: + plt_title = ["Forward primer", "Reverse primer", "Probe"] + elif len(oligos) == 1: + plt_title = ["Probe"] + + title_list = [] + for pair in primer_df: + for oligo in plt_title: + title_list.append(f"{oligo} {pair}") + + hover_template = "
".join( + [ + "Base / Position: %{customdata[4]}", + "Total Alternate freq: %{y}", + "A_freq: %{customdata[0]}", + "C_freq: %{customdata[1]}", + "G_freq: %{customdata[2]}", + "T_freq: %{customdata[3]}", + ] + ) + + fig = make_subplots( + rows=len(primer_df.columns), + cols=len(oligos), + subplot_titles=title_list, + horizontal_spacing=0.03, + vertical_spacing=0.08, + ) + fig.update_annotations(font_size=13) + for idx, oligo in enumerate(oligos): + idx = idx + 1 + for i in primer_df: + i = int(i) + row_i = i + 1 + + color = [ + -1 if v == 0 else 1 if v > 0 else 0 + for v in res_dict[i][oligo]["alt_frequency"] + ] + colorscale = [[0, "lightgray"], [0.5, "lightgray"], [1, "dodgerblue"]] + + tm = np.round(primer_df.loc[f"primer_{oligo}_tm", str(i)], 2) + gc = np.round(primer_df.loc[f"primer_{oligo}_gc_percent", str(i)], 2) + span = f"{int(res_dict[i][oligo]['position'].min())}-{int(res_dict[i][oligo]['position'].max())}" + # Write text to plot for Tm, GC, span, and 3/5' + + for col in res_dict[i][oligo].columns: + if col.endswith("freq"): + res_dict[i][oligo][col] = np.round(res_dict[i][oligo][col], 2) + + fig.add_trace( + go.Scatter( + x=res_dict[i][oligo]["base_pos"], + y=res_dict[i][oligo]["alt_frequency"], + customdata=res_dict[i][oligo][ + ["A_freq", "C_freq", "G_freq", "T_freq", "base_pos"] + ], + hovertemplate=hover_template, + mode="markers", + marker=dict( + size=14, + color=color, + colorscale=colorscale, + line=dict(width=2, color="black"), + ), + marker_symbol="circle", + ), + row=row_i, + col=idx, + ) + fig.add_annotation( + row=row_i, + col=idx, + x=res_dict[i][oligo]["base_pos"][0], + y=0.8, + text="5'", + showarrow=False, + ) + fig.add_annotation( + row=row_i, + col=idx, + x=res_dict[i][oligo]["base_pos"].to_numpy()[-1], + y=0.8, + text="3'", + showarrow=False, + ) + fig.add_annotation( + row=row_i, + col=idx, + x=res_dict[i][oligo]["base_pos"].to_numpy()[4], + y=0.92, + text=span, + showarrow=False, + ) + fig.add_annotation( + row=row_i, + col=idx, + x=res_dict[i][oligo]["base_pos"].to_numpy()[-7], + y=0.92, + text=f"GC={gc}", + showarrow=False, + ) + fig.add_annotation( + row=row_i, + col=idx, + x=res_dict[i][oligo]["base_pos"].to_numpy()[-3], + y=0.92, + text=f"TM={tm}", + showarrow=False, + ) + + fig.update_xaxes( + row=row_i, + col=idx, + tickmode="array", + tickvals=res_dict[i][oligo]["base_pos"], + ticktext=res_dict[i][oligo]["base"], + tickangle=0, + mirror=True, + ) + if idx > 1: + fig.update_yaxes( + row=row_i, + col=idx, + range=[0, 1], + tickvals=np.arange(0, 1, 0.2), + showticklabels=False, + mirror=True, + ) + else: + fig.update_yaxes( + row=row_i, + col=idx, + tickvals=np.arange(0, 1, 0.2), + range=[0, 1], + mirror=True, + ) + + if ((row_i % 2) == 0) & (idx == 1): + fig.update_yaxes(row=row_i, col=idx, title="Alternate allele frequency") + + if any(item in assay_type for item in ["gDNA"]): + title_text = f"{name} primer pairs | {sample_sets} | target {target} bp" + elif assay_type == "probe": + title_text = f"{name} probe | {sample_sets} | target {target} bp" + elif assay_type == "cDNA primers": + title_text = f"{name} primer pairs | {sample_sets} | target {target}" + + # fig.update_traces(customdata=customdata, hovertemplate=hovertemplate) + fig.update_layout( + height=200 * len(primer_df.columns), + width=500 * len(oligos), + title_text=title_text, + title_x=0.5, + template="simple_white", + showlegend=False, + ) + if out_dir: + fig.write_html(f"{name}_{assay_type}.html") + fig.write_image(f"{name}_{assay_type}.pdf") + fig.show() + + +def _get_gDNA_locs(gff, contig, start, end): + locgff = gff.query( + f"contig == '{contig}' & type == 'exon' & start < {end} & end > {start}" + ) + min_ = locgff.start.min() - 100 + max_ = locgff.end.max() + 100 + genegff = gff.query( + f"contig == '{contig}' & type == 'gene' & start < {end} & end > {start}" + ) + return (locgff, min_, max_, genegff) + + +def _get_qPCR_locs(gff, contig, transcript): + # Load geneset (gff) + locgff = gff.query(f"Parent == '{transcript}' & type == 'exon'") + min_ = locgff.start.min() - 200 + max_ = locgff.end.max() + 200 + genegff = gff.query( + f"contig == '{contig}' & type == 'gene' & start > {min_} & end < {max_}" + ) + return (locgff, min_, max_, genegff) + + +def _return_oligo_list(assay_type): + if assay_type == "probe": + oligos = ["probe"] + row_start = 9 + elif any(item == assay_type for item in ["gDNA primers", "cDNA primers"]): + oligos = ["forward", "reverse"] + row_start = 11 + elif assay_type == "gDNA primers + probe": + oligos = ["forward", "reverse", "probe"] + row_start = 12 + return (oligos, row_start) + + +def _convert_results_dict_naming(primer_dict): + k = {} + for key in primer_dict.keys(): + if "LEFT" in key: + nkey = key.replace("LEFT", "forward") + elif "RIGHT" in key: + nkey = key.replace("RIGHT", "reverse") + elif "INTERNAL" in key: + nkey = key.replace("INTERNAL", "probe") + else: + nkey = key + k[nkey.lower()] = primer_dict[key] + return k + + +def _complement(x): + if x == "A": + return "T" + elif x == "C": + return "G" + elif x == "G": + return "C" + elif x == "T": + return "A" + + +_complement = np.vectorize(_complement) + + +def _rev_complement(seq): + BASES = "NRWSMBDACGTHVKSWY" + return "".join([BASES[-j] for j in [BASES.find(i) for i in seq][::-1]]) + + +def _consecutive(data, stepsize=1): + arr = np.split(data, np.where(np.diff(data) != stepsize)[0] + 1) + arr = [[a.min(), a.max()] for a in arr] + return arr + + +def _addZeroCols(freqs): + freqlength = freqs.shape[1] + needed = 4 - freqlength + if needed > 0: + for i in range(needed): + freqs = np.column_stack([freqs, np.repeat(0, freqs.shape[0])]) + return freqs + + +def _get_base_freqs(freqs, ref_alt_array): + assert freqs.shape == ref_alt_array.shape, "Shape of arrays is different" + freq_df = pd.DataFrame(ref_alt_array) + for i_base in range(4): + for i in range(freqs.shape[0]): + base = ref_alt_array[i, i_base] + freq_df.loc[i, f"{base}_freq"] = freqs[i, i_base] + return freq_df diff --git a/notebooks/AnoPrimer-long.ipynb b/notebooks/AnoPrimer-long.ipynb index 137599a..4543789 100644 --- a/notebooks/AnoPrimer-long.ipynb +++ b/notebooks/AnoPrimer-long.ipynb @@ -1,1401 +1,1371 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "AM9EeHef6tAD", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "AM9EeHef6tAD", - "outputId": "b45dc9be-a741-40ee-8c5c-0ef5d7073426" - }, - "outputs": [], - "source": [ - "# First, install some packages we require\n", - "! pip install AnoPrimer -q " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "528ae85d", - "metadata": { - "id": "528ae85d" - }, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'AnoPrimer'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Import libraries \u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mAnoPrimer\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmalariagen_data\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mprimer3\u001b[39;00m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'AnoPrimer'" - ] - } - ], - "source": [ - "# Import libraries \n", - "import AnoPrimer\n", - "import malariagen_data\n", - "import primer3\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import google.auth\n", - "\n", - "credentials, project = google.auth.default()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "47bde61a", - "metadata": { - "id": "47bde61a" - }, - "source": [ - "##**[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer design considering genomic variation in *Anopheles gambiae* s.l and *Anopheles funestus*** \n", - "**Author**: [Sanjay Curtis Nagi](https://sanjaynagi.github.io/) \n", - "**Email**: sanjay.nagi@lstmed.ac.uk \n", - "\n", - "Welcome to the AnoPrimer notebook. This is the long version of the notebook which designs primers in steps. Alternatively, there is a all-in-one function in a [short version of the notebook](https://colab.research.google.com/github/sanjaynagi/AnoPrimer/blob/main/notebooks/AnoPrimer-short.ipynb).\n", - "\n", - "We would like to design primers for PCR applications, such as genotyping or gene expression (qPCR). However, single nucleotide polymorphisms (SNPs) in primer binding sites can result in differences or failures in PCR amplification, referred to as null alleles. \n", - "\n", - "In general, mismatches caused by SNPs are more of a problem as you move towards the 3' end. I recommend reading a really good article on this topic on the IDT website - [Consider SNPs when designing PCR and qPCR assays](https://eu.idtdna.com/pages/education/decoded/article/considering-snps-when-designing-pcr-and-qpcr-assays). In *An. gambiae s.l*, there is a [huge amount of genetic variation](https://genome.cshlp.org/content/30/10/1533.full); a SNP found approximately every 1.9 bases (!), which makes considering SNPs even more important when designing molecular assays. Thanks to primer3-py and the fantastic malariagen_data package, we can do all of this in the cloud, hosted by google!\n", - "\n", - "\n", - "\n", - "####**Google Colab**\n", - "\n", - "If you are unfamiliar with iPython notebooks and google colab, I encourage you to read the [FAQ](https://research.google.com/colaboratory/faq.html) and watch the following [introduction](https://www.youtube.com/watch?v=inN8seMm7UI). In general, the cells contain python code, and can be run by pressing the play button next to each cell, and should be run in order.\n", - "\n", - "You may want to save a copy of the notebook into your google drive (`file -> save a copy in Drive`)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "SV03ewwc9de8", - "metadata": { - "cellView": "form", - "id": "SV03ewwc9de8", - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "#@title **Selecting Primer Parameters** { run: \"auto\" }\n", - "#@markdown In the below cells, replace the values with those desired for your primers and ensure to press the play button (top left) to run the cell.\n", - "\n", - "species = 'gambiae_sl' #@param [\"gambiae_sl\", \"funestus\"]\n", - "assay_type = 'cDNA primers' #@param [\"gDNA primers\", \"gDNA primers + probe\", \"probe\", \"cDNA primers\"]\n", - "assay_name = 'coeae2f' #@param {type:\"string\"}\n", - "min_amplicon_size = 60 #@param {type:\"integer\"}\n", - "max_amplicon_size = 120 #@param {type:\"integer\"}\n", - "amplicon_size_range = [[min_amplicon_size, max_amplicon_size]]\n", - "n_primer_pairs = 6 #@param {type:\"slider\", min:1, max:20, step:1}\n", - "cDNA_exon_junction=True #ignore\n", - "\n", - "#@markdown \n", - "#@markdown primer_target should be a region string ('2L:28545767') for gDNA primers and probes, and an AGAP transcript identifier for cDNA primers.\n", - "\n", - "primer_target = 'AGAP006228-RA' #@param {type:\"string\"} \n", - "sample_sets = 'AG1000G-GH' #@param {type:\"string\"}\n", - "sample_query = None #'taxon == \"coluzzii\"' " - ] - }, - { - "cell_type": "markdown", - "id": "aa733775", - "metadata": { - "id": "aa733775" - }, - "source": [ - "Load sequence data for chromosomal arm of choice, using the [malariagen_data API](https://malariagen.github.io/vector-data/ag3/api.html):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fa0be8d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5fa0be8d", - "outputId": "5e8379cc-2d78-49c6-cb6a-03429c25d7d9" - }, - "outputs": [], - "source": [ - "data_resource = AnoPrimer.retrieve_data_resource(species)\n", - "# Connect to the malariagen_data ag3 API\n", - "contig, target = AnoPrimer.check_and_split_target(species=species, target=primer_target, assay_type=assay_type)\n", - "genome_seq = data_resource.genome_sequence(region=contig)\n", - "print(f\"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "144a37ce", - "metadata": { - "id": "144a37ce" - }, - "source": [ - "Now we need to extract the bit of sequence we need. We will use functions in the [AnoPrimer](https://pypi.org/project/AnoPrimer/) package." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "R0CkEd38VXGY", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "R0CkEd38VXGY", - "outputId": "fbbbf120-e9b0-4328-8688-5eb3eb1d38f8" - }, - "outputs": [], - "source": [ - "target_sequence, gdna_pos, seq_parameters = AnoPrimer.prepare_sequence(\n", - " species=species,\n", - " target=target,\n", - " assay_type=assay_type,\n", - " assay_name=assay_name,\n", - " genome_seq=genome_seq,\n", - " amplicon_size_range=amplicon_size_range,\n", - " cDNA_exon_junction=cDNA_exon_junction,\n", - " )" - ] + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "51edba47", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "AM9EeHef6tAD", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "id": "_aaCJGAXUCSx", - "metadata": { - "id": "_aaCJGAXUCSx" - }, - "source": [ - "Now we have our target sequence. Lets take a look..." - ] + "id": "AM9EeHef6tAD", + "outputId": "b45dc9be-a741-40ee-8c5c-0ef5d7073426" + }, + "outputs": [], + "source": [ + "# First, install some packages we require\n", + "! pip install AnoPrimer -q " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "528ae85d", + "metadata": { + "id": "528ae85d" + }, + "outputs": [], + "source": [ + "# Import libraries \n", + "import AnoPrimer\n", + "import malariagen_data\n", + "import primer3\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import google.auth\n", + "\n", + "credentials, project = google.auth.default()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "47bde61a", + "metadata": { + "id": "47bde61a" + }, + "source": [ + "## **[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer design considering genomic variation in *Anopheles gambiae* s.l and *Anopheles funestus*** \n", + "**Author**: [Sanjay Curtis Nagi](https://sanjaynagi.github.io/) \n", + "**Email**: sanjay.nagi@lstmed.ac.uk \n", + "\n", + "---\n", + "\n", + "Nagi SC, Ashraf F, Miles A and Donnelly MJ. **[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer Design in malaria vectors informed by range-wide genomic variation** [version 1; peer review: 4 approved]. Wellcome Open Res 2024, 9:255 (https://doi.org/10.12688/wellcomeopenres.20998.1)\n", + "\n", + "---\n", + "\n", + "Welcome to the AnoPrimer notebook. This is the long version of the notebook which designs primers in steps. Alternatively, there is a [short version of the notebook](https://colab.research.google.com/github/sanjaynagi/AnoPrimer/blob/main/notebooks/AnoPrimer-short.ipynb).\n", + "\n", + "We would like to design primers for PCR applications, such as genotyping or gene expression (qPCR). However, single nucleotide polymorphisms (SNPs) in primer binding sites can result in differences or failures in PCR amplification, referred to as null alleles. In general, mismatches caused by SNPs are more of a problem as you move towards the 3' end. I recommend reading a really good article on this topic on the IDT website - [Consider SNPs when designing PCR and qPCR assays](https://eu.idtdna.com/pages/education/decoded/article/considering-snps-when-designing-pcr-and-qpcr-assays). In *An. gambiae s.l*, there is a [huge amount of genetic variation](https://genome.cshlp.org/content/30/10/1533.full); a SNP found approximately every 1.9 bases (!), which makes considering SNPs even more important when designing molecular assays. Thanks to primer3-py and the fantastic malariagen_data package, we can do all of this in the cloud, hosted by google!\n", + "\n", + "Note that due to changes in how MalariaGEN data is accessed, you will need to be registered with MalariaGEN to use AnoPrimer - https://malariagen.github.io/vector-data/vobs/vobs-data-access.html. \n", + "\n", + "#### **Google Colab**\n", + "\n", + "If you are unfamiliar with iPython notebooks and google colab, I encourage you to read the [FAQ](https://research.google.com/colaboratory/faq.html) and watch the following [introduction](https://www.youtube.com/watch?v=inN8seMm7UI). In general, the cells contain python code, and can be run by pressing the play button next to each cell, and should be run in order.\n", + "\n", + "You may want to save a copy of the notebook into your google drive (`file -> save a copy in Drive`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "SV03ewwc9de8", + "metadata": { + "cellView": "form", + "id": "SV03ewwc9de8", + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "#@title **Selecting Primer Parameters** { run: \"auto\" }\n", + "#@markdown In the below cells, replace the values with those desired for your primers and ensure to press the play button (top left) to run the cell.\n", + "\n", + "species = 'gambiae_sl' #@param [\"gambiae_sl\", \"funestus\"]\n", + "assay_type = 'cDNA primers' #@param [\"gDNA primers\", \"gDNA primers + probe\", \"probe\", \"cDNA primers\"]\n", + "assay_name = 'coeae2f' #@param {type:\"string\"}\n", + "min_amplicon_size = 60 #@param {type:\"integer\"}\n", + "max_amplicon_size = 120 #@param {type:\"integer\"}\n", + "amplicon_size_range = [[min_amplicon_size, max_amplicon_size]]\n", + "n_primer_pairs = 6 #@param {type:\"slider\", min:1, max:20, step:1}\n", + "cDNA_exon_junction=True #ignore\n", + "\n", + "#@markdown \n", + "#@markdown primer_target should be a region string ('2L:28545767') for gDNA primers and probes, and an AGAP transcript identifier for cDNA primers.\n", + "\n", + "primer_target = 'AGAP006228-RA' #@param {type:\"string\"} \n", + "sample_sets = 'AG1000G-GH' #@param {type:\"string\"}\n", + "sample_query = None #'taxon == \"coluzzii\"' " + ] + }, + { + "cell_type": "markdown", + "id": "aa733775", + "metadata": { + "id": "aa733775" + }, + "source": [ + "Load sequence data for chromosomal arm of choice, using the [malariagen_data API](https://malariagen.github.io/vector-data/ag3/api.html):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fa0be8d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": null, - "id": "nDaWXq7f9hyA", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 110 - }, - "id": "nDaWXq7f9hyA", - "outputId": "9d470214-1fee-4bc2-fe94-f6a837e5398a" - }, - "outputs": [], - "source": [ - "target_sequence" - ] + "id": "5fa0be8d", + "outputId": "5e8379cc-2d78-49c6-cb6a-03429c25d7d9" + }, + "outputs": [], + "source": [ + "data_resource = AnoPrimer.retrieve_data_resource(species)\n", + "# Connect to the malariagen_data ag3 API\n", + "contig, target = AnoPrimer.check_and_split_target(species=species, target=primer_target, assay_type=assay_type)\n", + "genome_seq = data_resource.genome_sequence(region=contig)\n", + "print(f\"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "144a37ce", + "metadata": { + "id": "144a37ce" + }, + "source": [ + "Now we need to extract the bit of sequence we need. We will use functions in the [AnoPrimer](https://pypi.org/project/AnoPrimer/) package." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "R0CkEd38VXGY", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "id": "38fe2530", - "metadata": { - "id": "38fe2530" - }, - "source": [ - "We need to set up some python dictionaries containing our sequence and primer parameters, this will be our input to primer3. In the below cell, you can modify or add primer3 parameters, such as optimal TM, GC content etc etc. A full list of possible parameters and their functions can be found in the [primer3 manual](https://primer3.org/manual.html)." - ] + "id": "R0CkEd38VXGY", + "outputId": "fbbbf120-e9b0-4328-8688-5eb3eb1d38f8" + }, + "outputs": [], + "source": [ + "seq_parameters = AnoPrimer.prepare_sequence(\n", + " species=species,\n", + " target=target,\n", + " assay_type=assay_type,\n", + " assay_name=assay_name,\n", + " genome_seq=genome_seq,\n", + " amplicon_size_range=amplicon_size_range,\n", + " cDNA_exon_junction=cDNA_exon_junction,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "_aaCJGAXUCSx", + "metadata": { + "id": "_aaCJGAXUCSx" + }, + "source": [ + "Now we have our target sequence. Lets take a look..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nDaWXq7f9hyA", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 110 }, - { - "cell_type": "code", - "execution_count": null, - "id": "622ac6d9", - "metadata": { - "id": "622ac6d9" - }, - "outputs": [], - "source": [ - "primer_parameters = {\n", - " 'PRIMER_OPT_SIZE': 20,\n", - " 'PRIMER_TASK':'generic',\n", - " 'PRIMER_MIN_SIZE': 17,\n", - " 'PRIMER_MAX_SIZE': 24,\n", - " 'PRIMER_OPT_TM': 60.0,\n", - " 'PRIMER_MIN_TM': 57.0,\n", - " 'PRIMER_MAX_TM': 63.0,\n", - " 'PRIMER_MIN_GC': 35.0,\n", - " 'PRIMER_MAX_GC': 75.0,\n", - " 'PRIMER_MIN_THREE_PRIME_DISTANCE':3, # this parameter is the minimum distance between successive pairs. If 1, it means successive primer pairs could be identical bar one base shift\n", - " 'PRIMER_INTERNAL_OPT_SIZE': 16, # Probe size preferences if selected, otherwise ignored\n", - " 'PRIMER_INTERNAL_MIN_SIZE': 10,\n", - " 'PRIMER_INTERNAL_MAX_SIZE': 22,\n", - " 'PRIMER_INTERNAL_MIN_TM': 45,\n", - " 'PRIMER_INTERNAL_MAX_TM': 65, # Probe considerations are quite relaxed, assumed that LNAs will be used later to affect TM\n", - " # Extra primer3 parameters can go here\n", - " # In the same format as above \n", - " }\n", - "\n", - "primer_parameters = AnoPrimer.primer_params(primer_parameters=primer_parameters, assay_type=assay_type, n_primer_pairs=n_primer_pairs, amplicon_size_range=amplicon_size_range) ## adds some parameters depending on assay type" - ] + "id": "nDaWXq7f9hyA", + "outputId": "9d470214-1fee-4bc2-fe94-f6a837e5398a" + }, + "outputs": [], + "source": [ + "seq_parameters['SEQUENCE_TEMPLATE']" + ] + }, + { + "cell_type": "markdown", + "id": "38fe2530", + "metadata": { + "id": "38fe2530" + }, + "source": [ + "We need to set up some python dictionaries containing our sequence and primer parameters, this will be our input to primer3. In the below cell, you can modify or add primer3 parameters, such as optimal TM, GC content etc etc. A full list of possible parameters and their functions can be found in the [primer3 manual](https://primer3.org/manual.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "622ac6d9", + "metadata": { + "id": "622ac6d9" + }, + "outputs": [], + "source": [ + "primer_parameters = {\n", + " 'PRIMER_OPT_SIZE': 20,\n", + " 'PRIMER_TASK':'generic',\n", + " 'PRIMER_MIN_SIZE': 17,\n", + " 'PRIMER_MAX_SIZE': 24,\n", + " 'PRIMER_OPT_TM': 60.0,\n", + " 'PRIMER_MIN_TM': 57.0,\n", + " 'PRIMER_MAX_TM': 63.0,\n", + " 'PRIMER_MIN_GC': 35.0,\n", + " 'PRIMER_MAX_GC': 75.0,\n", + " 'PRIMER_MIN_THREE_PRIME_DISTANCE':3, # this parameter is the minimum distance between successive pairs. If 1, it means successive primer pairs could be identical bar one base shift\n", + " 'PRIMER_INTERNAL_OPT_SIZE': 16, # Probe size preferences if selected, otherwise ignored\n", + " 'PRIMER_INTERNAL_MIN_SIZE': 10,\n", + " 'PRIMER_INTERNAL_MAX_SIZE': 22,\n", + " 'PRIMER_INTERNAL_MIN_TM': 45,\n", + " 'PRIMER_INTERNAL_MAX_TM': 65, # Probe considerations are quite relaxed, assumed that LNAs will be used later to affect TM\n", + " # Extra primer3 parameters can go here\n", + " # In the same format as above \n", + " }\n", + "\n", + "primer_parameters = AnoPrimer.primer_params(primer_parameters=primer_parameters, assay_type=assay_type, n_primer_pairs=n_primer_pairs, amplicon_size_range=amplicon_size_range) ## adds some parameters depending on assay type" + ] + }, + { + "cell_type": "markdown", + "id": "0819ba76", + "metadata": { + "id": "0819ba76" + }, + "source": [ + "#### **Run the primer3 algorithm!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae6e7525", + "metadata": { + "id": "ae6e7525" + }, + "outputs": [], + "source": [ + "primer_dict = primer3.designPrimers(seq_args=seq_parameters, global_args=primer_parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "56945ca6", + "metadata": { + "id": "56945ca6" + }, + "source": [ + "It should be *fast*. The output, which we call 'primer_dict', is a python dictionary containing the full results from primer3. We will turn this into a pandas dataframe (i.e a useful, pretty table), containing just the necessary bits of information. First, we'll print some information from the primer3 run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bb40846", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "id": "0819ba76", - "metadata": { - "id": "0819ba76" - }, - "source": [ - "#### **Run the primer3 algorithm!**" - ] + "id": "1bb40846", + "outputId": "667a6413-4d49-43e8-82e8-9fe4ad8f4c8f" + }, + "outputs": [], + "source": [ + "AnoPrimer.primer3_run_statistics(primer_dict, assay_type)" + ] + }, + { + "cell_type": "markdown", + "id": "CbVMwOdCrNMf", + "metadata": { + "id": "CbVMwOdCrNMf" + }, + "source": [ + "Now lets wrangle this into an easy to look at table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf8c5970", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae6e7525", - "metadata": { - "id": "ae6e7525", - "scrolled": false - }, - "outputs": [], - "source": [ - "primer_dict = primer3.designPrimers(seq_args=seq_parameters, global_args=primer_parameters)" - ] + "id": "bf8c5970", + "outputId": "3d446c36-fa75-4088-d141-eb1b9cccfc30" + }, + "outputs": [], + "source": [ + "primer_df = AnoPrimer.primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type)\n", + "primer_df" + ] + }, + { + "cell_type": "markdown", + "id": "Lf0CeEVrtaxP", + "metadata": { + "id": "Lf0CeEVrtaxP" + }, + "source": [ + "Next, we will create an AnoPrimer results object, which has further methods associated with it, which we can use to evaluate our primer sets. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50e5a9f3-95c5-4de7-8d78-23e5663a3cfc", + "metadata": {}, + "outputs": [], + "source": [ + "primers = AnoPrimer.AnoPrimerResults(\n", + " species=species,\n", + " data_resource=data_resource,\n", + " contig=contig,\n", + " assay_type=assay_type,\n", + " assay_name=assay_name,\n", + " target=target,\n", + " primer_df=primer_df,\n", + " seq_parameters=seq_parameters,\n", + " primer_parameters=primer_parameters,\n", + " )\n", + "\n", + "primers" + ] + }, + { + "cell_type": "markdown", + "id": "13472642-7c5e-4894-a961-cd0fe51daec4", + "metadata": {}, + "source": [ + "We can write our dataframe to .csv, .tsv and excel files, which can be explored in other editors. To download a file from colab to your local computer, click the folder panel on the left-hand sidebar, the three dots next your primers.tsv/.xlsx file, and download. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09eQYSeT7lHG", + "metadata": { + "id": "09eQYSeT7lHG" + }, + "outputs": [], + "source": [ + "primers.to_csv(f\"{assay_name}.{assay_type}.tsv\", sep=\"\\t\")\n", + "primers.to_excel(f\"{assay_name}.{assay_type}.xlsx\")" + ] + }, + { + "cell_type": "markdown", + "id": "b4a9776c", + "metadata": { + "id": "b4a9776c" + }, + "source": [ + "## **Looking for variation using the ag3/af1 resource**\n", + "\n", + "In Ag3, samples are organised into sample sets. We can load any sample set from the Ag3 resource, but there are quite a few! Lets look at what each sample set contains, breaking it down by species, year and country. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61286685", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "a97d545d424a4db19b337e5e9f508ec3", + "7ea4836435a340b698b172f4363e8443", + "8b82b57f2d944fe5b882827b44ee0759", + "69cd90440a034521ab0765203b544e82", + "f0d520fb105c44f1be119b77d30e1bbc", + "987cbbebf1b34152be251a0fdd7ea613", + "f61ccace1b824ddbafbf4de3cf249340", + "b3f0061d20e24891bbb0bdabd6b6a934", + "99fe7084c3e04dd8950d2b584576c5f2", + "72f5b3285926491abb6633ad0257abff", + "a6c3e95a12e24e9f943b26209be21ef3" + ] }, - { - "cell_type": "markdown", - "id": "56945ca6", - "metadata": { - "id": "56945ca6" - }, - "source": [ - "It should be *fast*. The output, which we call 'primer_dict', is a python dictionary containing the full results from primer3. We will turn this into a pandas dataframe (i.e a useful, pretty table), containing just the necessary bits of information. First, we'll print some information from the primer3 run." - ] + "id": "61286685", + "outputId": "c595a1b7-ef69-472f-9e88-334985ed070a" + }, + "outputs": [], + "source": [ + "primers.summarise_metadata(sample_sets=sample_sets, sample_query=sample_query)" + ] + }, + { + "cell_type": "markdown", + "id": "BLwvXBT28NYE", + "metadata": { + "id": "BLwvXBT28NYE" + }, + "source": [ + "Here, we can see the breakdown by sample set for country, species and year. For the purposes of this notebook, let's use the Ghana sample set. If we wanted to use all sample sets, we could supply '3.0' instead of a sample set, which will load all samples from the ag3.0 release." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49ca7f2b", + "metadata": { + "id": "49ca7f2b" + }, + "outputs": [], + "source": [ + "#sample_sets = None #'AG1000G-GH' # sample_set = '3.0' .you can also supply lists with multiple sample sets e.g ['AG1000G-GH', 'AG1000G-CI', 'AG1000G-BF-A']\n", + "#sample_query = None #\"taxon == 'coluzzii'\" # here we can subset to specific values in the metadata e.g : \"taxon == 'gambiae'\" , or \"taxon == 'arabiensis'\" " + ] + }, + { + "cell_type": "markdown", + "id": "67coxQKoo82x", + "metadata": { + "id": "67coxQKoo82x" + }, + "source": [ + "### **Plot allele frequencies in primers locations**\n", + "\n", + "Now we can plot the primers pairs, and the frequency of any alternate alleles in the Ag1000g sample set of choice. When calculating allele frequencies, we will take the sum of all alternate alleles, as we are interested here in any mutations which are different from the reference genome. We can see the frequencies of specific alleles by hovering over the points of the plot - in some cases it may be preferable to design degenerate primers rather than avoid a primer pair completely.\n", + "\n", + "We will also plot the primer Tm, GC and genomic spans of each primer binding site. We can use this plot to identify primers pairs and probes which may be suitable, particularly trying to avoid SNPs in the 3' end. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "shXKBpG49LSU", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "1a10feaaac164fe6bbdd0d9cd31cc7af", + "8779c1473ba843a1a6935c19ee50a37d", + "4122c2b626aa4977b12989a2194aa8bb", + "2286bb5011fc4b4bb43d37408f52f830", + "617428fec1344132a17cf1768da25d63", + "5d5e8119dac744618d72ea5e9a5b3982", + "f980c7cf030a4095b9350a81eb072efa", + "fb58512532ac4e37a6b5e163752e2597", + "1e7870ab87c64b71b0f931728ecddbf0", + "98a7da97707a45cba260ab174bad0c95", + "f34da91f0fcb44dc9729ffdcc4ec9c53" + ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bb40846", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1bb40846", - "outputId": "667a6413-4d49-43e8-82e8-9fe4ad8f4c8f" - }, - "outputs": [], - "source": [ - "AnoPrimer.primer3_run_statistics(primer_dict, assay_type)" - ] + "id": "shXKBpG49LSU", + "outputId": "8316fbf5-652d-41d8-fbfd-04ae8d7ec2ea" + }, + "outputs": [], + "source": [ + "results_dict = primers.plot_primer_snp_frequencies(\n", + " sample_sets=sample_sets, \n", + " sample_query=sample_query,\n", + " out_dir=\".\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "aCVuE1CiAR4h", + "metadata": { + "id": "aCVuE1CiAR4h" + }, + "source": [ + "Now lets plot these primer pairs across the genome, highlighting where they are in relation to any nearby exons..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cAacbw-q8Sgm", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 286 }, - { - "cell_type": "markdown", - "id": "CbVMwOdCrNMf", - "metadata": { - "id": "CbVMwOdCrNMf" - }, - "source": [ - "Now lets wrangle this into an easy to look at table." - ] + "id": "cAacbw-q8Sgm", + "outputId": "5c68b7ee-f1fd-4acd-ee03-0e5edd364c73" + }, + "outputs": [], + "source": [ + "primers.plot_primer_locs(\n", + " legend_loc='lower left',\n", + " out_dir=\".\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "-aIy4IyoBc7-", + "metadata": { + "id": "-aIy4IyoBc7-" + }, + "source": [ + "### **Checking our primers for specificity to the *Anopheles gambiae* genome**\n", + "\n", + "We can use a cool new python package, [gget](https://github.com/pachterlab/gget), to rapidly search our primers against the AgamP3 genome, to ensure they will not amplify any other regions of the genome. gget rapidly queries large databases - in this case we can use the program BLAT to align primer sequences. Unfortunately as the lengths of primer sequences are so short, it is at the limit of BLATs sensitivity, and in some cases, matches are not returned. gget can also currently only query the older AgamP3 assembly. Therefore, it is also recommended run a more exhaustive search in [Primer-BLAST](https://www.ncbi.nlm.nih.gov/tools/primer-blast/).\n", + "\n", + "Unfortunately, there is no funestus genome in the gget/blat reference database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "zP4LB2bQBgGg", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf8c5970", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - }, - "id": "bf8c5970", - "outputId": "3d446c36-fa75-4088-d141-eb1b9cccfc30" - }, - "outputs": [], - "source": [ - "primer_df = AnoPrimer.primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type)\n", - "primer_df" - ] + "id": "zP4LB2bQBgGg", + "outputId": "62c3ab54-8232-47a8-fb52-eafeeb1e2dfe" + }, + "outputs": [], + "source": [ + "if species == 'gambiae_sl':\n", + " blat_result_df = primers.gget_blat_genome(\n", + " primer_df, \n", + " assay_type, \n", + " assembly='anoGam3'\n", + " )\n", + "\n", + " blat_result_df" + ] + }, + { + "cell_type": "markdown", + "id": "dRflsODz83tW", + "metadata": { + "id": "dRflsODz83tW" + }, + "source": [ + "### **Alternative all-in-one functions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ea12dd4", + "metadata": { + "id": "5ea12dd4" + }, + "outputs": [], + "source": [ + "#view help for function\n", + "AnoPrimer.design_primers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fiakkk2m7RsS", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "markdown", - "id": "Lf0CeEVrtaxP", - "metadata": { - "id": "Lf0CeEVrtaxP" - }, - "source": [ - "\n", - "We can write this to .tsv and excel files, which can be explored in other editors. To download a file from colab to your local computer, click the folder panel on the left-hand sidebar, the three dots next your primers.tsv/.xlsx file, and download." - ] + "id": "fiakkk2m7RsS", + "outputId": "25205488-b98b-47c1-d8d2-779775164b88" + }, + "outputs": [], + "source": [ + "# primers = AnoPrimer.design_primers(\n", + "# species=species,\n", + "# assay_type='gDNA primers + probe',\n", + "# target='X:2422652',\n", + "# assay_name='chrom_x_region',\n", + "# n_primer_pairs=5,\n", + "# min_amplicon_size=60,\n", + "# max_amplicon_size=120,\n", + "# primer_parameters=\"default\",\n", + "# )\n", + "#\n", + "#\n", + "# primers.evaluate_primers(sample_sets=['AG1000G-BF-A'], sample_query=None, out_dir=\"./\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "g4r9sQectXxx", + "metadata": { + "id": "g4r9sQectXxx" + }, + "source": [ + "#### **We may now have designed suitable primers. However, there are some further considerations...**\n", + "\n", + "\n", + "- Primers should be run in [**NCBI Blast**](https://blast.ncbi.nlm.nih.gov/Blast.cgi), to ensure specificity against the host organism, and specificity for the genomic location of interest. I would recommend both doing a general blast and also specifically against *An. gambiae* (TaxonID = 7165).\n", + "\n", + "- If in multiplexed use with other primers or probes, primers must not interact with each other. This can be investigated on a one by one basis using the IDT tool [oligoanalyzer](https://eu.idtdna.com/calc/analyzer), though higher throughput algorithms may be required.\n", + "\n", + "- If designing Locked Nucleic Acid (LNA) probes for SNP detection, you will want to play around with the placement of LNAs in the oligo sequence, which can allow short probes (~10-14 bases) to bind with high affinity and discriminate between SNPs. IDT have a tool for this which allow you to check the binding affinity between mismatches, though it requires a log in https://eu.idtdna.com/calc/analyzer/lna. \n", + "\n", + "- Many more considerations.... [IDT - How to design primers and probes for PCR and qPCR](https://eu.idtdna.com/pages/education/decoded/article/designing-pcr-primers-and-probes) \n", + "\n", + "---\n", + " \n", + "\n", + "#### **Future development**\n", + "\n", + "Any contributions or suggestions on how we can improve this notebook are more than welcome. Please [email](mailto:sanjay.nagi@lstmed.ac.uk) or log an [issue on github](https://github.com/sanjaynagi/primerDesignAg/issues). This notebook and source code for AnoPrimer are located here - https://github.com/sanjaynagi/AnoPrimer/ \n", + "\n", + "---\n", + "\n", + "#### **References**\n", + "\n", + "Nagi SC, Ashraf F, Miles A and Donnelly MJ. **AnoPrimer: Primer Design in malaria vectors informed by range-wide genomic variation** [version 1; peer review: 4 approved]. Wellcome Open Res 2024, 9:255 (https://doi.org/10.12688/wellcomeopenres.20998.1)\n", + "\n", + "The Anopheles gambiae 1000 Genomes Consortium (2020). **Genome variation and population structure among 1142 mosquitoes of the African malaria vector species *Anopheles gambiae* and *Anopheles coluzzii***. *Genome Research*, **30**: 1533-1546. \n", + "https://genome.cshlp.org/content/early/2020/09/25/gr.262790.120\n", + "\n", + "Untergasser A, Cutcutache I, Koressaar T, Ye J, Faircloth BC, Remm M and Rozen SG (2012). **Primer3--new capabilities and interfaces**. *Nucleic Acids Research*. 40(15):e115." + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "name": "Primer Design in Anopheles gambiae.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "ce681de973941d5edd9bd94c9a2926b7fe65e17e578a68317f38265a230b8ca7" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1a10feaaac164fe6bbdd0d9cd31cc7af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8779c1473ba843a1a6935c19ee50a37d", + "IPY_MODEL_4122c2b626aa4977b12989a2194aa8bb", + "IPY_MODEL_2286bb5011fc4b4bb43d37408f52f830" + ], + "layout": "IPY_MODEL_617428fec1344132a17cf1768da25d63" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "09eQYSeT7lHG", - "metadata": { - "id": "09eQYSeT7lHG" - }, - "outputs": [], - "source": [ - "primer_df.to_csv(f\"{assay_name}.{assay_type}.tsv\", sep=\"\\t\")\n", - "primer_df.to_excel(f\"{assay_name}.{assay_type}.xlsx\")" - ] + "1e7870ab87c64b71b0f931728ecddbf0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "id": "b4a9776c", - "metadata": { - "id": "b4a9776c" - }, - "source": [ - "##**Looking for variation using the ag1000g resource and malariagen API**\n", - "\n", - "In Ag3, samples are organised into sample sets. We can load any sample set from the Ag3 resource, but there are quite a few! Lets look at what each sample set contains, breaking it down by species, year and country. " - ] + "2286bb5011fc4b4bb43d37408f52f830": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98a7da97707a45cba260ab174bad0c95", + "placeholder": "​", + "style": "IPY_MODEL_f34da91f0fcb44dc9729ffdcc4ec9c53", + "value": " 1/1 [00:00<00:00, 3.07it/s]" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "61286685", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49, - "referenced_widgets": [ - "a97d545d424a4db19b337e5e9f508ec3", - "7ea4836435a340b698b172f4363e8443", - "8b82b57f2d944fe5b882827b44ee0759", - "69cd90440a034521ab0765203b544e82", - "f0d520fb105c44f1be119b77d30e1bbc", - "987cbbebf1b34152be251a0fdd7ea613", - "f61ccace1b824ddbafbf4de3cf249340", - "b3f0061d20e24891bbb0bdabd6b6a934", - "99fe7084c3e04dd8950d2b584576c5f2", - "72f5b3285926491abb6633ad0257abff", - "a6c3e95a12e24e9f943b26209be21ef3" - ] - }, - "id": "61286685", - "outputId": "c595a1b7-ef69-472f-9e88-334985ed070a" - }, - "outputs": [], - "source": [ - "metadata = data_resource.sample_metadata()" - ] + "4122c2b626aa4977b12989a2194aa8bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb58512532ac4e37a6b5e163752e2597", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1e7870ab87c64b71b0f931728ecddbf0", + "value": 1 + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "iSTRbVpxD3oy", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "iSTRbVpxD3oy", - "outputId": "79277693-768b-4463-bdf9-bba06d102a09" - }, - "outputs": [], - "source": [ - "pivot_country_year_taxon = (\n", - " metadata\n", - " .pivot_table(\n", - " index=[\"sample_set\", \"year\", \"country\"], \n", - " columns=[\"taxon\"], \n", - " values=\"sample_id\",\n", - " aggfunc=\"count\",\n", - " fill_value=0\n", - " )\n", - ")\n", - "\n", - "pivot_country_year_taxon" - ] + "5d5e8119dac744618d72ea5e9a5b3982": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "id": "BLwvXBT28NYE", - "metadata": { - "id": "BLwvXBT28NYE" - }, - "source": [ - "Here, we can see the breakdown by sample set for country, species and year. For the purposes of this notebook, let's use the Ghana sample set. If we wanted to use all sample sets, we could supply '3.0' instead of a sample set, which will load all samples from the ag3.0 release." - ] + "617428fec1344132a17cf1768da25d63": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "49ca7f2b", - "metadata": { - "id": "49ca7f2b" - }, - "outputs": [], - "source": [ - "#sample_sets = None #'AG1000G-GH' # sample_set = '3.0' .you can also supply lists with multiple sample sets e.g ['AG1000G-GH', 'AG1000G-CI', 'AG1000G-BF-A']\n", - "#sample_query = None #\"taxon == 'coluzzii'\" # here we can subset to specific values in the metadata e.g : \"taxon == 'gambiae'\" , or \"taxon == 'arabiensis'\" " - ] + "69cd90440a034521ab0765203b544e82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_72f5b3285926491abb6633ad0257abff", + "placeholder": "​", + "style": "IPY_MODEL_a6c3e95a12e24e9f943b26209be21ef3", + "value": " 28/28 [00:07<00:00, 3.80it/s]" + } }, - { - "cell_type": "markdown", - "id": "67coxQKoo82x", - "metadata": { - "id": "67coxQKoo82x" - }, - "source": [ - "### **Plot allele frequencies in primers locations**\n", - "\n", - "Now we can plot the primers pairs, and the frequency of any alternate alleles in the Ag1000g sample set of choice. When calculating allele frequencies, we will take the sum of all alternate alleles, as we are interested here in any mutations which are different from the reference genome. We can see the frequencies of specific alleles by hovering over the points of the plot - in some cases it may be preferable to design degenerate primers rather than avoid a primer pair completely.\n", - "\n", - "We will also plot the primer Tm, GC and genomic spans of each primer binding site. We can use this plot to identify primers pairs and probes which may be suitable, particularly trying to avoid SNPs in the 3' end. " - ] + "72f5b3285926491abb6633ad0257abff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "shXKBpG49LSU", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "1a10feaaac164fe6bbdd0d9cd31cc7af", - "8779c1473ba843a1a6935c19ee50a37d", - "4122c2b626aa4977b12989a2194aa8bb", - "2286bb5011fc4b4bb43d37408f52f830", - "617428fec1344132a17cf1768da25d63", - "5d5e8119dac744618d72ea5e9a5b3982", - "f980c7cf030a4095b9350a81eb072efa", - "fb58512532ac4e37a6b5e163752e2597", - "1e7870ab87c64b71b0f931728ecddbf0", - "98a7da97707a45cba260ab174bad0c95", - "f34da91f0fcb44dc9729ffdcc4ec9c53" - ] - }, - "id": "shXKBpG49LSU", - "outputId": "8316fbf5-652d-41d8-fbfd-04ae8d7ec2ea" - }, - "outputs": [], - "source": [ - "results_dict = AnoPrimer.plot_primer_snp_frequencies(\n", - " species=species,\n", - " primer_df=primer_df,\n", - " gdna_pos=gdna_pos,\n", - " contig=contig,\n", - " sample_sets=sample_sets, \n", - " sample_query=sample_query,\n", - " assay_type=assay_type,\n", - " seq_parameters=seq_parameters,\n", - " out_dir=\".\"\n", - ")" - ] + "7ea4836435a340b698b172f4363e8443": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_987cbbebf1b34152be251a0fdd7ea613", + "placeholder": "​", + "style": "IPY_MODEL_f61ccace1b824ddbafbf4de3cf249340", + "value": "Load sample metadata: 100%" + } }, - { - "cell_type": "markdown", - "id": "aCVuE1CiAR4h", - "metadata": { - "id": "aCVuE1CiAR4h" - }, - "source": [ - "Now lets plot these primer pairs across the genome, highlighting where they are in relation to any nearby exons..." - ] + "8779c1473ba843a1a6935c19ee50a37d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d5e8119dac744618d72ea5e9a5b3982", + "placeholder": "​", + "style": "IPY_MODEL_f980c7cf030a4095b9350a81eb072efa", + "value": "Load sample metadata: 100%" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "cAacbw-q8Sgm", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 286 - }, - "id": "cAacbw-q8Sgm", - "outputId": "5c68b7ee-f1fd-4acd-ee03-0e5edd364c73" - }, - "outputs": [], - "source": [ - "AnoPrimer.plot_primer_locs(\n", - " species=species,\n", - " primer_df=primer_df, \n", - " primer_res_dict=results_dict,\n", - " assay_type=assay_type, \n", - " contig=contig, \n", - " seq_parameters=seq_parameters, \n", - " legend_loc='lower left',\n", - " out_dir=\".\",\n", - ")" - ] + "8b82b57f2d944fe5b882827b44ee0759": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3f0061d20e24891bbb0bdabd6b6a934", + "max": 28, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_99fe7084c3e04dd8950d2b584576c5f2", + "value": 28 + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "0JA92ApBcxLo", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - }, - "id": "0JA92ApBcxLo", - "outputId": "9fa467b0-ad5d-4fea-a9a3-ee5f8ff44335" - }, - "outputs": [], - "source": [ - "primer_df" - ] + "987cbbebf1b34152be251a0fdd7ea613": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "id": "-aIy4IyoBc7-", - "metadata": { - "id": "-aIy4IyoBc7-" - }, - "source": [ - "### **Checking our primers for specificity to the *Anopheles gambiae* genome**\n", - "\n", - "We can use a cool new python package, [gget](https://github.com/pachterlab/gget), to rapidly search our primers against the AgamP3 genome, to ensure they will not amplify any other regions of the genome. gget rapidly queries large databases - in this case we can use the program BLAT to align primer sequences. Unfortunately as the lengths of primer sequences are so short, it is at the limit of BLATs sensitivity, and in some cases, matches are not returned. gget can also currently only query the older AgamP3 assembly. Therefore, it is also recommended run a more exhaustive search in [Primer-BLAST](https://www.ncbi.nlm.nih.gov/tools/primer-blast/)." - ] + "98a7da97707a45cba260ab174bad0c95": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "zP4LB2bQBgGg", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 380 - }, - "id": "zP4LB2bQBgGg", - "outputId": "62c3ab54-8232-47a8-fb52-eafeeb1e2dfe" - }, - "outputs": [], - "source": [ - "if species == 'gambiae_sl':\n", - " blat_result_df = AnoPrimer.gget_blat_genome(\n", - " primer_df, \n", - " assay_type, \n", - " assembly='anoGam3'\n", - " )\n", - "\n", - " blat_result_df" - ] + "99fe7084c3e04dd8950d2b584576c5f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "id": "dRflsODz83tW", - "metadata": { - "id": "dRflsODz83tW" - }, - "source": [ - "### **Alternative all-in-one function**" - ] + "a6c3e95a12e24e9f943b26209be21ef3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ea12dd4", - "metadata": { - "id": "5ea12dd4" - }, - "outputs": [], - "source": [ - "#view help for function\n", - "AnoPrimer.designPrimers?" - ] + "a97d545d424a4db19b337e5e9f508ec3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ea4836435a340b698b172f4363e8443", + "IPY_MODEL_8b82b57f2d944fe5b882827b44ee0759", + "IPY_MODEL_69cd90440a034521ab0765203b544e82" + ], + "layout": "IPY_MODEL_f0d520fb105c44f1be119b77d30e1bbc" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "fiakkk2m7RsS", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "fiakkk2m7RsS", - "outputId": "25205488-b98b-47c1-d8d2-779775164b88" - }, - "outputs": [], - "source": [ - "# primer_df, blat_df = AnoPrimer.designPrimers(\n", - "# species=species,\n", - "# assay_type='gDNA primers + probe',\n", - "# target='X:2422652' ,\n", - "# assay_name='chrom_x_region',\n", - "# n_primer_pairs=5,\n", - "# min_amplicon_size=60,\n", - "# max_amplicon_size=120,\n", - "# primer_parameters=\"default\",\n", - "# #sample_sets=['AG1000G-BF-A', 'AG1000G-GH', 'AG1000G-GN-A'],\n", - "# out_dir=\".\"\n", - "# )" - ] + "b3f0061d20e24891bbb0bdabd6b6a934": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "g4r9sQectXxx", - "metadata": { - "id": "g4r9sQectXxx" - }, - "source": [ - "####**We may now have designed suitable primers. However, there are some further considerations...**\n", - "\n", - "\n", - "- Primers **must** be run in [**NCBI Blast**](https://blast.ncbi.nlm.nih.gov/Blast.cgi), to ensure specificity against the host organism, and specificity for the genomic location of interest. I would recommend both doing a general blast and also specifically against *An. gambiae* (TaxonID = 7165).\n", - "\n", - "- If in multiplexed use with other primers or probes, primers must not interact with each other. This can be investigated on a one by one basis using the IDT tool [oligoanalyzer](https://eu.idtdna.com/calc/analyzer), though higher throughput algorithms may be required.\n", - "\n", - "- If designing Locked Nucleic Acid (LNA) probes for SNP detection, you will want to play around with the placement of LNAs in the oligo sequence, which can allow short probes (~10-14 bases) to bind with high affinity and discriminate between SNPs. IDT have a tool for this which allow you to check the binding affinity between mismatches, though it requires a log in https://eu.idtdna.com/calc/analyzer/lna. \n", - "\n", - "- Many more considerations.... [IDT - How to design primers and probes for PCR and qPCR](https://eu.idtdna.com/pages/education/decoded/article/designing-pcr-primers-and-probes) \n", - "\n", - "\\\n", - " \n", - "\n", - "####**Future development**\n", - "\n", - "Any contributions or suggestions on how we can improve this notebook are more than welcome. Please [email](mailto:sanjay.nagi@lstmed.ac.uk) or log an [issue on github](https://github.com/sanjaynagi/primerDesignAg/issues). This notebook and source code for AnoPrimer are located here - https://github.com/sanjaynagi/AnoPrimer/ \n", - "\n", - "\\\n", - "####**References**\n", - "\n", - "The Anopheles gambiae 1000 Genomes Consortium (2020). **Genome variation and population structure among 1142 mosquitoes of the African malaria vector species *Anopheles gambiae* and *Anopheles coluzzii***. *Genome Research*, **30**: 1533-1546. \n", - "https://genome.cshlp.org/content/early/2020/09/25/gr.262790.120\n", - "\n", - "Untergasser A, Cutcutache I, Koressaar T, Ye J, Faircloth BC, Remm M and Rozen SG (2012). **Primer3--new capabilities and interfaces**. *Nucleic Acids Research*. 40(15):e115." - ] - } - ], - "metadata": { - "colab": { - "include_colab_link": true, - "name": "Primer Design in Anopheles gambiae.ipynb", - "provenance": [] + "f0d520fb105c44f1be119b77d30e1bbc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "f34da91f0fcb44dc9729ffdcc4ec9c53": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "f61ccace1b824ddbafbf4de3cf249340": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "vscode": { - "interpreter": { - "hash": "ce681de973941d5edd9bd94c9a2926b7fe65e17e578a68317f38265a230b8ca7" - } + "f980c7cf030a4095b9350a81eb072efa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "1a10feaaac164fe6bbdd0d9cd31cc7af": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8779c1473ba843a1a6935c19ee50a37d", - "IPY_MODEL_4122c2b626aa4977b12989a2194aa8bb", - "IPY_MODEL_2286bb5011fc4b4bb43d37408f52f830" - ], - "layout": "IPY_MODEL_617428fec1344132a17cf1768da25d63" - } - }, - "1e7870ab87c64b71b0f931728ecddbf0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2286bb5011fc4b4bb43d37408f52f830": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_98a7da97707a45cba260ab174bad0c95", - "placeholder": "​", - "style": "IPY_MODEL_f34da91f0fcb44dc9729ffdcc4ec9c53", - "value": " 1/1 [00:00<00:00, 3.07it/s]" - } - }, - "4122c2b626aa4977b12989a2194aa8bb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fb58512532ac4e37a6b5e163752e2597", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1e7870ab87c64b71b0f931728ecddbf0", - "value": 1 - } - }, - "5d5e8119dac744618d72ea5e9a5b3982": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "617428fec1344132a17cf1768da25d63": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "69cd90440a034521ab0765203b544e82": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_72f5b3285926491abb6633ad0257abff", - "placeholder": "​", - "style": "IPY_MODEL_a6c3e95a12e24e9f943b26209be21ef3", - "value": " 28/28 [00:07<00:00, 3.80it/s]" - } - }, - "72f5b3285926491abb6633ad0257abff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7ea4836435a340b698b172f4363e8443": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_987cbbebf1b34152be251a0fdd7ea613", - "placeholder": "​", - "style": "IPY_MODEL_f61ccace1b824ddbafbf4de3cf249340", - "value": "Load sample metadata: 100%" - } - }, - "8779c1473ba843a1a6935c19ee50a37d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5d5e8119dac744618d72ea5e9a5b3982", - "placeholder": "​", - "style": "IPY_MODEL_f980c7cf030a4095b9350a81eb072efa", - "value": "Load sample metadata: 100%" - } - }, - "8b82b57f2d944fe5b882827b44ee0759": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b3f0061d20e24891bbb0bdabd6b6a934", - "max": 28, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_99fe7084c3e04dd8950d2b584576c5f2", - "value": 28 - } - }, - "987cbbebf1b34152be251a0fdd7ea613": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "98a7da97707a45cba260ab174bad0c95": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "99fe7084c3e04dd8950d2b584576c5f2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a6c3e95a12e24e9f943b26209be21ef3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a97d545d424a4db19b337e5e9f508ec3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7ea4836435a340b698b172f4363e8443", - "IPY_MODEL_8b82b57f2d944fe5b882827b44ee0759", - "IPY_MODEL_69cd90440a034521ab0765203b544e82" - ], - "layout": "IPY_MODEL_f0d520fb105c44f1be119b77d30e1bbc" - } - }, - "b3f0061d20e24891bbb0bdabd6b6a934": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f0d520fb105c44f1be119b77d30e1bbc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f34da91f0fcb44dc9729ffdcc4ec9c53": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f61ccace1b824ddbafbf4de3cf249340": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f980c7cf030a4095b9350a81eb072efa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fb58512532ac4e37a6b5e163752e2597": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } + "fb58512532ac4e37a6b5e163752e2597": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } } - }, - "nbformat": 4, - "nbformat_minor": 5 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/notebooks/AnoPrimer-short.ipynb b/notebooks/AnoPrimer-short.ipynb index dc84e5c..18395d0 100644 --- a/notebooks/AnoPrimer-short.ipynb +++ b/notebooks/AnoPrimer-short.ipynb @@ -24,592 +24,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "3md9Z9mRQRJn" }, - "outputs": [ - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " const force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - "const JS_MIME_TYPE = 'application/javascript';\n", - " const HTML_MIME_TYPE = 'text/html';\n", - " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " const CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " const script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " function drop(id) {\n", - " const view = Bokeh.index.get_by_id(id)\n", - " if (view != null) {\n", - " view.model.document.clear()\n", - " Bokeh.index.delete(view)\n", - " }\n", - " }\n", - "\n", - " const cell = handle.cell;\n", - "\n", - " const id = cell.output_area._bokeh_element_id;\n", - " const server_id = cell.output_area._bokeh_server_id;\n", - "\n", - " // Clean up Bokeh references\n", - " if (id != null) {\n", - " drop(id)\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd_clean, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " const id = msg.content.text.trim()\n", - " drop(id)\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd_destroy);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " const output_area = handle.output_area;\n", - " const output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " const bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " const script_attrs = bk_div.children[0].attributes;\n", - " for (let i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " const toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " const events = require('base/js/events');\n", - " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " const NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " const el = document.getElementById(null);\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error(url) {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (let i = 0; i < css_urls.length; i++) {\n", - " const url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " for (let i = 0; i < js_urls.length; i++) {\n", - " const url = js_urls[i];\n", - " const element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.async = false;\n", - " element.src = url;\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n", - " const css_urls = [];\n", - "\n", - " const inline_js = [ function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - "function(Bokeh) {\n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " if (root.Bokeh !== undefined || force === true) {\n", - " for (let i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - "} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " const force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - "const JS_MIME_TYPE = 'application/javascript';\n", - " const HTML_MIME_TYPE = 'text/html';\n", - " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " const CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " const script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " function drop(id) {\n", - " const view = Bokeh.index.get_by_id(id)\n", - " if (view != null) {\n", - " view.model.document.clear()\n", - " Bokeh.index.delete(view)\n", - " }\n", - " }\n", - "\n", - " const cell = handle.cell;\n", - "\n", - " const id = cell.output_area._bokeh_element_id;\n", - " const server_id = cell.output_area._bokeh_server_id;\n", - "\n", - " // Clean up Bokeh references\n", - " if (id != null) {\n", - " drop(id)\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd_clean, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " const id = msg.content.text.trim()\n", - " drop(id)\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd_destroy);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " const output_area = handle.output_area;\n", - " const output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " const bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " const script_attrs = bk_div.children[0].attributes;\n", - " for (let i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " const toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " const events = require('base/js/events');\n", - " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " const NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " const el = document.getElementById(null);\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error(url) {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (let i = 0; i < css_urls.length; i++) {\n", - " const url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " for (let i = 0; i < js_urls.length; i++) {\n", - " const url = js_urls[i];\n", - " const element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.async = false;\n", - " element.src = url;\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n", - " const css_urls = [];\n", - "\n", - " const inline_js = [ function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - "function(Bokeh) {\n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " if (root.Bokeh !== undefined || force === true) {\n", - " for (let i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - "} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(null);\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import AnoPrimer\n", "import pandas as pd" @@ -621,16 +40,22 @@ "id": "aygnIeKqQbUR" }, "source": [ - "##**[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer design considering genetic variation in *Anopheles gambiae* and *Anopheles funestus***\n", + "## **[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer design considering genetic variation in *Anopheles gambiae* and *Anopheles funestus***\n", "**Author**: [Sanjay Curtis Nagi](https://sanjaynagi.github.io/) \n", "**Email**: sanjay.nagi@lstmed.ac.uk \n", "\n", + "---\n", + "\n", + "Nagi SC, Ashraf F, Miles A and Donnelly MJ. **[AnoPrimer](https://github.com/sanjaynagi/AnoPrimer): Primer Design in malaria vectors informed by range-wide genomic variation** [version 1; peer review: 4 approved]. Wellcome Open Res 2024, 9:255 (https://doi.org/10.12688/wellcomeopenres.20998.1)\n", + "\n", + "---\n", + "\n", "This notebook allows users to run AnoPrimer, without running the full, extended colaboratory [notebook](https://colab.research.google.com/github/sanjaynagi/AnoPrimer/blob/main/notebooks/AnoPrimer-long.ipynb)." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "NCEovYA6RrH4" }, @@ -663,7 +88,7 @@ "id": "eZH4YGmhR6lK" }, "source": [ - "Run the `AnoPrimer.designPrimers()` function to design primers. primer_parameters can either be \"default\", or we can pass `primer_parameters` from the above cell. Here's the [primer3 manual](https://primer3.org/manual.html). " + "Run the `AnoPrimer.design_primers()` function to design primers. primer_parameters can either be \"default\", or we can pass `primer_parameters` from the above cell. Here's the [primer3 manual](https://primer3.org/manual.html). " ] }, { @@ -672,20 +97,9 @@ "metadata": { "id": "dfOtlN0IQb4p" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Our genome sequence for 2L is 49364325 bp long\n", - "Exon junctions for AGAP006222-RA: [ 238 325 911 1353] [28524463, 28524621, 28525282, 28525790] \n", - "\n", - "Access SNP calls: ⠸ (0:00:03.53) " - ] - } - ], + "outputs": [], "source": [ - "primer_df, blat_df = AnoPrimer.designPrimers(\n", + "primers = AnoPrimer.design_primers(\n", " species='gambiae_sl',\n", " assay_type='cDNA primers', # assay_type options are: 'cDNA primers', 'gDNA primers', 'gDNA primers + probe', 'probe'\n", " target='AGAP006222-RA', #'AGAP000818-RA' target should be an AGAP/AFUN transcript identifier for qPCR, otherwise should be a contig:integer string in genome, such as '2L:28545767'\n", @@ -694,8 +108,6 @@ " min_amplicon_size=60,\n", " max_amplicon_size=120,\n", " primer_parameters=\"default\",\n", - " sample_sets=['AG1000G-BF-A', 'AG1000G-GH', 'AG1000G-GN-A'],\n", - " out_dir=\".\"\n", " )" ] }, @@ -705,7 +117,25 @@ "metadata": {}, "outputs": [], "source": [ - "primer_df" + "primers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "primers.df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "primers.evaluate_primers(sample_sets=['AG1000G-BF-A'], sample_query=None, out_dir=\"./\")" ] } ], diff --git a/pyproject.toml b/pyproject.toml index 5710583..1680be4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "AnoPrimer" -version = "1.0.2" +version = "1.1.0" description = "A package to design primers in Anopheles gambiae whilst considering genetic variation with malariagen_data" readme = "README.md" documentation = "https://sanjaynagi.github.io/anoprimer/latest/"