Skip to content

Commit

Permalink
Merge pull request #27 from sanjaynagi/af1-support-anoprimer-19-12-22
Browse files Browse the repository at this point in the history
af1 support
  • Loading branch information
sanjaynagi authored Aug 18, 2023
2 parents 18ec124 + d182462 commit 60d4593
Show file tree
Hide file tree
Showing 19 changed files with 491 additions and 3,153 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/github-action-AgamPrimer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.8', '3.10', '3.11']

steps:

Expand All @@ -39,8 +39,12 @@ jobs:
- name: Run notebooks
run: |
poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR_run.ipynb -k AgamPrimer -f tests/cDNA_Params_fun.json
poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR_run.ipynb -k AgamPrimer -f tests/cDNA_Params.json
poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR2_run.ipynb -k AgamPrimer -f tests/cDNA_Params_2_fun.json
poetry run papermill notebooks/AgamPrimer-long.ipynb qPCR2_run.ipynb -k AgamPrimer -f tests/cDNA_Params_2.json
poetry run papermill notebooks/AgamPrimer-long.ipynb gDNA_run.ipynb -k AgamPrimer -f tests/gDNA_probe_Params_fun.json
poetry run papermill notebooks/AgamPrimer-long.ipynb gDNA_run.ipynb -k AgamPrimer -f tests/gDNA_probe_Params.json
poetry run papermill notebooks/AgamPrimer-long.ipynb probe_run.ipynb -k AgamPrimer -f tests/probe_Params.json
poetry run papermill notebooks/AgamPrimer-long.ipynb probe_run.ipynb -k AgamPrimer -f tests/probe_Params_fun.json
poetry run papermill notebooks/AgamPrimer-short.ipynb short_run.ipynb -k AgamPrimer
134 changes: 101 additions & 33 deletions AgamPrimer/AgamPrimer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,19 @@
from plotly.subplots import make_subplots

ag3 = malariagen_data.Ag3(url="gs://vo_agam_release/", pre=True)
af1 = malariagen_data.Af1(url="gs://vo_afun_release/", pre=True)


def retrieve_data_resource(species):
assert species in [
"gambiae_sl",
"funestus",
], f"species {species} not recognised, please use 'gambiae_sl' or 'funestus'"
if species == "gambiae_sl":
data_resource = ag3
elif species == "funestus":
data_resource = af1
return data_resource


def prepare_gDNA_sequence(
Expand Down Expand Up @@ -75,12 +88,16 @@ def prepare_gDNA_sequence(
return (target_sequence, gdna_pos, seq_parameters)


def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction):
def prepare_cDNA_sequence(
species, transcript, genome_seq, assay_name, cDNA_exon_junction
):
"""
Extract exonic sequence for our transcript and record exon-exon junctions
PARAMETERS
----------
species: str
The species to design primers for. Either 'gambiae_sl' or 'funestus'
transcript : str
The AGAP identifier of the transcript to design primers for
genome_seq : dask.array.core.Array
Expand All @@ -92,7 +109,9 @@ def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction
"""

# subset gff to your gene
gff = ag3.geneset()
data_resource = retrieve_data_resource(species)

gff = data_resource.geneset()
gff = gff.query(f"type == 'exon' & Parent == '{transcript}'")
# Get fasta sequence for each of our exons, and remember gDNA position
seq = dict()
Expand Down Expand Up @@ -124,6 +143,7 @@ def prepare_cDNA_sequence(transcript, genome_seq, assay_name, cDNA_exon_junction


def prepare_sequence(
species,
target,
assay_type,
assay_name,
Expand All @@ -136,6 +156,8 @@ def prepare_sequence(
PARAMETERS
----------
species: str
The species to design primers for. Either 'gambiae_sl' or 'funestus'
target : str
The target to design primers for. For gDNA primers, this should be a contig:position string,
for example '2L:28545767'. For cDNA primers, this should be an AGAP identifier.
Expand Down Expand Up @@ -164,6 +186,7 @@ def prepare_sequence(
elif assay_type == "cDNA primers":
# quantitative PCR
target_sequence, gdna_pos, seq_parameters = prepare_cDNA_sequence(
species=species,
transcript=target,
genome_seq=genome_seq,
assay_name=assay_name,
Expand Down Expand Up @@ -323,7 +346,8 @@ def primer3_to_pandas(primer_dict, assay_type):
return primer_df


def plot_primer_ag3_frequencies(
def plot_primer_snp_frequencies(
species,
primer_df,
gdna_pos,
contig,
Expand All @@ -338,6 +362,8 @@ def plot_primer_ag3_frequencies(
PARAMETERS
----------
species: str
The species to design primers for. Either 'gambiae_sl' or 'funestus'
primer_df : pandas.DataFrame
A pandas DataFrame containing the primer sequences and their information, returned by primer3_to_pandas()
gdna_pos : numpy.ndarray
Expand Down Expand Up @@ -371,7 +397,14 @@ def plot_primer_ag3_frequencies(
# Loop through each primer pair and get the frequencies of alternate alleles, storing in dict
for i in range(len(primer_df.columns)):
res_dict[i] = _get_primer_alt_frequencies(
primer_df, gdna_pos, i, sample_sets, assay_type, contig, sample_query
species=species,
primer_df=primer_df,
gdna_pos=gdna_pos,
pair=i,
sample_sets=sample_sets,
assay_type=assay_type,
contig=contig,
sample_query=sample_query,
)

# Plot data with plotly
Expand All @@ -389,6 +422,7 @@ def plot_primer_ag3_frequencies(


def plot_primer_locs(
species,
primer_res_dict,
primer_df,
contig,
Expand All @@ -402,6 +436,8 @@ def plot_primer_locs(
PARAMETERS
----------
species: str
The species to design primers for. Either 'gambiae_sl' or 'funestus'
primer_res_dict : dict
A dictionary containing the primer3 results, returned by primer3.designPrimers()
primer_df : pandas.DataFrame
Expand All @@ -418,10 +454,15 @@ def plot_primer_locs(
out_dir : str, optional
The directory to write output files to. If not specified, outputs will not be written to file.
"""
data_resource = retrieve_data_resource(species=species)
oligos, _ = _return_oligo_list(assay_type)
assay_name = seq_parameters["SEQUENCE_ID"]
exon_id_col = (
"Name" if species == "gambiae_sl" else "ID"
) # funestus gff3 uses ID for exon name, gambiae uses Name

# Load geneset (gff)
gff = ag3.geneset()
gff = data_resource.geneset()
if any(item in assay_type for item in ["gDNA", "probe"]):
start = seq_parameters["GENOMIC_TARGET"] - 500
end = seq_parameters["GENOMIC_TARGET"] + 500
Expand Down Expand Up @@ -459,7 +500,7 @@ def plot_primer_locs(
# Add rectangles for exons one at a time
for _, exon in locgff.iterrows():
ex_start, ex_end = exon[["start", "end"]]
e_name = exon["Name"][-2:]
e_name = exon[exon_id_col][-2:]
strand = exon["strand"]
if strand == "+":
rect = patches.Rectangle(
Expand Down Expand Up @@ -606,42 +647,52 @@ def gget_blat_genome(primer_df, assay_type, assembly="anoGam3"):
print("No HITs found for these primer pairs")


def check_and_split_target(target, assay_type):
def check_and_split_target(species, target, assay_type):
data_resource = retrieve_data_resource(species=species)

# split contig from target
if target.startswith("AGAP"):
if target.startswith("AGAP") or target.startswith("LOC"):
assert (
assay_type == "cDNA primers"
), "an AGAP identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers."
gff = ag3.geneset()
), "an AGAP/AFUN identifier is specified, but the assay type is not cDNA primers. Please provide a contig:position identifier for gDNA primers."
gff = data_resource.geneset()
assert (
target in gff["ID"].to_list()
), f"requested target {target} not in ag3 transcript set"
), f"requested target {target} not in ag3/af1 transcript set"
contig = gff.query(f"ID == '{target}'")["contig"].unique()[0]
return (contig, target)
else:
assert isinstance(
target, str
), "For genomic DNA the target should be a string, such as '2L:28545767'"
contig, target = target.split(":")
assert contig in [
"2L",
"2R",
"3L",
"3R",
"X",
], "target contig not recognised, should be 2L, 2R, 3L, 3R or X"
if species == "gambiae_sl":
assert contig in [
"2L",
"2R",
"3L",
"3R",
"X",
], "target contig not recognised, should be 2L, 2R, 3L, 3R or X"
elif species == "funestus":
assert contig in [
"2RL",
"3RL",
"X",
], "target contig not recognised, should be 2RL, 3RL or X"
return (contig, int(target))


def designPrimers(
species,
assay_type,
assay_name,
min_amplicon_size,
max_amplicon_size,
n_primer_pairs,
target,
primer_parameters,
sample_sets,
sample_sets=None,
sample_query=None,
out_dir=None,
cDNA_exon_junction=True,
Expand All @@ -651,6 +702,8 @@ def designPrimers(
Parameters
----------
species: str
The species to design primers for. Either 'gambiae_sl' or 'funestus'
assay_type: {'gDNA primers', 'cDNA primers', 'gDNA primers + probe', 'probe}, str
The type of oligos we wish to design. If 'gDNA primers' or 'cDNA primers' are specified,
then only primers will be designed. If 'gDNA primers + probe' is specified, then primers
Expand Down Expand Up @@ -692,9 +745,13 @@ def designPrimers(
A pandas DataFrame containing the BLAT alignment information for the primers.
"""

# check target is valid for assay type and find contig
data_resource = retrieve_data_resource(species=species)
oligos, _ = _return_oligo_list(assay_type)
contig, target = check_and_split_target(target=target, assay_type=assay_type)

# check target is valid for assay type and find contig
contig, target = check_and_split_target(
species=species, target=target, assay_type=assay_type
)
amplicon_size_range = [[min_amplicon_size, max_amplicon_size]]

# adds some necessary parameters depending on assay type
Expand All @@ -715,10 +772,11 @@ def designPrimers(
generate_defaults=False,
)
# load genome sequence
genome_seq = ag3.genome_sequence(region=contig)
genome_seq = data_resource.genome_sequence(region=contig)
print(f"Our genome sequence for {contig} is {genome_seq.shape[0]} bp long")

target_sequence, gdna_pos, seq_parameters = prepare_sequence(
species=species,
target=target,
assay_type=assay_type,
assay_name=assay_name,
Expand All @@ -745,7 +803,8 @@ def designPrimers(
primer_df = primer3_to_pandas(primer_dict=primer_dict, assay_type=assay_type)

# plot frequencies of alleles in primer pairs
results_dict = plot_primer_ag3_frequencies(
results_dict = plot_primer_snp_frequencies(
species=species,
primer_df=primer_df,
gdna_pos=gdna_pos,
contig=contig,
Expand All @@ -758,6 +817,7 @@ def designPrimers(

# plot primer locations on genome
plot_primer_locs(
species=species,
primer_res_dict=results_dict,
primer_df=primer_df,
assay_type=assay_type,
Expand All @@ -784,16 +844,19 @@ def designPrimers(
index=[f"primer_{o}" for o in oligos]
)

# check primers for specificity against the genome and write to file
blat_df = gget_blat_genome(primer_df, assay_type, assembly="anoGam3")

# write primer3 and blat output to file
if out_dir:
primer_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.tsv", sep="\t")
primer_df.to_excel(f"{out_dir}/{assay_name}.{assay_type}.xlsx")
blat_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.blat.tsv", sep="\t")

return (primer_df, blat_df)
if species == "gambiae_sl":
# check primers for specificity against the genome and write to file
blat_df = gget_blat_genome(primer_df, assay_type, assembly="anoGam3")
# write primer3 and blat output to file
if out_dir:
blat_df.to_csv(f"{out_dir}/{assay_name}.{assay_type}.blat.tsv", sep="\t")
return primer_df, blat_df
else:
return primer_df, "Cannot BLAT against funestus, gambiae_sl only"


def extract_trailing_digits(string):
Expand All @@ -806,10 +869,14 @@ def extract_trailing_digits(string):
return None


def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=None):
def _get_primer_arrays(
species, contig, gdna_pos, sample_sets, assay_type, sample_query=None
):
data_resource = retrieve_data_resource(species=species)

if any(item in assay_type for item in ["gDNA", "probe"]):
span_str = f"{contig}:{gdna_pos.min()}-{gdna_pos.max()}"
snps = ag3.snp_calls(
snps = data_resource.snp_calls(
region=span_str, sample_sets=sample_sets, sample_query=sample_query
) # get genotypes
ref_alt_arr = snps["variant_allele"].compute().values
Expand All @@ -823,7 +890,7 @@ def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=N
exon_spans = np.array(_consecutive(gdna_pos)) + 1
for span in exon_spans:
span_str = f"{contig}:{span[0]}-{span[1]}"
snps = ag3.snp_calls(
snps = data_resource.snp_calls(
region=span_str, sample_sets=sample_sets, sample_query=sample_query
) # get genotypes
ref_alts = snps["variant_allele"]
Expand All @@ -842,7 +909,7 @@ def _get_primer_arrays(contig, gdna_pos, sample_sets, assay_type, sample_query=N


def _get_primer_alt_frequencies(
primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query
species, primer_df, gdna_pos, pair, sample_sets, assay_type, contig, sample_query
):
"""
Find the genomic locations of pairs of primers, and runs span_to_freq
Expand All @@ -851,6 +918,7 @@ def _get_primer_alt_frequencies(

oligos, _ = _return_oligo_list(assay_type)
base_freqs, ref_alt_arr, pos_arr = _get_primer_arrays(
species=species,
contig=contig,
gdna_pos=gdna_pos,
sample_sets=sample_sets,
Expand Down
Binary file removed AgamPrimer/__pycache__/agamPrimer.cpython-37.pyc
Binary file not shown.
19 changes: 0 additions & 19 deletions AgamPrimer/agamPrimer.egg-info/PKG-INFO

This file was deleted.

Empty file.
Empty file.
1 change: 0 additions & 1 deletion AgamPrimer/agamPrimer.egg-info/top_level.txt

This file was deleted.

Loading

0 comments on commit 60d4593

Please sign in to comment.