Skip to content

Commit

Permalink
Check for identical sequences and names
Browse files Browse the repository at this point in the history
  • Loading branch information
veghp committed Apr 19, 2024
1 parent 5be360b commit f3097b9
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 15 deletions.
51 changes: 46 additions & 5 deletions seqreport/SeqCollection.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def __init__(
currency_symbol="£",
projectname="",
comments="",
min_length=0,
max_length=0,
min_length=100, # a good cutoff for min DNA synthesis length
max_length=3000, # a good cutoff for max DNA synthesis length
name_length=15, # max seq record name length. Genbank character limit.
):
self.sequences = records
Expand All @@ -66,24 +66,65 @@ def calculate_values(self):
self.cost_per_seq = float(self.cost_per_seq)
self.n_seq = len(self.sequences)

# Numbers section
n_bp = 0
for part in self.sequences:
n_bp += len(part.seq)
self.n_bp = n_bp
self.cost = self.n_seq * self.cost_per_seq + self.n_bp * self.cost_per_base

# Lengths section
self.too_short = []
self.too_long = []
self.long_names = []
for record in self.sequences:
if len(record) < self.min_length:
self.too_short += [record.id]
if self.max_length > 0: # otherwise skip if default
if len(record) > self.max_length:
self.too_long += [record.id]
if len(record) > self.max_length:
self.too_long += [record.id]
if len(record.id) > self.name_length:
self.long_names += [record.id]

self.too_short = list(set(self.too_short))
self.too_long = list(set(self.too_long))
self.long_names = list(set(self.long_names))
# Repeats will be checked further below.

# For the PDF report:
self.too_short_text = " ; ".join(self.too_short)
self.too_long_text = " ; ".join(self.too_long)
self.long_names_text = " ; ".join(self.long_names)

# Repeats section
self.repeat_names = []
self.repeat_seq = []
self.reverse_complement_seq = []
for index, record in enumerate(self.sequences[:-1]):
for other_record in self.sequences[index+1:]:
if record.id == other_record.id:
self.repeat_names += [record.id]
if record.seq == other_record.seq:
self.repeat_seq += [(record.id, other_record.id)]
if record.seq == other_record.seq.reverse_complement():
self.reverse_complement_seq += [(record.id, other_record.id)]

self.repeat_names = list(set(self.repeat_names))

# For the PDF report:
self.repeat_names_text = " ; ".join(self.repeat_names)

repeat_seq_text_list = []
for identifier in self.repeat_seq:
combined = " = ".join(identifier)
repeat_seq_text_list += [combined]
self.repeat_seq_text = " ; ".join(repeat_seq_text_list)

reverse_complement_seq_text_list = []
for identifier in self.reverse_complement_seq:
combined = " = ".join(identifier)
reverse_complement_seq_text_list += [combined]
self.reverse_complement_seq_text = " ; ".join(reverse_complement_seq_text_list)


def read_fasta(fasta):
"""Read a FASTA sequence file into a list of records.
Expand Down
33 changes: 27 additions & 6 deletions seqreport/report_assets/seq_report.pug
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,48 @@ p.
p.
Estimated cost: <b>{{ seqcollection.currency_symbol }} {{ seqcollection.cost }}</b>

if seqcollection.too_short
if seqcollection.too_short_text
p.
#[span.red] Sequences shorter than <b>{{ seqcollection.min_length }}</b> bp: {{ seqcollection.too_short }}
#[span.red] Sequences shorter than <b>{{ seqcollection.min_length }}</b> bp: {{ seqcollection.too_short_text }}
else
p.
#[span.green] None of the sequences are shorter than <b>{{ seqcollection.min_length }}</b> bp.

if seqcollection.too_long
if seqcollection.too_long_text
p.
#[span.red] Sequences longer than <b>{{ seqcollection.max_length }}</b> bp: {{ seqcollection.too_long }}
#[span.red] Sequences longer than <b>{{ seqcollection.max_length }}</b> bp: {{ seqcollection.too_long_text }}
else
p.
#[span.green] None of the sequences are longer than <b>{{ seqcollection.max_length }}</b> bp.

if seqcollection.long_names
if seqcollection.long_names_text
p.
#[span.red] Sequence IDs longer than <b>{{ seqcollection.name_length }}</b> characters: {{ seqcollection.long_names }}
#[span.red] Sequence IDs longer than <b>{{ seqcollection.name_length }}</b> characters: {{ seqcollection.long_names_text }}
else
p.
#[span.green] None of the IDs are longer than <b>{{ seqcollection.name_length }}</b> characters.

if seqcollection.repeat_names_text
p.
#[span.red] Sequence IDs appearing more than once: {{ seqcollection.repeat_names_text }}
else
p.
#[span.green] None of the IDs are duplicated.

if seqcollection.repeat_seq_text
p.
#[span.red] Identical sequence-pairs: {{ seqcollection.repeat_seq_text }}
else
p.
#[span.green] None of the sequences are identical.

if seqcollection.reverse_complement_seq_text
p.
#[span.red] Sequence-pairs with identical reverse complements: {{ seqcollection.reverse_complement_seq_text }}
else
p.
#[span.green] None of the sequences are reverse complements.

if seqcollection.comments
p.
Comments: <i>{{ seqcollection.comments }}</i>
Expand Down
6 changes: 6 additions & 0 deletions tests/data/test.fa
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@ ACGGCCAATT
CTCAATCAGATCGATTGCAATGCACCATCGGG
>sequence3
TGATGATGAAAGACAGCCGATGACATGGCGTACTACAGACGC
>seq2
TGATGATGAAAGACAGCCGATGACCGT
>duplicate
CTCAATCAGATCGATTGCAATGCACCATCGGG
>reverse_complement
CCCGATGGTGCATTGCAATCGATCTGATTGAG
11 changes: 7 additions & 4 deletions tests/test_SeqCollection.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,23 @@ def test_SeqCollection():
comments="This is a test sequence set.",
min_length=20,
max_length=40,
name_length=5,
name_length=10,
)
assert seq_coll.n_seq == 3
assert seq_coll.n_bp == 84
assert seq_coll.n_seq == 6
assert seq_coll.n_bp == 175
assert seq_coll.projectname == "EGF24"
assert seq_coll.comments != ""
assert len(seq_coll.too_short) == 1
assert len(seq_coll.too_long) == 1
assert len(seq_coll.long_names) == 1
assert len(seq_coll.repeat_names) == 1
assert len(seq_coll.repeat_seq) == 1
assert len(seq_coll.reverse_complement_seq) == 2


def test_read_fasta():
seq_records = seqreport.read_fasta(seq_fasta)
assert len(seq_records) == 3
assert len(seq_records) == 6


def test_seqcollection_from_csv():
Expand Down

0 comments on commit f3097b9

Please sign in to comment.