-
Notifications
You must be signed in to change notification settings - Fork 0
/
dna.py
122 lines (97 loc) · 3.97 KB
/
dna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# DNA Matching
#
# Copyright (c) 2023 Alessandro Amatucci Girlanda
#
# This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.
#
# You are free to:
# - Share — copy and redistribute the material in any medium or format
# - Adapt — remix, transform, and build upon the material
#
# Under the following terms:
# - Attribution — You must give appropriate credit, provide a link to the license, and
# indicate if changes were made. You may do so in any reasonable manner,
# but not in any way that suggests the licensor endorses you or your use.
# - NonCommercial — You may not use the material for commercial purposes.
# - ShareAlike — If you remix, transform, or build upon the material, you must
# distribute your contributions under the same license as the original.
#
# No additional restrictions — You may not apply legal terms or technological measures
# that legally restrict others from doing anything the license permits.
#
# Acknowledgment:
# This repository includes the Databases and Sequences folders developed by Harvard University CS50.
import csv
import sys
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Reads two command-line arguments: the path of a CSV database and the path
    of a text file containing a DNA sequence. The first CSV column is skipped
    as the name column; every other column header is treated as an STR whose
    value in each row is that person's expected longest run. The person's
    name is read from the "name" field of the row.

    Prints the name of the first row whose counts all equal the longest runs
    found in the sequence, or "No match" if there is none. With the wrong
    number of arguments, prints a usage message and exits with status 1.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if an STR count in the database is not an integer.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        sys.exit(1)

    # Read database file into a list of row dicts. The context manager closes
    # the file even if parsing raises; newline="" is what the csv module
    # expects so that it can handle line endings itself.
    with open(sys.argv[1], "r", newline="") as data_file:
        data_reader = csv.DictReader(data_file)
        profiles = list(data_reader)
        # Take the header from the reader rather than from the first row, so
        # a database with no rows (or no content at all) does not raise
        # IndexError. Must be read while the file is still open.
        fieldnames = data_reader.fieldnames or []

    # Every column after the first (the "name" column) is an STR
    str_names = list(fieldnames[1:])

    # Read DNA sequence file into a variable
    with open(sys.argv[2], "r") as dna_file:
        dna_sequence = dna_file.read()

    # Find longest match of each STR in DNA sequence
    # e.g., repeats = longest_match(dna_sequence, "AGATC")
    str_counts = {name: longest_match(dna_sequence, name) for name in str_names}

    # Check database for matching profiles
    dna_match = "No match"
    # With no STR columns there is nothing to compare, so nobody can match
    if str_names:
        for person in profiles:
            # all() stops at the first STR whose count differs
            if all(int(person[name]) == str_counts[name] for name in str_names):
                dna_match = person["name"]
                break

    print(dna_match)
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A run is a number of copies of ``subsequence`` appearing back to back,
    with no other characters in between.

    Args:
        sequence: The text to search.
        subsequence: The pattern whose consecutive repeats are counted.

    Returns:
        The largest number of consecutive repeats found, or 0 if the pattern
        never occurs. An empty ``subsequence`` yields 0.
    """
    subsequence_length = len(subsequence)

    # An empty pattern equals every empty slice, so the scan below would
    # never leave its inner loop; there is no meaningful run to report.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(len(sequence)):
        # Count of consecutive runs starting at position i
        count = 0

        # Slide a window of subsequence_length forward one whole pattern at a
        # time for as long as it keeps matching
        while True:
            start = i + count * subsequence_length
            end = start + subsequence_length

            # Slicing past the end yields a shorter slice, which cannot equal
            # the (non-empty) pattern, so the loop always terminates
            if sequence[start:end] != subsequence:
                break
            count += 1

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
# Run the program only when executed as a script, so that importing this
# module (e.g. to reuse longest_match) does not parse sys.argv or exit
if __name__ == "__main__":
    main()