-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_loader.py
222 lines (196 loc) · 9.41 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
© Copyright 2024 Kai Lo and Chong Wan
Loads data from all 7 of our different datasets for this project.
Each source pertains to a different part of the recommendation process.
Some datasets were found online, and some of them were generated by us (mainly using Google
Translate/Maps/Geocode/Search APIs) and then manually cleaned and inspected.
"""
import csv
import math
from dataclasses import dataclass
from districts import District
from graphs import Graph, WeightedGraph
from userdata import User
@dataclass
class UserPreferenceDogBreed:
    """Trait ratings for a single dog breed.

    Every integer field holds the breed's rating for one trait, based on the
    data collected by the American Kennel Club:
    https://www.kaggle.com/datasets/sujaykapadnis/dog-breeds

    The comment next to each field notes how that trait is meant to be treated
    when a breed is matched against a user's preferences.
    """

    breed_name: str
    affectionate_w_family: int  # desirable trait
    good_w_young_children: int  # desirable trait
    good_w_other_dog: int  # desirable trait
    shedding_level: int  # undesirable trait -> give it a negative weight
    openness_to_strangers: int  # desirable trait
    playfulness: int  # desirable trait
    protective_nature: int  # desirable trait
    adaptability: int  # desirable trait
    trainability: int  # desirable trait
    energy: int  # direction is left to the user
    barking: int  # undesirable trait
    stimulation_needs: int  # direction is left to the user
def load_dog_data(dog_data_file: str, districts: set[District]) -> tuple[Graph, WeightedGraph]:
    """Build the owner/breed graph and the district/breed graph from a CSV file.

    Creates two graphs:
        - a graph containing every user in the given dog data file and every
          dog breed, with an edge between each owner and the breed they own;
        - a weighted graph containing every district in the given dog data
          file and every dog breed, where the weight of a district-breed edge
          is the number of dogs of that breed counted for that district.

    Columns read from each row: 0 = user ID, 1 = age range ("low-high"),
    2 = gender, 4 = district ID, 5 = breed name. The first line of the file is
    a header and is skipped.

    A row is ignored when its age range or gender is blank, when its district
    ID does not belong to any of the given districts, or when the breed is a
    Mischling (mixed-breed) dog.

    A user that appears on several rows is represented by a single User object,
    created from the first accepted row for that user ID.
    """
    dog_graph = Graph()
    district_graph = WeightedGraph()
    district_mapping = {target.district_id: target for target in districts}
    users = {}  # user ID -> User, so that repeated owners share one vertex

    with open(dog_data_file, encoding='utf-8') as dog_data_content:
        reader = csv.reader(dog_data_content)
        next(reader, None)  # Skip the first line header
        for row in reader:
            user_id = int(row[0])

            raw_age_range = row[1]
            if not raw_age_range.strip():
                continue  # Missing age range

            gender = row[2].upper()
            if not gender.strip():
                continue  # Missing gender data

            district_id = int(row[4])
            if district_id not in district_mapping:
                continue  # Invalid district ID
            district = district_mapping[district_id]

            dog_breed = row[5].capitalize()
            # capitalize() lower-cases everything after the first letter, so the
            # comparison has to be case-insensitive to catch names in which
            # "Mischling" is not the first word.
            if 'mischling' in dog_breed.lower():
                continue  # Mixed-breed dogs are deliberately left out

            split_age_range = raw_age_range.split('-')
            age = (int(split_age_range[0]) + int(split_age_range[1])) // 2  # Midpoint of the range

            # Look the user up in the user cache (not in the district mapping):
            # a user ID that happens to equal a district ID must still be
            # created, and a known user must not be rebuilt on every row.
            if user_id not in users:
                users[user_id] = User(user_id, age, gender, district)
                dog_graph.add_vertex(users[user_id])
            user = users[user_id]

            dog_graph.add_vertex(dog_breed)
            dog_graph.add_edge(dog_breed, user)

            # Count the dog against the district named on this row, so the
            # tally does not depend on which row first introduced the user.
            if not district_graph.contains(district):
                district_graph.add_vertex(district)
            if not district_graph.contains(dog_breed):
                district_graph.add_vertex(dog_breed)
            current_weight = district_graph.get_weight(district, dog_breed)
            district_graph.add_edge(district, dog_breed, current_weight + 1)

    return dog_graph, district_graph
def load_district_data(district_data_file: str) -> set[District]:
    """Read the set of districts described by a district CSV file.

    The first line of the file is a header and is skipped. For every other
    row, column 1 is parsed as the integer district ID and column 2 is used as
    the district name.

    Sample row: ['261031', '31', 'Alt-Wiedikon', '261', '169']
    """
    with open(district_data_file, encoding='utf-8') as districts_data:
        rows = csv.reader(districts_data)
        next(rows, None)  # Drop the header line
        return {District(int(row[1]), row[2]) for row in rows}
def get_raw_district_distances(
        districts: set[District],
        district_distance_file: str
) -> dict[District, dict[District, float]]:
    """Map every known district to its distance from every other known district.

    The distances are loaded from the CSV file at district_distance_file. The
    first line is a header and is skipped. In each remaining row, column 0 is
    the origin district ID and column 1 is a '|'-separated list of
    'destination_id:distance' pairs.

    Raw data in this context means it has not been normalized (and remains in
    kilometers, not bounded by 0.0 and 1.0).

    Origins and destinations whose ID does not belong to any district in
    districts are skipped, as is the entry from a district to itself. Blank
    rows and empty pairs are ignored.
    """
    district_lookup = {target.district_id: target for target in districts}
    raw_district_distances = {}

    with open(district_distance_file) as district_distance_content:
        reader = csv.reader(district_distance_content)
        next(reader, None)  # Skip the header line
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for a blank line

            # .get() rather than indexing: an unknown ID must be skipped,
            # not raise KeyError before the check below can run.
            origin = district_lookup.get(int(row[0]))
            if origin is None:
                continue

            district_distances = {}
            for mapping in row[1].split('|'):
                if not mapping:
                    continue  # Empty cell or stray '|' separator
                mapping_split = mapping.split(':')
                destination_id, distance = int(mapping_split[0]), float(mapping_split[1])
                destination = district_lookup.get(destination_id)
                if destination is None or destination == origin:
                    continue
                district_distances[destination] = distance

            raw_district_distances[origin] = district_distances

    return raw_district_distances
def normalize_district_distances(raw_district_distances: dict[District, dict[District, float]]) -> None:
    """Rescale the raw (km) district distances in place to the range 0.0 - 1.0.

    The scale is also flipped, so that 1.0 marks the closest pair of districts
    and 0.0 the farthest pair. Mutates the given dictionary and returns None.

    If every distance is identical there is no range to scale over; in that
    case every entry becomes 1.0.

    Raises:
        ValueError: if the largest distance found is 0, which includes the
            case where the dictionary holds no distances at all.
    """
    min_distance = math.inf
    max_distance = 0
    for origin, destinations in raw_district_distances.items():
        for destination, distance in destinations.items():
            assert origin != destination
            min_distance = min(distance, min_distance)
            max_distance = max(distance, max_distance)

    if max_distance == 0:
        raise ValueError('no positive district distances to normalize')

    difference = max_distance - min_distance
    for destinations in raw_district_distances.values():
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for destination, distance in destinations.items():
            # A zero range means all distances are equal; dividing would raise
            # ZeroDivisionError, so treat every pair as equally (maximally) close.
            scaled = (distance - min_distance) / difference if difference else 0.0
            flipped = 1 - scaled
            destinations[destination] = flipped
            assert 0.0 <= flipped <= 1.0
def apply_district_distances(district_distances: dict[District, dict[District, float]]) -> None:
    """Copy the distances from the given dictionary onto the districts themselves.

    For every origin district and each of its destinations, calls
    origin.set_distance(destination, distance) with the value stored in
    district_distances. An origin must never list itself as a destination.
    """
    for origin, reachable in district_distances.items():
        for destination, distance in reachable.items():
            assert origin != destination
            origin.set_distance(destination, distance)
def dog_breed_data_loader(file: str) -> list[UserPreferenceDogBreed]:
    """Load the breed trait file (breed_traits.csv) into UserPreferenceDogBreed objects.

    The first line of the file is a header and is skipped. In every other row,
    column 0 is the breed name and columns 1 through 12 are parsed as the
    integer trait ratings, in the order the dataclass declares them.
    """
    with open(file) as dog_breed_file:
        dog_breed_file.readline()  # Discard the header line
        return [
            UserPreferenceDogBreed(row[0], *(int(row[column]) for column in range(1, 13)))
            for row in csv.reader(dog_breed_file)
        ]
def load_district_lat_lng(file: str, districts: set[District]) -> dict[District, tuple[float, float]]:
    """Load the latitude and longitude of each district from a mapping file.

    The first line of the file is a header and is skipped. In every other row,
    column 0 is a district name, column 1 its latitude and column 2 its
    longitude. Each name is resolved to the district in districts that carries
    the same district_name, and the result maps that district to its
    (latitude, longitude) pair.
    """
    by_name = {candidate.district_name: candidate for candidate in districts}
    coordinates = {}
    with open(file, encoding='utf-8') as district_file:
        district_file.readline()  # Discard the header line
        for row in csv.reader(district_file):
            position = (float(row[1]), float(row[2]))
            coordinates[by_name[row[0]]] = position
    return coordinates
def load_translation_mapping(file: str) -> dict[str, str]:
    """Load the German-to-English dog name mapping from a file.

    The first line of the file is a header and is skipped. In every other row,
    column 0 is the German name (the key) and column 1 is the English name
    (the value).
    """
    with open(file, encoding='utf-8') as translation_file:
        translation_file.readline()  # Discard the header line
        return {row[0]: row[1] for row in csv.reader(translation_file)}
def load_dog_images(file: str) -> dict[str, str]:
    """Load the mapping from ENGLISH dog names to online image URLs.

    The first line of the file is a header and is skipped. In every other row,
    column 0 is the dog name (the key) and column 1 is the image URL
    (the value).
    """
    with open(file) as images_file:
        images_file.readline()  # Discard the header line
        return {row[0]: row[1] for row in csv.reader(images_file)}