datasetAnalysis.py
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
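
# Encode the categorical columns of data/dataset.csv using the translation
# tables under data/translationTables/, then compute and plot correlation
# matrices of the encoded features.
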
def get_translation_table(filename):
file_path = f"data/translationTables/{filename}"
with open(file_path, "r") as inp:
reader = csv.reader(inp)
dict_csv = {rows[0]: rows[1] for rows in reader}
return dict_csv
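
# Each translation table is assumed (not inspected here) to be a two-column CSV
# mapping a raw categorical value to its integer code, for example:
#   THEFT,1
#   BATTERY,2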


def correlation_matrix(
    df: pd.DataFrame, out_path: str = "data/Dataset_Coorelation_Matrix.png"
):
    """
    Calculate the correlation matrix of a DataFrame, plot it as an annotated
    heatmap, and save the figure to out_path.
    """
    # Compute pairwise correlations over the numeric columns only
    matrix = df.corr(numeric_only=True)  # numeric_only requires pandas >= 1.5
    # Diverging colormap: opposite hues for negative vs. positive correlations
cmap = sns.diverging_palette(250, 15, s=75, l=40, n=9, center="light", as_cmap=True)
    # Mask the upper triangle so each pair of columns appears only once
mask = np.triu(np.ones_like(matrix, dtype=bool))
# Make figsize bigger
fig, ax = plt.subplots(figsize=(16, 12))
# Plot the matrix
    sns.heatmap(
        matrix,
        mask=mask,
        center=0,
        annot=True,
        fmt=".2f",
        square=True,
        cmap=cmap,
        ax=ax,
    )
    # Save the figure and close it so repeated calls do not accumulate figures
    fig.savefig(out_path)
    plt.close(fig)
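

# Load the raw dataset; the encoded columns below are mapped from data_raw.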
data_raw = pd.read_csv("data/dataset.csv")
data_raw.head()
# Show information about the original dataset and its column dtypes
# data_raw.info()
# Drop columns that are not used in the analysis:
# "ID", "Case Number", "Updated On", "Year", "Location", "Block"
data = data_raw.copy()
data = data.drop(
["ID", "Case Number", "Updated On", "Location", "Year", "Block"], axis=1
)
# data.head()
# Convert the following boolean columns to integers (True -> 1, False -> 0):
# "Arrest", "Domestic"
boolean_dict = {True: 1, False: 0}
data["Arrest"] = data_raw["Arrest"].map(boolean_dict)
data["Domestic"] = data_raw["Domestic"].map(boolean_dict)
# Convert the following columns using the translation tables:
# "IUCR", "Primary Type", "Location Description", "Description", "FBI Code"
primary_type_dict = get_translation_table("Primary_Type_Translation_Table.csv")
location_description_dict = get_translation_table(
"Location_Description_Translation_Table.csv"
)
iucr_dict = get_translation_table("IUCR_Translation_Table.csv")
fbi_code_dict = get_translation_table("FBI_Code_Translation_Table.csv")
description_dict = get_translation_table("Description_Translation_Table.csv")
data["Primary Type"] = (
data_raw["Primary Type"].map(primary_type_dict).fillna("0").astype(np.int64)
)
data["Location Description"] = (
data_raw["Location Description"]
.map(location_description_dict)
.fillna("0")
.astype(np.int64)
)
data["IUCR"] = data_raw["IUCR"].map(iucr_dict).fillna("0").astype(np.int64)
data["FBI Code"] = data_raw["FBI Code"].map(fbi_code_dict).fillna("0").astype(np.int64)
data["Description"] = (
data_raw["Description"].map(description_dict).fillna("0").astype(np.int64)
)
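# Values with no entry in a translation table become NaN after map();
# fillna("0") assigns them code 0 before the cast to int64.
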
# data.tail()
# Parse the Date column using the format '%m/%d/%Y %H:%M', then split it into
# separate day, month, week, hour, minute, and day-of-week columns for analysis
# data["Date"] = pd.to_datetime(
# data_raw["Date"], format="%m/%d/%Y %H:%M", errors="coerce"
# )
# data["Date_day"] = data["Date"].dt.day
# data["Date_month"] = data["Date"].dt.month
# data["Date_week"] = data["Date"].dt.week
# data["Date_hour"] = data["Date"].dt.hour
# data["Date_minute"] = data["Date"].dt.minute
# data["Date_dayofweek"] = data["Date"].dt.dayofweek
# data.tail()
# data.info()
# Sample testing for pairplots and pairwise correlation matrices
correlation_matrix(data)
data_s = data.drop(
    ["Ward", "FBI Code", "X Coordinate", "Y Coordinate", "Beat"], axis=1
)
# Save under a separate (illustrative) filename so the first figure is not overwritten
correlation_matrix(data_s, "data/Dataset_Coorelation_Matrix_Reduced.png")
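
# Pairplots are mentioned above but not produced here; a minimal, commented-out
# sketch follows. The sample size and output path are illustrative assumptions,
# not part of the original analysis:
# sample = data_s.select_dtypes(include=[np.number]).sample(
#     n=min(10_000, len(data_s)), random_state=0
# )
# grid = sns.pairplot(sample)
# grid.savefig("data/Dataset_Pairplot.png")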