This repository has been archived by the owner on Jan 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
week3.py
196 lines (145 loc) · 6.23 KB
/
week3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
#### Extracting data from data frames ####
#### Objectives ####
# review previous week's objectives
# Today:
# conditional subsetting
# grouping data
# visualizing data with matplotlib
# dealing with missing data
#### Getting set up ####
# make sure folks are working in project directory with data/
# load libraries
import pandas as pd
import matplotlib.pyplot as plt
# Make sure figures appear inline when running inside IPython/Jupyter.
# `%matplotlib inline` is IPython-only syntax and is a SyntaxError when this
# file is run with plain python3, so invoke the magic through the shell object
# and skip it when no IPython shell exists.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython: call plt.show() after plotting instead.
    pass
# read in data
clinical_df = pd.read_csv("data/clinical.csv")  # import data from csv file
# inspect output (values are echoed only in an interactive session)
clinical_df.head()
len(clinical_df)
#### Conditional subsetting ####
# motivation: extracting rows of data based on criteria
# what samples are from patients born in 1930?
clinical_df.year_of_birth == 1930
# this gives one True/False value per row (a boolean mask), not the rows themselves
# conditional subsetting: use the mask inside [] to keep only rows where it is True
# all patients born in 1930
clinical_df[clinical_df.year_of_birth == 1930]
# all patients NOT born in 1930
# (rows with a missing year_of_birth also pass this test, because NaN != 1930 is True)
clinical_df[clinical_df.year_of_birth != 1930]
# combining criteria: AND (&)
# each comparison needs its own parentheses because & binds more tightly than >= and <=
clinical_df[(clinical_df.year_of_birth >= 1930) & (clinical_df.year_of_birth <= 1940)]
# combining criteria: OR (|)
clinical_df[(clinical_df.year_of_birth == 1930) | (clinical_df.year_of_birth == 1931)]
## Challenge: print to the screen all data from clinical_df for patients with
# stage ia tumors who live more than 365 days
# what if you wanted to extract observations matching a collection of categories?
# create list of desired values
cancer_list = ["LGG", "UCEC", "GBM", "LUSC", "BRCA"]
# extract rows whose disease value appears anywhere in the list
reduced_clinical = clinical_df[clinical_df["disease"].isin(cancer_list)]
#### Grouping ####
# motivation: evaluating data available for a category (column)
# what categories exist for race?
# list the unique values in a column
pd.unique(clinical_df.race) # column accessed as an attribute; same as clinical_df["race"]
# how can we summarize data by category?
# group data by race (the resulting groupby object isn't directly readable by us)
grouped_data = clinical_df.groupby("race")
# note: groupby takes the column name as a string ("race") rather than attribute-style access
# summary stats for all columns by race
grouped_data.describe()
# for only one column
grouped_data.race.describe()
# count the number of each race (only one summary stat from above)
grouped_data.count()
# for only one column
grouped_data.race.count()
# count the number of each race for which days to death data is available
grouped_data.days_to_death.count()
# how does this differ from the last command?
# (count() only counts non-missing values, so missing days_to_death entries are left out)
# only display one race
grouped_data.days_to_death.count().asian
# remember this is synonymous with:
clinical_df.groupby("race")["days_to_death"].count()["asian"]
# this second command starts from the data frame (clinical_df) and uses bracket syntax
# instead of attribute syntax to identify the column and the group
# save output to object for later use
race_counts = grouped_data.days_to_death.count()
print(race_counts) # print() shows the result even when run as a script
## Challenge: Write code that will display:
# the number of patients in this dataset who are listed as alive
#### Visualizing grouped data as bar charts ####
# Create a quick bar chart of the number of patients of each race with days_to_death recorded
race_counts.plot(kind="bar");
# in a notebook, the trailing semicolon suppresses the echoed return value so only the plot shows
## Challenge:
# create a new object called total_count that counts the number of samples for each cancer type (disease)
total_count = clinical_df.groupby("disease")["disease"].count()
total_count = clinical_df.groupby("disease").disease.count() # same as above, using attribute syntax
# plot the number of samples for each cancer type
total_count.plot(kind="bar");
#### Missing data: replacing data in copied df ####
## replace missing data in copied data frame
# create new copy of data frame so the original clinical_df is left untouched
birth_replace = clinical_df.copy()
# look for rows with a missing year of birth
birth_replace[pd.isnull(birth_replace.year_of_birth)]
# fill missing values with 0
birth_replace.year_of_birth = birth_replace.year_of_birth.fillna(0)
# filling with 0 gives a different answer!
# (mean() skips missing values by default, but the filled-in zeros are included)
birth_replace.year_of_birth.mean()
clinical_df.year_of_birth.mean()
# fill NaN with the mean of all year_of_birth values
birth_replace.year_of_birth = birth_replace.year_of_birth.fillna(birth_replace.year_of_birth.mean())
# this won't do anything since we've already replaced all missing data!
# can convert between data types, but is difficult without dealing with missing data
# convert year_of_birth from float to integer
birth_replace.year_of_birth = birth_replace.year_of_birth.astype("int64")
birth_replace.year_of_birth.dtype
#clinical_df["year_of_birth"].dtype # gives error
# NOTE(review): reading .dtype on the original column should not itself raise; presumably the
# intended contrast is that astype("int64") fails while missing values remain - confirm
#### Missing data: masking ####
# mask: a True/False selector used to exclude missing values
# check for missing data anywhere in dataset
pd.isnull(clinical_df)
# gives a True/False matrix (True where a value is missing)
# other options include pd.isna (alias of pd.isnull) and pd.notna (the
# opposite mask: True where a value is present)
# extract all rows WITHOUT missing data
# ~ is the logical NOT operator for boolean masks; any(axis=1) flags a row
# if at least one of its columns is missing
clinical_df[~pd.isnull(clinical_df).any(axis=1)]
len(clinical_df[~pd.isnull(clinical_df).any(axis=1)])
# another way to extract rows WITHOUT missing data
clinical_df.dropna()
len(clinical_df.dropna())
# filtering for any missing data cuts out a lot of the dataset!
# exclude rows with missing data in only cigarettes per day
clinical_df[~pd.isnull(clinical_df.cigarettes_per_day)]
clinical_df.dropna(subset=["cigarettes_per_day"])  # same rows, using dropna
# save masked results to new object
smoke_complete = clinical_df.dropna(subset=["cigarettes_per_day"])
# apply additional filter for age at diagnosis (keep only values above 0)
smoke_complete = smoke_complete[smoke_complete.age_at_diagnosis > 0]
# save filtered data to file (index=False leaves the row index out of the csv)
smoke_complete.to_csv("data/smoke_complete.csv", index=False)
#### Visualizing scatterplots ####
# define plot variables (both columns come from the filtered smoke_complete data)
x = smoke_complete["cigarettes_per_day"]
y = smoke_complete["age_at_diagnosis"]
# plot two quantitative variables
plt.scatter(x, y)
# add transparency so overlapping points remain visible
plt.scatter(x, y, alpha=0.2)
# change colors to match cancer type
# show different categories
pd.unique(smoke_complete["disease"])
# create coded list of disease types (one integer code per disease category)
group = smoke_complete["disease"].astype("category").cat.codes
# create plot, coloring each point by its disease code
# NOTE(review): label=group passes the whole Series of codes as a single legend label,
# which is probably not what is intended - confirm before adding a legend
plt.scatter(x, y, alpha=0.2, c=group, label=group)
# add axis labels (need to execute all lines at once so the labels land on the same figure)
plt.scatter(x, y, alpha=0.2, c=group, label=group)
plt.xlabel("cigarettes per day")
plt.ylabel("age at diagnosis (days)")
#### Wrapping up ####
# review objectives
# preview next week's objectives