# 2. Pandas Data Structures.py
#### Pandas Data Structures ####
import pandas as pd
# 2.2.1 Creating a Series
# A Series built from a plain list, with no explicit row labels.
s = pd.Series(data=['banana', 42])
print(s)

# Row labels can be assigned manually by passing a Python list as `index`.
s1 = pd.Series(
    data=['Wes McKinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
print(s1)
# 2.2.2 Create a data frame
# Each dict key becomes a column name; each list holds that column's values.
scientists = pd.DataFrame({
    'Name': ['Rosalind Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Ages': [37, 61],
})
print(scientists)

# 2.3 The Series
# Select rows with a one-dimensional boolean mask built from a single column.
# Comparing `scientists.values` to a scalar would instead produce a 2-D mask
# over every cell, matching the value in any column and possibly repeating
# a row.
# NOTE: despite its name, `first_row` holds William Gosset's row (label 1).
first_row = scientists[scientists['Name'] == 'William Gosset']
print(first_row)
age37 = scientists[scientists['Ages'] == 37]
print(age37)
# A bare expression is only echoed in an interactive session; print it so the
# value is also shown when this file runs as a script.
print(first_row.index)
print(first_row.values)

# 2.3.1 The Series is ndarray-like
# Get the 'Ages' column as a Series and compute summary statistics on it.
print(scientists)
ages = scientists['Ages']
print(ages)
print(ages.mean())
print(ages.max())
print(ages.std())
# 2.3.2 Boolean Subsetting: Series
# Load the scientists data set from a local CSV file (absolute path).
scientists1 = pd.read_csv('/Users/russellconte/Documents/Pandas for Everyone/pandas_for_everyone-master/data/scientists.csv')
# A bare expression is only echoed in an interactive session; print it so the
# value is also shown when this file runs as a script.
print(scientists1)

Ages = scientists1['Age']
print(Ages)

# Compute the mean once and reuse it for the comparison below.
mean_age = Ages.mean()
print(mean_age)

# Indexing with a boolean Series keeps only the rows where the mask is True.
older_than_mean = Ages[Ages > mean_age]
print(older_than_mean)
print(type(older_than_mean))
print(Ages)

# sort_index orders by the row labels (not by the ages themselves);
# ascending=False puts the largest label first.
rev_ages = Ages.sort_index(ascending=False)
print(rev_ages)
# 2.4 The Data Frame
# Boolean vectors will subset rows: keep only the rows whose 'Age' exceeds 60.
# The result is printed because a bare expression is discarded in a script.
print(scientists1[scientists1['Age'] > 60])

# Split the frame by row position: the first four rows, then the remainder.
first_half = scientists1.iloc[:4]
last_half = scientists1.iloc[4:]
print(first_half)
print(last_half)
# 2.5 Making changes to a Series and to a data frame
# 2.5.1 Adding additional columns
# Show the dtypes of the 'Born' and 'Died' columns before converting them.
print(scientists1['Born'].dtype)
print(scientists1['Died'].dtype)

# Parse both columns into datetime values using the '%Y-%m-%d' format.
born_datetime = pd.to_datetime(scientists1['Born'], format='%Y-%m-%d')
print(born_datetime)
# A bare expression is discarded in a script, so print the frame explicitly.
print(scientists1)
died_datetime = pd.to_datetime(scientists1['Died'], format='%Y-%m-%d')
print(died_datetime)

# Add new columns with the born and died datetime values.
pd.set_option('display.max_columns', None)  # show every column when printing
scientists1['born_dt'] = born_datetime
scientists1['died_dt'] = died_datetime
print(scientists1)

# Subtract the two datetime columns and store the difference as a new column.
scientists1['age_days_dt'] = scientists1['died_dt'] - scientists1['born_dt']
print(scientists1)
# 2.5.3 Dropping values
# Build a copy of the frame without the 'Age' column; scientists1 is untouched.
scientists2 = scientists1.drop(columns=['Age'])
print(scientists2)
# 2.6 Exporting and importing data
# 2.6.1 pickle
names = scientists1['Name']
print(names)

# Write the Series to a pickle file, then read it back to confirm the round trip.
pickle_path = '/Users/russellconte/Documents/scientist_name_series.pickle'
names.to_pickle(pickle_path)
# NOTE: unpickling can execute arbitrary code; only read pickle files from a
# trusted source (here, the file written on the line above).
scientist_names_from_pickle = pd.read_pickle(pickle_path)
print(scientist_names_from_pickle)  # Yay it worked!! Double Yay!!

# 2.6.2 CSV files - my very old and familiar friend!!
csv_path = '/Users/russellconte/Documents/scientist_name_series.csv'
names.to_csv(csv_path)
# Called without a path, to_csv returns the CSV text instead of writing a file.
print(names.to_csv())