-
Notifications
You must be signed in to change notification settings - Fork 0
/
Pandas.Rmd
135 lines (96 loc) · 4.35 KB
/
Pandas.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title: "Python Pandas"
author: "Raghu Sidharthan"
output:
  html_document:
    highlight: pygments
---
**Basic Pandas syntax**
```{python,eval=FALSE}
import numpy as np
import pandas as pd
# Display options. Use the fully qualified option names: the bare 'precision'
# key can match more than one option in recent pandas and is then rejected.
pd.set_option('display.precision', 3)
pd.set_option('display.width', 500)
inpMatPD.info() # prints a summary of the object, including its memory usage
s = pd.Series([1, 3, 5, np.nan, 6, 8]) # Creating a Series (np.nan marks a missing value)
s.tolist() # Converting a Series to a plain Python list
pd.__version__ # the version of pandas being used
# Creating a DataFrame of random numbers indexed by six consecutive dates
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# Viewing data: first rows, last 3 rows, row labels, column labels, summary statistics.
# describe is a method and must be called to produce the summary.
(df.head(), df.tail(3), df.index, df.columns, df.describe())
df.columns = ['a', 'b'] # Assign column names (one name per existing column)
df.rename(columns={'$a': 'a', '$b': 'b'}, inplace=True) # Rename selected columns
## Using DataFrame.drop
# drop by position
df.drop(df.columns[[1, 2]], axis=1, inplace=True)
# drop by name
df1 = df1.drop(['B', 'C'], axis=1)
## Or select only the columns you want to keep
df1 = df[['a', 'd']]
df = df.drop(['id', 'var'], axis=1) # axis=1 targets columns
# pivot takes keyword arguments (index, columns, values);
# reset_index() then turns the index back into an ordinary column
prt1Pivot = prt1.pivot(index='TAZ', columns='LINENO', values='EST').reset_index()
dtPivot = dtPivot.fillna(0) # Fill all missing values with 0
# Sorting by an axis: order the column labels in descending order
df.sort_index(axis=1, ascending=False)
# Sorting by value. DataFrame.sort no longer exists; sort_values replaces it.
df.sort_values(by='B')
df.sort_values(by=['A', 'B'], ascending=[True, False]) # A ascending, then B descending
# Adding a suffix to every column name except the first two
cols = df.columns.tolist()
df.columns = cols[:2] + [col + '_sfx' for col in cols[2:]]
# Appending rows: DataFrame.append has been removed from pandas;
# pd.concat stacks the frames and returns a new object
allRslt = pd.concat([allRslt, rslt1])
df.drop_duplicates(subset=['HOUSEID']) # Drop rows that repeat an earlier HOUSEID
df[df.duplicated(subset=['HOUSEID'])] # Identify the duplicated rows
pd.crosstab(df['oDist'], df['oDist']) # crosstab
pd.pivot_table(df, index=['oDist'], columns=['oDist'], values='id', aggfunc=len, fill_value=0) # crosstab using pivot_table, which can also apply other aggregations
df.pivot(index='foo', columns='bar', values='baz') # reshape long to wide: one row per 'foo', one column per 'bar', cells taken from 'baz'
pd.merge(df1, df2, how='outer', left_on=['id1', 'id2'], right_on=['v1', 'v2']) # For suffixes use ",suffixes=('_prt1', '_prt2')"
racData = racData.add_prefix('r_') # Add a prefix to all column names
# Column totals over the numeric columns only (public API instead of the
# private _get_numeric_data helper)
prt3Pivot.select_dtypes(include='number').sum(axis=0)
# read_csv options - reading data with coded null values
persons = pd.read_csv('data.csv', na_values=['NULL', 'N.A.'], keep_default_na=False)
# pd.Categorical can be used for ordered variables - good for custom sorting.
# Similar to R's ordered levels; fits nicely with pivot_table.
# ordered=True makes the listed category order meaningful for comparisons.
tr['TRPER'] = pd.Categorical(tr['TRPER'], categories=['EA', 'AM', 'MD', 'PM', 'EV'], ordered=True)
# Categorical variables can be compared to get boolean results. Aggregating and
# then resetting the index will destroy the categories. To convert categories
# into their integer codes use this
sub_tours_summ['TOPER'].cat.codes
# Collect the names of all columns containing 'rac_' or 'wac'
# (the resulting list can be passed to df.drop to remove them)
dropCols = [col for col in df.columns if 'rac_' in col or 'wac' in col]
# Pandas string manipulation: slice characters 7..17 of GEOID and convert to integer
ctppdata['hTrct'] = ctppdata['GEOID'].str[7:18].astype(np.int64)
estData['totNMTrip'] = estData['totNMTrip'].clip(upper=50) # Capping a dataframe column at a certain value
```
**Filtering a DataFrame**
```{python,echo=-1:-5,comment=NA}
import numpy as np
import pandas as pd
np.random.seed(123)
df = pd.DataFrame(np.random.randn(4,3),index=list('abcd'),columns=list('ABC'))
df['textCol']=['dog','cat','mouse','cat']
print(df)
# Row 'a' also holds the string in textCol, and comparing a string with 0
# raises TypeError on Python 3, so the numeric test runs on numeric columns only.
num = df.select_dtypes(include='number')
row_a_positive = num.loc['a'] > 0
# Boolean mask over the numeric columns: is the value in row 'a' positive?
print(row_a_positive)
# Keep only the numeric columns whose row-'a' value is positive
print(num.loc[:, row_a_positive])
# Keep only the rows where column C exceeds 1.25
print(df.loc[df['C'] > 1.25, :])
# Keep rows whose textCol is one of the listed values ...
print(df.loc[df['textCol'].isin(['cat', 'mouse']), :])
# ... and the complement: rows whose textCol is none of them
print(df.loc[~(df['textCol'].isin(['cat', 'mouse'])), :])
```
```{python,echo=-1:-2,comment=NA}
import numpy as np
import pandas as pd
# Create DataFrame from vectors of unequal length.
# Wrapping each array in a Series lets pandas align them on the integer index
# and pad the shorter column with NaN.
d = dict( A = np.array([1,2]), B = np.array([1,2,3,4]) )
# dict.items() works on Python 3 (iteritems() exists only on Python 2)
df = pd.DataFrame({k: pd.Series(v) for k, v in d.items()})
print(d)
print(df)
```
```{python,eval=FALSE}
# Random sampling; s and df stand for an existing Series and DataFrame
s.sample(n=3) # sample 3 random elements from the Series
df.sample(frac=0.1, replace=True) # random 10% of the DataFrame, drawn with replacement
```
**Shift and rolling functions for time-series data**
```{python,echo=-1:-2}
import numpy as np
import pandas as pd
# Ten consecutive calendar days starting on 1 January 2000
index = pd.date_range(start='2000-1-1', periods=10, freq='D')
observations = [3, 1, 4, 2, 1, 3, 2, 1, 3, 2]
df = pd.DataFrame({'A': observations}, index=index)
# Both derived columns use the same trailing two-row window
two_day = dict(window=2, center=False)
current = df['A']
lagged = current.shift(1)  # each value moved one day later; first entry becomes NaN
# B: mean of the current and the previous observation
df['B'] = current.rolling(**two_day).mean()
# C: mean of the two observations before the current one
df['C'] = lagged.rolling(**two_day).mean()
print(df)
```