-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_GDP.py
53 lines (50 loc) · 2.11 KB
/
parse_GDP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
import numpy as np
import fix_differences as diff
import useful_analysis_functions as fns
#pd.set_option('display.max_rows', None)
df = pd.read_csv(r'Project data/GDP by county.csv', low_memory=False, dtype={'GeoType': str})
df = df.iloc[1:3245, :] # cut off end of data
df = df.replace(np.nan, '') # replace nan with empty string
i = 1
start = 0
end = 0
# separate state rows
while i < df.shape[0]:
if df.iloc[i, 0] == '':
end = i
df.iloc[start+2:end, 0] = [x + '_' + df.iloc[start+1, 0] for x in df.iloc[start+2:end, 0]]
start = i
i = i + 1
df.iloc[3221:3244, 0] = [x + '_' + df.iloc[start+1, 0] for x in df.iloc[3221:3244, 0]] # concatenate county and state name for wyoming
df = df[df['GeoType '].str.contains(r'[_]')] # remove empty rows
# format counties
df = fns.remove_confusing_words(df, 'GeoType ')
# create separate tables for virginia
ref = pd.read_csv(r'Parsed data/ID match resorted.csv')
ref_virginia = ref.loc[ref["county_State"].str.endswith("_virginia")]
df_virginia = df.loc[df["GeoType "].str.endswith("_virginia")]
df_virginia = df_virginia.sort_values(by = ['GeoType '], axis=0)
# add code columns
df_virginia[['STATEFP', 'COUNTYFP']] = [np.nan, np.nan]
df_copy = df_virginia.copy()
# fix outlier in ordering
df_virginia.iloc[64,:],df_virginia.iloc[65,:], df_virginia.iloc[66,:]=df_copy.iloc[65,:],df_copy.iloc[66,:],df_copy.iloc[64,:]
i = 0
# check for differneces
while i < ref_virginia.shape[0] - 1:
if ref_virginia.iat[i, 2] != (df_virginia.iat[i, 0]):
print(i)
print(ref_virginia.iat[i, 2])
print(df_virginia.iat[i, 0])
i = i + 1
i = 0
#add in county codes
for i in range(df_virginia.shape[0]):
df_virginia.iloc[i, 8] = ref_virginia.iloc[i, 0]
df_virginia.iloc[i, 9] = ref_virginia.iloc[i, 1]
df_virginia = df_virginia.sort_values(by = ['STATEFP', 'COUNTYFP'], axis=0) #resort
df_virginia = df_virginia.drop(['STATEFP', 'COUNTYFP'], axis=1) #drop columns
df = pd.concat([df.iloc[0:2820], df_virginia, df.iloc[2952:,:]], axis=0, ignore_index=True) #add back in
#df = diff.fix(df, 0, 2, 1)
#df.to_csv(r'Parsed data/GDP.csv', index = False)