-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathProduction.py
215 lines (179 loc) · 8.74 KB
/
Production.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import streamlit as st
import pandas as pd
# Page heading followed by a short description of the project's goal.
st.title('Glassdoor Prediction model')
project_summary = ''.join([
    'The objective of this project was to further understand what it takes to be a financial analyst in London.',
    ' This exercise will serve as a gateway to those seeking to become analyst themselves as well as create an ',
    'entry point adapting a machine learning model in predicting what role may be expected in relation to the different variables.',
])
st.subheader(project_summary)

# Sidebar heading; it is emitted before any of the input widgets are created.
st.sidebar.markdown('**User Input Parameters**')
def user_input_features():
    """Render the sidebar input widgets and return the current selections.

    Ten select boxes are created first, followed by six checkboxes (all
    ticked by default), in the order they appear below.

    Returns:
        pandas.DataFrame: a single-row frame (index 0) with the columns
        Rating, Size, Age, Industry, Sector, Revenue, Type_of_ownership,
        Seniority_Status, Type, Location, Risk, SQL, Excel, python,
        Fintech and Consulting, in that order. Select-box columns hold the
        chosen option string; checkbox columns hold the checkbox state.
    """
    # NOTE(review): the option strings presumably mirror the category
    # spellings in the training CSV (e.g. 'Unspesified Analyst'), so they are
    # left exactly as they were -- verify against the data before editing.
    rating = st.sidebar.selectbox(
        'Whats the company rating',
        ('1.0', '2.1', '2.2', '2.4', '2.5', '2.6', '3.1', '3.2', '3.3',
         '3.4', '3.43', '3.5', '3.6', '3.7', '3.8', '3.9',
         '4.0', '4.1', '4.2', '4.3', '4.4', '4.5', '4.8', '5.0'),
    )
    size = st.sidebar.selectbox(
        'How many employees are currently hired?',
        ('51 to 200 employees', '1001 to 5000 employees',
         '1 to 50 employees', '10000+ employees', '201 to 500 employees',
         '501 to 1000 employees', '5001 to 10000 employees', 'Unknown'),
    )
    age = st.sidebar.selectbox(
        'How old is the company?',
        ('10.0', '101.0', '104.0', '107.0', '11.0', '114.0', '12.0',
         '122.0', '13.0', '132.0', '134.0', '14.0', '142.0', '149.0',
         '150.0', '151.0', '16.0', '164.0', '17.0', '170.0', '182.0',
         '19.0', '199.0', '20.0', '208.0', '21.0', '22.0', '228.0', '23.0',
         '24.0', '25.0', '26.0', '29.0', '3.0', '30.0', '31.0', '32.0',
         '34.0', '36.0', '39.0', '4.0', '45.0', '50.0', '51.0', '6.0',
         '75.0', '77.0', '83.0', '85.0', '91.0', '92.0'),
    )
    industry = st.sidebar.selectbox(
        'What Industry is the company working in?',
        ('Accounting', 'Advertising & Marketing', 'Aerospace & Defence',
         'Banks & Building Societies', 'Biotech & Pharmaceuticals',
         'Brokerage Services', 'Computer Hardware & Software', 'Consulting',
         'Department, Clothing, & Shoe Shops',
         'Enterprise Software & Network Solutions',
         'Financial Analytics & Research',
         'Financial Transaction Processing', 'Haulage',
         'Healthcare Product Manufacturing', 'IT Services',
         'Insurance Agencies & Brokerages', 'Insurance Operators',
         'Internet', 'Investment Banking & Asset Management', 'Legal',
         'Lending', 'Logistics & Supply Chain', 'Oil & Gas Services',
         'Publishing', 'Real Estate', 'Staffing & Outsourcing', 'Unsure'),
    )
    sector = st.sidebar.selectbox(
        'What Sector is the company operating in?',
        ('Accounting & Legal', 'Aerospace & Defence',
         'Biotech & Pharmaceuticals', 'Business Services', 'Finance',
         'Information Technology', 'Insurance', 'Manufacturing', 'Media',
         'Oil, Gas, Energy & Utilities', 'Real Estate', 'Retail',
         'Transportation & Logistics', 'Unsure'),
    )
    revenue = st.sidebar.selectbox(
        'How much revenue is the company earning?',
        ('Unknown / Non-Applicable', '£1 to £2 billion (GBP)',
         '£10 to £25 million (GBP)', '£10+ billion (GBP)',
         '£100 to £500 million (GBP)', '£2 to £5 billion (GBP)',
         '£25 to £50 million (GBP)', '£5 to £10 billion (GBP)',
         '£50 to £100 million (GBP)', '£500 million to £1 billion (GBP)'),
    )
    type_of_ownership = st.sidebar.selectbox(
        # Label typo fixed: was 'What time of ownership ...'.
        'What type of ownership is the company under?',
        ('Company - Private', 'Company - Public', 'Other Organisation',
         'Private Practice / Firm', 'Subsidiary or Business Segment'),
    )
    seniority_status = st.sidebar.selectbox(
        'Whats the Seniority status of the role?',
        ('Junior Status', 'Senior Status', 'Vice President'),
    )
    job_type = st.sidebar.selectbox(
        # Label typo fixed: was 'What is the jon title?'.
        'What is the job title?',
        ('Operations analyst', 'Underwriting Analyst',
         'Unspesified Analyst', 'business analyst', 'compliance analyst',
         'credit analyst', 'data analyst', 'equity analyst',
         'financial analyst', 'fp&a', 'quantitative analyst',
         'real estate analyst', 'risk analyst'),
    )
    location = st.sidebar.selectbox(
        'Where is the job located?',
        ('London, England', 'Paddington, England'),
    )

    data = {
        'Rating': rating,
        'Size': size,
        'Age': age,
        'Industry': industry,
        'Sector': sector,
        'Revenue': revenue,
        'Type_of_ownership': type_of_ownership,
        'Seniority_Status': seniority_status,
        'Type': job_type,
        'Location': location,
    }

    # (column name, checkbox label) pairs; the tuple order fixes both the
    # on-screen order of the checkboxes and the column order of the result.
    flag_widgets = (
        ('Risk', 'Is Risk associated for the role?'),
        ('SQL', 'Is SQL Required for the role?'),
        ('Excel', 'Is Excel Required for the role?'),
        ('python', 'Is Python Required for the role?'),
        ('Fintech', 'Is this a Fintech role?'),
        ('Consulting', 'Is this a Consulting role?'),
    )
    for column, label in flag_widgets:
        data[column] = st.sidebar.checkbox(label, value=True)

    # index=[0] is required because every value in `data` is a scalar.
    return pd.DataFrame(data, index=[0])
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
# cross_val_score and train_test_split are no longer called below; the
# imports are retained so the file's dependency list is unchanged.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Collect the sidebar selections as a single-row frame.
df = user_input_features()
st.subheader('Predicted Salary Value:')

# Read the training data and expose two columns under the names used by the
# input frame returned from user_input_features().
dfp = pd.read_csv('Fin Machine Learning Ready.csv')
dfp['Type_of_ownership'] = dfp['Type of ownership']
dfp['Age'] = dfp['Age of the company in years']
dfp = dfp.fillna('missing')

# Columns used for modelling: the target (Average_Salary) plus every feature
# collected in the sidebar. `.copy()` makes df_model an independent frame so
# the column assignments below do not write into a slice of dfp.
df_model = dfp[['Rating', 'Size', 'Age', 'Industry', 'Sector', 'Revenue',
                'Type_of_ownership', 'Seniority_Status', 'Average_Salary',
                'Type', 'Location', 'Risk', 'SQL', 'Excel', 'python',
                'Fintech', 'Consulting']].copy()

# Cast these columns to strings, matching the string values produced by the
# sidebar select boxes.
for column in ('Size', 'Age', 'Industry', 'Sector', 'Revenue',
               'Type_of_ownership', 'Seniority_Status', 'Rating'):
    df_model[column] = df_model[column].astype('str')

X = df_model.drop('Average_Salary', axis=1)  # features only
y = df_model.Average_Salary.values  # target values as an array

# One-hot encode the listed categorical columns; every other column is passed
# through unchanged. Categories unseen during fitting are ignored at predict
# time instead of raising.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['Size', 'Industry', 'Sector', 'Revenue', 'Type_of_ownership',
      'Seniority_Status', 'Type', 'Location']),
    remainder='passthrough')

rfc = RandomForestClassifier()
pipe = make_pipeline(column_trans, rfc)

# The pipeline is fitted once on the full data set. The earlier stand-alone
# fit_transform, the unused train/test split and the 5-fold cross-validation
# only produced discarded values, yet ran on every Streamlit rerun.
pipe.fit(X, y)

# Predict once and show the result as the page's headline value.
prediction = pipe.predict(df)[0]
st.title(str(prediction))
# Background on the training data, followed by pre-rendered charts.
st.header('Information about the data: ')
st.write('This model was trained on 500 data points, as such salary variation may be limited')
st.subheader('The following graphs give an insight into what the data is made up of')

# (image path, caption) pairs, displayed top to bottom in this order.
chart_images = (
    ('Assets/Industry.png', 'The distribution of Industries where the data is from'),
    ('Assets/Revenue.png', 'The distribution of Company Revenue data'),
    ('Assets/Sector.png', 'The distribution of Sectors where the data is from'),
    ('Assets/Analyst Roles.png', 'The distribution of the various roles where the data is from'),
)
for image_path, image_caption in chart_images:
    st.image(image_path, caption=image_caption, use_column_width=True)

st.subheader('Key Findings:')
# The closing quote after "attention to detail" was missing in the original text.
st.text("Some of the most common words mentioned in the analysis include: 'Problem Solving','Bachelor Degree','team' and 'attention to detail'")
st.text('This exercise will serve as a gateway to those seeking to become analyst themselves as well as create an entry')
st.text('point adapting a machine learning model in predicting what role may be expected in relation to the different variables.')
st.text('Project Repo: https://github.com/SamuelLawrence876/Glassdoor_Salary_Predictor.')