This repository has been archived by the owner on Jan 9, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 274
/
Copy pathdemo4.py
86 lines (80 loc) · 3.32 KB
/
demo4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
# 信用卡违约率分析
import matplotlib
import pandas as pd
from sklearn.model_selection import learning_curve, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
# 打印配置文件路径 我的是在个人文件夹
print(matplotlib.matplotlib_fname())
import seaborn as sns
# 数据加载
data = data = pd.read_csv('./UCI_Credit_Card.csv')
# 数据探索
print(data.shape) # 查看数据集大小
print(data.describe()) # 数据集概览
# 查看下一个月违约率的情况
next_month = data['default.payment.next.month'].value_counts()
print(next_month)
df = pd.DataFrame({'default.payment.next.month': next_month.index, 'values': next_month.values})
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.figure(figsize=(6, 6))
plt.title(u'信用卡违约率客户\n (违约:1,守约:0)')
sns.set_color_codes("pastel")
sns.barplot(x='default.payment.next.month', y="values", data=df)
locs, labels = plt.xticks()
plt.show()
# 特征选择,去掉ID字段、最后一个结果字段即可
data.drop(['ID'], inplace=True, axis=1) # ID这个字段没有用
target = data['default.payment.next.month'].values
columns = data.columns.tolist()
columns.remove('default.payment.next.month')
features = data[columns].values
# 30%作为测试集,其余作为训练集
train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.30, stratify=target, random_state=1)
# 构造各种分类器
classifiers = [
SVC(random_state=1, kernel='rbf'),
DecisionTreeClassifier(random_state=1, criterion='gini'),
RandomForestClassifier(random_state=1, criterion='gini'),
KNeighborsClassifier(metric='minkowski'),
]
# 分类器名称
classifier_names = [
'svc',
'decisiontreeclassifier',
'randomforestclassifier',
'kneighborsclassifier',
]
# 分类器参数
classifier_param_grid = [
{'svc__C': [1], 'svc__gamma': [0.01]},
{'decisiontreeclassifier__max_depth': [6, 9, 11]},
{'randomforestclassifier__n_estimators': [3, 5, 6]},
{'kneighborsclassifier__n_neighbors': [4, 6, 8]},
]
# 对具体的分类器进行GridSearchCV参数调优
def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score='accuracy'):
response = {}
gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=score)
# 寻找最优的参数 和最优的准确率分数
search = gridsearch.fit(train_x, train_y)
print("GridSearch最优参数:", search.best_params_)
print("GridSearch最优分数: %0.4lf" % search.best_score_)
predict_y = gridsearch.predict(test_x)
print("准确率 %0.4lf" % accuracy_score(test_y, predict_y))
response['predict_y'] = predict_y
response['accuracy_score'] = accuracy_score(test_y, predict_y)
return response
for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
pipeline = Pipeline([
('scaler', StandardScaler()),
(model_name, model)
])
result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid, score='accuracy')