本人正在练习项目——对银行还款进行测试,其中的预测结果已经出来,但是想把结果进行可视化的时候遇到了问题,经过一系列排查,应该是 plt.contourf里面的那个ravel()部分出了问题
(请直接跳转到代码最底下部分的:Visulising the Traning Set Result)
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# 经典而分类问题
app_train = pd.read_csv('/Users/iven/Desktop/Python机器学习实战/第十一章:银行客户还款可能性预测/application_train.csv')
# 展示缺失值
def missing_value_table(df):
mis_val = df.isnull().sum() # 计算所有缺失值
mis_val_percent = 100 * df.isnull().sum() / len(df) # %比
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# 做成表格显示出来
mis_val_rename = mis_val_table.rename(columns={0:'Missing valyes',1:'% of total values'})
# 剔除完整的并排序(sort_value) 升序
mis_val_rename = mis_val_rename[mis_val_rename.iloc[:, 1] != 0].sort_values('% of total values', ascending = False)
return mis_val_rename
missing_value_table(app_train)[:10]
'''
axis=0代表往跨行(down),而axis=1代表跨列(across)
使用0值表示沿着每一列或行标签/索引值向下执行方法
使用1值表示沿着每一行或者列标签横向执行对应的方法
'''
# Object 类型数据处理.
# 特征个数大于2时候,一般用One-Hot去处理,而小于2的时候 label-eco
app_train.dtypes.value_counts()
app_train.select_dtypes('object').apply(pd.Series.nunique,axis=0) # Pandas nunique() 用于获取唯一值的统计次数。
le = LabelEncoder()
for col in app_train:
if app_train[col].dtype == 'object':
if len(list(app_train[col].unique()))<=2:
le.fit(app_train[col])
app_train[col]=le.transform(app_train[col])
app_train = pd.get_dummies(app_train)
app_train.shape
# EDA分析 特征分析
train_labels = app_train['TARGET']
app_train['DAYS_BIRTH'][:5] # 贷款的人从出生到现在"活"了多少天
# 因此,我们需要转换成年
(app_train['DAYS_BIRTH']/-365).describe()
(app_train['DAYS_EMPLOYED']).describe()
app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'].replace({365243:np.nan},inplace=True)
app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()
correlations = app_train.corr()['TARGET'].sort_values()
correlations.head()
correlations.tail()
# 但是对于年龄,它是负数
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['TARGET'].corr(app_train['DAYS_BIRTH'])
# 变负了
plt.figure(figsize = (12,6))
plt.style.use('fivethirtyeight') # 图表风格 去SeaBorn看就ok
plt.hist(app_train['DAYS_BIRTH']/365,edgecolor='k',bins=25)
plt.show()
plt.figure(figsize=(16,8))
#KDEPLOT
sns.kdeplot(app_train.loc[app_train['TARGET']==0,'DAYS_BIRTH']/365,label='target==0')
sns.kdeplot(app_train.loc[app_train['TARGET']==1,'DAYS_BIRTH']/365,label='target==1')
plt.show()
# 能用KDEPLot显示就先用,因为这是连续的,更加直观
# 不还钱的人都是30岁左右的人
age_data = app_train[['TARGET','DAYS_BIRTH']]
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'],bins=np.linspace(20,70,num=11)) # 设置年龄区间
age_data.head()
age_groups = age_data.groupby('YEARS_BINNED').mean()
plt.figure(figsize=(16,16))
plt.bar(age_groups.index.astype(str),100*age_groups['TARGET'])
plt.xticks(rotation=30) # 坐标轴,多少度
plt.show()
ext_data = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
plt.figure(figsize=(20,8))
sns.heatmap(ext_data_corrs,cmap = plt.cm.RdYlBu_r, linewidths = .5, annot=True)
plt.show()
# http://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize=(16,10))
for i,source in enumerate(['EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1']):
# 指定好子图的位置
plt.subplot(3,1,i+1) # 3行1列,位置i=0 i+1
# kdeplot
sns.kdeplot(app_train.loc[app_train['TARGET']==0,source]/365,label='target==0')
sns.kdeplot(app_train.loc[app_train['TARGET']==1,source]/365,label='target==1')
plt.title('D of %s' % source)
plt.tight_layout(h_pad=2.5) # 布局 间隙
plt.show()
# 特征工程(多项式回归) X次方越大,越准确
poly_features = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
# 特征工程之前 - 缺失值填充
imputer = SimpleImputer(strategy='median') # 类似于拟合器,遇到缺失值就用中位数来填补
poly_target = poly_features['TARGET']
poly_features.drop(columns=['TARGET'],inplace =True) # 除了Target列的其他
poly_features = imputer.fit_transform(poly_features) # 拟合
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
# poly_features.shape 从4个特征变成了35个
poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])[:20]
# 将当前得到的部分特征跟总体组合在一起
poly_features = pd.DataFrame(
poly_features,
columns = poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])
)
# 与之前100个指标组合在一起
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR'] # ID是不会改变的 根据ID去传进去
app_train_poly = app_train.merge(poly_features, on='SK_ID_CURR', how='left')
# 根据实际情况来创建特征
# 例如对时间特征,可以分出来什么特征呢? 数据挖掘——90%时间都在和小特征打交道,不要忽视任何一个小的特征
# 建模其实没啥花时间
app_train_domain = app_train.copy() # 不要乱改,防止改乱 类似于创建副本
app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL'] # 信用额度与工资比值
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL'] # 没(每年)还款年金和工资的比值
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT'] # 还款总月份
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH'] # 上班时间和年龄的比值
# 这是加到了最初的表,和上面的特征工程没啥关系,这是DIY的,所以新的列数为248而不是279
plt.figure(figsize=(16, 20))
for i, feature in enumerate(
['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']):
plt.subplot(4, 1, i + 1) # 定义子图的位置数量等
sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label='target == 0')
sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label='target == 1')
plt.title('Distribution of %s by Target Value' % feature)
plt.xlabel('%s' % feature);
plt.ylabel('Density');
plt.tight_layout(h_pad=2.5)
plt.show()
'''
pad:调整边框边距
w_pad:调整横宽边距
h_pad:调整纵宽边距
'''
# 数据预处理:特征好了后,检查下整合没啥问题就建模了
Y = app_train['TARGET']
X = app_train.drop(columns = ['TARGET'])
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
imputer = SimpleImputer(strategy='median')
std = StandardScaler()
# 填充
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
# 标准化
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,Y_train)
# 用混淆矩阵
predictions = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, predictions)
# confusion_metrix = 70787/70787+6091 = 92%
# roc_auc_score
predictions_2 = classifier.predict_proba(X_test)[:,1]
from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(Y_test,predictions_2)
# test_auc = 0.7434
# Visualising the Trainning ser results
from matplotlib.colors import ListedColormap # 给不同的点上不同的颜色
X_set, Y_set = X_train, Y_train
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, # -1 / +1 都能更方便我们看生成好的图像
stop = X_set[:,0].max()+1,
step = 0.01), # 0.01 看显示屏的参数来设定
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier.predict(
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)): # 画出实际存在的点
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Training Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_test, Y_test
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1,
stop = X_set[:,0].max()+1,
step = 0.01),
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier.predict(
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)):
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Test Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
Traceback (most recent call last):
File "", line 210, in
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 447, in predict
scores = self.decision_function(X)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 429, in decision_function
X = self._validate_data(X, accept_sparse="csr", reset=False)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 600, in _validate_data
self._check_n_features(X, reset=reset)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 400, in _check_n_features
raise ValueError(
ValueError: X has 2 features, but LogisticRegression is expecting 243 features as input.
大概只看到这个比较类似,但不知道怎么修改自己的代码:
https://blog.csdn.net/qq_45128278/article/details/120609776
但是同样的代码我取另一份没那么多维度的数据集来操作的时候,就能成功画出这幅图:
plt.contourf绘制的图是基于其中某两个特征的,需要重新构建分类器,并且选择数据集其中的某两个特征,代码以前两个特征为例,即代码中的0: 2,PS:由于代码太长,我就不一一复制了,从195行开始哈:
predictions_2 = classifier.predict_proba(X_test)[:,1]
from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(Y_test,predictions_2)
# test_auc = 0.7434
# 为了避免和之前的分类器重复,所以新起了个名字
classifier_new = LogisticRegression(random_state = 0)
classifier_new.fit(X_train[:, 0: 2], Y_train) # 0: 2表示的是前两个特征
from matplotlib.colors import ListedColormap # 给不同的点上不同的颜色
X_set, Y_set = X_train[:, 0: 2], Y_train # 0: 2表示的是前两个特征
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, # -1 / +1 都能更方便我们看生成好的图像
stop = X_set[:,0].max()+1,
step = 0.01), # 0.01 看显示屏的参数来设定
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier_new.predict( # 这一行修改了
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)): # 画出实际存在的点
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Training Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_test[:, 0: 2], Y_test # 0:2表示的是前两个特征
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1,
stop = X_set[:,0].max()+1,
step = 0.01),
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier_new.predict(
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)):
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Test Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()