# Breast-cancer (Wisconsin) classification with logistic regression.
#
# Pipeline:
#   1. load the data (column names are passed explicitly via ``names``)
#   2. clean the data (handle missing values)
#   3. split into training and test sets
#   4. feature engineering: standardization
#   5. fit a logistic-regression estimator
#   6. evaluate the model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# 1. Load the data.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_name = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
# Missing values are written as "?" in the file. Declaring that at parse time
# turns them into NaN directly and keeps the affected column numeric, instead
# of leaving it as strings that are only converted implicitly later on.
data = pd.read_csv(path, names=column_name, na_values="?")

# 2. Clean the data: drop every sample that has a missing value.
data = data.dropna()

# 3. Split the data set.
# Features are all columns except the first (sample id) and the last (label).
x = data.iloc[:, 1:-1]
y = data['Class']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 4. Feature engineering: standardization.
# The scaler is fitted on the training set only; the test set is transformed
# with the statistics learned from the training set.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 5. Logistic-regression estimator (default parameters).
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

# Model parameters of the logistic regression: weights and bias.
print('权重系数:\n', estimator.coef_)
print('偏置:\n', estimator.intercept_)

# 6. Model evaluation.
# 6.1 Compare predictions with the true labels (2 = benign, 4 = malignant).
y_predict = estimator.predict(x_test)
print('预测:\n', y_predict)
print('直接对比真实值和预测值:\n', y_test == y_predict)

# 6.2 Accuracy.
score = estimator.score(x_test, y_test)
print('准确率为:\n', score)

# 6.3 Precision / recall per class.
repotr = classification_report(
    y_test,
    y_predict,
    labels=[2, 4],
    target_names=['良性', '恶性'],
)
print(repotr)

# 6.4 ROC AUC.
# The true labels must be marked 0 (negative) / 1 (positive), so class 4 is
# mapped to 1 and class 2 to 0.
y_ture = np.where(y_test > 3, 1, 0)
# The closer to 1 the better the classifier; close to 0.5 means it is poor.
y_roc_auc_score = roc_auc_score(y_ture, y_predict)
print(y_roc_auc_score)

# 7. Prediction
# Raw (unscaled) samples to classify. Each row holds the nine feature values
# in the same order as the feature columns used for training.
t0 = [
    [4, 1, 1, 1, 2, 1, 3, 1, 1],
    [4, 1, 1, 1, 2, 1, 3, 1, 1],
    [1, 1, 1, 1, 2, 1, 3, 2, 1],
    [1, 3, 1, 2, 1, 2, 1, 1, 1],
    [1, 3, 3, 2, 2, 1, 7, 2, 1],
    [1, 1, 4, 1, 2, 1, 2, 1, 1],
    [3, 1, 1, 1, 2, 3, 3, 1, 1],
    [2, 1, 1, 1, 3, 1, 2, 1, 1],
    [2, 2, 2, 1, 1, 1, 7, 1, 1],
    [4, 1, 1, 2, 2, 1, 2, 1, 1],
    [5, 2, 1, 1, 2, 1, 3, 1, 1],
    [3, 1, 1, 1, 2, 2, 7, 1, 1],
]

# The estimator was fitted on standardized features, so new samples have to go
# through the SAME fitted scaler (transform only, never re-fit) before they
# are passed to predict. Feeding the raw values puts them on a different scale
# than the training data and yields wrong classes.
# The rows are labelled with the training feature names so the scaler receives
# input in the same form it was fitted on.
t0_frame = pd.DataFrame(t0, columns=column_name[1:-1])
t_predict = estimator.predict(transfer.transform(t0_frame))
print('t的预测结果', t_predict)
问题点: 若把 t0 以原始(未标准化)的数值直接传给 predict, 原本性质为良性的样本会被误判为恶性.
分析思路: 在预测模型的构建过程中, 数据经过了各种预处理, 其中标准化对数据尺度的影响最大. 模型是在标准化后的训练数据上拟合的, 若待预测数据没有经过同样的变换, 其尺度就与训练数据不一致, 预测结果因此出错.
解决方案: 两种方法都可以让t的预测结果显示为良性.
①待预测的数据也进行标准化处理, 即使用训练集上已经拟合好的标准化器做 transform (不要重新 fit);
②模型的训练过程,数据不作标准化.