我想做一个模型来检测有问题的数据和正常的数据,我已经对数据打了标签分为0(正常数据)1(不正常数据),我用训练好的模型对用来训练模型时候已经打过标签0的这些正常数据进行预测的时候全部都被预测成了有问题数据。代码如下,可以帮忙看看哪里有问题吗?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import joblib
import json
import os
import xlwt
import xlrd
label_path=r"/home/common-dir/code-HX/数据标注表格.xlsx"
pridect_path = r'/home/common-dir/code-HX/DatasetLeak1061/频域数据整合/正常数据'
frequency_paths=[
r"/home/common-dir/code-HX/DatasetLeak1061/频域数据整合/正常数据",
r"/home/common-dir/code-HX/DatasetLeak1061/频域数据整合/疑似有问题数据",
r"/home/common-dir/code-HX/DatasetLeak1061/频域数据整合/确定有问题数据",
r"/home/common-dir/code-HX/DatasetLeak1061/频域数据整合/其他情况数据数据"
]
def getTimeData(path):
data=pd.read_json(path)
x=[round(x,1) for x in data["Data"][0]['Categories']]
y=data["Data"][0]['SeriesData']
return x,y
def getFrequencyData(path):
data = json.load(open(path,encoding='utf-8'))
x=list(data['fft_data_x'])
y=list(data['fft_data_y'])
return x,y
def getAlphaMaxNum(alpha,l):
#该函数用于求出超过最大值的alpha的数据个数
maxNum=max(l)
return len([x for x in l if x>=(alpha*maxNum)])
'''
labelPath 标签文件路径 返回一个名字与标签对应的字典
'''
def getLabel(labelPath):
res={}
data=pd.read_excel(labelPath)
for i in range(len(data)):
key=data.iloc[i]['数据名称']
value=data.iloc[i]['分类类别']
res[key]=value
return res
'''
paths为路径数组对该数组路径下的所有文件数据依次读取
flag=0表示读取时域数据 为1表示读取频域数据
'''
#获取频域数据并向其中添加峰值最高的前max_num个坐标点数据
def getFrequencyData_maxnum(paths,max_num):
labels=getLabel(label_path)
data=[]
label=[] #读取频域数据
for path in paths:
for line in os.listdir(path):
curPath = path + '/' + line
x,y=getFrequencyData(curPath)
y_mean = np.mean(y)#平均值
y_var = np.var(y)#方差
y_max = np.max(y)#最大值
y_alphaMaxNum = getAlphaMaxNum(0.5, y)
y_label = labels[line]
label.append((y_label))
frequency_domain_max=[]
for index in range(0,len(x)-1,1):
if(index-1<0 and y[index]>y[index+1]):
frequency_domain_max.append((x[index],y[index]))
if(y[index]>y[index-1] and y[index]>y[index+1] and index<len(x)-1):
frequency_domain_max.append((x[index],y[index]))
frequency_domain_max.sort(key=lambda tup:tup[1],reverse=True)
temp=frequency_domain_max[0:max_num:1]
tup=tuple()
for index in range(0,max_num,1):
tup=tup+temp[index]
data.append((y_mean, y_var, y_max, y_alphaMaxNum)+tup)
# 非标签0的数据都设为1(因为还有两类数据标签成了2,3,在这里除了正常数据我都改成了有问题数据)
for i in range(len(label)):
if int(label[i]) != 0:
label[i]= 1
return data,label
# 对预测数据进行预处理
def getData_pri(X,Y):
paths = []
y_mean = np.mean(Y) # 平均值
y_var = np.var(Y) # 方差
y_max = np.max(Y) # 最大值
y_alphaMaxNum = getAlphaMaxNum(0.5, Y)
frequency_domain_max = []
for index in range(0, len(X) - 1, 1):
if (index - 1 < 0 and Y[index] > Y[index + 1]):
frequency_domain_max.append((X[index], Y[index]))
if (Y[index] > Y[index - 1] and Y[index] > Y[index + 1] and index < len(X) - 1):
frequency_domain_max.append((X[index], Y[index]))
frequency_domain_max.sort(key=lambda tup: tup[1], reverse=True)
temp = frequency_domain_max[0:3:1]
tup = tuple()
for index in range(0, 3, 1):
tup = tup + temp[index]
paths.append((y_max, y_alphaMaxNum, y_var, y_mean)+tup)
return paths
if __name__=="__main__":
data,label=getFrequencyData_maxnum(frequency_paths,3)
min_max_scaler = preprocessing.MinMaxScaler(feature_range = (0,1))
data = min_max_scaler.fit_transform(data)
# # 数据拆分
X_train, X_test, y_train, y_test = model_selection.train_test_split(data, label, test_size=0.8,random_state=10)
# 构建AdaBoost算法的类
AdaBoost1 = ensemble.AdaBoostClassifier()
# 算法在训练数据集上的拟合
AdaBoost1.fit(X_train, y_train)
# 算法在测试数据集上的预测
pred1 = AdaBoost1.predict(X_test)
#获取预测数据
for line in os.listdir(pridect_path):
data_name = pridect_path + '/' + line
x,y = getFrequencyData(data_name)
data = getData_pri(x,y)
pred2 = AdaBoost1.predict(data)
print(data_name, pred2)
我运行下来的结果就是全部都是1,所有的正常数据被预测成了有问题数据,不知道问题出在了哪里,数据集大家随便换一个就行,标签0,1就ok