import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
def get_encoding(file):
    """Return the first encoding that can read the file, or None if none work."""
    # Note: 'iso-8859-1' can decode any byte sequence, so candidates listed after it are rarely reached.
    encodings = ['utf-8', 'iso-8859-1', 'cp1252', 'gb2312', 'gbk']
    for e in encodings:
        try:
            pd.read_csv(file, encoding=e, nrows=1)  # try to read the first row; if it succeeds, use this encoding
            return e
        except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError):
            continue
    return None  # every candidate encoding failed
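# A quick sanity check of get_encoding — an illustrative sketch only; the path below is one of the
# files listed at the bottom of this script, and the printed value depends entirely on your data:
# >>> print(get_encoding(r"C:\Users\1\Desktop\heart_disease\1-首次病程记录.csv"))
# gbk    (or whichever candidate succeeds first; None if they all fail)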
def predict_heart_disease(file_path):
    encoding = get_encoding(file_path)
    if encoding is None:
        print(f"Could not determine the encoding of file {file_path}")
        return
    # Load the data in chunks, skipping lines that fail to parse
    # (on_bad_lines='skip' replaces the deprecated error_bad_lines=False, pandas >= 1.3)
    chunksize = 10 ** 6
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunksize, dtype=str, encoding=encoding, on_bad_lines='skip'):
        chunks.append(chunk)
    data = pd.concat(chunks, axis=0)
    # Separate features and target variable; skip files that have no target column
    if '是否患有心血管疾病' not in data.columns:
        print(f"File {file_path} has no '是否患有心血管疾病' column, skipping")
        return
    y = data['是否患有心血管疾病']
    X = data.drop('是否患有心血管疾病', axis=1)
    # Preprocessing: label-encode every categorical column, keeping one encoder per column
    # so the same mappings can be reused on the new data further below
    encoders = {}
    for column in X.columns:
        if X[column].dtype == 'object':
            encoders[column] = LabelEncoder()
            X[column] = encoders[column].fit_transform(X[column].astype(str))
    y_encoder = LabelEncoder()
    y = y_encoder.fit_transform(y)
    # fit_transform already returns a 1-D array; reshape(-1) just makes that explicit
    y = y.reshape(-1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # Train/test split: 80% training, 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Build the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32)
    # Evaluate the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    # Save the model (note the f-string prefix and os.path.basename, so the output file gets a valid name)
    model.save(f"C:\\Users\\1\\Desktop\\heart_disease\\heart_disease_model_{os.path.basename(file_path)}.h5")
    print(y.shape)  # debug: confirm y is one-dimensional
    # Predict on new data
    new_data_files = [r"C:\Users\1\Desktop\heart_disease1\1-首次病程记录.csv", r"C:\Users\1\Desktop\heart_disease1\2-日常病程记录.csv",
                      r"C:\Users\1\Desktop\heart_disease1\3-出院记录.csv", r"C:\Users\1\Desktop\heart_disease1\4-检验记录表.csv",
                      r"C:\Users\1\Desktop\heart_disease1\5-检验明细表.csv", r"C:\Users\1\Desktop\heart_disease1\6-细菌结果表.csv",
                      r"C:\Users\1\Desktop\heart_disease1\7-影像检查报告表.csv"]  # adjust this list to match your actual data
    # List used to collect the prediction results
    prediction_results = []
    # Run a prediction for each new data file
    for new_data_file in new_data_files:
        print(f"Processing {new_data_file}")
        new_data = pd.read_csv(new_data_file, dtype=str, encoding=encoding, on_bad_lines='skip')
        new_data = new_data.drop('是否患有心血管疾病', axis=1) if '是否患有心血管疾病' in new_data.columns else new_data
        # Reuse the per-column encoders fitted on the training data
        # (the new file must have the same feature columns; unseen labels will raise a ValueError)
        for column in new_data.columns:
            if column in encoders:
                new_data[column] = encoders[column].transform(new_data[column].astype(str))
        new_data = scaler.transform(new_data)
        # Keep the data 2-D (samples, features); the model expects one row per sample
        prediction = model.predict(new_data)
        # As in the original logic, only the first row's prediction decides the file-level label
        if prediction[0][0] >= 0.5:
            prediction_result = '有心血管疾病'
        else:
            prediction_result = '无心血管疾病'
        # Append the prediction result to the list
        prediction_results.append({'文件路径': new_data_file, '预测结果': prediction_result})
    # Collect the prediction results in a DataFrame
    prediction_df = pd.DataFrame(prediction_results)
    # Write the predictions to a CSV file
    prediction_df.to_csv(f'heart_disease_predictions_{os.path.basename(file_path)}.csv', index=False)
    # Write the predictions to an Excel file
    prediction_df.to_excel(f'heart_disease_predictions_{os.path.basename(file_path)}.xlsx', index=False)
    return prediction_results
# File list
files = [r"C:\Users\1\Desktop\heart_disease\1-首次病程记录.csv", r"C:\Users\1\Desktop\heart_disease\2-日常病程记录.csv",
         r"C:\Users\1\Desktop\heart_disease\3-出院记录.csv", r"C:\Users\1\Desktop\heart_disease\4-检验记录表.csv",
         r"C:\Users\1\Desktop\heart_disease\5-检验明细表.csv", r"C:\Users\1\Desktop\heart_disease\6-细菌结果表.csv",
         r"C:\Users\1\Desktop\heart_disease\7-影像检查报告表.csv", r"C:\Users\1\Desktop\heart_disease\8-输出结果.csv"]
# List used to collect the prediction results across all files
prediction_results = []
# Run the function on each file and collect whatever it returns
for file in files:
    print(f"Processing {file}")
    results = predict_heart_disease(file)
    if results:
        prediction_results.extend(results)
# Collect the combined prediction results in a DataFrame
prediction_df = pd.DataFrame(prediction_results)
# Write the combined predictions to a CSV file
prediction_df.to_csv('heart_disease_predictions.csv', index=False)
# Write the combined predictions to an Excel file
prediction_df.to_excel('heart_disease_predictions.xlsx', index=False)
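Since the script saves one model per input file, here is a minimal sketch of how a saved model could be reloaded and reused later. The path and the placeholder input are examples only; in practice the new features must be encoded and scaled exactly as during training before calling predict:

import numpy as np
from tensorflow.keras.models import load_model

# Reload a previously saved model (example path; adjust to whatever model.save() actually produced)
saved = load_model(r"C:\Users\1\Desktop\heart_disease\heart_disease_model_1-首次病程记录.csv.h5")
# Placeholder input: 1 sample with the same number of features the model was trained on
X_new = np.zeros((1, saved.input_shape[1]))
probabilities = saved.predict(X_new)                  # shape (1, 1) for the single sigmoid output
labels = (probabilities.ravel() >= 0.5).astype(int)   # 1-D array of 0/1 predictions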
What is needed here is a one-dimensional array, but a two-dimensional one is being passed in; converting it is enough. Use numpy's flatten() or ravel() method to turn the 2-D array into a 1-D one; converting y to a one-dimensional array resolves the error.
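For example, model.predict with a single sigmoid output returns an array of shape (n_samples, 1), and numpy's ravel() or flatten() collapses it to shape (n_samples,). A minimal standalone illustration:

import numpy as np

predictions = np.array([[0.91], [0.12], [0.57]])  # 2-D output, shape (3, 1), like Keras predict()
flat = predictions.ravel()                        # view with shape (3,)
also_flat = predictions.flatten()                 # copy with shape (3,)
print(flat.shape, also_flat.shape)                # (3,) (3,)
labels = (flat >= 0.5).astype(int)                # 1-D array of 0/1 class labels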