I wrote the code myself, but when it reaches the naive Bayes classification step it throws an error and the results are very poor. I don't know whether that's because, to save debugging time, I only picked a few texts per class for my test run.
I hope someone can help me straighten out my approach and point out what's wrong. Thanks!
import os
import jieba
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
def stopwordslist(filepath):  # build the stop-word list
    # read the stop-word file line by line, strip whitespace, and return the words as a list
    stopword = [line.strip() for line in open(filepath, 'r', encoding='gb18030', errors='ignore').readlines()]
    return stopword
jieba.setLogLevel(jieba.logging.INFO)
path = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/try" #文件夹目录/C3-Art
path2= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg"
files= os.listdir(path) #得到文件夹下的所有文件名称
label=[]
txt = []
z=0
sum=0
filepath = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/cn_stopwords.txt"
path1 = "C:/Users/ASUS/Desktop/机器学习/机器学习实验"
stopwords = stopwordslist(filepath)  # load the stop-word list once, instead of once per document
for file in files:  # iterate over the class folders
    position = path + '/' + file  # full path of the class folder (with backslashes you would need "\\", since '\' is an escape character)
    filesin = os.listdir(position)
    for filein in filesin:
        positionin = position + '/' + filein  # full path of one document
        with open(positionin, "r", encoding='gb18030', errors='ignore') as f:
            data = f.read()  # read the raw text
        sum = sum + 1        # document counter, used to name the per-document output file
        label.append(z)      # one class label per document
        words = jieba.cut(data)  # segment the text with jieba
        for word in words:  # loop over the segmented words
            if word not in stopwords:  # keep only words that are not stop words
                if not word.isspace():
                    # per-document file: 1seg.txt, 2seg.txt, ...
                    with open(os.path.join(path2, str(sum) + 'seg.txt'), 'a+', encoding='gb18030') as f:
                        f.write(word + ' ')
                    # combined file with the words of all documents
                    with open(os.path.join(path1, 'seg.txt'), 'a+', encoding='gb18030') as f:
                        f.write(word + ' ')
    z = z + 1  # next class
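# at this point: path2 contains one file per document ('1seg.txt', '2seg.txt', ...) with its
# stop-word-filtered words, path1/seg.txt contains the words of all documents combined, and
# label contains one class index per document; note that the files are opened with 'a+', so
# re-running the script appends to whatever is already there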
seg= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg.txt"
seg1= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg1.txt"
with open(seg, "r", encoding='gb18030', errors='ignore') as f:
    txt = f.read()  # the combined segmented text of all documents
vocab = set(txt.split())  # vocabulary: every distinct word
char_dict = {}  # map each vocabulary word to a column index
for word in vocab:
    if word not in char_dict:
        char_dict[word] = len(char_dict)
# create an all-zero vector and set the position of the matching word to 1
onehot_vectors=[]
num=0
#print(txt)
files2 = os.listdir(path2)  # the per-document segmented files
for segfile in files2:
    position2 = path2 + '/' + segfile
    with open(position2, "r", encoding='gb18030', errors='ignore') as f:
        text = f.read()  # segmented text of one document
    text1 = set(text.split())  # distinct words of this document
    for wor in text1:
        vec = [0] * len(vocab)  # a separate one-hot vector for every word
        if wor in char_dict:
            vec[char_dict[wor]] = 1
            num = num + 1
        onehot_vectors.append(vec)
print(len(onehot_vectors))
print(label)
print(num)
# print(z)
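# note: onehot_vectors has one row per *word* (every distinct word of every document gets its
# own vector), while label has one entry per *document*, so the two lists will usually have
# different lengths -- train_test_split below requires them to be the same length, which is
# the most likely cause of the reported error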
x_train, x_test, y_train, y_test = train_test_split(onehot_vectors, label, test_size=0.2, random_state=42)
# train the naive Bayes classifier
clf_nb = MultinomialNB()      # multinomial naive Bayes classifier
clf_nb.fit(x_train, y_train)  # fit it on the training set
# predict and evaluate
y_pred = clf_nb.predict(x_test)  # predict on the test set
accuracy = accuracy_score(y_test, y_pred)  # accuracy: fraction of correctly classified samples
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)  # precision: TP/(TP+FP), macro-averaged over the classes
recall = recall_score(y_test, y_pred, average='macro', zero_division=1)        # recall: TP/(TP+FN), macro-averaged over the classes
f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)                # F1: harmonic mean of precision and recall
print("accuracy={:.4f}, precision={:.4f}, recall={:.4f}, F1={:.4f}".format(accuracy, precision, recall, f1))
fpr, tpr, thresholds = roc_curve(y_test, y_pred)  # ROC curve (pos_label=None)
auc_value = auc(fpr, tpr)  # area under the ROC curve
# print the results
print("Naive Bayes classifier results on {} test samples".format(len(y_test)))
print("Area under the ROC curve (AUC)={:.4f}\n".format(auc_value))
# assumes: import numpy as np, and that data (the samples), y (the labels) and
# idx (the total number of samples) are already defined, as in the output below
for i in range(5):  # for 10-fold, change 5 to 10 (same below)
    print(i, "times: ")
    test = data[int(idx*i*0.2):int(idx*(i+1)*0.2), ]   # test fold; for 10-fold, change 0.2 to 0.1 (same below)
    test_y = y[int(idx*i*0.2):int(idx*(i+1)*0.2), ]    # test fold labels
    if i+1 <= max(range(5)):  # for 10-fold, change 5 to 10 (same below)
        val = data[int(idx*(i+1)*0.2):int((i+2)*idx*0.2)]   # validation fold
        val_y = y[int(idx*(i+1)*0.2):int((i+2)*idx*0.2)]    # validation fold labels
        train = np.delete(data, range(int(idx*i*0.2), int(idx*(i+2)*0.2)), axis=0)  # training set: what remains after removing the test and validation folds
        train_y = np.delete(y, range(int(idx*i*0.2), int(idx*(i+2)*0.2)), axis=0)   # training labels
    else:  # last iteration: the last fold is the test set, the first fold is the validation set, the middle is the training set
        val = data[:int(((i+1) % 4)*idx*0.2)]   # for 10-fold, change 4 to 8 and 0.2 to 0.1 (same below)
        val_y = y[:int(((i+1) % 4)*idx*0.2)]
        train = np.delete(data, range(int(idx*i*0.2), int(idx*(i+1)*0.2)), axis=0)
        train = np.delete(train, range(int(((i+1) % 4)*idx*0.2)), axis=0)
        train_y = np.delete(y, range(int(idx*i*0.2), int(idx*(i+1)*0.2)), axis=0)
        train_y = np.delete(train_y, range(int(((i+1) % 4)*idx*0.2)), axis=0)
    print("test:\n", test, "-----test_y", test_y)
    print("val:\n", val, "-----val_y", val_y)
    print("train\n", train, "---------train_y", train_y)
    print("---------------------------------------")
The output is as follows:
0 times:
test:
tf.Tensor(
[[0. 0.3 0.7]
[1. 1.3 1.7]
[2. 2.3 2.7]
[3. 3.3 3.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
val:
tf.Tensor(
[[4. 4.3 4.7]
[5. 5.3 5.7]
[6. 6.3 6.7]
[7. 7.3 7.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
train
[[ 8. 8.3 8.7]
[ 9. 9.3 9.7]
[10. 10.3 10.7]
[11. 11.3 11.7]
[12. 12.3 12.7]
[13. 13.3 13.7]
[14. 14.3 14.7]
[15. 15.3 15.7]
[16. 16.3 16.7]
[17. 17.3 17.7]
[18. 18.3 18.7]
[19. 19.3 19.7]] ---------train_y [ 8 9 10 11 12 13 14 15 16 17 18 19]
---------------------------------------
1 times:
test:
tf.Tensor(
[[4. 4.3 4.7]
[5. 5.3 5.7]
[6. 6.3 6.7]
[7. 7.3 7.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
val:
tf.Tensor(
[[ 8. 8.3 8.7]
[ 9. 9.3 9.7]
[10. 10.3 10.7]
[11. 11.3 11.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([ 8 9 10 11], shape=(4,), dtype=int32)
train
[[ 0. 0.3 0.7]
[ 1. 1.3 1.7]
[ 2. 2.3 2.7]
[ 3. 3.3 3.7]
[12. 12.3 12.7]
[13. 13.3 13.7]
[14. 14.3 14.7]
[15. 15.3 15.7]
[16. 16.3 16.7]
[17. 17.3 17.7]
[18. 18.3 18.7]
[19. 19.3 19.7]] ---------train_y [ 0 1 2 3 12 13 14 15 16 17 18 19]
---------------------------------------
2 times:
test:
tf.Tensor(
[[ 8. 8.3 8.7]
[ 9. 9.3 9.7]
[10. 10.3 10.7]
[11. 11.3 11.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([ 8 9 10 11], shape=(4,), dtype=int32)
val:
tf.Tensor(
[[12. 12.3 12.7]
[13. 13.3 13.7]
[14. 14.3 14.7]
[15. 15.3 15.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
train
[[ 0. 0.3 0.7]
[ 1. 1.3 1.7]
[ 2. 2.3 2.7]
[ 3. 3.3 3.7]
[ 4. 4.3 4.7]
[ 5. 5.3 5.7]
[ 6. 6.3 6.7]
[ 7. 7.3 7.7]
[16. 16.3 16.7]
[17. 17.3 17.7]
[18. 18.3 18.7]
[19. 19.3 19.7]] ---------train_y [ 0 1 2 3 4 5 6 7 16 17 18 19]
---------------------------------------
3 times:
test:
tf.Tensor(
[[12. 12.3 12.7]
[13. 13.3 13.7]
[14. 14.3 14.7]
[15. 15.3 15.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
val:
tf.Tensor(
[[16. 16.3 16.7]
[17. 17.3 17.7]
[18. 18.3 18.7]
[19. 19.3 19.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
train
[[ 0. 0.3 0.7]
[ 1. 1.3 1.7]
[ 2. 2.3 2.7]
[ 3. 3.3 3.7]
[ 4. 4.3 4.7]
[ 5. 5.3 5.7]
[ 6. 6.3 6.7]
[ 7. 7.3 7.7]
[ 8. 8.3 8.7]
[ 9. 9.3 9.7]
[10. 10.3 10.7]
[11. 11.3 11.7]] ---------train_y [ 0 1 2 3 4 5 6 7 8 9 10 11]
---------------------------------------
4 times:
test:
tf.Tensor(
[[16. 16.3 16.7]
[17. 17.3 17.7]
[18. 18.3 18.7]
[19. 19.3 19.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
val:
tf.Tensor(
[[0. 0.3 0.7]
[1. 1.3 1.7]
[2. 2.3 2.7]
[3. 3.3 3.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
train
[[ 4. 4.3 4.7]
[ 5. 5.3 5.7]
[ 6. 6.3 6.7]
[ 7. 7.3 7.7]
[ 8. 8.3 8.7]
[ 9. 9.3 9.7]
[10. 10.3 10.7]
[11. 11.3 11.7]
[12. 12.3 12.7]
[13. 13.3 13.7]
[14. 14.3 14.7]
[15. 15.3 15.7]] ---------train_y [ 4 5 6 7 8 9 10 11 12 13 14 15]
---------------------------------------
As you can see, the data rotates through the folds, and the corresponding labels rotate along with it.
This is set up with 5 folds; for 10 folds, just change range(5) to range(10) in the code, change 0.2 to 0.1, and change 4 to 8.
This code isn't perfect; it only manages to rotate the three sets. If you have better code, please share it with me.
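One possible cleaner variant, just as a sketch: let sklearn's KFold produce the folds and take the fold after the test fold (wrapping around) as the validation set. It assumes data and y are numpy arrays and that the folds should stay in their original order (shuffle=False), matching the behaviour above.

import numpy as np
from sklearn.model_selection import KFold

k = 5  # number of folds; use 10 for 10-fold
folds = [test_idx for _, test_idx in KFold(n_splits=k, shuffle=False).split(data)]

for i in range(k):
    test_idx = folds[i]            # fold i is the test set
    val_idx = folds[(i + 1) % k]   # the next fold (wrapping around) is the validation set
    train_idx = np.setdiff1d(np.arange(len(data)), np.concatenate([test_idx, val_idx]))
    test, test_y = data[test_idx], y[test_idx]
    val, val_y = data[val_idx], y[val_idx]
    train, train_y = data[train_idx], y[train_idx]
    print(i, "times: ")
    print("test:\n", test, "-----test_y", test_y)
    print("val:\n", val, "-----val_y", val_y)
    print("train\n", train, "---------train_y", train_y)
    print("---------------------------------------")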