用朴素贝叶斯对新闻文本分类

自己写了代码
但是执行到朴素贝叶斯分类这一步时会报错，而且分类效果非常差。不知道是不是因为我测试时为了节省调试时间，每个类别只抽了几个文本来跑。
希望有人看到，能给我再理理思路，帮忙看看有什么问题。谢谢啦

import os
import jieba
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split

def stopwordslist(filepath):
    """Load a stop-word list from a text file, one entry per line.

    Args:
        filepath: Path of the stop-word file. The file is decoded as
            GB18030 and undecodable bytes are dropped (errors='ignore').

    Returns:
        A list holding one whitespace-stripped string per line of the
        file, in file order. Blank lines yield empty strings.
    """
    # The context manager closes the handle deterministically; a bare
    # open() call would leave that to the garbage collector.
    with open(filepath, 'r', encoding='gb18030', errors='ignore') as stopword_file:
        return [line.strip() for line in stopword_file]

# Segment every document of the corpus with jieba, drop stop words and
# whitespace tokens, and write the result to disk:
#   * one "<n>seg.txt" file per document inside `path2`
#   * one merged "seg.txt" inside `path1` (later used to build the vocabulary)
# `label` receives the class index of every document, in processing order.
jieba.setLogLevel(jieba.logging.INFO)
path = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/try"  # corpus root: one sub-folder per class
path2 = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg"  # output folder for per-document files
path1 = "C:/Users/ASUS/Desktop/机器学习/机器学习实验"  # output folder for the merged seg.txt
filepath = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/cn_stopwords.txt"

# Load the stop words once (not once per document) and keep them in a set so
# that every membership test is O(1) instead of a scan over a list.
stopwords = set(stopwordslist(filepath))

os.makedirs(path2, exist_ok=True)
files = os.listdir(path)  # names of the class sub-folders
label = []
txt = []
z = 0  # class index of the sub-folder currently being processed
doc_count = 0  # running document number, used to name the per-document file

# Mode 'w' (instead of 'a+') so that re-running the script replaces earlier
# output rather than piling new tokens on top of it.
# NOTE: per-document files with a number above the current document count
# that were left behind by an earlier, larger run are NOT removed here.
with open(os.path.join(path1, 'seg.txt'), 'w', encoding='gb18030') as corpus_file:
    for file in files:
        position = path + '/' + file
        filesin = os.listdir(position)
        print("3")
        for filein in filesin:
            positionin = position + '/' + filein
            with open(positionin, "r", encoding='gb18030', errors='ignore') as f:
                data = f.read()
            doc_count += 1
            label.append(z)
            print("2")
            kept = [
                word for word in jieba.cut(data)
                if word not in stopwords and not word.isspace()
            ]
            # Same layout as before: every token is followed by one space.
            segmented = ''.join(word + ' ' for word in kept)
            # One write per document instead of reopening two files for each
            # token. The file is written even when no token survives, so
            # that per-document files and `label` stay one-to-one.
            with open(os.path.join(path2, str(doc_count) + 'seg.txt'), 'w',
                      encoding='gb18030') as doc_file:
                doc_file.write(segmented)
            corpus_file.write(segmented)

        z = z + 1

# Build a vocabulary from the merged segmentation file, turn every
# per-document file in `path2` into a binary bag-of-words vector, then train
# and evaluate a multinomial naive Bayes classifier on those vectors.
seg = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg.txt"
seg1 = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg1.txt"
with open(seg, "r", encoding='gb18030', errors='ignore') as f:
    txt = f.read()
vocab = set(txt.split())
# Index the sorted vocabulary so that a word maps to the same column on
# every run (iteration order of a set of strings is not stable across runs).
char_dict = {word: index for index, word in enumerate(sorted(vocab))}


def _seg_file_order(name):
    """Sort key that orders '<n>seg.txt' files by their number n.

    os.listdir() returns names in arbitrary order and plain string sorting
    puts '10seg.txt' before '2seg.txt'; either would pair the vectors with
    the wrong entries of `label`. Names without a numeric prefix sort last.
    """
    prefix = name[:-len('seg.txt')] if name.endswith('seg.txt') else ''
    if prefix.isdecimal():
        return (0, int(prefix), name)
    return (1, 0, name)


onehot_vectors = []
num = 0  # total number of in-vocabulary (document, word) hits
files2 = sorted(os.listdir(path2), key=_seg_file_order)
for segfile in files2:
    position2 = path2 + '/' + segfile
    with open(position2, "r", encoding='gb18030', errors='ignore') as f:
        text = f.read()
    text1 = set(text.split())
    # One vector per document, created BEFORE the word loop. Re-creating it
    # for every word would wipe all earlier words and leave at most a single
    # non-zero feature per document.
    vec = [0] * len(vocab)
    for wor in text1:
        if wor in char_dict:
            vec[char_dict[wor]] = 1
            num = num + 1
    onehot_vectors.append(vec)

print(len(onehot_vectors))
print(label)
print(num)
if len(onehot_vectors) != len(label):
    raise ValueError(
        "found {} segmented files in {!r} but {} labels; remove stale files "
        "from that folder and re-run the segmentation step".format(
            len(onehot_vectors), path2, len(label)))

x_train, x_test, y_train, y_test = train_test_split(onehot_vectors, label, test_size=0.2, random_state=42)
# Train the multinomial naive Bayes classifier on the training split.
clf_nb = MultinomialNB()
clf_nb.fit(x_train, y_train)

# Predict the test split and score it. average='macro' is the unweighted
# mean of the per-class scores.
y_pred = clf_nb.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)
print("准确率={:.4f}, 精度={:.4f}, 召回率={:.4f}, F1得分={:.4f}".format(accuracy, precision, recall, f1))

# roc_curve() only accepts a binary target and expects scores, not predicted
# labels; passing multiclass labels raises a ValueError. Use the class
# probabilities instead: directly for two classes, and as a micro-averaged
# one-vs-rest curve (every (sample, class) pair is one binary decision)
# when there are more.
y_score = clf_nb.predict_proba(x_test)
classes = list(clf_nb.classes_)
if len(classes) == 2:
    fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1], pos_label=classes[1])
else:
    y_true_flat = [int(truth == cls) for truth in y_test for cls in classes]
    y_score_flat = [prob for row in y_score for prob in row]
    fpr, tpr, thresholds = roc_curve(y_true_flat, y_score_flat)
auc_value = auc(fpr, tpr)  # area under the ROC curve
# Report the results.
print("朴素贝叶斯分类器在{}个测试样本上的效果".format(len(y_test)))

print("ROC曲线下面积(AUC值)={:.4f}\n".format(auc_value))

你可以看下这个问题的回答https://ask.csdn.net/questions/7686071
我还给你找了一篇非常好的博客，你可以看看是否有帮助，链接：《在训练前标准化了训练集和测试集，训练完了以后，抽取未标准化的一条数据进行预测，结果出了问题》
同时，你还可以查看手册：用于数字分类的限制性波尔兹曼机特征中的内容
除此之外，博客《如何设置在交叉验证中同时设置训练集，验证集，测试集三个数据集并实现循环》中的“开始在数据中循环，这儿设置为5组交叉验证”部分也许能够解决你的问题。你可以仔细阅读以下内容，或跳转到源博客阅读：

# Rotate the test / validation / training roles over 5 consecutive folds.
# Relies on `data`, `y`, `idx` and `np` being defined before this snippet.
for i in range(5):  # for 10 folds change 5 to 10, here and below
    print(i,"times: ")
    # Bounds of the test fold; for 10 folds change 0.2 to 0.1, here and below.
    test_lo = int(idx*i*0.2)
    test_hi = int(idx*(i+1)*0.2)
    test = data[test_lo:test_hi,]  # test fold
    test_y = y[test_lo:test_hi,]  # labels of the test fold
    if i+1 > max(range(5)):
        # Last round: the final fold is the test set, so the validation set
        # wraps around to the start of the data. (i+1) % 4 evaluates to 1
        # here, i.e. exactly the first fold. For 10 folds the modulus must
        # become 9 (not 8: (9+1) % 8 == 2 would select two folds).
        wrap_hi = int(((i+1)%4)*idx*0.2)
        val = data[:wrap_hi]
        val_y = y[:wrap_hi]

        # Training set: drop the test fold first, then the leading
        # validation rows from what remains.
        test_rows = range(test_lo, test_hi)
        val_rows = range(wrap_hi)
        train = np.delete(np.delete(data, test_rows, axis=0), val_rows, axis=0)
        train_y = np.delete(np.delete(y, test_rows, axis=0), val_rows, axis=0)
    else:
        # The validation fold is the one directly after the test fold.
        val_hi = int((i+2)*idx*0.2)
        val = data[test_hi:val_hi]
        val_y = y[test_hi:val_hi]

        # Training set: everything except the adjacent test and validation
        # folds, removed in one go.
        held_out = range(test_lo, val_hi)
        train = np.delete(data, held_out, axis=0)
        train_y = np.delete(y, held_out, axis=0)

    print("test:\n",test,"-----test_y",test_y)
    print("val:\n", val,"-----val_y",val_y)
    print("train\n", train,"---------train_y", train_y)
    print("---------------------------------------")

结果如下：

0 times: 
test:
 tf.Tensor(
[[0.  0.3 0.7]
 [1.  1.3 1.7]
 [2.  2.3 2.7]
 [3.  3.3 3.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
val:
 tf.Tensor(
[[4.  4.3 4.7]
 [5.  5.3 5.7]
 [6.  6.3 6.7]
 [7.  7.3 7.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
train
 [[ 8.   8.3  8.7]
 [ 9.   9.3  9.7]
 [10.  10.3 10.7]
 [11.  11.3 11.7]
 [12.  12.3 12.7]
 [13.  13.3 13.7]
 [14.  14.3 14.7]
 [15.  15.3 15.7]
 [16.  16.3 16.7]
 [17.  17.3 17.7]
 [18.  18.3 18.7]
 [19.  19.3 19.7]] ---------train_y [ 8  9 10 11 12 13 14 15 16 17 18 19]
---------------------------------------
1 times: 
test:
 tf.Tensor(
[[4.  4.3 4.7]
 [5.  5.3 5.7]
 [6.  6.3 6.7]
 [7.  7.3 7.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
val:
 tf.Tensor(
[[ 8.   8.3  8.7]
 [ 9.   9.3  9.7]
 [10.  10.3 10.7]
 [11.  11.3 11.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int32)
train
 [[ 0.   0.3  0.7]
 [ 1.   1.3  1.7]
 [ 2.   2.3  2.7]
 [ 3.   3.3  3.7]
 [12.  12.3 12.7]
 [13.  13.3 13.7]
 [14.  14.3 14.7]
 [15.  15.3 15.7]
 [16.  16.3 16.7]
 [17.  17.3 17.7]
 [18.  18.3 18.7]
 [19.  19.3 19.7]] ---------train_y [ 0  1  2  3 12 13 14 15 16 17 18 19]
---------------------------------------
2 times: 
test:
 tf.Tensor(
[[ 8.   8.3  8.7]
 [ 9.   9.3  9.7]
 [10.  10.3 10.7]
 [11.  11.3 11.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int32)
val:
 tf.Tensor(
[[12.  12.3 12.7]
 [13.  13.3 13.7]
 [14.  14.3 14.7]
 [15.  15.3 15.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
train
 [[ 0.   0.3  0.7]
 [ 1.   1.3  1.7]
 [ 2.   2.3  2.7]
 [ 3.   3.3  3.7]
 [ 4.   4.3  4.7]
 [ 5.   5.3  5.7]
 [ 6.   6.3  6.7]
 [ 7.   7.3  7.7]
 [16.  16.3 16.7]
 [17.  17.3 17.7]
 [18.  18.3 18.7]
 [19.  19.3 19.7]] ---------train_y [ 0  1  2  3  4  5  6  7 16 17 18 19]
---------------------------------------
3 times: 
test:
 tf.Tensor(
[[12.  12.3 12.7]
 [13.  13.3 13.7]
 [14.  14.3 14.7]
 [15.  15.3 15.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
val:
 tf.Tensor(
[[16.  16.3 16.7]
 [17.  17.3 17.7]
 [18.  18.3 18.7]
 [19.  19.3 19.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
train
 [[ 0.   0.3  0.7]
 [ 1.   1.3  1.7]
 [ 2.   2.3  2.7]
 [ 3.   3.3  3.7]
 [ 4.   4.3  4.7]
 [ 5.   5.3  5.7]
 [ 6.   6.3  6.7]
 [ 7.   7.3  7.7]
 [ 8.   8.3  8.7]
 [ 9.   9.3  9.7]
 [10.  10.3 10.7]
 [11.  11.3 11.7]] ---------train_y [ 0  1  2  3  4  5  6  7  8  9 10 11]
---------------------------------------
4 times: 
test:
 tf.Tensor(
[[16.  16.3 16.7]
 [17.  17.3 17.7]
 [18.  18.3 18.7]
 [19.  19.3 19.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
val:
 tf.Tensor(
[[0.  0.3 0.7]
 [1.  1.3 1.7]
 [2.  2.3 2.7]
 [3.  3.3 3.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
train
 [[ 4.   4.3  4.7]
 [ 5.   5.3  5.7]
 [ 6.   6.3  6.7]
 [ 7.   7.3  7.7]
 [ 8.   8.3  8.7]
 [ 9.   9.3  9.7]
 [10.  10.3 10.7]
 [11.  11.3 11.7]
 [12.  12.3 12.7]
 [13.  13.3 13.7]
 [14.  14.3 14.7]
 [15.  15.3 15.7]] ---------train_y [ 4  5  6  7  8  9 10 11 12 13 14 15]
---------------------------------------

可见数据在各组之间循环轮换，对应的 label 也同步轮换。
这里设置的是5组；如果要改成10组，把代码中的 range(5) 改成 range(10)，把 0.2 改成 0.1，并把取模用的 4 改成 9（即组数减1；若改成 8，最后一轮 (9+1)%8 等于 2，验证集会取到两组）。

这段代码并不完美，只是能实现训练集、验证集、测试集三者的循环；如果有更好的代码，欢迎分享给我。

您还可以看一下陈槐老师的零基础新手入门软件测试必知必会课程中的大型项目测试用例模板内容详细讲解小节, 巩固相关知识点