用朴素贝叶斯对新闻文本分类

自己写了代码
但是执行到朴素贝叶斯分类的时候,会报错,效果非常差,不知道是不是因为我测试的时候,为了省去调试时间,每个类就抽了几个文本跑
希望有人看到,能给我再理理思路,帮忙看看有什么问题。谢谢啦

img

import os
import jieba
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split

def stopwordslist(filepath):   # 定义函数创建停用词列表
    stopword = [line.strip() for line in open(filepath, 'r',encoding='gb18030', errors='ignore').readlines()]    #以行的形式读取停用词表,同时转换为列表
    return stopword

jieba.setLogLevel(jieba.logging.INFO)
path = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/try" #文件夹目录/C3-Art
path2= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg"
files= os.listdir(path) #得到文件夹下的所有文件名称
label=[]
txt = []
z=0
sum=0
for file in files: #遍历文件夹
    position = path+'/'+ file #构造绝对路径,"\\",其中一个'\'为转义符
    filesin = os.listdir(position)
    print("3")
    for filein in filesin:
        positionin = position + '/' + filein  # 构造绝对路径,"\\",其中一个'\'为转义符
        # print(positionin)
        with open(positionin, "r", encoding='gb18030', errors='ignore') as f:  # 打开文件'gb18030'
            data = f.read()   #读取文件
            sum=sum+1
            label.append(z)
            words = jieba.cut(data)
            print("2")
            filepath = "C:/Users/ASUS/Desktop/机器学习/机器学习实验/cn_stopwords.txt"
            path1="C:/Users/ASUS/Desktop/机器学习/机器学习实验"
            stopwords = stopwordslist(filepath)  # 这里加载停用词的路径
            for word in words:  # for循环遍历分词后的每个词语
                if word not in stopwords:  # 判断分词后的词语是否在停用词表内
                    # if word != '\t':
                    if not word.isspace():
                        with open(os.path.join(path2,str(sum)+ 'seg.txt'), 'a+', encoding='gb18030') as f:
                            f.write(word+' ')
                        with open(os.path.join(path1, 'seg.txt'), 'a+', encoding='gb18030') as f:
                            f.write(word + ' ')
                            # txt.append(word + '')
                            # np.vstack((txts, b))
            # print(txt)
            # txt.extend(txts)
                        # with open(os.path.join(positionin, 'seg.txt'), 'a+', encoding='gb18030') as f:
                        #     f.write(word+' ')

    z=z+1

seg= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg.txt"
seg1= "C:/Users/ASUS/Desktop/机器学习/机器学习实验/seg1.txt"
with open(seg, "r", encoding='gb18030', errors='ignore') as f:  # 打开文件'gb18030'
    txt = f.read()   #读取文件
vocab = set(txt.split())
char_dict = {}
for word in vocab:
    # for i, char in enumerate(word):
        if word not in char_dict:
            char_dict[word] = len(char_dict)
# vocab_size = len(vocab)
# vocabc=list(vocabb)
# vocab=np.array(vocabc)
# word2id = {w: i for i, w in enumerate(vocab)}
# print(vocab)
# 创建全零向量,并将对应位置设置为1
# token_index = {}  # 索引
# for sample in vocab:
#     for word in sample.split():
#         if word not in token_index:
#             token_index[word] = len(token_index) + 1

# print(len(token_index))  # 5
# print(token_index)
onehot_vectors=[]
num=0
#print(txt)
files2= os.listdir(path2)
for segfile in files2:
    position2 = path2 + '/' + segfile
    with open(position2, "r", encoding='gb18030', errors='ignore') as f:  # 打开文件'gb18030'
        text = f.read()  # 读取文件
        text1 = set(text.split())
        for wor in text1:
            vec = [0] * len(vocab)
            if wor in char_dict:
                print(wor)
                vec[char_dict[wor]] = 1
                num=num+1
        onehot_vectors.append(vec)

    # print(vec)
#print(onehot_vectors)
# print(vector)# print(vocab_size)
print(len(onehot_vectors))
print(label)
print(num)
# print(z)
x_train, x_test, y_train, y_test = train_test_split(onehot_vectors, label, test_size=0.2, random_state=42)
# 训练朴素贝叶斯分类器
clf_nb = MultinomialNB()  # 定义一个多项式朴素贝叶斯分类器
clf_nb.fit(x_train, y_train)  # 利用训练集对其进行训练

# 预测并评估性能
y_pred = clf_nb.predict(x_test)  # 对测试集进行预测
accuracy = accuracy_score(y_test, y_pred)  # 准确率:分类正确的样本数与总样本数的比例
precision = precision_score(y_test, y_pred, average='macro',zero_division=1)  # 精度:TP/(TP+FP)的均值,即所有类别的分类结果加权平均
recall = recall_score(y_test, y_pred, average='macro',zero_division=1)  # 召回率:TP/(TP+FN)的均值,即所有类别的“正例”真实例子分配给其他(错)类的比例
f1 = f1_score(y_test, y_pred, average='macro',zero_division=1)  # F1得分:精确度和召回率的调和平均值
print("准确率={:.4f}, 精度={:.4f}, 召回率={:.4f}, F1得分={:.4f}".format(accuracy, precision, recall, f1))
fpr, tpr, thresholds = roc_curve(y_test, y_pred)  # 绘制ROC曲线, pos_label=None
auc_value = auc(fpr, tpr)  # auc表示根据混淆矩阵和类别标签计算的二进制分类模型的质量
# 输出结果
print("朴素贝叶斯分类器在{}个测试样本上的效果".format(len(y_test)))

print("ROC曲线下面积(AUC值)={:.4f}\n".format(auc_value))

  • 你可以看下这个问题的回答https://ask.csdn.net/questions/7686071
  • 我还给你找了一篇非常好的博客,你可以看看是否有帮助,链接:在训练前标准化了训练集和测试集,训练完了以后,抽取未标准化的一条数据进行预测,结果出了问题
  • 同时,你还可以查看手册:用于数字分类的限制性波尔兹曼机特征 中的内容
  • 除此之外, 这篇博客: 如何设置在交叉验证中同时设置训练集,验证集,测试集三个数据集并实现循环中的 开始在数据中循环,这儿设置为5组交叉验证 部分也许能够解决你的问题, 你可以仔细阅读以下内容或跳转源博客中阅读:
  • for i in range(5): # 10折的话把5改成10,下同
        print(i,"times: ")
        test=data[int(idx*i*0.2):int(idx*(i+1)*0.2),]   # 测试集,如果10折的话把0.2改成0.1,下同
        test_y=y[int(idx*i*0.2):int(idx*(i+1)*0.2),] # 测试集合标签
        if i+1 <= max(range(5)):     # 10折的话把5改成10,下同
            val=data[int(idx*(i+1)*0.2):int((i+2)*idx*0.2)] # 验证集,
            val_y=y[int(idx*(i+1)*0.2):int((i+2)*idx*0.2)] # 验证集标签
            
            train=np.delete(data,range(int(idx*i*0.2),int(idx*(i+2)*0.2)),axis=0) # 训练集,去掉测试集和验证集剩下部分
            train_y=np.delete(y,range(int(idx*i*0.2),int(idx*(i+2)*0.2)),axis=0) # 训练集标签
            #train=np.delete(train,range(int(idx*(i+1)*0.2),int((i+2)*idx*0.2)),axis=0)
        else:  # 最后一个循环:当最后一组为测试集,第一组为验证集,中间为训练集时
            val=data[:int(((i+1)%4)*idx*0.2)]   # 10折的话把4改成8,0.2改成0.1,下同
            val_y=y[:int(((i+1)%4)*idx*0.2)]
            
            train=np.delete(data,range(int(idx*i*0.2),int(idx*(i+1)*0.2)),axis=0)
            train=np.delete(train,range(int(((i+1)%4)*idx*0.2)),axis=0)
            
            train_y=np.delete(y,range(int(idx*i*0.2),int(idx*(i+1)*0.2)),axis=0)
            train_y=np.delete(train_y,range(int(((i+1)%4)*idx*0.2)),axis=0)
            
        print("test:\n",test,"-----test_y",test_y)
        print("val:\n", val,"-----val_y",val_y)
        print("train\n", train,"---------train_y", train_y)
        print("---------------------------------------")
    

    结果如下:

    0 times: 
    test:
     tf.Tensor(
    [[0.  0.3 0.7]
     [1.  1.3 1.7]
     [2.  2.3 2.7]
     [3.  3.3 3.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
    val:
     tf.Tensor(
    [[4.  4.3 4.7]
     [5.  5.3 5.7]
     [6.  6.3 6.7]
     [7.  7.3 7.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
    train
     [[ 8.   8.3  8.7]
     [ 9.   9.3  9.7]
     [10.  10.3 10.7]
     [11.  11.3 11.7]
     [12.  12.3 12.7]
     [13.  13.3 13.7]
     [14.  14.3 14.7]
     [15.  15.3 15.7]
     [16.  16.3 16.7]
     [17.  17.3 17.7]
     [18.  18.3 18.7]
     [19.  19.3 19.7]] ---------train_y [ 8  9 10 11 12 13 14 15 16 17 18 19]
    ---------------------------------------
    1 times: 
    test:
     tf.Tensor(
    [[4.  4.3 4.7]
     [5.  5.3 5.7]
     [6.  6.3 6.7]
     [7.  7.3 7.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
    val:
     tf.Tensor(
    [[ 8.   8.3  8.7]
     [ 9.   9.3  9.7]
     [10.  10.3 10.7]
     [11.  11.3 11.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int32)
    train
     [[ 0.   0.3  0.7]
     [ 1.   1.3  1.7]
     [ 2.   2.3  2.7]
     [ 3.   3.3  3.7]
     [12.  12.3 12.7]
     [13.  13.3 13.7]
     [14.  14.3 14.7]
     [15.  15.3 15.7]
     [16.  16.3 16.7]
     [17.  17.3 17.7]
     [18.  18.3 18.7]
     [19.  19.3 19.7]] ---------train_y [ 0  1  2  3 12 13 14 15 16 17 18 19]
    ---------------------------------------
    2 times: 
    test:
     tf.Tensor(
    [[ 8.   8.3  8.7]
     [ 9.   9.3  9.7]
     [10.  10.3 10.7]
     [11.  11.3 11.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int32)
    val:
     tf.Tensor(
    [[12.  12.3 12.7]
     [13.  13.3 13.7]
     [14.  14.3 14.7]
     [15.  15.3 15.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
    train
     [[ 0.   0.3  0.7]
     [ 1.   1.3  1.7]
     [ 2.   2.3  2.7]
     [ 3.   3.3  3.7]
     [ 4.   4.3  4.7]
     [ 5.   5.3  5.7]
     [ 6.   6.3  6.7]
     [ 7.   7.3  7.7]
     [16.  16.3 16.7]
     [17.  17.3 17.7]
     [18.  18.3 18.7]
     [19.  19.3 19.7]] ---------train_y [ 0  1  2  3  4  5  6  7 16 17 18 19]
    ---------------------------------------
    3 times: 
    test:
     tf.Tensor(
    [[12.  12.3 12.7]
     [13.  13.3 13.7]
     [14.  14.3 14.7]
     [15.  15.3 15.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([12 13 14 15], shape=(4,), dtype=int32)
    val:
     tf.Tensor(
    [[16.  16.3 16.7]
     [17.  17.3 17.7]
     [18.  18.3 18.7]
     [19.  19.3 19.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
    train
     [[ 0.   0.3  0.7]
     [ 1.   1.3  1.7]
     [ 2.   2.3  2.7]
     [ 3.   3.3  3.7]
     [ 4.   4.3  4.7]
     [ 5.   5.3  5.7]
     [ 6.   6.3  6.7]
     [ 7.   7.3  7.7]
     [ 8.   8.3  8.7]
     [ 9.   9.3  9.7]
     [10.  10.3 10.7]
     [11.  11.3 11.7]] ---------train_y [ 0  1  2  3  4  5  6  7  8  9 10 11]
    ---------------------------------------
    4 times: 
    test:
     tf.Tensor(
    [[16.  16.3 16.7]
     [17.  17.3 17.7]
     [18.  18.3 18.7]
     [19.  19.3 19.7]], shape=(4, 3), dtype=float32) -----test_y tf.Tensor([16 17 18 19], shape=(4,), dtype=int32)
    val:
     tf.Tensor(
    [[0.  0.3 0.7]
     [1.  1.3 1.7]
     [2.  2.3 2.7]
     [3.  3.3 3.7]], shape=(4, 3), dtype=float32) -----val_y tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
    train
     [[ 4.   4.3  4.7]
     [ 5.   5.3  5.7]
     [ 6.   6.3  6.7]
     [ 7.   7.3  7.7]
     [ 8.   8.3  8.7]
     [ 9.   9.3  9.7]
     [10.  10.3 10.7]
     [11.  11.3 11.7]
     [12.  12.3 12.7]
     [13.  13.3 13.7]
     [14.  14.3 14.7]
     [15.  15.3 15.7]] ---------train_y [ 4  5  6  7  8  9 10 11 12 13 14 15]
    ---------------------------------------
    

    可见数据在各组之间循环,而且对应的label也都是对应循环的
    这儿是设置的5组,如果是10组的话,把代码中range(5)直接改成range(10),然后把0.2改成0.1,4改成8即可。

    这个代码不完美,只是能实现三者循环,有更好的代码可以分享给我

  • 您还可以看一下 陈槐老师的零基础新手入门软件测试必知必会课程中的 大型项目测试用例模板内容详细讲解小节, 巩固相关知识点