# Python naive Bayes — precision was too low (original question title)


# -*- coding: utf-8 -*-
"""
Created on Sat Dec 11 12:19:57 2021

@author: peng
"""


import pandas as pd
import numpy as np



class Bayes(object):
    """A tiny categorical naive-Bayes classifier over column-oriented data.

    Data layout (as used by the original script): ROWS are variables — the
    feature rows first, the class label in the LAST row — and COLUMNS are
    samples.  TODO(review): confirm this matches the CSV on disk.

    NOTE(review): probabilities are estimated by exact value matching with
    no smoothing; on continuous features (like raw iris measurements) most
    counts are zero, which also hurts accuracy — consider binning the
    features or adding Laplace smoothing.
    """

    def data_split(self, path=r'E:\pythoncodes\pandas\iris.csv',
                   test_ratio=0.2):
        """Read the CSV at *path* and randomly split its columns (samples)
        into a training block and a test block.

        Args:
            path: CSV file to load (raw string — the original path relied
                on backslash escapes being harmless).
            test_ratio: fraction of columns reserved for testing.

        Returns:
            (first_list, second_list): training columns and test columns,
            each a numpy array stacking the feature rows plus label row.
        """
        frame = pd.read_csv(path)
        data = np.array(frame)                      # DataFrame -> ndarray
        n_samples = data.shape[1]                   # one sample per column
        offset = int(n_samples * test_ratio)        # size of the test split
        col_order = np.arange(n_samples)
        np.random.shuffle(col_order)
        second_list = data[:, col_order[:offset]]   # test columns
        # BUGFIX: the original sliced up to shape[1]-1 and silently dropped
        # one shuffled column from the training set; keep them all.
        first_list = data[:, col_order[offset:]]
        # Remember the split so getTrainData() no longer needs a global.
        self.first_list = first_list
        self.second_list = second_list
        return first_list, second_list

    def getTrainData(self, first_list=None):
        """Split the training block into feature rows and the label row.

        Args:
            first_list: optional training array; defaults to the split
                stored by data_split(), falling back to the module-level
                global of the same name for backward compatibility.

        Returns:
            (trainData, labels): the feature rows (all but the last row)
            and the label row.
        """
        if first_list is None:
            first_list = getattr(self, 'first_list', None)
        if first_list is None:
            first_list = globals()['first_list']    # legacy global lookup
        last_row = first_list.shape[0] - 1
        # BUGFIX: the original sliced columns from 1 onward, silently
        # dropping one more training sample; use every column.
        trainData = first_list[0:last_row, :]       # feature rows
        labels = first_list[last_row, :]            # label row
        return trainData, labels

    def classify(self, trainData, labels, features, aim):
        """Predict the class of one sample with naive Bayes.

        Args:
            trainData: 2-D array, one row per feature, one column per sample.
            labels: class label of each training column.
            features: feature vector (one value per feature row) to classify.
            aim: the true label; printed beside the prediction for debugging.

        Returns:
            The predicted class label.  (The original discarded the
            prediction and returned the constant 1, so callers could never
            score it.)
        """
        labels = list(labels)
        total = float(len(labels))

        # Prior P(y) for every class seen in the training labels.
        P_y = {y: labels.count(y) / total for y in set(labels)}

        # Joint frequencies P(x_j, y) for the observed feature values.
        P_xy = {}
        for y in P_y:
            y_index = {i for i, label in enumerate(labels) if label == y}
            for j in range(len(features)):
                x_index = {i for i, value in enumerate(trainData[j, :])
                           if value == features[j]}
                xy_count = len(x_index & y_index)   # columns matching both
                P_xy[str(features[j]) + '*' + str(y)] = xy_count / total

        # Conditionals P(x_j | y) = P(x_j, y) / P(y).
        P = {}
        for y in P_y:
            for x in features:
                P[str(x) + '|' + str(y)] = P_xy[str(x) + '*' + str(y)] / P_y[y]

        # Posterior score F[y] = P(y) * prod_j P(x_j | y).
        # BUGFIX: the original multiplied by P[xey] where `xey` was a stale
        # key left over from the previous loop, so every factor was the SAME
        # probability — the main reason the reported precision was so low.
        F = {}
        for y in P_y:
            F[y] = P_y[y]
            for x in features:
                F[y] *= P[str(x) + '|' + str(y)]

        testResult = max(F, key=F.get)
        print(testResult, aim)
        # BUGFIX: do not overwrite the prediction with the constant 1.
        return testResult



if __name__ == '__main__':
    # BUGFIX: the original declared TR_total/TN_total but never updated
    # them and never reported any accuracy; the per-sample result was
    # computed and thrown away.
    correct = 0                             # predictions matching the truth
    tested = 0                              # test samples evaluated
    NB = Bayes()
    first_list, second_list = NB.data_split()
    trainData, labels = NB.getTrainData()
    rest_data = second_list[:4]             # the four feature rows of the test split
    for n in range(rest_data.shape[1]):     # one test sample per column
        aim = second_list[4, n]             # true label of sample n
        result = NB.classify(trainData, labels, rest_data[:, n], aim)
        tested += 1
        if result == aim:                   # relies on classify returning the label
            correct += 1
    if tested:                              # avoid division by zero on an empty split
        print('accuracy:', correct / tested)