# -*- coding: utf-8 -*-
"""
Created on Sat Dec 11 12:19:57 2021
@author: peng
"""
import pandas as pd
import numpy as np
class Bayes(object):
    """Frequency-based naive Bayes classifier for discrete feature values.

    Data layout used by every method: 2-D arrays with one *column per
    sample*. The last row holds the class label of each sample, every row
    above it holds one feature.
    """

    # File read by data_split() when no other source is given.
    DEFAULT_CSV = r'E:\pythoncodes\pandas\iris.csv'

    def data_split(self, path=None, test_ratio=0.2):
        """Randomly split the samples (columns) of a CSV file in two parts.

        Args:
            path: File path or file-like object accepted by
                ``pandas.read_csv``. Defaults to ``DEFAULT_CSV``.
            test_ratio: Fraction of the columns, within ``[0, 1]``, that
                goes into the test part.

        Returns:
            ``(first_list, second_list)``: the training columns and the
            test columns, both as NumPy arrays with all rows of the file.

        Raises:
            ValueError: If ``test_ratio`` lies outside ``[0, 1]``.
        """
        if not 0 <= test_ratio <= 1:
            raise ValueError('test_ratio must be within [0, 1]')
        source = self.DEFAULT_CSV if path is None else path
        data = np.array(pd.read_csv(source))  # convert the frame to an array
        n_samples = data.shape[1]
        n_test = int(n_samples * test_ratio)
        # Shuffled column indices; uses NumPy's global random state.
        order = np.random.permutation(n_samples)
        second_list = data[:, order[:n_test]]
        # Every column that is not a test column is a training column, so
        # no sample is lost between the two parts.
        first_list = data[:, order[n_test:]]
        return first_list, second_list

    def getTrainData(self, first_list=None):
        """Separate a training array into feature rows and the label row.

        Args:
            first_list: Array with one column per sample whose last row
                holds the labels. When omitted, a module-level variable
                named ``first_list`` is used (legacy calling convention).

        Returns:
            ``(trainData, labels)``: all rows but the last, and the last
            row, each covering every column of ``first_list``.

        Raises:
            ValueError: If no training array is available.
        """
        if first_list is None:
            # Legacy lookup: callers used to bind the split to a global.
            first_list = globals().get('first_list')
        if first_list is None:
            raise ValueError(
                'no training data: pass first_list or call data_split() first')
        first_list = np.asarray(first_list)
        trainData = first_list[:-1, :]  # feature rows x1, x2, ...
        labels = first_list[-1, :]      # matching class labels y
        return trainData, labels

    def classify(self, trainData, labels, features, aim):
        """Predict the class of one sample with the naive Bayes rule.

        The score of a class ``y`` is ``P(y) * prod_j P(x_j | y)``, where
        all probabilities are relative frequencies in the training data.
        No smoothing is applied: a feature value never seen together with
        a class sets that class's score to zero.

        Args:
            trainData: 2-D array; row ``j`` holds feature ``j`` of every
                training sample.
            labels: Class label of every training sample, in column order.
            features: Feature values of the sample to classify;
                ``features[j]`` is compared against row ``j`` of
                ``trainData``.
            aim: Expected label; it is only printed next to the prediction.

        Returns:
            The label with the highest score. Ties are resolved in favour
            of the label that appears first in ``labels``.

        Raises:
            ValueError: If ``labels`` is empty or its length differs from
                the number of columns of ``trainData``.
        """
        train = np.asarray(trainData)
        label_list = list(labels)
        n_samples = len(label_list)
        if n_samples == 0:
            raise ValueError('labels must not be empty')
        if train.ndim != 2 or train.shape[1] != n_samples:
            raise ValueError('trainData needs one column per entry of labels')

        scores = {}
        # dict.fromkeys keeps first-appearance order, which makes the
        # tie-breaking of max() below deterministic.
        for y in dict.fromkeys(label_list):
            columns = [i for i, label in enumerate(label_list) if label == y]
            score = len(columns) / float(n_samples)  # prior P(y)
            for j, x in enumerate(features):
                # Conditional P(x_j | y). Counting per feature row keeps
                # equal values of different features apart.
                hits = np.count_nonzero(train[j, columns] == x)
                score *= hits / float(len(columns))
            scores[y] = score

        testResult = max(scores, key=scores.get)
        print(testResult, aim)
        return testResult
if __name__ == '__main__':
    # Demo run: split the data set, then classify every held-out sample.
    NB = Bayes()
    # `first_list` is deliberately bound at module level:
    # Bayes.getTrainData() called without arguments can resolve it as a
    # global, so this code must not be moved into a function.
    first_list, second_list = NB.data_split()
    trainData, labels = NB.getTrainData()
    # Same layout as the training array: the last row holds the labels and
    # the rows above it hold the features.
    rest_data = second_list[:-1]
    expected = second_list[-1]
    # Transposing yields one feature vector per test sample.
    for sample, aim in zip(rest_data.T, expected):
        # classify() prints the prediction next to the expected label.
        NB.classify(trainData, labels, sample, aim)