# PyTorch version
import torch
import numpy as np
class NaiveBayes:
    """Categorical naive Bayes classifier with additive (Laplace) smoothing.

    Likelihoods are estimated from exact-match counts of feature values, so
    the model is only meaningful for discrete features: a continuous value
    that never reappears exactly gets a zero (or smoothing-only) likelihood.
    """

    def __init__(self, lamda) -> None:
        # lamda: additive smoothing strength; 0 disables smoothing entirely.
        self.lamda = lamda

    def fit(self, X_data, y_data):
        """Estimate class priors and per-class sample subsets.

        X_data: 2-D tensor (n_samples, n_features).
        y_data: 1-D tensor of class labels, aligned with X_data rows.
        """
        # Use torch.unique, not set(): iterating a tensor yields 0-dim
        # tensors that hash by object identity, so set(y_data) would treat
        # every element as a distinct class.
        self.classes = torch.unique(y_data).tolist()
        # Prior P(y = c) for each class, in self.classes order.
        self.prior = [(y_data == c).sum().item() / len(y_data)
                      for c in self.classes]
        # Training rows belonging to each class.
        self.samples = [X_data[y_data == c, :] for c in self.classes]
        # Number of distinct values per feature (smoothing denominator term).
        self.feature_value_count = [len(torch.unique(X_data[:, k]))
                                    for k in range(X_data.shape[1])]

    def predict(self, X_data):
        """Return a 1-D tensor with the MAP class label for each row of X_data."""
        pred = []
        for i in range(X_data.shape[0]):
            post = []
            for j in range(len(self.classes)):
                # Unnormalized posterior: P(y=c) * prod_k P(x_k | y=c).
                prior = self.prior[j]
                likelihood = 1.0
                for k in range(X_data.shape[1]):
                    # Smoothed frequency of this feature value within class j.
                    count = (self.samples[j][:, k] == X_data[i][k]).sum().item()
                    likelihood *= (count + self.lamda) / (
                        self.samples[j].shape[0]
                        + self.lamda * self.feature_value_count[k])
                post.append(prior * likelihood)
            # The class with the largest posterior wins.
            pred.append(self.classes[np.argmax(post)])
        return torch.tensor(pred)

    def score(self, X_data, y_data):
        """Return the mean accuracy of predict(X_data) against y_data."""
        pred = self.predict(X_data)
        return (pred == y_data).sum().item() / len(y_data)
def create_dataset(n_samples=1000):
    """Build a toy two-class 2-D dataset.

    The first n_samples // 2 rows are drawn from N(2, 1) with label 0,
    the remaining rows from N(-2, 1) with label 1.
    """
    n_neg = n_samples // 2          # size of class 0
    n_pos = n_samples - n_neg       # size of class 1 (covers odd n_samples)
    x0 = torch.normal(2, 1, size=(n_neg, 2), dtype=torch.float32)
    y0 = torch.zeros(n_neg, dtype=torch.float32)
    x1 = torch.normal(-2, 1, size=(n_pos, 2), dtype=torch.float32)
    y1 = torch.ones(n_pos, dtype=torch.float32)
    # Stack both classes into a single feature matrix and label vector.
    return torch.cat((x0, x1), 0), torch.cat((y0, y1), 0)
# Demo: train on the first 800 samples, evaluate on the held-out 200.
# NOTE(review): the features are continuous, so exact-match counting with
# lamda=0 yields zero likelihoods for unseen values; a Gaussian NB (or
# discretized features) would be needed for a meaningful accuracy here.
X, y = create_dataset(1000)
nb = NaiveBayes(lamda=0)
nb.fit(X[:800], y[:800])
# Print the hold-out accuracy (the original computed it and discarded it).
print(nb.score(X[800:], y[800:]))
# NumPy version
import numpy as np
class NaiveBayes:
    """Categorical naive Bayes classifier with additive (Laplace) smoothing.

    Likelihoods are estimated from exact-match counts of feature values, so
    the model is only meaningful for discrete features.
    """

    def __init__(self, lamda) -> None:
        # lamda: additive smoothing strength; 0 disables smoothing entirely.
        self.lamda = lamda
        # Populated by fit(); declared here so the attributes always exist.
        self.prior = []
        self.samples = []
        self.feature_value_count = []

    def fit(self, X_data, y_data):
        """Estimate class priors and per-class sample subsets.

        X_data: 2-D array (n_samples, n_features).
        y_data: 1-D array of class labels, aligned with X_data rows.
        """
        # np.unique gives a deterministic (sorted) class order, unlike set().
        self.classes = np.unique(y_data).tolist()
        # Rebuild the state lists instead of appending, so that calling
        # fit() a second time does not accumulate onto the previous fit.
        self.prior = [(y_data == c).sum() / len(y_data) for c in self.classes]
        self.samples = [X_data[y_data == c, :] for c in self.classes]
        # Number of distinct values per feature (smoothing denominator term).
        self.feature_value_count = [len(np.unique(X_data[:, k]))
                                    for k in range(X_data.shape[1])]

    def predict(self, X_data):
        """Return a 1-D array with the MAP class label for each row of X_data."""
        pred = []
        for i in range(X_data.shape[0]):
            post = []
            for j in range(len(self.classes)):
                # Unnormalized posterior: P(y=c) * prod_k P(x_k | y=c).
                prior = self.prior[j]
                likelihood = 1.0
                for k in range(X_data.shape[1]):
                    # Smoothed frequency of this feature value within class j.
                    count = (self.samples[j][:, k] == X_data[i][k]).sum()
                    likelihood *= (count + self.lamda) / (
                        self.samples[j].shape[0]
                        + self.lamda * self.feature_value_count[k])
                post.append(prior * likelihood)
            # The class with the largest posterior wins.
            pred.append(self.classes[np.argmax(post)])
        return np.array(pred)

    def score(self, X_data, y_data):
        """Return the mean accuracy of predict(X_data) against y_data."""
        pred = self.predict(X_data)
        return (pred == y_data).sum().item() / len(y_data)
def create_dataset(n_samples=1000):
    """Build a toy two-class 2-D dataset.

    The first n_samples // 2 rows are drawn from N(2, 1) with label 0,
    the remaining rows from N(-2, 1) with label 1.
    """
    n_neg = n_samples // 2          # size of class 0
    n_pos = n_samples - n_neg       # size of class 1 (covers odd n_samples)
    x0 = np.random.normal(2, 1, size=(n_neg, 2))
    y0 = np.zeros(n_neg)
    x1 = np.random.normal(-2, 1, size=(n_pos, 2))
    y1 = np.ones(n_pos)
    # Stack both classes into a single feature matrix and label vector.
    x = np.concatenate((x0, x1), axis=0)
    y = np.concatenate((y0, y1))
    return x, y
# Demo: fit on an 800-sample training split and report hold-out accuracy.
X, y = create_dataset(1000)
X_train, y_train = X[:800], y[:800]
X_test, y_test = X[800:], y[800:]
nb = NaiveBayes(lamda=0)
nb.fit(X_train, y_train)
print(nb.score(X_test, y_test))