补充代码:初始化高斯朴素贝叶斯模型并训练,测试其分类性能。调节模型参数,使邮件分类准确率不低于70%。
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
def make_Dictionary(root_dir):
    """Count the words that occur in the files directly under ``root_dir``.

    Every file is read line by line and split on whitespace. Tokens that are
    not purely alphabetic, and tokens that are a single character long, are
    discarded from the result.

    Args:
        root_dir: Directory whose entries are opened as text files.

    Returns:
        A ``Counter`` mapping each remaining word to its number of
        occurrences, with keys in order of first appearance.
    """
    word_counts = Counter()
    for entry in os.listdir(root_dir):
        with open(os.path.join(root_dir, entry)) as handle:
            for text_line in handle:
                word_counts.update(text_line.split())

    # Iterate over a snapshot of the keys: the counter is mutated in the loop.
    for token in list(word_counts):
        if not token.isalpha() or len(token) == 1:
            del word_counts[token]
    return word_counts
def extract_features(mail_dir,dictionary):
    """Build a word-count feature matrix and spam labels for a mail directory.

    Only the third line of each file (line index 2) is tokenised; all other
    lines are ignored. NOTE(review): presumably that line holds the mail body
    in this corpus layout -- confirm against the data set.

    Args:
        mail_dir: Directory whose entries are opened as text files.
        dictionary: The vocabulary. Either a sized iterable of words (for
            example the ``Counter`` returned by ``make_Dictionary``) or a
            sequence of ``(word, count)`` pairs such as the output of
            ``Counter.most_common``. The position of an entry is the column
            it occupies in the feature matrix.

    Returns:
        A ``(features_matrix, train_labels)`` tuple. ``features_matrix`` has
        shape ``(number of files, len(dictionary))`` and holds, per file, how
        often each vocabulary word occurs on the third line.
        ``train_labels`` holds 1 for files whose name contains ``"spmsg"``
        and 0 otherwise. Rows follow the order of ``os.listdir(mail_dir)``.
    """
    files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), len(dictionary)))
    train_labels = np.zeros(len(files))

    # Resolve word -> column once. Entries may be bare words (iterating a
    # Counter yields its keys) or (word, count) pairs; comparing ``entry[0]``
    # against the word is only correct for the latter, because for a bare
    # string it is just the first character.
    word_to_column = {}
    for column, entry in enumerate(dictionary):
        vocab_word = entry if isinstance(entry, str) else entry[0]
        word_to_column[vocab_word] = column

    for doc_id, fil in enumerate(files):
        with open(fil) as fi:
            for line_no, line in enumerate(fi):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        # Words outside the vocabulary have no column.
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # nothing after the third line is used
        # Label from the file name only, so a directory component that
        # happens to contain "spmsg" cannot influence the result.
        if "spmsg" in os.path.basename(fil):
            train_labels[doc_id] = 1
    return features_matrix, train_labels
def test():
    """Train a Gaussian naive Bayes classifier on the mail corpus and score it.

    The vocabulary is built from the training directory, features are
    extracted for both the training and the test directory, a ``GaussianNB``
    model is fitted on the training features and evaluated on the test
    features.

    Returns:
        The accuracy on the test mails as a percentage (``accuracy_score``
        multiplied by 100).
    """
    TRAIN_DIR = "src/step1/train-mails"
    TEST_DIR = "src/step1/test-mails"
    dictionary = make_Dictionary(TRAIN_DIR)
    X_train, y_train = extract_features(TRAIN_DIR,dictionary)
    X_test, y_test = extract_features(TEST_DIR,dictionary)
    # Task: initialise the Bayes model, train it and measure its
    # classification performance. Tune the model parameters so that the mail
    # classification accuracy is at least 70%, and return the accuracy as a
    # percentage.
    ########## Begin ##########
    # Default hyper-parameters are used here.
    # NOTE(review): ``var_smoothing`` is the GaussianNB knob to adjust if the
    # score on this corpus needs tuning.
    model = GaussianNB()
    model.fit(X_train, y_train)
    # Predictions of the classifier on the test set
    predictions = model.predict(X_test)
    # Compute the accuracy; accuracy_score returns a fraction, so scale it
    accuracy = accuracy_score(y_test, predictions) * 100
    ########## End ##########
    return accuracy
修改了很多遍,正确率始终只有60%多。
分类精度为:74.82758620689656%
参考这篇博文:https://blog.csdn.net/qq_42589613/article/details/127648820