为什么这段代码输出的置信度全都为零?应该怎么改呢?
# Transactions: each inner list is one shopping basket.
data = [
    ['西红柿', '排骨', '鸡蛋'],
    ['西红柿', '茄子'],
    ['鸡蛋', '袜子'],
    ['西红柿', '排骨', '茄子'],
    ['西红柿', '排骨', '袜子', '酸奶'],
    ['鸡蛋', '茄子', '酸奶'],
    ['排骨', '鸡蛋', '茄子'],
    ['土豆', '鸡蛋', '袜子'],
    ['西红柿', '排骨', '鞋子', '土豆'],
]
# Drop duplicate items inside each basket.
data = [list(set(basket)) for basket in data]

# Every item that can appear in a basket; the list order is the column order.
items = ['土豆', '排骨', '茄子', '袜子', '西红柿', '酸奶', '鞋子', '鸡蛋']

# Boolean matrix: one row per basket, one 0/1 flag per entry of `items`.
matrix = [[1 if name in basket else 0 for name in items] for basket in data]

# Print the matrix as a tab-separated table with a header row.
print('ID', end='\t')
for name in items:
    print(name, end='\t')
print()
for number, flags in enumerate(matrix, start=1):
    print('I' + str(number), end='\t')
    for flag in flags:
        print(flag, end='\t')
    print()

# Association rules to evaluate, written as (antecedent, consequent).
rules = [
    ('西红柿', '排骨'),
    ('排骨', '西红柿'),
    ('袜子', '鸡蛋'),
    ('茄子', '排骨'),
    ('茄子', '西红柿'),
    ('茄子', '鸡蛋'),
]
def confidence(rule, data):
    """Compute and print the confidence of ``rule`` over ``data``.

    ``rule`` is an ``(antecedent, consequent)`` pair and ``data`` is a
    sequence of transactions.  Each side of the rule is expanded element by
    element into a set, so a plain string contributes its individual
    characters rather than the string as a whole.

    Returns the ratio of transactions matching both sides to transactions
    matching the antecedent, or 0 (after printing a "not found" line) when
    either count is zero.
    """
    lhs = set(tuple(rule[0]))
    rhs = set(tuple(rule[1]))
    lhs_hits = 0
    both_hits = 0
    for basket in data:
        contents = set(basket)
        if not lhs <= contents:
            continue
        lhs_hits += 1
        # The consequent is only checked for baskets that matched the antecedent.
        if rhs <= contents:
            both_hits += 1
    if not (lhs_hits and both_hits):
        print(f"Rule: {rule} not found in data")
        return 0
    ratio = float(both_hits) / lhs_hits
    print(f"Rule: {rule}, Confidence: {ratio:.4f}")
    return ratio
# Print the confidence of every rule (only confidence is reported here).
for rule in rules:
    score = confidence(rule, data)
    print(f"Rule: {rule} \t Confidence: {score:.4f}")
问题分析:
1. 第一处问题在 confidence 函数中的 antecedent = tuple(rule[0]) 这一句:rule[0] 本身是字符串 '茄子',对字符串调用 tuple() 会把它按字符拆开,变成 ('茄', '子'),显然这不是你想要的效果(consequent = tuple(rule[1]) 同理)。
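2. 第二处问题也在 confidence 函数中:if set(antecedent).issubset(set(d)): 这一句里,set(antecedent) 同样是按字符建立集合,会把 '西红柿' 变成 {'西', '红', '柿'};而 d 中的元素是完整的 '西红柿',所以子集判断永远为 False,antecedent_count 一直是 0,函数直接返回 0。因此应让 antecedent 保持为字符串,再把整个字符串作为一个元素放进集合,例如 set(antecedent.split(" "))(先得到列表 ['西红柿'],再转成 set),或者直接写成 {antecedent}。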
# Data preparation: each inner list is one transaction (shopping basket).
data = [
    ['西红柿', '排骨', '鸡蛋'],
    ['西红柿', '茄子'],
    ['鸡蛋', '袜子'],
    ['西红柿', '排骨', '茄子'],
    ['西红柿', '排骨', '袜子', '酸奶'],
    ['鸡蛋', '茄子', '酸奶'],
    ['排骨', '鸡蛋', '茄子'],
    ['土豆', '鸡蛋', '袜子'],
    ['西红柿', '排骨', '鞋子', '土豆'],
]
# Remove duplicate items within a basket.  dict.fromkeys keeps first-seen
# order, so the result is identical on every run (set() order is not).
data = [list(dict.fromkeys(basket)) for basket in data]

# Collect every item that occurs in the data instead of hard-coding the list,
# so it cannot drift out of sync with `data`.  Sorting fixes the column order.
items = sorted({name for basket in data for name in basket})

# Boolean matrix: one row per basket, one 0/1 flag per entry of `items`.
matrix = [[int(name in basket) for name in items] for basket in data]

# Print the matrix as a tab-separated table.  Every line ends with a tab
# before the newline, matching the cell-by-cell output of the original loops.
print('\t'.join(['ID', *items]) + '\t')
for number, flags in enumerate(matrix, start=1):
    print('\t'.join(['I' + str(number), *map(str, flags)]) + '\t')

# Association rules to evaluate, written as (antecedent, consequent).
rules = [
    ('西红柿', '排骨'),
    ('排骨', '西红柿'),
    ('袜子', '鸡蛋'),
    ('茄子', '排骨'),
    ('茄子', '西红柿'),
    ('茄子', '鸡蛋'),
]
def _as_itemset(side):
    """Return one side of a rule as a set of whole item names."""
    # A bare string is a single item; iterating it would yield its characters.
    if isinstance(side, str):
        return {side}
    return set(side)


def confidence(rule, data):
    """Return the confidence of an association rule over ``data``.

    Confidence is the fraction of the transactions containing the antecedent
    that also contain the consequent.

    Args:
        rule: ``(antecedent, consequent)`` pair.  Each side is either a single
            item name (``str``, taken as one item even if it contains spaces)
            or an iterable of item names.
        data: Iterable of transactions, each an iterable of item names.

    Returns:
        The confidence as a float.  Returns ``0.0``, after printing a
        "not found" line, when no transaction contains both sides of the
        rule (which includes the case where the antecedent never occurs).
    """
    antecedent = _as_itemset(rule[0])
    consequent = _as_itemset(rule[1])
    antecedent_count = 0
    rule_count = 0
    for transaction in data:
        basket = set(transaction)  # built once, tested against both sides
        if antecedent <= basket:
            antecedent_count += 1
            # Only baskets that hold the antecedent can support the rule.
            if consequent <= basket:
                rule_count += 1
    # rule_count never exceeds antecedent_count, so this one check also
    # covers antecedent_count == 0 and guards the division below.
    if rule_count == 0:
        print(f"Rule: {rule} not found in data")
        return 0.0
    return rule_count / antecedent_count
# Print one line per rule with its confidence (support is not calculated).
for rule in rules:
    value = confidence(rule, data)
    print(f"Rule: {rule} \t Confidence: {value:.4f}")
结果输出:
ID 土豆 排骨 茄子 袜子 西红柿 酸奶 鞋子 鸡蛋
I1 0 1 0 0 1 0 0 1
I2 0 0 1 0 1 0 0 0
I3 0 0 0 1 0 0 0 1
I4 0 1 1 0 1 0 0 0
I5 0 1 0 1 1 1 0 0
I6 0 0 1 0 0 1 0 1
I7 0 1 1 0 0 0 0 1
I8 1 0 0 1 0 0 0 1
I9 1 1 0 0 1 0 1 0
Rule: ('西红柿', '排骨') Confidence: 0.8000
Rule: ('排骨', '西红柿') Confidence: 0.8000
Rule: ('袜子', '鸡蛋') Confidence: 0.6667
Rule: ('茄子', '排骨') Confidence: 0.5000
Rule: ('茄子', '西红柿') Confidence: 0.5000
Rule: ('茄子', '鸡蛋') Confidence: 0.5000