Python置信度输出都为零,这是为什么?

为什么这个代码输出的置信度为零,应该怎么改呢

img


# Transactions: each inner list holds the goods bought in one purchase.
data = [
    ['西红柿', '排骨', '鸡蛋'],
    ['西红柿', '茄子'],
    ['鸡蛋', '袜子'],
    ['西红柿', '排骨', '茄子'],
    ['西红柿', '排骨', '袜子', '酸奶'],
    ['鸡蛋', '茄子', '酸奶'],
    ['排骨', '鸡蛋', '茄子'],
    ['土豆', '鸡蛋', '袜子'],
    ['西红柿', '排骨', '鞋子', '土豆'],
]
# Remove repeated goods inside a single transaction.
data = [list(set(basket)) for basket in data]

# Every item that can appear in a transaction; this fixes the column order.
items = ['土豆', '排骨', '茄子', '袜子', '西红柿', '酸奶', '鞋子', '鸡蛋']

# Boolean matrix: one row per transaction, one 0/1 flag per item.
matrix = [[1 if name in basket else 0 for name in items] for basket in data]

# Print the matrix as a table; every cell, including the last, ends in a tab.
print(''.join(f'{heading}\t' for heading in ['ID', *items]))
for number, flags in enumerate(matrix, start=1):
    print(''.join(f'{cell}\t' for cell in ['I' + str(number), *flags]))

# Association rules to evaluate, written as (antecedent, consequent) pairs.
rules = [
    ('西红柿', '排骨'),
    ('排骨', '西红柿'),
    ('袜子', '鸡蛋'),
    ('茄子', '排骨'),
    ('茄子', '西红柿'),
    ('茄子', '鸡蛋'),
]

def confidence(rule, data):
    """Compute and print the confidence of ``rule`` over ``data``.

    ``rule`` is indexed as ``(rule[0], rule[1])`` = (antecedent, consequent)
    and ``data`` is iterated as a collection of transactions.

    Returns ``rule_count / antecedent_count`` as a float, or 0 when either
    count is zero.

    NOTE(review): this is the version that always reports 0. When ``rule[0]``
    is a string (as in the ``rules`` list in this file), ``tuple(rule[0])``
    breaks it into single characters, so the subset test below compares
    characters against whole item names and never matches.
    """
    # tuple() of a str yields its characters, e.g. '茄子' -> ('茄', '子').
    antecedent = tuple(rule[0])
    consequent = tuple(rule[1])
    antecedent_count = 0
    rule_count = 0
    for d in data:
        # Count transactions that contain every element of the antecedent ...
        if set(antecedent).issubset(set(d)):
            antecedent_count += 1
            # ... and, among those, the ones that also contain the consequent.
            if set(consequent).issubset(set(d)):
                rule_count += 1
    if antecedent_count == 0 or rule_count == 0: # if the antecedent or the rule itself never occurs, return 0
        print(f"Rule: {rule} not found in data")
        return 0
    confidence = float(rule_count) / antecedent_count
    print(f"Rule: {rule}, Confidence: {confidence:.4f}")
    return confidence

# Print the confidence of each candidate rule, to four decimal places.
for candidate in rules:
    score = confidence(candidate, data)
    print(f"Rule: {candidate}  \t Confidence: {score:.4f}")

问题分析:
1. 第一处问题在confidence函数中的 antecedent = tuple(rule[0]) 这一句:rule[0]本身是一个字符串,tuple()会把它拆成单个字符,例如'茄子'会变成('茄', '子'),显然这不是你想要的效果。
2. 第二处问题也在confidence函数中:if set(antecedent).issubset(set(d)): 这一句里的set(antecedent)同样会把'西红柿'拆成'西'、'红'、'柿'三个字符,而数据中的元素是完整的物品名,所以子集判断永远不成立,置信度就一直是0。因此要让antecedent保持为字符串,先用split(" ")得到只包含完整物品名的list,最后再转成set。

# Data preparation: each inner list is one transaction (a basket of goods).
data = [['西红柿', '排骨', '鸡蛋'],
        ['西红柿', '茄子'],
        ['鸡蛋', '袜子'],
        ['西红柿', '排骨', '茄子'],
        ['西红柿', '排骨', '袜子', '酸奶'],
        ['鸡蛋', '茄子', '酸奶'],
        ['排骨', '鸡蛋', '茄子'],
        ['土豆', '鸡蛋', '袜子'],
        ['西红柿', '排骨', '鞋子', '土豆']]
# De-duplicate the items inside each transaction. dict.fromkeys keeps the
# first-seen order, so the lists are the same on every run; list(set(...))
# would reorder them differently from run to run.
data = [list(dict.fromkeys(d)) for d in data]

# Collect every item that occurs in the data. Sorting gives a stable column
# order and keeps the list in sync with the data if transactions change.
items = sorted({item for d in data for item in d})

# Build the boolean matrix: matrix[i][j] is 1 when transaction i contains
# items[j], otherwise 0.
matrix = [[int(item in d) for item in items] for d in data]

# Print the boolean matrix as a tab-separated table with a header row.
print('ID', end='\t')
for item in items:
    print(item, end='\t')
print()
for row_id, row in enumerate(matrix, start=1):
    print(f'I{row_id}', end='\t')
    for cell in row:
        print(cell, end='\t')
    print()

# Association rules to evaluate, as (antecedent, consequent) pairs.
rules = [('西红柿', '排骨'), ('排骨', '西红柿'), ('袜子', '鸡蛋'),
         ('茄子', '排骨'), ('茄子', '西红柿'), ('茄子', '鸡蛋')]

def _as_itemset(side):
    """Normalize one side of a rule to a set of whole item names."""
    if isinstance(side, str):
        # Split on spaces rather than iterating the string, which would
        # yield single characters ('茄子' -> '茄', '子').
        return set(side.split(" "))
    return set(side)


def confidence(rule, data):
    """Return the confidence of the association rule ``rule`` over ``data``.

    Confidence is the number of transactions containing both the antecedent
    and the consequent, divided by the number containing the antecedent.

    Args:
        rule: A pair ``(antecedent, consequent)``. Each side is either a
            string, which is split on single spaces into item names, or an
            iterable of item names describing a multi-item itemset.
        data: An iterable of transactions, each an iterable of item names.

    Returns:
        The confidence as a float. When the antecedent, or the antecedent
        together with the consequent, occurs in no transaction, a notice is
        printed and 0.0 is returned.
    """
    # Build both item sets once instead of on every loop iteration.
    antecedent_items = _as_itemset(rule[0])
    consequent_items = _as_itemset(rule[1])
    antecedent_count = 0
    rule_count = 0
    for transaction in data:
        basket = set(transaction)
        if antecedent_items <= basket:
            antecedent_count += 1
            if consequent_items <= basket:
                rule_count += 1
    # The zero check also guards the division below.
    if antecedent_count == 0 or rule_count == 0:
        print(f"Rule: {rule} not found in data")
        return 0.0
    return rule_count / antecedent_count

# Show each rule next to its confidence (only confidence is computed here).
for rule in rules:
    rule_confidence = confidence(rule, data)
    print(f"Rule: {rule}  \t Confidence: {rule_confidence:.4f}")

结果输出:
ID 土豆 排骨 茄子 袜子 西红柿 酸奶 鞋子 鸡蛋
I1 0 1 0 0 1 0 0 1
I2 0 0 1 0 1 0 0 0
I3 0 0 0 1 0 0 0 1
I4 0 1 1 0 1 0 0 0
I5 0 1 0 1 1 1 0 0
I6 0 0 1 0 0 1 0 1
I7 0 1 1 0 0 0 0 1
I8 1 0 0 1 0 0 0 1
I9 1 1 0 0 1 0 1 0
Rule: ('西红柿', '排骨') Confidence: 0.8000
Rule: ('排骨', '西红柿') Confidence: 0.8000
Rule: ('袜子', '鸡蛋') Confidence: 0.6667
Rule: ('茄子', '排骨') Confidence: 0.5000
Rule: ('茄子', '西红柿') Confidence: 0.5000
Rule: ('茄子', '鸡蛋') Confidence: 0.5000