https://blog.csdn.net/Crashyi/article/details/114375678?utm_source=csdn_ai_ada_ask_robot
In the second code snippet (the test script) of this TransH knowledge graph embedding article, where does homework_8 in "from homework_8.transH_torch import data_loader,entity2id,relation2i" come from, and how can the import error be resolved?
homework_8 is simply a directory (package) that the author created inside their own project, so it does not exist on your machine.
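The author's project probably looked roughly like this (the names below are only a guess to illustrate the point):

homework_8/
    transH_torch.py    <- training code; data_loader, entity2id, relation2id are defined here
    test.py            <- the second ("test") snippet with the failing import

Since that directory only exists in the author's project, Python cannot resolve the homework_8 package on your machine.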
The algorithm itself lives in transH_torch.py, and the code of transH_torch is also given in the article, so you can recreate the module locally:
① Copy the algorithm below and save it locally as transH_torch.py, in the same directory as your test script.
import torch
import torch.optim as optim
import torch.nn.functional as F
import codecs
import numpy as np
import copy
import time
import random
entity2id = {}
relation2id = {}
relation_tph = {}  # average number of tail entities per head entity for each relation (tph)
relation_hpt = {}  # average number of head entities per tail entity for each relation (hpt)
'''
Data loading.
entity2id:   {entity1: id1, entity2: id2}
relation2id: {relation1: id1, relation2: id2}
'''
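# Illustrative input format (tab-separated). The concrete ids below are only
# placeholders for FB15k-style files; the loader skips any line that does not
# split into exactly 2 fields (entity2id.txt / relation2id.txt) or 3 fields (train.txt):
#   entity2id.txt:    /m/06cx9 <TAB> 0
#   relation2id.txt:  /film/film/genre <TAB> 0
#   train.txt:        /m/06cx9 <TAB> /m/0jm_ <TAB> /film/film/genre   (head, tail, relation)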
def data_loader(file):
    print("load file...")
    file1 = file + "train.txt"
    file2 = file + "entity2id.txt"
    file3 = file + "relation2id.txt"

    with open(file2, 'r') as f1, open(file3, 'r') as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()
        for line in lines1:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            entity2id[line[0]] = line[1]

        for line in lines2:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            relation2id[line[0]] = line[1]

    entity_set = set()      # all entities in the training set
    relation_set = set()    # all relations in the training set
    triple_list = []        # all triples in the training set
    relation_head = {}      # heads and head counts per relation, format: {r_: {head1: count1, head2: count2}}
    relation_tail = {}      # tails and tail counts per relation, format: {r_: {tail1: count1, tail2: count2}}

    with codecs.open(file1, 'r') as f:
        content = f.readlines()
        for line in content:
            triple = line.strip().split("\t")
            if len(triple) != 3:
                continue

            h_ = entity2id[triple[0]]
            t_ = entity2id[triple[1]]
            r_ = relation2id[triple[2]]

            triple_list.append([h_, t_, r_])

            entity_set.add(h_)
            entity_set.add(t_)
            relation_set.add(r_)

            if r_ in relation_head:
                if h_ in relation_head[r_]:
                    relation_head[r_][h_] += 1
                else:
                    relation_head[r_][h_] = 1
            else:
                relation_head[r_] = {}
                relation_head[r_][h_] = 1

            if r_ in relation_tail:
                if t_ in relation_tail[r_]:
                    relation_tail[r_][t_] += 1
                else:
                    relation_tail[r_][t_] = 1
            else:
                relation_tail[r_] = {}
                relation_tail[r_][t_] = 1

    # average number of tail entities per head entity for each relation (tph)
    for r_ in relation_head:
        sum1, sum2 = 0, 0
        for head in relation_head[r_]:
            sum1 += 1
            sum2 += relation_head[r_][head]
        tph = sum2 / sum1
        relation_tph[r_] = tph

    # average number of head entities per tail entity for each relation (hpt)
    for r_ in relation_tail:
        sum1, sum2 = 0, 0
        for tail in relation_tail[r_]:
            sum1 += 1
            sum2 += relation_tail[r_][tail]
        hpt = sum2 / sum1
        relation_hpt[r_] = hpt

    print("Complete load. entity : %d , relation : %d , triple : %d" % (
        len(entity_set), len(relation_set), len(triple_list)))

    return entity_set, relation_set, triple_list
class TransH:
    def __init__(self, entity_set, relation_set, triple_list, embedding_dim=50, lr=0.01, margin=1.0, norm=1, C=1.0, epsilon=1e-5):
        self.entities = entity_set        # entity set
        self.relations = relation_set     # relation set
        self.triples = triple_list        # training triples
        self.dimension = embedding_dim    # embedding dimension
        self.learning_rate = lr
        self.margin = margin
        self.norm = norm
        self.loss = 0.0
        self.norm_relations = {}          # Wr: normal vector of each relation's hyperplane
        self.hyper_relations = {}         # dr: translation vector on each relation's hyperplane
        self.C = C                        # weight of the soft constraints
        self.epsilon = epsilon

    def data_initialise(self):
        entityVectorList = {}             # entity vectors
        relationNormVectorList = {}
        relationHyperVectorList = {}
        device = "cpu"
        # map every entity and relation to a self.dimension-dimensional vector
        for entity in self.entities:
            entity_vector = torch.Tensor(self.dimension).uniform_(-6.0 / np.sqrt(self.dimension), 6.0 / np.sqrt(self.dimension))
            entityVectorList[entity] = entity_vector.requires_grad_(True)

        for relation in self.relations:
            relation_norm_vector = torch.Tensor(self.dimension).uniform_(-6.0 / np.sqrt(self.dimension), 6.0 / np.sqrt(self.dimension))
            relation_hyper_vector = torch.Tensor(self.dimension).uniform_(-6.0 / np.sqrt(self.dimension), 6.0 / np.sqrt(self.dimension))
            # print(relation_norm_vector)  # debug output; prints one tensor per relation
            relationNormVectorList[relation] = relation_norm_vector.requires_grad_(True)
            relationHyperVectorList[relation] = relation_hyper_vector.requires_grad_(True)

        self.entities = entityVectorList                # {id: vector, id: vector}
        self.norm_relations = relationNormVectorList    # {id: vector, id: vector}
        self.hyper_relations = relationHyperVectorList  # {id: vector, id: vector}
    def training_run(self, epochs=100, nbatches=100):
        # number of triples per batch
        batch_size = int(len(self.triples) / nbatches)
        print("batch size: ", batch_size)
        for epoch in range(epochs):
            # start timing this epoch
            start = time.time()
            # running loss for this epoch
            self.loss = 0.0

            # Normalise the embedding of the entities to 1
            # for entity in self.entities:
            #     self.entities[entity] = self.normalization(self.entities[entity])

            # process the training set in mini-batches
            for batch in range(nbatches):
                # draw batch_size triples at random
                batch_samples = random.sample(self.triples, batch_size)

                Tbatch = []
                for sample in batch_samples:
                    # deep-copy the positive sample before corrupting it
                    corrupted_sample = copy.deepcopy(sample)
                    pr = np.random.random(1)[0]  # random float in [0, 1)
                    # Bernoulli-sampling threshold: tph / (tph + hpt)
                    p = relation_tph[corrupted_sample[2]] / (
                            relation_tph[corrupted_sample[2]] + relation_hpt[corrupted_sample[2]])

                    # if pr > p, replace the head with another entity; otherwise replace the tail
                    if pr > p:
                        # replace the head corrupted_sample[0] with a randomly chosen entity
                        corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[0] == sample[0]:
                            corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                    else:
                        # replace the tail corrupted_sample[1] with a randomly chosen entity
                        corrupted_sample[1] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[1] == sample[1]:
                            corrupted_sample[1] = random.sample(list(self.entities.keys()), 1)[0]

                    # add the (positive, negative) pair to Tbatch
                    if (sample, corrupted_sample) not in Tbatch:
                        Tbatch.append((sample, corrupted_sample))

                # update the embeddings on this batch
                self.update_triple_embedding(Tbatch)

            end = time.time()
            print("epoch: ", epoch, "cost time: %s" % (round((end - start), 3)))
            print("running loss: ", self.loss)

        # dump the learned vectors to text files after training
        with codecs.open("entity_" + str(self.dimension) + "dim_batch" + str(batch_size), "w") as f1:
            for e in self.entities:
                f1.write(e + "\t")
                f1.write(str(list(self.entities[e])))
                f1.write("\n")

        with codecs.open("relation_norm_" + str(self.dimension) + "dim_batch" + str(batch_size), "w") as f2:
            for r in self.norm_relations:
                f2.write(r + "\t")
                f2.write(str(list(self.norm_relations[r])))
                f2.write("\n")

        with codecs.open("relation_hyper_" + str(self.dimension) + "dim_batch" + str(batch_size), "w") as f3:
            for r in self.hyper_relations:
                f3.write(r + "\t")
                f3.write(str(list(self.hyper_relations[r])))
                f3.write("\n")
    def normalization(self, vector):
        v = vector / torch.sum(torch.square(vector))
        return v.requires_grad_(True)

    # L2 norm: torch.norm() sums the squared elements and takes the square root
    # f = ||(h - Wr·h·Wr) + dr - (t - Wr·t·Wr)||_2
    def norm_l2(self, h, r_norm, r_hyper, t):
        return torch.norm(h - r_norm.dot(h) * r_norm + r_hyper - (t - r_norm.dot(t) * r_norm))

    # soft constraint on the entity norm (penalises ||e||^2 > 1)
    def scale_entity(self, vector):
        return torch.relu(torch.sum(vector ** 2) - 1)

    # # orthogonality constraint (unused)
    # def orthogonality(self, norm, hyper):
    #     return np.dot(norm, hyper)**2 / np.linalg.norm(hyper)**2 - self.epsilon**2
    # update the embeddings for one batch of (positive, negative) triple pairs
    def update_triple_embedding(self, Tbatch):
        for correct_sample, corrupted_sample in Tbatch:

            correct_head = self.entities[correct_sample[0]]
            correct_tail = self.entities[correct_sample[1]]
            # Wr
            relation_norm = self.norm_relations[correct_sample[2]]
            # dr
            relation_hyper = self.hyper_relations[correct_sample[2]]

            corrupted_head = self.entities[corrupted_sample[0]]
            corrupted_tail = self.entities[corrupted_sample[1]]

            # # calculate the distance of the triples
            # correct_distance = self.norm_l2(correct_head, relation_norm, relation_hyper, correct_tail)
            # corrupted_distance = self.norm_l2(corrupted_head, relation_norm, relation_hyper, corrupted_tail)

            # SGD: adjust the vectors involved in this pair to minimise the loss
            opt1 = optim.SGD([correct_head], lr=0.01)
            opt2 = optim.SGD([correct_tail], lr=0.01)
            opt3 = optim.SGD([relation_norm], lr=0.01)
            opt4 = optim.SGD([relation_hyper], lr=0.01)

            if correct_sample[0] == corrupted_sample[0]:
                # the tail entity was corrupted
                opt5 = optim.SGD([corrupted_tail], lr=0.01)
                # distance of the positive triple
                correct_distance = self.norm_l2(correct_head, relation_norm, relation_hyper, correct_tail)
                # distance of the negative triple
                corrupted_distance = self.norm_l2(correct_head, relation_norm, relation_hyper, corrupted_tail)
                # soft scale constraints
                scale = self.scale_entity(correct_head) + self.scale_entity(correct_tail) + self.scale_entity(corrupted_tail)
            else:
                # the head entity was corrupted
                opt5 = optim.SGD([corrupted_head], lr=0.01)
                correct_distance = self.norm_l2(correct_head, relation_norm, relation_hyper, correct_tail)
                corrupted_distance = self.norm_l2(corrupted_head, relation_norm, relation_hyper, correct_tail)
                scale = self.scale_entity(correct_head) + self.scale_entity(correct_tail) + self.scale_entity(corrupted_head)

            opt1.zero_grad()
            opt2.zero_grad()
            opt3.zero_grad()
            opt4.zero_grad()
            opt5.zero_grad()

            loss = F.relu(self.margin + correct_distance - corrupted_distance) + self.C * scale
            loss.backward()
            self.loss += loss.item()

            opt1.step()
            opt2.step()
            opt3.step()
            opt4.step()
            opt5.step()

            # write back only the vectors touched in this step, instead of normalising all vectors together
            self.entities[correct_sample[0]] = correct_head
            self.entities[correct_sample[1]] = correct_tail
            if correct_sample[0] == corrupted_sample[0]:
                # the negative sample replaced the tail entity, so write the corrupted tail back as well
                self.entities[corrupted_sample[1]] = corrupted_tail
            elif correct_sample[1] == corrupted_sample[1]:
                # the negative sample replaced the head entity, so write the corrupted head back as well
                self.entities[corrupted_sample[0]] = corrupted_head

            # the paper notes that the relation embeddings do not need to be normalised
            self.norm_relations[correct_sample[2]] = relation_norm
            self.hyper_relations[correct_sample[2]] = relation_hyper
if __name__ == '__main__':
    file1 = "D:/Pycharmprojects/bigDataAnalysis/FB15k/"
    entity_set, relation_set, triple_list = data_loader(file1)

    transH = TransH(entity_set, relation_set, triple_list, embedding_dim=50, lr=0.01, margin=1.0, norm=1)
    transH.data_initialise()
    transH.training_run()
② Change the import in the test script to the code below:
# from homework_8.transH_torch import data_loader,entity2id,relation2id
from transH_torch import data_loader,entity2id,relation2id
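With that change, running the test script from the same directory as transH_torch.py resolves the import. As a quick sanity check, a minimal sketch (the FB15k path is just the example path from the article; point data_dir at your own data directory):

from transH_torch import data_loader, entity2id, relation2id

data_dir = "D:/Pycharmprojects/bigDataAnalysis/FB15k/"   # directory containing train.txt, entity2id.txt, relation2id.txt
entity_set, relation_set, triple_list = data_loader(data_dir)
print(len(entity2id), "entities,", len(relation2id), "relations,", len(triple_list), "training triples")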