The Python code below is for protein direct-coupling prediction. Why is everything in the generated npz file "nan"?
import sys
import numpy as np
import string
import tensorflow as tf
def parse_a3m(a3mlines):
    seqs = []
    labels = []
    for line in a3mlines:
        if line[0] == '>':
            labels.append(line.strip())
        else:
            seqs.append(line.rstrip())
    # map residues to integers 0-19 and the gap '-' to 20; any other character also becomes 20
    alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
    seq_num = np.array([list(s) for s in seqs], dtype='|S1').view(np.uint8)
    for i in range(alphabet.shape[0]):
        seq_num[seq_num == alphabet[i]] = i
    seq_num[seq_num > 20] = 20
    return {'seqs': seq_num, 'labels': labels}
def tf_cov(x, w=None):
    if w is None:
        # unweighted covariance (not used below)
        num_points = tf.cast(tf.shape(x)[0], tf.float32) - 1
        x_mean = tf.reduce_mean(x, axis=0, keepdims=True)  # keep_dims was renamed keepdims in TF2
        x = x - x_mean
    else:
        # sequence-weighted covariance
        num_points = tf.reduce_sum(w) - tf.sqrt(tf.reduce_mean(w))
        x_mean = tf.reduce_sum(x * w[:, None], axis=0, keepdims=True) / num_points
        x = (x - x_mean) * tf.sqrt(w[:, None])
    return tf.matmul(tf.transpose(x), x) / num_points
fp = open(sys.argv[1] + ".alignments", "r")
pair2a3m = {}
pair = ""
pairs = []
a3m = []
for line in fp:
    # lines starting with '>' name a pair; the lines that follow are its alignment block
    if line[0] == ">":
        if pair and a3m:
            pair2a3m[pair] = a3m
            pairs.append(pair)
        pair = line[:-1]
        a3m = []
    else:
        a3m.append(line)
fp.close()
if pair and a3m:
    pair2a3m[pair] = a3m
    pairs.append(pair)
print(pair2a3m)
config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.9))
with tf.Graph().as_default():
    x = tf.compat.v1.placeholder(tf.uint8, shape=(None, None), name="x")
    x_shape = tf.shape(x)
    x_nr = x_shape[0]   # number of sequences
    x_nc = x_shape[1]   # number of alignment columns
    x_ns = 21           # alphabet size: 20 amino acids + gap
    x_msa = tf.one_hot(x, x_ns)

    # sequence reweighting: weight = 1 / (number of sequences above 80% identity, including itself)
    x_cutoff = tf.cast(x_nc, tf.float32) * 0.8
    x_pw = tf.tensordot(x_msa, x_msa, [[1, 2], [1, 2]])   # pairwise identical-position counts
    x_cut = x_pw > x_cutoff
    x_weights = 1.0 / tf.reduce_sum(tf.cast(x_cut, dtype=tf.float32), -1)

    # weighted covariance with a ridge term scaled by 4.5 / sqrt(Neff), then its inverse
    x_feat = tf.reshape(x_msa, (x_nr, x_nc * x_ns))
    x_c = tf_cov(x_feat, x_weights) + tf.eye(x_nc * x_ns) * 4.5 / tf.sqrt(tf.reduce_sum(x_weights))
    x_c_inv = tf.linalg.inv(x_c)

    # contact score: Frobenius norm of each 20x20 coupling block (gap state dropped), diagonal zeroed
    x_w = tf.reshape(x_c_inv, (x_nc, x_ns, x_nc, x_ns))
    x_wi = tf.sqrt(tf.reduce_sum(tf.square(x_w[:, :-1, :, :-1]), (1, 3))) * (1 - tf.eye(x_nc))

    # APC (average product correction), currently disabled
    # x_ap = tf.reduce_sum(x_wi, 0, keepdims=True) * tf.reduce_sum(x_wi, 1, keepdims=True) / tf.reduce_sum(x_wi)
    # x_wip = (x_wi - x_ap) * (1 - tf.eye(x_nc))
    with tf.compat.v1.Session(config=config) as sess:
        results = []
        get_pairs = []
        for pair in pairs:
            print(pair)
            msa = parse_a3m(pair2a3m[pair])
            try:
                wip = sess.run(x_wi, {x: msa['seqs']})
                results.append(wip.astype(np.float16))
                get_pairs.append(pair)
                print(wip)
            except tf.errors.ResourceExhaustedError:
                # skip alignments that do not fit in GPU memory
                pass

# save one array per successfully processed pair, plus the list of pair names
np.savez_compressed("try2", *results, names=get_pairs)
rp = open("try2" + ".log", "w")
for pair in get_pairs:
    rp.write(pair + "\n")
rp.close()
Answering with partial reference to ChatGPT's guidance:
There are a few issues in the code that could produce "nan":
1. When the covariance matrix is computed without weights (i.e. w=None), the plain covariance over all sequences can be singular, so computing its inverse yields "nan". Passing sequence weights can help with this.
2. If the weights w are all zero, the denominator (num_points) in the covariance calculation becomes 0, which also produces "nan".
3. If w contains negative values, taking tf.sqrt of a negative number gives "nan".
4. A zero denominator when forming x_wi also gives "nan". A possible cause is that no pair of sequences in msa['seqs'] passes the similarity cutoff, leaving all elements of x_wi at 0; this would have to be traced back in the data preprocessing step.
A small diagnostic sketch for checking these weight-related conditions follows this list.
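The sketch below is a minimal, hypothetical helper (check_msa and cutoff_frac are not part of the original script); it mirrors the 80%-identity reweighting from the question's graph in plain NumPy, so the weight-related conditions above can be inspected for each MSA before the session runs.

import numpy as np

def check_msa(seqs, cutoff_frac=0.8):
    """seqs: integer MSA of shape (n_seqs, n_cols), as returned by parse_a3m()['seqs']."""
    nr, nc = seqs.shape
    one_hot = np.eye(21, dtype=np.float32)[seqs]                 # (nr, nc, 21)
    pw = np.tensordot(one_hot, one_hot, axes=([1, 2], [1, 2]))   # pairwise identical-position counts
    neighbors = (pw > cutoff_frac * nc).sum(-1)                  # each sequence at least matches itself
    weights = 1.0 / neighbors
    print("sequences:", nr, "columns:", nc)
    print("min/max weight:", weights.min(), weights.max())       # should lie in (0, 1], never 0 or negative
    print("Neff (sum of weights):", weights.sum())
    return weights

If the weights look fine here, the next step is to fetch the intermediate tensors themselves, e.g. sess.run(x_c, {x: msa['seqs']}) followed by np.isnan(...).any(), to see whether the "nan" first appears in the covariance or only after tf.linalg.inv.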
Citing ChatGPT: you could try adjusting some of the settings in the code, for example setting per_process_gpu_memory_fraction to a smaller value. You can also try clearing TensorFlow's cached state before the run by adding the following to the code:
from tensorflow.keras import backend as K
K.clear_session()
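For reference, a sketch of where these would go, assuming the script from the question runs under TF 2.x with the compat.v1 API (the 0.5 fraction is only an example value):

from tensorflow.keras import backend as K

K.clear_session()   # drop any graph/session state left over from a previous run
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.5))
# ...then build the graph and open tf.compat.v1.Session(config=config) as in the question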