python,蛋白直接耦合分析DCA

蛋白直接耦合预测python代码如下,为什么生成的npz文件都是“nan”:

import sys
import numpy as np
import string
import tensorflow as tf
def parse_a3m(a3mlines):
    seqs = []
    labels = []
    for line in a3mlines:
        if line[0] == '>':
            labels.append(line.strip())
        else:
            seqs.append(line[:-1])
    alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
    seq_num = np.array([list(s) for s in seqs], dtype='|S1').view(np.uint8)
    for i in range(alphabet.shape[0]):
        seq_num[seq_num == alphabet[i]] = i
    seq_num[seq_num > 20] = 20
    return {'seqs' : seq_num, 'labels' : labels }
def tf_cov(x,w=None):
    if w is None:
        num_points = tf.cast(tf.shape(x)[0],tf.float32) - 1
        x_mean = tf.reduce_mean(x, axis=0, keep_dims=True)
        x = (x - x_mean)
    else:
        num_points = tf.reduce_sum(w) - tf.sqrt(tf.reduce_mean(w))
        x_mean = tf.reduce_sum(x * w[:,None], axis=0, keepdims=True) / num_points
        x = (x - x_mean) * tf.sqrt(w[:,None])
    return tf.matmul(tf.transpose(x),x)/num_points
fp = open(sys.argv[1] + ".alignments", "r")
pair2a3m = {}
pair = ""
pairs = []
a3m = []
for line in fp:
    if line[0] == ">":
        if pair and a3m:
            pair2a3m[pair] = a3m
            pairs.append(pair)
        pair = line[0:-1]
        a3m = []
    else:
        a3m.append(line)
fp.close()



if pair and a3m:
    pair2a3m[pair] = a3m
    pairs.append(pair)
    print(pair2a3m)
config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.9))
with tf.Graph().as_default():
    x = tf.compat.v1.placeholder(tf.uint8,shape=(None,None),name="x")
    x_shape = tf.shape(x)
    x_nr = x_shape[0]
    x_nc = x_shape[1]
    x_ns = 21
    x_msa = tf.one_hot(x,x_ns)
    x_cutoff = tf.cast(x_nc,tf.float32) * 0.8
    x_pw = tf.tensordot(x_msa, x_msa, [[1,2], [1,2]])
    x_cut = x_pw > x_cutoff
    x_weights = 1.0/tf.reduce_sum(tf.cast(x_cut, dtype=tf.float32),-1)
    x_feat = tf.reshape(x_msa,(x_nr,x_nc*x_ns))
    x_c = tf_cov(x_feat,x_weights) + tf.eye(x_nc*x_ns)*4.5/tf.sqrt(tf.reduce_sum(x_weights))
    x_c_inv = tf.linalg.inv(x_c)
    x_w = tf.reshape(x_c_inv,(x_nc,x_ns,x_nc,x_ns))
    x_wi = tf.sqrt(tf.reduce_sum(tf.square(x_w[:,:-1,:,:-1]),(1,3))) * (1-tf.eye(x_nc))
# APC
#x_ap = tf.reduce_sum(x_wi,0,keepdims=True) * tf.reduce_sum(x_wi,1,keepdims=True) / tf.reduce_sum(x_wi)
#x_wip = (x_wi - x_ap) * (1-tf.eye(x_nc))
    with tf.compat.v1.Session(config=config) as sess:
        results = []
        get_pairs = []
        for pair in pairs:
            print(pair)
            msa = parse_a3m(pair2a3m[pair])
            try:
                wip = sess.run(x_wi,{x:msa['seqs']})
                results.append(wip.astype(np.float16))
                get_pairs.append(pair)
                print(wip)
            except tf.errors.ResourceExhaustedError as e:
                pass
        np.savez_compressed("try2", *results, names=get_pairs)
        rp = open("try2" + ".log","w")
        for pair in get_pairs:
            rp.write(pair + "\n")
        rp.close()


引用chatgpt部分指引作答:
代码中有几个可能导致生成 "nan" 的问题:

计算协方差矩阵时,如果没有传入权重(即 w=None),会计算所有序列的协方差矩阵,这可能会导致矩阵奇异,从而在计算逆矩阵时出现 "nan"。可以尝试传入权重来解决这个问题。

如果权重 w 中有全 0 的行或列,那么在计算协方差矩阵时,分母为 0,也会导致出现 "nan"。

如果权重 w 中有负数,那么在计算平方根时会出现负数,从而出现 "nan"。

在计算 x_wi 时,如果分母为 0,也会出现 "nan"。可能的原因是 msa['seqs'] 中的所有序列之间都没有相似度超过阈值的配对,导致 x_wi 中所有元素为 0。这个问题可能需要在检查数据的前处理过程中查找原因。

引用chatGPT作答,您可以尝试调整代码中的一些超参数,比如 per_process_gpu_memory_fraction 参数,将其设为一个更小的值。此外,您可以尝试在代码运行之前清除 TensorFlow 的缓存,方法是在代码中添加以下代码:

from tensorflow.keras import backend as K
K.clear_session()