Why does the reward in my DDPG suddenly drop during training? Does anyone know what causes this? Is something wrong in my actor or critic?

[images: training reward curves showing the sudden drop]

import numpy as np
import tensorflow as tf  # written against the TF 1.x graph API

# hyperparameters (illustrative values only -- the constants were not in my
# snippet above, so adjust these to your own task)
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
TAU = 0.01       # soft-update rate for the target networks
GAMMA = 0.9      # reward discount factor
LR_A = 0.001     # actor learning rate
LR_C = 0.002     # critic learning rate
HIDDEN_1_SIZE = 64
HIDDEN_2_SIZE = 64
HIDDEN_3_SIZE = 64

class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound, model_file=None):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()
        self.a_replace_counter, self.c_replace_counter = 0, 0

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement
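        # Polyak averaging: target <- (1 - TAU) * target + TAU * eval, so the
        # frozen target nets trail the eval nets slowly and keep the TD target stable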
        self.soft_replace = [[tf.assign(ta, (1 - TAU) * ta + TAU * ea), tf.assign(tc, (1 - TAU) * tc + TAU * ec)]
                             for ta, ea, tc, ec in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.R + GAMMA * q_
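        # note: q_target above has no terminal-state masking; for episodic tasks
        # the usual form is R + GAMMA * (1 - done) * q_, and bootstrapping across
        # episode boundaries is a common source of sudden jumps in learned values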
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if model_file is not None:
            self.restore_model(model_file)

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
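
    def restore_model(self, model_file):
        # referenced in __init__ but not shown in my snippet; presumably just
        # a thin wrapper around the Saver
        self.saver.restore(self.sess, model_file)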

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        # sample only from transitions actually stored so far; drawing from the
        # whole zero-initialized buffer before it fills trains on fake data
        indices = np.random.choice(min(self.pointer, MEMORY_CAPACITY), size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        # the actor update feeds only states: q in a_loss is wired to self.a,
        # so the critic is evaluated at the actor's current policy output
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
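        # row layout: [ s (s_dim) | a (a_dim) | r (1) | s_ (s_dim) ]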
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.1)
            init_b = tf.constant_initializer(0.1)

            net = tf.layers.dense(s, HIDDEN_1_SIZE, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
            net_2 = tf.layers.dense(net, HIDDEN_2_SIZE, activation=tf.nn.relu,
                                    kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)
            net_3 = tf.layers.dense(net_2, HIDDEN_2_SIZE, activation=tf.nn.relu,
                                    kernel_initializer=init_w, bias_initializer=init_b, name='l3', trainable=trainable)
            a = tf.layers.dense(net_3, self.a_dim, activation=tf.nn.tanh,
                                kernel_initializer=init_w, bias_initializer=init_b, name='a', trainable=trainable)
            # scale the tanh output to the environment's action range;
            # multiplying by 1 here leaves self.a_bound unused
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.1)
            init_b = tf.constant_initializer(0.1)

            # trainable=trainable was missing on every layer below; harmless in
            # practice because the optimizers use explicit var_list, but the
            # target critic should not be created as trainable
            hidden_1 = tf.layers.dense(s, HIDDEN_1_SIZE, activation=tf.nn.tanh,
                                       kernel_initializer=init_w, bias_initializer=init_b,
                                       name='hidden_1', trainable=trainable)
            hidden_2 = tf.layers.dense(hidden_1, HIDDEN_2_SIZE, activation=tf.nn.tanh,
                                       kernel_initializer=init_w, bias_initializer=init_b,
                                       name='hidden_2', trainable=trainable)
            with tf.variable_scope('hidden_3'):
                # this layer merges the state pathway with the action input
                w2_s = tf.get_variable('w2_s', [HIDDEN_2_SIZE, HIDDEN_3_SIZE], initializer=init_w, trainable=trainable)
                w2_a = tf.get_variable('w2_a', [self.a_dim, HIDDEN_3_SIZE], initializer=init_w, trainable=trainable)
                b2 = tf.get_variable('b2', [1, HIDDEN_3_SIZE], initializer=init_b, trainable=trainable)
                hidden_3 = tf.nn.relu(tf.matmul(hidden_2, w2_s) + tf.matmul(a, w2_a) + b2)
            q = tf.layers.dense(hidden_3, 1, kernel_initializer=init_w, bias_initializer=init_b,
                                name='q', trainable=trainable)
            return q
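
For context, here is a minimal sketch of the kind of training loop that drives this class (the gym environment, the Gaussian noise schedule, and the episode constants are illustrative placeholders, not my exact script). I include it because exploration noise that never decays, or learning that starts before the buffer holds real transitions, are common causes of reward curves that suddenly collapse:

import gym

MAX_EPISODES = 200   # placeholder
MAX_EP_STEPS = 200   # placeholder

env = gym.make('Pendulum-v1')   # any continuous-action env works here
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

noise_scale = 3.0  # stddev of the Gaussian exploration noise
for ep in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0.0
    for t in range(MAX_EP_STEPS):
        # act with exploration noise, clipped back into the valid range
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, noise_scale), -a_bound, a_bound)
        s_, r, done, _ = env.step(a)
        ddpg.store_transition(s, a, r, s_)

        # learn only once the buffer is full, and decay the noise slowly;
        # noise that never decays (or is cut to zero too early) often shows
        # up as a reward curve that climbs and then abruptly collapses
        if ddpg.pointer > MEMORY_CAPACITY:
            noise_scale *= 0.9995
            ddpg.learn()

        s = s_
        ep_reward += r
    print('episode %d  reward %.1f  noise %.3f' % (ep, ep_reward, noise_scale))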