Problem with a hands-on deep-learning Chinese chatbot

Symptoms and background of the problem
Relevant code (please do not paste screenshots)
Run results and error message
My approach and what I have tried
The result I want to achieve
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import numpy as np
import re
import os
from datetime import datetime
import io
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import unicodedata
from matplotlib.font_manager import FontProperties
font = FontProperties(fname="/Library/Fonts/Songti.ttc",size=8)


def process_cut(source_path, cut_path=None):
    # Parse the raw corpus: lines starting with "E" separate dialogues,
    # lines starting with "M" are utterances of the current dialogue.
    convs = []
    with open(source_path, 'r', encoding='utf8') as f:
        complete_dialog = []
        for line in f:
            line = line.strip('\n')
            # strip whitespace and punctuation
            line = re.sub(r"[\s+\.\!\/_,$%?^*(+\"\']+|[+!,。?~@#.&*()“”]+", "", line)
            if line == "":
                continue
            if line[0] == "E":
                if complete_dialog:
                    convs.append(complete_dialog)
                    complete_dialog = []
            elif line[0] == "M":
                complete_dialog.append(line[1:])
        if complete_dialog:
            # keep the last dialogue if the file does not end with an "E" line
            convs.append(complete_dialog)
    return convs


def question_answer(convs):
    questions = []
    answers = []
    for conv in convs:
        if len(conv) == 1:
            continue
        if len(conv) % 2 !=0 :
            conv = conv[:-1]
        for i in range(len(conv)):
            # even-indexed utterances are questions, odd-indexed ones are answers;
            # keep <start>/<end> as separate whitespace-delimited tokens
            if i % 2 == 0:
                questions.append("<start> " + " ".join(conv[i]) + " <end>")
            else:
                answers.append("<start> " + " ".join(conv[i]) + " <end>")
    return questions,answers


def tokenize(datas):
    tokenizer = keras.preprocessing.text.Tokenizer(filters="")
    tokenizer.fit_on_texts(datas)
    voc_li = tokenizer.texts_to_sequences(datas)
    voc_li = keras.preprocessing.sequence.pad_sequences(voc_li,padding="post")
    return voc_li,tokenizer


class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,enc_units,batch_sz):
        super(Encoder,self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru = keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer="glorot_uniform"
        )
    @tf.function
    def call(self,x,hidden):
        x = self.embedding(x)
        output, state = self.gru(x,initial_state=hidden)
        return output,state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz,self.enc_units))


class BahdanauAttentionMechanism(tf.keras.layers.Layer):
    def __init__(self,units):
        super(BahdanauAttentionMechanism,self).__init__()
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.v = layers.Dense(1)
    @tf.function
    def call(self,query,values):
        hidden_with_time_axis = tf.expand_dims(query,1)
        score = self.v(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score,axis=1)
        context_vector = attention_weights * values
        context_vector = tf.math.reduce_sum(context_vector,axis=1)
        return context_vector,attention_weights


class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_sz):
        super(Decoder,self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = layers.Embedding(vocab_size,embedding_dim)
        self.gru = layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer="glorot_uniform"
        )
        self.fc = layers.Dense(vocab_size)
        self.attention = BahdanauAttentionMechanism(self.dec_units)
    @tf.function
    def call(self,x,hidden,enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # concatenate the context vector with the embedded input along the feature axis
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights


def loss(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    loss_value = loss_obj(real, pred)
    mask = tf.cast(mask, dtype=loss_value.dtype)
    loss_value *= mask
    return tf.math.reduce_mean(loss_value)


def grad_loss(q,a,q_hidden,encoder,decoder,q_index,BATCH_SIZE):
    loss_value = 0
    with tf.GradientTape() as tape:
        q_output, q_hidden = encoder(q, q_hidden)
        a_hidden = q_hidden
        a_input = tf.expand_dims(
            [q_index.word_index["<start>"]]*BATCH_SIZE,1)
        for vector in range(1, a.shape[1]):
            predictions, a_hidden, _ = decoder(a_input, a_hidden, q_output)
            # compare predictions with the target token at this timestep
            loss_value += loss(a[:, vector], predictions)
            # teacher forcing: feed the ground-truth token as the next decoder input
            a_input = tf.expand_dims(a[:, vector], 1)
    batch_loss = loss_value / int(a.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    return batch_loss, tape.gradient(loss_value, variables)


def optimizer_loss(q,a,q_hidden,encoder,decoder,q_index,BATCH_SIZE,optimizer):
    batch_loss, grads = grad_loss(q,a,q_hidden,encoder,decoder,q_index,BATCH_SIZE)
    variables = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(grads, variables))
    return batch_loss


def source_data(source_path):
    convs = process_cut(source_path,None)
    questions, answers = question_answer(convs)
    return questions, answers

def train_model(q_hidden, encoder, decoder, q_index, BATCH_SIZE, dataset, steps_per_epoch, optimizer, checkpoint, checkpoint_prefix, summary_writer):
    i = 0
    EPOCHS = 200
    for epoch in range(EPOCHS):
        start = time.time()
        q_hidden = encoder.initialize_hidden_state()
        total_loss = 0
        for (batch, (q, a)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = optimizer_loss(q, a, q_hidden, encoder, decoder, q_index, BATCH_SIZE, optimizer)
            total_loss += batch_loss
            with summary_writer.as_default():
                tf.summary.scalar("batch loss", batch_loss.numpy(), step=epoch)
            if batch % 100 == 0:
                print("Epoch {}, batch {}, loss: {:.4f}".format(epoch + 1, batch + 1, batch_loss.numpy()))
        # per-epoch logging, checkpointing and timing (outside the batch loop)
        with summary_writer.as_default():
            tf.summary.scalar("total loss", total_loss / steps_per_epoch, step=epoch)
        if (epoch + 1) % 100 == 0:
            i += 1
            print("==== Saving model checkpoint #{} ====".format(i))
            checkpoint.save(file_prefix=checkpoint_prefix)
        print("Epoch {}, total loss: {:.4f}".format(epoch + 1, total_loss / steps_per_epoch))
        print("Epoch time: {:.1f} s".format(time.time() - start))


def preprocess_question(question):
    question = "<start> "+" ".join(question) + " <end>"
    return question

def max_length(vectors):
    return max(len(vector) for vector in vectors)

def convert(index, vectors):
    for vector in vectors:
        if vector != 0:
            print("{}-->{}".format(vector, index.index_word[vector]))


if __name__ == "__main__":
    stamp = datetime.now().strftime("%Y%m%d-%H:%M:%S")
    source_path ="./chat/chat data.py"
    convs = process_cut(source_path,None)
    questions,answers = question_answer(convs)
    q_vec , q_index = tokenize(questions)
    a_vec, a_index = tokenize(answers)
    q_max_len = max_length(q_vec)
    a_max_len = max_length(a_vec)
    convert(q_index, q_vec[0])
    BUFFER_SIZE = len(q_vec)
    print("buffer size:",BUFFER_SIZE)
    BATCH_SIZE = 64
    step_per_epoch = len(q_vec)//BATCH_SIZE
    embedding_dim = 256
    units = 1024
    q_vocab_size = len(q_index.word_index)+1
    a_vocab_size = len(a_index.word_index)+1
    dataset = tf.data.Dataset.from_tensor_slices((q_vec, a_vec)).shuffle(BUFFER_SIZE)
    q_batch, a_batch = next(iter(dataset))
    print("question batch:", q_batch.shape)
    print("answer batch:", a_batch.shape)
    log_path = "logs3\\chat\\"
    summary_writer = tf.summary.create_file_writer(log_path)
    tf.summary.trace_on(graph=True, profiler=True)
    encoder = Encoder(
        q_vocab_size,
        embedding_dim,
        units,
        BATCH_SIZE)
    q_hidden = encoder.initialize_hidden_state()
    print(q_hidden)
    q_output, q_hidden = encoder.call(q_batch, q_hidden)
    with summary_writer.as_default():
        tf.summary.trace_export(name="chat-en", step=0, profiler_outdir=log_path)

        tf.summary.trace_on(graph=True, profiler=True)
        attention_layer =BahdanauAttentionMechanism(10)
        attention_result, attention_weights = attention_layer.call(q_hidden, q_output)
    with summary_writer.as_default():
        tf.summary.trace_export(name="chat-atten", step=0, profiler_outdir=log_path)
        tf.summary.trace_on(graph=True, profiler=True)
        decoder =Decoder(
            a_vocab_size,
            embedding_dim,
            units,
            BATCH_SIZE
        )
        a_output, _, _ = decoder.call(
            tf.random.uniform((64,1)),
            q_hidden,
            q_output
        )
    with summary_writer.as_default():
        tf.summary.trace_export(name="chat-dec", step=0, profiler_outdir=log_path)
        optimizer = tf.keras.optimizers.Adam()
        checkpoint_dir = "./models"
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        checkpoint = tf.train.Checkpoint(
            optimizer=optimizer,
            encoder=encoder,
            decoder=decoder
        )

Why do I get the error: ValueError: Input 0 of layer "gru" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (17, 256)?

Look at the message itself: it already tells you that ndim=3 is expected but ndim=2 was found, with full shape (17, 256). The GRU layer wants a 3-D input of shape (batch, time steps, features), but it is receiving a 2-D tensor, which here looks like a single padded question of length 17 after the 256-dimensional embedding, i.e. the batch dimension is missing.
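
A minimal sketch of a likely fix, assuming the missing batch dimension comes from the dataset never being batched (from_tensor_slices followed only by shuffle yields single examples, so q_batch has shape (q_max_len,) rather than (BATCH_SIZE, q_max_len)):

# Assumption: batching the dataset restores the missing dimension, so the
# encoder's Embedding output becomes (BATCH_SIZE, q_max_len, embedding_dim),
# which is the 3-D input the GRU layer expects.
dataset = tf.data.Dataset.from_tensor_slices((q_vec, a_vec)) \
    .shuffle(BUFFER_SIZE) \
    .batch(BATCH_SIZE, drop_remainder=True)

q_batch, a_batch = next(iter(dataset))
print("question batch:", q_batch.shape)  # expect (64, q_max_len) instead of (q_max_len,)
print("answer batch:", a_batch.shape)

With drop_remainder=True every batch has exactly BATCH_SIZE examples, which also matches the (BATCH_SIZE, enc_units) hidden state returned by encoder.initialize_hidden_state().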