解决无树结构的蒙特卡洛树搜索

努力学了MCTS,但这种形式的MCTS第一次见,没有用树结构,能否讲解
def search(self, board):
    """
    Run one recursive MCTS simulation from the given board state, updating the
    per-state node statistics in place (no explicit tree structure: nodes are
    keyed by the board's string form). Statistics are always stored from
    White's perspective — the board is colour-flipped between plies so the
    player to move is always WHITE.

    :param board: current board state (player to move is WHITE)
    :return: the negated value of this state, i.e. the value from the
             parent (opposing) player's point of view
    """
    board_key = self.game.to_string(board)

    # Terminal check, cached per state: non-zero means the game is decided.
    if board_key not in self.Game_End:
        self.Game_End[board_key] = self.game.get_game_ended(board, WHITE)
    if self.Game_End[board_key] != 0:
        # Terminal node: propagate the outcome back up, negated for the parent.
        return -self.Game_End[board_key]

    # Leaf expansion: first visit to this state.
    if board_key not in self.Pi:
        # Neural net predicts a per-point policy and a value v in [-1, 1].
        self.Pi[board_key], v = self.nnet.predict(board)
        # Mask the policy to White's legal moves and collect those moves.
        self.Pi[board_key], legal_actions = self.game.get_valid_actions(board, WHITE, self.Pi[board_key])

        self.Actions[board_key] = legal_actions  # legal actions at this state
        self.N[board_key] = 0                    # state visit count
        self.Qsa[board_key] = 0                  # state value placeholder
        return -v

    legal_actions = self.Actions[board_key]

    # Convert the per-point policy into a per-action probability. An action is
    # the triple (start, end, arrow); its score is the product of its three
    # point probabilities, computed in log space and normalised by a softmax.
    board_area = self.game.board_size ** 2
    log_probs = []
    for action in legal_actions:
        log_p = 0.0
        for i in (0, 1, 2):
            point_prob = self.Pi[board_key][action[i] + i * board_area]
            assert point_prob > 0
            log_p += math.log(point_prob)
        log_probs.append(log_p)
    log_probs = np.array(log_probs)
    # Subtract the max before exponentiating: same softmax, no overflow, and
    # exp is evaluated only once.
    exp_p = np.exp(log_probs - log_probs.max())
    psa = exp_p / exp_p.sum()

    # PUCT selection: U(s,a) = Q(s,a) + Cpuct * P(s,a) * sqrt(N(s)) / (1 + N(s,a)).
    best_uct = -float('inf')
    best_action = -1
    for i, action in enumerate(legal_actions):
        sa = (board_key, action[0], action[1], action[2])
        if sa in self.Qsa:
            uct = self.Qsa[sa] + self.args.Cpuct * psa[i] * math.sqrt(self.N[board_key]) \
                  / (1 + self.Nsa[sa])
        else:
            # Unvisited action: EPS keeps the exploration term non-zero
            # even when N(s) == 0.
            uct = self.args.Cpuct * psa[i] * math.sqrt(self.N[board_key] + EPS)
        if uct > best_uct:
            best_uct = uct
            best_action = action

    a = best_action
    # Apply the chosen move; next_player flips to the opponent (BLACK).
    next_board, next_player = self.game.get_next_state(board, WHITE, a)
    # Flip the board colours so the child is again searched as WHITE.
    next_board = self.game.get_transformed_board(next_board, next_player)

    v = self.search(next_board)

    # Back-up: running-average update of Q(s,a) and the visit counts.
    sa = (board_key, a[0], a[1], a[2])
    if sa in self.Qsa:
        self.Qsa[sa] = (self.Nsa[sa] * self.Qsa[sa] + v) / (self.Nsa[sa] + 1)
        self.Nsa[sa] += 1
    else:
        self.Qsa[sa] = v
        self.Nsa[sa] = 1

    # Marginal visit counts for each component of the move (start, end, arrow).
    self.N_start[(board_key, a[0])] = self.N_start.get((board_key, a[0]), 0) + 1
    self.N_end[(board_key, a[1])] = self.N_end.get((board_key, a[1]), 0) + 1
    self.N_arrow[(board_key, a[2])] = self.N_arrow.get((board_key, a[2]), 0) + 1

    self.N[board_key] += 1

    return -v

可以和“神经网络连子棋”进行对比

mark