努力学了 MCTS,但这种形式的 MCTS 是第一次见:它没有使用显式的树结构,能否讲解一下?
def search(self, board):
    """
    Run one recursive MCTS simulation from ``board`` and update the
    cached per-state statistics (always stored from White's perspective;
    the board is assumed canonicalised so the player to move is WHITE —
    the colour flip below maintains this invariant for the recursion).

    Note: instead of an explicit tree of node objects, the "tree" lives in
    dictionaries keyed by the board's string representation
    (``Game_End``, ``Pi``, ``Actions``, ``N``, ``Qsa``, ``Nsa``, ...).

    :param board: current board state (player to move is WHITE)
    :return: the negated value of ``board``, propagated to the parent
             (roughly in [-1, 1]); negation flips the perspective per ply
    """
    board_key = self.game.to_string(board)

    # Terminal check: cache the game result for this state (leaf node).
    if board_key not in self.Game_End:
        self.Game_End[board_key] = self.game.get_game_ended(board, WHITE)
    if self.Game_End[board_key] != 0:
        # Game is decided; return the result negated for the parent's view.
        return -self.Game_End[board_key]

    # Expansion: first visit of this state.
    if board_key not in self.Pi:
        # Neural network predicts a move-probability vector and a value
        # v in [-1, 1] for the position.
        self.Pi[board_key], v = self.nnet.predict(board)
        # Mask the policy down to legal moves (always for WHITE).
        self.Pi[board_key], legal_actions = self.game.get_valid_actions(board, WHITE, self.Pi[board_key])
        # Cache the legal actions for this state.
        self.Actions[board_key] = legal_actions
        self.N[board_key] = 0
        # NOTE(review): this stores a state-level (non-tuple) key in Qsa;
        # it is never read back in this method — confirm it is intentional.
        self.Qsa[board_key] = 0
        return -v

    legal_actions = self.Actions[board_key]

    # Convert per-square probabilities into per-action probabilities.
    # An action a = (start, end, arrow); its (unnormalised) probability is
    # the product of its three component probabilities, accumulated as a
    # sum of logs and then passed through a softmax.
    squares = self.game.board_size ** 2
    pi = self.Pi[board_key]
    log_probs = []
    for a in legal_actions:
        p = 0.0
        for i in (0, 1, 2):
            # Legal components must have strictly positive probability,
            # otherwise math.log would fail below.
            assert pi[a[i] + i * squares] > 0
            p += math.log(pi[a[i] + i * squares])
        log_probs.append(p)
    exp_p = np.exp(np.array(log_probs))
    psa = exp_p / exp_p.sum()

    # Selection: PUCT upper confidence bound
    #   uct = Q(s,a) + Cpuct * P(s,a) * sqrt(N(s)) / (1 + N(s,a))
    best_uct = -float('inf')
    best_action = -1
    for i, a in enumerate(legal_actions):
        # (board string, start square, end square, arrow square)
        sa = (board_key, a[0], a[1], a[2])
        if sa in self.Qsa:
            uct = self.Qsa[sa] + self.args.Cpuct * psa[i] * math.sqrt(self.N[board_key]) \
                / (1 + self.Nsa[sa])
        else:
            # Unvisited action: Q defaults to 0; EPS keeps the exploration
            # bonus non-zero even when N(s) == 0.
            uct = self.args.Cpuct * psa[i] * math.sqrt(self.N[board_key] + EPS)
        if uct > best_uct:
            best_uct = uct
            best_action = a

    a = best_action
    # Apply the chosen action; next_player flips (becomes BLACK).
    next_board, next_player = self.game.get_next_state(board, WHITE, a)
    # Flip the board colours so the recursion always sees WHITE to move.
    next_board = self.game.get_transformed_board(next_board, next_player)
    v = self.search(next_board)

    # Backpropagation: running-average update of Q(s,a) and visit counts.
    sa = (board_key, a[0], a[1], a[2])
    if sa in self.Qsa:
        self.Qsa[sa] = (self.Nsa[sa] * self.Qsa[sa] + v) / (self.Nsa[sa] + 1)
        self.Nsa[sa] += 1
    else:
        self.Qsa[sa] = v
        self.Nsa[sa] = 1

    # Marginal visit counts per action component (start / end / arrow square).
    self.N_start[(board_key, a[0])] = self.N_start.get((board_key, a[0]), 0) + 1
    self.N_end[(board_key, a[1])] = self.N_end.get((board_key, a[1]), 0) + 1
    self.N_arrow[(board_key, a[2])] = self.N_arrow.get((board_key, a[2]), 0) + 1
    self.N[board_key] += 1
    return -v
可以和“神经网络连子棋”的 MCTS 实现进行对比。(mark:待补充)