The box-moving task takes place on a 5x5 field. Red is agent a, blue is agent b, and green is the target box. Action 0 moves up, action 1 moves down, action 2 moves left, and action 3 moves right. The box can only be moved when agent a is on the left side of the box, agent b is on the right side of the box, and both agents execute the same action. The goal of the task is for the two agents to carry the box to the position at the top of the field. An agent receives a reward for approaching the box, and the agents also receive a reward for carrying the box closer to the goal position. Agent a starts at (5, 1), agent b at (5, 5), the box at (4, 3), and the goal is at (0, 3). Implement a Q-learning algorithm that treats the two agents as one large joint agent: the algorithm takes both agents' states as input and outputs both agents' actions at the same time. What would the code look like? Part of the code has already been written below.
import copy
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import cv2
class EnvMoveBox:
def __init__(self):
self.original_field = [[1, 1, 5, 5, 5, 1, 1],
[1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 4, 0, 0, 1],
[1, 2, 0, 0, 0, 3, 1],
[1, 1, 1, 1, 1, 1, 1]]
self.action_map = {0:(-1,0), 1:(1,0), 2:(0,-1), 3:(0,1)}
def reset(self):
self.field = copy.deepcopy(self.original_field)
self.a_pos = (5, 1)
self.b_pos = (5, 5)
self.box_pos = (4, 3)
self.goal_pos = (0, 3)
def move(self, pos, next_pos):
if 0 <= next_pos[0] <= 6 and 0 <= next_pos[1] <= 6 and \
self.field[next_pos[0]][next_pos[1]] not in [1, 2, 3, 4]:
self.field[pos[0]][pos[1]], self.field[next_pos[0]][next_pos[1]] = \
self.field[next_pos[0]][next_pos[1]], self.field[pos[0]][pos[1]]
return next_pos
else:
return pos
def step(self, action_list):
a_box = abs(self.a_pos[0] - self.box_pos[0]) + abs(self.a_pos[1] - (self.box_pos[1] - 1))
b_box = abs(self.b_pos[0] - self.box_pos[0]) + abs(self.b_pos[1] - (self.box_pos[1] + 1))
box_goal = abs(self.box_pos[0] - self.goal_pos[0]) + abs(self.box_pos[1] - self.goal_pos[1])
if self.a_pos == (self.box_pos[0], self.box_pos[1] - 1) and \
self.b_pos == (self.box_pos[0], self.box_pos[1] + 1) and \
action_list[0] == action_list[1]:
action_map = self.action_map[action_list[0]]
self.a_pos = self.move(self.a_pos,
(self.a_pos[0] + action_map[0], self.a_pos[1] + action_map[1]))
self.b_pos = self.move(self.b_pos,
(self.b_pos[0] + action_map[0], self.b_pos[1] + action_map[1]))
self.box_pos = self.move(self.box_pos,
(self.box_pos[0] + action_map[0], self.box_pos[1] + action_map[1]))
else:
action_map = self.action_map[action_list[0]]
self.a_pos = self.move(self.a_pos,
(self.a_pos[0] + action_map[0], self.a_pos[1] + action_map[1]))
action_map = self.action_map[action_list[1]]
self.b_pos = self.move(self.b_pos,
(self.b_pos[0] + action_map[0], self.b_pos[1] + action_map[1]))
_a_box = abs(self.a_pos[0] - self.box_pos[0]) + abs(self.a_pos[1] - (self.box_pos[1] - 1))
_b_box = abs(self.b_pos[0] - self.box_pos[0]) + abs(self.b_pos[1] - (self.box_pos[1] + 1))
_box_goal = abs(self.box_pos[0] - self.goal_pos[0]) + abs(self.box_pos[1] - self.goal_pos[1])
if self.box_pos == self.goal_pos:
reward = 100
done = True
else:
reward = a_box - _a_box + b_box - _b_box + box_goal - _box_goal
done = False
return (self.a_pos, self.b_pos, self.box_pos), reward, done, {}
def render(self):
obs = np.ones((7 * 20, 7 * 20, 3))
for i in range(7):
for j in range(7):
if self.field[i][j] == 1:
cv2.rectangle(obs, (j*20, i*20), (j*20+20, i*20+20), (0, 0, 0), -1)
cv2.rectangle(obs, (self.a_pos[1]*20, self.a_pos[0]*20), (self.a_pos[1]*20+20, self.a_pos[0]*20+20), (0,0,255), -1)
cv2.rectangle(obs, (self.b_pos[1]*20, self.b_pos[0]*20), (self.b_pos[1]*20+20, self.b_pos[0]*20+20), (255,0,0), -1)
cv2.rectangle(obs, (self.box_pos[1]*20, self.box_pos[0]*20), (self.box_pos[1]*20+20, self.box_pos[0]*20+20), (0,255,0), -1)
cv2.imshow('Move Box', obs)
cv2.waitKey(100)
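Before training, the environment can be sanity-checked with a short random rollout. This is a minimal sketch that only assumes the EnvMoveBox class above and the imports already listed:
# Minimal sanity check: a few random joint actions in the environment defined above.
env = EnvMoveBox()
env.reset()
for t in range(20):
    action_list = [random.randint(0, 3), random.randint(0, 3)]  # random actions for agents a and b
    (a_pos, b_pos, box_pos), reward, done, _ = env.step(action_list)
    print(t, a_pos, b_pos, box_pos, reward, done)
    if done:
        break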
Quoting ChatGPT for this answer. The following code addresses the question above and mainly implements the Q-learning algorithm:
class QLearning:
def __init__(self, alpha, gamma, epsilon):
        self.Q = {}  # stores the Q values
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration probability
def get_action(self, state):
        if random.random() < self.epsilon:  # random exploration
action_a = random.randint(0, 3)
action_b = random.randint(0, 3)
        else:  # choose the action with the highest Q value
actions = []
q_values = []
for action_a in range(4):
for action_b in range(4):
actions.append((action_a, action_b))
q_values.append(self.Q.get((state, (action_a, action_b)), 0))
max_q = max(q_values)
count = q_values.count(max_q)
            if count > 1:  # several actions tie for the max Q value, pick one at random
best_actions = [i for i in range(len(actions)) if q_values[i] == max_q]
index = random.choice(best_actions)
else:
index = q_values.index(max_q)
action_a, action_b = actions[index]
return action_a, action_b
def update_Q(self, state, action, reward, next_state):
        max_q = max([self.Q.get((next_state, (a, b)), 0) for a in range(4) for b in range(4)])  # max Q value over all joint actions in the next state
self.Q[(state, action)] = (1 - self.alpha) * self.Q.get((state, action), 0) + self.alpha * (reward + self.gamma * max_q)
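The answer above defines the agent class but not the training loop. A minimal sketch of how it could be wired to the EnvMoveBox environment is given below; the episode count, step cap, and hyperparameter values are illustrative assumptions and not part of the original answer:
# Illustrative training loop for the QLearning class above; hyperparameters and limits are assumptions.
env = EnvMoveBox()
agent = QLearning(alpha=0.1, gamma=0.95, epsilon=0.1)
for episode in range(5000):
    env.reset()
    state = (env.a_pos, env.b_pos, env.box_pos)
    total_reward, done, steps = 0, False, 0
    while not done and steps < 200:       # cap the episode length so it always terminates
        action = agent.get_action(state)  # joint action (action_a, action_b)
        next_state, reward, done, _ = env.step(list(action))
        agent.update_Q(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        steps += 1
    if episode % 500 == 0:
        print("episode", episode, "return", total_reward)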
Here is my implementation based on the Q-learning algorithm, including the state representation, reward function, policy function, action selection, and Q-table update:
import numpy as np
import random
# The state is 25*25*25*25*4*4 in size, representing the positions and actions of agents a and b
STATE_SIZE = (25, 25, 25, 25, 4, 4)
# size of the action set
ACTION_SIZE = 4
# learning rate
LEARNING_RATE = 0.1
# discount factor
DISCOUNT_FACTOR = 0.99
# exploration rate
EPSILON = 0.5
# decay speed of the exploration rate
EPSILON_DECAY = 0.999
# minimum exploration rate
EPSILON_MIN = 0.01
# number of training episodes
NUM_EPISODES = 10000
# State representation: the state consists of the positions and actions of agents a and b
# Convert states and actions to integers for convenient storage in the Q table
def state_to_int_state(state):
int_state = 0
for i in range(4):
int_state += state[i] * (25 ** i)
int_state += state[4] * 100 + state[5] * 10
return int_state
def action_to_int_action(action):
return action
# Reward function: rewards are given for reaching the box and for reaching the goal position, respectively
def get_reward(state, action):
reward = -0.1
a_pos = (state[0], state[1])
b_pos = (state[2], state[3])
box_pos = (4, 3)
goal_pos = (0, 3)
box_near_a = (abs(a_pos[0]-box_pos[0]) + abs(a_pos[1]-box_pos[1])) == 1
box_near_b = (abs(b_pos[0]-box_pos[0]) + abs(b_pos[1]-box_pos[1])) == 1
a_next_pos = (a_pos[0]+1, a_pos[1]) if action == 0 else \
(a_pos[0]-1, a_pos[1]) if action == 1 else \
(a_pos[0], a_pos[1]-1) if action == 2 else \
(a_pos[0], a_pos[1]+1)
b_next_pos = (b_pos[0]+1, b_pos[1]) if action == 0 else \
(b_pos[0]-1, b_pos[1]) if action == 1 else \
(b_pos[0], b_pos[1]-1) if action == 2 else \
(b_pos[0], b_pos[1]+1)
a_near_box = (abs(a_next_pos[0]-box_pos[0]) + abs(a_next_pos[1]-box_pos[1])) == 1
b_near_box = (abs(b_next_pos[0]-box_pos[0]) + abs(b_next_pos[1]-box_pos[1])) == 1
if a_near_box and b_near_box and a_next_pos[1] == b_next_pos[1] and \
((a_next_pos[1]==2 and a_next_pos[0]>b_next_pos[0]) or \
(a_next_pos[1]==4 and a_next_pos[0]<b_next_pos[0])):
reward = 10
elif a_near_box or b_near_box:
reward = 1
if box_near_a and box_near_b:
reward += 100
if box_pos == goal_pos:
reward += 1000
return reward
# Policy function: choose the next action based on the current state and the Q table
def policy(state, q_table):
int_state = state_to_int_state(state)
if random.uniform(0, 1) < EPSILON:
action_a = np.random.choice(range(ACTION_SIZE))
action_b = np.random.choice(range(ACTION_SIZE))
else:
action_a = np.argmax(q_table[int_state][:, :, :, state[3], state[4], state[5]])
action_b = np.argmax(q_table[int_state][state[0], state[1], state[2], :, state[4], state[5]])
return (action_a, action_b)
# Action selection: choose the next action with the policy function, and decay the exploration rate
def choose_action(state, q_table):
    global EPSILON  # EPSILON is reassigned below, so it must be declared global here
    action_a, action_b = policy(state, q_table)
    if EPSILON > EPSILON_MIN:
        EPSILON *= EPSILON_DECAY
return (action_a, action_b)
# Q-table update: update the Q table with the new state, the action, and the reward
def update_q_table(q_table, state, action, reward, next_state):
int_state = state_to_int_state(state)
int_next_state = state_to_int_state(next_state)
int_action = action_to_int_action(action)
    # Update the Q values for a and b one at a time, while taking the other agent's action into account
q_table[int_state][state[0], state[1], state[2], state[3], state[4], state[5], int_action] = \
q_table[int_state][state[0], state[1], state[2], state[3], state[4], state[5], int_action] + LEARNING_RATE * \
(reward + DISCOUNT_FACTOR * np.max(q_table[int_next_state][:, :, :, state[3], state[4], state[5]]) - \
q_table[int_state][state[0], state[1], state[2], state[3], state[4], state[5], int_action])
int_action_a, int_action_b = action_to_int_action((action[0], 0)), action_to_int_action((0, action[1]))
q_table[int_state][:, :, :, state[3], state[4], state[5], int_action_a] = \
q_table[int_state][:, :, :, state[3], state[4], state[5], int_action_a] + LEARNING_RATE * \
(reward + DISCOUNT_FACTOR * np.max(q_table[int_next_state][state[0], :, state[2], :, state[4], state[5]]) - \
q_table[int_state][:, :, :, state[3], state[4], state[5], int_action_a])
q_table[int_state][state[0], :, state[2], :, state[4], state[5], int_action_b] = \
q_table[int_state][state[0], :, state[2], :, state[4], state[5], int_action_b] + LEARNING_RATE * \
(reward + DISCOUNT_FACTOR * np.max(q_table[int_next_state][:, :, state[2], :, state[4], state[5]]) - \
q_table[int_state][state[0], :, state[2], :, state[4], state[5], int_action_b])
return q_table
# initialize the Q table
q_table = np.zeros((25**4, 25, 25, 25, 4, 4, ACTION_SIZE))
# start training
for i in range(NUM_EPISODES):
state = [5, 1, 5, 5, 0, 0]
done = False
while not done:
action = choose_action(state, q_table)
next_state = state.copy()
next_state[0] += 1 if action[0] == 0 else -1 if action[0] == 1 else 0
next_state[2] += 1 if action[1] == 0 else -1 if action[1] == 1 else 0
reward = get_reward(state, action)
q_table = update_q_table(q_table, state, action, reward, next_state)
state = next_state.copy()
done = (state[4] == 0 and state[0] == 0) or (state[4] == 1 and state[2] == 0)
print("Episode {} : {}".format(i, reward))
# visualize the result according to the Q table
a_pos = (5, 1)
b_pos = (5, 5)
box_pos = (4, 3)
goal_pos = (0, 3)
while box_pos != goal_pos:
state = [a_pos[0], a_pos[1], b_pos[0], b_pos[1], box_pos[0], box_pos[1]]
int_state = state_to_int_state(state)
action_a = np.argmax(q_table[int_state][:, :, :, state[3], state[4], state[5]])
action_b = np.argmax(q_table[int
Sorry, I'm a beginner, so the code above may contain bugs; please fix them yourself. Apologies.
The following continues from the code that was given:
class QLearningAgent:
    def __init__(self):
self.learning_rate = 0.8
self.discount_factor = 0.95
self.epsilon = 0.1
self.q_table = {}
self.actions = [0, 1, 2, 3]
for x1 in range(7):
for y1 in range(7):
for x2 in range(7):
for y2 in range(7):
for x_box in range(7):
for y_box in range(7):
state = ((x1, y1), (x2, y2), (x_box, y_box))
self.q_table[state] = {a: 0.0 for a in self.actions}
def get_action(self, state):
if np.random.rand() < self.epsilon:
action = np.random.choice(self.actions)
else:
state_action = self.q_table[state]
max_value = max(state_action.values())
action_list = [k for k, v in state_action.items() if v == max_value]
action = np.random.choice(action_list)
return action
def update_table(self, state, action, reward, next_state):
q_1 = self.q_table[state][action]
q_2 = reward + self.discount_factor * max(self.q_table[next_state].values())
self.q_table[state][action] += self.learning_rate * (q_2 - q_1)
def main():
env = EnvMoveBox()
agent = QLearningAgent()
global_step = 0
score_history = []
EPISODES = 40000
for episode in range(EPISODES):
done = False
score = 0
env.reset()
state = ((env.a_pos[0], env.a_pos[1]), (env.b_pos[0], env.b_pos[1]), (env.box_pos[0], env.box_pos[1]))
while not done:
action_a = agent.get_action(state)
action_b = agent.get_action(state)
action_list = [action_a, action_b]
next_state, reward, done, _ = env.step(action_list)
next_state = ((next_state[0][0], next_state[0][1]),
(next_state[1][0], next_state[1][1]),
(next_state[2][0], next_state[2][1]))
agent.update_table(state, action_a, reward, next_state)
agent.update_table(state, action_b, reward, next_state)
state = next_state
score += reward
global_step += 1
score_history.append(score)
if episode % 100 == 0:
print("episode : ", episode, " avg score : ", np.mean(score_history))
score_history = []
if __name__ == "__main__":
main()
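To watch what the trained agent does, a short greedy playback helper could be added and called at the end of main(). This is only a sketch under illustrative assumptions: play_greedy is not part of the original answer, it sets epsilon to 0 to disable exploration and caps the episode at max_steps:
# Illustrative greedy rollout; e.g. call play_greedy(env, agent) at the end of main() above.
def play_greedy(env, agent, max_steps=100):
    agent.epsilon = 0.0  # turn off exploration
    env.reset()
    state = (env.a_pos, env.b_pos, env.box_pos)
    for _ in range(max_steps):
        action_list = [agent.get_action(state), agent.get_action(state)]
        state, reward, done, _ = env.step(action_list)
        env.render()  # requires a display; remove this line when running headless
        if done:
            break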
Answer based on new Bing:
Q-learning is a value-based reinforcement learning method that learns from (state, action, reward, next state) tuples. In this code, we can treat the two agents as one big joint agent: their state and action information is merged, and the output is the action (move) for both agents.
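One common way to realize this "one big agent" idea is to encode the pair of individual actions as a single joint-action index between 0 and 15. A small illustrative sketch follows; these helpers are not part of the answer below, which instead applies one shared action index to both agents:
# Illustrative helpers: encode two 4-way actions as a single joint action and back.
def encode_joint_action(action_a, action_b):
    return action_a * 4 + action_b  # joint index in [0, 15]

def decode_joint_action(joint_action):
    return joint_action // 4, joint_action % 4  # (action_a, action_b)
With this encoding, the Q table holds 16 values per state and a single argmax over the joint index recovers both agents' actions.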
Here is the code implementation:
import copy
import numpy as np
import random
from collections import defaultdict
class QLearning:
    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env  # environment
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration rate
        self.q_table = defaultdict(lambda: [0]*4)  # initialize the Q table (4 action values per state)
def update_q_table(self, state, action, reward, next_state):
old_value = self.q_table[state][action]
next_max = max(self.q_table[next_state])
new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * next_max)
self.q_table[state][action] = new_value
def choose_action(self, state):
if random.uniform(0, 1) < self.epsilon:
return random.choice([0, 1, 2, 3])
else:
return np.argmax(self.q_table[state])
def train(self, num_episodes=1000):
episode_rewards = []
for episode in range(num_episodes):
            self.env.reset()  # reset the environment at the start of each episode
            state = tuple([self.env.a_pos, self.env.b_pos, self.env.box_pos])  # merged state information
total_reward = 0
while True:
                action_a_b = self.choose_action(state)  # choose one action index, applied to both agents
action_list = [action_a_b, action_a_b]
next_state, reward, done, _ = self.env.step(action_list)
next_state = tuple(next_state)
self.update_q_table(state, action_a_b, reward, next_state)
state = next_state
total_reward += reward
if done:
episode_rewards.append(total_reward)
break
return episode_rewards
def test(self):
        self.env.reset()  # start from the initial configuration
        state = tuple([self.env.a_pos, self.env.b_pos, self.env.box_pos])
path = [(self.env.a_pos, self.env.b_pos, self.env.box_pos)]
while True:
action_a_b = np.argmax(self.q_table[state])
action_list = [action_a_b, action_a_b]
next_state, _, done, _ = self.env.step(action_list)
next_state = tuple(next_state)
path.append(next_state)
state = next_state
if done:
break
return path
env = EnvMoveBox()
q_learning = QLearning(env)
episode_rewards = q_learning.train(num_episodes=5000)
path = q_learning.test()
print("Episode rewards:", episode_rewards)
print("Optimal path:", path)
The code above applies the Q-learning algorithm to the box-moving task. Before running it, the required libraries need to be installed: numpy, and opencv-python (cv2) for the environment's rendering; matplotlib is imported by the environment code but not actually used.
We treat the two agents as a single whole and merge their state information. Whenever an action is taken in a merged state, the Q table is updated following the Q-learning rule, and based on the current state and its Q values the agent either explores randomly with some probability or outputs the action with the maximum Q value. After training, the algorithm gives the Q value of each state and its corresponding best action, and the resulting optimal path can be used to move the box automatically.
Of course, many details still matter in a concrete implementation, for example how to define the reward function during training, how to tune hyperparameters such as the learning rate and exploration rate, and how to choose a better initialization of the Q table. These directly affect the convergence speed and performance of Q-learning. To further improve the reliability and robustness of the algorithm, the parameter settings and code need to be refined through practice and experience on the actual application.
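For example, one simple way to handle the exploration rate mentioned above is an exponentially decayed epsilon. A minimal sketch, where the constants are illustrative assumptions:
# Illustrative epsilon schedule: decay exploration towards a small floor over the episodes.
def epsilon_schedule(episode, eps_start=1.0, eps_min=0.05, decay=0.999):
    return max(eps_min, eps_start * (decay ** episode))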
The following answer was written jointly by the GPT-3.5 model and the blogger 波罗歌:
Q-learning is a value-based, off-policy reinforcement learning algorithm: it obtains an optimal policy by repeatedly updating the state-action value function (the Q function) and acting greedily with respect to it. Implementing Q-learning involves the following steps.
In this problem, the state consists of the positions of the two agents and the box, i.e. (a_pos, b_pos, box_pos); the actions are the four directions each agent can move in: 0 up, 1 down, 2 left, 3 right. Because the two agents must cooperate to move the box, we treat them as one joint agent: the algorithm takes both agents' states as input and outputs both agents' actions.
The state-action value function Q(s, a) gives the return of taking action a in state s. Since both agents must cooperate to move the box, the Q function here is keyed by the tuple (state, a_act, b_act), i.e. the board state together with the two agents' actions, and it stores a single value for each such joint state-action pair.
Given the state definition and the Q function, the Q table can be initialized according to the number of states; a dictionary is used for storage here. Some training parameters also need to be set, such as the discount factor gamma, the learning rate alpha, and the exploration rate epsilon.
The code is as follows:
class QLearning:
def __init__(self, alpha, gamma, epsilon):
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
self.qtable = {}
        # enumerate the coordinates of the 5x5 playable area; positions not listed here
        # (e.g. cells on the goal row) are handled lazily through dict.get(..., 0)
        positions = [(r, c) for r in range(1, 6) for c in range(1, 6)]
        for a_pos in positions:
            for b_pos in positions:
                for box_pos in positions:
                    for a_act in range(4):
                        for b_act in range(4):
                            self.qtable[((a_pos, b_pos, box_pos), a_act, b_act)] = 0
During training, actions must be selected; an epsilon-greedy strategy is used here: with probability 1 - epsilon the best action for the current state is chosen, and with probability epsilon a random action is chosen.
The code is as follows:
    def choose_action(self, state):
        # epsilon-greedy strategy
        if np.random.uniform() < self.epsilon:
            return np.random.randint(4), np.random.randint(4)
        else:
            # greedy: pick a_act by its best value over all b, then pick b_act given a_act
            a_act = np.argmax([max(self.qtable.get((state, a, b), 0) for b in range(4)) for a in range(4)])
            b_act = np.argmax([self.qtable.get((state, a_act, b), 0) for b in range(4)])
            return a_act, b_act
At every step, the Q function is updated with the following rule:
Q(s, a) ← Q(s, a) + α * (R + γ * max_a' Q(s', a') - Q(s, a))
That is, after taking action a in the current state s, the agent reaches state s' and receives an immediate reward R; the action a' with the largest value in the new state s' is then used to update Q(s, a).
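As a quick numeric illustration with made-up values: if α = 0.8, γ = 0.9, the current estimate Q(s, a) = 0, the reward R = 1, and max_a' Q(s', a') = 2, the update gives Q(s, a) ← 0 + 0.8 * (1 + 0.9 * 2 - 0) = 2.24.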
The code is as follows:
def update_qtable(self, state, action, reward, new_state):
a_act, b_act = action
max_qvalue = max([self.qtable.get((new_state, a, b), 0) for a in range(4) for b in range(4)])
old_qvalue = self.qtable.get((state, a_act, b_act), 0)
self.qtable[(state, a_act, b_act)] = old_qvalue + self.alpha * (reward + self.gamma * max_qvalue - old_qvalue)
With the steps above defined, the task can be simulated. The training and testing procedure is as follows:
env = EnvMoveBox()
qlearning = QLearning(alpha=0.8, gamma=0.9, epsilon=0.1)
episodes = 5000
for episode in range(episodes):
env.reset()
state = (env.a_pos, env.b_pos, env.box_pos)
done = False
while not done:
action = qlearning.choose_action(state)
obs, reward, done, _ = env.step(action)
new_state = (env.a_pos, env.b_pos, env.box_pos)
qlearning.update_qtable(state, action, reward, new_state)
state = new_state
env.reset()
state = (env.a_pos, env.b_pos, env.box_pos)
while True:
    # greedy joint action: argmax over all 16 (a_act, b_act) pairs
    q_values = [[qlearning.qtable.get((state, a, b), 0) for b in range(4)] for a in range(4)]
    action = tuple(np.unravel_index(np.argmax(q_values), (4, 4)))
obs, _, done, _ = env.step(action)
env.render()
state = (env.a_pos, env.b_pos, env.box_pos)
if done:
break
If my answer solved your problem, please accept it!