强化学习中遇到的问题:下面是用 Python 写的一个一维寻宝(Q-learning)示例。
import numpy as np
import pandas as pd
import time
MAX_EPISODES = 10  # number of training episodes to run
FRESH_TIME = 0.1  # seconds between display refreshes (controls animation speed)
action_space = ['l', 'r']  # available actions: move left / move right
state_space = 6  # number of cells in the 1-D world; the treasure 'T' is in the last cell
class create_env:
    """One-dimensional treasure-hunt world rendered as text, e.g. '--o--T'."""

    def __init__(self):
        # state_space - 1 empty cells plus the treasure 'T' at the far right.
        # NOTE: relies on the module-level constant `state_space`.
        self.env_list = ['-'] * (state_space - 1) + ['T']

    def update_env(self, episode, state, step_counter):
        """Redraw the world for `state`; on 'terminal', print the episode summary."""
        if state != 'terminal':
            # Reset the track, then mark the agent's current cell with 'o'.
            self.__init__()
            self.env_list[state] = 'o'
            interaction = ''.join(self.env_list)
            print('\r{}'.format(interaction), end='')
            time.sleep(FRESH_TIME)
        else:
            interaction = 'Episode %s : total_step = %s' % (episode + 1, step_counter)
            print('\r{}'.format(interaction), end='')
            time.sleep(2)  # pause so the summary is readable before the next episode
            print(' ', end='\n')
class RL:
    """Tabular Q-learning agent for the 1-D treasure hunt."""

    def __init__(self):
        # Q-table: one row per state, one column per action, initialised to zero.
        self.q_table = pd.DataFrame(
            np.zeros((state_space, len(action_space))), columns=action_space)
        self.ALPHA = 0.1    # learning rate
        self.EPSILON = 0.9  # greedy threshold (act greedily with prob. EPSILON)
        self.GAMMA = 0.9    # discount factor

    def choose_action(self, state):
        """Epsilon-greedy action selection for the given (integer) state."""
        row = self.q_table.iloc[state, :]
        explore = (np.random.uniform() > self.EPSILON) or (row == 0).all()
        if explore:
            return np.random.choice(action_space)
        # idxmax returns the column label (the action) holding the largest Q-value.
        return row.idxmax()

    def feedback(self, state, action):
        """Environment model: return (next_state, reward) for taking `action`."""
        if action == 'r':
            if state == state_space - 2:
                # Stepping right onto the treasure ends the episode.
                return 'terminal', 1
            return state + 1, 0
        # Moving left is penalised; bumping the left wall keeps the agent in place.
        next_state = state if state == 0 else state - 1
        return next_state, -1

    def rl(self):
        """Run MAX_EPISODES of Q-learning, redrawing the global `env` each step."""
        for episode in range(MAX_EPISODES):
            step_counter = 0
            state = 0
            done = False
            env.update_env(episode, state, step_counter)
            while not done:
                print(state)
                action = self.choose_action(state)
                s_, reward = self.feedback(state, action)
                q_predict = self.q_table.loc[state, action]
                if s_ == 'terminal':
                    q_target = reward
                    done = True
                else:
                    q_target = reward + self.GAMMA * self.q_table.iloc[s_, :].max()
                self.q_table.loc[state, action] += self.ALPHA * (q_target - q_predict)
                state = s_
                step_counter += 1
                env.update_env(episode, state, step_counter)
if __name__ == '__main__':
    env = create_env()
    rl = RL()
    # Training loop: one Q-learning episode per iteration.
    for episode in range(MAX_EPISODES):
        step_counter = 0
        state = 0
        is_terminal = False
        env.update_env(episode, state, step_counter)
        while not is_terminal:
            action = rl.choose_action(state)
            s_, reward = rl.feedback(state, action)
            q_predict = rl.q_table.loc[state, action]
            if s_ != 'terminal':
                q_target = reward + rl.GAMMA * rl.q_table.iloc[s_, :].max()
            else:
                q_target = reward
                # BUG FIX: this line previously assigned `is_terminated = True`,
                # a different name from the loop variable `is_terminal`, so the
                # loop never ended. `state` then became the string 'terminal'
                # and choose_action('terminal') raised the pandas ValueError
                # ("Can only index by location with ...") shown in the traceback.
                is_terminal = True
            rl.q_table.loc[state, action] += rl.ALPHA * (q_target - q_predict)
            state = s_
            env.update_env(episode, state, step_counter + 1)
            step_counter += 1
    print('\r\nQ-table:\n')
    print(rl.q_table)
运行结果及报错内容
Episode 1 : total_step = 12
Traceback (most recent call last):
File "D:\Anaconda\envs\sklearnbase\lib\site-packages\pandas\core\indexing.py", line 769, in _validate_tuple_indexer
self._validate_key(k, i)
File "D:\Anaconda\envs\sklearnbase\lib\site-packages\pandas\core\indexing.py", line 1381, in _validate_key
raise ValueError(f"Can only index by location with a [{self._valid_types}]")
ValueError: Can only index by location with a [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array]
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:/PycharmProjects/RLtest1/case.py", line 90, in
action = rl.choose_action(state)
File "D:/PycharmProjects/RLtest1/case.py", line 35, in choose_action
state_action = self.q_table.iloc[state, :]
File "D:\Anaconda\envs\sklearnbase\lib\site-packages\pandas\core\indexing.py", line 961, in __getitem__
return self._getitem_tuple(key)
File "D:\Anaconda\envs\sklearnbase\lib\site-packages\pandas\core\indexing.py", line 1461, in _getitem_tuple
tup = self._validate_tuple_indexer(tup)
File "D:\Anaconda\envs\sklearnbase\lib\site-packages\pandas\core\indexing.py", line 771, in _validate_tuple_indexer
raise ValueError(
ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types
Process finished with exit code 1
期望实现的功能:
修复上述报错,使训练的 for 循环能够正常执行,并在每个 episode 到达终点('terminal')后正确结束内层 while 循环。