def _init_S(self):
    '''Define the state space.

    Enumerates every grid cell as an integer state, records the
    state of the current cell, and collects the terminal states
    (every hole plus the goal).
    '''
    # One discrete state per grid cell.
    self.nS = frozen_lake.n_row * frozen_lake.n_col
    self.S = list(range(self.nS))
    # State index of the cell the agent currently occupies.
    self.state = self.cell2state(self.curr_cell)
    # Terminal states: falling into any hole or reaching the goal
    # ends the episode.
    self.goal_state = self.cell2state(self.goal)
    self.hole_states = [self.cell2state(h) for h in self.hole_cells]
    self.s_termination = [*self.hole_states, self.goal_state]
其中 S 为状态集合,在代码中用env.S来表示, A 为动作集合,在代码中用env.A来表示。
Action
Action(行动)是 Agent 可以在每个状态下执行的操作。在 Frozen Lake 中,Agent 有四种可能的行动:
上 (Up)
下 (Down)
左 (Left)
右 (Right)
这些行动会导致 Agent 从当前状态移动到相邻的状态(如果没有超出网格边界)。
def _init_A(self,)
'''Define the action space
'''
self.directs = [
np.array([-1, 0]), # up
np.array([ 1, 0]), # down
np.array([ 0,-1]), # left
np.array([ 0, 1]), # right
]
self.nA = len(self.directs)
self.A = list((range(self.nA)))
Transition Function (P)
Transition Function(状态转移函数)定义了在执行某个行动后,环境如何从一个状态转移到另一个状态,以及每个转移的概率。
在 Frozen Lake 中,状态转移函数可以表示为 P(s′∣s,a),即在状态 s 执行动作 a 后转移到状态 s′ 的概率。例如,Agent从 (0,0) 点出发, (0,0) 为当前的状态 s,在执行向下移动的动作后,到达 (1,0),为新的状态 s′ 。
def _init_P(self):
    '''Define the transition function, P(s'|s,a).

    P(s'|s,a) is a probability distribution that
    maps s and a to a distribution of s'.
    '''
    def p_s_next(s, a):
        p_next = np.zeros([self.nS])
        cell = self.state2cell(s)
        # Terminal states (goal, holes) are absorbing: the agent
        # stays in place with probability 1.
        if s in self.s_termination:
            p_next[s] = 1
        else:
            for j in self.A:
                # Clip each coordinate against its OWN bound so the
                # agent cannot step off the grid. (Clipping both axes
                # with n_row-1 was wrong on non-square grids, where
                # n_row != n_col.)
                s_next = self.cell2state(
                    np.clip(cell + self.directs[j],
                            [0, 0],
                            [frozen_lake.n_row - 1,
                             frozen_lake.n_col - 1]))
                # The agent walks on slippery ice, so the intended
                # action is not always performed: the chosen action j==a
                # succeeds with probability 1-eps; with probability eps
                # the agent moves in one of the other directions,
                # uniformly at random.
                if j == a:
                    p_next[s_next] += 1 - self.eps
                else:
                    p_next[s_next] += self.eps / (self.nA - 1)
        return p_next
    self.p_s_next = p_s_next
Reward
Reward(奖励)是 Agent 执行动作后从环境中获得的反馈。奖励用于指导 Agent 学习最佳策略,以实现其目标。
在 Frozen Lake 中,奖励机制通常如下:
到达目标状态(G):+1 分
掉入冰窟窿(H):-1 分
其余情况:0 分
def _init_R(self):
'''Define the reward function, R(s')
return:
r: reward
done: if terminated
'''
def R(s_next):
if s_next == self.goal_state:
return 1, True
elif s_next in self.hole_states:
return -1, True
else:
return 0, False
self.r = R