'''
Solve non-slippery FrozenLake with SARSA; the learned greedy policy reaches
the goal in the minimum of 6 steps.

Hyperparameters:
- alpha: learning rate
- epsilon: exploration rate of the epsilon-soft (epsilon-greedy) policy

Steps:
- initialize all Q(S,A) to 0; initialize hyperparameters
- set pi to be epsilon-greedy, with epsilon starting at 1/|A| and annealed
  towards 0 over training
- choose A from argmax(Q(S)) with the epsilon-greedy policy
- for each step of the episode:
  - take A, observe R, S'
  - choose A' from argmax(Q(S')) with the epsilon-greedy policy
  - Q(S,A) <- Q(S,A) + alpha * (R + gamma * Q(S',A') - Q(S,A))
  - S <- S', A <- A'
- until S is terminal
- repeat until Q converges
- render the final policy greedily (no epsilon-soft); this should solve the
  environment
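
A worked instance of the update with this file's settings (alpha = 0.1,
gamma = 1) and purely illustrative values: if Q(S,A) = 0.5, R = 0 and
Q(S',A') = 0.9, then

  Q(S,A) <- 0.5 + 0.1 * (0 + 1 * 0.9 - 0.5) = 0.54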
'''
import gym
import numpy as np
from gym.envs.registration import register
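
# epsilon_greedy_policy and epsilon_decay live in the repo's common.policies
# module. The fallback below is a minimal sketch of compatible implementations
# (assuming a linear epsilon schedule and uniform random exploration) so the
# file can be read and run standalone; it is an assumption, not the repo's
# actual code.
try:
    from common.policies import epsilon_greedy_policy, epsilon_decay
except ImportError:
    def epsilon_decay(starting_epsilon, timesteps, step):
        # assumed schedule: anneal epsilon linearly from starting_epsilon to 0
        return starting_epsilon * (1 - step / timesteps)

    def epsilon_greedy_policy(q, s, epsilon=0.0, greedy=False):
        # assumed behaviour: explore uniformly with probability epsilon,
        # otherwise act greedily with respect to Q(s, .)
        if not greedy and np.random.random() < epsilon:
            return np.random.randint(q.shape[1])
        return int(np.argmax(q[s]))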

def train(env, q, hyper_parameters, debug=False):
    '''
    Train the SARSA policy.
    :param env: gym environment
    :param q: Q(S,A) matrix of shape (n_states, n_actions)
    :param hyper_parameters: dict with 'alpha' (learning rate) and 'discount' (gamma)
    :param debug: if True, print training statistics
    :return: the updated q
    '''
    alpha = hyper_parameters['alpha']
    discount = hyper_parameters['discount']
    timesteps = 1e4
    starting_epsilon = 1 / env.action_space.n  # important that we start out very exploratory
    s = env.reset()
    episodes = 0
    total_update = 0
    epsilon = epsilon_decay(starting_epsilon, timesteps, 0)
    a = epsilon_greedy_policy(q, s, epsilon=epsilon)
    for i in range(1, int(timesteps) + 1):
        epsilon = epsilon_decay(starting_epsilon, timesteps, i)
        s_prime, reward, done, info = env.step(a)
        if done:
            q[s_prime][:] = 0  # a terminal state has zero value
        a_prime = epsilon_greedy_policy(q, s_prime, epsilon=epsilon)
        # SARSA update: Q(S,A) <- Q(S,A) + alpha * (R + gamma * Q(S',A') - Q(S,A))
        q_update = alpha * (reward + discount * q[s_prime][a_prime] - q[s][a])
        total_update += q_update
        q[s][a] += q_update
        s = s_prime
        a = a_prime
        if done:
            # episode finished: start a new one
            s = env.reset()
            a = epsilon_greedy_policy(q, s, epsilon=epsilon)
            total_update = 0
            episodes += 1
    if debug:
        print("training done; total episodes %i, final update %f" % (episodes, total_update))  # e.g. 1406 episodes
    return q

if __name__ == '__main__':
    # register a deterministic (non-slippery) variant of FrozenLake
    env_name = 'FrozenLake-notSlippery-v0'
    register(id=env_name, entry_point='gym.envs.toy_text:FrozenLakeEnv',
             kwargs={'is_slippery': False})
    env = gym.make(env_name)

    # initialize Q(S,A) to all zeros and train
    q = np.zeros((env.observation_space.n, env.action_space.n))
    hyper_parameters = {'alpha': 0.1, 'discount': 1}
    q = train(env, q, hyper_parameters, debug=True)

    # test the trained policy greedily; the final reward should be 1
    observation = env.reset()
    for t in range(100):
        env.render()
        action = epsilon_greedy_policy(q, observation, greedy=True)
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print("Final reward {}".format(reward))
            print("Final Q:")
            print(q)
            break
'''
Sample output:
Episode finished after 6 timesteps
Final reward 1.0
Final Q:
[[0.77311302 0.71094352 0.91454925 0.7827461 ]
[0.80618836 0. 0.90771566 0.81316732]
[0.79137627 0.91611362 0.80451343 0.81531942]
[0.85209155 0. 0.61943572 0.68055591]
[0.63680207 0.6900417 0. 0.80150173]
[0. 0. 0. 0. ]
[0. 0.91758567 0. 0.89011938]
[0. 0. 0. 0. ]
[0.76771875 0. 0.96907976 0.74969815]
[0.79044144 0.73901709 0.90690114 0. ]
[0.9277173 0.9996899 0. 0.92464385]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0.96953419 0.98819386 0.8760814 ]
[0.96531663 0.99512412 1. 0.94105635]
[0. 0. 0. 0. ]]
'''