# reinforce_continuous.py
'''
Solve MountainCarContinuous-v0 with REINFORCE (Monte Carlo policy gradient).
Does not seem to converge as of episode 500 :(
'''
import gym
import numpy as np

from common.tile_encoding import TileEncoder

env = gym.make('MountainCarContinuous-v0')  # append .env to strip the TimeLimit wrapper
state_space = env.observation_space

# same featurizer for both the mean and the standard deviation
tile_encoder = TileEncoder(
    lower_x=state_space.low[0],
    lower_y=state_space.low[1],
    upper_x=state_space.high[0],
    upper_y=state_space.high[1],
    n=12,
    tile_offsets=[]
)
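
# encode(x, y) returns a grid of tile activations; flattened, it is a
# feature vector of length tile_encoder.d shared by both policy heads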
# step sizes for the linear mean head and the log-linear std head
alpha_miu = 0.001
alpha_sigma = 0.0001
def predict_miu(observation, weights):
    # linear mean: mu = w . x
    features = tile_encoder.encode(observation[0], observation[1]).flatten()
    return np.dot(weights, features)
def predict_sigma(observation, weights):
    # log-linear standard deviation: sigma = exp(w . x), always positive
    features = tile_encoder.encode(observation[0], observation[1]).flatten()
    return np.exp(np.dot(weights, features))
def clip(action):
    # keep the action inside the env's valid range [-1, 1]
    if action > 1.0:
        return 1.0
    if action < -1.0:
        return -1.0
    return action
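
# The per-step gradients accumulated in train() are the score functions of
# the Gaussian policy with mu = w_miu . x and sigma = exp(w_sigma . x):
#   d log pi / d w_miu   = (a - mu) / sigma^2 * x
#   d log pi / d w_sigma = ((a - mu)^2 / sigma^2 - 1) * x
# Quick finite-difference sanity check for the mean gradient (my untested
# addition; handy when debugging the non-convergence noted above):
def check_miu_score_gradient(x, w_miu, w_sigma, action, eps=1e-6):
    def log_pi(wm):
        mu, sigma = np.dot(wm, x), np.exp(np.dot(w_sigma, x))
        return -0.5 * ((action - mu) / sigma) ** 2 - np.log(sigma)
    mu, sigma = np.dot(w_miu, x), np.exp(np.dot(w_sigma, x))
    analytic = (action - mu) / sigma ** 2 * x
    numeric = np.array([(log_pi(w_miu + eps * e) - log_pi(w_miu - eps * e)) / (2 * eps)
                        for e in np.eye(len(x))])
    return np.max(np.abs(analytic - numeric))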
def train(episodes):
    weights_miu = np.random.rand(tile_encoder.d)
    weights_sigma = np.random.rand(tile_encoder.d)
    for i in range(episodes):
        observation = env.reset()
        rewards = []
        # reset per episode, otherwise the update below reuses stale
        # gradients from earlier episodes
        gradients_miu = []
        gradients_sigma = []
        # debug on
        sigmas = []
        # debug off
        is_done = False
        while not is_done:
            miu = predict_miu(observation, weights_miu)
            state_features = tile_encoder.encode(observation[0], observation[1]).flatten()
            sigma = predict_sigma(observation, weights_sigma)  # + 1e-5
            sigmas.append(sigma)
            action = np.random.normal(miu, sigma)
            # score-function gradients of log pi(a|s) w.r.t. each weight vector
            gradients_miu.append(1 / (sigma ** 2) * (action - miu) * state_features)
            gradients_sigma.append(((action - miu) ** 2 / (sigma ** 2) - 1) * state_features)
            # gradients_sigma.append((1-sigma) * ((action - miu)**2 / (sigma**2) - 1))
            # step; clip so that reward does not explode
            observation, reward, is_done, info = env.step([clip(action)])
            rewards.append(reward)
        # Monte Carlo update: move each weight vector along g_t * grad log pi
        for j in range(len(rewards)):
            g = sum(rewards[j:])  # undiscounted return from step j
            weights_miu += alpha_miu * g * gradients_miu[j]
            weights_sigma += alpha_sigma * g * gradients_sigma[j]
        # with open('logs/reinforce_continuous_mountaincar.csv', 'a') as f:
        #     f.write("%d,%d\n" % (i, len(rewards)))
        # print("sigma stats:", max(sigmas), sum(sigmas) / len(sigmas))
        print("episode %d, rewards %d, %.2f" % (i, len(rewards), sum(rewards)))
    return weights_miu, weights_sigma
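
# The docstring notes convergence trouble; a standard REINFORCE fix is to
# standardise the Monte Carlo returns before the update (variance
# reduction). Untested sketch: replace g in train()'s update loop with
# normalized_returns(rewards)[j].
def normalized_returns(rewards):
    gs = np.array([sum(rewards[j:]) for j in range(len(rewards))])
    return (gs - gs.mean()) / (gs.std() + 1e-8)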
if __name__ == '__main__':
    episodes = 5000
    weights_miu, weights_sigma = train(episodes)
    # demonstrate the learned policy
    observation = env.reset()
    for _ in range(1000):
        env.render()
        miu = predict_miu(observation, weights_miu)
        sigma = predict_sigma(observation, weights_sigma)
        action = np.random.normal(miu, sigma)
        observation, reward, is_done, info = env.step([clip(action)])
        if is_done:
            break
    env.close()