lli / YM-Seminar · Commits

Commit 0a799f4c
authored Apr 24, 2021 by lli

update

parent 8738c2b3
Changes 2
algorithms/reinforce.py

@@ -30,6 +30,7 @@ class PolicyNetwork(nn.Module):
         )
         self.optimizer = torch.optim.Adam(self.network.parameters(), lr)
+        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.1)

     def predict(self, state):
         # Compute the action probabilities of state s using the learning rate
@@ -54,6 +55,7 @@ class PolicyNetwork(nn.Module):
         self.optimizer.zero_grad()
         loss.backward()
         self.optimizer.step()
+        self.scheduler.step()

     def get_action(self, state):
         probs = self.predict(state)
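The change above attaches a StepLR scheduler to the existing Adam optimizer and steps it once per update call, so the learning rate is multiplied by gamma=0.1 after every 100 updates. A minimal standalone sketch of that decay behaviour, using a toy one-parameter module rather than the project's PolicyNetwork:

import torch

# Stand-in parameter; in the commit this is self.network.parameters().
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

for step in range(1, 301):
    optimizer.zero_grad()
    loss = (param ** 2).sum()          # dummy loss, only to produce a gradient
    loss.backward()
    optimizer.step()
    scheduler.step()                   # mirrors the added self.scheduler.step()
    if step % 100 == 0:
        print(step, optimizer.param_groups[0]['lr'])
# Prints roughly 1e-3, 1e-4, 1e-5: the rate drops by 10x every 100 updates.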
train_reinforce.py
@@ -92,7 +92,7 @@ n_episode = args.n_episode # Number of training episodes
 total_reward_episode = [0] * n_episode
 num_no_capacity = []
 accepted_orders = []
+lr = []
 start_time = timer()
 #############################
@@ -141,16 +141,17 @@ for episode in range(n_episode):
         advantages = (returns - baseline_values)
         value_net.update(states, returns)
+        lr.append(policy_net.optimizer.param_groups[0]['lr'])
         # Update nn based on discounted rewards and log_probs
         policy_net.update(advantages, log_probs)
-        print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}'.format(
-            episode, total_reward_episode[episode], num_no_capacity[episode], accepted_orders[episode]))
+        print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}, learning rate: {}'.format(
+            episode, total_reward_episode[episode], num_no_capacity[episode], accepted_orders[episode], lr[episode]))
         # print('Episode: {}, selected action: {}'.format(episode, selected_action))
         break
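This hunk records the current learning rate from optimizer.param_groups before each policy update and prints it per episode; the update method itself is not part of the diff. Purely as an illustration (the real PolicyNetwork.update is assumed, not shown), a typical REINFORCE-with-baseline step using the same ingredients, log_probs and advantages = returns - baseline_values, looks like this:

import torch

# Toy policy over 2 actions for 3 visited states (stand-in for the real network).
logits = torch.nn.Parameter(torch.zeros(3, 2))
optimizer = torch.optim.Adam([logits], lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

actions = torch.tensor([0, 1, 0])
advantages = torch.tensor([1.0, -0.5, 2.0])        # plays the role of returns - baseline_values
log_probs = torch.log_softmax(logits, dim=-1)[torch.arange(3), actions]

current_lr = optimizer.param_groups[0]['lr']       # what lr.append(...) records per episode
loss = -(log_probs * advantages).sum()             # policy-gradient loss with a baseline
optimizer.zero_grad()
loss.backward()
optimizer.step()
scheduler.step()                                   # decays the rate on the commit's schedule
print(f'learning rate this episode: {current_lr}')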
@@ -164,7 +165,7 @@ torch.save(value_net.state_dict(), os.path.join(OUT_PATH, 'value_{}.pk1'.format(
 # Plot of the training model
 #############################
 # Cumulative Average reward received over time
-smoothed_rewards = [np.mean(total_reward_episode[:i + 1]) for i in range(len(total_reward_episode))]
+smoothed_rewards = sliding_window(total_reward_episode, len(total_reward_episode))
 rewards, = plt.plot(total_reward_episode, label='Rewards')
 avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
 plt.title('Episode rewards over time')
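The cumulative-mean list comprehension is replaced by a call to sliding_window; the helper's definition is not shown in this diff. Because it is invoked with a window equal to the full series length, a plausible implementation, given here only as an assumption about what the helper does, is a trailing moving average, which reduces to the old cumulative mean for that window size:

import numpy as np

def sliding_window(values, window):
    # Hypothetical helper (not the repository's actual code): mean over the
    # last `window` entries up to each index. With window == len(values) this
    # equals the cumulative running mean computed by the removed comprehension.
    return [np.mean(values[max(0, i + 1 - window):i + 1]) for i in range(len(values))]

print(sliding_window([1, 2, 3, 4], window=4))   # [1.0, 1.5, 2.0, 2.5]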
@@ -175,7 +176,7 @@ plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, tran
 plt.close()
 # Plot number of penalties
-num_no_capacity_smoothed = [np.mean(num_no_capacity[:i + 1]) for i in range(len(num_no_capacity))]
+num_no_capacity_smoothed = sliding_window(num_no_capacity, len(num_no_capacity))
 num_penalty, = plt.plot(num_no_capacity, label='Penalties')
 avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
 plt.title('Number of penalties')
@@ -185,6 +186,11 @@ plt.legend(handles=[num_penalty, avg_penalty], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.close()
+plt.title('Learning rate decay')
+plt.xlabel('Episode')
+plt.ylabel('Learning rate')
+plt.savefig(os.path.join(OUT_PATH, 'learning_rate.png'), dpi=1200, transparent=True, bbox_inches='tight')
+plt.close()
 #############################
 # Evaluation
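The added block titles, labels, and saves a learning-rate figure; a plt.plot call for the collected lr list does not appear in the lines shown here. A self-contained sketch of such a figure with hypothetical data and output path (the same matplotlib calls as the commit, plus the plot call itself):

import os
import matplotlib.pyplot as plt

OUT_PATH = '.'                                    # stand-in for the script's output directory
lr = [0.01 * (0.1 ** (episode // 100)) for episode in range(300)]   # toy StepLR-style decay

plt.plot(lr)
plt.title('Learning rate decay')
plt.xlabel('Episode')
plt.ylabel('Learning rate')
plt.savefig(os.path.join(OUT_PATH, 'learning_rate.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()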
@@ -238,8 +244,8 @@ print(f"Optimal average rewards: {optimal_avg_rewards}")
 # Plot rewards (evaluation)
 # Cumulative Average reward received over time
-smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i + 1]) for i in range(len(total_reward_episode_eva))]
-smoothed_optimal_rewards = [np.mean(optimal_rewards[:i + 1]) for i in range(len(optimal_rewards))]
+smoothed_rewards_eva = sliding_window(total_reward_episode_eva, len(total_reward_episode_eva))
+smoothed_optimal_rewards = sliding_window(optimal_rewards, len(optimal_rewards))
 rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
 avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
 opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')