lli / YM-Seminar · Commits

Commit 0bbf1707, authored Apr 23, 2021 by lli
implemented reinforce
Parent: f2f41b14
3 changed files
algorithms/reinforce.py
@@ -48,7 +48,7 @@ class PolicyNetwork(nn.Module):
         for log_prob, Gt in zip(log_probs, advantages):
             policy_gradient.append(-log_prob * Gt)
-        loss = torch.cat(policy_gradient).sum().cuda()
+        loss = torch.cat(policy_gradient).sum().to(device)
         self.optimizer.zero_grad()
         loss.backward()
         self.optimizer.step()
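The change above swaps a hard-coded .cuda() call for .to(device), so the same policy-gradient update runs on CPU-only machines. A minimal sketch (not from the repository) of the device-agnostic pattern this relies on:

```python
import torch
import torch.nn as nn

# .to(device) is a no-op on CPU and a transfer on GPU, whereas .cuda()
# raises an error when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = nn.Linear(4, 2).to(device)
x = torch.randn(1, 4, device=device)
y = model(x)  # all tensors involved live on the same device
```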
@@ -58,7 +58,6 @@ class PolicyNetwork(nn.Module):
         # Sample the action with the highest probability
         action = torch.multinomial(probs, 1).item()
         log_prob = torch.log(probs[action])
         return action, log_prob
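For reference, torch.distributions offers an equivalent way to sample an action and keep its log-probability attached to the computation graph; a minimal sketch (an alternative formulation, not the repository's method):

```python
import torch
from torch.distributions import Categorical

def sample_action(probs):
    # probs: 1-D tensor of action probabilities (e.g. a softmax output)
    dist = Categorical(probs)
    action = dist.sample()  # same role as torch.multinomial(probs, 1)
    return action.item(), dist.log_prob(action)  # log-prob stays differentiable
```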
@@ -81,7 +80,8 @@ class ValueNetwork(nn.Module):
     def update(self, state, y):
         y_pred = self.model(torch.tensor(state, dtype=torch.float32, device=device))
-        loss = self.criterion(y_pred, torch.tensor(y, dtype=torch.float32, device=device))
+        y = y.clone().requires_grad_(True).to(device)
+        loss = self.criterion(y_pred, y)
         self.optimizer.zero_grad()
         loss.backward()
         self.optimizer.step()
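The target y now appears to arrive as a tensor (the reshaped returns passed in from train_reinforce.py), and wrapping an existing tensor in torch.tensor(...) triggers PyTorch's copy-construct warning, so the commit clones and moves it instead. A common alternative (a sketch with assumed names, not the commit's code) detaches the target so the value loss is a plain regression against fixed returns:

```python
import torch

def value_update(model, optimizer, criterion, states, returns, device):
    # states: array-like batch of observations; returns: tensor of episode returns
    y_pred = model(torch.tensor(states, dtype=torch.float32, device=device))
    target = returns.detach().to(device)  # treat the returns as a fixed target
    loss = criterion(y_pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```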
train_reinforce.py
@@ -18,10 +18,11 @@ from environment.wendtris import Wendtris_Eva
 plt.rcParams['agg.path.chunksize'] = 10000
 
-parser = argparse.ArgumentParser(description='DQN')
+parser = argparse.ArgumentParser(description='Reinforce with basesline')
 parser.add_argument('--save_path', type=str, required=True, help='save path of results')
 parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
-parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
+parser.add_argument('--lr_policy', type=float, default=0.001, help='learning rate policy net (default: 0.001)')
+parser.add_argument('--lr_value', type=float, default=0.01, help='learning rate value net (default: 0.01)')
 parser.add_argument('--seed', type=int, default=None, help='random seed')
 parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
 args = parser.parse_args()
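With the new arguments, a training run would be launched roughly as follows (hypothetical values; only --save_path and --n_episode are required, the rest fall back to their defaults):

```
python train_reinforce.py --save_path results/reinforce --n_episode 5000 \
    --n_hidden 128 --lr_policy 0.001 --lr_value 0.01 --seed 42
```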
@@ -45,25 +46,37 @@ if USE_CUDA:
     cudnn.benchmark = True
 
 print()
 print('#################Hyper Parameter Settings#################')
+print('#################Policy Net##################')
 print(f'Number of states (input): {n_state}')
 print(f'Number of actions (output): {n_action}')
 print(f'Number of hidden neurons: {args.n_hidden}')
-print(f'Learning rate: {args.lr}')
+print(f'Learning rate policy net: {args.lr_policy}')
+print(f'Learning rate value net {args.lr_value}')
 print(f'Discount factor: {gamma}')
 print()
 
-# Initialize DQN network
-dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
+# Initialize the policy network
+policy_net = PolicyNetwork(n_state, n_action, args.n_hidden, args.lr_policy)
+# Initialize the value network
+value_net = ValueNetwork(n_state, args.n_hidden, args.lr_value)
 
 if USE_CUDA:
-    dqn = dqn.to(device)
+    policy_net = policy_net.to(device)
+    value_net = value_net.to(device)
 
+print('######################Policy net architecture#####################')
+print(policy_net)
 print()
+print(f'Total parameters: {sum(p.numel() for p in policy_net.parameters())}')
+print(f'Trainable parameters: {sum(p.numel() for p in policy_net.parameters() if p.requires_grad)}')
 print()
-print('######################DQN architecture#####################')
-print(dqn)
+print('######################Value net architecture#####################')
+print(value_net)
 print()
-print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
-print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
+print(f'Total parameters: {sum(p.numel() for p in value_net.parameters())}')
+print(f'Trainable parameters: {sum(p.numel() for p in value_net.parameters() if p.requires_grad)}')
 print()
 
 seed = args.seed
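The PolicyNetwork and ValueNetwork constructors are not part of this diff. A minimal sketch consistent with the constructor calls above (an assumption about the architecture, not the repository's code):

```python
import torch
import torch.nn as nn

class PolicyNetwork(nn.Module):
    def __init__(self, n_state, n_action, n_hidden, lr):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_action),
            nn.Softmax(dim=-1),  # action probabilities
        )
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

class ValueNetwork(nn.Module):
    def __init__(self, n_state, n_hidden, lr):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),  # scalar state-value baseline
        )
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
```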
@@ -80,61 +93,72 @@ total_reward_episode = [0] * n_episode
 num_no_capacity = []
 accepted_orders = []
-if args.policy == 'epsilon_greedy':
-    epsilon_value = []
-else:
-    tau_value = []
-losses = []
 
 start_time = timer()
 
 #############################
 # Training
 #############################s
 for episode in range(n_episode):
+    log_probs = []
+    states = []
+    rewards = []
+    selected_action = []
     state = env.reset()
     is_done = False
-    if args.policy == 'epsilon_greedy':
-        if epsilon_decay:
-            epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
-            epsilon_value.append(epsilon)
-    else:
-        if tau_decay:
-            tau = anneal_tau(episode, 0.001, 100)
-            tau_value.append(tau)
 
-    while not is_done:
-        if args.policy == 'epsilon_greedy':
-            action = dqn.eps_greedy_policy(state, n_action, epsilon)
-        else:
-            action = dqn.boltzmann_policy(state, n_action, tau)
+    while True:
+        states.append(state)
+        action, log_prob = policy_net.get_action(state)
         next_state, reward, is_done, info = env.step(action)
-        total_reward_episode[episode] += reward
-        replay_buffer.append((state, action, next_state, reward, is_done))
+        total_reward_episode[episode] += reward
+        log_probs.append(log_prob)
+        rewards.append(reward)
+        selected_action.append(action)
 
         if is_done:
            # Penalties and order position of accepted orders
            num_no_capacity.append(info['Number no capacity'])
            accepted_orders.append(info['Accepted orders'])
+
+           # Calculate discounted rewards
+           returns = []
+           Gt = 0
+           pw = 0
+           for t in range(len(states) - 1, -1, -1):
+               Gt += gamma ** pw * rewards[t]
+               pw += 1
+               returns.append(Gt)
+           returns = returns[::-1]
+           returns = torch.tensor(returns, dtype=torch.float32, device=device)
+           returns = (returns - returns.mean()) / (returns.std() + 1e-9)
+
+           baseline_values = value_net.predict(states)
+           # Ajust returns to the same shape of baseline_values
+           returns = torch.reshape(returns, (baseline_values.shape))
+           advantages = (returns - baseline_values)
+
+           value_net.update(states, returns)
+           # Update nn based on discounted rewards and log_probs
+           policy_net.update(advantages, log_probs)
+
+           print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}'.format(
+               episode, total_reward_episode[episode], num_no_capacity[episode], accepted_orders[episode]))
+           # print('Episode: {}, selected action: {}'.format(episode, selected_action))
+           break
 
-        loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
-        losses.append(loss)
        state = next_state
 
-    if args.policy == 'epsilon_greedy':
-        print(f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
-              f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
-    else:
-        print(f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
-              f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
 
 print(f"Training time for {n_episode} episodes: {timer() - start_time}")
 
 # save the model parameters
-torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episode)))
+torch.save(policy_net.state_dict(), os.path.join(OUT_PATH, 'policy_{}.pk1'.format(n_episode)))
+torch.save(value_net.state_dict(), os.path.join(OUT_PATH, 'value_{}.pk1'.format(n_episode)))
 
 #############################
 # Plot of the training model
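The backward loop in the new code accumulates Gt with weights gamma ** pw, where pw counts steps from the end of the episode. For comparison, the textbook discounted return follows the recursion G_t = r_t + gamma * G_{t+1}; a small self-contained sketch of that recursion (shown for reference, not the commit's loop):

```python
def discounted_returns(rewards, gamma):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over one episode
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    return returns[::-1]

# e.g. discounted_returns([1.0, 0.0, 2.0], 0.9) -> [2.62, 1.8, 2.0]
# (up to floating-point rounding)
```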
@@ -150,22 +174,6 @@ plt.legend(handles=[rewards, avg_rewards], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.close()
 
-# Plot epsilon and tau
-if args.policy == 'epsilon_greedy':
-    plt.plot(epsilon_value)
-    plt.title('Epsilon over time')
-    plt.xlabel('Episode')
-    plt.ylabel('Epsilon')
-    plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
-    plt.close()
-else:
-    plt.plot(tau_value)
-    plt.title('Tau value over time')
-    plt.xlabel('Episode')
-    plt.ylabel('Tau')
-    plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
-    plt.close()
-
 # Plot number of penalties
 num_no_capacity_smoothed = [np.mean(num_no_capacity[:i + 1]) for i in range(len(num_no_capacity))]
 num_penalty, = plt.plot(num_no_capacity, label='Penalties')
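The running average above recomputes np.mean over a growing prefix for every episode, which is quadratic in the number of episodes. An equivalent vectorized form (a sketch producing the same values in one pass):

```python
import numpy as np

counts = np.arange(1, len(num_no_capacity) + 1)
num_no_capacity_smoothed = np.cumsum(num_no_capacity) / counts  # running mean of penalties
```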
@@ -177,14 +185,6 @@ plt.legend(handles=[num_penalty, avg_penalty], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.close()
 
-# Plot loss
-loss = plt.plot(losses, label='Loss')
-plt.title('Loss')
-plt.xlabel('Steps')
-plt.ylabel('Loss')
-plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.close()
-
 #############################
 # Evaluation
@@ -205,7 +205,7 @@ for ep in range(test_orders.shape[0]):
     while not is_done:
         # Always take the best action
-        action = torch.argmax(dqn.predict(state)).item()
+        action = policy_net.get_action(state)[0]
         next_state, reward, is_done, info = env_eva.step(action)
         total_reward_episode_eva[ep] += reward
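Note that get_action samples from the policy distribution, so the retained comment about always taking the best action no longer strictly applies. A greedy evaluation alternative (a sketch that assumes the policy network's forward pass maps a state tensor to action probabilities, which this diff does not show):

```python
import torch

def greedy_action(policy_net, state, device):
    # Deterministic evaluation: pick the highest-probability action
    with torch.no_grad():
        probs = policy_net(torch.tensor(state, dtype=torch.float32, device=device))
    return torch.argmax(probs).item()
```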
@@ -225,10 +225,6 @@ save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
 save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
 save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
 save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
-if args.policy == 'epsilon_greedy':
-    save_list(epsilon_value, EVA_FILE, 'epsilon_value')
-else:
-    save_list(tau_value, EVA_FILE, 'tau_value')
 
 # Load optimal solution
 optimal_rewards = load('dp/results.npy')
utils/utils.py
@@ -4,6 +4,7 @@ import shutil
 import sys
 import math
+import pickle
 import torch
 import numpy as np
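The added pickle import presumably backs list-serialization helpers such as the save_list calls in train_reinforce.py; the actual implementation is not part of this diff. A hypothetical sketch of such a helper, assuming the second argument is an output directory:

```python
import os
import pickle

def save_list(data, out_dir, name):
    # Hypothetical helper: pickle a Python list to <out_dir>/<name>.pkl
    with open(os.path.join(out_dir, f'{name}.pkl'), 'wb') as f:
        pickle.dump(list(data), f)
```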