lli / YM-Seminar / Commits

Commit c9446558
Authored Apr 19, 2021 by lli
Parent bb7d5832

Corrected plot bug, changed epoch to episode

Changes: 1 file

train_dqn.py
...
@@ -25,7 +25,7 @@ parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden n
 parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
 parser.add_argument('--seed', type=int, default=None, help='random seed')
 parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
-parser.add_argument('--n_epoch', type=int, required=True, help='number of training epochs')
+parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
 args = parser.parse_args()
 # Check if using cuda and define device
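After this change, the episode count is supplied via --n_episode. A minimal sketch of how the renamed flag parses, using a subset of the arguments above; the invocation values are illustrative:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
    parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
    parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')

    # e.g. python train_dqn.py --policy epsilon_greedy --n_episode 500
    args = parser.parse_args(['--policy', 'epsilon_greedy', '--n_episode', '500'])
    assert args.n_episode == 500

Because --n_episode is required, omitting it fails fast with a usage error instead of training with a silent default.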
...
@@ -85,8 +85,8 @@ if USE_CUDA:
     torch.cuda.manual_seed(seed)
 env.seed(seed)
-n_epoch = args.n_epoch  # Number of training episodes
-total_reward_epoch = [0] * n_epoch
+n_episode = args.n_episode  # Number of training episodes
+total_reward_episode = [0] * n_episode
 num_no_capacity = []
 accepted_orders = []
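Note that the old comment already said "episodes"; the rename brings the variable names in line with it. Seeding here touches the CUDA RNG and the environment; a compact sketch of the full reproducibility setup this implies, assuming the base torch and NumPy generators are seeded elsewhere in the script (only the two calls above appear in this hunk):

    import numpy as np
    import torch

    def set_seed(env, seed: int) -> None:
        # Seed every RNG the training loop can draw from.
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
        env.seed(seed)  # classic gym-style env seeding, as used in this file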
...
@@ -97,7 +97,7 @@ start_time = timer()
 #############################
 # Training
 #############################s
-for epoch in range(n_epoch):
+for episode in range(n_episode):
     state = env.reset()
     is_done = False
...
@@ -115,7 +115,7 @@ for epoch in range(n_epoch):
         else:
             action = dqn.boltzmann_policy(state, n_action)
         next_state, reward, is_done, info = env.step(action)
-        total_reward_epoch[epoch] += reward
+        total_reward_episode[episode] += reward
         replay_buffer.append((state, action, next_state, reward, is_done))
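The else branch above falls back to Boltzmann exploration; the epsilon-greedy branch is collapsed out of view. A minimal sketch of both selection rules, assuming the network exposes a 1-D tensor of Q-values (the dqn methods themselves are outside this diff):

    import random
    import torch

    def epsilon_greedy_action(q_values: torch.Tensor, n_action: int, epsilon: float) -> int:
        # Explore uniformly with probability epsilon, otherwise exploit.
        if random.random() < epsilon:
            return random.randrange(n_action)
        return int(torch.argmax(q_values).item())

    def boltzmann_action(q_values: torch.Tensor, tau: float = 1.0) -> int:
        # Softmax over Q-values; higher tau flattens the distribution.
        probs = torch.softmax(q_values / tau, dim=-1)
        return int(torch.multinomial(probs, num_samples=1).item())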
...
@@ -129,47 +129,47 @@ for epoch in range(n_epoch):
         state = next_state
-    print(f'Epoch: {epoch}, total reward: {total_reward_epoch[epoch]}, epsilon: {epsilon}, loss: {loss}, '
-          f'num_no_capacity: {num_no_capacity[epoch]}, accepted orders: {accepted_orders[epoch]}')
+    print(f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
+          f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
-print(f"Training time for {n_epoch} epochs: {timer() - start_time}")
+print(f"Training time for {n_episode} episodes: {timer() - start_time}")
 # save the model parameters
-torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_epoch)))
+torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episode)))
 #############################
 # Plot of the training model
 #############################
 # Cumulative Average reward received over time
-smoothed_rewards = [np.mean(total_reward_epoch[:i + 1]) for i in range(len(total_reward_epoch))]
-rewards, = plt.plot(total_reward_epoch, label='Rewards')
+smoothed_rewards = [np.mean(total_reward_episode[:i + 1]) for i in range(len(total_reward_episode))]
+rewards, = plt.plot(total_reward_episode, label='Rewards')
 avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
-plt.title('Epoch rewards over time')
-plt.xlabel('Epoch')
+plt.title('episode rewards over time')
+plt.xlabel('episode')
 plt.ylabel('Total reward')
 plt.legend(handles=[rewards, avg_rewards], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.show()
 plt.close()
 # Plot epsilon
 if args.policy == 'epsilon_greedy':
     plt.plot(epsilon_value)
     plt.title('Epsilon over time')
-    plt.xlabel('Epoch')
+    plt.xlabel('episode')
     plt.ylabel('Epsilon')
     plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
     plt.show()
     plt.close()
 # Plot number of penalties
 num_no_capacity_smoothed = [np.mean(num_no_capacity[:i + 1]) for i in range(len(num_no_capacity))]
 num_penalty, = plt.plot(num_no_capacity, label='Penalties')
 avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
 plt.title('Number of penalties')
-plt.xlabel('Epoch')
+plt.xlabel('episode')
 plt.ylabel('Number of penalties')
 plt.legend(handles=[num_penalty, avg_penalty], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.show()
 plt.close()
 # Plot loss
 loss = plt.plot(losses, label='Loss')
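The smoothed curves are cumulative running averages; the prefix-mean list comprehension recomputes every prefix and is O(n²). An equivalent O(n) form with np.cumsum (illustrative, not part of the commit):

    import numpy as np

    def running_mean(values):
        # cumsum[i] / (i + 1) equals mean(values[:i + 1]) for each prefix.
        arr = np.asarray(values, dtype=float)
        return np.cumsum(arr) / np.arange(1, len(arr) + 1)

    # running_mean([1, 2, 3]) -> array([1. , 1.5, 2. ])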
...
@@ -177,21 +177,21 @@ plt.title('Loss')
 plt.xlabel('Steps')
 plt.ylabel('Loss')
 plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.show()
 plt.close()
 #############################
 # Evaluation
 #############################
 # Use the trained model to predict 1000 test games
-total_reward_epoch_eva = [0] * 1000
+total_reward_episode_eva = [0] * 1000
 num_no_capacity_eva = []
 accepted_orders_eva = []
 test_orders = load('dp/order_list.npy')
 test_rewards = load('dp/reward_list.npy')
 print('##########################Evaluation##########################')
 for ep in range(test_orders.shape[0]):
     env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
     state = env_eva.state
...
@@ -201,7 +201,7 @@ for ep in range(test_orders.shape[0]):
         # Always take the best action
         action = torch.argmax(dqn.predict(state)).item()
         next_state, reward, is_done, info = env_eva.step(action)
-        total_reward_epoch_eva[ep] += reward
+        total_reward_episode_eva[ep] += reward
         if is_done:
             num_no_capacity_eva.append(info['Number no capacity'])
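Evaluation selects actions greedily, with no exploration. The one-liner above expands to roughly this, assuming dqn.predict returns a 1-D tensor of Q-values:

    import torch

    def greedy_action(q_values: torch.Tensor) -> int:
        # Index of the highest Q-value; deterministic at test time.
        return int(torch.argmax(q_values).item())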
...
@@ -209,13 +209,13 @@ for ep in range(test_orders.shape[0]):
             break
         state = next_state
     print('##########################Evaluation##########################')
-    print(f'Epoch: {ep}, total reward: {total_reward_epoch_eva[ep]}',
+    print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
           f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
 # Save the variables for evaluation
 EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
-save_list(total_reward_epoch_eva, EVA_FILE, 'total_reward_epoch_eva')
+save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
 save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
 save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
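save_list is a project helper whose definition is not part of this diff. A plausible, purely hypothetical implementation consistent with how it is called (an output directory, then a name used for the file):

    import os
    import pickle

    def save_list(data, out_dir, name):
        # Hypothetical helper: pickle a list to <out_dir>/<name>.pkl.
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, name + '.pkl'), 'wb') as f:
            pickle.dump(data, f)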
...
@@ -225,24 +225,24 @@ optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolis
 # Calculate average results of the ground truth
 optimal_avg_rewards = np.average(optimal_rewards)
-eva_avg_rewards = statistics.mean(total_reward_epoch_eva)
+eva_avg_rewards = statistics.mean(total_reward_episode_eva)
 print(f'Predicted average rewards: {eva_avg_rewards}')
 print(f"Optimal average rewards: {optimal_avg_rewards}")
 # Plot rewards (evaluation)
 # Cumulative Average reward received over time
-smoothed_rewards_eva = [np.mean(total_reward_epoch_eva[:i + 1]) for i in range(len(total_reward_epoch_eva))]
+smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i + 1]) for i in range(len(total_reward_episode_eva))]
 smoothed_optimal_rewards = [np.mean(optimal_rewards[:i + 1]) for i in range(len(optimal_rewards))]
-rewards_eva, = plt.plot(total_reward_epoch_eva, label='Rewards')
+rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
 avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
 opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
 opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
-plt.title('Epoch rewards over time (Evaluation)')
-plt.xlabel('Epoch')
+plt.title('Episode rewards over time (Evaluation)')
+plt.xlabel('Episode')
 plt.ylabel('Total reward')
 plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
 plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
 plt.show()
 plt.close()
 # Modify orders for evaluation
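With both averages printed, a relative optimality gap makes the comparison against the DP ground truth concrete (an illustrative addition, not in the commit):

    # Fraction of the DP-optimal reward the learned policy gives up.
    gap = (optimal_avg_rewards - eva_avg_rewards) / optimal_avg_rewards
    print(f'Relative optimality gap: {gap:.2%}')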
...
@@ -258,7 +258,7 @@ categories = ['Reject', 'Accept']
 make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
 plt.tight_layout()
 plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
 plt.show()
 plt.close()
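make_confusion_matrix is a project plotting helper; cf_matrix itself is a standard 2x2 count matrix over reject/accept decisions. A hedged sketch of how such a matrix is commonly built with scikit-learn (whether this project uses sklearn is not visible in the diff):

    from sklearn.metrics import confusion_matrix

    # 0 = Reject, 1 = Accept (illustrative labels and predictions)
    y_true = [1, 0, 1, 1, 0]
    y_pred = [1, 0, 0, 1, 1]
    cf_matrix = confusion_matrix(y_true, y_pred)  # rows: true, cols: predicted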
...