lli / YM-Seminar · Commits

Commit 168e0fcb
authored Apr 21, 2021 by lli

update2

parent 97be44db
Changes 3
algorithms/reinforce.py  View file @ 168e0fcb

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
...
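Note: the diff above is truncated, but train_reinforce.py (added below in this commit) imports PolicyNetwork and ValueNetwork from algorithms/reinforce.py. Their definitions are not visible in this diff; the following is only a rough sketch of how REINFORCE-with-baseline policy and value networks of this shape are commonly written. The layer sizes and forward interface here are assumptions, not the file's actual contents.

import torch.nn as nn
import torch.nn.functional as F


class PolicyNetwork(nn.Module):
    """Sketch: maps a state vector to a probability distribution over actions."""

    def __init__(self, n_state, n_action, n_hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_action)

    def forward(self, x):
        # Softmax output so REINFORCE can sample actions and weight
        # log-probabilities by the (baseline-corrected) return.
        return F.softmax(self.fc2(F.relu(self.fc1(x))), dim=-1)


class ValueNetwork(nn.Module):
    """Sketch: state-value baseline V(s) used to reduce gradient variance."""

    def __init__(self, n_state, n_hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))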
train_dqn.py  View file @ 168e0fcb

 import argparse
-import math
 import statistics
 import matplotlib.pyplot as plt
 import torch
 ...
@@ -21,7 +20,7 @@ plt.rcParams['agg.path.chunksize'] = 10000
 parser = argparse.ArgumentParser(description='DQN')
 parser.add_argument('--save_path', type=str, required=True, help='save path of results')
-parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden neurons (default: 12)')
+parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
 parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
 parser.add_argument('--seed', type=int, default=None, help='random seed')
 parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
 ...
@@ -33,7 +32,7 @@ USE_CUDA = torch.cuda.is_available()
 device = torch.device('cuda' if USE_CUDA else 'cpu')
-OUT_PATH = os.path.join('results', args.save_path)
+OUT_PATH = os.path.join('results/dqn', args.save_path)
 LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
 clear_folder(OUT_PATH)
 ...
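Note: train_dqn.py selects its behaviour policy via --policy ('epsilon_greedy' or 'boltzmann'), and train_reinforce.py below calls dqn.eps_greedy_policy(state, n_action, epsilon) and dqn.boltzmann_policy(state, n_action, tau). Those methods are not part of this diff; the helpers below are only a hedged illustration of the two action-selection rules (standalone function names and a 1-D Q-value tensor are assumptions, not the project's API).

import random

import torch


def eps_greedy_action(q_values, n_action, epsilon):
    # Explore with probability epsilon, otherwise act greedily on the Q-values.
    if random.random() < epsilon:
        return random.randint(0, n_action - 1)
    return torch.argmax(q_values).item()


def boltzmann_action(q_values, tau):
    # Sample from a softmax over Q-values; the temperature tau controls how
    # close the distribution is to greedy (small tau -> near-greedy).
    probs = torch.softmax(q_values / tau, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()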
train_reinforce.py  0 → 100644  View file @ 168e0fcb
import argparse
import statistics
import matplotlib.pyplot as plt
import torch
import torch.backends.cudnn as cudnn
from timeit import default_timer as timer
from numpy import load
from algorithms.reinforce import PolicyNetwork, ValueNetwork
from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.reinforce_params import *
from utils.utils import *
from environment.wendtris import Wendtris_Eva

# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000

parser = argparse.ArgumentParser(description='DQN')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()

# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')

OUT_PATH = os.path.join('results/reinforce', args.save_path)
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = StdOut(LOG_FILE)

print('#################Reinforce with Baseline#################')
print(f"PyTorch version {torch.__version__}")
print(f'Training device: {device}')
if USE_CUDA:
    print(f"CUDA version: {torch.version.cuda}")
    cudnn.benchmark = True
print()

print('#################Hyper Parameter Settings#################')
print('#################Policy Net##################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate: {args.lr}')
print(f'Discount factor: {gamma}')
print()

# Initialize DQN network
dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
if USE_CUDA:
    dqn = dqn.to(device)
print('######################DQN architecture#####################')
print(dqn)
print()
print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
print()

seed = args.seed
if seed is None:
    seed = np.random.randint(1, 10000)
print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
    torch.cuda.manual_seed(seed)
env.seed(seed)

n_episode = args.n_episode  # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
if args.policy == 'epsilon_greedy':
    epsilon_value = []
else:
    tau_value = []
losses = []

start_time = timer()

#############################
# Training
#############################
for episode in range(n_episode):
    state = env.reset()
    is_done = False
    if args.policy == 'epsilon_greedy':
        if epsilon_decay:
            epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
        epsilon_value.append(epsilon)
    else:
        if tau_decay:
            tau = anneal_tau(episode, 0.001, 100)
        tau_value.append(tau)
    while not is_done:
        if args.policy == 'epsilon_greedy':
            action = dqn.eps_greedy_policy(state, n_action, epsilon)
        else:
            action = dqn.boltzmann_policy(state, n_action, tau)
        next_state, reward, is_done, info = env.step(action)
        total_reward_episode[episode] += reward
        replay_buffer.append((state, action, next_state, reward, is_done))
        if is_done:
            num_no_capacity.append(info['Number no capacity'])
            accepted_orders.append(info['Accepted orders'])
            break
        loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
        losses.append(loss)
        state = next_state
    if args.policy == 'epsilon_greedy':
        print(f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
              f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
    else:
        print(f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
              f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')

print(f"Training time for {n_episode} episodes: {timer() - start_time}")

# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episode)))

#############################
# Plot of the training model
#############################
# Cumulative Average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i + 1]) for i in range(len(total_reward_episode))]
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Episode rewards over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()

# Plot epsilon and tau
if args.policy == 'epsilon_greedy':
    plt.plot(epsilon_value)
    plt.title('Epsilon over time')
    plt.xlabel('Episode')
    plt.ylabel('Epsilon')
    plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
    plt.close()
else:
    plt.plot(tau_value)
    plt.title('Tau value over time')
    plt.xlabel('Episode')
    plt.ylabel('Tau')
    plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
    plt.close()

# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i + 1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()

# Plot loss
loss = plt.plot(losses, label='Loss')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()

#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
total_reward_episode_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')

print('##########################Evaluation##########################')
for ep in range(test_orders.shape[0]):
    env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
    state = env_eva.state
    is_done = False
    while not is_done:
        # Always take the best action
        action = torch.argmax(dqn.predict(state)).item()
        next_state, reward, is_done, info = env_eva.step(action)
        total_reward_episode_eva[ep] += reward
        if is_done:
            num_no_capacity_eva.append(info['Number no capacity'])
            accepted_orders_eva.append(info['Accepted orders'])
            break
        state = next_state
    print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
          f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')

# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
if args.policy == 'epsilon_greedy':
    save_list(epsilon_value, EVA_FILE, 'epsilon_value')
else:
    save_list(tau_value, EVA_FILE, 'tau_value')

# Load optimal solution
optimal_rewards = load('dp/results.npy')
optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolist()

# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
eva_avg_rewards = statistics.mean(total_reward_episode_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")

# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i + 1]) for i in range(len(total_reward_episode_eva))]
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i + 1]) for i in range(len(optimal_rewards))]
rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
plt.title('Episode rewards over time (Evaluation)')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()

# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results = optimal_results.flatten()

# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.close()
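Note: the training loop above also relies on stretched_exponential_decay(episode, n_episode, 0.1, 0.1, 0.1) and anneal_tau(episode, 0.001, 100), which come in through the star imports and are not shown in this commit. The sketch below is only a plausible reading of such annealing schedules under those assumed signatures; the exact formulas used by the project may differ.

import math


def stretched_exponential_decay(episode, n_episode, A=0.1, B=0.1, C=0.1):
    # One common "stretched exponential" epsilon schedule: epsilon stays near 1
    # for roughly the first A*n_episode episodes, then falls smoothly towards ~0
    # by the end of training.
    standardized_time = (episode - A * n_episode) / (B * n_episode)
    cosh = math.cosh(math.exp(-standardized_time))
    return 1.1 - (1.0 / cosh + episode * C / n_episode)


def anneal_tau(episode, rate=0.001, start=100):
    # Exponentially anneal the Boltzmann temperature from `start` towards zero
    # as episodes accumulate, making the softmax policy increasingly greedy.
    return start * math.exp(-rate * episode)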