lli / YM-Seminar · Commits

Commit f276e3dc, authored Apr 21, 2021 by lli
add tau anneal
Parent: 37e5aee0
Changes: 5 files
algorithms/reinforce.py (new file, mode 100644)
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')


# Define the policy net
class PolicyNetwork(nn.Module):
    def __init__(self, n_state, n_action, n_hidden, lr):
        '''
        Initialize the policy neural network:
        Input: a state, followed by two hidden layers
        Output: the probability of taking each possible action,
        using a softmax activation on the output layer
        '''
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_action),
            nn.Softmax(dim=-1),
        ).to(device)  # keep the parameters on the same device as the input tensors
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr)

    def predict(self, state):
        # Compute the action probabilities of state s with the policy network
        action_probs = self.network(torch.tensor(state, dtype=torch.float32, device=device))
        return action_probs

    def update(self, advantages, log_probs):
        '''
        Update the network parameters:
        given the data gathered in an episode (the advantages, i.e. the
        cumulative rewards minus the baseline, and the log probabilities
        of all steps), compute the policy gradient, then update the policy
        parameters via backpropagation
        '''
        policy_gradient = []
        for log_prob, Gt in zip(log_probs, advantages):
            policy_gradient.append(-log_prob * Gt)
        # Stack the per-step 0-dim terms and sum them; the loss is already on
        # `device`, so no explicit .cuda() call is needed (and would fail on CPU)
        loss = torch.stack(policy_gradient).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def get_action(self, state):
        probs = self.predict(state)
        # Sample an action from the predicted probability distribution
        action = torch.multinomial(probs, 1).item()
        log_prob = torch.log(probs[action])
        return action, log_prob


# Define the value net
class ValueNetwork(nn.Module):
    '''
    Use a regression neural network to approximate state values (the baseline)
    '''
    def __init__(self, n_state, n_hidden, lr=0.01):
        super(ValueNetwork, self).__init__()
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1)
        ).to(device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, state, y):
        y_pred = self.model(torch.tensor(state, dtype=torch.float32, device=device))
        loss = self.criterion(y_pred, torch.tensor(y, dtype=torch.float32, device=device))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, state):
        with torch.no_grad():
            return self.model(torch.tensor(state, dtype=torch.float32, device=device))
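The file above only defines the two networks. For context, here is a minimal usage sketch (not part of this commit) of how PolicyNetwork and ValueNetwork could be combined for REINFORCE with a baseline. It assumes a gym-style environment whose step() returns (next_state, reward, is_done, info), as in train_dqn.py; the run_episode helper and the reset() call are illustrative assumptions, not code from the repository.

# Hypothetical sketch, not part of this commit: REINFORCE with a value baseline.
import numpy as np
from algorithms.reinforce import PolicyNetwork, ValueNetwork

def run_episode(env, policy_net, value_net, gamma=1.0):
    state = env.reset()                      # assumed gym-style reset()
    states, rewards, log_probs = [], [], []
    is_done = False
    while not is_done:
        action, log_prob = policy_net.get_action(state)
        next_state, reward, is_done, info = env.step(action)
        states.append(state)
        rewards.append(reward)
        log_probs.append(log_prob)
        state = next_state

    # Discounted returns G_t, computed backwards through the episode
    returns, Gt = [], 0.0
    for r in reversed(rewards):
        Gt = r + gamma * Gt
        returns.insert(0, Gt)

    # Advantage of each step: return minus the predicted state-value baseline
    baselines = [value_net.predict(s).item() for s in states]
    advantages = [G - b for G, b in zip(returns, baselines)]

    value_net.update(np.array(states), np.array(returns).reshape(-1, 1))  # fit the baseline
    policy_net.update(advantages, log_probs)                              # policy-gradient step
    return sum(rewards)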
params/dqn_params.py
@@ -16,7 +16,8 @@ n_action = env.action_space.n  # Number of output
 #lr = 0.001  # Learning rate
 gamma = 1  # Discount factor
-epsilon_decay = True  # epsilon decay rate
+epsilon_decay = True  # Using epsilon decay
+tau_decay = True  # Using Boltzmann exploration, decay temperature
 replay_buffer = deque(maxlen=10000)  # Size of replay buffer
 replay_batch_size = 64  # Size of replay batch
params/reinforce_params.py (new file, mode 100644)
from environment.wendtris import Wendtris

#############################
# Initialize the environment, with penalty factor 2
#############################
env = Wendtris(20, 6, 6, 2)

#############################
# Model Params (fixed)
#############################
n_state = len(env.state)        # Number of inputs (state dimension)
n_action = env.action_space.n   # Number of outputs (actions)
gamma = 1                       # Discount factor
train_dqn.py
@@ -86,7 +86,10 @@ total_reward_episode = [0] * n_episode
 num_no_capacity = []
 accepted_orders = []
-epsilon_value = []
+if args.policy == 'epsilon_greedy':
+    epsilon_value = []
+else:
+    tau_value = []
 losses = []
 start_time = timer()
@@ -101,12 +104,16 @@ for episode in range(n_episode):
     if epsilon_decay:
         epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
         epsilon_value.append(epsilon)
+    else:
+        if tau_decay:
+            tau = anneal_tau(episode, 0.001, 100)
+            tau_value.append(tau)
     while not is_done:
         if args.policy == 'epsilon_greedy':
             action = dqn.eps_greedy_policy(state, n_action, epsilon)
         else:
-            action = dqn.boltzmann_policy(state, n_action, 0.5)
+            action = dqn.boltzmann_policy(state, n_action, tau)
         next_state, reward, is_done, info = env.step(action)
         total_reward_episode[episode] += reward
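dqn.boltzmann_policy itself is not changed by this commit; the commit only replaces the fixed temperature 0.5 with the annealed tau. To illustrate why that matters, the sketch below shows what a typical Boltzmann (softmax) action selection with temperature tau looks like. The function name, signature, and body are illustrative assumptions, not the repository's implementation.

# Illustrative sketch only; the repository's dqn.boltzmann_policy may differ.
import numpy as np

def boltzmann_policy_sketch(q_values, tau):
    """Sample an action with probability proportional to exp(Q(s, a) / tau)."""
    q = np.asarray(q_values, dtype=np.float64)
    logits = (q - q.max()) / tau                   # subtract the max for numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return int(np.random.choice(len(q), p=probs))

# A large tau makes the distribution nearly uniform (more exploration); a small
# tau concentrates probability on the highest-Q action (more exploitation).
# The anneal_tau schedule added in utils/utils.py decays tau toward a floor of 1.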
@@ -142,8 +149,8 @@ torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episod
 smoothed_rewards = [np.mean(total_reward_episode[:i + 1]) for i in range(len(total_reward_episode))]
 rewards, = plt.plot(total_reward_episode, label='Rewards')
 avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
-plt.title('episode rewards over time')
-plt.xlabel('episode')
+plt.title('Episode rewards over time')
+plt.xlabel('Episode')
 plt.ylabel('Total reward')
 plt.legend(handles=[rewards, avg_rewards], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
@@ -153,17 +160,24 @@ plt.close()
 if args.policy == 'epsilon_greedy':
     plt.plot(epsilon_value)
     plt.title('Epsilon over time')
-    plt.xlabel('episode')
+    plt.xlabel('Episode')
     plt.ylabel('Epsilon')
     plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
     plt.close()
+else:
+    plt.plot(tau_value)
+    plt.title('Tau value over time')
+    plt.xlabel('Episode')
+    plt.ylabel('Tau')
+    plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
+    plt.close()

 # Plot number of penalties
 num_no_capacity_smoothed = [np.mean(num_no_capacity[:i + 1]) for i in range(len(num_no_capacity))]
 num_penalty, = plt.plot(num_no_capacity, label='Penalties')
 avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
 plt.title('Number of penalties')
-plt.xlabel('episode')
+plt.xlabel('Episode')
 plt.ylabel('Number of penalties')
 plt.legend(handles=[num_penalty, avg_penalty], loc='best')
 plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
utils/utils.py
@@ -86,6 +86,14 @@ def stretched_exponential_decay(episode, n_episode, a=0.2, b=0.1, c=0.1):
     return epsilon


+def anneal_tau(time, decay_rate, tau):
+    """
+    Decay tau value over time
+    :rtype: float
+    """
+    return np.exp(-decay_rate * time) * tau + 1
+
+
 class StdOut(object):
     """Redirect stdout to file, and print to console as well.
     """