Commit f276e3dc authored by lli

add tau anneal

parent 37e5aee0
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
# Define the policy net
class PolicyNetwork(nn.Module):
    def __init__(self, n_state, n_action, n_hidden, lr):
        '''
        Initialize the policy neural network:
        two hidden layers with ReLU activations
        Input: a state vector
        Output: the probabilities of taking the possible individual actions,
        using a softmax activation on the output layer
        '''
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_action),
            nn.Softmax(dim=-1),
        ).to(device)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr)

    def predict(self, state):
        # Compute the action probabilities of state s with the policy network
        action_probs = self.network(torch.tensor(state, dtype=torch.float32, device=device))
        return action_probs
    def update(self, advantages, log_probs):
        '''
        Update the network parameters:
        given the data gathered in an episode (the advantages and the
        log probabilities of all steps), compute the policy gradient,
        then update the policy parameters accordingly via backpropagation
        '''
        # advantages --> cumulative rewards (returns) minus the value baseline for each step
        policy_gradient = []
        for log_prob, Gt in zip(log_probs, advantages):
            policy_gradient.append(-log_prob * Gt)
        # log_probs are 0-dim tensors, so stack (not cat) them before summing
        loss = torch.stack(policy_gradient).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def get_action(self, state):
        probs = self.predict(state)
        # Sample an action from the predicted probability distribution
        action = torch.multinomial(probs, 1).item()
        log_prob = torch.log(probs[action])
        return action, log_prob
# Define the value net
class ValueNetwork(nn.Module):
    '''
    Use a regression neural network to approximate state values
    '''
    def __init__(self, n_state, n_hidden, lr=0.01):
        super(ValueNetwork, self).__init__()
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1)
        ).to(device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, state, y):
        # Regress the predicted state values towards the observed targets y
        y_pred = self.model(torch.tensor(state, dtype=torch.float32, device=device))
        loss = self.criterion(y_pred, torch.tensor(y, dtype=torch.float32, device=device))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, state):
        # Estimate the state value without tracking gradients
        with torch.no_grad():
            return self.model(torch.tensor(state, dtype=torch.float32, device=device))
\ No newline at end of file
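For orientation, here is a minimal sketch (not part of this commit) of how PolicyNetwork and ValueNetwork are typically combined in a REINFORCE-with-baseline loop. The CartPole environment, hidden size, learning rates, episode count, and the old Gym reset/step API are illustrative assumptions; the 4-tuple step signature matches the usage visible elsewhere in this diff, but nothing below is confirmed by the repository.

# Sketch only: assumes the PolicyNetwork/ValueNetwork definitions above are in scope.
import gym

env = gym.make('CartPole-v1')          # assumed environment, for illustration
n_state = env.observation_space.shape[0]
n_action = env.action_space.n

policy_net = PolicyNetwork(n_state, n_action, n_hidden=64, lr=0.001)
value_net = ValueNetwork(n_state, n_hidden=64, lr=0.01)
gamma = 0.99

for episode in range(500):
    state = env.reset()
    log_probs, states, rewards = [], [], []
    is_done = False
    while not is_done:
        action, log_prob = policy_net.get_action(state)
        next_state, reward, is_done, info = env.step(action)
        log_probs.append(log_prob)
        states.append(state)
        rewards.append(reward)
        state = next_state
    # Discounted return G_t for every step of the episode
    returns, Gt = [], 0.0
    for r in reversed(rewards):
        Gt = r + gamma * Gt
        returns.insert(0, Gt)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    # Advantage = return minus the value baseline (no gradient flows through the baseline)
    baselines = torch.stack([value_net.predict(s).squeeze() for s in states])
    advantages = returns - baselines
    # Fit the baseline to the observed returns, then update the policy
    value_net.update(np.array(states), returns.unsqueeze(1).cpu().numpy())
    policy_net.update(advantages, log_probs)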
@@ -16,7 +16,8 @@ n_action = env.action_space.n # Number of output
#lr = 0.001 # Learning rate
gamma = 1 # Discount factor
-epsilon_decay = True # epsilon decay rate
+epsilon_decay = True # Using epsilon decay
+tau_decay = True # Using Boltzmann exploration with a decaying temperature
replay_buffer = deque(maxlen=10000) # Size of replay buffer
replay_batch_size = 64 # Size of replay batch
from environment.wendtris import Wendtris
#############################
# Initialize the environment, with penalty factor 2
#############################
env = Wendtris(20, 6, 6, 2)
#############################
# Model Params (fixed)
#############################
n_state = len(env.state) # Number of input
n_action = env.action_space.n # Number of output
gamma = 1 # Discount factor
@@ -86,7 +86,10 @@ total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
-epsilon_value = []
+if args.policy == 'epsilon_greedy':
+    epsilon_value = []
+else:
+    tau_value = []
losses = []
start_time = timer()
@@ -101,12 +104,16 @@ for episode in range(n_episode):
    if epsilon_decay:
        epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
        epsilon_value.append(epsilon)
+    else:
+        if tau_decay:
+            tau = anneal_tau(episode, 0.001, 100)
+            tau_value.append(tau)
    while not is_done:
        if args.policy == 'epsilon_greedy':
            action = dqn.eps_greedy_policy(state, n_action, epsilon)
        else:
-            action = dqn.boltzmann_policy(state, n_action, 0.5)
+            action = dqn.boltzmann_policy(state, n_action, tau)
        next_state, reward, is_done, info = env.step(action)
        total_reward_episode[episode] += reward
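dqn.boltzmann_policy itself is not shown in this diff. As a rough sketch, a Boltzmann (softmax) exploration policy with temperature tau usually looks like the following; the q_net.predict(state) call returning a 1-D tensor of Q-values is an assumption for illustration, not something this commit confirms.

import torch

def boltzmann_policy_sketch(q_net, state, n_action, tau):
    # Higher tau -> action probabilities closer to uniform (more exploration);
    # as tau shrinks, the choice concentrates on the highest-valued action.
    # n_action is kept only to mirror the call signature used above.
    q_values = q_net.predict(state)                # assumed shape: (n_action,)
    probs = torch.softmax(q_values / tau, dim=-1)  # temperature-scaled softmax
    return torch.multinomial(probs, 1).item()      # sample one action index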
@@ -142,8 +149,8 @@ torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episod
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
-plt.title('episode rewards over time')
-plt.xlabel('episode')
+plt.title('Episode rewards over time')
+plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
@@ -153,17 +160,24 @@ plt.close()
if args.policy == 'epsilon_greedy':
    plt.plot(epsilon_value)
    plt.title('Epsilon over time')
-    plt.xlabel('episode')
+    plt.xlabel('Episode')
    plt.ylabel('Epsilon')
    plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
    plt.close()
+else:
+    plt.plot(tau_value)
+    plt.title('Tau value over time')
+    plt.xlabel('Episode')
+    plt.ylabel('Tau')
+    plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
+    plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label= 'Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label = 'Average penalties')
plt.title('Number of penalties')
-plt.xlabel('episode')
+plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
......
@@ -86,6 +86,14 @@ def stretched_exponential_decay(episode, n_episode, a=0.2, b=0.1, c=0.1):
    return epsilon
+
+def anneal_tau(time, decay_rate, tau):
+    """
+    Decay tau value over time
+    :rtype: float
+    """
+    return np.exp(-decay_rate * time) * tau + 1
+
class StdOut(object):
    """Redirect stdout to file, and print to console as well.
    """
......
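With the arguments used in the training loop above, anneal_tau(episode, 0.001, 100), the temperature starts near 101 and decays exponentially toward 1; the trailing + 1 keeps it from ever reaching zero, so the Boltzmann policy stays stochastic even late in training. A small standalone illustration of the schedule (values rounded, assuming only NumPy):

import numpy as np

def anneal_tau(time, decay_rate, tau):
    return np.exp(-decay_rate * time) * tau + 1

for episode in [0, 1000, 5000, 10000]:
    print(episode, round(anneal_tau(episode, 0.001, 100), 3))
# Expected output (approximately):
# 0 101.0
# 1000 37.788
# 5000 1.674
# 10000 1.005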