Commit 0bbf1707 authored by lli

implemented reinforce

parent f2f41b14
......@@ -48,7 +48,7 @@ class PolicyNetwork(nn.Module):
for log_prob, Gt in zip(log_probs, advantages):
policy_gradient.append(-log_prob * Gt)
loss = torch.cat(policy_gradient).sum().cuda()
loss = torch.cat(policy_gradient).sum().to(device)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
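For orientation, the same policy-gradient step as a standalone helper; a minimal sketch, assuming `log_probs` holds the per-step log-probabilities collected during the episode and `advantages` is the matching tensor of G_t - V(s_t) values (the helper name is illustrative, not part of the commit):

```python
import torch

def reinforce_update(optimizer, log_probs, advantages):
    # REINFORCE with baseline: L = -sum_t log pi(a_t | s_t) * A_t,
    # with A_t = G_t - V(s_t) treated as a constant (detached so the
    # policy loss does not backpropagate into the value network).
    advantages = advantages.detach()
    loss = torch.stack([-lp * a for lp, a in zip(log_probs, advantages)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```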
......@@ -58,7 +58,6 @@ class PolicyNetwork(nn.Module):
# Sample an action from the action probability distribution
action = torch.multinomial(probs, 1).item()
log_prob = torch.log(probs[action])
return action, log_prob
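An equivalent way to draw the action and its log-probability in one step is `torch.distributions.Categorical`; a sketch assuming `probs` is the 1-D tensor of action probabilities computed just above:

```python
from torch.distributions import Categorical

def sample_action(probs):
    # probs: 1-D tensor of action probabilities from the softmax policy
    dist = Categorical(probs=probs)
    action = dist.sample()            # draws index i with probability probs[i]
    log_prob = dist.log_prob(action)  # log pi(a|s), differentiable w.r.t. the network
    return action.item(), log_prob
```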
......@@ -81,7 +80,8 @@ class ValueNetwork(nn.Module):
def update(self, state, y):
y_pred = self.model(torch.tensor(state, dtype=torch.float32, device=device))
loss = self.criterion(y_pred, torch.tensor(y, dtype=torch.float32, device=device))
y = y.detach().to(device)  # regression targets for the value net; no gradient tracking needed
loss = self.criterion(y_pred, y)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
......
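For context, a minimal baseline network consistent with the calls used in this file (`ValueNetwork(n_state, n_hidden, lr)`, `predict`, `update`); the layer sizes, activations and optimizer below are assumptions, and the actual implementation in the repository may differ:

```python
import numpy as np
import torch
import torch.nn as nn

class ValueNetworkSketch(nn.Module):
    """Assumed state-value baseline: state -> hidden -> scalar V(s)."""

    def __init__(self, n_state, n_hidden, lr):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),   # one value estimate per state
        )
        self.criterion = nn.MSELoss()          # assumed loss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)  # assumed optimizer

    def predict(self, states):
        # Baseline values are not differentiated through by the policy update.
        with torch.no_grad():
            return self.model(torch.as_tensor(np.asarray(states), dtype=torch.float32))

    def update(self, states, targets):
        preds = self.model(torch.as_tensor(np.asarray(states), dtype=torch.float32))
        loss = self.criterion(preds, targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
```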
......@@ -18,10 +18,11 @@ from environment.wendtris import Wendtris_Eva
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='DQN')
parser = argparse.ArgumentParser(description='Reinforce with baseline')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--lr_policy', type=float, default=0.001, help='learning rate policy net (default: 0.001)')
parser.add_argument('--lr_value', type=float, default=0.01, help='learning rate value net (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()
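A quick way to sanity-check the new arguments is to parse an explicit list; the values below are only examples, and this assumes no further required arguments are defined elsewhere in the script:

```python
# Illustrative invocation of the parser defined above.
args = parser.parse_args(['--save_path', 'results/reinforce', '--n_episode', '2000'])
print(args.lr_policy, args.lr_value, args.n_hidden)   # 0.001 0.01 128
```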
......@@ -45,25 +46,37 @@ if USE_CUDA:
cudnn.benchmark = True
print()
print('#################Hyper Parameter Settings#################')
print('#################Policy Net##################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate: {args.lr}')
print(f'Learning rate policy net: {args.lr_policy}')
print(f'Learning rate value net: {args.lr_value}')
print(f'Discount factor: {gamma}')
print()
# Initialize DQN network
dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
# Initialize the policy network
policy_net = PolicyNetwork(n_state, n_action, args.n_hidden, args.lr_policy)
# Initialize the value network
value_net = ValueNetwork(n_state, args.n_hidden, args.lr_value)
if USE_CUDA:
dqn = dqn.to(device)
policy_net = policy_net.to(device)
value_net = value_net.to(device)
print('######################Policy net architecture#####################')
print(policy_net)
print()
print(f'Total parameters: {sum(p.numel() for p in policy_net.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in policy_net.parameters() if p.requires_grad)}')
print()
print('######################DQN architecture#####################')
print(dqn)
print('######################Value net architecture#####################')
print(value_net)
print()
print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
print(f'Total parameters: {sum(p.numel() for p in value_net.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in value_net.parameters() if p.requires_grad)}')
print()
seed = args.seed
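The parameter counts printed above correspond to a small fully connected softmax policy; the exact layers are not part of this diff, so the following is only an assumed sketch of the `(n_state, n_action, n_hidden, lr)` interface:

```python
import torch
import torch.nn as nn

class PolicyNetworkSketch(nn.Module):
    """Assumed softmax policy: state -> hidden -> action probabilities."""

    def __init__(self, n_state, n_action, n_hidden, lr):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_state, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_action),
            nn.Softmax(dim=-1),   # probabilities over the action set
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)  # optimizer choice is an assumption

    def forward(self, state):
        return self.model(state)
```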
......@@ -80,61 +93,72 @@ total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
if args.policy == 'epsilon_greedy':
epsilon_value = []
else:
tau_value = []
losses = []
start_time = timer()
#############################
# Training
#############################
for episode in range(n_episode):
log_probs = []
states = []
rewards = []
selected_action = []
state = env.reset()
is_done = False
if args.policy == 'epsilon_greedy':
if epsilon_decay:
epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
epsilon_value.append(epsilon)
else:
if tau_decay:
tau = anneal_tau(episode, 0.001, 100)
tau_value.append(tau)
while not is_done:
if args.policy == 'epsilon_greedy':
action = dqn.eps_greedy_policy(state, n_action, epsilon)
else:
action = dqn.boltzmann_policy(state, n_action, tau)
while True:
states.append(state)
action, log_prob = policy_net.get_action(state)
next_state, reward, is_done, info = env.step(action)
total_reward_episode[episode] += reward
replay_buffer.append((state, action, next_state, reward, is_done))
total_reward_episode[episode] += reward
log_probs.append(log_prob)
rewards.append(reward)
selected_action.append(action)
if is_done:
# Penalties and order position of accepted orders
num_no_capacity.append(info['Number no capacity'])
accepted_orders.append(info['Accepted orders'])
# Calculate discounted rewards
returns = []
Gt = 0
for t in range(len(states) - 1, -1, -1):
Gt = rewards[t] + gamma * Gt
returns.append(Gt)
returns = returns[::-1]
returns = torch.tensor(returns, dtype=torch.float32, device=device)
returns = (returns - returns.mean()) / (returns.std() + 1e-9)
baseline_values = value_net.predict(states)
# Adjust returns to the same shape as baseline_values
returns = torch.reshape(returns, (baseline_values.shape))
advantages = (returns - baseline_values)
value_net.update(states, returns)
# Update nn based on discounted rewards and log_probs
policy_net.update(advantages, log_probs)
print(f'Episode: {episode}, total reward: {total_reward_episode[episode]}, '
f'number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
# print('Episode: {}, selected action: {}'.format(episode, selected_action))
break
loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
losses.append(loss)
state = next_state
if args.policy == 'epsilon_greedy':
print(
f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
else:
print(
f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episode)))
torch.save(policy_net.state_dict(), os.path.join(OUT_PATH, 'policy_{}.pk1'.format(n_episode)))
torch.save(value_net.state_dict(), os.path.join(OUT_PATH, 'value_{}.pk1'.format(n_episode)))
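The per-episode return and advantage computation inside the loop above can be read as one small helper; a sketch that mirrors the code (the function name and the `eps` constant are illustrative):

```python
import torch

def returns_and_advantages(rewards, baseline_values, gamma, eps=1e-9):
    # Discounted returns G_t = r_t + gamma * G_{t+1}, computed backwards,
    # then normalized and compared against the learned baseline V(s_t).
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns = torch.tensor(returns[::-1], dtype=torch.float32, device=baseline_values.device)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    returns = returns.reshape(baseline_values.shape)
    return returns, returns - baseline_values
```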
#############################
# Plot of the training model
......@@ -150,22 +174,6 @@ plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot epsilon and tau
if args.policy == 'epsilon_greedy':
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
else:
plt.plot(tau_value)
plt.title('Tau value over time')
plt.xlabel('Episode')
plt.ylabel('Tau')
plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label= 'Penalties')
......@@ -177,14 +185,6 @@ plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot loss
loss= plt.plot(losses, label= 'Loss')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
......@@ -205,7 +205,7 @@ for ep in range(test_orders.shape[0]):
while not is_done:
# Always take the best action
action = torch.argmax(dqn.predict(state)).item()
action = policy_net.get_action(state)[0]
next_state, reward, is_done, info = env_eva.step(action)
total_reward_episode_eva[ep] += reward
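`get_action` samples from the softmax distribution, so this evaluation is stochastic; if a strictly greedy evaluation is preferred, the argmax of the action probabilities can be taken instead. A sketch, assuming the policy's forward pass returns the probability vector (the real interface is not shown in this diff):

```python
import torch

def greedy_action(policy_net, state, device):
    # Deterministic evaluation: pick the most probable action instead of sampling.
    with torch.no_grad():
        probs = policy_net(torch.as_tensor(state, dtype=torch.float32, device=device))
        return torch.argmax(probs).item()
```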
......@@ -225,10 +225,6 @@ save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
if args.policy == 'epsilon_greedy':
save_list(epsilon_value, EVA_FILE, 'epsilon_value')
else:
save_list(tau_value, EVA_FILE, 'tau_value')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
......
......@@ -4,6 +4,7 @@ import shutil
import sys
import math
import pickle
import torch
import numpy as np
......