Commit 0a39152b authored by lli

implemented actor critic

parent 66c87674
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
class ActorCritic(nn.Module):
def __init__(self, n_state, n_hidden, n_action):
"""
Actor network, takes the input state and outputs the action probabilities
It learns the optimal policy by updating the model using information provided by the critic
Critic network evaluates how good it is to be at the input state by computing the value function.
The value guides the actor on how it should adjust.
:param n_state: number of state of the environment
:param n_hidden: [list] number of hidden neurons in hidden layer
:param n_action: number of neurons of output layer
:param n_action: number of actions
"""
        super(ActorCritic, self).__init__()
self.n_state = n_state
self.n_action = n_action
self.input = nn.Linear(self.n_state, n_hidden[0])
self.hidden = nn.ModuleList()
for k in range(len(n_hidden) - 1):
self.hidden.append(nn.Linear(n_hidden[k], n_hidden[k+1]))
        self.actor = nn.Linear(n_hidden[-1], self.n_action)
        self.critic = nn.Linear(n_hidden[-1], 1)
    def forward(self, state):
        output = F.relu(self.input(state))
        for m in self.hidden:
            # Apply the non-linearity after every hidden layer
            output = F.relu(m(output))
        # Two heads on the shared features: action probabilities and state value
        action_probs = F.softmax(self.actor(output), dim=-1)
        state_values = self.critic(output)
        return action_probs, state_values
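# Shape note: for a single state vector of shape (n_state,), forward() returns action
# probabilities of shape (n_action,) summing to one and a state value of shape (1,);
# for a batch of shape (B, n_state) the heads return (B, n_action) and (B, 1).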
class PolicyNetwork(nn.Module):
def __init__(self, n_state, n_hidden, n_action, lr=0.001):
super(PolicyNetwork, self).__init__()
self.model = ActorCritic(n_state, n_hidden, n_action)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
def predict(self, state):
# Compute the output using the Actor Critic model
# Return action probabilities, state_value
return self.model(torch.tensor(state, dtype=torch.float32, device=device))
def update(self, returns, log_probs, state_values):
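        # A2C update: the advantage Gt - V(s) weights the policy gradient term
        # -log_prob * advantage, while the critic is regressed towards the
        # discounted return Gt with a smooth L1 loss; both terms are summed over
        # the episode and optimized jointly.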
loss = 0
for log_prob, value, Gt in zip(log_probs, state_values, returns):
advantage = Gt - value.item()
#advantage = (advantage - advantage.mean()) / advantage.std()
policy_loss = (-log_prob * advantage)
Gt = torch.unsqueeze(Gt, 0)
value_loss = F.smooth_l1_loss(value, Gt)
loss += policy_loss + value_loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# self.scheduler.step()
def get_action(self, state):
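        # Sample an action from the predicted probabilities and return it together
        # with its log-probability and the critic's value estimate for this state.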
action_probs, state_value = self.predict(state)
action = torch.multinomial(action_probs, 1).item()
log_prob = torch.log(action_probs[action])
return action, log_prob, state_value
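# Minimal usage sketch (illustrative only): the state/action sizes below are assumed
# toy values; the real ones come from the environment parameters in the training script.
if __name__ == '__main__':
    n_state, n_hidden, n_action = 4, [64, 32], 2       # assumed toy dimensions
    net = PolicyNetwork(n_state, n_hidden, n_action, lr=0.001)
    net = net.to(device)                               # match the device used in predict()
    dummy_state = [0.0] * n_state                      # stand-in for an environment observation
    action, log_prob, state_value = net.get_action(dummy_state)
    print(f'action={action}, log_prob={log_prob.item():.4f}, value={state_value.item():.4f}')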
import argparse
import statistics
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from timeit import default_timer as timer
from numpy import load
from algorithms.a2c import PolicyNetwork
from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.a2c_params import *
from utils.utils import *
from torch.utils.tensorboard import SummaryWriter
from environment.wendtris import Wendtris_Eva
# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='A2C')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', nargs='+', type=int, help='number of hidden neurons')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate (default: 0.001)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
# Define tensorboard writer path
# comment = f' policy_lr={args.lr_actor} value_lr={args.lr_critic} n_episode={args.n_episode}'
# writer = SummaryWriter('runs/a2c', comment=comment, filename_suffix=str(args.lr_policy) + str(args.n_episode))
OUT_PATH = os.path.join('results/a2c', args.save_path)
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = StdOut(LOG_FILE)
print('#################A2C#################')
print(f"PyTorch version {torch.__version__}")
print(f'Training device: {device}')
if USE_CUDA:
print(f"CUDA version: {torch.version.cuda}")
cudnn.benchmark = True
print()
print('#################Hyper Parameter Settings#################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate : {args.lr}')
print(f'Discount factor: {gamma}')
print()
# Initialize the policy net
policy_net = PolicyNetwork(n_state, args.n_hidden, n_action, args.lr)
if USE_CUDA:
policy_net = policy_net.to(device)
print('######################Actor Critic architecture#####################')
print(policy_net)
print()
print(f'Total parameters: {sum(p.numel() for p in policy_net.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in policy_net.parameters() if p.requires_grad)}')
print(f'Actor-critic net is on GPU: {next(policy_net.parameters()).is_cuda}')
print()
seed = args.seed
if seed is None:
seed = np.random.randint(1, 10000)
print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
n_episode = args.n_episode # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
start_time = timer()
#############################
# Training
#############################
for episode in range(n_episode):
log_probs = []
rewards = []
state_values = []
selected_action = []
state = env.reset()
while True:
action, log_prob, state_value = policy_net.get_action(state)
next_state, reward, is_done, info = env.step(action)
total_reward_episode[episode] += reward
log_probs.append(log_prob)
state_values.append(state_value)
rewards.append(reward)
if is_done:
            # Record the number of capacity penalties and the accepted orders
num_no_capacity.append(info['Number no capacity'])
accepted_orders.append(info['Accepted orders'])
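            # Compute the discounted return Gt of every step by accumulating the
            # rewards backwards through the episode.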
returns = []
Gt = 0
pw = 0
for reward in rewards[::-1]:
Gt += gamma ** pw * reward
pw += 1
returns.append(Gt)
returns = returns[::-1]
returns = torch.tensor(returns, dtype=torch.float, device=device)
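            # Normalize the returns to zero mean and unit variance to stabilize the update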
returns = (returns - returns.mean()) / (returns.std() + 1e-9)
policy_net.update(returns, log_probs, state_values)
print(f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
break
state = next_state
# save the model parameters
torch.save(policy_net.state_dict(), os.path.join(OUT_PATH, 'policy_net{}.pk1'.format(n_episode)))
#############################
# Plot of the training model
#############################
# Cumulative Average reward received over time
smoothed_rewards = sliding_window(total_reward_episode, len(total_reward_episode))
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Episode rewards over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot number of penalties
num_no_capacity_smoothed = sliding_window(num_no_capacity, len(num_no_capacity))
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
total_reward_episode_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')
print('##########################Evaluation##########################')
for ep in range(test_orders.shape[0]):
env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
state = env_eva.state
is_done = False
while not is_done:
        # Select an action using the trained policy (sampled from the action probabilities)
action = policy_net.get_action(state)[0]
next_state, reward, is_done, info = env_eva.step(action)
total_reward_episode_eva[ep] += reward
if is_done:
num_no_capacity_eva.append(info['Number no capacity'])
accepted_orders_eva.append(info['Accepted orders'])
break
state = next_state
print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolist()
# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
eva_avg_rewards = statistics.mean(total_reward_episode_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = sliding_window(total_reward_episode_eva, len(total_reward_episode_eva))
smoothed_optimal_rewards = sliding_window(optimal_rewards, len(optimal_rewards))
rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
plt.title('Episode rewards over time (Evaluation)')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True,
bbox_inches='tight')
plt.close()
# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results = optimal_results.flatten()
# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.close()
@@ -157,8 +157,6 @@ for episode in range(n_episode):
v_running_loss += v_loss.item()
v_losses.append(v_loss.item())
v_running_losses.append(v_running_loss)
if policy_lr_schedule:
lr.append(policy_net.optimizer.param_groups[0]['lr'])
# Update nn based on discounted rewards and log_probs
p_loss = policy_net.update(advantages, log_probs)
p_losses.append(p_loss.item())
@@ -172,12 +170,8 @@ for episode in range(n_episode):
writer.add_histogram(name + '_grad', param.grad, episode)
writer.add_histogram(name + '_data', param, episode)
if policy_lr_schedule:
print(
f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}, policy loss: {p_loss.item()}, value loss: {v_loss.item()}, policy learning rate: {lr[episode]}')
else:
print(
f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}, policy loss: {p_loss.item()}, value loss: {v_loss.item()}')
print(f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}, policy loss: {p_loss.item()}, value loss: {v_loss.item()}')
break
state = next_state