Commit 6676088d authored by lli's avatar lli

update

parent e06d8eef
......@@ -3,4 +3,5 @@
__pycache__/
*.pyc
results/
slurm/
\ No newline at end of file
slurm/
runs/
\ No newline at end of file
......@@ -86,7 +86,7 @@ class DQN(nn.Module):
loss.backward()
self.optimizer.step()
# self.scheduler.step()
return loss.item()
return loss
def replay(self, replay_buffer, replay_batch_size, gamma):
"""
......
......@@ -61,6 +61,7 @@ class PolicyNetwork(nn.Module):
self.optimizer.step()
if self.lr_schedule:
self.scheduler.step()
return loss
def get_action(self, state):
probs = self.predict(state)
......@@ -96,6 +97,7 @@ class ValueNetwork(nn.Module):
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
return loss
def predict(self, state):
with torch.no_grad():
......
import os
from itertools import product
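# Hyperparameter grid: every combination of policy and value learning rates is swept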
parameters = dict(
policy_lr = [0.1, 0.01],
value_lr = [0.1, 0.01]
)
param_values = [v for v in parameters.values()]
print(param_values)
for policy_lr, value_lr in product(*param_values):
print(policy_lr, value_lr)
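# Launch one train_reinforce.py run per hyperparameter combination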
for run_id, (policy_lr, value_lr) in enumerate(product(*param_values)):
print('Run id: ', run_id + 1)
print('Policy learning rate: ', policy_lr)
print('Value learning rate: ', value_lr)
os.system(f"python train_reinforce.py --save_path {run_id + 1} --n_hidden 128 --lr_policy {policy_lr} --lr_value {value_lr} --n_episode 200 ")
......@@ -137,6 +137,7 @@ for episode in range(n_episode):
f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
......@@ -231,6 +232,7 @@ save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
save_list(losses, EVA_FILE, 'loss')
if args.policy == 'epsilon_greedy':
save_list(epsilon_value, EVA_FILE, 'epsilon_value')
else:
......
......@@ -11,13 +11,13 @@ from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.reinforce_params import *
from utils.utils import *
from torch.utils.tensorboard import SummaryWriter
from environment.wendtris import Wendtris_Eva
# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='Reinforce with baseline')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
......@@ -31,6 +31,9 @@ args = parser.parse_args()
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
# Define the TensorBoard writer and its log path
comment = f' policy_lr={args.lr_policy} value_lr={args.lr_value} n_episode={args.n_episode}'
writer = SummaryWriter('runs/reinforce', comment=comment, filename_suffix=str(args.lr_policy) + str(args.n_episode))
OUT_PATH = os.path.join('results/reinforce', args.save_path)
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
......@@ -88,11 +91,16 @@ if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
n_episode = args.n_episode  # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
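# Per-update losses and cumulative per-episode running losses for both networks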
p_losses = []
v_losses = []
p_running_losses = []
v_running_losses = []
start_time = timer()
if policy_lr_schedule:
......@@ -101,6 +109,8 @@ if policy_lr_schedule:
# Training
#############################
for episode in range(n_episode):
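# Reset the running losses at the start of each episode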
p_running_loss = 0
v_running_loss = 0
log_probs = []
states = []
rewards = []
......@@ -142,27 +152,37 @@ for episode in range(n_episode):
advantages = (returns - baseline_values)
value_net.update(states, returns)
v_loss = value_net.update(states, returns)
v_running_loss += v_loss.item()
v_losses.append(v_loss.item())
v_running_losses.append(v_running_loss)
if policy_lr_schedule:
lr.append(policy_net.optimizer.param_groups[0]['lr'])
# Update nn based on discounted rewards and log_probs
policy_net.update(advantages, log_probs)
p_loss = policy_net.update(advantages, log_probs)
p_losses.append(p_loss.item())
p_running_loss += p_loss.item()
p_running_losses.append(p_running_loss)
# Log running losses to TensorBoard
writer.add_scalar('Policy loss', p_running_loss, episode)
writer.add_scalar('Value loss', v_running_loss, episode)
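# Log gradient and weight histograms for every policy network parameter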
for name, param in policy_net.named_parameters():
writer.add_histogram(name + '_grad', param.grad, episode)
writer.add_histogram(name + '_data', param, episode)
if policy_lr_schedule:
print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}, learning rate: {}'.format(episode,
total_reward_episode[
episode],
num_no_capacity[
episode],
accepted_orders[
episode], lr[episode]))
print(
f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}, policy loss: {p_loss.item()}, value loss: {v_loss.item()}, policy learning rate: {lr[episode]}')
else:
print(f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
# print('Episode: {}, selected action: {}'.format(episode, selected_action))
print(
f'Episode: {episode}, total reward: {total_reward_episode[episode]}, number of penalties: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}, policy loss: {p_loss.item()}, value loss: {v_loss.item()}')
break
state = next_state
writer.close()
# save the model parameters
torch.save(policy_net.state_dict(), os.path.join(OUT_PATH, 'policy_{}.pk1'.format(n_episode)))
torch.save(value_net.state_dict(), os.path.join(OUT_PATH, 'value_{}.pk1'.format(n_episode)))
......@@ -183,8 +203,8 @@ plt.close()
# Plot number of penalties
num_no_capacity_smoothed = sliding_window(num_no_capacity, len(num_no_capacity))
num_penalty, = plt.plot(num_no_capacity, label= 'Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label = 'Average penalties')
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Episode')
plt.ylabel('Number of penalties')
......@@ -192,13 +212,29 @@ plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot learning rate
if policy_lr_schedule:
plt.plot(lr)
plt.title('Learning rate decay')
plt.xlabel('Episode')
plt.ylabel('Learning rate')
plt.savefig(os.path.join(OUT_PATH, 'learning_rate.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot policy net loss
plt.plot(p_running_losses)
plt.title('Policy net losses')
plt.xlabel('Episode')
plt.savefig(os.path.join(OUT_PATH, 'policy_loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot value net loss
plt.plot(v_running_losses)
plt.title('Value net losses')
plt.xlabel('Episode')
plt.savefig(os.path.join(OUT_PATH, 'value_loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
#############################
......@@ -238,6 +274,8 @@ save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
save_list(p_losses, EVA_FILE, 'policy_losses')
save_list(v_losses, EVA_FILE, 'value_losses')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
......@@ -261,27 +299,21 @@ plt.title('Episode rewards over time (Evaluation)')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True,
bbox_inches='tight')
plt.close()
# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results= optimal_results.flatten()
optimal_results = optimal_results.flatten()
# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
labels = ['True Neg','False Pos','False Neg','True Pos']
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.close()