Commit c9446558 authored by lli

Corrected plot bug, changed epoch to episode

parent bb7d5832
@@ -25,7 +25,7 @@ parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden n
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
-parser.add_argument('--n_epoch', type=int, required=True, help='number of training epochs')
+parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()
# Check if using cuda and define device
@@ -85,8 +85,8 @@ if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
-n_epoch = args.n_epoch # Number of training episodes
-total_reward_epoch = [0] * n_epoch
+n_episode = args.n_episode # Number of training episodes
+total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
@@ -97,7 +97,7 @@ start_time = timer()
#############################
# Training
#############################
-for epoch in range(n_epoch):
+for episode in range(n_episode):
state = env.reset()
is_done = False
@@ -115,7 +115,7 @@ for epoch in range(n_epoch):
else:
action = dqn.boltzmann_policy(state, n_action)
next_state, reward, is_done, info = env.step(action)
-total_reward_epoch[epoch] += reward
+total_reward_episode[episode] += reward
replay_buffer.append((state, action, next_state, reward, is_done))
@@ -129,47 +129,47 @@ for epoch in range(n_epoch):
state = next_state
print(
-f'Epoch: {epoch}, total reward: {total_reward_epoch[epoch]}, epsilon: {epsilon}, loss: {loss}, '
-f'num_no_capacity: {num_no_capacity[epoch]}, accepted orders: {accepted_orders[epoch]}')
+f'Episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
+f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_epoch} epochs: {timer() - start_time}")
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
-torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_epoch)))
+torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_episode)))
#############################
# Plot of the training model
#############################
# Cumulative Average reward received over time
-smoothed_rewards = [np.mean(total_reward_epoch[:i+1]) for i in range(len(total_reward_epoch))]
-rewards, = plt.plot(total_reward_epoch, label='Rewards')
+smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
+rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
-plt.title('Epoch rewards over time')
-plt.xlabel('Epoch')
+plt.title('Episode rewards over time')
+plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
# Plot epsilon
if args.policy == 'epsilon_greedy':
plt.plot(epsilon_value)
plt.title('Epsilon over time')
-plt.xlabel('Epoch')
+plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
-plt.xlabel('Epoch')
+plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
# Plot loss
loss = plt.plot(losses, label='Loss')
@@ -177,21 +177,21 @@ plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
-total_reward_epoch_eva = [0] * 1000
+total_reward_episode_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')
print('##########################Evaluation##########################')
for ep in range(test_orders.shape[0]):
env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
state = env_eva.state
@@ -201,7 +201,7 @@ for ep in range(test_orders.shape[0]):
# Always take the best action
action = torch.argmax(dqn.predict(state)).item()
next_state, reward, is_done, info = env_eva.step(action)
-total_reward_epoch_eva[ep] += reward
+total_reward_episode_eva[ep] += reward
if is_done:
num_no_capacity_eva.append(info['Number no capacity'])
@@ -209,13 +209,13 @@ for ep in range(test_orders.shape[0]):
break
state = next_state
print('##########################Evaluation##########################')
-print(f'Epoch: {ep}, total reward: {total_reward_epoch_eva[ep]}',
+print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
-save_list(total_reward_epoch_eva, EVA_FILE, 'total_reward_epoch_eva')
+save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
@@ -225,24 +225,24 @@ optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolis
# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
-eva_avg_rewards = statistics.mean(total_reward_epoch_eva)
+eva_avg_rewards = statistics.mean(total_reward_episode_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
-smoothed_rewards_eva = [np.mean(total_reward_epoch_eva[:i+1]) for i in range(len(total_reward_epoch_eva))]
+smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i+1]) for i in range(len(total_reward_episode_eva))]
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i+1]) for i in range(len(optimal_rewards))]
-rewards_eva, = plt.plot(total_reward_epoch_eva, label='Rewards')
+rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
-plt.title('Epoch rewards over time (Evaluation)')
-plt.xlabel('Epoch')
+plt.title('Episode rewards over time (Evaluation)')
+plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
# Modify orders for evaluation
@@ -258,7 +258,7 @@ categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
-plt.show()
+plt.close()
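The training loop above selects actions with either dqn.epsilon_greedy(...) or dqn.boltzmann_policy(state, n_action), whose bodies are not part of this diff. For orientation only, a minimal softmax-over-Q-values sketch of a Boltzmann policy is given below; the standalone function, the temperature argument, and the assumption that dqn.predict(state) returns a 1-D tensor of Q-values are illustrative, not the repository's implementation.

    import torch

    def boltzmann_policy(q_values: torch.Tensor, temperature: float = 1.0) -> int:
        # Sample an action index with probability proportional to exp(Q / temperature)
        probs = torch.softmax(q_values / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1).item()

    # Hypothetical usage:
    # action = boltzmann_policy(dqn.predict(state), temperature=0.5)

A lower temperature concentrates probability on the highest-valued action, while a higher temperature approaches uniform exploration.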
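The smoothed curves plotted during training and evaluation (smoothed_rewards, num_no_capacity_smoothed, smoothed_rewards_eva) call np.mean on a growing slice, which is quadratic in the number of episodes. That is harmless at this scale; if the episode count grows, an equivalent single-pass running mean could be computed as in the sketch below (plain NumPy, not part of this commit):

    import numpy as np

    def running_mean(values):
        # Mean of values[:i+1] for every index i, via one cumulative sum
        values = np.asarray(values, dtype=float)
        return np.cumsum(values) / np.arange(1, len(values) + 1)

    # e.g. smoothed_rewards = running_mean(total_reward_episode)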