Commit 0a799f4c authored by lli

update

parent 8738c2b3
@@ -30,6 +30,7 @@ class PolicyNetwork(nn.Module):
)
self.optimizer = torch.optim.Adam(self.network.parameters(), lr)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.1)
def predict(self, state):
# Compute the action probabilities of state s using the policy network
@@ -54,6 +55,7 @@ class PolicyNetwork(nn.Module):
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.scheduler.step()
def get_action(self, state):
probs = self.predict(state)
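For reference, the two hunks above attach a StepLR schedule to the Adam optimizer and advance it once per policy update. A minimal standalone sketch of that behaviour follows; the nn.Linear module and the initial lr=0.01 are placeholders for the sketch, not the actual PolicyNetwork or its constructor argument:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)  # placeholder module standing in for the policy network
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)  # arbitrary initial lr for the sketch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

lr_trace = []
for step in range(250):
    optimizer.step()       # in update() this follows loss.backward()
    scheduler.step()       # multiplies the lr by gamma every step_size calls
    lr_trace.append(optimizer.param_groups[0]['lr'])

print(lr_trace[0], lr_trace[150], lr_trace[240])   # 0.01 0.001 0.0001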
@@ -92,7 +92,7 @@ n_episode = args.n_episode # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
lr = []
start_time = timer()
#############################
@@ -141,16 +141,17 @@ for episode in range(n_episode):
advantages = (returns - baseline_values)
value_net.update(states, returns)
lr.append(policy_net.optimizer.param_groups[0]['lr'])
# Update nn based on discounted rewards and log_probs
policy_net.update(advantages, log_probs)
print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}'.format(
    episode, total_reward_episode[episode], num_no_capacity[episode], accepted_orders[episode]))
print('Episode: {}, total reward: {}, number of penalties: {}, accepted orders: {}, learning rate: {}'.format(
    episode, total_reward_episode[episode], num_no_capacity[episode], accepted_orders[episode], lr[episode]))
# print('Episode: {}, selected action: {}'.format(episode, selected_action))
break
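The loop above scales each stored log-probability by its advantage (return minus the value-network baseline) before calling policy_net.update. The loss computation inside update() is not part of this diff; only the optimizer and scheduler calls at its end are shown above, so the body below is a typical REINFORCE-with-baseline sketch of what it likely contains, not the repository's actual code:

def update(self, advantages, log_probs):
    # Hypothetical policy-gradient loss; only the four lines after it appear in the diff.
    loss = 0
    for log_prob, advantage in zip(log_probs, advantages):
        loss += -log_prob * advantage        # advantage-weighted negative log-likelihood
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.scheduler.step()                    # added in this commit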
@@ -164,7 +165,7 @@ torch.save(value_net.state_dict(), os.path.join(OUT_PATH, 'value_{}.pk1'.format(
# Plot of the training model
#############################
# Cumulative Average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
smoothed_rewards = sliding_window(total_reward_episode, len(total_reward_episode))
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Episode rewards over time')
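sliding_window, which replaces the cumulative-mean list comprehensions here and below, is defined elsewhere in the file and does not appear in this diff. A plausible helper consistent with how it is called (window equal to the full series length, i.e. a running average) would be the following sketch, an assumption rather than the actual implementation:

import numpy as np

def sliding_window(values, window):
    # Mean over at most `window` trailing values; with window == len(values)
    # this reduces to the cumulative average computed by the removed line.
    smoothed = []
    for i in range(len(values)):
        start = max(0, i + 1 - window)
        smoothed.append(np.mean(values[start:i + 1]))
    return smoothed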
@@ -175,7 +176,7 @@ plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, tran
plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_no_capacity_smoothed = sliding_window(num_no_capacity, len(num_no_capacity))
num_penalty, = plt.plot(num_no_capacity, label= 'Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label = 'Average penalties')
plt.title('Number of penalties')
@@ -185,6 +186,11 @@ plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot learning rate decay
plt.plot(lr)
plt.title('Learning rate decay')
plt.xlabel('Episode')
plt.ylabel('Learning rate')
plt.savefig(os.path.join(OUT_PATH, 'learning_rate.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
@@ -238,8 +244,8 @@ print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i+1]) for i in range(len(total_reward_episode_eva))]
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i+1]) for i in range(len(optimal_rewards))]
smoothed_rewards_eva = sliding_window(total_reward_episode_eva, len(total_reward_episode_eva))
smoothed_optimal_rewards = sliding_window(optimal_rewards, len(optimal_rewards))
rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')