Commit 168e0fcb authored by lli

update2

parent 97be44db
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
...
import argparse
import math
import statistics
import matplotlib.pyplot as plt
import torch
@@ -21,7 +20,7 @@ plt.rcParams['agg.path.chunksize'] = 10000
 parser = argparse.ArgumentParser(description='DQN')
 parser.add_argument('--save_path', type=str, required=True, help='save path of results')
-parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden neurons (default: 12)')
+parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
 parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
 parser.add_argument('--seed', type=int, default=None, help='random seed')
 parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
@@ -33,7 +32,7 @@ USE_CUDA = torch.cuda.is_available()
 device = torch.device('cuda' if USE_CUDA else 'cpu')
-OUT_PATH = os.path.join('results', args.save_path)
+OUT_PATH = os.path.join('results/dqn', args.save_path)
 LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
 clear_folder(OUT_PATH)
...
import argparse
import os
import statistics
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from timeit import default_timer as timer
from numpy import load
from algorithms.reinforce import PolicyNetwork, ValueNetwork
from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.reinforce_params import *
from utils.utils import *
from environment.wendtris import Wendtris_Eva
# os, sys and numpy are used directly below (path handling, stdout redirection, running averages)
# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='DQN')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
OUT_PATH = os.path.join('results/reinforce', args.save_path)
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = StdOut(LOG_FILE)
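# StdOut comes from utils.utils via the wildcard import above; it is assumed
# to tee every print to both the console and LOG_FILE. The class below is only
# a minimal sketch of such a logger (hypothetical name, not the project's
# implementation), kept here for reference and never instantiated.
class _TeeStdOutSketch:
    def __init__(self, log_file):
        self.terminal = sys.__stdout__
        self.log = open(log_file, 'a')

    def write(self, message):
        # Mirror every message to the console and to the log file
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()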
print('#################Reinforce with Baseline#################')
print(f"PyTorch version {torch.__version__}")
print(f'Training device: {device}')
if USE_CUDA:
    print(f"CUDA version: {torch.version.cuda}")
    cudnn.benchmark = True
print()
print('#################Hyper Parameter Settings#################')
print('#################Policy Net##################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate: {args.lr}')
print(f'Discount factor: {gamma}')
print()
# Initialize DQN network
dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
if USE_CUDA:
    dqn = dqn.to(device)
print('######################DQN architecture#####################')
print(dqn)
print()
print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
print()
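# The DQN class instantiated above is expected to come from the project's own
# modules via the wildcard imports; its implementation is not shown in this
# file. The class below is only a hedged sketch of the interface this script
# relies on (predict, eps_greedy_policy, boltzmann_policy, replay); the name,
# layer sizes and update details are assumptions, not the project's code, and
# the sketch is never used.
class _DQNSketch:
    def __init__(self, n_state, n_action, n_hidden, lr):
        # Small fully connected Q-network: state -> one Q-value per action
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action),
        ).to(device)
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def predict(self, state):
        # Q-values of a single state, without tracking gradients
        with torch.no_grad():
            return self.model(torch.as_tensor(state, dtype=torch.float32, device=device))

    def eps_greedy_policy(self, state, n_action, epsilon):
        # Explore with probability epsilon, otherwise act greedily on Q-values
        if torch.rand(1).item() < epsilon:
            return torch.randint(0, n_action, (1,)).item()
        return torch.argmax(self.predict(state)).item()

    def boltzmann_policy(self, state, n_action, tau):
        # Sample an action from a softmax over Q-values at temperature tau
        probs = torch.softmax(self.predict(state) / tau, dim=-1)
        return torch.multinomial(probs, 1).item()

    def replay(self, memory, batch_size, gamma):
        # One TD update on a random minibatch: target = r + gamma * max_a' Q(s', a')
        if len(memory) < batch_size:
            return 0.0
        indices = np.random.choice(len(memory), batch_size, replace=False)
        states, targets = [], []
        for i in indices:
            state, action, next_state, reward, done = memory[i]
            q_values = self.predict(state).tolist()
            q_values[action] = reward if done else reward + gamma * torch.max(self.predict(next_state)).item()
            states.append(state)
            targets.append(q_values)
        predictions = self.model(torch.as_tensor(np.array(states), dtype=torch.float32, device=device))
        loss = self.criterion(predictions, torch.as_tensor(targets, dtype=torch.float32, device=device))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()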
seed = args.seed
if seed is None:
    seed = np.random.randint(1, 10000)
print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
    torch.cuda.manual_seed(seed)
env.seed(seed)
n_episode = args.n_episode # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
if args.policy == 'epsilon_greedy':
    epsilon_value = []
else:
    tau_value = []
losses = []
start_time = timer()
#############################
# Training
#############################
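# The loop below relies on schedules and shared objects supplied by the
# wildcard imports (stretched_exponential_decay, anneal_tau, env,
# replay_buffer, replay_batch_size, epsilon_decay, tau_decay, gamma).
# The two functions here are only hedged sketches of plausible schedules;
# names, signatures and constants are hypothetical, not the project's
# definitions: an epsilon that decays smoothly towards a floor and a
# Boltzmann temperature that is annealed exponentially. Neither is called.
def _epsilon_decay_sketch(episode, n_episode, eps_start=1.0, eps_end=0.1, rate=5.0):
    # Interpolates epsilon from eps_start towards eps_end over n_episode episodes
    return eps_end + (eps_start - eps_end) * np.exp(-rate * episode / n_episode)


def _anneal_tau_sketch(episode, decay_rate=0.001, tau_start=100):
    # Exponentially anneals the Boltzmann temperature from tau_start towards 0
    return tau_start * np.exp(-decay_rate * episode)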
for episode in range(n_episode):
    state = env.reset()
    is_done = False
    if args.policy == 'epsilon_greedy':
        if epsilon_decay:
            epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
        epsilon_value.append(epsilon)
    else:
        if tau_decay:
            tau = anneal_tau(episode, 0.001, 100)
        tau_value.append(tau)
    while not is_done:
        if args.policy == 'epsilon_greedy':
            action = dqn.eps_greedy_policy(state, n_action, epsilon)
        else:
            action = dqn.boltzmann_policy(state, n_action, tau)
        next_state, reward, is_done, info = env.step(action)
        total_reward_episode[episode] += reward
        replay_buffer.append((state, action, next_state, reward, is_done))
        if is_done:
            num_no_capacity.append(info['Number no capacity'])
            accepted_orders.append(info['Accepted orders'])
            break
        loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
        losses.append(loss)
        state = next_state
    if args.policy == 'epsilon_greedy':
        print(
            f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
            f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
    else:
        print(
            f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
            f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pkl'.format(n_episode)))
#############################
# Plot of the training model
#############################
# Cumulative Average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
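# Note: the comprehension above recomputes a full mean for every episode
# (quadratic in n_episode). An equivalent one-pass running average with
# numpy's cumulative sum, shown only as a reference alternative:
#   smoothed_rewards = np.cumsum(total_reward_episode) / np.arange(1, n_episode + 1)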
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Episode rewards over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot epsilon and tau
if args.policy == 'epsilon_greedy':
    plt.plot(epsilon_value)
    plt.title('Epsilon over time')
    plt.xlabel('Episode')
    plt.ylabel('Epsilon')
    plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
    plt.close()
else:
    plt.plot(tau_value)
    plt.title('Tau value over time')
    plt.xlabel('Episode')
    plt.ylabel('Tau')
    plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
    plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot loss
plt.plot(losses, label='Loss')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
total_reward_episode_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')
print('##########################Evaluation##########################')
for ep in range(test_orders.shape[0]):
    env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
    state = env_eva.state
    is_done = False
    while not is_done:
        # Always take the best action
        action = torch.argmax(dqn.predict(state)).item()
        next_state, reward, is_done, info = env_eva.step(action)
        total_reward_episode_eva[ep] += reward
        if is_done:
            num_no_capacity_eva.append(info['Number no capacity'])
            accepted_orders_eva.append(info['Accepted orders'])
            break
        state = next_state
    print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
          f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
if args.policy == 'epsilon_greedy':
    save_list(epsilon_value, EVA_FILE, 'epsilon_value')
else:
    save_list(tau_value, EVA_FILE, 'tau_value')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolist()
# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
eva_avg_rewards = statistics.mean(total_reward_episode_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i+1]) for i in range(len(total_reward_episode_eva))]
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i+1]) for i in range(len(optimal_rewards))]
rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
plt.title('Episode rewards over time (Evaluation)')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results = optimal_results.flatten()
# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.close()
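# For binary accept/reject labels (0 = reject, 1 = accept), sklearn's
# confusion_matrix orders its counts as [[TN, FP], [FN, TP]], which is the
# ordering the group_names list above follows. As an illustrative extra (not
# part of the original evaluation), the four counts can be unpacked for a
# quick textual summary:
if cf_matrix.size == 4:
    tn, fp, fn, tp = cf_matrix.ravel()
    print(f'Confusion matrix counts: TN={tn}, FP={fp}, FN={fn}, TP={tp}')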