Commit 168e0fcb authored by lli

update2

parent 97be44db
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
......
import argparse
import math
import statistics
import matplotlib.pyplot as plt
import torch
@@ -21,7 +20,7 @@ plt.rcParams['agg.path.chunksize'] = 10000
 parser = argparse.ArgumentParser(description='DQN')
 parser.add_argument('--save_path', type=str, required=True, help='save path of results')
-parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden neurons (default: 12)')
+parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
 parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
 parser.add_argument('--seed', type=int, default=None, help='random seed')
 parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
@@ -33,7 +32,7 @@ USE_CUDA = torch.cuda.is_available()
 device = torch.device('cuda' if USE_CUDA else 'cpu')
-OUT_PATH = os.path.join('results', args.save_path)
+OUT_PATH = os.path.join('results/dqn', args.save_path)
 LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
 clear_folder(OUT_PATH)
......
import argparse
import os
import sys
import statistics
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.backends.cudnn as cudnn
from timeit import default_timer as timer
from numpy import load
from algorithms.reinforce import PolicyNetwork, ValueNetwork
from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.reinforce_params import *
from utils.utils import *
from environment.wendtris import Wendtris_Eva
# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='DQN')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=128, help='number of hidden neurons (default: 128)')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
parser.add_argument('--n_episode', type=int, required=True, help='number of training episodes')
args = parser.parse_args()
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
OUT_PATH = os.path.join('results/reinforce', args.save_path)
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = StdOut(LOG_FILE)
print('#################Reinforce with Baseline#################')
print(f"PyTorch version {torch.__version__}")
print(f'Training device: {device}')
if USE_CUDA:
print(f"CUDA version: {torch.version.cuda}")
cudnn.benchmark = True
print()
print('#################Hyper Parameter Settings#################')
print('#################Policy Net##################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate: {args.lr}')
print(f'Discount factor: {gamma}')
print()
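# Note (hedged): the DQN class arrives via the wildcard imports above; it is assumed to
# wrap a small feed-forward Q-network together with its optimizer. A minimal sketch under
# that assumption (not the repository implementation) could look like:
#
#   import torch.nn as nn
#
#   class DQN(nn.Module):
#       def __init__(self, n_state, n_action, n_hidden, lr):
#           super().__init__()
#           self.model = nn.Sequential(
#               nn.Linear(n_state, n_hidden), nn.ReLU(),
#               nn.Linear(n_hidden, n_action))
#           self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
#
#       def predict(self, state):
#           # Q-value estimates for a single state, no gradient tracking
#           with torch.no_grad():
#               return self.model(torch.as_tensor(state, dtype=torch.float32, device=device))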
# Initialize DQN network
dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
if USE_CUDA:
dqn = dqn.to(device)
print('######################DQN architecture#####################')
print(dqn)
print()
print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
print()
seed = args.seed
if seed is None:
seed = np.random.randint(1, 10000)
print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
n_episode = args.n_episode # Number of training episodes
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
if args.policy == 'epsilon_greedy':
epsilon_value = []
else:
tau_value = []
losses = []
start_time = timer()
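# Hedged note: stretched_exponential_decay and anneal_tau come from the project's utilities;
# they are assumed to map the episode index to a decreasing exploration parameter (epsilon
# for the epsilon-greedy policy, temperature tau for the Boltzmann policy), so the agent
# explores heavily early on and exploits its Q-estimates later. A generic illustration of
# such an annealing schedule (not the project's exact formula):
#   progress = episode / max(1, n_episode - 1)
#   epsilon  = eps_end + (eps_start - eps_end) * math.exp(-decay_rate * progress)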
#############################
# Training
#############################
for episode in range(n_episode):
state = env.reset()
is_done = False
if args.policy == 'epsilon_greedy':
if epsilon_decay:
epsilon = stretched_exponential_decay(episode, args.n_episode, 0.1, 0.1, 0.1)
epsilon_value.append(epsilon)
else:
if tau_decay:
tau = anneal_tau(episode, 0.001, 100)
tau_value.append(tau)
while not is_done:
if args.policy == 'epsilon_greedy':
action = dqn.eps_greedy_policy(state, n_action, epsilon)
else:
action = dqn.boltzmann_policy(state, n_action, tau)
next_state, reward, is_done, info = env.step(action)
total_reward_episode[episode] += reward
replay_buffer.append((state, action, next_state, reward, is_done))
if is_done:
num_no_capacity.append(info['Number no capacity'])
accepted_orders.append(info['Accepted orders'])
break
loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
losses.append(loss)
state = next_state
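        # Hedged note: dqn.replay(...) above is assumed to implement standard experience
        # replay: sample a minibatch of (state, action, next_state, reward, done) tuples
        # from the buffer, form targets r + gamma * max_a' Q(s', a') for non-terminal
        # transitions (just r for terminal ones), and take one gradient step on the TD
        # error, returning the batch loss that gets logged in `losses`.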
if args.policy == 'epsilon_greedy':
print(
f'episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
else:
print(
f'episode: {episode}, total reward: {total_reward_episode[episode]}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pkl'.format(n_episode)))
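# Hedged usage note: assuming the DQN wrapper behaves like a standard nn.Module, the saved
# weights can later be restored with, e.g.,
#   dqn.load_state_dict(torch.load(os.path.join(OUT_PATH, f'dqn_{n_episode}.pkl'), map_location=device))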
#############################
# Plot of the training model
#############################
# Cumulative Average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
rewards, = plt.plot(total_reward_episode, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Episode rewards over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot epsilon and tau
if args.policy == 'epsilon_greedy':
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
else:
plt.plot(tau_value)
plt.title('Tau value over time')
plt.xlabel('Episode')
plt.ylabel('Tau')
plt.savefig(os.path.join(OUT_PATH, 'tau.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Episode')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
# Plot loss
loss_line, = plt.plot(losses, label='Loss')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend(handles=[loss_line], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
total_reward_episode_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')
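# Hedged assumption: order_list.npy and reward_list.npy each hold one row per pre-generated
# test game (1000 games, matching total_reward_episode_eva above); each row parameterizes a
# single Wendtris_Eva evaluation episode.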
print('##########################Evaluation##########################')
for ep in range(test_orders.shape[0]):
env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
state = env_eva.state
is_done = False
while not is_done:
# Always take the best action
action = torch.argmax(dqn.predict(state)).item()
next_state, reward, is_done, info = env_eva.step(action)
total_reward_episode_eva[ep] += reward
if is_done:
num_no_capacity_eva.append(info['Number no capacity'])
accepted_orders_eva.append(info['Accepted orders'])
break
state = next_state
print(f'Episode: {ep}, total reward: {total_reward_episode_eva[ep]}',
f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
save_list(total_reward_episode, EVA_FILE, 'total_reward_episode_train')
save_list(total_reward_episode_eva, EVA_FILE, 'total_reward_episode_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
if args.policy == 'epsilon_greedy':
save_list(epsilon_value, EVA_FILE, 'epsilon_value')
else:
save_list(tau_value, EVA_FILE, 'tau_value')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolist()
# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
eva_avg_rewards = statistics.mean(total_reward_episode_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_episode_eva[:i+1]) for i in range(len(total_reward_episode_eva))]
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i+1]) for i in range(len(optimal_rewards))]
rewards_eva, = plt.plot(total_reward_episode_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
plt.title('Episode rewards over time (Evaluation)')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.close()
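# Hedged note: modify_orders (from the project's utilities) is assumed to convert each game's
# list of accepted orders into a fixed-length 0/1 accept/reject vector, so the agent's
# decisions and the optimal (dp) decisions can be compared position-wise in the confusion
# matrix below.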
# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results = optimal_results.flatten()
# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.close()
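# Hedged sketch (illustrative only, not the repository implementation): the two exploration
# policies used during training are commonly implemented along these lines.
#
#   import random
#
#   def eps_greedy_policy(self, state, n_action, epsilon):
#       if random.random() < epsilon:
#           return random.randint(0, n_action - 1)         # explore: uniform random action
#       return torch.argmax(self.predict(state)).item()    # exploit: greedy w.r.t. Q-values
#
#   def boltzmann_policy(self, state, n_action, tau):
#       q = self.predict(state)
#       probs = torch.softmax(q / tau, dim=-1).cpu().numpy()  # temperature-scaled softmax
#       return int(np.random.choice(n_action, p=probs))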