Commit 7773341b authored by lli

finished dqn

parent 5e04f371
import random
import torch
import torch.nn as nn
import numpy as np
from utils.utils import to_np
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
#############################
# DQN model
#############################
class DQN(nn.Module):
def __init__(self, n_state, n_action, n_hidden, lr):
super(DQN, self).__init__()
self.criterion = torch.nn.SmoothL1Loss()
self.model = torch.nn.Sequential(
torch.nn.Linear(n_state, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_action)
)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
# self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)
def predict(self, state):
"""
Compute the Q values of the state for all actions using the learning model
@param state: input state
@return: Q values of the state for all actions
"""
with torch.no_grad():
return self.model(torch.Tensor(state).to(device))
def eps_greedy_policy(self, state, n_action, epsilon):
"""
Epsilon-greedy action selection:
- takes a random action with probability epsilon
- takes the current best (greedy) action with probability (1 - epsilon)
@param state: input state
@param n_action: number of available actions
@param epsilon: exploration probability
@return: index of the selected action
"""
p = random.random()
if p < epsilon:
# random action
return random.randint(0, n_action - 1)
else:
q_values = self.predict(state)
return torch.argmax(q_values).item()
def boltzmann_policy(self, state, n_action, tau=1, clip=(-500., 500.)):
"""
Boltzmann (softmax) policy: builds a probability distribution over the Q values
and samples an action from it
Example: q_values [1.0, 2.0, 3.0] with tau=1 give probabilities of roughly [0.09, 0.24, 0.67]
:param state: input state
:param n_action: number of available actions
:param tau: temperature; higher values give a more uniform distribution
:param clip: (min, max) bounds applied to q_values / tau before exponentiation
:return: the selected action
"""
q_values = to_np(self.predict(state))
q_values = q_values.astype('float64')
q_values[np.isnan(q_values)] = 0
exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
probs = exp_values / np.sum(exp_values)
action = np.random.choice(range(n_action), p=probs)
return action
def update(self, state, y):
"""
Update the weights of the DQN given a training sample
@param state: state
@param y: target value
"""
y_pred = self.model(torch.Tensor(state).to(device))
y = torch.Tensor(y).to(device)
loss = self.criterion(y_pred, y)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# self.scheduler.step()
return loss.item()
def replay(self, replay_buffer, replay_batch_size, gamma):
"""
Experience replay
@param replay_buffer: a buffer of experience tuples (state, action, next_state, reward, is_done)
@param replay_batch_size: the number of samples used to update the model per call
@param gamma: the discount factor
"""
if len(replay_buffer) >= replay_batch_size:
replay_data = random.sample(replay_buffer, replay_batch_size)
states = []
td_targets = []
for state, action, next_state, reward, is_done in replay_data:
states.append(state)
q_values = self.predict(state).tolist()
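# TD target: keep the current estimates for all actions, but set the taken action's value
# to the reward (terminal) or to reward + gamma * max_a' Q(next_state, a') (non-terminal)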
if is_done:
q_values[action] = reward
else:
q_values_next = self.predict(next_state)
q_values[action] = reward + gamma * torch.max(q_values_next).item()
td_targets.append(q_values)
loss = self.update(states, td_targets)
return loss
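#############################
# Usage sketch (illustrative)
#############################
# A minimal sketch of how this class might be wired into an epsilon-greedy training loop.
# It assumes a gym-style environment with the classic 4-tuple step() API (as used elsewhere
# in this project); CartPole and the hyperparameter values below are placeholders only
# (the project itself trains on its Wendtris environment).
if __name__ == '__main__':
    from collections import deque
    import gym  # assumed to be installed; any discrete-action environment works
    demo_env = gym.make('CartPole-v0')
    agent = DQN(n_state=demo_env.observation_space.shape[0],
                n_action=demo_env.action_space.n, n_hidden=12, lr=1e-3).to(device)
    buffer = deque(maxlen=10000)
    for _ in range(5):  # a few short demo episodes
        state, is_done = demo_env.reset(), False
        while not is_done:
            action = agent.eps_greedy_policy(state, demo_env.action_space.n, epsilon=0.1)
            next_state, reward, is_done, _ = demo_env.step(action)
            buffer.append((state, action, next_state, reward, is_done))
            agent.replay(buffer, replay_batch_size=32, gamma=0.99)
            state = next_state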
\ No newline at end of file
from numpy import load, save
from utils.utils import modify_orders
optimal_orders = load('subset.npy', allow_pickle=True).astype('object')
# sort the optimal_orders
optimal_orders = [sorted(i, reverse=False) for i in optimal_orders]
# Modify the optimal orders for evaluation:
# optimal_orders holds the indices of accepted orders; modify_orders maps them to binary flags (1 = accepted)
result = modify_orders(optimal_orders)
save('optimal_acception.npy', result)
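# Illustrative sketch (an assumption about modify_orders, not its actual implementation):
# it presumably turns each list of accepted-order indices into a fixed-length 0/1
# acceptance vector, e.g. with 6 orders, [1, 4] -> [0, 1, 0, 0, 1, 0].
def modify_orders_sketch(order_lists, n_orders):
    # One 0/1 row per episode; a 1 marks an accepted order index.
    return [[1 if i in accepted else 0 for i in range(n_orders)] for accepted in order_lists]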
from numpy import asarray
from numpy import savez_compressed
from numpy import load
from requestGenerator import RequestGenerator
from environment.requestGenerator import RequestGenerator
@@ -135,7 +135,7 @@ class Wendtris(gym.Env):
current_reward = self.rewards[self.reward_pos][1]
self.profit += current_reward
self.tmp[0] -= self.tmp[1]
self.accepted_orders.append(self.order_pos + 1)
self.accepted_orders.append(self.order_pos)
self.order_pos += 1
self.reward_pos += 1
self.num_remain_orders -= 1
@@ -149,7 +149,7 @@ class Wendtris(gym.Env):
current_reward = 0
self.profit += current_reward
self.num_remain_orders -= 1
self.rejected_orders.append(self.order_pos + 1)
self.rejected_orders.append(self.order_pos)
self.order_pos += 1
self.reward_pos += 1
self.tmp[1] = self.orders[self.order_pos]
@@ -223,7 +223,7 @@ class Wendtris_Eva(gym.Env):
else:
current_reward = self.rewards[self.reward_pos][1]
self.tmp[0] -= self.tmp[1]
self.accepted_orders.append(self.order_pos + 1)
self.accepted_orders.append(self.order_pos)
self.order_pos += 1
self.reward_pos += 1
self.num_remain_orders -= 1
@@ -236,7 +236,7 @@ class Wendtris_Eva(gym.Env):
# If the agent rejects the order
current_reward = 0
self.num_remain_orders -= 1
self.rejected_orders.append(self.order_pos + 1)
self.rejected_orders.append(self.order_pos)
self.order_pos += 1
self.reward_pos += 1
self.tmp[1] = self.orders[self.order_pos]
from wendtris import Wendtris
from environment.wendtris import Wendtris
from collections import deque
@@ -8,12 +8,12 @@ from collections import deque
env = Wendtris(20, 6, 6, 2)
#############################
# Model Params
# Model Params (fixed)
#############################
n_state = len(env.state) # Number of inputs (state dimension)
n_action = env.action_space.n # Number of outputs (actions)
n_hidden = 12 # Number of hidden neurons
lr = 0.001 # Learning rate
#n_hidden = 12 # Number of hidden neurons
#lr = 0.001 # Learning rate
gamma = 1 # Discount factor
epsilon = 1 # epsilon start value
@@ -24,14 +24,3 @@ steps_done = 0 # parameter for epsilon greedy policy
replay_buffer = deque(maxlen=10000) # Size of replay buffer
replay_batch_size = 64 # Size of replay batch
n_epoch = 100 # Number of training episodes
total_reward_epoch = [0] * n_epoch
num_no_capacity = []
accepted_orders = []
epsilon_value = []
losses = []
# SEED = 100
\ No newline at end of file
import pandas as pd
import numpy as np
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
\ No newline at end of file
import os
import sys
import argparse
import math, random
import numpy as np
import math
import statistics
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from timeit import default_timer as timer
from params.dqn_params import *
from numpy import load
import utils
from algorithms.dqn import DQN
from utils.cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix
from params.dqn_params import *
from utils.utils import *
from environment.wendtris import Wendtris_Eva
from wendtris import Wendtris_Eva
# Configurations for matplotlib
plt.rcParams['agg.path.chunksize'] = 10000
parser = argparse.ArgumentParser(description='DQN')
parser.add_argument('--save_path', required=True, help='save path of results')
parser.add_argument('--save_path', type=str, required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden neurons (default: 12)')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
parser.add_argument('--n_epoch', type=int, required=True, help='number of training epochs')
args = parser.parse_args()
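# Example invocation (script name and values are illustrative only):
#   python dqn_train.py --save_path results/dqn --policy epsilon_greedy --n_epoch 1000 --seed 100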
# Check if using cuda and define device
@@ -28,123 +28,48 @@ USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
#############################
# DQN model
#############################
class DQN(nn.Module):
def __init__(self, n_state, n_action, n_hidden, lr):
super(DQN, self).__init__()
self.criterion = torch.nn.SmoothL1Loss()
self.model = torch.nn.Sequential(
torch.nn.Linear(n_state, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_action)
)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
# self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)
def predict(self, state):
"""
Compute the Q values of the state for all actions using the learning model
@param state: input state
@return: Q values of the state for all actions
"""
with torch.no_grad():
return self.model(torch.Tensor(state).to(device))
def eps_greedy_policy(self, state, n_action, epsilon):
"""
Compute epsilon greedy search
Eps Greedy policy either:
- takes a random action with probability epsilon
- takes current best action with prob (1 - epsilon)
@param state: state
@param epsilon: exploration probability
"""
p = random.random()
if p < epsilon:
# random action
return random.randint(0, n_action - 1)
else:
q_values = self.predict(state)
return torch.argmax(q_values).item()
def boltzmann_policy(self, state, n_action, tau=1, clip=(-500., 500.)):
"""
Boltzmann policy builds a probability law on q values and returns an
action selected randomly according to this law
:param tau: tau
:param clip: clip
:return: the selected action
"""
q_values = self.predict(state).detach().cpu().numpy()
q_values = q_values.astype('float64')
q_values[np.isnan(q_values)] = 0
exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
probs = exp_values / np.sum(exp_values)
action = np.random.choice(range(n_action), p=probs)
return action
def update(self, state, y):
"""
Update the weights of the DQN given a training sample
@param state: state
@param y: target value
"""
y_pred = self.model(torch.Tensor(state).to(device))
y = torch.Tensor(y).to(device)
loss = self.criterion(y_pred, y)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# self.scheduler.step()
return loss.item()
def replay(self, replay_buffer, replay_batch_size, gamma):
"""
Experience replay
@param replay_buffer: a list of experience
@param replay_batch_size: the number of samples we use to update the model each time
@param gamma: the discount factor
"""
if len(replay_buffer) >= replay_batch_size:
replay_data = random.sample(replay_buffer, replay_batch_size)
states = []
td_targets = []
for state, action, next_state, reward, is_done in replay_data:
states.append(state)
q_values = self.predict(state).tolist()
if is_done:
q_values[action] = reward
else:
q_values_next = self.predict(next_state)
q_values[action] = reward + gamma * torch.max(q_values_next).item()
td_targets.append(q_values)
loss = self.update(states, td_targets)
return loss
# Initialize DQN network
dqn = DQN(n_state, n_action, n_hidden, lr)
if USE_CUDA:
dqn = dqn.to(device)
OUT_PATH = args.save_path
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
utils.clear_folder(OUT_PATH)
clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = utils.StdOut(LOG_FILE)
sys.stdout = StdOut(LOG_FILE)
print('#################Deep Q Network#################')
print(f"PyTorch version {torch.__version__}")
print(f'Training device: {device}')
if USE_CUDA:
print(f"CUDA version: {torch.version.cuda}")
cudnn.benchmark = True
print()
print('#################Hyper Parameter Settings#################')
print(f'Number of states (input): {n_state}')
print(f'Number of actions (output): {n_action}')
print(f'Number of hidden neurons: {args.n_hidden}')
print(f'Learning rate: {args.lr}')
print(f'Discount factor: {gamma}')
print(f'Exploration strategy: {args.policy}')
if args.policy == 'epsilon_greedy':
if epsilon_decay:
print(f'Epsilon start value: {epsilon}')
print(f'Epsilon end value: {epsilon_end}')
print(f'Decay: {decay}')
else:
print(f'Epsilon value: {epsilon}')
print(f'Size of experience replay buffer: {replay_buffer.maxlen}')
print(f'Size of experience replay batch: {replay_batch_size}')
print()
# Initialize DQN network
dqn = DQN(n_state, n_action, args.n_hidden, args.lr)
if USE_CUDA:
dqn = dqn.to(device)
print('######################DQN architecture#####################')
print(dqn)
print()
print(f'Total parameters: {sum(p.numel() for p in dqn.parameters())}')
print(f'Trainable parameters: {sum(p.numel() for p in dqn.parameters() if p.requires_grad)}')
print()
seed = args.seed
if seed is None:
@@ -153,9 +83,15 @@ print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
n_epoch = args.n_epoch # Number of training episodes
total_reward_epoch = [0] * n_epoch
num_no_capacity = []
accepted_orders = []
epsilon_value = []
losses = []
start_time = timer()
#############################
@@ -165,12 +101,13 @@ for epoch in range(n_epoch):
state = env.reset()
is_done = False
if epsilon_decay:
epsilon = epsilon_end + (epsilon - epsilon_end) * math.exp(-1. * steps_done / decay)
epsilon_value.append(epsilon)
steps_done += 1
else:
epsilon_value.append(epsilon)
if args.policy == 'epsilon_greedy':
if epsilon_decay:
epsilon = epsilon_end + (epsilon - epsilon_end) * math.exp(-1. * steps_done / decay)
epsilon_value.append(epsilon)
steps_done += 1
else:
epsilon_value.append(epsilon)
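# Note: with epsilon_decay enabled, each epoch shrinks the gap (epsilon - epsilon_end)
# by a factor exp(-steps_done / decay), so epsilon approaches epsilon_end as training progresses.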
while not is_done:
if args.policy == 'epsilon_greedy':
@@ -191,16 +128,12 @@ for epoch in range(n_epoch):
losses.append(loss)
state = next_state
print(
f'Epoch: {epoch}, total reward: {total_reward_epoch[epoch]}, epsilon: {epsilon}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[epoch]}, accepted orders: {accepted_orders[epoch]}')
print(f"Training time for {n_epoch} epochs: {timer() - start_time}")
number_params = sum(p.numel() for p in dqn.parameters())
print(f"Number of parameters: {number_params}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_epoch)))
@@ -215,16 +148,17 @@ plt.title('Epoch rewards over time')
plt.xlabel('Epoch')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True)
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.show()
# Plot epsilon
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Epoch')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True)
plt.show()
if args.policy == 'epsilon_greedy':
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Epoch')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.show()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
@@ -234,7 +168,7 @@ plt.title('Number of penalties')
plt.xlabel('Epoch')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True)
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.show()
# Plot loss
@@ -242,7 +176,7 @@ loss= plt.plot(losses, label= 'Loss')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True)
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.show()
@@ -264,6 +198,7 @@ for ep in range(test_orders.shape[0]):
is_done = False
while not is_done:
# Always take the best action
action = torch.argmax(dqn.predict(state)).item()
next_state, reward, is_done, info = env_eva.step(action)
total_reward_epoch_eva[ep] += reward
@@ -278,20 +213,55 @@ for ep in range(test_orders.shape[0]):
print(f'Epoch: {ep}, total reward: {total_reward_epoch_eva[ep]}',
f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Save the variables for evaluation
EVA_FILE = os.path.join(OUT_PATH, 'evaluation')
save_list(total_reward_epoch_eva, EVA_FILE, 'total_reward_epoch_eva')
save_list(num_no_capacity_eva, EVA_FILE, 'num_no_capacity_eva')
save_list(accepted_orders_eva, EVA_FILE, 'accepted_orders_eva')
# Load optimal solution
optimal_rewards = load('dp/results.npy')
optimal_orders = load('dp/subset.npy', allow_pickle=True).astype('object').tolist()
# Calculate average results of the ground truth
optimal_avg_rewards = np.average(optimal_rewards)
eva_avg_rewards = statistics.mean(total_reward_epoch_eva)
print(f'Predicted average rewards: {eva_avg_rewards}')
print(f"Optimal average rewards: {optimal_avg_rewards}")
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_epoch_eva[:i+1]) for i in range(len(total_reward_epoch_eva))]
rewards, = plt.plot(total_reward_epoch_eva, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards_eva, label='Average rewards')
smoothed_optimal_rewards = [np.mean(optimal_rewards[:i+1]) for i in range(len(optimal_rewards))]
rewards_eva, = plt.plot(total_reward_epoch_eva, label='Rewards')
avg_rewards_eva, = plt.plot(smoothed_rewards_eva, label='Average rewards')
opt_rewards, = plt.plot(optimal_rewards, label='Optimal rewards')
opt_avg_rewards, = plt.plot(smoothed_optimal_rewards, label='Average optimal rewards')
plt.title('Epoch rewards over time (Evaluation)')
plt.xlabel('Epoch')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True)
plt.legend(handles=[rewards_eva, avg_rewards_eva, opt_rewards, opt_avg_rewards], loc='best', fontsize='small')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True, bbox_inches='tight')
plt.show()
# Compare with optimal solution
results = load('dp/results.npy')
subsets = load('dp/subset.npy', allow_pickle=True).astype('object')
# Modify orders for evaluation
prediction = np.asarray(modify_orders(accepted_orders_eva), dtype=int)
prediction = prediction.flatten()
optimal_results = np.asarray(modify_orders(optimal_orders), dtype=int)
optimal_results = optimal_results.flatten()
# Confusion matrix
cf_matrix = confusion_matrix(optimal_results, prediction)
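# sklearn's confusion_matrix puts true labels on the rows and predictions on the columns,
# so for this binary reject/accept task the cells read [[TN, FP], [FN, TP]],
# which matches the group_names order passed below.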
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Reject', 'Accept']
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')
plt.tight_layout()
plt.savefig(os.path.join(OUT_PATH, 'confusion_matrix.png'), transparent=True, bbox_inches='tight')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def make_confusion_matrix(cf,
group_names=None,
categories='auto',
count=True,
percent=True,
cbar=True,
xyticks=True,
xyplotlabels=True,
sum_stats=True,
figsize=None,
cmap='Blues',
title=None):
'''
This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.
Arguments
---------
cf: confusion matrix to be passed in
group_names: List of strings that represent the labels row by row to be shown in each square.
categories: List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
count: If True, show the raw number in the confusion matrix. Default is True.