Commit 5e04f371 authored by lli's avatar lli

update

parent fed98667
import torch
import numpy as np
from collections import defaultdict
from wendtris import Wendtris
import matplotlib.pyplot as plt
env = Wendtris()
SEED = 99
def gen_epsilon_greedy_policy(n_action, epsilon):
    # Returns a policy that assigns probability epsilon / n_action to every action
    # and an extra 1 - epsilon to the current greedy action w.r.t. Q.
    def policy_function(state, Q):
        probs = torch.ones(n_action) * epsilon / n_action
        best_action = torch.argmax(Q[tuple(state)]).item()
        probs[best_action] += 1.0 - epsilon
        action = torch.multinomial(probs, 1).item()
        return action
    return policy_function
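# Illustrative check with made-up numbers: for n_action = 2 and epsilon = 0.3 the
# greedy action is sampled with probability (1 - 0.3) + 0.3 / 2 = 0.85 and the
# other action with probability 0.15.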
def q_learning(env, gamma, n_episode, alpha, epsilon, epsilon_min, epsilon_decay):
    """Off-policy TD control (Q-learning) with an epsilon-greedy behaviour policy."""
    n_action = env.action_space.n
    Q = defaultdict(lambda: torch.zeros(n_action))
    for episode in range(n_episode):
        epsilon_greedy_policy = gen_epsilon_greedy_policy(n_action, epsilon)
        env.seed(SEED)
        state = env.reset()
        is_done = False
        while not is_done:
            action = epsilon_greedy_policy(state, Q)
            next_state, reward, is_done, _ = env.step(action)
            # TD update towards the target r + gamma * max_a' Q(s', a')
            td_delta = reward + gamma * torch.max(Q[tuple(next_state)]) - Q[tuple(state)][action]
            Q[tuple(state)][action] += alpha * td_delta
            total_reward_episode[episode] += reward
            if is_done:
                print('Episode: {}, total reward: {}, epsilon: {}'.format(
                    episode, total_reward_episode[episode], epsilon))
                break
            state = next_state
        # Decay epsilon once per episode until the minimum is reached
        if epsilon >= epsilon_min:
            epsilon *= epsilon_decay
    policy = {}
    for state, actions in Q.items():
        policy[tuple(state)] = torch.argmax(actions).item()
    return Q, policy
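# Worked example of the update above (made-up numbers): with alpha = 0.01, gamma = 1,
# reward r = 10, max_a' Q(s', a') = 50 and Q(s, a) = 40, the TD error is
# 10 + 1 * 50 - 40 = 20, so Q(s, a) moves from 40 to 40 + 0.01 * 20 = 40.2.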
gamma = 1
n_episode = 40000
alpha = 0.01
epsilon = 0.3
epsilon_decay = 0.99993
epsilon_min = 0.005
total_reward_episode = [0] * n_episode
optimal_Q, optimal_policy = q_learning(env, gamma, n_episode, alpha, epsilon, epsilon_min, epsilon_decay)
print('The optimal policy:\n', optimal_policy)
# Cumulative average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
plt.plot(total_reward_episode)
plt.plot(smoothed_rewards)
plt.title('Episode reward over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.show()
import os
import sys
import math
import random
import argparse
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from collections import deque, namedtuple
from timeit import default_timer as timer
from numpy import load

import utils
from wendtris import Wendtris, Wendtris_Eva
from params.dqn_params import *
parser = argparse.ArgumentParser(description='DQN')
parser.add_argument('--save_path', required=True, help='save path of results')
parser.add_argument('--n_hidden', type=int, default=12, help='number of hidden neurons (default: 12)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument('--policy', type=str, choices=('epsilon_greedy', 'boltzmann'))
args = parser.parse_args()
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
#############################
# DQN model
#############################
class DQN(nn.Module):
    def __init__(self, n_state, n_action, n_hidden, lr):
        super(DQN, self).__init__()
@@ -42,6 +55,40 @@ class DQN(nn.Module):
        with torch.no_grad():
            return self.model(torch.Tensor(state).to(device))
    def eps_greedy_policy(self, state, n_action, epsilon):
        """
        Epsilon-greedy action selection:
        - takes a random action with probability epsilon
        - takes the current best (greedy) action with probability (1 - epsilon)
        @param state: current state
        @param n_action: number of actions
        @param epsilon: exploration probability
        """
        p = random.random()
        if p < epsilon:
            # random action
            return random.randint(0, n_action - 1)
        else:
            # greedy action w.r.t. the predicted Q-values
            q_values = self.predict(state)
            return torch.argmax(q_values).item()
    def boltzmann_policy(self, state, n_action, tau=1, clip=(-500., 500.)):
        """
        Boltzmann (softmax) policy: builds a probability distribution over the
        Q-values and samples an action from it.
        :param state: current state
        :param n_action: number of actions
        :param tau: temperature; larger values give more uniform probabilities
        :param clip: clipping range applied to Q/tau before exponentiation
        :return: the selected action
        """
        q_values = self.predict(state).detach().cpu().numpy()
        q_values = q_values.astype('float64')
        q_values[np.isnan(q_values)] = 0
        exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
        probs = exp_values / np.sum(exp_values)
        action = np.random.choice(range(n_action), p=probs)
        return action
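        # Worked example with made-up Q-values: q = [1.0, 2.0] and tau = 1 give
        # probabilities exp([1, 2]) / sum(exp([1, 2])) ≈ [0.27, 0.73], so the
        # higher-valued action is sampled roughly 73% of the time.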
    def update(self, state, y):
        """
        Update the weights of the DQN given a training sample
@@ -55,22 +102,7 @@ class DQN(nn.Module):
        loss.backward()
        self.optimizer.step()
        # self.scheduler.step()
        return loss.item()
    def replay(self, replay_buffer, replay_batch_size, gamma):
        """
@@ -98,40 +130,12 @@ class DQN(nn.Module):
        return loss
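    # Assumption, not shown in this hunk: replay() presumably samples a random
    # minibatch from replay_buffer, builds targets y = r + gamma * max_a' Q(s', a')
    # for non-terminal transitions (y = r at terminal ones) and calls update() on
    # the batch, as in a standard DQN training step.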
# Initialize the environment, with penalty factor 2
env = Wendtris(20, 6, 6, 2)
# Hyperparameters
n_state = len(env.state)
n_action = env.action_space.n
n_hidden = 12
lr = 0.001
replay_buffer = deque(maxlen=10000)
replay_batch_size = 64
n_episode = 10000
gamma = 1
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
total_rewards = []
epsilon_start = 1.0
epsilon_decay = 0.999
epsilon_value = []
losses = []
# Initialize DQN network
dqn = DQN(n_state, n_action, n_hidden, lr)
if USE_CUDA:
    dqn = dqn.to(device)
OUT_PATH = args.save_path
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
utils.clear_folder(OUT_PATH)
@@ -140,20 +144,41 @@ sys.stdout = utils.StdOut(LOG_FILE)
print(f"PyTorch version {torch.__version__}")
if USE_CUDA:
print(f"CUDA version: {torch.version.cuda}")
cudnn.benchmark = True
seed = args.seed
if seed is None:
seed = np.random.randint(1, 10000)
print('Random seed:', seed)
torch.manual_seed(seed)
if USE_CUDA:
torch.cuda.manual_seed(seed)
env.seed(seed)
start_time = timer()
#############################
# Training
#############################
for epoch in range(n_epoch):
    state = env.reset()
    is_done = False
    # Decay epsilon exponentially towards epsilon_end as training progresses
    if epsilon_decay:
        epsilon = epsilon_end + (epsilon - epsilon_end) * math.exp(-1. * steps_done / decay)
        epsilon_value.append(epsilon)
        steps_done += 1
    else:
        epsilon_value.append(epsilon)
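    # Note on the schedule above: the gap (epsilon - epsilon_end) is multiplied by
    # exp(-steps_done / decay) on every epoch, so the decay compounds. With made-up
    # numbers, decay = 50 and steps_done = 50, a single update shrinks the current
    # gap by a factor of exp(-1) ≈ 0.37.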
    while not is_done:
        if args.policy == 'epsilon_greedy':
            action = dqn.eps_greedy_policy(state, n_action, epsilon)
        else:
            action = dqn.boltzmann_policy(state, n_action)
        next_state, reward, is_done, info = env.step(action)
        total_reward_epoch[epoch] += reward
        replay_buffer.append((state, action, next_state, reward, is_done))
@@ -166,41 +191,107 @@ for episode in range(n_episode):
        losses.append(loss)
        state = next_state
    print(
        f'Epoch: {epoch}, total reward: {total_reward_epoch[epoch]}, epsilon: {epsilon}, loss: {loss}, '
        f'num_no_capacity: {num_no_capacity[epoch]}, accepted orders: {accepted_orders[epoch]}')

print(f"Training time for {n_epoch} epochs: {timer() - start_time}")
number_params = sum(p.numel() for p in dqn.parameters())
print(f"Number of parameters: {number_params}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pk1'.format(n_epoch)))
#############################
# Plot of the training model
#############################
# Cumulative average reward received over time
smoothed_rewards = [np.mean(total_reward_epoch[:i+1]) for i in range(len(total_reward_epoch))]
rewards, = plt.plot(total_reward_epoch, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards, label='Average rewards')
plt.title('Epoch rewards over time')
plt.xlabel('Epoch')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True)
plt.show()
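# Side note: the running average above is O(n^2) because each np.mean call rescans
# the whole prefix; an equivalent O(n) form (assuming total_reward_epoch is a plain
# list of per-epoch rewards) would be:
#     smoothed_rewards = np.cumsum(total_reward_epoch) / np.arange(1, n_epoch + 1)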
# Plot epsilon
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Epoch')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True)
plt.show()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
num_penalty, = plt.plot(num_no_capacity, label='Penalties')
avg_penalty, = plt.plot(num_no_capacity_smoothed, label='Average penalties')
plt.title('Number of penalties')
plt.xlabel('Epoch')
plt.ylabel('Number of penalties')
plt.legend(handles=[num_penalty, avg_penalty], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'penalties.png'), dpi=1200, transparent=True)
plt.show()
# Plot loss
plt.plot(losses, label='Loss')
plt.legend(loc='best')
plt.title('Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.savefig(os.path.join(OUT_PATH, 'loss.png'), dpi=1200, transparent=True)
plt.show()
#############################
# Evaluation
#############################
# Use the trained model to predict 1000 test games
total_reward_epoch_eva = [0] * 1000
num_no_capacity_eva = []
accepted_orders_eva = []
test_orders = load('dp/order_list.npy')
test_rewards = load('dp/reward_list.npy')
for ep in range(test_orders.shape[0]):
    env_eva = Wendtris_Eva(test_orders[ep], test_rewards[ep])
    state = env_eva.state
    is_done = False
    while not is_done:
        # Act greedily w.r.t. the learned Q-values during evaluation
        action = torch.argmax(dqn.predict(state)).item()
        next_state, reward, is_done, info = env_eva.step(action)
        total_reward_epoch_eva[ep] += reward
        if is_done:
            num_no_capacity_eva.append(info['Number no capacity'])
            accepted_orders_eva.append(info['Accepted orders'])
            break
        state = next_state
    print('##########################Evaluation##########################')
    print(f'Epoch: {ep}, total reward: {total_reward_epoch_eva[ep]}',
          f'num_no_capacity: {num_no_capacity_eva[ep]}, accepted orders: {accepted_orders_eva[ep]}')
# Plot rewards (evaluation)
# Cumulative Average reward received over time
smoothed_rewards_eva = [np.mean(total_reward_epoch_eva[:i+1]) for i in range(len(total_reward_epoch_eva))]
rewards, = plt.plot(total_reward_epoch_eva, label='Rewards')
avg_rewards, = plt.plot(smoothed_rewards_eva, label='Average rewards')
plt.title('Epoch rewards over time (Evaluation)')
plt.xlabel('Epoch')
plt.ylabel('Total reward')
plt.legend(handles=[rewards, avg_rewards], loc='best')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards_evaluation.png'), dpi=1200, transparent=True)
plt.show()
# Compare with optimal solution
results = load('dp/results.npy')
subsets = load('dp/subset.npy', allow_pickle=True).astype('object')
PyTorch version 1.8.0
Episode: 0, total reward: -296.0, epsilon: 1.0, loss: None, num_no_capacity: 5, accepted orders: [1, 4, 7]
Episode: 1, total reward: -476.0, epsilon: 0.999, loss: None, num_no_capacity: 8, accepted orders: [1, 2, 4, 6]
Episode: 2, total reward: -257.0, epsilon: 0.998001, loss: None, num_no_capacity: 4, accepted orders: [2, 3, 4, 5]
Episode: 3, total reward: -146.0, epsilon: 0.997002999, loss: 1515.2685546875, num_no_capacity: 4, accepted orders: [7, 8, 10]
Episode: 4, total reward: -248.0, epsilon: 0.996005996001, loss: 1094.554443359375, num_no_capacity: 4, accepted orders: [1, 5, 15]
Episode: 5, total reward: -599.0, epsilon: 0.995009990004999, loss: 1469.408935546875, num_no_capacity: 11, accepted orders: [1, 3]
Episode: 6, total reward: -545.0, epsilon: 0.994014980014994, loss: 1301.1917724609375, num_no_capacity: 8, accepted orders: [2, 4, 6]
Episode: 7, total reward: -217.0, epsilon: 0.993020965034979, loss: 951.9117431640625, num_no_capacity: 7, accepted orders: [1, 2, 15]
Episode: 8, total reward: -778.0, epsilon: 0.992027944069944, loss: 723.3570556640625, num_no_capacity: 10, accepted orders: [2, 3, 5]
Episode: 9, total reward: -169.0, epsilon: 0.991035916125874, loss: 1263.716552734375, num_no_capacity: 4, accepted orders: [5, 12, 20]
Episode: 10, total reward: -512.0, epsilon: 0.9900448802097482, loss: 1295.9482421875, num_no_capacity: 8, accepted orders: [1, 2, 5, 6, 8]
Episode: 11, total reward: -341.0, epsilon: 0.9890548353295384, loss: 1316.07421875, num_no_capacity: 8, accepted orders: [1, 4]
Episode: 12, total reward: -388.0, epsilon: 0.9880657804942089, loss: 571.015869140625, num_no_capacity: 8, accepted orders: [1, 2, 3, 5]
Episode: 13, total reward: -824.0, epsilon: 0.9870777147137147, loss: 1148.4736328125, num_no_capacity: 9, accepted orders: [2, 3, 7]
Episode: 14, total reward: -478.0, epsilon: 0.986090636999001, loss: 681.695068359375, num_no_capacity: 6, accepted orders: [3, 5, 6, 12]
Episode: 15, total reward: -414.0, epsilon: 0.985104546362002, loss: 1234.6348876953125, num_no_capacity: 5, accepted orders: [1, 6, 14]
Episode: 16, total reward: -462.0, epsilon: 0.9841194418156399, loss: 921.0794067382812, num_no_capacity: 7, accepted orders: [2, 3, 4, 20]
Episode: 17, total reward: -655.0, epsilon: 0.9831353223738244, loss: 1112.9505615234375, num_no_capacity: 7, accepted orders: [2, 4]
Episode: 18, total reward: -115.0, epsilon: 0.9821521870514505, loss: 551.048095703125, num_no_capacity: 2, accepted orders: [2, 4, 16]
Episode: 19, total reward: -132.0, epsilon: 0.981170034864399, loss: 633.1275634765625, num_no_capacity: 2, accepted orders: [3, 6, 10, 19]
Episode: 20, total reward: -318.0, epsilon: 0.9801888648295347, loss: 436.76800537109375, num_no_capacity: 9, accepted orders: [1, 2, 3]
Episode: 21, total reward: -353.0, epsilon: 0.9792086759647052, loss: 530.3541259765625, num_no_capacity: 6, accepted orders: [1, 5, 6]
Episode: 22, total reward: -622.0, epsilon: 0.9782294672887404, loss: 520.0811767578125, num_no_capacity: 7, accepted orders: [2, 3, 8]
Episode: 23, total reward: -321.0, epsilon: 0.9772512378214517, loss: 405.4267578125, num_no_capacity: 6, accepted orders: [1, 3, 4]
Episode: 24, total reward: -431.0, epsilon: 0.9762739865836302, loss: 543.4894409179688, num_no_capacity: 8, accepted orders: [1, 3, 4, 7]
Episode: 25, total reward: -406.0, epsilon: 0.9752977125970466, loss: 331.8126220703125, num_no_capacity: 7, accepted orders: [1, 4, 5]
Episode: 26, total reward: -448.0, epsilon: 0.9743224148844496, loss: 436.88299560546875, num_no_capacity: 8, accepted orders: [3, 5, 6]
Episode: 27, total reward: -284.0, epsilon: 0.9733480924695651, loss: 522.7940673828125, num_no_capacity: 7, accepted orders: [3, 4, 13]
Episode: 28, total reward: -231.0, epsilon: 0.9723747443770956, loss: 317.2567138671875, num_no_capacity: 6, accepted orders: [1, 6]
Episode: 29, total reward: -620.0, epsilon: 0.9714023696327184, loss: 234.384521484375, num_no_capacity: 8, accepted orders: [1, 2, 4, 5]
Episode: 30, total reward: -458.0, epsilon: 0.9704309672630858, loss: 442.863037109375, num_no_capacity: 6, accepted orders: [2, 6, 7, 13]
Episode: 31, total reward: -759.0, epsilon: 0.9694605362958226, loss: 247.92172241210938, num_no_capacity: 10, accepted orders: [2, 3, 4]
Episode: 32, total reward: -676.0, epsilon: 0.9684910757595268, loss: 360.61083984375, num_no_capacity: 8, accepted orders: [2, 5, 12]
Episode: 33, total reward: -309.0, epsilon: 0.9675225846837673, loss: 334.993896484375, num_no_capacity: 6, accepted orders: [1, 2, 3]
Episode: 34, total reward: -99.0, epsilon: 0.9665550620990835, loss: 164.2584686279297, num_no_capacity: 4, accepted orders: [1, 5, 7]
Episode: 35, total reward: -382.0, epsilon: 0.9655885070369844, loss: 249.46676635742188, num_no_capacity: 6, accepted orders: [1, 6, 8]
Episode: 36, total reward: -260.0, epsilon: 0.9646229185299474, loss: 222.1248016357422, num_no_capacity: 7, accepted orders: [2, 3, 4]
Episode: 37, total reward: -863.0, epsilon: 0.9636582956114175, loss: 374.58319091796875, num_no_capacity: 11, accepted orders: [1, 2]
Episode: 38, total reward: -456.0, epsilon: 0.9626946373158061, loss: 199.5648193359375, num_no_capacity: 7, accepted orders: [1, 3, 4, 6]
Episode: 39, total reward: -168.0, epsilon: 0.9617319426784903, loss: 374.11456298828125, num_no_capacity: 3, accepted orders: [2, 4, 6]
Episode: 40, total reward: -908.0, epsilon: 0.9607702107358118, loss: 412.63043212890625, num_no_capacity: 11, accepted orders: [4, 5, 9]
Episode: 41, total reward: -188.0, epsilon: 0.959809440525076, loss: 462.1036376953125, num_no_capacity: 5, accepted orders: [1, 3, 6, 10]
Episode: 42, total reward: -197.0, epsilon: 0.9588496310845509, loss: 137.34925842285156, num_no_capacity: 5, accepted orders: [1, 3, 8, 14]
Episode: 43, total reward: -296.0, epsilon: 0.9578907814534663, loss: 291.7785339355469, num_no_capacity: 6, accepted orders: [1, 3, 4]
Episode: 44, total reward: -435.0, epsilon: 0.9569328906720129, loss: 207.5941162109375, num_no_capacity: 9, accepted orders: [1, 2]
Episode: 45, total reward: -641.0, epsilon: 0.9559759577813408, loss: 288.4837646484375, num_no_capacity: 9, accepted orders: [1, 3, 4]
Episode: 46, total reward: -472.0, epsilon: 0.9550199818235595, loss: 385.63043212890625, num_no_capacity: 8, accepted orders: [1, 2, 11]
Episode: 47, total reward: -209.0, epsilon: 0.954064961841736, loss: 265.7742004394531, num_no_capacity: 7, accepted orders: [1, 2, 10]
Episode: 48, total reward: -752.0, epsilon: 0.9531108968798943, loss: 403.2070617675781, num_no_capacity: 10, accepted orders: [4, 5]
Episode: 49, total reward: -148.0, epsilon: 0.9521577859830144, loss: 378.09295654296875, num_no_capacity: 5, accepted orders: [3, 5, 7]
Episode: 50, total reward: -515.0, epsilon: 0.9512056281970314, loss: 172.2987060546875, num_no_capacity: 6, accepted orders: [2, 3, 8]
Episode: 51, total reward: -396.0, epsilon: 0.9502544225688343, loss: 188.1471710205078, num_no_capacity: 7, accepted orders: [1, 2, 16]
Episode: 52, total reward: -588.0, epsilon: 0.9493041681462654, loss: 468.11444091796875, num_no_capacity: 7, accepted orders: [1, 3, 4]
Episode: 53, total reward: -351.0, epsilon: 0.9483548639781192, loss: 257.4324645996094, num_no_capacity: 5, accepted orders: [1, 7]
Episode: 54, total reward: -366.0, epsilon: 0.947406509114141, loss: 191.06546020507812, num_no_capacity: 9, accepted orders: [1, 3]
Episode: 55, total reward: -389.0, epsilon: 0.9464591026050269, loss: 127.50218200683594, num_no_capacity: 8, accepted orders: [1, 3, 6]
Episode: 56, total reward: -190.0, epsilon: 0.9455126435024219, loss: 264.197265625, num_no_capacity: 6, accepted orders: [1, 4, 6]
Episode: 57, total reward: -540.0, epsilon: 0.9445671308589194, loss: 309.24700927734375, num_no_capacity: 10, accepted orders: [1, 2, 4]
Episode: 58, total reward: -312.0, epsilon: 0.9436225637280605, loss: 284.0190734863281, num_no_capacity: 6, accepted orders: [1, 3, 5, 19]
Episode: 59, total reward: -256.0, epsilon: 0.9426789411643325, loss: 193.7789764404297, num_no_capacity: 7, accepted orders: [1, 2, 7]
Episode: 60, total reward: -476.0, epsilon: 0.9417362622231682, loss: 208.85693359375, num_no_capacity: 7, accepted orders: [1, 3, 5]
Episode: 61, total reward: -93.0, epsilon: 0.9407945259609449, loss: 254.3696746826172, num_no_capacity: 5, accepted orders: [1, 2, 3, 4, 15]
Episode: 62, total reward: -840.0, epsilon: 0.9398537314349841, loss: 362.2593078613281, num_no_capacity: 11, accepted orders: [1, 2, 3]
Episode: 63, total reward: -535.0, epsilon: 0.9389138777035491, loss: 170.14749145507812, num_no_capacity: 10, accepted orders: [2, 4, 9]
Episode: 64, total reward: -267.0, epsilon: 0.9379749638258454, loss: 233.229248046875, num_no_capacity: 6, accepted orders: [1, 2, 7, 9]
Episode: 65, total reward: -116.0, epsilon: 0.9370369888620197, loss: 236.19491577148438, num_no_capacity: 7, accepted orders: [1, 2, 3, 7, 12]
Episode: 66, total reward: -654.0, epsilon: 0.9360999518731576, loss: 395.6317443847656, num_no_capacity: 9, accepted orders: [2, 3, 4]
Episode: 67, total reward: -108.0, epsilon: 0.9351638519212845, loss: 310.77423095703125, num_no_capacity: 4, accepted orders: [2, 4]
Episode: 68, total reward: -313.0, epsilon: 0.9342286880693632, loss: 227.64627075195312, num_no_capacity: 4, accepted orders: [2, 5, 6]
Episode: 69, total reward: -405.0, epsilon: 0.9332944593812939, loss: 266.79205322265625, num_no_capacity: 6, accepted orders: [1, 4, 11]
Episode: 70, total reward: -881.0, epsilon: 0.9323611649219126, loss: 514.915771484375, num_no_capacity: 10, accepted orders: [1, 2, 3]
Episode: 71, total reward: -104.0, epsilon: 0.9314288037569907, loss: 235.0006561279297, num_no_capacity: 2, accepted orders: [2, 4, 7, 10]
Episode: 72, total reward: -446.0, epsilon: 0.9304973749532336, loss: 182.97811889648438, num_no_capacity: 7, accepted orders: [1, 2, 5, 7]
Episode: 73, total reward: -552.0, epsilon: 0.9295668775782804, loss: 257.50762939453125, num_no_capacity: 9, accepted orders: [1, 2, 3]
Episode: 74, total reward: -492.0, epsilon: 0.9286373107007021, loss: 234.62115478515625, num_no_capacity: 6, accepted orders: [2, 3, 11]
Episode: 75, total reward: -563.0, epsilon: 0.9277086733900014, loss: 114.7972412109375, num_no_capacity: 9, accepted orders: [1, 2, 5]
Episode: 76, total reward: -269.0, epsilon: 0.9267809647166114, loss: 199.13429260253906, num_no_capacity: 7, accepted orders: [2, 3, 4, 5]
Episode: 77, total reward: -331.0, epsilon: 0.9258541837518948, loss: 567.741943359375, num_no_capacity: 6, accepted orders: [1, 4, 16]
Episode: 78, total reward: -259.0, epsilon: 0.9249283295681429, loss: 196.4779510498047, num_no_capacity: 6, accepted orders: [1, 3]
Episode: 79, total reward: -533.0, epsilon: 0.9240034012385747, loss: 341.63824462890625, num_no_capacity: 8, accepted orders: [1, 2, 4, 6]
Episode: 80, total reward: 142.0, epsilon: 0.9230793978373362, loss: 226.29409790039062, num_no_capacity: 2, accepted orders: [3, 5, 9, 10]
Episode: 81, total reward: -490.0, epsilon: 0.9221563184394989, loss: 142.129638671875, num_no_capacity: 7, accepted orders: [1, 2]
Episode: 82, total reward: -771.0, epsilon: 0.9212341621210594, loss: 198.11651611328125, num_no_capacity: 9, accepted orders: [1, 3, 5, 18]
Episode: 83, total reward: -442.0, epsilon: 0.9203129279589383, loss: 263.2390441894531, num_no_capacity: 7, accepted orders: [2, 4, 6, 16]
Episode: 84, total reward: -313.0, epsilon: 0.9193926150309794, loss: 271.8157653808594, num_no_capacity: 6, accepted orders: [1, 4, 5, 8]
Episode: 85, total reward: -268.0, epsilon: 0.9184732224159483, loss: 439.0942687988281, num_no_capacity: 4, accepted orders: [3, 5, 6, 17]
Episode: 86, total reward: -399.0, epsilon: 0.9175547491935324, loss: 559.6263427734375, num_no_capacity: 4, accepted orders: [1, 2, 11]
Episode: 87, total reward: -413.0, epsilon: 0.9166371944443389, loss: 417.55767822265625, num_no_capacity: 7, accepted orders: [1, 3, 4, 16]
Episode: 88, total reward: -279.0, epsilon: 0.9157205572498945, loss: 226.57447814941406, num_no_capacity: 4, accepted orders: [4, 7, 10, 12]
Episode: 89, total reward: -355.0, epsilon: 0.9148048366926447, loss: 164.82437133789062, num_no_capacity: 7, accepted orders: [1, 2, 4]
Episode: 90, total reward: -540.0, epsilon: 0.913890031855952, loss: 336.3482971191406, num_no_capacity: 9, accepted orders: [1, 2, 3]
Episode: 91, total reward: -432.0, epsilon: 0.912976141824096, loss: 224.48004150390625, num_no_capacity: 8, accepted orders: [1, 2, 5, 7]
Episode: 92, total reward: -172.0, epsilon: 0.912063165682272, loss: 144.8651123046875, num_no_capacity: 4, accepted orders: [1, 3, 7]
Episode: 93, total reward: -446.0, epsilon: 0.9111511025165897, loss: 344.2680969238281, num_no_capacity: 8, accepted orders: [3, 4, 7, 11]
Episode: 94, total reward: -427.0, epsilon: 0.9102399514140731, loss: 372.3551025390625, num_no_capacity: 6, accepted orders: [1, 5, 6, 13]
Episode: 95, total reward: -160.0, epsilon: 0.9093297114626591, loss: 416.30322265625, num_no_capacity: 5, accepted orders: [1, 2, 13]
Episode: 96, total reward: -3.0, epsilon: 0.9084203817511963, loss: 415.9712829589844, num_no_capacity: 2, accepted orders: [3, 8, 9, 17]
Episode: 97, total reward: -770.0, epsilon: 0.9075119613694451, loss: 219.63385009765625, num_no_capacity: 9, accepted orders: [1, 4, 8, 9]
Episode: 98, total reward: -318.0, epsilon: 0.9066044494080757, loss: 406.5568542480469, num_no_capacity: 7, accepted orders: [2, 4, 5]
Episode: 99, total reward: -461.0, epsilon: 0.9056978449586677, loss: 362.95770263671875, num_no_capacity: 7, accepted orders: [1, 3, 11]
Training time for 100 episodes: 19.535649799999998
from wendtris import Wendtris
from collections import deque
#############################
# Initialize the environment, with penalty factor 2
#############################
env = Wendtris(20, 6, 6, 2)
#############################
# Model Params
#############################
n_state = len(env.state)             # Number of input features (state dimension)
n_action = env.action_space.n        # Number of actions (output dimension)
n_hidden = 12                        # Number of hidden neurons
lr = 0.001                           # Learning rate
gamma = 1                            # Discount factor
epsilon = 1                          # Epsilon start value
epsilon_end = 0.001                  # Epsilon end value
epsilon_decay = True                 # Whether to decay epsilon over time
decay = 50                           # Decay constant of the exponential epsilon schedule
steps_done = 0                       # Step counter for the epsilon decay schedule
replay_buffer = deque(maxlen=10000)  # Replay buffer
replay_batch_size = 64               # Size of replay batch
n_epoch = 100                        # Number of training epochs
total_reward_epoch = [0] * n_epoch
num_no_capacity = []
accepted_orders = []
epsilon_value = []
losses = []
# SEED = 100
from wendtris import Wendtris
import random
env = Wendtris()
# Play five random games to exercise the environment
for i in range(5):
    is_done = False
    while not is_done:
        env.render()
        state, reward, is_done, _ = env.step(random.randint(0, 1))
    env.reset()
@@ -24,6 +24,7 @@ class Wendtris(gym.Env):
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        seed += 1
        return [seed]

    def reset(self):
@@ -167,3 +168,92 @@ class Wendtris(gym.Env):
        return np.hstack(self.state.flatten()), current_reward, done, info


class Wendtris_Eva(gym.Env):
    # Evaluation environment: replays a fixed, pre-generated sequence of orders and rewards
    def __init__(self, orders, rewards, num_spacewidth=6, num_capacity=6):
        self.orders = orders.astype('float64')
        self.rewards = rewards
        self.num_spacewidth = num_spacewidth
        self.num_capacity = num_capacity
        self.action_space = spaces.Discrete(2)

        # Zero padding rows so the position index can point one past the last order/reward
        orders_offset_row = np.zeros(self.orders.shape[1], dtype=np.float64)
        rewards_offset_row = np.zeros(self.rewards.shape[1])
        self.num_remain_orders = self.orders.shape[0]
        self.orders = np.vstack((self.orders, orders_offset_row))
        self.rewards = np.vstack((self.rewards, rewards_offset_row))

        self.order_pos = 0
        self.reward_pos = 0
        self.count_no_capacity = 0
        self.accepted_orders = []
        self.rejected_orders = []
        self.capacity_list = np.ones((self.num_spacewidth,), dtype=np.float64) * self.num_capacity
        self.tmp = np.array(
            [self.capacity_list, self.orders[self.order_pos], self.rewards[self.reward_pos][0], self.num_remain_orders],
            dtype=object)
        self.state = np.hstack(self.tmp.flatten())

    def step(self, action):