Commit 01dd6c69 authored by lli

Initial commit

.idea
__pycache__
import numpy as np
from numpy import save
from timeit import default_timer as timer
import math
import sys
def resourceAllocator(orderList, rewardList):
maximumSubset = [] # list for storing the index of the accepted orders
result = np.zeros((orderList.shape[0],), dtype=int) # array for storing the maximum total reward of each episode
# setting all elements of the dp array to 0
for episode in range(orderList.shape[0]):
numOfOrder = orderList.shape[1]
# dp stores the different states of the resources
dp = []
for i in range(numOfOrder + 1):
# 6 nested lists of size 7, one per resource type (remaining capacity 0..6)
list6 = []
for j in range(7):
list5 = []
for k in range(7):
list4 = []
for l in range(7):
list3 = []
for m in range(7):
list2 = []
for n in range(7):
list1 = []
for o in range(7):
list1.append(0)
list2.append(list1)
list3.append(list2)
list4.append(list3)
list5.append(list4)
list6.append(list5)
dp.append(list6)
# i represents the order
for i in range(numOfOrder):
# different states, representing the available capacity of each of the 6 resource types
for j in range(1, 7):
for k in range(1, 7):
for l in range(1, 7):
for m in range(1, 7):
for n in range(1, 7):
for o in range(1, 7):
# ignoring the current request, i
if i > 0:
dp[i + 1][j][k][l][m][n][o] = dp[i][j][k][l][m][n][o]
# accepting the current request i, if the remaining capacity allows it
if orderList[episode, i, 0] <= j and orderList[episode, i, 1] <= k and orderList[
episode, i, 2] <= l and orderList[episode, i, 3] <= m and orderList[
episode, i, 4] <= n and orderList[episode, i, 5] <= o:
# compare accepting vs. rejecting the current order
dp[i + 1][j][k][l][m][n][o] = max(dp[i + 1][j][k][l][m][n][o], # if not accept
rewardList[
episode, i, 1] +  # if accepted, add the current reward, then subtract the consumed resources
dp[i][j - orderList[episode, i, 0]][
k - orderList[episode, i, 1]][
l - orderList[episode, i, 2]][
m - orderList[episode, i, 3]][
n - orderList[episode, i, 4]][
o - orderList[episode, i, 5]])
result[episode] = dp[numOfOrder][6][6][6][6][6][6] # maximum reward for this episode
# find the index of the accepted orders
idx = numOfOrder - 1
cur = result[episode]
rem = [6, 6, 6, 6, 6, 6]
subset = []
while cur > 0:
if dp[idx][rem[0]][rem[1]][rem[2]][rem[3]][rem[4]][rem[5]] == cur: # ignoring the current request, i
idx -= 1
else: # accepting the current request, i
subset.append(idx) # adding the request index to the subset
rem[0] -= orderList[episode, idx, 0]
rem[1] -= orderList[episode, idx, 1]
rem[2] -= orderList[episode, idx, 2]
rem[3] -= orderList[episode, idx, 3]
rem[4] -= orderList[episode, idx, 4]
rem[5] -= orderList[episode, idx, 5]
cur -= rewardList[episode, idx, 1]
idx -= 1
maximumSubset.append(subset)
return result, maximumSubset
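# A minimal illustrative sketch (not part of the commit) of the same recurrence reduced to a
# single capacity dimension, so the six nested state loops above are easier to follow:
# dp[i + 1][c] = max(reject order i, reward_i + dp[i][c - weight_i]).
def knapsack_1d(weights, rewards, capacity):
    dp = [[0] * (capacity + 1) for _ in range(len(weights) + 1)]
    for i in range(len(weights)):
        for c in range(1, capacity + 1):
            dp[i + 1][c] = dp[i][c]  # reject order i
            if weights[i] <= c:  # accept order i if it fits into the remaining capacity
                dp[i + 1][c] = max(dp[i + 1][c], rewards[i] + dp[i][c - weights[i]])
    return dp[len(weights)][capacity]
assert knapsack_1d([3, 4, 2], [5, 6, 3], 6) == 9  # best subset: orders 1 and 2 (weights 4 + 2)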
def generateGame(NUM_EPISODES, NUM_ORDERS, NUM_SPACEWIDTH, NUM_CAPACITY, REWARD_MIN_PER_UNIT=1, REWARD_MAX_PER_UNIT=9):
base_element_set = np.array(
# Data based on Wendtris Java code (Hagen)
[[1, 2, 3, 2, 1, 0],
[1, 1, 1, 1, 0, 0],
[0, 2, 2, 0, 0, 0],
[2, 2, 1, 1, 0, 0],
[1, 3, 2, 2, 1, 0],
[2, 1, 1, 1, 3, 3]], dtype=object)
ARR_ORDERLIST = np.zeros((NUM_EPISODES, NUM_ORDERS, NUM_SPACEWIDTH), dtype=np.int8)
ARR_SET_AND_POSITION = np.zeros((NUM_EPISODES, NUM_ORDERS, 3), dtype=np.int8)
# 0 -> element (index of base_element_set)
# 1 -> position (offset)
# 2 -> length of element
ARR_REWARDS = np.zeros((NUM_EPISODES, NUM_ORDERS, 2), dtype=np.float32)
# 0 -> reward per unit
# 1 -> total reward
# Index 0: Set index
ARR_SET_AND_POSITION[:, :, 0] = np.random.randint(base_element_set.shape[0], size=(NUM_EPISODES, NUM_ORDERS))
# Set reward data per unit
ARR_REWARDS[:, :, 0] = np.random.randint(REWARD_MIN_PER_UNIT, REWARD_MAX_PER_UNIT + 1,
size=(NUM_EPISODES, NUM_ORDERS))
# Index 1: Position of element
for i in range(NUM_EPISODES):
for j in range(NUM_ORDERS):
# Set random position for the elements
len_element = len(base_element_set[ARR_SET_AND_POSITION[i, j, 0]])
pos_element = np.random.randint(NUM_SPACEWIDTH - len_element + 1)
ARR_SET_AND_POSITION[i, j, 1] = pos_element
ARR_SET_AND_POSITION[i, j, 2] = len_element
# set reward data for total element
ARR_REWARDS[i, j, 1] = ARR_REWARDS[i, j, 0] * sum(base_element_set[ARR_SET_AND_POSITION[i, j, 0]])
for k in range(len_element):
ARR_ORDERLIST[i, j, pos_element + k] = base_element_set[ARR_SET_AND_POSITION[i, j, 0]][k]
return ARR_ORDERLIST, ARR_REWARDS
if __name__ == "__main__":
np.set_printoptions(threshold=sys.maxsize)
num_episodes = 1000
num_orders = 20
num_spacewidth = 6
num_capacity = 6
orders, rewards = generateGame(num_episodes, num_orders, num_spacewidth, num_capacity)
start = timer()
result, maximumSubset = resourceAllocator(orders, rewards)
result = np.array(result)
maximumSubset = np.array(maximumSubset)
print(result)
print(maximumSubset)
save('order_list.npy', orders)
save('reward_list.npy', rewards)
save('results.npy', result)
save('subset.npy', maximumSubset)
print("without GPU:", timer()-start)
print("--------------------------------")
import torch
import numpy as np
from collections import defaultdict
from wendtris import Wendtris
import matplotlib.pyplot as plt
env = Wendtris()
SEED = 99
def gen_epsilon_greedy_policy(n_action, epsilon):
def policy_function(state, Q):
probs = torch.ones(n_action) * epsilon / n_action
best_action = torch.argmax(Q[tuple(state)]).item()
probs[best_action] += 1.0 - epsilon
action = torch.multinomial(probs, 1).item()
return action
return policy_function
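# Worked example of the distribution built above (illustrative only): with n_action = 2 and
# epsilon = 0.3, every action starts at 0.3 / 2 = 0.15 and the greedy action receives the
# remaining 1 - 0.3 = 0.7, giving probs = [0.85, 0.15] when action 0 has the highest Q value.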
def q_learning(env, gamma, n_episode, alpha, epsilon, epsilon_min, epsilon_decay):
n_action = env.action_space.n
Q = defaultdict(lambda: torch.zeros(n_action))
for episode in range(n_episode):
epsilon_greedy_policy = gen_epsilon_greedy_policy(env.action_space.n, epsilon)
env.seed(99)
state = env.reset()
is_done = False
while not is_done:
action = epsilon_greedy_policy(state, Q)
next_state, reward, is_done, _ = env.step(action)
td_delta = reward + gamma * torch.max(Q[tuple(next_state)]) - Q[tuple(state)][action]
Q[tuple(state)][action] += alpha * td_delta
total_reward_episode[episode] += reward
if is_done:
print('Episode: {}, total reward: {}, epsilon: {}'.format(episode, total_reward_episode[episode],
epsilon))
break
state = next_state
if epsilon >= epsilon_min:
epsilon *= epsilon_decay
policy = {}
for state, actions in Q.items():
policy[tuple(state)] = torch.argmax(actions).item()
return Q, policy
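# Worked example of the tabular update above (illustrative only): with alpha = 0.01, gamma = 1,
# reward = 10, max_a' Q(next_state, a') = 50 and Q(state, action) = 40, the TD error is
# 10 + 1 * 50 - 40 = 20, so the entry is nudged from 40 to 40 + 0.01 * 20 = 40.2.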
gamma = 1
n_episode = 40000
alpha = 0.01
epsilon = 0.3
epsilon_decay = 0.99993
epsilon_min = 0.005
total_reward_episode = [0] * n_episode
optimal_Q, optimal_policy = q_learning(env, gamma, n_episode, alpha, epsilon, epsilon_min, epsilon_decay)
print('The optimal policy:\n', optimal_policy)
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
plt.plot(total_reward_episode)
plt.plot(smoothed_rewards)
plt.title('Episode reward over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.show()
plt.plot(total_reward_episode)
plt.title('Episode reward over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.show()
File added
import numpy as np
def buildWendtrisModel():
global ARR_ALLPOSSIBLEREQUEST_PROB
global ARR_ALLPOSSIBLEREQUEST
avg_reward = 5
ARR_ALLPOSSIBLEREQUEST = np.array(
# Data based on Wendtris Java code (Hagen)
[[1, 2, 3, 2, 1, 0],
[1, 1, 1, 1, 0, 0],
[0, 2, 2, 0, 0, 0],
[2, 2, 1, 1, 0, 0],
[1, 3, 2, 2, 1, 0],
[2, 1, 1, 1, 3, 3]])
ARR_ALLPOSSIBLEREQUEST_PROB = np.ones((6, 2), np.float32)
# 0-> probability for each order
# 1-> EV for each request: Units * AVG reward * probability for each request
# calculated "manually":
# num of different requests, each probability 1/6
ARR_ALLPOSSIBLEREQUEST_PROB[:, 0] /= 6
# resource consumption (units) per request type, hardcoded
ARR_ALLPOSSIBLEREQUEST_PROB[:, 1] = [9, 4, 4, 6, 9, 11]
ARR_ALLPOSSIBLEREQUEST_PROB[:, 1] *= avg_reward / 6
buildWendtrisModel()
print(ARR_ALLPOSSIBLEREQUEST_PROB)
print(ARR_ALLPOSSIBLEREQUEST)
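# Sanity check (illustrative, not part of the original script): the hardcoded unit counts
# [9, 4, 4, 6, 9, 11] used in buildWendtrisModel above are simply the row sums of
# ARR_ALLPOSSIBLEREQUEST, and the stored EV per request type is units * avg_reward * (1/6).
assert np.sum(ARR_ALLPOSSIBLEREQUEST, axis=1).tolist() == [9, 4, 4, 6, 9, 11]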
# Reward-per-unit bounds; they are declared global in the solver below but not defined anywhere
# in this file, so set them here to match the defaults used in generateGame
# (uniform integer rewards per unit in 1..9, average 5).
REWARD_MAX_PER_UNIT = 9
REWARD_AVG_PER_UNIT = 5
sdp_fix_mem = {}
def solveGame_sdp_fixedPosition(cap_remaining: np.array, stage: int):
global REWARD_AVG_PER_UNIT
global REWARD_MAX_PER_UNIT
global ARR_ALLPOSSIBLEREQUEST_PROB
global ARR_ALLPOSSIBLEREQUEST
num_req_types = ARR_ALLPOSSIBLEREQUEST_PROB.shape[0]
cap_remaining_key = cap_remaining.tobytes()
cap_remaining_total = np.sum(cap_remaining)
try:
res_sdp = sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key]
except KeyError:
# Check if parent key (cap_remaining_total) exists and create if needed
try:
tmp2 = sdp_fix_mem[stage][cap_remaining_total]
except KeyError:
# Check if parent key (stage) exists and create if needed
try:
tmp1 = sdp_fix_mem[stage]
except KeyError:
# create key for state
sdp_fix_mem[stage] = {}
# create key for cap_remaining_total
sdp_fix_mem[stage][cap_remaining_total] = {}
# create key for order and calculate value
sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key] = 0.0
# if we are in stage 1, the value is the probability-weighted expected revenue
# of all order types that can (and therefore will) be accepted
if (stage == 1):
for i in range(num_req_types):
if np.min(cap_remaining - ARR_ALLPOSSIBLEREQUEST[i]) >= 0:
sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key] += ARR_ALLPOSSIBLEREQUEST_PROB[i][1]
# if we are not in stage 1, we calculate the threshold price for each order type
# based on the value of the previous stage
# the value of the current stage at the given capacity vector is the expected revenue under this threshold policy
else:
sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key] = 0.0
thres_price = np.ones(num_req_types, np.float32)
thres_price_ceil = np.ones(num_req_types, np.float32)
# Calculate V*(cap,stage-1)
rej_val = solveGame_sdp_fixedPosition(cap_remaining, stage - 1)
for i in range(num_req_types):
tmp_cap_remaining = cap_remaining - ARR_ALLPOSSIBLEREQUEST[i]
# check for request if it can be accepted
if np.min(tmp_cap_remaining) >= 0:
# order can be accepted, calculate V* of stage-1 for the remaining capacity
tmp_Vcapremaining = solveGame_sdp_fixedPosition(tmp_cap_remaining, stage - 1)
# Calculate threshold price and round values to suit the problem with discrete rewards
tmp_sum = np.sum(ARR_ALLPOSSIBLEREQUEST[i])
thres_price[i] = (rej_val - tmp_Vcapremaining) / tmp_sum
if thres_price[i] > 10:
thres_price[i] = 10.0
thres_price_ceil[i] = np.ceil(thres_price[i])
if thres_price_ceil[i] < 1.0:
thres_price_ceil[i] = 1.0
# Formula for Acceptance probability for given threshold price and discrete rewards
acc_prob = (REWARD_MAX_PER_UNIT - thres_price_ceil[i] + 1) / REWARD_MAX_PER_UNIT
# In case the order will be accepted, the EV is the average of all discrete values >= threshold
acc_EV_per_unit = (thres_price_ceil[i] + REWARD_MAX_PER_UNIT) / 2
add_val_acc = acc_prob * ARR_ALLPOSSIBLEREQUEST_PROB[i][0] * (
acc_EV_per_unit * tmp_sum + tmp_Vcapremaining)
# Prorated value and probability in case threshold price is not met
add_val_rej = (1 - acc_prob) * ARR_ALLPOSSIBLEREQUEST_PROB[i][0] * rej_val
else:
# order can't be accepted
thres_price[i] = REWARD_MAX_PER_UNIT + 1
thres_price_ceil[i] = REWARD_MAX_PER_UNIT + 1
# 100% rej probability
add_val_rej = ARR_ALLPOSSIBLEREQUEST_PROB[i][0] * rej_val
add_val_acc = 0
# Update Value in Dictionary
sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key] += add_val_acc + add_val_rej
print("stage: " + str(stage) + ", rem cap: " + str(cap_remaining) + " thres price ceil: " + str(
thres_price_ceil))
print("Calculated Total Val: " + str(sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key]))
return sdp_fix_mem[stage][cap_remaining_total][cap_remaining_key]
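# Worked example for the acceptance formulas above (illustrative only; assumes rewards per unit
# are uniform integers in 1..9, i.e. REWARD_MAX_PER_UNIT = 9 as in generateGame's defaults):
# with a ceiled threshold price of 4, an order is accepted for drawn rewards 4..9, so
# acc_prob = (9 - 4 + 1) / 9 = 6/9 and the expected reward per unit given acceptance is the
# mean of 4..9, i.e. (4 + 9) / 2 = 6.5.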
test = np.array([6, 6, 6, 6, 6, 6])
order = np.array([0, 2, 2, 0, 0, 0])
after_order = np.array([2, 2, 2, 5, 2, 3])
testb = test.tobytes()
after_orderb = after_order.tobytes()
solveGame_sdp_fixedPosition(test, 50)
test = np.array([6, 6, 6, 6, 6, 6])
order = np.array([0, 2, 2, 0, 0, 0])
after_order = np.array([2, 2, 2, 5, 2, 3])
testb = test.tobytes()
after_orderb = after_order.tobytes()
solveGame_sdp_fixedPosition(test, 500)
File added
import os
import sys
import torch
import math, random
import torch.nn as nn
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
from timeit import default_timer as timer
from wendtris import Wendtris
import utils
# Check if using cuda and define device
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')
class DQN(nn.Module):
def __init__(self, n_state, n_action, n_hidden, lr):
super(DQN, self).__init__()
self.criterion = torch.nn.MSELoss()
self.model = torch.nn.Sequential(
torch.nn.Linear(n_state, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_hidden),
torch.nn.ReLU(),
torch.nn.Linear(n_hidden, n_action)
)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
# self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)
def predict(self, state):
"""
Compute the Q values of the state for all actions using the learning model
@param state: input state
@return: Q values of the state for all actions
"""
with torch.no_grad():
return self.model(torch.Tensor(state).to(device))
def update(self, state, y):
"""
Update the weights of the DQN given a training sample
@param state: state
@param y: target value
"""
y_pred = self.model(torch.Tensor(state).to(device))
y = torch.Tensor(y).to(device)
loss = self.criterion(y_pred, y)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# self.scheduler.step()
return loss
def choose_action(self, state, n_action, epsilon):
"""
Choose an action via epsilon-greedy exploration
@param state: input state
@param n_action: number of possible actions
@param epsilon: exploration rate epsilon
"""
p = random.random()
if p < epsilon:
# random action
return random.randint(0, n_action - 1)
else:
# take the action with maximal reward
q_values = self.predict(state)
return torch.argmax(q_values).item()
def replay(self, replay_buffer, replay_batch_size, gamma):
"""
Experience replay
@param replay_buffer: a list of experience
@param replay_batch_size: the number of samples we use to update the model each time
@param gamma: the discount factor
"""
if len(replay_buffer) >= replay_batch_size:
replay_data = random.sample(replay_buffer, replay_batch_size)
states = []
td_targets = []
for state, action, next_state, reward, is_done in replay_data:
states.append(state)
q_values = self.predict(state).tolist()
if is_done:
q_values[action] = reward
else:
q_values_next = self.predict(next_state)
q_values[action] = reward + gamma * torch.max(q_values_next).item()
td_targets.append(q_values)
loss = self.update(states, td_targets)
return loss
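# Minimal standalone sketch of the DQN class above on random data (illustrative only; the
# dimensions are arbitrary and unrelated to the training run below): predict returns one Q
# value per action, and update performs a single MSE gradient step towards a target vector.
_sketch_dqn = DQN(n_state=4, n_action=2, n_hidden=8, lr=0.001).to(device)
_sketch_state = np.random.rand(4).astype(np.float32)
_sketch_q = _sketch_dqn.predict(_sketch_state)                # tensor of shape (2,)
_sketch_loss = _sketch_dqn.update(_sketch_state, [0.0, 1.0])  # one training step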
# Initialize the environment, with penalty factor 2
env = Wendtris(20, 6, 6, 2)
# Hyperparameters
n_state = len(env.state)
n_action = env.action_space.n
n_hidden = 12
lr = 0.001
replay_buffer = deque(maxlen=10000)
replay_batch_size = 64
n_episode = 100
gamma = 1
total_reward_episode = [0] * n_episode
num_no_capacity = []
accepted_orders = []
total_rewards = []
epsilon_start = 1.0
epsilon_decay = 0.999
epsilon_value = []
losses = []
dqn = DQN(n_state, n_action, n_hidden, lr)
if USE_CUDA:
dqn = dqn.to(device)
SEED = 100
OUT_PATH = "output/dqn"
LOG_FILE = os.path.join(OUT_PATH, 'log.txt')
utils.clear_folder(OUT_PATH)
print(f'Logging to {LOG_FILE}\n')
sys.stdout = utils.StdOut(LOG_FILE)
print(f"PyTorch version {torch.__version__}")
if USE_CUDA:
print(f"CUDA version: {torch.version.cuda}")
start_time = timer()
for episode in range(n_episode):
epsilon = epsilon_start * epsilon_decay ** episode
epsilon_value.append(epsilon)
env.seed(SEED)
state = env.reset()
is_done = False
while not is_done:
action = dqn.choose_action(state, n_action, epsilon)
next_state, reward, is_done, info = env.step(action)
total_reward_episode[episode] += reward
replay_buffer.append((state, action, next_state, reward, is_done))
if is_done:
num_no_capacity.append(info['Number no capacity'])
accepted_orders.append(info['Accepted orders'])
break
loss = dqn.replay(replay_buffer, replay_batch_size, gamma)
losses.append(loss)
state = next_state
SEED += 1
print(
f'Episode: {episode}, total reward: {total_reward_episode[episode]}, epsilon: {epsilon}, loss: {loss}, '
f'num_no_capacity: {num_no_capacity[episode]}, accepted orders: {accepted_orders[episode]}')
print(f"Training time for {n_episode} episodes: {timer() - start_time}")
# save the model parameters
torch.save(dqn.state_dict(), os.path.join(OUT_PATH, 'dqn_{}.pkl'.format(n_episode)))
# Plot the total reward of training model
# Cumulative Average reward received over time
smoothed_rewards = [np.mean(total_reward_episode[:i+1]) for i in range(len(total_reward_episode))]
plt.plot(total_reward_episode)
plt.plot(smoothed_rewards)
plt.title('Episode rewards over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.savefig(os.path.join(OUT_PATH, 'training_total_rewards.png'), dpi=1200, transparent=True)
plt.show()
# Plot epsilon
plt.plot(epsilon_value)
plt.title('Epsilon over time')
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.savefig(os.path.join(OUT_PATH, 'epsilon.png'), dpi=1200, transparent=True)
plt.show()
# Plot number of penalties
num_no_capacity_smoothed = [np.mean(num_no_capacity[:i+1]) for i in range(len(num_no_capacity))]
plt.plot(num_no_capacity)
plt.plot(num_no_capacity_smoothed)
plt.title('Number of accepted orders without capacity')
plt.xlabel('Episode')