Commit beaa6350 authored by Håkon Harnes

completed assignment 08

parent 6633f4c0
......@@ -14,7 +14,7 @@
"\n",
"Deretter skal implementasjonen av Q-læring fra forrige oppgave brukes for å\n",
"trene en agent i environmentet. Til slutt skal Q-verdiene visualiserer inne i selve\n",
"environmentet, og dette kan gjøres på flere måter. En m˚ate erå fargelegge rutene\n",
"environmentet, og dette kan gjøres på flere måter. En måte erå fargelegge rutene\n",
"basert på den høyeste Q-verdien fra tilsvarende rad i Q-tabellen. Alternativt så\n",
"kan man tegne inn piler som peker i samme retning som handlingen med høyest\n",
"Q-verdi.\n",
......@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 64,
"metadata": {
"scrolled": true
},
......@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 65,
"metadata": {
"scrolled": true
},
......@@ -49,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 66,
"metadata": {
"scrolled": true
},
......@@ -57,7 +57,7 @@
"source": [
"# Hyperparameters \n",
"BUCKETS = (8, 8) \n",
"EPISODES = 5000\n",
"EPISODES = 3000\n",
"MIN_LEARNING_RATE = 0.1\n",
"MIN_EPSILON = 0.1\n",
"DISCOUNT = 1.0\n",
......@@ -69,19 +69,18 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 67,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# q_table = np.random.uniform(low=0, high=3, size=(BUCKETS + (env.action_space.n, )))\n",
"q_table = np.zeros(BUCKETS + (env.action_space.n, ))"
]
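With BUCKETS = (8, 8) and four actions (matching the four arrow images added to gridworld.py), the zero-initialized table stores four action values per grid cell; a quick check, assuming env.action_space.n == 4:

import numpy as np

q_table = np.zeros((8, 8, 4))   # BUCKETS + (env.action_space.n,) for an 8x8 grid with 4 actions
print(q_table.shape)            # (8, 8, 4)
print(q_table[3, 5])            # the four action values stored for grid cell (3, 5)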
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 68,
"metadata": {
"scrolled": true
},
......@@ -93,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 69,
"metadata": {
"scrolled": true
},
......@@ -114,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 70,
"metadata": {
"scrolled": true
},
......@@ -130,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 71,
"metadata": {
"scrolled": true
},
......@@ -143,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 72,
"metadata": {
"scrolled": true
},
......@@ -156,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
......@@ -167,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 74,
"metadata": {
"scrolled": false
},
......@@ -177,21 +176,19 @@
"output_type": "stream",
"text": [
"Episode Score\n",
"500\t 7.2%\n",
"1000\t 42.0%\n",
"1500\t 91.4%\n",
"2000\t 98.8%\n",
"2500\t 99.6%\n",
"3000\t 99.8%\n",
"3500\t 100.0%\n",
"4000\t 100.0%\n",
"4500\t 100.0%\n",
"5000\t 100.0%\n"
"500\t 8.2%\n",
"1000\t 44.4%\n",
"1500\t 92.6%\n",
"2000\t 98.2%\n",
"2500\t 99.8%\n",
"3000\t 100.0%\n"
]
}
],
"source": [
"print('Episode Score')\n",
"\n",
"scores = []\n",
"completionCount = 0 \n",
"\n",
"for episode in range(EPISODES):\n",
......@@ -202,31 +199,28 @@
" learning_rate = get_learning_rate(episode)\n",
" epsilon = get_epsilon(episode)\n",
" \n",
" # Plays the game \n",
" # Runs through an episode \n",
" done = False\n",
" while not done:\n",
" \n",
" # Renders the last episode\n",
" #if episode == (EPISODES - 1):\n",
" # env.render()\n",
" \n",
" action = choose_action(current_state) # Chooses action\n",
" obs, reward, done, _ = env.step(action) # Performs action \n",
" new_state = tuple(obs) # Discretizes new state\n",
" update_q(current_state, action, reward, new_state) # Updates Q-Table\n",
" update_q(current_state, action, reward, new_state) # Updates the Q-Table\n",
" current_state = new_state # Updates the current state\n",
" \n",
" if reward == 10.0: completionCount += 1 \n",
" \n",
" # Prints some statistics \n",
" if (episode + 1) % SHOW_STATS == 0: \n",
" print(f'{episode + 1}\\t {round((completionCount / SHOW_STATS) * 100, 2)}%')\n",
" completionCount = 0 "
" score = round((completionCount / SHOW_STATS) * 100, 2)\n",
" completionCount = 0\n",
" print(f'{episode + 1}\\t {score}%') "
]
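update_q is defined in an earlier cell that this diff does not show. A standard tabular Q-learning update that fits the learning_rate and DISCOUNT values used here would be (a sketch of the usual rule, not necessarily the notebook's exact code):

def update_q(state, action, reward, new_state):
    # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = np.max(q_table[new_state])
    q_table[state + (action,)] += learning_rate * (reward + DISCOUNT * best_next - q_table[state + (action,)])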
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 75,
"metadata": {},
"outputs": [
{
......@@ -249,18 +243,22 @@
}
],
"source": [
"epsilon = 0.0 \n",
"current_state = tuple(env.reset()) \n",
"\n",
"done = False \n",
"while not done:\n",
" action = choose_action(current_state) # Chooses action\n",
" obs, reward, done, _ = env.step(action) # Performs action \n",
" \n",
" # Chooses and performs action\n",
" action = choose_action(current_state) \n",
" obs, reward, done, _ = env.step(action) \n",
" \n",
" # Sets new state\n",
" new_state = tuple(obs)\n",
" current_state = new_state \n",
" \n",
" env.render()\n",
" # Renders the frame \n",
" env.render(q_table)\n",
" print(obs)"
]
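Setting epsilon = 0.0 makes choose_action purely greedy during this replay. choose_action itself is not shown in the diff; a typical epsilon-greedy version, assumed here, would be:

def choose_action(state):
    # Explore with probability epsilon, otherwise exploit the highest Q-value
    if np.random.random() < epsilon:
        return np.random.randint(env.action_space.n)
    return int(np.argmax(q_table[state]))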
}
......@@ -270,6 +268,18 @@
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
......
......@@ -14,7 +14,7 @@
"\n",
"Deretter skal implementasjonen av Q-læring fra forrige oppgave brukes for å\n",
"trene en agent i environmentet. Til slutt skal Q-verdiene visualiserer inne i selve\n",
"environmentet, og dette kan gjøres på flere måter. En m˚ate erå fargelegge rutene\n",
"environmentet, og dette kan gjøres på flere måter. En måte erå fargelegge rutene\n",
"basert på den høyeste Q-verdien fra tilsvarende rad i Q-tabellen. Alternativt så\n",
"kan man tegne inn piler som peker i samme retning som handlingen med høyest\n",
"Q-verdi.\n",
......@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 64,
"metadata": {
"scrolled": true
},
......@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 65,
"metadata": {
"scrolled": true
},
......@@ -49,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 66,
"metadata": {
"scrolled": true
},
......@@ -57,7 +57,7 @@
"source": [
"# Hyperparameters \n",
"BUCKETS = (8, 8) \n",
"EPISODES = 5000\n",
"EPISODES = 3000\n",
"MIN_LEARNING_RATE = 0.1\n",
"MIN_EPSILON = 0.1\n",
"DISCOUNT = 1.0\n",
......@@ -69,19 +69,18 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 67,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# q_table = np.random.uniform(low=0, high=3, size=(BUCKETS + (env.action_space.n, )))\n",
"q_table = np.zeros(BUCKETS + (env.action_space.n, ))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 68,
"metadata": {
"scrolled": true
},
......@@ -93,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 69,
"metadata": {
"scrolled": true
},
......@@ -114,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 70,
"metadata": {
"scrolled": true
},
......@@ -130,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 71,
"metadata": {
"scrolled": true
},
......@@ -143,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 72,
"metadata": {
"scrolled": true
},
......@@ -156,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
......@@ -167,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 74,
"metadata": {
"scrolled": false
},
......@@ -177,21 +176,19 @@
"output_type": "stream",
"text": [
"Episode Score\n",
"500\t 11.2%\n",
"1000\t 45.8%\n",
"1500\t 91.4%\n",
"2000\t 99.0%\n",
"2500\t 99.6%\n",
"3000\t 100.0%\n",
"3500\t 100.0%\n",
"4000\t 100.0%\n",
"4500\t 100.0%\n",
"5000\t 100.0%\n"
"500\t 8.2%\n",
"1000\t 44.4%\n",
"1500\t 92.6%\n",
"2000\t 98.2%\n",
"2500\t 99.8%\n",
"3000\t 100.0%\n"
]
}
],
"source": [
"print('Episode Score')\n",
"\n",
"scores = []\n",
"completionCount = 0 \n",
"\n",
"for episode in range(EPISODES):\n",
......@@ -209,27 +206,27 @@
" action = choose_action(current_state) # Chooses action\n",
" obs, reward, done, _ = env.step(action) # Performs action \n",
" new_state = tuple(obs) # Discretizes new state\n",
" update_q(current_state, action, reward, new_state) # Updates Q-Table\n",
" update_q(current_state, action, reward, new_state) # Updates the Q-Table\n",
" current_state = new_state # Updates the current state\n",
" \n",
" if reward == 10.0: completionCount += 1 \n",
" \n",
" # Prints some statistics \n",
" if (episode + 1) % SHOW_STATS == 0: \n",
" print(f'{episode + 1}\\t {round((completionCount / SHOW_STATS) * 100, 2)}%')\n",
" completionCount = 0 "
" score = round((completionCount / SHOW_STATS) * 100, 2)\n",
" completionCount = 0\n",
" print(f'{episode + 1}\\t {score}%') "
]
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1, 0]\n",
"[1, 0]\n",
"[2, 0]\n",
"[3, 0]\n",
......@@ -246,6 +243,7 @@
}
],
"source": [
"epsilon = 0.0 \n",
"current_state = tuple(env.reset()) \n",
"\n",
"done = False \n",
......@@ -259,8 +257,8 @@
" new_state = tuple(obs)\n",
" current_state = new_state \n",
" \n",
" # Renders frame \n",
" env.render()\n",
" # Renders the frame \n",
" env.render(q_table)\n",
" print(obs)"
]
}
......@@ -270,6 +268,18 @@
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
......
......@@ -7,6 +7,14 @@ import sys
class GridWorld:

    # Arrows
    ARROWS = [
        pygame.image.load('resources/up_arrow.jpg'),
        pygame.image.load('resources/down_arrow.jpg'),
        pygame.image.load('resources/left_arrow.jpg'),
        pygame.image.load('resources/right_arrow.jpg')
    ]

    # Colors
    WHITE = (255, 255, 255)
    RED = (255, 0, 0)
......@@ -31,12 +39,16 @@ class GridWorld:
        self.ROW_COUNT = self.COL_COUNT = int(math.sqrt(RECTANGLE_COUNT))
        self.OBSTACLE_POSITION = [self.ROW_COUNT - 4, self.COL_COUNT - 2]
        self.GOAL_POSITION = [self.ROW_COUNT - 2, self.COL_COUNT - 2]
        self.RECTANGLE_SIZE = WINDOW_SIZE / self.COL_COUNT
        self.RECTANGLE_SIZE = int(WINDOW_SIZE / self.COL_COUNT)
        self.RECTANGLE_COUNT = RECTANGLE_COUNT
        self.MAX_MOVES = RECTANGLE_COUNT
        self.moves = 0
        self.GRID_GAP = GRID_GAP

        # Resizes the arrows so each one fits inside a single grid cell
        for i in range(len(self.ARROWS)):
            self.ARROWS[i] = pygame.transform.scale(self.ARROWS[i], (self.RECTANGLE_SIZE - self.GRID_GAP, self.RECTANGLE_SIZE - self.GRID_GAP))

        # Fills the display (background) with black
        self.DISPLAY.fill(self.BLACK)
......@@ -52,9 +64,9 @@ class GridWorld:
        return self.PLAYER_POSITION

    def render(self):
    def render(self, q_table):
        pygame.init()
        self.drawGrid()
        self.drawGrid(q_table)

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
......@@ -65,23 +77,24 @@ class GridWorld:
        pygame.time.delay(1000)

    def drawGrid(self):
    def drawGrid(self, q_table):
        for row in range(self.ROW_COUNT):
            for col in range(self.COL_COUNT):

                # Draws the arrow for the greedy action in this cell
                action = np.argmax(q_table[(row, col)])
                self.DISPLAY.blit(self.ARROWS[action], (col * self.RECTANGLE_SIZE, row * self.RECTANGLE_SIZE))

                rectangle = pygame.Rect(col * self.RECTANGLE_SIZE, row * self.RECTANGLE_SIZE,
                                        self.RECTANGLE_SIZE - self.GRID_GAP, self.RECTANGLE_SIZE - self.GRID_GAP)

                # Draws the player, goal and obstacle
                if [row, col] == self.PLAYER_POSITION:
                    color = self.RED
                    pygame.draw.rect(self.DISPLAY, self.RED, rectangle)
                elif [row, col] == self.GOAL_POSITION:
                    color = self.GREEN
                    pygame.draw.rect(self.DISPLAY, self.GREEN, rectangle)
                elif [row, col] == self.OBSTACLE_POSITION:
                    color = self.BLACK
                else:
                    color = self.WHITE
                pygame.draw.rect(self.DISPLAY, color, rectangle)
                    # pygame.draw.polygon(self.DISPLAY, self.BLACK, [[col * self.RECTANGLE_SIZE, row * self.RECTANGLE_SIZE], [0, 100], [100, 50]])
                    pygame.draw.rect(self.DISPLAY, self.BLACK, rectangle)

    def step(self, action):
......
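drawGrid now uses np.argmax, so gridworld.py needs numpy in addition to pygame, math and the import sys visible in the hunk header; a sketch of the assumed import block near the top of the file:

import math
import sys

import numpy as np
import pygame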
from gridworld import GridWorld
import numpy as np

env = GridWorld(800, 64, 1)

# render() now expects a Q-table; an all-zero table (8x8 grid, 4 actions assumed) is enough for a smoke test
q_table = np.zeros((env.ROW_COUNT, env.COL_COUNT, env.action_space.n))

done = False
while not done:
    env.render(q_table)
    print(env.step(3))
\ No newline at end of file
import numpy as np

# global variables
BOARD_ROWS = 3
BOARD_COLS = 4
WIN_STATE = (0, 3)
LOSE_STATE = (1, 3)
START = (2, 0)
DETERMINISTIC = True


class State:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC
    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif self.state == LOSE_STATE:
            return -1
        else:
            return 0

    def isEndFunc(self):
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True
    def nxtPosition(self, action):
        """
        action: up, down, left, right
        Board indices (3 rows x 4 columns):
            cols:  0 | 1 | 2 | 3
            rows:  0, 1, 2
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nxtState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nxtState = (self.state[0], self.state[1] - 1)
            else:
                nxtState = (self.state[0], self.state[1] + 1)

            # if next state is legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS - 1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS - 1)):
                    if nxtState != (1, 1):
                        return nxtState
            return self.state
    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = '*'
                if self.board[i, j] == -1:
                    token = 'z'
                if self.board[i, j] == 0:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')
# Agent of player
class Agent:
    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0
    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                # if the action is deterministic
                nxt_reward = self.state_values[self.State.nxtPosition(a)]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action
    def takeAction(self, action):
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        self.states = []
        self.State = State()
    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of the game, back-propagate the reward through the visited states
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = reward  # this is optional
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")
    def showValues(self):
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------')

if __name__ == "__main__":
    ag = Agent()
    ag.play(50)
    ag.showValues()
\ No newline at end of file
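The backward sweep in play() moves each visited state's value a fraction lr towards the value of the state that followed it, starting from the terminal reward. A small worked example with lr = 0.2 and two visited states that both start at 0:

lr = 0.2
state_values = {'s1': 0.0, 's2': 0.0}
reward = 1.0                                   # terminal reward from WIN_STATE

for s in reversed(['s1', 's2']):               # 's2' was visited last, so it updates first
    reward = state_values[s] + lr * (reward - state_values[s])
    state_values[s] = round(reward, 3)

print(state_values)                            # {'s1': 0.04, 's2': 0.2}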