Commit 40385339 authored by Håkon Harnes

completed assignment 08

parent 819ee518
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Task 1 - CartPole-v0\n",
"Implementer Q-læring og bruk det for å løse cartpole-environmentet"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import gym \n",
"import math \n",
"import numpy as np "
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"env = gym.make('CartPole-v0')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Parmeters: cart position, cart velocity, pole angle, pole tip velocity\n",
"\n",
"# Hyperparameters \n",
"BUCKETS = (1, 1, 6, 12) \n",
"EPISODES = 1000\n",
"MIN_LEARNING_RATE = 0.1\n",
"MIN_EPSILON = 0.1\n",
"DISCOUNT = 1.0\n",
"DECAY = 25\n",
"\n",
"# Visualization variables \n",
"SHOW_ENV = 200\n",
"SHOW_STATS = 50"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"q_table = np.random.uniform(low=0, high=1, size=(BUCKETS + (env.action_space.n, )))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50) / 1.]\n",
"lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50) / 1.]"
]
},
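{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cart velocity and pole tip velocity are unbounded in `env.observation_space` (their limits are ±inf), so finite bounds of ±0.5 and ±50 degrees per second (converted to radians) are chosen manually for the discretization. The cart position and pole angle bounds come straight from the environment."
]
},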
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Discretizes the state \n",
"def discretize_state(obs):\n",
" discretized = list()\n",
" \n",
" for i in range(len(obs)):\n",
" scaling = (obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i])\n",
" new_obs = int(round((BUCKETS[i] - 1) * scaling))\n",
" new_obs = min(BUCKETS[i] - 1, max(0, new_obs))\n",
" discretized.append(new_obs)\n",
" \n",
" return tuple(discretized)"
]
},
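{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each observation component is rescaled to the range [0, 1] relative to its bounds and then mapped to the nearest of `BUCKETS[i]` bins. For example, a pole angle at its lower bound maps to bucket 0 and at its upper bound to bucket 5, while the cart position and cart velocity each have only a single bucket and are therefore effectively ignored by the agent."
]
},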
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Chooses what action to take (random or look in Q-Table)\n",
"def choose_action(state):\n",
" if (np.random.random() < epsilon):\n",
" return env.action_space.sample() # Random action\n",
" else:\n",
" return np.argmax(q_table[state]) # Looks up in the Q-Table "
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Updates the Q-Table \n",
"def update_q(state, action, reward, new_state):\n",
" q_table[state][action] += learning_rate * (reward + DISCOUNT * np.max(q_table[new_state]) - q_table[state][action])"
]
},
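{
"cell_type": "markdown",
"metadata": {},
"source": [
"The update above is the standard Q-learning rule\n",
"\n",
"$$Q(s, a) \\leftarrow Q(s, a) + \\alpha \\left( r + \\gamma \\max_{a'} Q(s', a') - Q(s, a) \\right)$$\n",
"\n",
"with learning rate $\\alpha$ = `learning_rate` and discount factor $\\gamma$ = `DISCOUNT`."
]
},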
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Updates epsilon value (logarithmically decreasing)\n",
"def get_epsilon(episode):\n",
" return max(MIN_EPSILON, min(1., 1. - math.log10((episode + 1) / DECAY)))"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# Updates the learning rate (logarithmically decreasing)\n",
"def get_learning_rate(episode):\n",
" return max(MIN_LEARNING_RATE, min(1., 1. - math.log10((episode + 1) / DECAY)))"
]
},
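{
"cell_type": "markdown",
"metadata": {},
"source": [
"Both epsilon and the learning rate follow the same schedule, `max(MIN, min(1, 1 - log10((episode + 1) / DECAY)))`: they stay at 1 for roughly the first `DECAY` episodes and then decrease logarithmically, reaching their minimum of 0.1 around episode 8 · `DECAY`."
]
},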
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Episode Score Average\n",
"0\t 16.0\t 0.16\n",
"50\t 27.0\t 13.12\n",
"100\t 89.0\t 31.8\n",
"150\t 180.0\t 94.97\n",
"200\t 200.0\t 174.65\n",
"250\t 200.0\t 198.52\n",
"300\t 200.0\t 200.0\n",
"350\t 200.0\t 200.0\n",
"400\t 200.0\t 200.0\n",
"450\t 200.0\t 200.0\n",
"500\t 200.0\t 200.0\n",
"550\t 200.0\t 199.33\n",
"600\t 200.0\t 199.33\n",
"650\t 200.0\t 200.0\n",
"700\t 200.0\t 200.0\n",
"750\t 200.0\t 200.0\n",
"800\t 200.0\t 200.0\n",
"850\t 200.0\t 200.0\n",
"900\t 200.0\t 200.0\n",
"950\t 200.0\t 200.0\n",
"\n",
"Completed on episode 228\n"
]
}
],
"source": [
"print('Episode Score Average')\n",
"\n",
"scores = []\n",
"successfulEpisode = -1 \n",
"\n",
"for episode in range(EPISODES):\n",
" render = episode % SHOW_ENV == 0 \n",
" \n",
" # Resets the state \n",
" current_state = discretize_state(env.reset())\n",
" \n",
" # Updates learning rate and epsilon \n",
" learning_rate = get_learning_rate(episode)\n",
" epsilon = get_epsilon(episode)\n",
" \n",
" score = 0\n",
" \n",
" # Plays the game \n",
" done = False\n",
" while not done:\n",
" \n",
" # Renders the current state \n",
" if render:\n",
" env.render()\n",
" \n",
" action = choose_action(current_state) # Chooses action\n",
" obs, reward, done, _ = env.step(action) # Performs action \n",
" new_state = discretize_state(obs) # Discretizes state\n",
" update_q(current_state, action, reward, new_state) # Updates Q-Table\n",
" current_state = new_state # Updates the current state\n",
" score += reward # Updates the score \n",
" \n",
" scores.append(score)\n",
"\n",
" # Calculates the average of the last 100 episodes \n",
" average = sum(scores[-100:]) / 100 \n",
" if(average >= 195.0 and successfulEpisode < 0):\n",
" successfulEpisode = episode\n",
" \n",
" # Prints some statistics for every 50th episode \n",
" if episode % SHOW_STATS == 0: print(f'{episode}\\t {score}\\t {average}')\n",
"\n",
"# Prints the result \n",
"if successfulEpisode > 0:\n",
" print(f'\\nCompleted on episode {successfulEpisode}')\n",
"else:\n",
" print('\\nUnable to complete game')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Task 2A - GridWorld\n",
"Lag et enkelt gridworld-environment. Dette innebærer at environmentet har et\n",
"diskret rutenett, og at en agent kan bevege seg rundt med fire handlinger (opp,\n",
"ned, høyre, venstre). Simuleringen terminerer når agenten har nådd et plassert\n",
"mål-posisjon som gir reward 1. Om man ønsker, kan det legges inn f.eks. solide\n",
"vegger eller farlige områder som gir straff rundt omkring. Environmentet skal\n",
"ha samme interface som cartpole (.step(a)-funksjon, og .reset())\n",
"\n",
"Deretter skal implementasjonen av Q-læring fra forrige oppgave brukes for å\n",
"trene en agent i environmentet. Til slutt skal Q-verdiene visualiserer inne i selve\n",
"environmentet, og dette kan gjøres på flere måter. En m˚ate erå fargelegge rutene\n",
"basert på den høyeste Q-verdien fra tilsvarende rad i Q-tabellen. Alternativt så\n",
"kan man tegne inn piler som peker i samme retning som handlingen med høyest\n",
"Q-verdi.\n",
"\n",
"Tips: Biblioteket pygame er veldig greit for å lage visualisering av environmentet."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import gym \n",
"import math \n",
"import numpy as np \n",
"from gridworld import GridWorld"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"env = GridWorld(800, 64, 1)"
]
},
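{
"cell_type": "markdown",
"metadata": {},
"source": [
"`GridWorld(800, 64, 1)` creates an 800 × 800 pixel window with an 8 × 8 grid (64 cells) and a 1-pixel gap between cells; the goal cell is placed at position (6, 6)."
]
},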
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Hyperparameters \n",
"BUCKETS = (8, 8) \n",
"EPISODES = 100000\n",
"MIN_LEARNING_RATE = 0.1\n",
"MIN_EPSILON = 0.5\n",
"DISCOUNT = 0.95\n",
"DECAY = 500\n",
"\n",
"# Visualization variables \n",
"SHOW_ENV = 10000\n",
"SHOW_STATS = 1000"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# q_table = np.random.uniform(low=0, high=3, size=(BUCKETS + (env.action_space.n, )))\n",
"q_table = np.zeros(BUCKETS + (env.action_space.n, ))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[1], math.radians(50) / 1.]\n",
"lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[1], -math.radians(50) / 1.]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Discretizes the state \n",
"def discretize_state(obs):\n",
" discretized = list()\n",
" \n",
" for i in range(len(obs)):\n",
" scaling = (obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i])\n",
" new_obs = int(round((BUCKETS[i] - 1) * scaling))\n",
" new_obs = min(BUCKETS[i] - 1, max(0, new_obs))\n",
" discretized.append(new_obs)\n",
" \n",
" return tuple(discretized)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Chooses what action to take (random or look in Q-Table)\n",
"def choose_action(state):\n",
" if (np.random.random() < epsilon):\n",
" return env.action_space.sample() # Random action\n",
" else:\n",
" return np.argmax(q_table[state]) # Looks up in the Q-Table "
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Updates the Q-Table \n",
"def update_q(state, action, reward, new_state):\n",
" q_table[state][action] += learning_rate * (reward + DISCOUNT * np.max(q_table[new_state]) - q_table[state][action])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Updates epsilon value (logarithmically decreasing)\n",
"def get_epsilon(episode):\n",
" return max(MIN_EPSILON, min(1., 1. - math.log10((episode + 1) / DECAY)))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Updates the learning rate (logarithmically decreasing)\n",
"def get_learning_rate(episode):\n",
" return max(MIN_LEARNING_RATE, min(1., 1. - math.log10((episode + 1) / DECAY)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"successfulEpisode = -1 \n",
"\n",
"for episode in range(EPISODES):\n",
" render = episode % SHOW_ENV == 0 \n",
" \n",
" # Resets the state \n",
" current_state = discretize_state(env.reset())\n",
" \n",
" # Updates learning rate and epsilon \n",
" learning_rate = get_learning_rate(episode)\n",
" epsilon = get_epsilon(episode)\n",
" \n",
" # Plays the game \n",
" done = False\n",
" while not done:\n",
" \n",
" # Renders the current state \n",
" if render:\n",
" env.render(np.argmax(q_table[current_state]))\n",
" \n",
" action = choose_action(current_state) # Chooses action\n",
" obs, reward, done, _ = env.step(action) # Performs action \n",
" new_state = discretize_state(obs) # Discretizes state\n",
" update_q(current_state, action, reward, new_state) # Updates Q-Table\n",
" current_state = new_state # Updates the current state\n",
" \n",
" if reward == 1.0: \n",
" successfulEpisode = episode \n",
" print(f'Completed @ {episode}')\n",
" \n",
" \n",
" # Prints some statistics for every 50th episode \n",
" if episode % SHOW_STATS == 0: print(f'Episode {episode}')\n",
"\n",
"# Prints the result \n",
"if successfulEpisode > 0:\n",
" print(f'\\nCompleted on episode {successfulEpisode}')\n",
"else:\n",
" print('\\nUnable to complete game')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
from gym import spaces
import numpy as np
import pygame
import math
import time
import sys


class GridWorld:

    # Colors
    WHITE = (255, 255, 255)
    RED = (255, 0, 0)
    GREEN = ( 0, 255, 0)
    BLACK = ( 0, 0, 0)
    LIGHT_RED = (255, 115, 129)

    # Player position
    PLAYER_POSITION = [0, 0]

    # Actions
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3

    def __init__(self, WINDOW_SIZE, RECTANGLE_COUNT, GRID_GAP):
        # Initializes parameters
        self.DISPLAY = pygame.display.set_mode((WINDOW_SIZE, WINDOW_SIZE))
        self.ROW_COUNT = self.COL_COUNT = int(math.sqrt(RECTANGLE_COUNT))
        self.GOAL_POSITION = [self.ROW_COUNT - 2, self.COL_COUNT - 2]
        self.RECTANGLE_SIZE = WINDOW_SIZE / self.COL_COUNT
        self.RECTANGLE_COUNT = RECTANGLE_COUNT
        self.GRID_GAP = GRID_GAP
        self.MAX_MOVES = (2 / 3) * self.RECTANGLE_COUNT
        self.moves = 0

        # Fills the display (background) as black
        self.DISPLAY.fill(self.BLACK)

        # Sets up the observation and action space
        high = np.array([self.ROW_COUNT - 1, self.COL_COUNT - 1], dtype=np.float32)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
        self.action_space = spaces.Discrete(4)

    def reset(self):
        self.PLAYER_POSITION = [0, 0]
        self.moves = 0
        return self.PLAYER_POSITION

    def render(self, action):
        pygame.init()
        self.drawGrid(action)

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

        pygame.display.update()
        pygame.time.delay(1000)

    def drawGrid(self, action):
        # Finds the next position according to the Q-Table
        if action == self.UP:    qTablePos = [self.PLAYER_POSITION[0] - 1, self.PLAYER_POSITION[1]]
        if action == self.DOWN:  qTablePos = [self.PLAYER_POSITION[0] + 1, self.PLAYER_POSITION[1]]
        if action == self.LEFT:  qTablePos = [self.PLAYER_POSITION[0], self.PLAYER_POSITION[1] - 1]
        if action == self.RIGHT: qTablePos = [self.PLAYER_POSITION[0], self.PLAYER_POSITION[1] + 1]

        for row in range(self.ROW_COUNT):
            for col in range(self.COL_COUNT):
                rectangle = pygame.Rect(col * self.RECTANGLE_SIZE, row * self.RECTANGLE_SIZE,
                                        self.RECTANGLE_SIZE - self.GRID_GAP, self.RECTANGLE_SIZE - self.GRID_GAP)

                if [row, col] == self.PLAYER_POSITION:
                    color = self.RED
                elif [row, col] == self.GOAL_POSITION:
                    color = self.GREEN
                elif [row, col] == qTablePos:
                    color = self.LIGHT_RED
                else:
                    color = self.WHITE

                pygame.draw.rect(self.DISPLAY, color, rectangle)

    def step(self, action):
        # Moves the player
        if action == self.UP:    self.PLAYER_POSITION[0] -= 1
        if action == self.DOWN:  self.PLAYER_POSITION[0] += 1
        if action == self.LEFT:  self.PLAYER_POSITION[1] -= 1
        if action == self.RIGHT: self.PLAYER_POSITION[1] += 1

        # Checks if the new position is valid
        if self.PLAYER_POSITION[0] < 0 or self.PLAYER_POSITION[0] >= self.ROW_COUNT:
            done = True
        elif self.PLAYER_POSITION[1] < 0 or self.PLAYER_POSITION[1] >= self.COL_COUNT:
            done = True
        else:
            done = False

        # Checks if the player has reached the goal
        if self.PLAYER_POSITION == self.GOAL_POSITION:
            reward = 1.0
            done = True
        else:
            reward = 0.0

        # Makes sure the solution doesn't use too many moves
        self.moves += 1
        if self.moves > self.MAX_MOVES:
            reward = 0.0
            done = True

        return self.PLAYER_POSITION, reward, done, {}
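

# The task also suggests visualizing the learned Q-values directly, e.g. by coloring
# each cell according to its highest Q-value. The helper below is a minimal,
# hypothetical sketch of that idea and is not used by the notebooks above: it assumes
# a q_table indexed as q_table[row, col, action] and a GridWorld instance, and shades
# each cell from black (lowest value) to bright green (highest value).
def draw_q_values(env, q_table):
    best_q = q_table.max(axis=2)                    # Highest Q-value per cell
    span = (best_q.max() - best_q.min()) or 1.0     # Avoids division by zero
    for row in range(env.ROW_COUNT):
        for col in range(env.COL_COUNT):
            intensity = (best_q[row, col] - best_q.min()) / span
            color = (0, int(255 * intensity), 0)    # Darker = lower value
            rectangle = pygame.Rect(col * env.RECTANGLE_SIZE, row * env.RECTANGLE_SIZE,
                                    env.RECTANGLE_SIZE - env.GRID_GAP, env.RECTANGLE_SIZE - env.GRID_GAP)
            pygame.draw.rect(env.DISPLAY, color, rectangle)
    pygame.display.update()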

from gridworld import GridWorld
import time

env = GridWorld(800, 36, 1)

# Walks the agent to the right until the episode terminates
done = False
while not done:
    env.render(3)                        # render() expects the action to highlight
    obs, reward, done, _ = env.step(3)
    #done = True
\ No newline at end of file