1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.DS_Store
171 changes: 136 additions & 35 deletions Learner.py
@@ -1,93 +1,194 @@
__author__ = 'vivek'

import World
import threading
import time

import random

# Initial values
discount = 0.3  # discount factor for future rewards
actions = World.actions  # available actions, defined by the World module
states = []  # every (x, y) square on the board
Q = {}  # Q[state][action] -> learned value of taking that action from that state

# Define World: load all available states (squares) in the World
for i in range(World.x):
    for j in range(World.y):
        states.append((i, j))

# For every state, set the default Q value for each action to 0.1
for state in states:
    temp = {}
    for action in actions:
        temp[action] = 0.1
        World.set_cell_score(state, action, temp[action])
    Q[state] = temp

# For (x, y, color, score) in World.specials, update Q so that reaching
# this square through any action earns the special score instead
for (i, j, c, w) in World.specials:
    for action in actions:
        Q[(i, j)][action] = w
        World.set_cell_score((i, j), action, w)

# Value Functions
def max_Q(s):
    """ From the current position, check which available action has the highest Q value.
    If more than one action ties for the best value, choose among them randomly.

    input:
        s: current position

    output:
        best_a: best action
        best_q: best Q value for that action"""

best_q = None
best_a = None

    for a, q in Q[s].items():  # for every action and its Q value
        if best_q is None or (q > best_q):  # no best yet, or this q is higher
            best_q = q
            best_a = a

    options = [x for x in Q[s].items() if x[1] == best_q]  # all actions tied for the best Q

    if len(options) > 1:
        # If more than one action ties for the best value, pick one at random
        best_a, best_q = random.choice(options)

    if best_q < 0.1:
        # If all the options are bad (below the default reward of 0.1), do something random!
        best_a, best_q = random.choice(list(Q[s].items()))

return best_a, best_q
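
# Note: falling back to a random action whenever every Q value sits below the
# 0.1 default is an ad-hoc stand-in for epsilon-greedy exploration, in which a
# random action is instead taken with some small probability on every step.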



def inc_Q(s, a, alpha, inc):
    """ Given the position, action, learning rate, and update target, set the new cell score.

    alpha blends the old Q value with the new target: the smaller alpha is, the less
    a single new observation changes the stored Q value.

    s: current position
    a: current action
    alpha: current learning rate
    inc: update target, r + discount * max_Q(s2) (observed reward plus the discounted best Q at the next position)
    """

    # Exponentially weighted update for the specific action
    Q[s][a] *= 1 - alpha  # scale the old Q value down by (1 - alpha)
    Q[s][a] += alpha * inc  # blend in the new target, weighted by alpha

    World.set_cell_score(s, a, Q[s][a])  # show the updated Q[s][a] on the board
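
# The two lines above implement the standard Q-learning update,
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + discount * max Q(s2, a')),
# with inc playing the role of the parenthesized target term.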

# Moving Functions
def do_action(action):
    """
    Given an action, make an actual move in the real World.

    s = player position before the move
    r = change in World.score caused by the move (step cost plus any special reward)
    s2 = player position after the move
    """
    s = World.player
    r = -World.score  # subtract the score before moving; adding it back afterwards leaves the change

    if action == actions[0]:
        # down
        World.try_move(0, -1)
    elif action == actions[1]:
        # up
        World.try_move(0, 1)
    elif action == actions[2]:
        # left
        World.try_move(-1, 0)
    elif action == actions[3]:
        # right
        World.try_move(1, 0)
    else:
        return

s2 = World.player
r += World.score
return s, action, r, s2
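
# The (s, a, r, s2) tuple returned above is the standard one-step transition
# (state, action, reward, next state) consumed by the Q-learning update in run().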



# Start Game
def run():
global discount
time.sleep(1)

    alpha = 1  # learning rate, decays as t grows
    beta = None  # update target, r + discount * max_Q(s2)

    t = 1  # moves taken in the current life

    stuck = 0  # counts moves that failed to change the player's position
    old_s = (0, 0)  # position before the previous move, used by the stuck check

while True:
s = World.player # Starting Position

if s == old_s: # Stuck Check
stuck += 1
#print('Been in the same spot this many times:{}'.format(stuck))

        # Current position before the move
        #print('Current Position: {},\n Potential Actions: {}\n\n'.format(s, Q[s]))

# Potential Actions
max_act, max_val = max_Q(s) # find the next highest Q from position s
#print('Suggested Action: {},\n Q Value: {}\n\n'.format(max_act, max_val))

# Making Move
(s, a, r, s2) = do_action(max_act) # Return the results of an action from old s to new s2
        #print('New Position: {},\n Current Score: {},\n Actual Move: {},\n Old Position: {}\n\n'.format(s2, r, a, s))

        # Learning consequences
        # alpha: learning rate; it decays as t grows, so moves late in a long life
        #        change the Q values less than early ones.
        # beta: update target; combines the observed reward, the discount, and the
        #       best Q value at the new position (computed below, after the move).

        epsilon = pow(t, -0.005)  # decaying exploration rate (currently unused)
        alpha = pow(t, -0.25)
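
        # For example, at t = 16 the learning rate is alpha = 16 ** -0.25 = 0.5,
        # so the next update is an even blend of the old Q value and the new target.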

        # Check values at the new position
        max_act, max_val = max_Q(s2)  # best action and Q value at s2
        #print('Suggested Action at New Position: {},\nQ Value at New Position: {}\n'.format(max_act, max_val))

        # Update the Q matrix
        beta = r + discount * max_val  # the target must use the best Q at s2, not at s
        inc_Q(s, a, alpha, beta)
        #print('Updating Q value at Position: {},\n for Action: {}, \n with Alpha {},\n and Beta {}\n'.format(s, a, alpha, beta))

        # Remember the position before this move for the next stuck check
        old_s = s

        # Check whether the game should restart
        t += 1.0
        print('Moves {} | Score {}'.format(t, round(World.score, 2)))

        if World.has_restarted() or (t > 100) or (stuck > 10):  # goal reached, 100 moves per life, or stuck more than 10 times
            World.restart_game()
            time.sleep(0.01)

            t = 1.0
            stuck = 0

        # Increase this sleep if the game is going too fast to watch
        time.sleep(0.01)  # seconds between moves


t = threading.Thread(target=run)
t.daemon = True
t.start()

World.start_game()
25 changes: 16 additions & 9 deletions README.md
@@ -1,10 +1,11 @@
# Q Learning Demo

This is the code for "How to use Q Learning in Video Games Easily" by Siraj Raval on YouTube.

## Overview

This is the associated code for [this](https://youtu.be/A5eihauRQvo) video on YouTube by Siraj Raval. It is a simple example of a type of [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning)
called [Q learning](https://en.wikipedia.org/wiki/Q-learning).

● Rules: The agent (yellow box) has to reach one of the goals (the green or red cell) to end the game.
● Rewards: Each step gives a reward of -0.04. The red cell gives a reward of -1 and the green one a reward of +1.
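
To make these numbers concrete, here is one Q update worked by hand (a sketch for illustration only; the 0.3 discount and the `pow(t, -0.25)` learning rate come from `Learner.py`, while the specific Q values are made up):

```python
# One Q-learning update, worked by hand with the rewards above.
# Assume: the step landed on an ordinary square (r = -0.04), the best Q value
# at the next square is 1.0 (it borders the green goal), the stored Q value
# for the move taken was the 0.1 default, and alpha = 0.5 (i.e. t = 16).
alpha = 0.5
discount = 0.3
r = -0.04
old_q = 0.1
best_next_q = 1.0

target = r + discount * best_next_q            # -0.04 + 0.3 * 1.0 = 0.26
new_q = (1 - alpha) * old_q + alpha * target   # 0.5 * 0.1 + 0.5 * 0.26 = 0.18

print(new_q)  # ~0.18: the move toward the goal now looks better than the default
```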
@@ -16,24 +17,30 @@ called [Q learning](https://en.wikipedia.org/wiki/Q-learning).
- Python 2.7
- tkinter

On Ubuntu you can install tkinter for Python 2.7 with `sudo apt-get install python-tk`.

## Usage

Run `python Learner.py` in a terminal to see the bot in action.

## Challenge

The challenge for this video is to

* modify the game world so that it's bigger
* add more obstacles
* have the bot start in a different position

**Bonus points if you modify the bot in some way that makes it more efficient**

## Solution

My solution features the following (a sketch of the random-start idea follows the list):

* `random_start()` to start anywhere on the board.
* a `difficulty` parameter that scales up the number of walls.
* `create_reds()`, `create_greens()`, and `create_walls()` that add more special squares to help and hinder the agent.
* `(x, y)` scaling for larger boards.
* `max_q()` changes for more randomized/Q-sensitive agent decision making.
* Tons more documentation (for my own learning).
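
For reference, a minimal sketch of what the random-start idea could look like. It assumes the `World` module exposes the board size as `World.x`/`World.y`, the special cells as `World.specials`, and an assignable `World.player`, as `Learner.py` suggests; the actual implementation in this PR may differ:

```python
import random
import World

def random_start():
    """Place the agent on a random non-special square (illustrative sketch only)."""
    taken = set((i, j) for (i, j, c, w) in World.specials)  # goal/penalty cells
    free = [(i, j) for i in range(World.x)
            for j in range(World.y)
            if (i, j) not in taken]
    World.player = random.choice(free)  # assumes World.player can be reassigned
```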

## Credits
