1 Q-learning example

-------
|A|B|C| 
-------
|D|E|F|
-------

States: A, B, C, D, E, F
Paths: from A you can go to B or D; from B you can go to A, C, or E; and so on.
Goal state: C


Q-learning finds the best action to take from every state; together these actions form the policy.
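As a quick worked example of the update rule used below (all numbers come from the gamma and R matrix defined next): moving from B into the goal C gives Q(B,C) = R(B,C) + gamma * max(Q(C,)) = 10 + 0.9 * 0 = 10; a later move from A to B then gives Q(A,B) = 0 + 0.9 * 10 = 9, and a move from D to A gives Q(D,A) = 0 + 0.9 * 9 = 8.1. Repeating this over many random episodes propagates the goal reward backwards through the maze.
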
# initialize gamma (the discount factor)
gamma <- 0.9

# initialize the R (reward) matrix
R <- matrix(nrow=6, ncol=6)

rownames(R) <- colnames(R) <- c('A','B','C','D','E','F')

R[1,] <- c(-1, 0,-1, 0,-1,-1)
R[2,] <- c( 0,-1,10,-1, 0,-1)
R[3,] <- c(-1,-1,-1,-1,-1,-1)
R[4,] <- c( 0,-1,-1,-1, 0,-1)
R[5,] <- c(-1, 0,-1, 0,-1, 0)
R[6,] <- c(-1,-1,10,-1, 0,-1)

# R matrix (rows = current state, columns = action / next state)
# environment rewards are stored in matrix R
# matrix values: -1 = no path, 0 = path, 10 = path into the goal
R
##    A  B  C  D  E  F
## A -1  0 -1  0 -1 -1
## B  0 -1 10 -1  0 -1
## C -1 -1 -1 -1 -1 -1
## D  0 -1 -1 -1  0 -1
## E -1  0 -1  0 -1  0
## F -1 -1 10 -1  0 -1
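Since -1 marks a missing path, the valid moves from any state can be read straight off the corresponding row of R. A minimal check (a sketch using only base R; it is not part of the training code):

# list the states reachable from B; with the matrix above this returns "A" "C" "E"
names(which(R["B", ] >= 0))
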
# Initialize matrix Q to zero
Q <- R 
Q[,] <- 0

Q
##   A B C D E F
## A 0 0 0 0 0 0
## B 0 0 0 0 0 0
## C 0 0 0 0 0 0
## D 0 0 0 0 0 0
## E 0 0 0 0 0 0
## F 0 0 0 0 0 0
# -------- Training ----------

# Each episode is equivalent to one training session.
for (i in 1:1000){ # 1000 episodes
  
  # Select a random initial state
  state <- sample(rownames(R),1)
  
  while(state != "C") {
    
    # Randomly select one action from all possible actions for the current state
    actions <- R[state,][R[state,]>=0]
    action <- sample(names(actions), 1)
    
    # Move to the next state via the selected action
    # (in this formulation each action is simply named after the state it leads to)
    nextState <- action
    
    # Get maximum Q value for the next state 
    # Q(state, action) = R(state, action) + Gamma * Max[Q(next state, all actions)]
    Q[state, action] <- R[state, action] + gamma * max(Q[nextState,]) 
    
    # Set the next state as the current state
    state <- nextState
  }
}
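
The loop above uses the simplified deterministic form of Q-learning (in effect a learning rate of 1), which is enough for this small deterministic maze. A sketch of the more general update, assuming a hypothetical learning rate alpha, blends the new estimate into the old value instead of overwriting it; it would replace the single Q[state, action] assignment inside the while loop:

# general Q-learning update (sketch only; alpha = 0.5 is an arbitrary choice)
alpha <- 0.5
Q[state, action] <- Q[state, action] +
  alpha * (R[state, action] + gamma * max(Q[nextState, ]) - Q[state, action])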

# -------- Training result ----------
Q
##     A B  C    D   E F
## A 0.0 9  0 7.29 0.0 0
## B 8.1 0 10 0.00 8.1 0
## C 0.0 0  0 0.00 0.0 0
## D 8.1 0  0 0.00 8.1 0
## E 0.0 9  0 7.29 0.0 9
## F 0.0 0 10 0.00 8.1 0
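Only the relative sizes of the Q values matter when picking actions, so the trained matrix is sometimes rescaled to a 0-100 range purely for readability (a sketch; it does not change the resulting policy):

# rescale Q to a 0-100 range for easier reading
round(100 * Q / max(Q))
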
# -------- Print policy ----------

for (state in rownames(Q)){ 
  
  if(state=="C"){
    action <- state # stay in goal state
  }else{
    actions <- Q[state,]
    action <- names(which.max(actions)) # Find the action with the highest Q value
  } 
  
  print(paste0("from state ", state, ", should take action ", action))
}
## [1] "from state A , should take action B"
## [1] "from state B , should take action C"
## [1] "from state C , should take action C"
## [1] "from state D , should take action A"
## [1] "from state E , should take action B"
## [1] "from state F , should take action C"