0.1 Read transaction data

0.1.1 Groceries

suppressMessages(library(arules))
## Warning: package 'arules' was built under R version 3.4.2
# Load the Groceries example dataset into the workspace.
data(Groceries)
# Auto-print the object to get its one-paragraph summary
# (number of transactions and number of items).
Groceries
## transactions in sparse format with
##  9835 transactions (rows) and
##  169 items (columns)
# Show the first three transactions and the items each one contains.
inspect(Groceries[1:3])
##     items                
## [1] {citrus fruit,       
##      semi-finished bread,
##      margarine,          
##      ready soups}        
## [2] {tropical fruit,     
##      yogurt,             
##      coffee}             
## [3] {whole milk}

0.1.2 simple basket without any sequence

# Three baskets, one per line, with the items of a basket separated by commas.
my_data <- paste("1,2", "1", "2,3", sep = "\n")
write(my_data, file = "my_basket")

# "basket" format: every line of the file becomes one transaction.
trans <- read.transactions("my_basket", format = "basket", sep = ",")
inspect(trans)
##     items
## [1] {1,2}
## [2] {1}  
## [3] {2,3}

0.1.3 ‘single’ format

# Keep only the transaction-ID column (windowID) and the item column.
item_df <- df[, c("windowID", "X733specificprob")]

# Preview the first four rows.
item_df[1:4, ]
              windowID                             X733specificprob
46446 AC11205_01/09/2017 multi_site_outage_hua_power_problem_(m.3100)
46447 AC11205_01/09/2017       site_outage_hua_power_problem_(m.3100)
46416 AC11219_03/09/2017       site_outage_hua_power_problem_(m.3100)
44737 AC11586_02/09/2017       site_outage_hua_power_problem_(m.3100)


# Write the (transaction ID, item) pairs as CSV, without row names or quoting.
write.csv(item_df, file = "myTemp.csv", row.names = FALSE, quote = FALSE)
# "single" format: column 1 contains the transaction ID and column 2 contains
# one item; skip = 1 skips the header line that write.csv() produced.
trans <- read.transactions("myTemp.csv", format = "single", sep = ",",
                           cols = c(1, 2), skip = 1)
# Sanity check: there should be one transaction per distinct windowID.
length(trans) == length(unique(item_df$windowID))
cat("number of trans: ", length(trans), "\n")
# The closing parenthesis was missing here, which made the chunk unparseable.
inspect(trans[1:10])

[1]  {multi_site_outage_hua_power_problem_(m.3100),                                      
      site_outage_hua_power_problem_(m.3100)}                         AC11205_01/09/2017   
[2]  {site_outage_hua_power_problem_(m.3100)}                         AC11219_03/09/2017   
[3]  {site_outage_hua_power_problem_(m.3100)}                         AC11586_02/09/2017   
[4]  {cabinet_main_failure_external_power_supply_failure_(gsm_12.11)} BAG11_01/09/2017
[5]  {cabinet_main_failure_external_power_supply_failure_(gsm_12.11)} BAG11_02/09/2017

0.1.4 basket data

library(arulesSequences, quietly=T )
## Warning: package 'arulesSequences' was built under R version 3.4.2
# Toy sequence dataset: each sequence is an ordered pair of events.
#   sequence 1: {a}, {b}
#   sequence 2: {a}, {c}
#   sequence 3: {a}, {a,b}

# One row per event: sequence id, event id, declared size, and the items.
# NOTE(review): Size is 2 for the first event of sequence 3 although its
# basket is {a} -- confirm whether 1 was intended.
df <- data.frame(
  sequenceID = c(1, 1, 2, 2, 3, 3),
  eventID    = c(1, 2, 1, 2, 1, 2),
  Size       = c(1, 1, 1, 1, 2, 2),
  items      = c("a", "b", "a", "c", "a", "a,b")
)

# Turn every column into a factor.
df[] <- lapply(df, as.factor)

# Dump the rows without header, row names, or quotes, so the comma inside
# "a,b" ends up as an ordinary field separator in the file.
write.table(
  df,
  file = "myTemp.csv", sep = ",",
  row.names = FALSE, col.names = FALSE, quote = FALSE
)

# The leading fields of every line are read as transaction info:
# "sequenceID" (sequence or customer identifier), "eventID" (time or event
# identifier) and "SIZE"; the remaining fields are the items.
t <- read_baskets(
  con = "myTemp.csv", sep = ",",
  info = c("sequenceID", "eventID", "SIZE")
)

# View the transactions as a data frame.
as(t, "data.frame")
##   items sequenceID eventID SIZE
## 1   {a}          1       1    1
## 2   {b}          1       2    1
## 3   {a}          2       1    1
## 4   {c}          2       2    1
## 5   {a}          3       1    2
## 6 {a,b}          3       2    2
# Print each transaction together with its sequenceID / eventID / SIZE info.
inspect(t)
##     items sequenceID eventID SIZE
## [1] {a}   1          1       1   
## [2] {b}   1          2       1   
## [3] {a}   2          1       1   
## [4] {c}   2          2       1   
## [5] {a}   3          1       2   
## [6] {a,b} 3          2       2
# Get the per-transaction info columns (sequenceID, eventID, SIZE) on their own,
# without the items.
transactionInfo(t)
##   sequenceID eventID SIZE
## 1          1       1    1
## 2          1       2    1
## 3          2       1    1
## 4          2       2    1
## 5          3       1    2
## 6          3       2    2

0.2 arules apriori

http://www.salemmarafi.com/code/market-basket-analysis-with-r/

library(arules, quietly = T)

# Mine association rules from the transactions in t,
# with minimum support 0.1 and minimum confidence 0.5.
rules <- apriori(
  t,
  parameter = list(support = 0.1, confidence = 0.5)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3 item(s), 6 transaction(s)] done [0.00s].
## sorting and recoding items ... [3 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Print the mined rules with their quality measures.
inspect(rules)
##     lhs    rhs support   confidence lift count
## [1] {}  => {a} 0.6666667 0.6666667  1.00 4    
## [2] {b} => {a} 0.1666667 0.5000000  0.75 1
##     lhs    rhs support   confidence lift
## [1] {}  => {a} 0.6666667 0.6666667  1.00
## [2] {b} => {a} 0.1666667 0.5000000  0.75


# Order the rules by lift and show at most the first ten of them.
inspect(head(sort(rules, by = "lift"), 10))
##     lhs    rhs support   confidence lift count
## [1] {}  => {a} 0.6666667 0.6666667  1.00 4    
## [2] {b} => {a} 0.1666667 0.5000000  0.75 1
# Coerce the rules to a data frame: one row per rule, with the whole rule
# rendered as a single text column next to its quality measures.
as(rules, "data.frame")
##        rules   support confidence lift count
## 1  {} => {a} 0.6666667  0.6666667 1.00     4
## 2 {b} => {a} 0.1666667  0.5000000 0.75     1

Convert rules to a data frame with lhs and rhs in different columns

# Load the rules2df.R helper script from the Rsenal repository (presumably it
# defines rules2df(); verify against the script).
# source_url() belongs to devtools, which is not attached anywhere in the
# visible code, so the call is namespace-qualified.
# NOTE(review): this executes code fetched from the network -- review or pin
# the script before running it.
devtools::source_url('https://raw.githubusercontent.com/brooksandrew/Rsenal/master/R/rules2df.R')

Visualization

library(arulesViz)
## Warning: package 'arulesViz' was built under R version 3.4.2
## Loading required package: grid
# Draw the rules as a graph, with shading = NA.
# Optional: add interactive = TRUE to the call (left disabled here).
plot(rules, method = "graph", shading = NA)

http://mhahsler.github.io/arulesViz/reference/plot.html

0.3 arulesSequences cspade

Frequent sequence mining

# Mine frequent sequences with cspade().
# The support parameter is set to 0.3, and verbose = TRUE asks the algorithm
# to print its progress.
s1 <- cspade(
  t,
  parameter = list(support = 0.3),
  control = list(verbose = TRUE)
)
## 
## parameter specification:
## support : 0.3
## maxsize :  10
## maxlen  :  10
## 
## algorithmic control:
## bfstype  : FALSE
## verbose  :  TRUE
## summary  : FALSE
## tidLists : FALSE
## 
## preprocessing ... 1 partition(s), 0 MB [0.15s]
## mining transactions ... 0 MB [0.17s]
## reading sequences ... [0.05s]
## 
## total elapsed time: 0.37s
# List the frequent sequences and their support as a data frame.
as(s1, "data.frame")
##      sequence   support
## 1       <{a}> 1.0000000
## 2       <{b}> 0.6666667
## 3       <{c}> 0.3333333
## 4   <{a},{c}> 0.3333333
## 5   <{a},{b}> 0.6666667
## 6     <{a,b}> 0.3333333
## 7 <{a},{a,b}> 0.3333333
## 8   <{a},{a}> 0.3333333
# ruleInduction() has a default confidence of 0.8; confidence = 0 is passed
# here instead, so no rule is dropped by the confidence threshold.
s2 <- ruleInduction(s1, confidence = 0)

# Show the induced sequence rules as a data frame.
as(s2, "data.frame")
##               rule   support confidence      lift
## 1   <{a}> => <{c}> 0.3333333  0.3333333 1.0000000
## 2   <{a}> => <{b}> 0.6666667  0.6666667 1.0000000
## 3 <{a}> => <{a,b}> 0.3333333  0.3333333 1.0000000
## 4   <{a}> => <{a}> 0.3333333  0.3333333 0.3333333

Built-in data from the arulesSequences package

library(arulesSequences)

# -- sample data
# $ cat /usr/local/lib/R/site-library/arulesSequences/misc/zaki.txt
# 1 10 2 C D
# 1 15 3 A B C
# 1 20 3 A B F
# 1 25 4 A C D F


# Read the zaki.txt example file shipped with arulesSequences as transactions.
# A sequence α is an ordered list of events; an event is a non-empty,
# unordered set of items.
t <- read_baskets(
  con = system.file("misc", "zaki.txt", package = "arulesSequences"),
  info = c("sequenceID", "eventID", "SIZE")
)

# Show the transactions.
inspect(t)
##        items sequenceID eventID SIZE
## 1      {C,D}          1      10    2
## 2    {A,B,C}          1      15    3
## 3    {A,B,F}          1      20    3
## 4  {A,C,D,F}          1      25    4
## 5    {A,B,F}          2      15    3
Home Page