suppressMessages(library(arules))
## Warning: package 'arules' was built under R version 3.4.2
# Load the Groceries example transactions that ship with arules.
data("Groceries")
# Print the object summary (size of the sparse transaction matrix).
print(Groceries)
## transactions in sparse format with
## 9835 transactions (rows) and
## 169 items (columns)
# Show the first three baskets.
inspect(Groceries[seq_len(3)])
## items
## [1] {citrus fruit,
## semi-finished bread,
## margarine,
## ready soups}
## [2] {tropical fruit,
## yogurt,
## coffee}
## [3] {whole milk}
# Build a tiny basket-format file: one transaction per line, items
# separated by commas.
my_data <- paste(c("1,2", "1", "2,3"), collapse = "\n")
write(my_data, file = "my_basket")
# Parse the file back into a transactions object and display it.
trans <- read.transactions(file = "my_basket", format = "basket", sep = ",")
inspect(trans)
## items
## [1] {1,2}
## [2] {1}
## [3] {2,3}
# Keep only the transaction ID column and the item column.
# NOTE(review): `df` is not defined earlier in this file; this assumes a data
# frame with "windowID" and "X733specificprob" columns is already loaded --
# confirm.
item_df <- df[, c("windowID", "X733specificprob")]
# Peek at the first four rows.
item_df[seq_len(4), ]
## windowID X733specificprob
## 46446 AC11205_01/09/2017 multi_site_outage_hua_power_problem_(m.3100)
## 46447 AC11205_01/09/2017 site_outage_hua_power_problem_(m.3100)
## 46416 AC11219_03/09/2017 site_outage_hua_power_problem_(m.3100)
## 44737 AC11586_02/09/2017 site_outage_hua_power_problem_(m.3100)
# Export the (transaction ID, item) pairs to CSV, unquoted and without row
# names.
# NOTE(review): with quote = FALSE an ID or item containing a comma would
# shift the columns -- confirm the data never contains commas.
write.csv(item_df, file = "myTemp.csv", row.names = FALSE, quote = FALSE)
# column 1 contains the transaction ID and column 2 contains one item;
# skip = 1 drops the header line written by write.csv
trans <- read.transactions("myTemp.csv", format = "single", sep = ",",
                           cols = c(1, 2), skip = 1)
# Sanity check: expect one transaction per distinct windowID (prints
# TRUE/FALSE).
length(trans) == length(unique(item_df$windowID))
cat("number of trans: ", length(trans), "\n")
# Preview at most the first 10 transactions; the bound avoids indexing past
# the end when fewer than 10 exist.
inspect(trans[seq_len(min(10L, length(trans)))])
## [1] {multi_site_outage_hua_power_problem_(m.3100),
## site_outage_hua_power_problem_(m.3100)} AC11205_01/09/2017
## [2] {site_outage_hua_power_problem_(m.3100)} AC11219_03/09/2017
## [3] {site_outage_hua_power_problem_(m.3100)} AC11586_02/09/2017
## [4] {cabinet_main_failure_external_power_supply_failure_(gsm_12.11)} BAG11_01/09/2017
## [5] {cabinet_main_failure_external_power_supply_failure_(gsm_12.11)} BAG11_02/09/2017
library(arulesSequences, quietly=T )
## Warning: package 'arulesSequences' was built under R version 3.4.2
# sequence dataset
# {a}, {b}
# {a}, {c}
# {a}, {a,b}
# Assemble the toy sequence data, one row per event; every column is a factor.
df <- data.frame(
  sequenceID = factor(c(1, 1, 2, 2, 3, 3)),
  eventID = factor(c(1, 2, 1, 2, 1, 2)),
  Size = factor(c(1, 1, 1, 1, 2, 2)),
  items = factor(c("a", "b", "a", "c", "a", "a,b"))
)
# Dump the rows without header, row names or quotes, so the "a,b" entry is
# split into two items when the file is read back with sep = ",".
write.table(df, file = "myTemp.csv", sep = ",",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
# The leading columns are labelled "sequenceID" (sequence or customer
# identifier), "eventID" (time or event identifier) and "SIZE".
t <- read_baskets("myTemp.csv", sep = ",",
                  info = c("sequenceID", "eventID", "SIZE"))
# View the transactions as a data frame.
as(t, "data.frame")
## items sequenceID eventID SIZE
## 1 {a} 1 1 1
## 2 {b} 1 2 1
## 3 {a} 2 1 1
## 4 {c} 2 2 1
## 5 {a} 3 1 2
## 6 {a,b} 3 2 2
# Print each basket together with its sequenceID, eventID and SIZE columns
inspect(t)
## items sequenceID eventID SIZE
## [1] {a} 1 1 1
## [2] {b} 1 2 1
## [3] {a} 2 1 1
## [4] {c} 2 2 1
## [5] {a} 3 1 2
## [6] {a,b} 3 2 2
# get transactionInfo: the per-transaction columns (sequenceID, eventID, SIZE)
# without the item sets
transactionInfo(t)
## sequenceID eventID SIZE
## 1 1 1 1
## 2 1 2 1
## 3 2 1 1
## 4 2 2 1
## 5 3 1 2
## 6 3 2 2
# Reference: http://www.salemmarafi.com/code/market-basket-analysis-with-r/
library("arules", quietly = TRUE)
# Mine association rules from the baskets: minimum confidence 0.5 and
# minimum support 0.1.
rules <- apriori(
  data = t,
  parameter = list(confidence = 0.5, support = 0.1)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 0
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3 item(s), 6 transaction(s)] done [0.00s].
## sorting and recoding items ... [3 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Show the mined rules with their quality measures
inspect(rules)
## lhs rhs support confidence lift count
## [1] {} => {a} 0.6666667 0.6666667 1.00 4
## [2] {b} => {a} 0.1666667 0.5000000 0.75 1
## lhs rhs support confidence lift
## [1] {} => {a} 0.6666667 0.6666667 1.00
## [2] {b} => {a} 0.1666667 0.5000000 0.75
# Order the rules by lift and display the first 10 of them.
inspect(
  head(
    sort(rules, by = "lift"),
    n = 10
  )
)
## lhs rhs support confidence lift count
## [1] {} => {a} 0.6666667 0.6666667 1.00 4
## [2] {b} => {a} 0.1666667 0.5000000 0.75 1
# Convert the rules to a data frame: one row per rule, with the rule rendered
# as text next to its quality measures
as(rules, "data.frame")
## rules support confidence lift count
## 1 {} => {a} 0.6666667 0.6666667 1.00 4
## 2 {b} => {a} 0.1666667 0.5000000 0.75 1
# Convert rules to a data frame with lhs and rhs in separate columns
# Fetch and evaluate the Rsenal helper script (presumably defines rules2df();
# verify). Base source() accepts a URL, so devtools::source_url() -- which
# this file never loads -- is not needed.
# NOTE(review): this executes remote code from an unpinned branch; consider
# vendoring the file or pinning a commit.
source("https://raw.githubusercontent.com/brooksandrew/Rsenal/master/R/rules2df.R")
# Visualization ----
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 3.4.2
## Loading required package: grid
# Draw the rules as a graph with shading turned off (NA).
# The `interactive = TRUE` option is deliberately left disabled.
plot(rules, method = "graph", shading = NA)
# Frequent sequence mining ----
# Mine frequent sequences with cSPADE.
# The minimum support is 0.3 and verbose output is switched on.
s1 <- cspade(
  data = t,
  parameter = list(support = 0.3),
  control = list(verbose = TRUE)
)
##
## parameter specification:
## support : 0.3
## maxsize : 10
## maxlen : 10
##
## algorithmic control:
## bfstype : FALSE
## verbose : TRUE
## summary : FALSE
## tidLists : FALSE
##
## preprocessing ... 1 partition(s), 0 MB [0.15s]
## mining transactions ... 0 MB [0.17s]
## reading sequences ... [0.05s]
##
## total elapsed time: 0.37s
# Convert the frequent sequences to a data frame of sequence and support
as(s1, "data.frame")
## sequence support
## 1 <{a}> 1.0000000
## 2 <{b}> 0.6666667
## 3 <{c}> 0.3333333
## 4 <{a},{c}> 0.3333333
## 5 <{a},{b}> 0.6666667
## 6 <{a,b}> 0.3333333
## 7 <{a},{a,b}> 0.3333333
## 8 <{a},{a}> 0.3333333
# ruleInduction() has a default value for confidence of 0.8; confidence = 0
# is passed here instead, so no rule is dropped for low confidence
s2 <- ruleInduction(s1, confidence = 0)
# Show the induced sequence rules as a data frame
as(s2, "data.frame")
## rule support confidence lift
## 1 <{a}> => <{c}> 0.3333333 0.3333333 1.0000000
## 2 <{a}> => <{b}> 0.6666667 0.6666667 1.0000000
## 3 <{a}> => <{a,b}> 0.3333333 0.3333333 1.0000000
## 4 <{a}> => <{a}> 0.3333333 0.3333333 0.3333333
# Built-in data from the arulesSequences package ----
library(arulesSequences)
# -- sample data
# $ cat /usr/local/lib/R/site-library/arulesSequences/misc/zaki.txt
# 1 10 2 C D
# 1 15 3 A B C
# 1 20 3 A B F
# 1 25 4 A C D F
# convert data to transactions
# A sequence α is an ordered list of events. An event is a non-empty unordered set of items.
# Read the zaki.txt example bundled with arulesSequences; the leading
# columns are labelled sequenceID, eventID and SIZE.
t <- read_baskets(
  con = system.file("misc", "zaki.txt", package = "arulesSequences"),
  info = c("sequenceID", "eventID", "SIZE")
)
# Display the transactions that were read.
inspect(t)
## items sequenceID eventID SIZE
## 1 {C,D} 1 10 2
## 2 {A,B,C} 1 15 3
## 3 {A,B,F} 1 20 3
## 4 {A,C,D,F} 1 25 4
## 5 {A,B,F} 2 15 3