# Source: http://www.statmethods.net/advstats/cart.html
# Classification Tree with rpart
library(rpart)

# Preview the first three rows of the kyphosis data frame.
kyphosis[1:3, ]
## Kyphosis Age Number Start
## 1 absent 71 3 5
## 2 absent 158 3 14
## 3 present 128 4 5

#------------ build tree --------------
# rpart() assigns observations to cross-validation folds at random, so the
# xerror column of the CP table -- and therefore the cp picked for pruning
# below -- changes from run to run unless the RNG is seeded first.
set.seed(1)
# grow tree
fit <- rpart(Kyphosis ~ Age + Number + Start, method = "class", data = kyphosis) # classification tree

# prune the tree: use the cp of the CP-table row with the smallest
# cross-validated error (which.min() keeps the first row when several tie)
best_cp_row <- which.min(fit$cptable[, "xerror"])
pfit <- prune(fit, cp = fit$cptable[best_cp_row, "CP"])

#----- model diagnostic -----------
printcp(fit) # display the results
# Recorded output of an earlier run, kept for illustration; the xerror and
# xstd columns depend on the cross-validation folds and may differ.
##
## Classification tree:
## rpart(formula = Kyphosis ~ Age + Number + Start, data = kyphosis,
## method = "class")
##
## Variables actually used in tree construction:
## [1] Age Start
##
## Root node error: 17/81 = 0.20988
##
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.176471 0 1.00000 1.00000 0.21559
## 2 0.019608 1 0.82353 0.82353 0.20018
## 3 0.010000 4 0.76471 0.82353 0.20018
plotcp(fit) # visualize cross-validation results
# summary(fit) # detailed summary of splits

# ---------- plot tree -----------
# Draw the unpruned tree, then label it on the same plot.
plot(fit, uniform = TRUE)
text(fit, use.n = TRUE, all = TRUE, cex = .8)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
# Draw the full tree first and the pruned tree second, using the same
# rpart.plot() settings for both.
for (tree in list(fit, pfit)) {
  rpart.plot(tree, under = TRUE)
}
# create attractive postscript plot of tree
# post(fit, file = "")
library(randomForest)
## Classification:
## data(iris)
# Peek at five randomly selected rows of iris. This draw happens before the
# set.seed() call below.
iris[sample(nrow(iris), 5), ]
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 49 5.3 3.7 1.5 0.2 setosa
# 133 6.4 2.8 5.6 2.2 virginica
# 1 5.1 3.5 1.4 0.2 setosa
# 99 5.1 2.5 3.0 1.1 versicolor
# 69 6.2 2.2 4.5 1.5 versicolor
#----------------------------------------------------------
# Seed the RNG, then fit a forest predicting Species from all other columns.
set.seed(1)
iris.rf <- randomForest(
  Species ~ .,
  data = iris,
  proximity = TRUE,
  keep.forest = TRUE
)
# Other randomForest() arguments worth knowing:
# ntree: Number of trees to grow.
# importance: Should importance of predictors be assessed?
# sampsize : for example, sampsize=c(20, 30, 20), stratified sampling: draw 20, 30, and 20 of the species to grow each tree.

# --------- Plot of feature importance
imp <- importance(iris.rf) # imp is a matrix
plot(imp)
lines(imp) # connect the plotted points
text(imp, labels = rownames(imp), cex = 1) # label each point with its row name

# --------- more detailed variable importance information
# Refit with importance = TRUE; this replaces the iris.rf object fitted above.
iris.rf <- randomForest(
  Species ~ .,
  data = iris,
  importance = TRUE,
  proximity = TRUE
)
print(iris.rf)
# Look at variable importance (rounded to two decimals):
round(importance(iris.rf), 2)
# --------- Multi-dimensional Scaling Plot of Proximity matrix from randomForest -------------
# The unsupervised clustering case:
# MDSplot(rf, fac, ...)
# rf= randomForest object. fac=a factor that was used as response to train rf.

# Build the class palette once and hand the same vector to both MDSplot() and
# legend(), so the legend colours cannot drift from the plotted points.
# RColorBrewer is never attached by this script, so brewer.pal() is reached
# through its namespace; fall back to rainbow() when the package is missing.
n_classes <- nlevels(iris$Species)
mds_palette <- if (requireNamespace("RColorBrewer", quietly = TRUE)) {
  RColorBrewer::brewer.pal(n_classes, "Set1")
} else {
  rainbow(n_classes)
}
x <- MDSplot(iris.rf, iris$Species, palette = mds_palette)
# add legend (labels follow the level order of the factor passed to MDSplot)
legend("topleft", legend = levels(iris$Species), fill = mds_palette)
str(x)
# need to identify points?
# text(x$points, labels=attr(x$points,"dimnames")[[1]], cex=0.5)
##-------------------- ------------
## Do MDS on 1 - proximity:
iris.mds <- cmdscale(1 - iris.rf$proximity, eig = TRUE)
# NOTE(review): type = "n" with axes = FALSE draws no points and no axes, so
# this call only sets up a titled, empty plot -- confirm whether points or
# labels were meant to be added here.
plot(iris.mds$points[, 1], iris.mds$points[, 2],
  type = "n", xlab = "", ylab = "", axes = FALSE,
  main = "cmdscale (stats)"
)
# Switch to a square plotting region for the pairs plot; the previous
# graphics settings are saved in op and restored right after.
op <- par(pty = "s")
# x is reused here: it now holds the four iris measurements next to the MDS
# coordinates (it no longer holds the MDSplot() result).
x <- cbind(iris[, 1:4], iris.mds$points)
pairs(x,
  cex = 0.6, gap = 0,
  col = c("red", "green", "blue")[as.numeric(iris$Species)],
  main = "Iris Data: Predictors and MDS of Proximity Based on RandomForest"
)
par(op)
# "GOF" component of the result ("goodness of fit")
print(iris.mds$GOF)
# The party package provides nonparametric regression trees for nominal, ordinal, numeric, censored, and multivariate responses.
library(rpart) # for cu.summary dataset
library(party)
# Drop rows with missing values up front, then fit a conditional inference
# tree for Mileage on the complete rows. Note that this reuses the name `fit`.
cars_complete <- na.omit(cu.summary)
fit <- ctree(
  Mileage ~ Price + Country + Reliability + Type,
  data = cars_complete
)
plot(fit, main = "Conditional Inference Tree")