iris[1:3,]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
model <- kmeans(iris[,1:4], 3)
# plot the first two attributes: Sepal.Length and Sepal.Width
plot(iris[,c(1,2)], col=model$cluster)
# mark the cluster centers on the same two attributes
points(model$centers[, c(1,2)], col=1:3, pch=8, cex=2)
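# A common heuristic for choosing k is the elbow method: plot the total
# within-cluster sum of squares for a range of k and look for the bend
# where the curve flattens. (This step is an illustrative addition; the
# k range 1:10 and nstart = 10 are arbitrary choices.)
wss <- sapply(1:10, function(k) kmeans(iris[,1:4], centers=k, nstart=10)$tot.withinss)
plot(1:10, wss, type="b", xlab="number of clusters k", ylab="total within-cluster SS")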
set.seed(1984)
x1 <- sample(seq(0.0, 0.5, by=0.01), 50, replace = TRUE)
x2 <- sample(seq(0.3, 0.4, by=0.01), 300, replace = TRUE)
x3 <- sample(seq(0.8, 1.0, by=0.01), 300, replace = TRUE)
xx <- c(x1, x2, x3)
yy <- runif(length(xx))
zz <- cbind(xx, yy)
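# A quick look at the raw data (an added sanity check): xx mixes a sparse
# band over 0.0-0.5 with two dense bands over 0.3-0.4 and 0.8-1.0, while
# yy is uniform noise.
plot(zz, pch = 19, col = "grey40")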
k <- 2
# k-means
model <- kmeans(zz, centers = k)
plot(zz, col = model$cluster, pch = 19)
points(model$centers[, c(1,2)], col=1:k, pch=8, cex=2)
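# kmeans starts from random centers and can converge to a poor local
# optimum; the nstart argument re-runs the initialization and keeps the
# best solution. An illustrative comparison (nstart = 25 is arbitrary):
c(single    = kmeans(zz, centers = k)$tot.withinss,
  restarted = kmeans(zz, centers = k, nstart = 25)$tot.withinss)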
# k-means++
library(LICORS) # kmeanspp
model <- kmeanspp(zz, k = k, start = "random")
plot(zz, col = model$cluster, pch = 19)
points(model$centers[, c(1,2)], col=1:k, pch=8, cex=2)
# k-means++ implemented by hand
# the result may differ from kmeanspp's, since this version has no nstart parameter
kmpp <- function(data, k) {
  n <- nrow(data)
  center_ids <- integer(k)
  center_ids[1] <- sample.int(n, 1)
  for (i in 2:k) {
    # squared Euclidean distance from every point to each chosen center
    dists <- apply(data[center_ids[1:(i - 1)], , drop = FALSE], 1,
                   function(center) colSums((t(data) - center)^2))
    # sampling weight: squared distance to the nearest chosen center
    probs <- apply(dists, 1, min)
    probs[center_ids[1:(i - 1)]] <- 0
    center_ids[i] <- sample.int(n, 1, prob = probs)
  }
  kmeans(data, centers = data[center_ids, ])
}
model <- kmpp(zz, k)
plot(zz, col = model$cluster, pch = 19)
points(model$centers[, c(1,2)], col=1:k, pch=8, cex=2)
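# Rough sanity check (illustrative): the k-means++ seeding usually matches
# or beats purely random starts on total within-cluster SS.
replicate(5, kmeans(zz, centers = k)$tot.withinss)
replicate(5, kmpp(zz, k)$tot.withinss)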
# hierarchical clustering (average linkage) on a random sample of 30 rows
sampleiris <- iris[sample(1:150, 30),]
distance <- dist(sampleiris[,-5], method="euclidean")
model <- hclust(distance, method="average")
plot(model, hang=-1, labels=sampleiris$Species)
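# cutree() flattens the dendrogram into cluster labels; cross-tabulating
# them against Species shows how well the tree recovers the groups.
groups <- cutree(model, k=3)
table(groups, sampleiris$Species)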
# fuzzy c-means (fuzzifier m = 2, at most 100 iterations)
library(e1071)
model <- cmeans(iris[,1:4], 3, 100, m=2, method="cmeans")
plot(iris[,c(1,2)], col=model$cluster)
points(model$centers[,c(1,2)], col=1:3, pch=8, cex=2)
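# Unlike kmeans, cmeans is fuzzy: each point gets a degree of membership
# in every cluster rather than a hard label. Inspect the first few rows:
head(round(model$membership, 3))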
# Method for determining the optimal eps value
# The function kNNdistplot() can be used to draw the k-distance plot:
# install.packages("dbscan")
# dbscan::kNNdistplot(iris[,1:4], k = 5)
# abline(h = 0.15, lty = 2)
library(fpc)
# eps is the neighborhood radius; MinPts is the minimum number of points within eps needed to form a dense region
model <- dbscan(iris[,1:4], eps=0.6, MinPts=4)
# plot(model, iris[,1:4])
# points in cluster 0 are unassigned outliers (noise); shift the colors by
# one so they are not drawn in col 0, the invisible background color
plot(iris[,c(1,2)], col=model$cluster+1)
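# cluster sizes; the count under "0" is the number of noise points
table(model$cluster)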
# model-based clustering: Gaussian mixture with G = 3 components
library(mclust)
model <- Mclust(iris[,1:4], G=3)
# plot(iris[,c(1,2)], col=model$classification)
plot(model, what="classification", dimens=c(1,2))
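# If G is left unspecified, Mclust selects the number of components itself
# by BIC over a default range; a quick sketch (model_auto is just an
# illustrative name):
model_auto <- Mclust(iris[,1:4])
summary(model_auto)
plot(model_auto, what="BIC")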
References:
http://horicky.blogspot.ie/2012/04/machine-learning-in-r-clustering.html