1 Genetic algorithm based clustering

trainingData <- iris[,1:4]
plotData <- iris[,c(1,4)]

euc.dist <- function(x1, x2) sqrt(sum((x1 - x2) ^ 2))

k_cluster <- 3

#------------- start the algorithm ------------

chromosomes <- list()
for(i in 1:k_cluster){
  chromosomes[[i]] <- trainingData[i,]
}


solution_cluster_number <- vector()
best_distance <- Inf

for(loop in 1:30){
  
  cat("loop: ", loop, "\n")
  
  # assign each data point to the cluster
  cluster_number <- vector()
  for(trainingData_index in 1:nrow(trainingData)){
    trainingData_distances <- vector()
    for(chromosome_index in 1:length(chromosomes)){
      distance <- euc.dist(chromosomes[[chromosome_index]], trainingData[trainingData_index,])
      trainingData_distances <- c(trainingData_distances, distance)
    }
    names(trainingData_distances) <- 1:length(trainingData_distances)
    trainingData_distances <- sort(trainingData_distances)
    cluster_number <- c(cluster_number, as.numeric(names(trainingData_distances)[1]))
  }
  
  # ------ fitness calcaluation --------------
  chromosomes_distances <- vector()
  for(chromosome_index in 1:length(chromosomes)){
    total_distance <- 0;
    
    trainingData_cluster <- trainingData[which(cluster_number==chromosome_index), ]
    
    # if no data for this cluster then total_distance is Inf
    if(nrow(trainingData_cluster)==0){
      total_distance <- Inf
    }else{
      
      # recalculate centers of the cluster as chromosomes 
      for(i in 1:ncol(trainingData_cluster)){
        chromosomes[[chromosome_index]][,i] <- sum(trainingData_cluster[, i])/nrow(trainingData_cluster)
      }

      for(i in 1:nrow(trainingData_cluster)){
        total_distance <- total_distance + euc.dist(chromosomes[[chromosome_index]], trainingData_cluster[i,])
      }
    }
    
    print(total_distance)
    chromosomes_distances <- c(chromosomes_distances, total_distance)
  }
  names(chromosomes_distances) <- 1:length(chromosomes)
  chromosomes_distances <- sort(chromosomes_distances)
  
  
  if(mean(chromosomes_distances)<best_distance){
    
    best_distance <- mean(chromosomes_distances)
    solution_cluster_number <- cluster_number
    # solution_cluster_number[solution_cluster_number==4] <- 3
    plot(plotData, col=solution_cluster_number, main=paste0("GA-Cluster, generation ", loop) )
    solution_center <- as.data.frame(do.call(rbind, chromosomes))
    # points(solution_center[, c(1,3)], col=1:length(chromosomes), pch=8, cex=2)
    
  }
   

  chromosomes_new <- list()
  
  # ------------ selection -------------
  # select top 2 as parents
  for(i in 1:2){
    chromosomes_new[[i]] <- chromosomes[[as.numeric(names(chromosomes_distances)[i])]] 
  }
  
  # ---------- crossover ----------------
  # make two children
  chromosomes_new[[3]] <- cbind(chromosomes_new[[1]][,1:2], chromosomes_new[[2]][,3:4])  
  chromosomes_new[[4]] <- cbind(chromosomes_new[[2]][,1:2], chromosomes_new[[1]][,3:4])  
  
  # --------- mutation ------------------
  mutationChance <- 0.2
  
  # don't mutate the best
  for(chromosomes_new_index in 2:length(chromosomes_new)){
    
    # cat(as.character(chromosomes_new[[chromosomes_new_index]]), "\n")
    
    x <- chromosomes_new[[chromosomes_new_index]] 
    
    for(i in 1:ncol(x)){
      v <- x[,i]
      if (runif(1) <= mutationChance) { # ok, do mutation
        if(v!=0){
          if(runif(1) <= 0.5){
            v <- v + 2*runif(1)*v
          }else{
            v <- v - 2*runif(1)*v
          }
        }else{ # v==0
          if(runif(1) <= 0.5){
            v <- v + 2*runif(1)
          }else{
            v <- v - 2*runif(1)
          }
        }
      }
      x[,i] <- v
    }
    
    chromosomes_new[[chromosomes_new_index]] <- x
    
    # cat(as.character(chromosomes_new[[chromosomes_new_index]]), "\n")
  }
  
  # take top 3 for new population
  chromosomes <- chromosomes_new[1:3]
  
}
## loop:  1 
## [1] 181.4227
## [1] 61.06766
## [1] 3.043691

## loop:  2 
## [1] 24.08526
## [1] 103.9356
## [1] Inf
## loop:  3 
## [1] 26.10341
## [1] Inf
## [1] 101.6857
## loop:  4 
## [1] 26.10341
## [1] Inf
## [1] 101.6857
## loop:  5 
## [1] 31.00067
## [1] 27.68757
## [1] 42.76792

## loop:  6 
## [1] 60.74447
## [1] 15.57121
## [1] 24.08526

## loop:  7 
## [1] 291.6103
## [1] Inf
## [1] Inf
## loop:  8 
## [1] 103.9356
## [1] Inf
## [1] 24.08526
## loop:  9 
## [1] 26.10341
## [1] 66.0601
## [1] 11.96701
## loop:  10 
## [1] 103.9356
## [1] 22.84912
## [1] 0
## loop:  11 
## [1] 1.038512
## [1] 264.4184
## [1] 10.95663
## loop:  12 
## [1] 24.08526
## [1] Inf
## [1] 103.9356
## loop:  13 
## [1] 28.48506
## [1] 99.49896
## [1] Inf
## loop:  14 
## [1] 31.00067
## [1] 97.33599
## [1] Inf
## loop:  15 
## [1] 26.10341
## [1] 9.194751
## [1] 75.89586
## loop:  16 
## [1] 81.50572
## [1] 63.1305
## [1] Inf
## loop:  17 
## [1] 36.76846
## [1] 68.03237
## [1] 8.858187
## loop:  18 
## [1] 74.73318
## [1] 79.39224
## [1] Inf
## loop:  19 
## [1] 76.12024
## [1] 31.00067
## [1] 5.439637
## loop:  20 
## [1] 103.9356
## [1] 22.84912
## [1] 0
## loop:  21 
## [1] 1.038512
## [1] 264.4184
## [1] 10.95663
## loop:  22 
## [1] 24.08526
## [1] 103.9356
## [1] Inf
## loop:  23 
## [1] 26.10341
## [1] 100.9736
## [1] 0
## loop:  24 
## [1] 101.6857
## [1] 23.04216
## [1] 1.007091
## loop:  25 
## [1] 7.724199
## [1] 9.1235
## [1] 209.7745
## loop:  26 
## [1] 21.23854
## [1] 111.3756
## [1] 0.4123106
## loop:  27 
## [1] 90.32793
## [1] Inf
## [1] 42.84967
## loop:  28 
## [1] 31.00067
## [1] 55.33851
## [1] 17.17399
## loop:  29 
## [1] 103.9356
## [1] 16.90079
## [1] 3.476788
## loop:  30 
## [1] 131.2249
## [1] 80.12147
## [1] Inf
plot(plotData, col=solution_cluster_number, main="Final result of GA-Cluster")

The implementation is based on:

Genetic algorithm-based clustering technique, U Maulik, S Bandyopadhyay, Pattern recognition 33 (9), 1455-1465

Home Page