0.1 Text mining

http://chengjun.github.io/en/2014/04/sentiment-analysis-with-machine-learning-in-R/

# a small sample of tweets used as the example corpus
rdmTweets <- list(
"Text Mining Tutorial http://t.co/jPHHLEGm",
"He likes dogs r Singapore http://t.co/GPA0TyG5",
"RDataMining: Easier Parallel Computing in R with snowfall and sfCluster http://t.co/BPcinvzK",
"RDataMining: Tutorial: Parallel computing using R package snowfall http://t.co/CHBCyr76",
"handling big data: Interacting with Data using the filehash Package for R http://t.co/7RB3sChx"
)

df <- as.data.frame(do.call(rbind, rdmTweets), stringsAsFactors = FALSE) # one row per tweet
names(df) <- c('text')

library(tm)
# ---- build a corpus
myCorpus <- Corpus(VectorSource(df$text)) # VectorSource specifies the text source

myCorpus[[3]]$content # show text in the 3rd document

# ---- Transforming Text

myCorpus <- tm_map(myCorpus, content_transformer(tolower)) #  to lower case
myCorpus <- tm_map(myCorpus, removePunctuation) # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers) # remove numbers
myStopwords <- c(stopwords('english'), "available", "dogs") # add two additional stopwords
myStopwords <- setdiff(myStopwords, c("r", "big")) # remove 'r' and 'big' from stopwords
myCorpus <- tm_map(myCorpus, removeWords, myStopwords) # remove stopwords

# ---- Stemming Words

dictCorpus <- myCorpus # keep a copy as a dictionary for stem completion
#library("SnowballC") # for stemDocument
myCorpus <- tm_map(myCorpus, stemDocument) # stem words
inspect(myCorpus)
# myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion
# the following stem completion works in tm v0.6 and later
myCorpus <- tm_map(myCorpus, content_transformer(function(x, d)
  paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]], d), collapse = ' ')), dictCorpus)
# fix up 1) differences between US and Australian English, 2) general errors
myCorpus <- tm_map(myCorpus, content_transformer(gsub), pattern = "organiz", replacement = "organ")
inspect(myCorpus)
inspect(dictCorpus)

# ---- Building a Document-Term Matrix

myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
inspect(myDtm)
#                  Docs
# Terms             1 2 3 4 5
#   big             0 0 0 0 1
#   comput          0 0 1 1 0
#   data            0 0 0 0 2  
#   ...

# get a tf-idf weighted version of the matrix (kept separate so myDtm stays raw counts)
myDtmTfIdf <- TermDocumentMatrix(myCorpus,
                                 control = list(wordLengths = c(1, Inf),
                                                weighting = function(x) weightTfIdf(x, normalize = FALSE)))
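
# Sketch (not in the original tutorial; variable names are illustrative): weightTfIdf
# multiplies each term count by log2(nDocs / docFreq); with normalize = TRUE the counts
# are first divided by the document length. The unnormalized version can be reproduced
# by hand from the raw-count matrix myDtm:
m     <- as.matrix(myDtm)                 # raw counts, terms in rows
idf   <- log2(ncol(m) / rowSums(m > 0))   # inverse document frequency per term
tfidf <- m * idf                          # row i scaled by idf[i], as with normalize = FALSE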

# Based on the above matrix, many data mining tasks can be performed, such as
# clustering, classification and association analysis; a clustering sketch follows.
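
# Sketch (illustrative, not from the original tutorial): hierarchical clustering of
# terms by their document profiles; on a larger corpus you would normally call
# removeSparseTerms() first to keep the dendrogram readable.
m2  <- as.matrix(myDtm)
fit <- hclust(dist(scale(m2)), method = "ward.D")
plot(fit)              # dendrogram of terms
# cutree(fit, k = 5)   # would assign each term to one of 5 clusters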

# ----- Frequent Terms and Associations

findFreqTerms(myDtm, lowfreq=2)

# which words are associated with "r"?
findAssocs(myDtm, 'r', 0.30)

1 Term frequency/inverse document frequency (TF/IDF)

# read 1000 txt articles from directory data/txt
corpus <- Corpus(DirSource("data/txt"), readerControl = list(blank.lines.skip = TRUE))
# some preprocessing
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument, language="english")
# create a document-term matrix with TF-IDF weighting
terms <- DocumentTermMatrix(corpus,
                            control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))

# or compute cosine distance among documents
# (dissimilarity() is only available in older tm releases; see the sketch below)
dissimilarity(terms, method = "cosine")
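
# Sketch (assumes no document row is all zeros; names are illustrative): cosine
# distance in base R, for tm versions where dissimilarity() is no longer available.
m       <- as.matrix(terms)                      # documents in rows
norms   <- sqrt(rowSums(m^2))
sim     <- (m %*% t(m)) / (norms %o% norms)      # cosine similarity between documents
docDist <- as.dist(1 - sim)                      # cosine distance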

2 Word cloud

library(wordcloud)
wordcloud(corpus, scale = c(5, 0.5),
          max.words = 30, random.order = FALSE,
          # rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))  # Dark2 provides at most 8 colours
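
# Sketch (illustrative): wordcloud() also accepts explicit word/frequency vectors,
# here built from the raw-count myDtm matrix of section 0.1.
wordFreq <- sort(rowSums(as.matrix(myDtm)), decreasing = TRUE)
wordcloud(words = names(wordFreq), freq = wordFreq,
          min.freq = 1, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))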

3 Topic Modelling

Latent Dirichlet Allocation (LDA)

https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/
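
A minimal sketch with the topicmodels package, fitting LDA to the crude corpus that ships with tm; the number of topics (k = 5) and the seed are arbitrary illustrative choices, not values from the tutorial linked above.

library(tm)
library(topicmodels)
data("crude")                                       # 20 Reuters articles shipped with tm
dtm <- DocumentTermMatrix(crude,
                          control = list(removePunctuation = TRUE, stopwords = TRUE))
dtm <- dtm[rowSums(as.matrix(dtm)) > 0, ]           # LDA requires non-empty documents
lda <- LDA(dtm, k = 5, control = list(seed = 1234)) # fit a 5-topic model
terms(lda, 5)                                       # top 5 terms per topic
topics(lda)                                         # most likely topic for each document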

4 Social Network Analysis

http://www.rdatamining.com/examples/social-network-analysis

library(tm)
data("crude")
# Document Term Matrix
dtMatrix <- DocumentTermMatrix(crude)
# build a term-term adjacency matrix from the document-term matrix
termMatrix <- t(as.matrix(dtMatrix)) %*% as.matrix(dtMatrix) 
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
##          Terms
## Terms     "growth "if "is "may "none "opec
##   "growth       1   0   0    0     0     0
##   "if           0   1   0    0     1     0
##   "is           0   0   2    1     0     0
##   "may          0   0   1    1     0     0
##   "none         0   1   0    0     2     0
##   "opec         0   0   0    0     0     4
termMatrix <- termMatrix[1:20, 1:20] # keep only the first 20 terms so the graph stays readable

library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$degree <- degree(g)
V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree)+ .2
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
# scale edge colour transparency and width by (log) edge weight
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4)
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam

# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
