Home: http://manmustbecool.github.io/MyWiki/

1 Basic info

Packages overview
- http://cran.r-project.org/web/views/
Installing R in Ubuntu
- ```
sudo apt-get update
sudo apt-get install r-base r-base-dev
```
  install R in linux http://www.jason-french.com/blog/2013/03/11/installing-r-in-linux/

2 Special constants

#------ NULL, NA, NaN, Inf and -Inf -----------
# NULL is an Object meaning the variable contains no object.
is.null() # check is NULL

# NA is a Value meaning missing data
is.na() # returns TRUE for both NA and NaN
na.omit() # remove NA value

# NaN means not a number
is.nan() # returns FALSE for NA and TRUE for NaN

Inf -Inf # for positive and negative infinity

3 Atomic data types

The atomic data types are “logical”, “integer”, “numeric” (synonym “double”), “complex”, “character” and “raw”.

# Formatting decimal
round(1.221554, digits=2)

# --------- string ---------
#  string is considered to be a character vector in R
str <- 'string' # str[1] # This evaluates to 'string'.
nchar(str) # Count the Number of Characters 
substr(str, 1, 2) # == 'st'
strsplit('0-0-1', '-') # Evaluates to list(c('0', '0', '1')): one character vector per input string
strsplit('0-0-1', '')  # Evaluates to list(c('0', '-', '0', '-', '1')): an empty split yields single characters
paste('hello', 'world', sep = ' ') # 'hello world'
paste0('hello', 'world') # 'helloworld'
paste(c('a','b','c'), collapse = "")  # 'abc' , connect character vector to string

4 Common data structures

4.1 vector

A Vector is a sequence of data elements of atomic data types (Numeric, Integer, etc.).

# initialize an empty vector
v <- vector()

# create an arithmetic sequence (numeric vector)
se <- seq(8, 3) # 8,7,6,5,4,3
se <- seq(3, 8, by=2) #  3, 5, 7

# select top and bottom
head(v, 5) # top 5
tail(v, 2) # bottom 2

# select a subset
v <- c(1,2,3,4)
y <- v[v<3]
y <- which(v<3)

# select every n-th element, starting from the first (indices 1, 1+n, 1+2n, ...)
n=2
v <- v[seq(1, length(v), n)]

# append a value
v <- c(1, 2)
v <- c(v, 3) #or v <- append(v, 3)

# delete a value
v <- c("a", "b", "c", "d")
v <- v[!v=="c"]

# delete subset of values
v <- v[!v %in% c("c", "d")]

# find all duplicated entries
v[duplicated(v)] # or remove all duplicated, v[!duplicated(v)]

# find all unique entries
unique(v)

# find the index of the max/min value
which.max(v)

# calculate frequency of occurrence 
v = c(1,1,1,1,1,2,3,4,5,5,5,5,6,7,7,7,7)
table(v) # frequency table
max(table(v)) # highest frequency (not the actual value )

# Set operations
union(x, y)
intersect(x, y) 
setdiff(x, y) # (asymmetric!) difference, setdiff(y, x) for y difference to x
setequal(x, y) 

# compare 2 vector
all(c(1,2,3)==c(1,2,3)) # > TRUE
all(c(1,2,3)==c(2,1,3)) # > FALSE
all(c(1,2,3)==c(1,2,NA)) # > NA

# Return the sum of all the values
sum(v)
sum(v==1) # count the number of value 1

4.2 factor

A Factor is a categorical variable

# factor() transforms a vector into a factor.
f <- factor(c("10", "11", "12"))

# convert the factor's original values (its labels) to numeric values
v <- as.numeric(as.character(f)) 

# get the internal integer coding of the factor, not the original values
v <- as.numeric(f) 

# divides the range of x into intervals
cut(c(1,2,3,4,5,2,3,4,5,6,7), 3)

4.3 matrix

A Matrix is a collection of elements with same data types in a two-dimensional rectangular layout.

# Initialize a matrix
mr <- matrix(ncol=50, nrow=50)

# Convert a vector to a matrix
v <- seq(1, 6)
m <- array(v, c(3,2)) # or m <- matrix(v, nrow=3, ncol=2)
m
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6

# vectors to matrix columns
A <- c(0,5,4,3)
B <- c(8,0,6,5)
C <- c(3,6,0,2)
m <- matrix(c(A,B,C), nrow=length(A))

# Matrix operation
x %*% y    # Multiplication
m <- t(m)  # Transpose

Sparse Matrix is a matrix in which most of the elements are zero.

#---- slam package------------
# The simple triplet matrix from the slam package is a very efficient representation of a large sparse matrix. Cells with unassigned values are assumed to be zero.

library(slam)
i <- c(1,2,3,4,5)   # row index
j <- c(1,1,1,2,2)   # column index
v <- c(5,5,5,6,6)   # value
m <- simple_triplet_matrix(i,j,v)

as.matrix(m)
     [,1] [,2]
[1,]    5    0
[2,]    5    0
[3,]    5    0
[4,]    0    6
[5,]    0    6

#----- Matrix package ------
library(Matrix)
 
m1 <- matrix(0, nrow = 1000, ncol = 1000)
m2 <- Matrix(m1, sparse = TRUE)
 
object.size(m1)
# 8000200 bytes
object.size(m2)
# 5032 bytes

4.4 array

An array is composed of n dimensions where each dimension is a vector of R objects of the same type

Arrays are similar to matrices but can have more than 2 dimensions.

# create a one-dimensional array
ar <- array(3, 5) # [3,3,3,3,3]
ar <- array(c(3,5), 5) # [3,5,3,5,3]: the data is recycled to fill the 5 elements given by dim

# get dimension of array
dim(ar)

array(1:8, dim=c(2,2,2))

, , 1
     [,1] [,2]
[1,]    1    3
[2,]    2    4

, , 2
     [,1] [,2]
[1,]    5    7
[2,]    6    8

4.5 data.frame

A data.frame is like a table. It is a list of column vectors of equal length. Different columns can be of various types, i.e., one column might be a numerical variable, another might be a factor.

Initialize data frame

# Initialize a df
df <- data.frame()
df <- data.frame(stringsAsFactors=F) # Avoid that R understands characters as factors

# Initialize a df from a matrix 
df <- data.frame(matrix(ncol=10, nrow=5))
...
df <- df[-1,]

# Initialize from value
timestamps <- c('2011-01-05 11:00', '2011-01-05 12:00', '2011-01-05 13:00')
prices <- c(1.1, 2.2, 3.3)
stockpricesDf <- data.frame(timestamps, prices)

# Get/Set row/column names 
rownames(df)   
colnames(df) 
rownames(df) <- value

# Get number of rows/columns
nrow(df)
ncol(df)

Subset data frame

sub_df <- df[c(1,2,5), ] # Subset by row indexes
sub_df <- df[, c(1,2,5)] # Subset by column indexes

sub_df <- df[rownames(df)=='user1', ] # Subset by row names
sub_df <- df[rownames(df)=='user1', ,drop=FALSE] # passed on to the indexing method to keep data.frame and matrix format
sub_df <- df[, c('start','end')] # Subset by column names

# filter rows with condition
sub_df <- df[df$age>40, ]

# sample rows 
sub_df <- df[sample(nrow(df), 3), ]

# delete rows/column by index
sub_df <- df[-c(1:8), ] # delete rows
sub_df <- df[, -c(1:8)] # delete columns

# delete columns by names 
sub_df <- df[, !(colnames(df) %in% c("TotalDuration") )]

Append data frame

# add a row or column
x <- c(1,2,3)
df <- rbind(df, x); # add a row
df <- cbind(df, x); # add a column

# Avoid that R understands characters as factors
df <- data.frame(stringsAsFactors=F) 
df <- rbind(df, c("hi","i","am","bob"), stringsAsFactors=F)
df <- rbind(df, c("hi","i","am","alice"), stringsAsFactors=F)

# Build all rows in one pass with lapply() instead of growing a list with
# append() inside a loop (every append copies the list), then bind them once.
temp <- lapply(seq_len(100), function(i) c(1, i))
df <- as.data.frame(do.call(rbind, temp))

# create a new column with merged values of two columns
df$new_column = paste(df$V1, df$V2, sep="_")

Combine data frames

# build a list of data frames
dfList <- list()
for(i in 1:10){ dfList[[i]] <- data.frame(a=c("a1","a2"), b=c("b1","b2")) }

# do.call approach
finalDf <- do.call(rbind, dfList)

# an efficient implementation of do.call(rbind, ...);
library(dplyr) # for bind_rows
finalDf <- bind_rows(dfList)

Sort

# sort by column index
df[order(-df[,4], df[,1]), ]

# sort according a specified order or vector
target <- c("b", "c", "a", "d")
df[match(target, df$name),]

Summarize data frame

# calculate row sum and product in data.frame
transform(df, sum=rowSums(df), prod=x*y*z)
 #  x y z sum prod
 #1 1 2 3   6    6
 #2 2 3 4   9   24
 #3 5 1 2   8   10
 
# apply a function to each row (MARGIN = 1); the function must be called on
# the row u -- `function(u) sum` would only return the function `sum` itself
apply(df, 1, function(u) sum(u))
 
# apply a function to a data frame Split by Factors
by(df[, 1:2], df[,3], summary)

# row frequency count
# http://rwiki.sciviews.org/doku.php?id=tips:data-frames:count_and_extract_unique_rows
aggregate(rep(1, nrow(df)), by=as.list(df), FUN=sum)

Handle duplication

# find/remove duplicated rows
df[duplicated(df),] # or remove, df[!duplicated(df),]

# remove duplicated rows by a column
df[!duplicated(df[,c('firstname')]),]

# find unique rows
unique(df)

# remove rows or columns whose values are all the same
df <- df[apply(df, 1, function(row) length(unique(row)) > 1), ]
df <- df[, apply(df, 2, function(col) length(unique(col)) > 1)]

Handle na, nan, null, blank

df[is.na(df)] <- value  # Convert all Na's in data frames to other values

# Remove rows with NAs in data frames
df[complete.cases(df),]       # remove all rows containing NAs
df[complete.cases(df[,2:3]),] # remove rows based on some columns

# Remove rows with a missing value in a specific column. is.null() is not
# vectorised: is.null(df[,1]) is a single FALSE, so `!is.null(...)` kept every
# row. Missing entries inside a column are NA, so test them with is.na().
df[!is.na(df[,1]),]
# Remove rows with a blank value in a specific column. %in% never returns NA,
# so rows whose value is NA are kept as they are instead of becoming all-NA rows.
df[!(df[,1] %in% ""),]

df[rowSums(is.na(df))!= ncol(df),] # Remove rows where all data is NA
df[,colSums(is.na(df))<nrow(df)]   # Remove columns where All values are NA

Multiple data frames

# ----- join two data frames
df1<- data.frame(CustomerId=c(1,2),y=c(1,1))
df2<- data.frame(CustomerId=c(3,4),y=c(2,2))

merge(x=df1, y=df2, by="CustomerId")          # default all=FALSE. natural join, a special case of an inner join
# CustomerId y.x        y.y # 0 row

merge(x=df1, y=df2, by="CustomerId", all = T) # (full) outer join
#  CustomerId y.x y.y
# 1          1   1  NA
# 2          2   1  NA
# 3          3  NA   2
# 4          4  NA   2

merge(x=df1, y=df2, by="CustomerId", all.x=T) # left (outer) join
#   CustomerId y.x y.y
# 1          1   1  NA
# 2          2   1  NA

merge(x=df1, y=df2, by="CustomerId", all.y=T) # right (outer) join
#   CustomerId y.x y.y
# 1          3  NA   2
# 2          4  NA   2

merge(x=df1, y=df2, by=NULL)                  # cross join
#   CustomerId.x y.x CustomerId.y y.y
# 1            1   1            3   2
# 2            2   1            3   2
# 3            1   1            4   2
# 4            2   1            4   2

merge(x=df1, y=df2, by=c("CustomerId","y"))
# CustomerId y         
# <0 rows> 

merge(x=df1, y=df2, by=c("CustomerId","y"), all=T)
#   CustomerId y
# 1          1 1
# 2          2 1
# 3          3 2
# 4          4 2

# ----- compare two data frames
a1 <- data.frame(a = 1:5, b=letters[1:5])
a2 <- data.frame(a = 1:3, b=letters[1:3])

library(sqldf) # library() stops with an error if the package is missing; require() only returns FALSE
a1Ina2 <- sqldf('SELECT * FROM a1 INTERSECT SELECT * FROM a2') # rows that are in both data frames
a1NotIna2 <- sqldf('SELECT * FROM a1 EXCEPT SELECT * FROM a2') # rows of a1 that are not in a2

library(dplyr)
semi_join(a1,a2) # rows of a1 that have a match in a2
anti_join(a1,a2) # rows of a1 that have no match in a2

Stack columns

df <- data.frame(a=c(1,1), b=c(2,2), c=c(3,3))
# Stack the data:
sdf <- stack(df)

#   values ind
# 1      1   a
# 2      1   a
# 3      2   b
# 4      2   b
# 5      3   c
# 6      3   c

4.5.1 data.table

use data.table for data.frame

library(data.table)

# convert the data frame to data table
dt <- data.table(df) 

# convert a data.table to a data frame
# (as.data.frame.matrix is the method for matrices; call the generic instead)
df <- as.data.frame(dt) # or setDF(dt) to convert in place

# select columns by names
dt <- dt[ ,c("time","count"), with=F] # need set with=F

# return a column as a vector
dt[[2]] # select by index

# delete a column by name in data.table
dt <- dt[,foo:=NULL]

dt <- dt[, c("foo","bar"):=NULL]  # remove two columns

myVar = "foo"
dt <- dt[, (myVar):=NULL]   # lookup myVar contents

#------- merge tables -----
dt1 <- data.table(x=1, y=2)
dt2 <- data.table(y=1, x=2)
dt3 <- rbindlist(list(dt1,dt2), use.names=T, fill=F) # use fill=T for binding by names and fill missing columns
#    x y
# 1: 1 2
# 2: 2 1

# ------ group by with data.table
dt[, list(mean=mean(age), sd=sd(age)), by=company] # summarize
dt[, list(ages=list(age)), by=company] # list
dt[, count:=.N, by=list(a,b)] # row frequency count by group; .N is the number of rows in each group

# group by a column whose name is only known at run time:
# `by` accepts a character vector of column names, so no parse()/eval() is needed
by_col <- colnames(dt)[2]
dt[, list(ages=list(age)), by=by_col]

4.5.2 tidyr

reshape data.frame with tidyr

library(tidyr)
df <- data.frame(month=rep(1:3,2),
                 student=rep(c("Amy", "Bob"), each=3),
                 score=c(9, 7, 6, 8, 6, 9))
                 
#     month student score
# 1     1     Amy     9
# 2     2     Amy     7
# 3     3     Amy     6
# 4     1     Bob     8
# 5     2     Bob     6
# 6     3     Bob     9

# ---- Reshaping long format to wide format
df_w <- spread(df, student, score)  # args (data, key, value, ...)

#    month Amy Bob
# 1     1   9   8
# 2     2   7   6
# 3     3   6   9

# ----- Reshaping wide format to long format
gather(df_w, student, score, Amy:Bob  )  # args (data, key, value, ...)

#     month student score
# 1     1     Amy     9
# 2     2     Amy     7
# 3     3     Amy     6
# 4     1     Bob     8
# 5     2     Bob     6
# 6     3     Bob     9

4.6 list

A List is an ordered collection of R objects.

# initialize a list
ls <- vector('list', 50) # vector(mode="list", length=50)
ls[[3]] <- c(1,2,3) # assign a value

# convert a list to a data frame
my_df <- as.data.frame(do.call(rbind, my_ls))

# select a subset from a list
v <- v[v>5]

# append a list
l1 <- append(l1, l2) # each elements of l2 will be an element of l1
l1 <- append(l1, list(l2)) # the l2 will be one element of l1

5 Other data structures

5.1 object

# pass string name as object
get("objectX")  

# get object name as string
# dQuote(objectX) would quote the *value* of objectX, not its name.
# substitute() captures the unevaluated expression and deparse() turns it into
# a string: "objectX" at top level, or, inside a function, the expression the
# caller passed for the argument objectX.
deparse(substitute(objectX))

# object memory size
object.size(objectX)

5.2 environment

an environment is a bag of object references. http://adv-r.had.co.nz/Environments.html

# create an environment
e <- new.env(hash=TRUE)

# create an object in the environment
e$a <- 1:3
e$a <- 2:4 # assigning again replaces the value bound to a in e

# check if an object exists in the environment
exists("a", envir=e)

5.3 formula

Creating a formula from a string

as.formula("y ~ x1 + x2")  #> y ~ x1 + x2

6 Common functions

# -------- apply --------
http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/

# ----- random number --------
runif(10)       # for uniform distribution
rnorm(10)       # for normal distribution
sample(1:10)    # for vector 1:10 in random order
sample(300, 30) # Sample 30 from 1 to 300

sort(sample(300, 30)) # sorted result

# get replicable sample 
set.seed(123) # sample results are same everytime for same seed
sample(300, 30)

# ----- pipe operator -----
library(dplyr)
data %>% func # = func(data)
data %>% func(arg=value) # = func(data, arg=value)

7 Regular expression

# ----- return position --------
# x is a string
pos = regexpr('pattern', x) # Returns position of 1st match in a string
pos = gregexpr('pattern', x) # Returns positions of every match in a string
# x is a vector
pos = grep('pattern', x) #  returns the index of the matching string in the vector

#------ replace ---------
sub('pattern', replacement, input) # Changes only the 1st pattern match per string
gsub('pattern', replacement, input) # Changes every occurrence of a pattern match

# ------ extract ----------
library(stringr)
str_extract(x, 'pattern')

# ------- insert a string -------
gsub('^([a-z]{2})([a-z]+)$', '\\1cc\\2', "aabb") # aabb -> aaccbb

8 CSV input/output

#---- scan lines from a file ----
#  data in the example.dat file 
#  10
#  12
#  13

data <- scan(".../misc/example.dat", skip=1)
data
# [1] 12 13

# ----- read CSV --------
# for small size dataset
# sep="," because CSV fields are comma-separated (sep="" would split on whitespace instead)
data <- read.table(file, header=FALSE, sep=",", quote ="\"'")

# for large size dataset
library(data.table)
data <- fread(file, header=F) # for better performance

# ---- write CSV ------
write.table(MyData, file = "MyData.csv", row.names=FALSE, na="", col.names=FALSE, sep=",")

8.1 Chinese character

Sys.getlocale(category="LC_ALL")
Sys.setlocale(category="LC_ALL", locale = "chinese")

data <- fread(file, header = T,  encoding="UTF-8", sep = "\t")

Import zipped CSV file

# file structure
#  zipdata.zip
#     |--- csvdata
#         |---- 1.csv
#         |---- 2.csv
#         |---- 3.csv

filePath <- "zipdata.zip"

# ---  Read one file ------------
# unzip() extracts the requested member and returns the path of the extracted file
data <- read.table(unzip(filePath, "csvdata/1.csv"),
                   nrows=10, header=TRUE, quote="\"", sep=";", stringsAsFactors=FALSE)

# ---  Read multiple files ------------
# unzip files in a temp directory
temp_dir <- "unzip_temp"
dir.create(temp_dir)
unzip(filePath, exdir=temp_dir)

# list files in the temp dir
# pattern is a regular expression: escape the dot and anchor the end so that
# only names ending in ".csv" match; full.names=TRUE returns ready-to-use paths
fileList <- list.files(file.path(temp_dir, "csvdata"), pattern="\\.csv$", full.names=TRUE)

# combine the files
myfiles <- do.call(rbind, 
                  lapply(fileList, 
                         function(x) read.table(x, sep=";", stringsAsFactors=FALSE, header=TRUE) ))

# delete the temp dir
unlink(temp_dir, recursive=TRUE)

9 Parallel R in Windows

http://decisionstats.com/2010/09/24/parallel-programming-using-r-in-windows/

require(doSNOW)
cl<-makeCluster(2) # I have two cores
registerDoSNOW(cl)

# Workload to run in each iteration of the loop: draw a random 10 x 10 matrix
# and invert it, 1000 times. The argument n is accepted but not used.
check <- function(n) {
  for (iter in seq_len(1000)) {
    rand_mat <- matrix(rnorm(100), nrow = 10, ncol = 10)
    solve(rand_mat)
  }
}

times <- 10   # times to run the loop
system.time(x <- foreach(j=1:times ) %dopar% check(j))
system.time(for(j in 1:times ) x <- check(j))
stopCluster(cl)

10 tryCatch exception handling

tryCatch({
    expr
  }, warning = function(w) {
    warning-handler-code
  }, error = function(e) {
    error-handler-code
  }, finally = {
    cleanup-code
  }
)

# Assign value outside tryCatch scope

x <- tryCatch()
# or 
x 
tryCatch(
  x <<- 12
)

11 SQL query

JDBC

library(RJDBC) 
postgres <- JDBC( "org.postgresql.Driver", "C:/.../postgresql-8.4-703.jdbc4.jar")
conn <- dbConnect(postgres, "jdbc:postgresql://*.*.*.*:5432/xxdb", user="postgres", password="postgres")
myDf <- dbGetQuery(conn, "select * from tableX")

ODBC

library(RODBC)
conn <- odbcDriverConnect('Driver=SQL Server; Server=10.x.x.x\\SQLEXPRESS; Database=xxx;')
myDF <- sqlQuery(conn, "select * from [xxx].[dbo].[myTable]")

Home Page

R Cheat Sheet

Updated: 07 July, 2019