sudo apt-get update
sudo apt-get install r-base r-base-dev
#------ NULL, NA, NaN, Inf and -Inf -----------
# NULL is an object meaning the variable contains no object.
is.null() # checks whether its argument is NULL
# NA is a value meaning missing data.
is.na() # returns TRUE for both NA and NaN
na.omit() # removes NA values
# NaN means "not a number".
is.nan() # returns FALSE for NA and TRUE for NaN
Inf -Inf # Inf and -Inf denote positive and negative infinity (written on one line like this, R parses it as Inf - Inf, which is NaN)
The atomic data types are “logical”, “integer”, “numeric” (synonym “double”), “complex”, “character” and “raw”.
# Formatting decimal
round(1.221554, digits=2)
# --------- string ---------
# A string is a character vector in R (a single string is a vector of length 1).
str <- 'string' # str[1] # This evaluates to 'string'.
nchar(str) # Count the number of characters
substr(str, 1, 2) # == 'st'
strsplit('0-0-1', '-') # Evaluates to list(c('0', '0', '1')): a list with one character vector per input string
strsplit('0-0-1', '') # Evaluates to list(c('0', '-', '0', '-', '1'))
paste('hello', 'world', sep = ' ') # 'hello world'
paste0('hello', 'world') # 'helloworld'
paste(c('a','b','c'), collapse = "") # 'abc'; collapses a character vector into a single string
A Vector is a sequence of data elements of atomic data types (Numeric, Integer, etc.).
# initialize an empty vector
v <- vector()
# create an arithmetic sequence (numeric vector)
se <- seq(8, 3) # 8,7,6,5,4,3
se <- seq(3, 8, by=2) # 3, 5, 7
# select the top and bottom elements
head(v, 5) # first 5
tail(v, 2) # last 2
# select a subset
v <- c(1,2,3,4)
y <- v[v<3] # the values that satisfy the condition
y <- which(v<3) # the indices of those values
# select every n-th element, starting from the first (indices 1, 1+n, 1+2n, ...)
n=2
v <- v[seq(1, length(v), n)]
# append a value
v <- c(1, 2)
v <- c(v, 3) #or v <- append(v, 3)
# delete a value
v <- c("a", "b", "c", "d")
v <- v[!v=="c"]
# delete subset of values
v <- v[!v %in% c("c", "d")]
# find all duplicated entries
v[duplicated(v)] # or remove all duplicated, v[!duplicated(v)]
# find all unique entries
unique(v)
# find the index of the max/min value
which.max(v)
# calculate frequency of occurrence
v = c(1,1,1,1,1,2,3,4,5,5,5,5,6,7,7,7,7)
table(v) # frequency table
max(table(v)) # highest frequency (not the actual value )
# Set operations
union(x, y)
intersect(x, y)
setdiff(x, y) # (asymmetric!) difference, setdiff(y, x) for y difference to x
setequal(x, y)
# compare 2 vector
all(c(1,2,3)==c(1,2,3)) # > TRUE
all(c(1,2,3)==c(2,1,3)) # > FALSE
all(c(1,2,3)==c(1,2,NA)) # > NA
# Return the sum of all the values
sum(v)
sum(v==1) # count the number of value 1
A Factor is a categorical variable
# factor() transforms a vector into a factor.
f <- factor(c("10", "11", "12"))
# convert a factor's original values (its labels) to numeric values
v <- as.numeric(as.character(f))
# get the numeric coding (integer level codes) of the factor, not the original values
v <- as.numeric(f)
# divides the range of x into intervals
cut(c(1,2,3,4,5,2,3,4,5,6,7), 3)
A Matrix is a collection of elements with same data types in a two-dimensional rectangular layout.
# Initialize a matrix
mr <- matrix(ncol=50, nrow=50)
# Convert a vector to a matrix
v <- seq(1, 6)
m <- array(v, c(3,2)) # or m <- matrix(v, nrow=3, ncol=2)
m
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
# vectors to matrix columns
A <- c(0,5,4,3)
B <- c(8,0,6,5)
C <- c(3,6,0,2)
m <- matrix(c(A,B,C), nrow=length(A))
# Matrix operation
x %*% y # Multiplication
m <- t(m) # Transpose
Sparse Matrix is a matrix in which most of the elements are zero.
#---- slam package------------
# simple triplet matrix from slam package is very efficient representation of a large sparse matrix. Cell with unassigned values are assumed to be zero.
library(slam)
i <- c(1,2,3,4,5) # row index
j <- c(1,1,1,2,2) # column index
v <- c(5,5,5,6,6) # value
m <- simple_triplet_matrix(i,j,v)
as.matrix(m)
[,1] [,2]
[1,] 5 0
[2,] 5 0
[3,] 5 0
[4,] 0 6
[5,] 0 6
#----- Matrix package ------
library(Matrix)
m1 <- matrix(0, nrow = 1000, ncol = 1000)
m2 <- Matrix(m1, sparse = TRUE)
object.size(m1)
# 8000200 bytes
object.size(m2)
# 5032 bytes
An array is composed of n dimensions where each dimension is a vector of R objects of the same type
Arrays are similar to matrices but can have more than 2 dimensions.
# create an one dimension array
ar <- array(3, 5) # [3,3,3,3,3]
ar <- array(c(3,5), 5) # [3,5,3,5,3,5,3,5,3,5]
# get the dimensions of an array
dim(ar)
array(1:8, dim=c(2,2,2))
, , 1
[,1] [,2]
[1,] 1 3
[2,] 2 4
, , 2
[,1] [,2]
[1,] 5 7
[2,] 6 8
A data.frame is like a table. It is a list of column vectors of equal length. Different columns can be of different types, e.g., one column might be a numerical variable and another might be a factor.
Initialize data frame
# Initialize a df
df <- data.frame()
df <- data.frame(stringsAsFactors=F) # Avoid that R understands characters as factors
# Initialize a df from a matrix
df <- data.frame(matrix(ncol=10, nrow=5))
...
df <- df[-1,]
# Initialize from value
timestamps <- c('2011-01-05 11:00', '2011-01-05 12:00', '2011-01-05 13:00')
prices <- c(1.1, 2.2, 3.3)
stockpricesDf <- data.frame(timestamps, prices)
# Get/Set row/column names
rownames(df)
colnames(df)
rownames(df) <- value
# Get number of rows/columns
nrow(df)
ncol(df)
Subset data frame
sub_df <- df[c(1,2,5), ] # Subset by row indexes
sub_df <- df[, c(1,2,5)] # Subset by column indexes
sub_df <- df[rownames(df)=='user1', ] # Subset by row names
sub_df <- df[rownames(df)=='user1', ,drop=FALSE] # passed on to the indexing method to keep data.frame and matrix format
sub_df <- df[, c('start','end')] # Subset by column names
# filter rows with condition
sub_df <- df[df$age>40, ]
# sample rows
sub_df <- df[sample(nrow(df), 3), ]
# delete rows/column by index
sub_df <- df[-c(1:8), ] # delete rows
sub_df <- df[, -c(1:8)] # delete columns
# delete columns by names
sub_df <- df[, !(colnames(df) %in% c("TotalDuration") )]
Append data frame
# add a row or column
x <- c(1,2,3)
df <- rbind(df, x); # add a row
df <- cbind(df, x); # add a column
# Avoid that R understands characters as factors
df <- data.frame(stringsAsFactors=F)
df <- rbind(df, c("hi","i","am","bob"), stringsAsFactors=F)
df <- rbind(df, c("hi","i","am","alice"), stringsAsFactors=F)
# Build all rows in one pass instead of growing a list with append() inside a
# loop (every append copies the whole list, so the loop is quadratic).
temp <- lapply(1:100, function(i) c(1, i))
# Stack the rows into a matrix and convert it; the columns are named V1 and V2.
df <- as.data.frame(do.call(rbind, temp))
# create a new column with merged values of two columns
df$new_column = paste(df$V1, df$V2, sep="_")
Combine data frames
# build a list of data frames
dfList <- list()
for(i in 1:10){ dfList[[i]] <- data.frame(a=c("a1","a2"), b=c("b1","b2")) }
# do.call approach
finalDf <- do.call(rbind, dfList)
# an efficient implementation of do.call(rbind, ...);
library(dplyr) # for bind_rows
finalDf <- bind_rows(dfList)
Sort
# sort by column index
df[order(-df[,4], df[,1]), ]
# sort according a specified order or vector
target <- c("b", "c", "a", "d")
df[match(target, df$name),]
Summarize data frame
# calculate row sum and product in data.frame
transform(df, sum=rowSums(df), prod=x*y*z)
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
# apply a function to each row: the anonymous function has to call sum() on
# its argument; returning the bare `sum` yields a list of functions, not totals
apply(df, 1, function(u) sum(u))
# apply a function to a data frame Split by Factors
by(df[, 1:2], df[,3], summary)
# row frequency count
# http://rwiki.sciviews.org/doku.php?id=tips:data-frames:count_and_extract_unique_rows
aggregate(rep(1, nrow(df)), by=as.list(df), FUN=sum)
Handle duplication
# find/remove duplicated rows
df[duplicated(df),] # or remove, df[!duplicated(df),]
# remove duplicated rows by a column
df[!duplicated(df[,c('firstname')]),]
# find unique rows
unique(df)
# remove rows or columns whose values are all the same
df <- df[apply(df, 1, function(row) length(unique(row)) > 1), ]
df <- df[, apply(df, 2, function(col) length(unique(col)) > 1)]
Handle na, nan, null, blank
df[is.na(df)] <- value # Convert all NAs in a data frame to another value
# Remove rows with NAs in data frames
df[complete.cases(df),] # remove all rows containing NAs
df[complete.cases(df[,2:3]),] # remove rows based on some columns
# Remove rows with a NULL entry in a specific (list) column. is.null() must be
# applied per element: is.null(df[,1]) tests the column as a whole, returns a
# single FALSE and therefore keeps every row.
df[!vapply(df[[1]], is.null, logical(1)),]
# Remove rows with a blank value in a specific column. %in% never returns NA,
# so rows whose value is NA are kept instead of turning into all-NA rows.
df[!(df[,1] %in% ""),]
df[rowSums(is.na(df))!= ncol(df),] # Remove rows where all data is NA
df[,colSums(is.na(df))<nrow(df)] # Remove columns where all values are NA
Multiple data frames
# ----- join two data frames
df1<- data.frame(CustomerId=c(1,2),y=c(1,1))
df2<- data.frame(CustomerId=c(3,4),y=c(2,2))
merge(x=df1, y=df2, by="CustomerId") # default all=FALSE. natural join, a special case of an inner join
# CustomerId y.x y.y # 0 row
merge(x=df1, y=df2, by="CustomerId", all = T) # (full) outer join
# CustomerId y.x y.y
# 1 1 1 NA
# 2 2 1 NA
# 3 3 NA 2
# 4 4 NA 2
merge(x=df1, y=df2, by="CustomerId", all.x=T) # left (outer) join
# CustomerId y.x y.y
# 1 1 1 NA
# 2 2 1 NA
merge(x=df1, y=df2, by="CustomerId", all.y=T) # right (outer) join
# CustomerId y.x y.y
# 1 3 NA 2
# 2 4 NA 2
merge(x=df1, y=df2, by=NULL) # cross join
# CustomerId.x y.x CustomerId.y y.y
# 1 1 1 3 2
# 2 2 1 3 2
# 3 1 1 4 2
# 4 2 1 4 2
merge(x=df1, y=df2, by=c("CustomerId","y"))
# CustomerId y
# <0 rows>
merge(x=df1, y=df2, by=c("CustomerId","y"), all=T)
# CustomerId y
# 1 1 1
# 2 2 1
# 3 3 2
# 4 4 2
# ----- compare two data frames
a1 <- data.frame(a = 1:5, b=letters[1:5])
a2 <- data.frame(a = 1:3, b=letters[1:3])
require(sqldf)
a1Ina2 <- sqldf('SELECT * FROM a1 INTERSECT SELECT * FROM a2') # for rows are in both data frames
a1NotIna2 <- sqldf('SELECT * FROM a1 EXCEPT SELECT * FROM a2')
require(dplyr)
semi_join(a1,a2) # for rows are in both data frames
anti_join(a1,a2)
Stack columns
df <- data.frame(a=c(1,1), b=c(2,2), c=c(3,3))
# Stack the data:
sdf <- stack(df)
# values ind
# 1 1 a
# 2 1 a
# 3 2 b
# 4 2 b
# 5 3 c
# 6 3 c
use data.table for data.frame
library(data.table)
# convert the data frame to a data.table
dt <- data.table(df)
# convert a data.table back to a plain data frame; call the as.data.frame()
# generic so the method for the object's class is dispatched.
# as.data.frame.matrix() is the matrix method and is not meant for data.table input.
df <- as.data.frame(dt)
# select colum by names
dt <- dt[ ,c("time","count"), with=F] # need set with=F
# return a column as a vector
dt[[2]] # select by index
# delete a column by name in data.table
dt <- dt[,foo:=NULL]
dt <- dt[, c("foo","bar"):=NULL] # remove two columns
myVar = "foo"
dt <- dt[, (myVar):=NULL] # lookup myVar contents
#------- merge tables -----
dt1 <- data.table(x=1, y=2)
dt2 <- data.table(y=1, x=2)
dt3 <- rbindlist(list(dt1,dt2), use.names=T, fill=F) # use fill=T for binding by names and fill missing columns
# x y
# 1: 1 2
# 2: 2 1
# ------ group by with data.table
dt[, list(mean=mean(age), sd=sd(age)), by=company] # summarize
dt[, list(ages=list(age)), by=company] # collect each group's values into a list
dt[, count:=.N, by=list(a,b)] # row frequency count by group; .N is the number of rows in each group
# dynamic grouping column: build the `by` expression from a column name held in
# a variable (here the second column of dt), then evaluate it inside the call
# NOTE(review): data.table's `by` is documented to accept a character vector of
# column names as well, which would avoid parse()/eval() - confirm before switching
expr <- parse(text = paste("by=", colnames(dt)[2]))
dt[, list(ages=list(age)), eval(expr)]
reshape data.frame with tidyr
library(tidyr)
df <- data.frame(month=rep(1:3,2),
student=rep(c("Amy", "Bob"), each=3),
score=c(9, 7, 6, 8, 6, 9))
# month student score
# 1 1 Amy 9
# 2 2 Amy 7
# 3 3 Amy 6
# 4 1 Bob 8
# 5 2 Bob 6
# 6 3 Bob 9
# ---- Reshaping long format to wide format (spread: one column per student)
df_w <- spread(df, student, score) # args (data, key, value, ...)
# month Amy Bob
# 1 1 9 8
# 2 2 7 6
# 3 3 6 9
# ----- Reshaping wide format to long format (gather: back to one row per month/student)
gather(df_w, student, score, Amy:Bob ) # args (data, key, value, ...)
# month student score
# 1 1 Amy 9
# 2 2 Amy 7
# 3 3 Amy 6
# 4 1 Bob 8
# 5 2 Bob 6
# 6 3 Bob 9
A List is an ordered collection of R objects.
# initialize a list
ls <- vector('list', 50) # vector(mode="list", length=50)
ls[[3]] <- c(1,2,3) # assign a value
# convert a list to a data frame
my_df <- as.data.frame(do.call(rbind, my_ls))
# select a subset from a list
v <- v[v>5]
# append a list
l1 <- append(l1, l2) # each elements of l2 will be an element of l1
l1 <- append(l1, list(l2)) # the l2 will be one element of l1
# look up an object by its name given as a string
get("objectX")
# get an object's name as a string: quote() captures the symbol without
# evaluating it and deparse() turns it into text. (dQuote(objectX) would wrap
# the object's *value* in quotation marks instead.) Inside a function, use
# deparse(substitute(arg)) to recover the expression the caller passed for arg.
deparse(quote(objectX))
# object memory size
object.size(objectX)
an environment is a bag of object references. http://adv-r.had.co.nz/Environments.html
# create an environment
e <- new.env(hash=TRUE)
# create an object in the environment
e$a <- 1:3
e$a <- 2:4 # assigning to the same name again replaces the previous value
# check whether an object exists in the environment
exists("a", envir=e)
Creating a formula from a string
as.formula("y ~ x1 + x2") #> y ~ x1 + x2
# -------- apply --------
http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/
# ----- random number --------
runif(10) # for uniform distribution
rnorm(10) # for normal distribution
sample(1:10) # for vector 1:10 in random order
sample(300, 30) # Sample 30 from 1 to 300
sort(sample(300, 30)) # sorted result
# get replicable sample
set.seed(123) # sample results are same everytime for same seed
sample(300, 30)
# ----- pipe operator -----
library(dplyr)
data %>% func # = func(data)
data %>% func(arg=value) # = func(data, arg=value)
# ----- return position --------
# x is a string
pos = regexpr('pattern', x) # Returns position of 1st match in a string
pos = gregexpr('pattern', x) # Returns positions of every match in a string
# x is a vector
pos = grep('pattern', x) # returns the index of the matching string in the vector
#------ replace ---------
sub('pattern', replacement, input) # Changes only the 1st pattern match per string
gsub('pattern', replacement, input) # Changes every occurrence of a pattern match
# ------ extract ----------
library(stringr)
str_extract(x, 'pattern')
# ------- insert a string -------
gsub('^([a-z]{2})([a-z]+)$', '\\1cc\\2', "aabb") # aabb -> aaccbb
#---- scan lines from a file ----
# data in the example.dat file
# 10
# 12
# 13
data <- scan(".../misc/example.dat", skip=1)
data
# [1] 60 43 67 50
# ----- read CSV --------
# for small size dataset
data <- read.table(file, header=F, sep="", quote ="\"'")
# for large size dataset
library(data.table)
data <- fread(file, header=F) # for better performance
# ---- write CSV ------
write.table(MyData, file = "MyData.csv", row.names=FALSE, na="", col.names=FALSE, sep=",")
Sys.getlocale(category="LC_ALL")
Sys.setlocale(category="LC_ALL", locale = "chinese")
data <- fread(file, header = T, encoding="UTF-8", sep = "\t")
Import zipped CSV file
# file structure
# zipdata.zip
# |--- csvdata
# |---- 1.csv
# |---- 2.csv
# |---- 3.csv
filePath <- "zipdata.zip"
# --- Read one file ------------
# unzip() extracts the requested member and returns the path of the extracted file
data <- read.table(unzip(filePath, "csvdata/1.csv"),
                   nrows=10, header=TRUE, quote="\"", sep=";", stringsAsFactors=FALSE)
# --- Read multiple files ------------
# unzip files in a temp directory
temp_dir <- "unzip_temp"
dir.create(temp_dir)
unzip(filePath, exdir=temp_dir)
# list the CSV files in the temp dir. `pattern` is a regular expression, so the
# dot is escaped and the match anchored to the end of the name; a bare ".csv"
# would also match names such as "notcsv.txt".
fileList <- list.files(paste0(temp_dir, "/csvdata/"), pattern="\\.csv$")
fileList <- paste0(temp_dir, "/csvdata/", fileList)
# combine the files into one data frame
myfiles <- do.call(rbind,
                   lapply(fileList,
                          function(x) read.table(x, sep=";", stringsAsFactors=FALSE, header=TRUE) ))
# delete the temp dir
unlink(temp_dir, recursive=TRUE)
http://decisionstats.com/2010/09/24/parallel-programming-using-r-in-windows/
require(doSNOW)
cl<-makeCluster(2) # I have two cores
registerDoSNOW(cl)
# create a function to run in each iteration of the loop
# Workload for the serial-vs-parallel timing comparison: draws a random
# 10 x 10 matrix and inverts it, 1000 times over. The argument `n` is
# accepted but not used by the body; the function returns NULL.
check <- function(n) {
  for (iteration in seq_len(1000)) {
    solve(matrix(rnorm(100), nrow = 10, ncol = 10))
  }
}
times <- 10 # times to run the loop
system.time(x <- foreach(j=1:times ) %dopar% check(j))
system.time(for(j in 1:times ) x <- check(j))
stopCluster(cl)
tryCatch({
expr
}, warning = function(w) {
warning-handler-code
}, error = function(e) {
error-handler-code
}, finally = {
cleanup-code
}
)
# Assign value outside tryCatch scope
x <- tryCatch()
# or
x
tryCatch(
x <<- 12
)
library(RJDBC)
postgres <- JDBC( "org.postgresql.Driver", "C:/.../postgresql-8.4-703.jdbc4.jar")
conn <- dbConnect(postgres, "jdbc:postgresql://*.*.*.*:5432/xxdb", user="postgres", password="postgres")
myDf <- dbGetQuery(conn, "select * from tableX")
library(RODBC)
conn <- odbcDriverConnect('Driver=SQL Server; Server=10.x.x.x\\SQLEXPRESS; Database=xxx;')
myDF <- sqlQuery(conn, "select * from [xxx].[dbo].[myTable]")