Data Types

a[-c(1, 3)] a[[3]] <- NULL a[c(TRUE, FALSE, TRUE)] ### yes

Atomic vector

c() combines its arguments into an atomic vector whose elements all share one type (character, double, integer, or logical)

Basic

v <- c(1,2L,3!=3,c(4==4)) #== c(1,2L,3!=3,c(4==4)) -> v
# TRUE (T) is coerced to 1 and FALSE (F) to 0
# The L suffix makes 2L an integer literal
# 1 2 0 1

names(v) <- letters[1:4]
# a b c d 
# 1 2 0 1 

v[2]
# b 
# 2
v[c(1,2,2,1)]
# 1 2 2 1
v[[2]]
# 2

v[2:4]
#== v[c(2,3,4)];v[c(2:4)];v[c(F,T,T,T)]
# b c d 
# 2 0 1 

v[c("a","b")]
v[1:2]
# a b 
# 1 2

v[-2] # Drop the 2nd element
# a c d 
# 1 0 1 
v[-1:-2] #  Drop first two
# 0 1

v[v>0]
# 1 2 1

append(v,2)
# 1 2 0 1 2
append(v,v,after = 2)
# 1 2 1 2 0 1 0 1

Logic

# & and | work element by element and return a vector of the same length
c(FALSE, TRUE, FALSE, TRUE) & c(TRUE, FALSE, FALSE, TRUE)
# FALSE FALSE FALSE  TRUE
# && and || expect length-one operands: since R 4.3.0 longer vectors are an
# error (older versions silently used only the first elements), so the first
# elements are selected explicitly here
c(FALSE, TRUE, FALSE, TRUE)[1] && c(TRUE, FALSE, FALSE, TRUE)[1]
# FALSE
c(FALSE, TRUE, FALSE, TRUE) | c(TRUE, FALSE, FALSE, TRUE)
# TRUE  TRUE FALSE  TRUE
c(FALSE, TRUE, FALSE, TRUE)[1] || c(TRUE, FALSE, FALSE, TRUE)[1]
#TRUE

v==c(1,2,0,1)
#    a    b    c    d 
# TRUE TRUE TRUE TRUE 

c(1,2,3) %in% v
# TRUE  TRUE FALSE

vv <- v > 1
vv
# FALSE  TRUE FALSE FALSE
all(v > 1) # Are all of the values true?
# FALSE
any(v > 1) # Are any of the values true?
# TRUE

Special character vectors

letters 
# "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w"
# "x" "y" "z"
LETTERS
# "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W"
# "X" "Y" "Z"
month.abb # abbreviations of months
# "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
month.name
# "January"   "February"  "March"     "April"     "May"       "June"      "July"      "August"    "September"
# "October"   "November"  "December" 
# English month names (these built-in constants do not follow the locale)

Sampling

set.seed(0) # Ensure reproducible results

sample(c(letters[1:5]),5) #== sample(c(letters[1:5]))
#  "e" "b" "d" "c" "a"
sample(c(letters[1:5]),5,replace = T)
#  "b" "e" "e" "d" "d"
nums <- sample(10,10)
#  1  2  9  5  3  4  8  6  7 10
order(nums) #== sort.list(nums)
# 1  2  5  6  4  8  9  7  3 10
sort(nums)
# 1  2  3  4  5  6  7  8  9 10

Identical

x <- vector("numeric", length = 10) 
y <- rep(0,10)
identical(x,y)
# TRUE
vect <- c(foo = 11, bar = 2, foobar = NA)
names(vect)
# "foo"    "bar"    "foobar"
vect2 <- c(11, 2, NA)
names(vect2) <- c("foo", "bar", "foobar")
identical(vect, vect2)
# TRUE

Factors

Basic

Factors are integer vectors with a levels attribute that represent categorical data; they print like character vectors

set.seed(0)
f <- factor(sample(LETTERS[c(1,20,3,7)],10,replace = T))
# [1] G T T C G A G G C C
# Levels: A C G T
f[2]
# [1] T
# Levels: A C G T
f[2:3, drop = TRUE]
# [1] T T
# Levels: T

levels(f)
# "A" "C" "G" "T"
as.character(f)
# [1] "G" "T" "T" "C" "G" "A" "G" "G" "C" "C"
as.numeric(f)
# 3 4 4 2 3 1 3 3 2 2
unclass(f)
#  [1] 3 4 4 2 3 1 3 3 2 2
# attr(,"levels")
# [1] "A" "C" "G" "T"

Levels manipulation

f[4] <- "U"
## "U" not in the levels
## invalid factor level, NA generated
f
#  [1] G    T    T    <NA> G    A    G    G    C    C   
# Levels: A C G T

levels(f) <- c(levels(f),"U")
#== levels(f)[5] <- "U"
f[4] <- "U"
#  [1] G T T U G A G G C C
# Levels: A C G T U

levels(f)[4] <- "U"
#  [1] G U U U G A G G C C
# Levels: A C G U
## Renaming levels of a factor
levels(f) <- list("adenine"="A", "thymine"="T", "cytosine"="C","guanine"="G","uracil"="U")
#  [1] guanine  uracil   uracil   uracil   guanine  adenine  guanine  guanine  cytosine cytosine
# Levels: adenine thymine cytosine guanine uracil

## Drop the extra levels
droplevels(f)
#  [1] guanine  uracil   uracil   uracil   guanine  adenine  guanine  guanine  cytosine cytosine
# Levels: adenine cytosine guanine uracil

Changing the order of levels

Order will determine, for example, how output will be printed, or the arrangement of items on a graph

set.seed(0)
f <- factor(sample(LETTERS[c(1,20,3,7)],10,replace = T),levels = c("A", "T", "C", "G"))
f
# [1] G T T C G A G G C C
# Levels: A T C G
f <- factor(f,levels=rev(levels(f)))
#  [1] G T T C G A G G C C
# Levels: G C T A
f<- relevel(f, "A")
# [1] G T T C G A G G C C
# Levels: A G C T
Bases <- ordered(c("A" ,"T", "C", "G", "G"))
# [1] A T C G G
# Levels: A < C < G < T
Bases<- ordered(Bases, levels = c("A", "T", "C","G"))
# [1] A T C G G
# Levels: A < T < C < G

Lists

Indexing into a list

l <- list( film = list("The Shape of Water", "Dunkirk", "Three Billboards Outside Ebbing, Missouri","Coco","Call Me by Your Name"), score = c(7.3, 8.5,8.7,9.1,8.9),"The Best Film nomination")

## list[name or index] will return a list
l[2]
l["score"]
# $score
# [1] 7.3 8.5 8.7 9.1 8.9

## list[[name or index]] or $name will return a vector
l[[2]]
l[["score"]]
l$score
l[["s", exact = FALSE]]
# [1] 7.3 8.5 8.7 9.1 8.9

l[[1]][[1]]
# "The Shape of Water"
l[[1]][1]
# [[1]]
# [1] "The Shape of Water"

l[-1]
# $score
# [1] 7.3 8.5 8.7 9.1 8.9
# 
# [[2]]
# [1] "The Best Film nomination"

# l[[-1]]
## Error in l[[-1]] : 
##   attempt to select more than one element in get1index <real>

l[[c(2, 3)]] #== l[[2]][[3]]
# 8.7
l[[c(1, 3)]] #== l[[1]][[3]]
# "Three Billboards Outside Ebbing, Missouri"

names(l)
# [1] "film"  "score" ""

l[[3]] <- NULL #== l[3] <- NULL
# $film
# $film[[1]]
# [1] "The Shape of Water"
# ...
# $film[[5]]
# [1] "Call Me by Your Name"
# 
# $score
# [1] 7.3 8.5 8.7 9.1 8.9

Str

l[3] <- "The Best Film nomination"

str(l) #== str(c(l))
# List of 3
 # $ film :List of 5
 #  ..$ : chr "The Shape of Water"
 #  ..$ : chr "Dunkirk"
 #  ..$ : chr "Three Billboards Outside Ebbing, Missouri"
 #  ..$ : chr "Coco"
 #  ..$ : chr "Call Me by Your Name"
 # $ score: num [1:5] 7.3 8.5 8.7 9.1 8.9
 # $      : chr "The Best Film nomination"
c(l) #== l
# $film
# $film[[1]]
# [1] "The Shape of Water"
# 
# ...
# $film[[5]]
# [1] "Call Me by Your Name"
# 
# 
# $score
# [1] 7.3 8.5 8.7 9.1 8.9
# 
# [[3]]
# [1] "The Best Film nomination"
str(c(list = l[2],l[[2]]))
# List of 6
 # $ list.score: num [1:5] 7.3 8.5 8.7 9.1 8.9
 # $           : num 7.3
 # $           : num 8.5
 # $           : num 8.7
 # $           : num 9.1
 # $           : num 8.9
as.list(l[[2]])
# [[1]]
# [1] 7.3
# 
# [[2]]
# [1] 8.5
# 
# [[3]]
# [1] 8.7
# 
# [[4]]
# [1] 9.1
# 
# [[5]]
# [1] 8.9

Matrix

Matrix

Data Frames

Names

# stringsAsFactors = TRUE keeps `sex` a factor (the default became FALSE in
# R 4.0.0); the factor output shown throughout this section relies on that
d <- data.frame(
  subject = 1:4,
  sex = c("M", "F", "F", "M"),
  size = c(4, 2, 5, 3),
  stringsAsFactors = TRUE
)
#   subject sex size
# 1       1   M    4
# 2       2   F    2
# 3       3   F    5
# 4       4   M    3
print(d,row.names = F)
 # subject sex size
 #       1   M    4
 #       2   F    2
 #       3   F    5
 #       4   M    3
dim(d) # nrow(d);ncol(d)
# [1] 4 3

dimnames(d) 

# [[1]]
# [1] "1" "2" "3" "4"
# 
# [[2]]
# [1] "subject" "sex"     "size" 
rownames(d);colnames(d)
# [1] "1" "2" "3" "4"
# [1] "subject" "sex"     "size" 
names(d);colnames(d)
# [1] "subject" "sex"     "size"   
# [1] "subject" "sex"     "size" 

Indexing into a dataframe

d[[2,1]] #== d[2,][[1]]
# 2

## still  "data.frame"
d[1] #== d[,1,drop=F]
#   subject
# 1       1
# 2       2
# 3       3
# 4       4
d[1,]
#   subject sex size
# 1       1   M    4
d[2,][1]
#   subject
# 2       2

## now its atomic vector
d[,2] 
# [1] M F F M
# Levels: F M
d[,1]
d[,"subject"]
d[[1]] 
d[["subject"]] 
d$subject
# [1] 1 2 3 4

which(d$sex!="M")
# 2 3

d[d$sex=="M",]
subset(d, sex == "M")
#   subject sex size
# 1       1   M    4
# 4       4   M    3

## Note: you can't assign a value through the subset() function
d[d$sex=="M",]$sex <- "F"  
#   subject sex size
# 1       1   F    4
# 2       2   F    2
# 3       3   F    5
# 4       4   F    3

Unique & Duplicated

d <- rbind(d,data.frame(subject = c(5,2), sex = "M", size = c(2,5)))
#   subject sex size
# 1       1   F    4
# 2       2   F    2
# 3       3   F    5
# 4       4   F    3
# 5       5   M    2
# 6       2   M    5

unique(d$sex)
# [1] F M
# Levels: F M

match(unique(d$sex),d$sex)
# 1 5

## Find  a duplicate value 
## First instance of a particular value will not be counted
duplicated(d$sex)
# [1] FALSE  TRUE  TRUE  TRUE FALSE TRUE
d[duplicated(d$sex),]
#   subject sex size
# 2       2   F    2
# 3       3   F    5
# 4       4   F    3
# 6       2   M    5
d[!duplicated(d$sex),]
#   subject sex size
# 1       1   F    4
# 5       5   M    2
d[match(unique(d$sex),d$sex),]
#   subject sex size
# 1       1   F    4
# 5       5   M    2

Subset

# Subset of particular rows and columns
subset(d, subject < 3, select = -subject)
subset(d, subject < 3, select = c(sex,size))
subset(d, subject < 3, select = sex:size)
d[d$subject < 3, c("sex","size")]
#   sex size
# 1   F    4
# 2   F    2
# 6   M    5

# Rows where subject is 1 OR sex is "M"
d[d$subject==1 | d$sex=="M",]
subset(d, subject ==1 | sex =="M")
# NOTE(review): with the six-row d built above, row 6 (subject 2, sex M) also
# satisfies this condition, so these two calls return rows 1, 5 and 6
# Rows whose subject is one of 1 or 5 (this is what the output below shows)
d[d$subject%in% c(1,5),]
subset(d, subject %in% c(1,5))
#   subject sex size
# 1       1   F    4
# 5       5   M    2

# Randomizing order
set.seed(0)
d[sample(nrow(d)),]
#   subject sex size
# 6       2   M    5
# 2       2   F    2
# 5       5   M    2
# 4       4   F    3
# 3       3   F    5
# 1       1   F    4

Factors

str(d)
# 'data.frame': 6 obs. of  3 variables:
#  $ subject: num  1 2 3 4 5 2
#  $ sex    : Factor w/ 2 levels "F","M": 1 1 1 1 2 2
#  $ size   : num  4 2 5 3 2 5

## Find the factor columns, convert them to character, and assign them back into d
fac <- vapply(d, is.factor, logical(1))
#== fac <- sapply(d, is.factor)
# subject     sex    size 
#   FALSE    TRUE   FALSE 

## data.frame(d, stringsAsFactors = FALSE) leaves an existing factor column
## unchanged (the argument only affects character input), so the flagged
## columns are converted explicitly
d[fac] <- lapply(d[fac], as.character)
str(d)
# 'data.frame': 6 obs. of  3 variables:
#  $ subject: num  1 2 3 4 5 2
#  $ sex    : chr  "F" "F" "F" "F" ...
#  $ size   : num  4 2 5 3 2 5

Se7en

from wikipedia

Se7en <- list(sins = c("lust", "gluttony", "greed", "sloth", "wrath", "envy", "pride"), 
virtues = c("Chastity", "Temperance", "Charity", "Diligence", "Patience", "Kindness", "Humility"),
gloss = c("purity", "abstinence","Humanity" ,"equanimity","will", "benevolence", "generosity", "sacrifice", "persistence", "Effort", "ethics", 
"forgiveness", "mercy", "satisfaction", "compassion", "bravery", "modesty", "reverence"),
index = c(2,2,4,3,2,2,3))
S <- data.frame(sins = Se7en[[1]],virtues = Se7en[[2]])
g <- split(Se7en$gloss,rep(1:length(Se7en$index),Se7en$index))
# $`1`
# [1] "purity"     "abstinence"
# 
# $`2`
# [1] "Humanity"   "equanimity"
# 
# $`3`
# [1] "will"        "benevolence" "generosity"  "sacrifice"  
# 
# $`4`
# [1] "persistence" "Effort"      "ethics"     
# 
# $`5`
# [1] "forgiveness" "mercy"      
# 
# $`6`
# [1] "satisfaction" "compassion"  
# 
# $`7`
# [1] "bravery"   "modesty"   "reverence"
S$gloss = g
#       sins    virtues                                    gloss
# 1     lust   Chastity                       purity, abstinence
# 2 gluttony Temperance                     Humanity, equanimity
# 3    greed    Charity will, benevolence, generosity, sacrifice
# 4    sloth  Diligence              persistence, Effort, ethics
# 5    wrath   Patience                       forgiveness, mercy
# 6     envy   Kindness                 satisfaction, compassion
# 7    pride   Humility              bravery, modesty, reverence
str(data.frame(S,gloss = I(g)))
# 'data.frame': 7 obs. of  4 variables:
#  $ sins   : Factor w/ 7 levels "envy","gluttony",..: 4 2 3 6 7 1 5
#  $ virtues: Factor w/ 7 levels "Charity","Chastity",..: 2 7 1 3 6 5 4
#  $ gloss  :List of 7
#   ..$ 1: chr  "purity" "abstinence"
#   ..$ 2: chr  "Humanity" "equanimity"
#   ..$ 3: chr  "will" "benevolence" "generosity" "sacrifice"
#   ..$ 4: chr  "persistence" "Effort" "ethics"
#   ..$ 5: chr  "forgiveness" "mercy"
#   ..$ 6: chr  "satisfaction" "compassion"
#   ..$ 7: chr  "bravery" "modesty" "reverence"
#  $ gloss.1:List of 7
#   ..$ 1: chr  "purity" "abstinence"
#   ..$ 2: chr  "Humanity" "equanimity"
#   ..$ 3: chr  "will" "benevolence" "generosity" "sacrifice"
#   ..$ 4: chr  "persistence" "Effort" "ethics"
#   ..$ 5: chr  "forgiveness" "mercy"
#   ..$ 6: chr  "satisfaction" "compassion"
#   ..$ 7: chr  "bravery" "modesty" "reverence"
#   ..- attr(*, "class")= chr "AsIs"

Math

Math

Ly Series

  • lapply: Loop over a list or data.frame and evaluate a function on each element
  • sapply: Same as lapply but try to simplify the result
  • vapply: Same as sapply but lets you specify the type of the return value
  • apply: Apply a function over the margins of an array
  • tapply: Apply a function over subsets of a vector
  • mapply: Multivariate version of lapply

Table & Xtabs

d <- read.table(header = T,text = "
  subject sex size
1       1   F    4
2       2   F    2
3       3   F    5
4       4   F    3
5       5   M    2
")


contingency_table <- table(d[-1])
#    size
# sex 2 3 4 5
#   F 1 1 1 1
#   M 1 0 0 0

## A data frame of counts
counts_table <- as.data.frame(contingency_table) 
#   sex size Freq
# 1   F    2    1
# 2   M    2    1
# 3   F    3    1
# 4   M    3    0
# 5   F    4    1
# 6   M    4    0
# 7   F    5    1
# 8   M    5    0

xtabs(Freq ~ sex + size, counts_table)
#    size
# sex 2 3 4 5
#   F 1 1 1 1
#   M 1 0 0 0


## Convert from data frame of counts to data frame of cases
countsToCases <- function(x, countcol = "Freq") {
    # Expand a data frame of counts into one row per observed case.
    #
    # x:        data frame with one row per combination plus a count column
    # countcol: name of the column holding the counts (default "Freq")
    # Returns a data frame without the count column in which every row of x
    # is repeated as many times as its count (rows with count 0 disappear).

    # seq_len() gives an empty index for a zero-row x, whereas 1:nrow(x)
    # would yield c(1, 0)
    idx <- rep(seq_len(nrow(x)), x[[countcol]])
    # Drop count column
    x[[countcol]] <- NULL
    # drop = FALSE keeps a data frame even when only one column is left
    x[idx, , drop = FALSE]
}

countsToCases(counts_table)
#   sex size
# 1   F    2
# 2   M    2
# 3   F    3
# 5   F    4
# 7   F    5

Tapply & Apply

tapply(d$size , d$sex)
# 1 1 1 1 2
tapply(d$size , d$sex, sum)
#  F  M 
# 14  2 
tapply(d$size , d$sex, range)
# $F
# [1] 2 5
# 
# $M
# [1] 2 2
tapply(d$size , d$sex, range, simplify = T)

apply(d[-2],1,sum)
# 1 2 3 4 5 
# 5 4 8 7 7 
apply(d[-2],2,sum)
# subject    size 
#      15      16 
f <- factor(rep(1:3, 10), levels = 1:5)
tapply(1:30, f, range)
# $`1`
# [1]  1 28
# 
# $`2`
# [1]  2 29
# 
# $`3`
# [1]  3 30
# 
# $`4`
# NULL
# 
# $`5`
# NULL

Lapply & Sapply

dat <- list(a = contingency_table,b = counts_table) 
lapply(dat, function(e) e[,1])
# $a
# F M 
# 1 1 
# 
# $b
# [1] F M F M F M F M
# Levels: F M
lapply(d[-2], sum)
#== sapply(d[-2], sum, simplify = F)
# $subject
# [1] 15
# 
# $size
# [1] 16
sapply(d[-2], sum)
# subject    size 
#      15      16 
sapply(d[-2], tabulate)
#      subject size
# [1,]       1    0
# [2,]       1    2
# [3,]       1    1
# [4,]       1    1
# [5,]       1    1
sapply(6:8, seq)
# [[1]]
# [1] 1 2 3 4 5 6
# 
# [[2]]
# [1] 1 2 3 4 5 6 7
# 
# [[3]]
# [1] 1 2 3 4 5 6 7 8
d <- data.frame(subject = letters[1:5], sex = letters[1:5], size =1:5,stringsAsFactors = F)
data.frame(sapply(d, function(v) {
     if (is.character(v)) return(toupper(v))
     else return(v)
 }))
#   subject sex size
# 1       A   A    1
# 2       B   B    2
# 3       C   C    3
# 4       D   D    4
# 5       E   E    5
a <- 1:10
names(a) <- letters[1:10]
b <- 2:14
names(b) <- letters[3:15]

# a
#  a  b  c  d  e  f  g  h  i  j 
#  1  2  3  4  5  6  7  8  9 10 
# b
#  c  d  e  f  g  h  i  j  k  l  m  n  o 
#  2  3  4  5  6  7  8  9 10 11 12 13 14 

sapply(1:length(a),function(i){
       if (names(a)[i] %in% names(b))
        a[i] + b[[names(a)[i]]] else a[i]
})

 # a  b  c  d  e  f  g  h  i  j 
 # 1  2  5  7  9 11 13 15 17 19 

Mapply

mapply(rep, 1:4, 4:1)
# [[1]]
# [1] 1 1 1 1
# 
# [[2]]
# [1] 2 2 2
# 
# [[3]]
# [1] 3 3
# 
# [[4]]
# [1] 4
mapply(sum, 1:10, 100:90)
# 101 101 101 101 101 101 101 101 101 101  91
# Build one string made of k back-to-back copies of C
word <- function(C, k) {
  copies <- rep(C, k)
  paste0(copies, collapse = "")
}
mapply(word, LETTERS[1:6], 6:1)
#        A        B        C        D        E        F 
# "AAAAAA"  "BBBBB"   "CCCC"    "DDD"     "EE"      "F" 

Control Structures

IF

x <- c(5:-5)
sqrt(ifelse(x >= 0, x, NA))
# [1] 2.236068 2.000000 1.732051 1.414214 1.000000 0.000000       NA       NA       NA
# [10]       NA       NA
## Finding the leap year
# year = as.integer(readline(prompt="Enter a year: "))
year <- 2018
# A year is a leap year when it is divisible by 4 but not by 100,
# or when it is divisible by 400
print(paste(
  year,
  if ((year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0) {
    "is a leap year"
  } else {
    "is not a leap year"
  }
))
# "2018 is not a leap year"

For

## print a-j
x <- letters[1:10]
for(i in 1:4) {
  print(x[i])
}
for(i in seq_along(x)) {
  print(x[i])
}
for(letter in x) {
  print(letter)
}
for(i in 1:length(x)) print(x[i])
x <- c(50:-50)
for (i in 1:length(x)){
if (x[i] < 0) {
      break
  } else if(round(sqrt(x[i]))**2!=x[i]){
      next
  } else{
    cat(x[i]," ")
  }
}
# 49  36  25  16  9  4  1  0 
## Finding the Highest Common Factor/Greatest Common Divisor

hcf <- function(x, y) {
    # Highest common factor (greatest common divisor) of two whole numbers,
    # found by trying every candidate divisor with a for loop.
    #
    # x, y: whole numbers; the sign is ignored
    # Returns the largest value that divides both x and y; when one argument
    # is 0 the other one is returned.

    x <- abs(x)
    y <- abs(y)
    # 1:0 would loop over c(1, 0) and test x %% 0 (NaN), which makes if() fail
    if (x == 0 || y == 0) {
        return(max(x, y))
    }
    # Named `divisor` rather than `hcf` so the function's own name is not shadowed
    divisor <- 1L
    for (i in seq_len(min(x, y))) {
        if ((x %% i == 0) && (y %% i == 0)) {
            divisor <- i
        }
    }
    divisor
}

# num1 = as.integer(readline(prompt = "Enter first number: "))
# num2 = as.integer(readline(prompt = "Enter second number: "))
num1 = 17; num2 = 17*12
print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))
# "The H.C.F. of 17 and 204 is 17"

While

## Finding the H.C.F. using Euclidean algorithm
hcf <- function(x, y) {
    # Euclidean algorithm: keep replacing (x, y) by (y, x mod y) until the
    # remainder is zero; the value left in x is the highest common factor
    while (y != 0) {
        remainder <- x %% y
        x <- y
        y <- remainder
    }
    x
}
num1 = 16*2; num2 = 3*16
print(paste("The H.C.F. of", num1,"and", num2,"is", hcf(num1, num2)))
# "The H.C.F. of 32 and 48 is 16"
## Finding the Lowest Common Multiple 

lcm <- function(x, y) {
    # Lowest common multiple of two whole numbers, found with a while loop.
    #
    # x, y: whole numbers; the sign is ignored
    # Returns the smallest positive value divisible by both x and y, or 0
    # when either argument is 0.
    # NOTE(review): non-whole inputs are not supported; the search may never
    # terminate for them.

    x <- abs(x)
    y <- abs(y)
    # Testing greater %% 0 gives NaN, which makes the loop condition fail
    if (x == 0 || y == 0) {
        return(0)
    }
    greater <- max(x, y)
    smaller <- min(x, y)
    # Only multiples of the greater number can be common multiples, so step
    # through those instead of through every integer
    candidate <- greater
    while (candidate %% smaller != 0) {
        candidate <- candidate + greater
    }
    candidate
}
num1 = 16*2; num2 = 3*16
print(paste("The L.C.M. of", num1,"and", num2,"is", lcm(num1, num2)))
# "The L.C.M. of 32 and 48 is 96"

And we can conclude that Number1 * Number2 = L.C.M. * G.C.D

Strings

Strings

Dirty Data

Strings

REFERENCES