数据集(Datasets)

Iris

Iris数据集是著名的分类实验数据集,数据由Anderson采集,由Fisher于1936年整理发表。该数据集由3种不同类型的鸢尾花构成,每种各50个样本。
其中的一个种类与另外两个种类是线性可分离的,后两个种类是非线性可分离的。

该数据集包含了5个属性:

  • Sepal.Length(花萼长度)
  • Sepal.Width(花萼宽度)
  • Petal.Length(花瓣长度)
  • Petal.Width(花瓣宽度)
  • Species(种类):setosa(山鸢尾)、versicolor(杂色鸢尾)、virginica(维吉尼亚鸢尾)
# Show the first and the last five rows of iris stacked into one data frame
do.call(rbind, list(head(iris, n = 5L), tail(iris, n = 5L)))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1            5.1         3.5          1.4         0.2    setosa
## 2            4.9         3.0          1.4         0.2    setosa
## 3            4.7         3.2          1.3         0.2    setosa
## 4            4.6         3.1          1.5         0.2    setosa
## 5            5.0         3.6          1.4         0.2    setosa
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica

Converting data between wide and long (长宽数据转换)

长数据(Long Format)的变量不是分布在各个列上, 而是排成一列, 每一个变量都分别占其中的若干行, 这样便于对每个变量分组, R偏向于使用长数据。

宽数据(Wide Format)的变量则由每一列表示,比如常见的Excel、SPSS等。

tidyr中的gather()和spread()能够聚集(gather)和伸展(spread)数据框中的数据。

reshape2中的melt()和dcast()能够灵活地融合(melt)和重铸(cast,如dcast)数据框中的数据。

R中自带的reshape()、stack()、unstack()也可以实现, 不过稍显繁琐, 这里不加赘述。

library(reshape2)
library(tidyr)
library(dplyr)
library(ggplot2)

Wide to Long format(宽转长)

Tidyr

gather(data, key = “key”, value = “value”, …, na.rm = FALSE, convert = FALSE, factor_key = FALSE)

## Stack the four measurement columns into key/value pairs:
##   Attributes  - name of the original column
##   Measurement - the value taken from that column
## factor_key = TRUE stores the new key column as a factor (instead of a
## character vector). TRUE is spelled out because T is an ordinary variable
## that can be reassigned.
long_iris <- gather(iris, Attributes, Measurement,
                    Sepal.Length:Petal.Width,
                    factor_key = TRUE)

## These would have the same result as above
# long_iris <- iris %>% gather(Attributes, Measurement, Sepal.Length:Petal.Width, factor_key = TRUE)
# long_iris <- gather(iris, Attributes, Measurement, -Species, factor_key = TRUE)
# long_iris <- gather(iris, "Attributes", "Measurement", colnames(iris)[1:4], factor_key = TRUE)
# long_iris <- gather(iris, Attributes, Measurement, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, factor_key = TRUE)
##       Species   Attributes Measurement
## 1      setosa Sepal.Length         5.1
## 2      setosa Sepal.Length         4.9
## 3      setosa Sepal.Length         4.7
## 4      setosa Sepal.Length         4.6
## 5      setosa Sepal.Length         5.0
## 596 virginica  Petal.Width         2.3
## 597 virginica  Petal.Width         1.9
## 598 virginica  Petal.Width         2.0
## 599 virginica  Petal.Width         2.3
## 600 virginica  Petal.Width         1.8

Reshape2

melt(data, …, na.rm = FALSE, value.name = “value”)

## Melt iris into long format: one row per (flower, attribute) pair.
long_iris <- melt(
  iris,
  # ID variables - the columns kept as identifiers on every output row
  id.vars = "Species",
  # Measured variables - the columns that are stacked on top of each other
  measure.vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
  # Name of the column that stores the original column names
  variable.name = "Attributes",
  # Name of the column that stores the stacked values
  value.name = "Measurement"
)
## This would have the same result as above
## (every column that is not an ID variable is treated as measured)
# long_iris <- melt(iris, "Species", variable.name = "Attributes", value.name = "Measurement")
##       Species   Attributes Measurement
## 1      setosa Sepal.Length         5.1
## 2      setosa Sepal.Length         4.9
## 3      setosa Sepal.Length         4.7
## 4      setosa Sepal.Length         4.6
## 5      setosa Sepal.Length         5.0
## 596 virginica  Petal.Width         2.3
## 597 virginica  Petal.Width         1.9
## 598 virginica  Petal.Width         2.0
## 599 virginica  Petal.Width         2.3
## 600 virginica  Petal.Width         1.8

Long to Wide format(长转宽)

宽数据的列顺序由变量的factor levels决定

如果你的数据有duplicated row, 需建立索引

显然long_iris含有duplicated rows

Find duplicated rows

# Two base-R ways of dropping repeated rows:
#   1. index with the positions of the rows that are not duplicates
#   2. let unique() do the same thing in a single call
long_iris1 <- long_iris[which(!duplicated(long_iris)), ]
long_iris2 <- unique(long_iris)

# Both approaches keep the same rows and the same row names
identical(long_iris1, long_iris2)
# TRUE
dim(long_iris1)
# 175   3
## # A tibble: 10 x 4
##    rowname Species   Attributes   Measurement
##  * <fct>   <fct>     <fct>              <dbl>
##  1 1       setosa    Sepal.Length        5.10
##  2 2       setosa    Sepal.Length        4.90
##  3 3       setosa    Sepal.Length        4.70
##  4 4       setosa    Sepal.Length        4.60
##  5 5       setosa    Sepal.Length        5.00
##  6 565     virginica Petal.Width         2.40
##  7 566     virginica Petal.Width         2.30
##  8 570     virginica Petal.Width         1.50
##  9 580     virginica Petal.Width         1.60
## 10 585     virginica Petal.Width         1.40
## The dplyr way: distinct() keeps the first occurrence of every row
library(dplyr)
long_iris3 <- distinct(long_iris)
## Same result, with the columns to compare listed explicitly:
# long_iris %>%
#     distinct(Species, Attributes, Measurement, .keep_all = TRUE)
## # A tibble: 10 x 4
##    rowname Species   Attributes   Measurement
##  * <fct>   <fct>     <fct>              <dbl>
##  1 1       setosa    Sepal.Length        5.10
##  2 2       setosa    Sepal.Length        4.90
##  3 3       setosa    Sepal.Length        4.70
##  4 4       setosa    Sepal.Length        4.60
##  5 5       setosa    Sepal.Length        5.00
##  6 171     virginica Petal.Width         2.40
##  7 172     virginica Petal.Width         2.30
##  8 173     virginica Petal.Width         1.50
##  9 174     virginica Petal.Width         1.60
## 10 175     virginica Petal.Width         1.40

Note: all dplyr methods ignore rownames

# Compare the base-R result (target) with the dplyr result (current)
all.equal(target = long_iris1, current = long_iris3)
# "Attributes: < Component “row.names”: Mean relative difference: 0.6602903 "

Tidyr

spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)

# wide_iris <- spread(long_iris, Attributes, Measurement)
## ERROR! duplicate identifiers for rows
## Species alone does not identify a flower, so build an index that does.

# One index value per original iris row, repeated for every attribute block;
# derived from the data instead of hard-coding 150 rows x 4 attributes
long_iris$index <- rep(seq_len(nrow(iris)), length.out = nrow(long_iris))
wide_iris <- spread(long_iris, Attributes, Measurement)

# Drop the helper index and restore the original column order by name
wide_iris <- wide_iris[, colnames(iris)]

identical(wide_iris, iris)
# TRUE

Reshape2

# wide_iris <- dcast(long_iris, Species ~ Attributes, value.var = "Measurement")
## Not an error but a message: "Aggregation function missing: defaulting to length"
## Each Species x Attributes cell receives many values, so dcast() counts them.

## Option 1: supply fun.aggregate to collapse every cell explicitly
wide_iris <- dcast(long_iris, Species ~ Attributes, value.var = "Measurement",
                   fun.aggregate = function(x) toString(unique(x)))
## Option 2: add the index column to the row identifiers so that every cell
## holds exactly one value (this overwrites the result of option 1)
wide_iris <- dcast(long_iris, Species + index ~ Attributes, value.var = "Measurement")
# Drop the helper index and restore the original column order by name
wide_iris <- wide_iris[, colnames(iris)]
identical(wide_iris, iris)
## [1] TRUE

Merging data

Merge two data frames by common columns or row names

merge(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(“.x”,“.y”), incomparables = NULL, …)

# Make up some data.
# animals: lookup table of known animals (size, type, name).
# NOTE(review): type "dig" on the "shetland sheepdog" row looks like a typo
# for "dog"; it is kept because the printed merge results show it.
animals <- read.table(text='
                      size  type  name
                      small  cat  Dragen-Li
                      small  cat  lynx
                      big  cat  tiger
                      small  dog  chihuahua
                      big  dog  "great dane"
                      middle  dog  "siberian husky"
                      small  dog  dachshund
                      big  dog  "chinese kunmin"
                      small  dig  "shetland sheepdog"
                      big   dog  "German Shepherd"
                      ', header = TRUE)

# observations: numbered sightings; some names are missing from animals
observations <- read.table(text='
                           number  size type  name
                           1  big  cat  tiger
                           2  small  dog  dachshund
                           3  small  dog  chihuahua
                           4  big  dog  "chinese kunmin"
                           5  middle dog poodle
                           6  big  dog  samoyed
                           7  middle  dog  "chow chow"
                           8  small  dog  papillon
                           ', header = TRUE)

# Inner join on every column the two tables share (size, type and name)
merge(x = observations, y = animals)
## These calls return the same result
# merge(animals, observations)
# merge(observations, animals, by = c("size", "type", "name"))

#    size type           name number
# 1   big  cat          tiger      1
# 2   big  dog chinese kunmin      4
# 3 small  dog      chihuahua      3
# 4 small  dog      dachshund      2
# Join on name only; the other shared columns are kept with .x/.y suffixes
merge(observations, animals, by = "name")
##             name number size.x type.x size.y type.y
## 1      chihuahua      3  small    dog  small    dog
## 2 chinese kunmin      4    big    dog    big    dog
## 3      dachshund      2  small    dog  small    dog
## 4          tiger      1    big    cat    big    cat
# Left join: keep every observation, filling unmatched animal columns with NA
merge(observations, animals, by.x = "name", by.y = "name", all.x = TRUE)
##             name number size.x type.x size.y type.y
## 1      chihuahua      3  small    dog  small    dog
## 2 chinese kunmin      4    big    dog    big    dog
## 3      chow chow      7 middle    dog   <NA>   <NA>
## 4      dachshund      2  small    dog  small    dog
## 5       papillon      8  small    dog   <NA>   <NA>
## 6         poodle      5 middle    dog   <NA>   <NA>
## 7        samoyed      6    big    dog   <NA>   <NA>
## 8          tiger      1    big    cat    big    cat
# Left join without sorting the result on the key column
merge(observations, animals, by.x = "name", by.y = "name", all.x = TRUE, sort = FALSE)
##             name number size.x type.x size.y type.y
## 1          tiger      1    big    cat    big    cat
## 2      dachshund      2  small    dog  small    dog
## 3      chihuahua      3  small    dog  small    dog
## 4 chinese kunmin      4    big    dog    big    dog
## 5         poodle      5 middle    dog   <NA>   <NA>
## 6        samoyed      6    big    dog   <NA>   <NA>
## 7      chow chow      7 middle    dog   <NA>   <NA>
## 8       papillon      8  small    dog   <NA>   <NA>
# Full outer join on the shared columns: keep unmatched rows from both tables
merge(observations, animals, all = TRUE)
##      size type              name number
## 1     big  cat             tiger      1
## 2     big  dog    chinese kunmin      4
## 3     big  dog           samoyed      6
## 4     big  dog   German Shepherd     NA
## 5     big  dog        great dane     NA
## 6  middle  dog         chow chow      7
## 7  middle  dog            poodle      5
## 8  middle  dog    siberian husky     NA
## 9   small  cat         Dragen-Li     NA
## 10  small  cat              lynx     NA
## 11  small  dog         chihuahua      3
## 12  small  dog         dachshund      2
## 13  small  dog          papillon      8
## 14  small  dig shetland sheepdog     NA

Dplyr

dplyr provides a flexible grammar of data manipulation. It’s the next iteration of plyr, focused on tools for working with data frames (hence the d in the name).

# Work on a copy so that the built-in dataset stays available for comparison
df <- iris

## glimpse() from dplyr prints one line per column (name, type, first values)
## and truncates each line to the width of the console
glimpse(df)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...

Ordering data

To reverse the direction of a particular column, the method depends on the data type:

  • Numbers: put a - in front of the variable name, e.g. df[ order(-df$weight), ].

  • Factors: convert to integer and put a - in front of the variable name, e.g. df[ order(-xtfrm(df$size)), ].

  • Characters: there isn’t a simple way to do this. One method is to convert to a factor first and then sort as above.

Note: all dplyr methods ignore rownames

## Sort by length, then by width
arrange(df, Sepal.Length, Sepal.Width)          # arrange() from dplyr
df[order(df$Sepal.Length, df$Sepal.Width), ]    # built-in R functions

## Sort by all columns in the data frame, from left to right
df[do.call(order, as.list(df)), ]

## Reverse sort by the Sepal.Length column. These all have the same effect:
arrange(df, desc(Sepal.Length))
df[order(df$Sepal.Length, decreasing = TRUE), ]
df[order(-df$Sepal.Length), ]

## Sort by length (increasing), then by width (decreasing)
arrange(df, Sepal.Length, desc(Sepal.Width))    # arrange() from dplyr
df[order(df$Sepal.Length, -df$Sepal.Width), ]   # built-in R functions

## Sort by Species (decreasing), then by Sepal.Length (increasing)
## A factor cannot be negated directly: desc() and xtfrm() first convert it
## to a numeric vector that sorts in the same order
arrange(df, desc(Species), Sepal.Length)            # arrange() from dplyr
df[order(-xtfrm(df$Species), df$Sepal.Length), ]    # built-in R functions

Recoding data

The easiest way is to use revalue() or mapvalues() in plyr

Alternatively, use recode() in dplyr

cut in R will split the data to specified groups

Note: For numeric vectors, revalue() won’t work, since it uses a named vector, and the names are always strings, not numbers. mapvalues() will work, though.

library(plyr)
library(dplyr)
# Several ways of deriving flora ("1", "2", "3") from Species. Each
# assignment overwrites the previous result. Functions are namespaced
# because plyr and dplyr are both attached and mask each other's names.

# 1. dplyr::recode(): pairs of old value = new value
df$flora <- dplyr::recode(df$Species, "setosa"=1, "versicolor"=2, "virginica"=3)

# 2. Base R: overwrite the existing column, one old value at a time
df$flora[df$Species=="setosa"] <- "1"
df$flora[df$Species=="versicolor"] <- "2"
df$flora[df$Species=="virginica"] <- "3"
# Convert the column to a factor
df$flora <- factor(df$flora)

# 3. plyr::revalue(): named character vector of old = new
df$flora <- plyr::revalue(df$Species, c("setosa"="1", "versicolor"="2", "virginica"="3"))
# 4. plyr::mapvalues(): parallel vectors of old and new values
df$flora <- plyr::mapvalues(df$Species, from = c("setosa", "versicolor", "virginica"), to = c("1", "2", "3"))
# 5. Base R: find each Species in oldvalues, then pick the new value at
#    the same position
oldvalues <- c("setosa", "versicolor", "virginica")
newvalues <- factor(c("1", "2", "3"))  # Make this a factor
df$flora <- newvalues[ match(df$Species, oldvalues) ]
## # A tibble: 10 x 7
##    rowname Sepal.Length Sepal.Width Petal.Length Petal.Width Species flora
##  * <fct>          <dbl>       <dbl>        <dbl>       <dbl> <fct>   <fct>
##  1 1               5.10        3.50         1.40       0.200 setosa  1    
##  2 2               4.90        3.00         1.40       0.200 setosa  1    
##  3 3               4.70        3.20         1.30       0.200 setosa  1    
##  4 4               4.60        3.10         1.50       0.200 setosa  1    
##  5 5               5.00        3.60         1.40       0.200 setosa  1    
##  6 146             6.70        3.00         5.20       2.30  virgin~ 3    
##  7 147             6.30        2.50         5.00       1.90  virgin~ 3    
##  8 148             6.50        3.00         5.20       2.00  virgin~ 3    
##  9 149             6.20        3.40         5.40       2.30  virgin~ 3    
## 10 150             5.90        3.00         5.10       1.80  virgin~ 3

When using cut, by default, the ranges are open on the left, and closed on the right, as in (7,9].
To set it so that ranges are closed on the left and open on the right, like [7,9), use right=FALSE.

# Bin Sepal.Length into three labelled ranges; right = TRUE is the default
# and closes each interval on the right: (-Inf,5], (5,7], (7,Inf]
df$category <- cut(
  df$Sepal.Length,
  breaks = c(-Inf, 5, 7, Inf),
  labels = c("low", "medium", "high"),
  right = TRUE
)

rename in dplyr and plyr

# Rename the columns in a data frame.
# Base-R pattern for one column:
#   names(df)[names(df) == "old"] <- "new"
# plyr::rename() takes a named vector c(old = new). It is called through
# its namespace because dplyr::rename() expects new = old pairs, so the
# unqualified call only works while plyr masks dplyr.
df <- plyr::rename(df, c("Sepal.Length" = "S.Length", "Sepal.Width" = "S.Width",
                         "Petal.Length" = "P.Length", "Petal.Width" = "P.Width"))
## # A tibble: 10 x 8
##    rowname S.Length S.Width P.Length P.Width Species   flora category
##  * <fct>      <dbl>   <dbl>    <dbl>   <dbl> <fct>     <fct> <fct>   
##  1 1           5.10    3.50     1.40   0.200 setosa    1     medium  
##  2 2           4.90    3.00     1.40   0.200 setosa    1     low     
##  3 3           4.70    3.20     1.30   0.200 setosa    1     low     
##  4 4           4.60    3.10     1.50   0.200 setosa    1     low     
##  5 5           5.00    3.60     1.40   0.200 setosa    1     low     
##  6 146         6.70    3.00     5.20   2.30  virginica 3     medium  
##  7 147         6.30    2.50     5.00   1.90  virginica 3     medium  
##  8 148         6.50    3.00     5.20   2.00  virginica 3     medium  
##  9 149         6.20    3.40     5.40   2.30  virginica 3     medium  
## 10 150         5.90    3.00     5.10   1.80  virginica 3     medium

Select & Filter & mutate

colnames(df)
# [1] "S.Length" "S.Width"  "P.Length" "P.Width"  "Species" 
# [6] "flora"    "category"

## Column selection with select() and its helpers: names and ranges add
## columns, a leading minus removes them again
df %>%
  select(
    S.Length, P.Length:P.Width,
    contains("flora"), -starts_with("P.Length"),
    -S.Length, ends_with("ory"),
    matches("S\\.W.*")
  ) %>%
  head()
#   P.Width flora category S.Width
# 1     0.2     1   medium     3.5
# 2     0.2     1      low     3.0
# 3     0.2     1      low     3.2
# 4     0.2     1      low     3.1
# 5     0.2     1      low     3.6
# 6     0.4     1   medium     3.9

## using filter method
## flora is a factor: go through as.character() so the comparison uses the
## labels ("1", "2", "3") and not the internal level codes, which only
## happen to coincide with the labels here
df %>%
  filter(category == "medium" & as.numeric(as.character(flora)) > 1) %>%
  head(5)
#   S.Length S.Width P.Length P.Width    Species flora category
# 1      7.0     3.2      4.7     1.4 versicolor     2   medium
# 2      6.4     3.2      4.5     1.5 versicolor     2   medium
# 3      6.9     3.1      4.9     1.5 versicolor     2   medium
# 4      5.5     2.3      4.0     1.3 versicolor     2   medium
# 5      6.5     2.8      4.6     1.5 versicolor     2   medium

Using mutate

# Round trip: glue two columns into one string, split it, and glue it again
df %>%
  select(S.Width, flora) %>%
  arrange(desc(flora)) %>%
  # build "<width>-<flora>" with mutate()
  mutate(mesc = paste(S.Width, flora, sep = "-")) %>%
  # separate() from tidyr splits the string at the dash into two columns
  separate(mesc, into = c("Sepal.Width", "Flora"), sep = "-") %>%
  # unite() from tidyr joins the two columns back together
  unite("mesc", Sepal.Width, Flora, sep = "-") %>%
  head(5)
##   S.Width flora  mesc
## 1     3.3     3 3.3-3
## 2     2.7     3 2.7-3
## 3     3.0     3   3-3
## 4     2.9     3 2.9-3
## 5     3.0     3   3-3

Summarise

Basic

## Five-number summary, mean and group size of S.Length for every species.
## The summaries are written out explicitly instead of using the deprecated
## funs() helper. dplyr::summarise() is namespaced because plyr is attached
## after dplyr in this file and masks summarise() with its own version.
df %>%
  group_by(Species) %>%
  dplyr::summarise(
    min    = min(S.Length, na.rm = TRUE),
    Q1     = quantile(S.Length, probs = 0.25),
    med    = median(S.Length),
    Q3     = quantile(S.Length, probs = 0.75),
    max    = max(S.Length, na.rm = TRUE),
    mean   = mean(S.Length, na.rm = TRUE),
    counts = dplyr::n()
  )
## # A tibble: 3 x 8
##   Species      min    Q1   med    Q3   max  mean counts
##   <fct>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <int>
## 1 setosa      4.30  4.80  5.00  5.20  5.80  5.01     50
## 2 versicolor  4.90  5.60  5.90  6.30  7.00  5.94     50
## 3 virginica   4.90  6.22  6.50  6.90  7.90  6.59     50

Note:

Lead and lag : Find the “next” or “previous” values in a vector. Useful for comparing values ahead of or behind the current values.

# Count the rows in each category, then compare every count with the count
# of the category listed just before it (the first row has no predecessor)
df %>%
  group_by(category) %>%
  tally() %>%
  mutate(change = n - dplyr::lag(n))
## # A tibble: 3 x 3
##   category     n change
##   <fct>    <int>  <int>
## 1 low         32     NA
## 2 medium     106     74
## 3 high        12    -94
# The same counts without tally(). summarise() and n() are namespaced
# because plyr masks summarise() once it is attached after dplyr:
# df %>%
#   group_by(category) %>%
#   dplyr::summarise(n = dplyr::n()) %>%
#   mutate(change = n - lag(n))

## Rows with the three largest petal lengths within each category.
## min_rank() gives tied values the same rank, so a group can return more
## than three rows.
df %>%
  group_by(category) %>%
  select(P.Length, P.Width, Species) %>%
  filter(min_rank(desc(P.Length)) <= 3) %>%  ##== top_n(3, P.Length)
  arrange(category, desc(P.Length))
## # A tibble: 11 x 4
## # Groups:   category [3]
##    category P.Length P.Width Species   
##    <fct>       <dbl>   <dbl> <fct>     
##  1 low          4.50    1.70 virginica 
##  2 low          3.50    1.00 versicolor
##  3 low          3.30    1.00 versicolor
##  4 low          3.30    1.00 versicolor
##  5 medium       6.00    2.50 virginica 
##  6 medium       5.90    2.30 virginica 
##  7 medium       5.80    2.20 virginica 
##  8 medium       5.80    1.80 virginica 
##  9 high         6.90    2.30 virginica 
## 10 high         6.70    2.20 virginica 
## 11 high         6.70    2.00 virginica
## Draw a fixed number of rows at random, without replacement.
## The seed makes the draw reproducible.
set.seed(0)
sample_n(df, size = 5)
##     S.Length S.Width P.Length P.Width    Species flora category
## 135      6.1     2.6      5.6     1.4  virginica     3   medium
## 40       5.1     3.4      1.5     0.2     setosa     1   medium
## 56       5.7     2.8      4.5     1.3 versicolor     2   medium
## 85       5.4     3.0      4.5     1.5 versicolor     2   medium
## 133      6.4     2.8      5.6     2.2  virginica     3   medium
## Draw a fraction of the rows at random, with replacement
## (the same row may therefore appear more than once)
sample_frac(df, size = 0.05, replace = TRUE)
##      S.Length S.Width P.Length P.Width    Species flora category
## 31        4.8     3.1      1.6     0.2     setosa     1      low
## 135       6.1     2.6      5.6     1.4  virginica     3   medium
## 142       6.9     3.1      5.1     2.3  virginica     3   medium
## 100       5.7     2.8      4.1     1.3 versicolor     2   medium
## 95        5.6     2.7      4.2     1.3 versicolor     2   medium
## 10        4.9     3.1      1.5     0.1     setosa     1      low
## 31.1      4.8     3.1      1.6     0.2     setosa     1      low
## 27        5.0     3.4      1.6     0.4     setosa     1      low

Putting some NAs

## plyr: split df by Species and category, summarise each piece, and bind
## the per-piece results into one data frame.
## se can refer to sd and N because summarise() evaluates its arguments in
## order.
plyr::ddply(
  .data = df,
  .variables = c("Species", "category"),
  .fun = plyr::summarise,
  N    = length(S.Length),
  mean = mean(S.Length),
  sd   = sd(S.Length),
  se   = sd / sqrt(N)
)
##      Species category  N     mean         sd         se
## 1     setosa      low 28 4.764286 0.22146697 0.04185332
## 2     setosa   medium 22 5.313636 0.22317077 0.04758017
## 3 versicolor      low  3 4.966667 0.05773503 0.03333333
## 4 versicolor   medium 47 5.997872 0.46741163 0.06817899
## 5  virginica      low  1 4.900000         NA         NA
## 6  virginica   medium 37 6.345946 0.35558605 0.05845799
## 7  virginica     high 12 7.475000 0.27010099 0.07797144
## Using dplyr
## The summaries are written out explicitly instead of using the deprecated
## funs() helper. dplyr::summarise() is namespaced because plyr masks
## summarise() in this file. se reuses the sd and N columns created just
## before it.
df %>%
  group_by(Species, category) %>%
  dplyr::summarise(
    N    = dplyr::n(),
    mean = mean(S.Length),
    sd   = sd(S.Length),
    se   = sd / sqrt(N)
  )
## # A tibble: 7 x 6
## # Groups:   Species [?]
##   Species    category     N  mean       sd       se
##   <fct>      <fct>    <int> <dbl>    <dbl>    <dbl>
## 1 setosa     low         28  4.76   0.221    0.0419
## 2 setosa     medium      22  5.31   0.223    0.0476
## 3 versicolor low          3  4.97   0.0577   0.0333
## 4 versicolor medium      47  6.00   0.467    0.0682
## 5 virginica  low          1  4.90 NaN      NaN     
## 6 virginica  medium      37  6.35   0.356    0.0585
## 7 virginica  high        12  7.48   0.270    0.0780
# Copy df and blank out ten sepal lengths to simulate missing data
dfNA <- df
dfNA[11:20, "S.Length"] <- NA

## plyr: the same grouped summary, now skipping the missing values.
## N counts only the observed values, so se is based on the real sample size.
plyr::ddply(
  .data = dfNA,
  .variables = c("Species", "category"),
  .fun = plyr::summarise,
  N    = sum(!is.na(S.Length)),
  mean = mean(S.Length, na.rm=TRUE),
  sd   = sd(S.Length, na.rm=TRUE),
  se   = sd / sqrt(N)
)
##      Species category  N     mean         sd         se
## 1     setosa      low 25 4.780000 0.21408721 0.04281744
## 2     setosa   medium 15 5.246667 0.15522641 0.04007929
## 3 versicolor      low  3 4.966667 0.05773503 0.03333333
## 4 versicolor   medium 47 5.997872 0.46741163 0.06817899
## 5  virginica      low  1 4.900000         NA         NA
## 6  virginica   medium 37 6.345946 0.35558605 0.05845799
## 7  virginica     high 12 7.475000 0.27010099 0.07797144
# The same summary with a custom function: ddply() calls .fun once per
# Species x category piece and forwards the extra argument ("S.Length"),
# so the column to summarise is chosen by name.
ddply(dfNA, c("Species", "category"),
  .fun = function(piece, col) {
    values <- piece[[col]]
    c(N    = sum(!is.na(values)),
      mean = mean(values, na.rm = TRUE),
      sd   = sd(values, na.rm = TRUE)
    )
  },
  "S.Length"
)
##      Species category  N     mean         sd
## 1     setosa      low 25 4.780000 0.21408721
## 2     setosa   medium 15 5.246667 0.15522641
## 3 versicolor      low  3 4.966667 0.05773503
## 4 versicolor   medium 47 5.997872 0.46741163
## 5  virginica      low  1 4.900000         NA
## 6  virginica   medium 37 6.345946 0.35558605
## 7  virginica     high 12 7.475000 0.27010099
## Using dplyr
## Drop the missing sepal lengths first, then summarise each group.
## The summaries are written out explicitly instead of using the deprecated
## funs() helper. dplyr::summarise() is namespaced because plyr masks
## summarise() in this file.
dfNA %>%
  filter(!is.na(S.Length)) %>%
  group_by(Species, category) %>%
  dplyr::summarise(
    N    = dplyr::n(),
    mean = mean(S.Length),
    sd   = sd(S.Length),
    se   = sd / sqrt(N)
  )
## # A tibble: 7 x 6
## # Groups:   Species [?]
##   Species    category     N  mean       sd       se
##   <fct>      <fct>    <int> <dbl>    <dbl>    <dbl>
## 1 setosa     low         25  4.78   0.214    0.0428
## 2 setosa     medium      15  5.25   0.155    0.0401
## 3 versicolor low          3  4.97   0.0577   0.0333
## 4 versicolor medium      47  6.00   0.467    0.0682
## 5 virginica  low          1  4.90 NaN      NaN     
## 6 virginica  medium      37  6.35   0.356    0.0585
## 7 virginica  high        12  7.48   0.270    0.0780

Keep rownames in dplyr

name_rows() (from plyr): provides a way to preserve row names by converting them to an explicit column in the data frame

# Copy the row names into a .rownames column before filtering, so the
# original row of each match can still be identified afterwards
df %>%
  plyr::name_rows() %>%
  filter(category == "medium", S.Width >= 4)
##   S.Length S.Width P.Length P.Width Species flora category .rownames
## 1      5.8     4.0      1.2     0.2  setosa     1   medium        15
## 2      5.7     4.4      1.5     0.4  setosa     1   medium        16
## 3      5.2     4.1      1.5     0.1  setosa     1   medium        33
## 4      5.5     4.2      1.4     0.2  setosa     1   medium        34

Pipelining(chaining)

The Treachery of Images - Ceci n’est pas une pipe

library(magrittr)
library(babynames)
# A braced expression may use the piped value (.) several times.
# The tee pipe %T>% runs plot() for its side effect and passes its own
# left-hand side (not plot()'s return value) on to summary().
iris %>%
  {
    rbind(head(., 5), tail(., 5))
  } %T>%
  plot() %>%
  summary()

##   Sepal.Length    Sepal.Width    Petal.Length    Petal.Width   
##  Min.   :4.600   Min.   :2.50   Min.   :1.300   Min.   :0.200  
##  1st Qu.:4.925   1st Qu.:3.00   1st Qu.:1.400   1st Qu.:0.200  
##  Median :5.500   Median :3.05   Median :3.250   Median :1.000  
##  Mean   :5.590   Mean   :3.13   Mean   :3.290   Mean   :1.130  
##  3rd Qu.:6.275   3rd Qu.:3.35   3rd Qu.:5.175   3rd Qu.:1.975  
##  Max.   :6.700   Max.   :3.60   Max.   :5.400   Max.   :2.300  
##        Species 
##  setosa    :5  
##  versicolor:0  
##  virginica :5  
##                
##                
## 
# The exposition pipe %$% makes the columns of its left-hand side visible
# by name to the expression on its right
iris %>%
  subset(Sepal.Length > mean(Sepal.Length)) %$%
  cor(x = Sepal.Length, y = Sepal.Width)
## [1] 0.3361992
# A plain-pipe way of computing the same correlation (returned as a matrix):
# iris %>%
#   subset(Sepal.Length > mean(Sepal.Length)) %>%
#   select(Sepal.Length, Sepal.Width) %>%
#   cor

# add() is the magrittr alias of `+`, so it can be used as a pipeline step
iris %>%
  subset(Sepal.Length > mean(Sepal.Length)) %$%
  add(Sepal.Length, Sepal.Width) %>%
  head()
## [1] 10.2  9.6 10.0  9.3  9.6  9.5
# The assignment pipe %<>% pipes the left-hand side through the function
# and stores the result back into it; this overwrites the column in the
# working copy of iris
iris$Sepal.Length %<>% sqrt()
# Long form of the same statement:
# iris$Sepal.Length <-
#   iris$Sepal.Length %>%
#   sqrt()
# Keep the rows whose name starts with "Sa". The column is referenced
# directly inside filter() rather than through babynames$name, so the
# condition always applies to the data frame being filtered.
dat <- filter(babynames, startsWith(name, "Sa"))
# # A tibble: 6 x 5
#    year sex   name         n     prop
#   <dbl> <chr> <chr>    <int>    <dbl>
# 1  1880 F     Sarah     1288 0.0132  
# 2  1880 F     Sallie     404 0.00414 
# 3  1880 F     Sadie      317 0.00325 
# 4  1880 F     Sara       165 0.00169 
# 5  1880 F     Sally       80 0.000820
# 6  1880 F     Samantha    21 0.000215
dim(dat)
# 27093     5
extract [
extract2 [[
inset [<-
inset2 [[<-
use_series $
add +
subtract -
multiply_by *
raise_to_power ^
multiply_by_matrix %*%
divide_by /
divide_by_int %/%
mod %%
is_in %in%
and &
or |
equals ==
is_greater_than >
is_weakly_greater_than >=
is_less_than <
is_weakly_less_than <=
not (n'est pas) !
set_colnames colnames<-
set_rownames rownames<-
set_names names<-
## qplot
# babynames %>%
#           filter(name %>% substr(1, 2) %>% equals("Sa")) %>%
#           group_by(year, sex) %>%
#                     select(n) %>% 
#           summarise_all(funs(total = sum(n)))%>%
#           qplot(year, total, color = sex, data=., geom="line") %>%
#           add(ggtitle("Names starting with Sa by pipe")) %>%
#           add(theme_minimal()) %>%
#           print

# Yearly totals, per sex, of the babies whose name starts with "Sa", drawn
# as one line per sex. equals() and add() are the magrittr aliases of `==`
# and `+`. The total is computed with an explicit dplyr::summarise() instead
# of the deprecated funs() helper; it is namespaced because plyr masks
# summarise() in this file.
babynames %>%
          filter(name %>% substr(1, 2) %>% equals("Sa")) %>%
          group_by(year, sex) %>%
          dplyr::summarise(total = sum(n)) %>%
          {ggplot(.) + geom_line(aes(year, total, color = sex))} %>%
          add(ggtitle("Names starting with Sa by pipe")) %>%
          add(theme_minimal()) %>%
          print