What is dirty data?

From wikipedia, dirty data, is inaccurate, incomplete or erroneous data, which can contain such mistakes as spelling or punctuation errors, incorrect data associated with a field, incomplete or outdated data, or even data that has been duplicated in the database. It can be cleaned through a process known as data cleansing.

脏数据是不准确的,不完整或错误的数据,它们可能包含拼写或标点符号错误,与研究领域相关的错误数据,不完整的或过时的数据,甚至是数据库中重复的数据。 它可以通过一个称为数据清理的过程进行清理。

Dirty Data 包括:

  • MCAR - 完全随机缺失(missing complete at random, 缺失与研究对象无关)
  • MAR - 随机缺失(missing at random, 缺失与研究对象有关)
  • NMAR - 非随机、不可忽略缺失(not missing at Random, or nonignorable)

NAs in R

NA(meaning not applicable, not available, or no answer) NA which cantains NA_integer_, NA_real_, NA_complex_ and NA_character_, can be coerced to any other vector type except raw.

Manipulate NAs

1/0
# Inf
0/0
sqrt(-1)
1/0 - 1/0
# NaN not a number
NaN > NA
# NA
x <- c(NA,Inf,NaN,0,NULL)
# NULL means that there is no value
is.na(x) # support type list or vector
# TRUE FALSE TRUE FALSE
is.nan(x)
# FALSE FALSE  TRUE FALSE
is.infinite(x)
# FALSE  TRUE FALSE FALSE
is.null(x)
# FALSE
x <- c(NA,Inf,NaN,0,"NA")
is.na(x)
# TRUE FALSE FALSE FALSE FALSE
# Coz NaN will coerced to a string like x <- c(NaN,"")
is.na(x) <- c(2,4)
x
# NA    NA    "NaN" NA    "NA" 
c(TRUE, FALSE) & NA
## NA FALSE
c(TRUE, FALSE) | NA
# TRUE   NA
df <- data.frame( a=c(TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,NA,NA,NA),
                  b=c(TRUE,FALSE,NA,TRUE,FALSE,NA,TRUE,FALSE,NA))
# This function returns TRUE wherever elements are the same, including NA's,
# and FALSE everywhere else.
compareNA <- function(v1,v2) {
    same <- (v1 == v2) | (is.na(v1) & is.na(v2))
    same[is.na(same)] <- FALSE
    return(same)
}
data.frame(df, isSame = compareNA(df$a,df$b))
#       a     b isSame
# 1  TRUE  TRUE   TRUE
# 2  TRUE FALSE  FALSE
# 3  TRUE    NA  FALSE
# 4 FALSE  TRUE  FALSE
# 5 FALSE FALSE   TRUE
# 6 FALSE    NA  FALSE
# 7    NA  TRUE  FALSE
# 8    NA FALSE  FALSE
# 9    NA    NA   TRUE

any(is.na(x)) Vs anyNA(x))

Generally, anyNA(x) is a more faster way

  • user time: time charged to the CPU(s)
  • elapsed time: “wall clock” time
x <- sample(c(1:10000,rep(NA,100),rep(NaN,100)),10000)
N <- 2^20
print(rbind(is.na = system.time(replicate(N, any(is.na(x)))),
              anyNA = system.time(replicate(N, anyNA(x)))))

#       user.self sys.self elapsed user.child sys.child
# is.na     17.93    11.04   29.05         NA        NA
# anyNA      0.87     0.00    0.87         NA        NA

Whats more, anyNA can also perform a recursive way

L <- list(c(1:10),c(NA_integer_, NA_real_, NA_complex_ ,NA_character_),c(sqrt(-1),1/0 - 1/0))
sapply(L, anyNA); c(anyNA(L), anyNA(L, TRUE))
## NaNs produced
# FALSE  TRUE  TRUE
# FALSE  TRUE

Ignoring “bad” values in R

v <- c(1, 2, 3, NA, 5, NaN, Inf, NULL )
is.na(v)
# FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
mean(v)
sum(v)
# NA
mean(v, na.rm=TRUE)
# 2.75
sum(v, na.rm=TRUE)
# Inf
sum(v[-7], na.rm=TRUE)
# 11
sum(1,2,NA,na.rm = T)
# 3
mean(1,2,NA,na.rm = F)
# 1!!!
v[!is.na(v)]
# 1   2   3   5 Inf

Airquality for example

153 entries:

  • Ozone : Ozone (ppb)
  • Solar.R : Solar radiation in Langleys (lang)
  • Wind : Average wind speed in miles per hour (mph)
  • Temp : Maximum daily temperature in degrees Fahrenheit (degrees F)
  • Month : Month (1–12)
  • Day : Day of month (1–31)
x <- c(1, 2, NA, 4, NA, 5)
y <- c("a", "b", NA, "d", NA, "f")                  
good <- complete.cases(x, y)
good
# TRUE  TRUE FALSE  TRUE FALSE  TRUE
y[good]
# "a" "b" "d" "f"
airquality[1:6, ]
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
good <- complete.cases(airquality)
## 111 entries
airquality[good, ][1:6, ]
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

Simulate a dataset containing undesirable data

Simulate a dataset

df <- data.frame(
  sex=factor(rep(c("F", "M"), each=200)),
  weight=round(c(rnorm(200, mean=55, sd=5),
                 rnorm(200, mean=65, sd=5)))
  )
set.seed(0)
# Setting the random number seed with set.seed ensures reproducibility
library(openxlsx)
Names <- read.xlsx(file.choose())[,1]
# import names
Name <- paste0(sample(Names,1000,replace = T))
Sex <- factor(sample(c('F','M'),1000, replace = T, prob = c(0.45,0.55)))
Age <- abs(round(rnorm(1000, mean=25, sd=10))+15)
Income <- round(rexp(1000,4)*10000,4)
df <- data.frame(Name = Name, Sex = Sex, Age = Age, Income= Income)
summary(df)
##         Name     Sex          Age            Income         
##  Valentine:  8   F:436   Min.   :10.00   Min.   :    8.938  
##  Herman   :  7   M:564   1st Qu.:33.00   1st Qu.:  692.469  
##  Nigel    :  7           Median :40.00   Median : 1702.759  
##  Booth    :  6           Mean   :39.76   Mean   : 2569.415  
##  Chapman  :  6           3rd Qu.:47.00   3rd Qu.: 3465.921  
##  Edmund   :  6           Max.   :70.00   Max.   :22398.214  
##  (Other)  :960
set.seed(1)
i <- sample(1000, 100)
j <- sample(1:4, 100, replace = T)
df[as.matrix(data.frame(i,j))] <- NA
summary(df)
##         Name       Sex           Age            Income         
##  Valentine:  8   F   :424   Min.   :10.00   Min.   :    8.938  
##  Herman   :  7   M   :551   1st Qu.:33.00   1st Qu.:  698.288  
##  Nigel    :  7   NA's: 25   Median :40.00   Median : 1709.895  
##  Booth    :  6              Mean   :39.66   Mean   : 2584.425  
##  Chapman  :  6              3rd Qu.:47.00   3rd Qu.: 3476.368  
##  (Other)  :945              Max.   :70.00   Max.   :22398.214  
##  NA's     : 21              NA's   :32      NA's   :22

Handle with the dirty data

library(tidyverse)
library(VIM)
aggr(df, prop = FALSE, numbers = TRUE)

df2 <- df
df2 <- df2[!is.na(df2$Name),]
df2 <- replace_na(df2,replace = list(Sex = names(which.max(table(df2$Sex))), Age = mean(df2$Age, na.rm = TRUE)))
summary(df2)
##         Name     Sex          Age            Income         
##  Valentine:  8   F:414   Min.   :10.00   Min.   :    8.938  
##  Herman   :  7   M:565   1st Qu.:33.00   1st Qu.:  692.485  
##  Nigel    :  7           Median :39.69   Median : 1715.361  
##  Booth    :  6           Mean   :39.69   Mean   : 2582.242  
##  Chapman  :  6           3rd Qu.:47.00   3rd Qu.: 3471.288  
##  Edmund   :  6           Max.   :70.00   Max.   :22398.214  
##  (Other)  :939                           NA's   :22
library(mice)
imp <- mice(data = df2)
## 
##  iter imp variable
##   1   1  Income
##   1   2  Income
##   1   3  Income
##   1   4  Income
##   1   5  Income
##   2   1  Income
##   2   2  Income
##   2   3  Income
##   2   4  Income
##   2   5  Income
##   3   1  Income
##   3   2  Income
##   3   3  Income
##   3   4  Income
##   3   5  Income
##   4   1  Income
##   4   2  Income
##   4   3  Income
##   4   4  Income
##   4   5  Income
##   5   1  Income
##   5   2  Income
##   5   3  Income
##   5   4  Income
##   5   5  Income
imp$imp
## $Name
## NULL
## 
## $Sex
## NULL
## 
## $Age
## NULL
## 
## $Income
##              1         2         3          4         5
## 204  3942.1838 8498.0879 2997.3294  7483.7609 2493.0779
## 224   838.5253 4868.7148   23.4414   412.1710  583.9495
## 261  2024.4653 2318.8446  748.2238   788.2074 1234.2818
## 276 16607.5158  639.4116  727.3643  5756.6726 5760.9782
## 312   183.4636 7240.4725 1146.3331    37.0536  183.4636
## 322  6332.3123 1942.6753  127.3307  6321.0070 2293.2614
## 368 22398.2143 2736.4843 9521.6995  7931.7234   79.5099
## 371  1934.9597 3462.4594  439.6423 14572.9850 5676.2001
## 546 13265.6628 2224.4830 1311.7418  6296.6821 6696.6482
## 610  5087.4181 3050.3740  413.0280  6296.6821 5352.7009
## 625  6330.5200  886.6474 5123.3427    31.1536   23.4414
## 650  4894.4728 1301.4659  180.3558  8102.0720 5495.6884
## 659    71.7279 5108.4458 2476.1572   441.0749  203.0800
## 694  5296.5679 1757.4108 7764.4984  2563.8153 5126.2661
## 697  3467.8073    8.9376  183.4636   876.5242  669.4385
## 780  4868.7148 5455.6900  196.7655  5838.6330 5495.6884
## 795    24.6374  793.4435   31.1536   413.0280 3467.8073
## 800  5087.4181  877.4353  373.9559  6296.6821 5352.7009
## 826    31.1536 4219.5399 3858.1781   230.1099   23.4414
## 885    23.4414 8837.9920 3005.0820    37.0536  495.0302
## 906  2410.5120  977.3326  482.9877  4038.2669 3481.2841
## 917   687.6375  264.4450 6377.2210  1993.9283 3093.6910
Income_imp <- apply(imp$imp$Income,1,mean)
df2$Income[is.na(df2$Income)] <-Income_imp
summary(df2)
##         Name     Sex          Age            Income         
##  Valentine:  8   F:414   Min.   :10.00   Min.   :    8.938  
##  Herman   :  7   M:565   1st Qu.:33.00   1st Qu.:  708.399  
##  Nigel    :  7           Median :39.69   Median : 1745.223  
##  Booth    :  6           Mean   :39.69   Mean   : 2599.971  
##  Chapman  :  6           3rd Qu.:47.00   3rd Qu.: 3530.039  
##  Edmund   :  6           Max.   :70.00   Max.   :22398.214  
##  (Other)  :939

Filling in NAs with last non-NA value

## Sample data
x <- c(NA,NA, "A","A", "B","B","B", NA,NA, "C", NA,NA,NA, "A","A","B", NA,NA)

goodIdx <- !is.na(x)
goodIdx
#>  [1] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
#> [15]  TRUE  TRUE FALSE FALSE

# These are the non-NA values from x only
# Add a leading NA for later use when we index into this vector
goodVals <- c(NA, x[goodIdx])
goodVals
#>  [1] NA  "A" "A" "B" "B" "B" "C" "A" "A" "B"

# Fill the indices of the output vector with the indices pulled from
# these offsets of goodVals. Add 1 to avoid indexing to zero.
fillIdx <- cumsum(goodIdx)+1
fillIdx
#>  [1]  1  1  2  3  4  5  6  6  6  7  7  7  7  8  9 10 10 10

# The original vector with gaps filled
goodVals[fillIdx]
#>  [1] NA  NA  "A" "A" "B" "B" "B" "B" "B" "C" "C" "C" "C" "A" "A" "B" "B" "B"
fillNAgaps <- function(x, firstBack=FALSE) {
    ## NA's in a vector or factor are replaced with last non-NA values
    ## If firstBack is TRUE, it will fill in leading NA's with the first
    ## non-NA value. If FALSE, it will not change leading NA's.
    
    # If it's a factor, store the level labels and convert to integer
    lvls <- NULL
    if (is.factor(x)) {
        lvls <- levels(x)
        x    <- as.integer(x)
    }
 
    goodIdx <- !is.na(x)
 
    # These are the non-NA values from x only
    # Add a leading NA or take the first good value, depending on firstBack   
    if (firstBack)   goodVals <- c(x[goodIdx][1], x[goodIdx])
    else             goodVals <- c(NA,            x[goodIdx])

    # Fill the indices of the output vector with the indices pulled from
    # these offsets of goodVals. Add 1 to avoid indexing to zero.
    fillIdx <- cumsum(goodIdx)+1
    
    x <- goodVals[fillIdx]

    # If it was originally a factor, convert it back
    if (!is.null(lvls)) {
        x <- factor(x, levels=seq_along(lvls), labels=lvls)
    }

    x
}

fillNAgaps(x,firstBack = T)
##  [1] "A" "A" "A" "A" "B" "B" "B" "B" "B" "C" "C" "C" "C" "A" "A" "B" "B"
## [18] "B"
fillNAgaps(x,firstBack = F)
##  [1] NA  NA  "A" "A" "B" "B" "B" "B" "B" "C" "C" "C" "C" "A" "A" "B" "B"
## [18] "B"