Missing Data

Due by 11:59 PM on Wednesday, February 6, 2019

Missing Data

# Set the knitr chunk option echo = TRUE as the default for this document.
knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0           ✔ purrr   0.2.5      
## ✔ tibble  2.0.99.9000     ✔ dplyr   0.8.0.9000 
## ✔ tidyr   0.8.2           ✔ stringr 1.3.1      
## ✔ readr   1.3.1           ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::is_null() masks testthat::is_null()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::matches() masks testthat::matches()
library(janitor)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(socviz)
## 
## Attaching package: 'socviz'
## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(naniar)
library(visdat)
# Auto-print the organdata tibble to inspect its columns and first rows.
# NOTE(review): presumably the dataset shipped with socviz (attached earlier
# in this file) -- confirm.
organdata
## # A tibble: 238 x 21
##    country year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Austra… NA          NA    17065    0.220 16774   16591   1300       1224
##  2 Austra… 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
##  3 Austra… 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
##  4 Austra… 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
##  5 Austra… 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
##  6 Austra… 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
##  7 Austra… 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
##  8 Austra… 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
##  9 Austra… 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
## 10 Austra… 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
## # … with 228 more rows, and 12 more variables: pubhealth <dbl>,
## #   roads <dbl>, cerebvas <int>, assault <int>, external <int>,
## #   txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## #   consent_practice <chr>, consistent <chr>, ccode <chr>
# Whole-data-frame overview plot from visdat.
organdata %>% vis_dat()

# Table of n_miss and pct_miss, one row per variable.
organdata %>% miss_var_summary()
## # A tibble: 21 x 3
##    variable  n_miss pct_miss
##    <chr>      <int>    <dbl>
##  1 year          34    14.3 
##  2 donors        34    14.3 
##  3 opt           28    11.8 
##  4 pubhealth     21     8.82
##  5 pop           17     7.14
##  6 pop_dens      17     7.14
##  7 gdp           17     7.14
##  8 roads         17     7.14
##  9 cerebvas      17     7.14
## 10 assault       17     7.14
## # … with 11 more rows
# Table of n_miss and pct_miss, one row per case (row of the data).
organdata %>% miss_case_summary()
## # A tibble: 238 x 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1    84     12     57.1
##  2   182     12     57.1
##  3   210     12     57.1
##  4    14     11     52.4
##  5    28     11     52.4
##  6    42     11     52.4
##  7    56     11     52.4
##  8    70     11     52.4
##  9    98     11     52.4
## 10   112     11     52.4
## # … with 228 more rows
# Missingness of year, pubhealth and roads, summarised separately within each
# consent_law group. Written as nested calls: select the columns, group, then
# summarise.
miss_var_summary(
  group_by(
    select(organdata, consent_law, year, pubhealth, roads),
    consent_law
  )
)
## # A tibble: 6 x 4
##   consent_law variable  n_miss pct_miss
##   <chr>       <chr>      <int>    <dbl>
## 1 Informed    year          16    14.3 
## 2 Informed    pubhealth      8     7.14
## 3 Informed    roads          8     7.14
## 4 Presumed    year          18    14.3 
## 5 Presumed    pubhealth     13    10.3 
## 6 Presumed    roads          9     7.14
# Missingness map of the data with default settings.
organdata %>% vis_miss()

# Same map, this time with cluster = TRUE.
organdata %>% vis_miss(cluster = TRUE)

# Missingness plotted per variable.
organdata %>% gg_miss_var()

# UpSet plot of missingness across variables (default settings).
organdata %>% gg_miss_upset()

# donors (x) against pubhealth (y), drawn with geom_miss_point() rather than
# geom_point().
ggplot(data = organdata, mapping = aes(donors, pubhealth)) +
  geom_miss_point()

# Overview plot of the data frame again.
organdata %>% vis_dat()

# Same scatter as before with the axes swapped: pubhealth on x, donors on y.
ggplot(data = organdata, mapping = aes(pubhealth, donors)) +
  geom_miss_point()

# Per-variable missingness plot again.
organdata %>% gg_miss_var()

library(rpart)
library(rpart.plot)

# Fit a regression tree that predicts each row's proportion of missing values
# (prop_miss_all, the column added by add_prop_miss()) from every other
# column, then plot the tree.
#
# roundint = FALSE is passed to prp() because prp() otherwise warns "Cannot
# retrieve the data used to build the model" (presumably because the model is
# fitted with `data = .` inside the pipe). The warning text itself recommends
# this argument, or alternatively refitting rpart with model = TRUE.
organdata %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = .) %>%
  prp(type = 4, extra = 101, roundint = FALSE, prefix = "Prop. Miss = ")

# UpSet plot of missingness with default settings.
organdata %>% gg_miss_upset()

# n_var_miss() result for organdata (presumably the number of variables that
# contain missing values -- confirm against the naniar docs).
organdata %>% n_var_miss()
## [1] 13
# UpSet plot again, this time with nintersects = NA.
organdata %>% gg_miss_upset(nintersects = NA)