Missing Data
Due by 11:59 PM on Wednesday, February 6, 2019
Missing Data
# Global knitr chunk option: echo the source code of every chunk in the
# rendered document.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.99.9000 ✔ dplyr 0.8.0.9000
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::is_null() masks testthat::is_null()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::matches() masks testthat::matches()
library(janitor)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(socviz)
##
## Attaching package: 'socviz'
## The following object is masked from 'package:kjhutils':
##
## %nin%
library(naniar)
library(visdat)
# Print the organdata tibble to get a first look at its columns and at the
# NA values that the rest of this script investigates.
organdata
## # A tibble: 238 x 21
## country year donors pop pop_dens gdp gdp_lag health health_lag
## <chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl>
## 1 Austra… NA NA 17065 0.220 16774 16591 1300 1224
## 2 Austra… 1991-01-01 12.1 17284 0.223 17171 16774 1379 1300
## 3 Austra… 1992-01-01 12.4 17495 0.226 17914 17171 1455 1379
## 4 Austra… 1993-01-01 12.5 17667 0.228 18883 17914 1540 1455
## 5 Austra… 1994-01-01 10.2 17855 0.231 19849 18883 1626 1540
## 6 Austra… 1995-01-01 10.2 18072 0.233 21079 19849 1737 1626
## 7 Austra… 1996-01-01 10.6 18311 0.237 21923 21079 1846 1737
## 8 Austra… 1997-01-01 10.3 18518 0.239 22961 21923 1948 1846
## 9 Austra… 1998-01-01 10.5 18711 0.242 24148 22961 2077 1948
## 10 Austra… 1999-01-01 8.67 18926 0.244 25445 24148 2231 2077
## # … with 228 more rows, and 12 more variables: pubhealth <dbl>,
## # roads <dbl>, cerebvas <int>, assault <int>, external <int>,
## # txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## # consent_practice <chr>, consistent <chr>, ccode <chr>
# Overview plot of the whole data frame (visdat): one cell per value, so
# missing cells stand out.
vis_dat(organdata)
# Per-variable missingness table: one row per column with its count (n_miss)
# and percentage (pct_miss) of missing values.
miss_var_summary(organdata)
## # A tibble: 21 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 year 34 14.3
## 2 donors 34 14.3
## 3 opt 28 11.8
## 4 pubhealth 21 8.82
## 5 pop 17 7.14
## 6 pop_dens 17 7.14
## 7 gdp 17 7.14
## 8 roads 17 7.14
## 9 cerebvas 17 7.14
## 10 assault 17 7.14
## # … with 11 more rows
# Per-row missingness table: one row per case (row index) with the count
# (n_miss) and percentage (pct_miss) of its columns that are missing.
miss_case_summary(organdata)
## # A tibble: 238 x 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 84 12 57.1
## 2 182 12 57.1
## 3 210 12 57.1
## 4 14 11 52.4
## 5 28 11 52.4
## 6 42 11 52.4
## 7 56 11 52.4
## 8 70 11 52.4
## 9 98 11 52.4
## 10 112 11 52.4
## # … with 228 more rows
# Missingness of year, pubhealth and roads, reported separately for each
# consent_law group. consent_law has to be kept in the selection because it
# is the grouping variable.
organdata %>%
  dplyr::select(consent_law, year, pubhealth, roads) %>%
  dplyr::group_by(consent_law) %>%
  miss_var_summary()
## # A tibble: 6 x 4
## consent_law variable n_miss pct_miss
## <chr> <chr> <int> <dbl>
## 1 Informed year 16 14.3
## 2 Informed pubhealth 8 7.14
## 3 Informed roads 8 7.14
## 4 Presumed year 18 14.3
## 5 Presumed pubhealth 13 10.3
## 6 Presumed roads 9 7.14
# Missingness map: every cell of the data frame drawn as present or missing.
vis_miss(organdata)
# Same map with cluster = TRUE, which reorders the rows so that rows with
# similar missingness patterns sit next to each other.
vis_miss(organdata, cluster = TRUE)
# Number of missing values per variable, drawn as a plot.
gg_miss_var(organdata)
# UpSet plot of which variables tend to be missing together.
gg_miss_upset(organdata)
# pubhealth against donors. geom_miss_point() is used in place of
# geom_point() so that rows with a missing coordinate still show up on the
# plot instead of being dropped.
ggplot(data = organdata, mapping = aes(x = donors, y = pubhealth)) +
  geom_miss_point()

# Overview plot of the whole data frame again.
vis_dat(organdata)

# The same scatterplot with the axes swapped: donors against pubhealth.
ggplot(data = organdata, mapping = aes(x = pubhealth, y = donors)) +
  geom_miss_point()

# Number of missing values per variable again.
gg_miss_var(organdata)
library(rpart)
library(rpart.plot)
# Fit a regression tree that predicts each row's proportion of missing values
# (prop_miss_all, added by add_prop_miss()) from all other columns, then plot
# the tree.
#
# model = TRUE stores the model frame inside the rpart fit. Without it, prp()
# tries to re-fetch the training data from the original call, which fails
# here because the data was supplied as the magrittr placeholder `.`; prp()
# then warns "Cannot retrieve the data used to build the model (so cannot
# determine roundint and is.binary for the variables)".
organdata %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss = ")
# UpSet plot of co-occurring missingness with the default settings.
gg_miss_upset(organdata)
# Count how many variables contain at least one missing value.
n_var_miss(organdata)
## [1] 13
# Same UpSet plot, but nintersects = NA asks for every intersection to be
# drawn rather than only the default number of them.
gg_miss_upset(organdata, nintersects = NA)