More on dplyr
Due by 11:59 PM on Sunday, December 31, 0000
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.99.9000 ✔ dplyr 0.8.0.9000
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::is_null() masks testthat::is_null()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::matches() masks testthat::matches()
library(janitor)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(socviz)
##
## Attaching package: 'socviz'
## The following object is masked from 'package:kjhutils':
##
## %nin%
library(gapminder)
Scoping
organdata %>%
group_by(world) %>%
summarize_if(is.numeric, mean, na.rm = TRUE) %>%
select(world, donors, pubhealth, roads) %>%
select_all(tools::toTitleCase)
## # A tibble: 4 x 4
## World Donors Pubhealth Roads
## <chr> <dbl> <dbl> <dbl>
## 1 <NA> 28.1 5.45 161.
## 2 Corporatist 16.8 6.40 132.
## 3 Liberal 15.6 5.75 111.
## 4 SocDem 14.8 6.54 82.7
organdata %>%
group_by(country) %>%
summarize_if(is.numeric,
funs(avg = mean, sd = sd),
na.rm = TRUE) %>%
select(country, donors_avg,
donors_sd, roads_avg, roads_sd) %>%
arrange(desc(donors_avg))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.)
##
## # After:
## list(name = ~f(.))
## This warning is displayed once per session.
## # A tibble: 17 x 5
## country donors_avg donors_sd roads_avg roads_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Spain 28.1 4.96 161. 35.3
## 2 Austria 23.5 2.42 150. 30.3
## 3 Belgium 21.9 1.94 155. 20.6
## 4 United States 20.0 1.33 155. 8.35
## 5 Ireland 19.8 2.48 118. 10.8
## 6 Finland 18.4 1.53 93.6 19.0
## 7 France 16.8 1.60 156. 20.1
## 8 Norway 15.4 1.11 70.0 6.68
## 9 Switzerland 14.2 1.71 96.4 21.7
## 10 Canada 14.0 0.751 109. 17.7
## 11 Netherlands 13.7 1.55 76.1 9.93
## 12 United Kingdom 13.5 0.775 67.9 10.5
## 13 Sweden 13.1 1.75 72.3 13.2
## 14 Denmark 13.1 1.47 102. 12.4
## 15 Germany 13.0 0.611 113. 25.9
## 16 Italy 11.1 4.28 122. 10.2
## 17 Australia 10.6 1.14 105. 14.3
out <- lm(donors ~ pop + gdp + roads, data = organdata)
summary(out)
##
## Call:
## lm(formula = donors ~ pop + gdp + roads, data = organdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.423 -2.658 -0.080 1.963 15.864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.506e+00 2.364e+00 1.906 0.0580 .
## pop -1.153e-05 5.643e-06 -2.043 0.0423 *
## gdp 1.082e-04 7.527e-05 1.438 0.1521
## roads 8.988e-02 1.032e-02 8.710 1.14e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.325 on 200 degrees of freedom
## (34 observations deleted due to missingness)
## Multiple R-squared: 0.2944, Adjusted R-squared: 0.2838
## F-statistic: 27.81 on 3 and 200 DF, p-value: 4.486e-15
names(summary(out))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled" "na.action"
organdata %>%
split(.$world) %>%
map(~ lm(donors ~ pop + gdp + roads, data = .)) %>%
map(summary) %>%
map_dbl("r.squared")
## Corporatist Liberal SocDem
## 0.6681724 0.4060604 0.2521456
Zero Counts in Dplyr
Grab the data remotely, to make this self-contained.
url <- "https://datamgmt.org/data/first_terms.csv"
df <- read_csv(url)
## Parsed with column specification:
## cols(
## pid = col_double(),
## start_year = col_date(format = ""),
## party = col_character(),
## sex = col_character()
## )
df
## # A tibble: 293 x 4
## pid start_year party sex
## <dbl> <date> <chr> <chr>
## 1 3160 2013-01-03 Republican M
## 2 3161 2013-01-03 Democrat F
## 3 3162 2013-01-03 Democrat M
## 4 3163 2013-01-03 Republican M
## 5 3164 2013-01-03 Democrat M
## 6 3165 2013-01-03 Republican M
## 7 3166 2013-01-03 Republican M
## 8 3167 2013-01-03 Democrat F
## 9 3168 2013-01-03 Republican M
## 10 3169 2013-01-03 Democrat M
## # … with 283 more rows
df %>%
group_by(start_year, party, sex) %>%
summarize(N = n()) %>%
mutate(freq = N / sum(N))
## # A tibble: 14 x 5
## # Groups: start_year, party [8]
## start_year party sex N freq
## <date> <chr> <chr> <int> <dbl>
## 1 2013-01-03 Democrat F 21 0.362
## 2 2013-01-03 Democrat M 37 0.638
## 3 2013-01-03 Republican F 8 0.101
## 4 2013-01-03 Republican M 71 0.899
## 5 2015-01-03 Democrat M 1 1
## 6 2015-01-03 Republican M 5 1
## 7 2017-01-03 Democrat F 6 0.24
## 8 2017-01-03 Democrat M 19 0.76
## 9 2017-01-03 Republican F 2 0.0667
## 10 2017-01-03 Republican M 28 0.933
## 11 2019-01-03 Democrat F 34 0.567
## 12 2019-01-03 Democrat M 26 0.433
## 13 2019-01-03 Republican F 1 0.0286
## 14 2019-01-03 Republican M 34 0.971
## Hex colors for sex
sex_colors <- c("#E69F00", "#993300")
## Hex color codes for Dem Blue and Rep Red
party_colors <- c("#2E74C0", "#CB454A")
## Group labels
mf_labs <- tibble(M = "Men", F = "Women")
theme_set(theme_minimal())
df %>%
group_by(start_year, party, sex) %>%
summarize(N = n()) %>%
mutate(freq = N / sum(N)) %>%
ggplot(aes(x = start_year,
y = freq,
fill = sex)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(values = sex_colors, labels = c("Women", "Men")) +
labs(x = "Year", y = "Percent", fill = "Group") +
facet_wrap(~ party)
df %>%
group_by(start_year, party, sex) %>%
summarize(N = n()) %>%
mutate(freq = N / sum(N)) %>%
ggplot(aes(x = start_year,
y = freq,
color = sex)) +
geom_line(size = 1.1) +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = "Year", y = "Percent", color = "Group") +
facet_wrap(~ party)
Option 1: recode to factors
df_f <- df %>% modify_if(is.character, as.factor)
df_f %>%
group_by(start_year, party, sex) %>%
tally()
## # A tibble: 16 x 4
## # Groups: start_year, party [8]
## start_year party sex n
## <date> <fct> <fct> <int>
## 1 2013-01-03 Democrat F 21
## 2 2013-01-03 Democrat M 37
## 3 2013-01-03 Republican F 8
## 4 2013-01-03 Republican M 71
## 5 2015-01-03 Democrat F 0
## 6 2015-01-03 Democrat M 1
## 7 2015-01-03 Republican F 0
## 8 2015-01-03 Republican M 5
## 9 2017-01-03 Democrat F 6
## 10 2017-01-03 Democrat M 19
## 11 2017-01-03 Republican F 2
## 12 2017-01-03 Republican M 28
## 13 2019-01-03 Democrat F 34
## 14 2019-01-03 Democrat M 26
## 15 2019-01-03 Republican F 1
## 16 2019-01-03 Republican M 34
Option 2: ungroup()
and complete()
df %>%
group_by(start_year, party, sex) %>%
summarize(N = n()) %>%
mutate(freq = N / sum(N)) %>%
ungroup() %>%
complete(start_year, party, sex,
fill = list(N = 0, freq = 0))
## # A tibble: 16 x 5
## start_year party sex N freq
## <date> <chr> <chr> <dbl> <dbl>
## 1 2013-01-03 Democrat F 21 0.362
## 2 2013-01-03 Democrat M 37 0.638
## 3 2013-01-03 Republican F 8 0.101
## 4 2013-01-03 Republican M 71 0.899
## 5 2015-01-03 Democrat F 0 0
## 6 2015-01-03 Democrat M 1 1
## 7 2015-01-03 Republican F 0 0
## 8 2015-01-03 Republican M 5 1
## 9 2017-01-03 Democrat F 6 0.24
## 10 2017-01-03 Democrat M 19 0.76
## 11 2017-01-03 Republican F 2 0.0667
## 12 2017-01-03 Republican M 28 0.933
## 13 2019-01-03 Democrat F 34 0.567
## 14 2019-01-03 Democrat M 26 0.433
## 15 2019-01-03 Republican F 1 0.0286
## 16 2019-01-03 Republican M 34 0.971
df_f %>%
group_by(start_year, party, sex) %>%
summarize(N = n()) %>%
mutate(freq = N / sum(N)) %>%
ggplot(aes(x = start_year,
y = freq,
color = sex)) +
geom_line(size = 1.1) +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
guides(color = guide_legend(reverse = TRUE)) +
labs(x = "Year", y = "Percent", color = "Group") +
facet_wrap(~ party)
Spreading multiple values
edu
## # A tibble: 366 x 11
## age sex year total elem4 elem8 hs3 hs4 coll3 coll4 median
## <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 25-34 Male 2016 21845 116 468 1427 6386 6015 7432 NA
## 2 25-34 Male 2015 21427 166 488 1584 6198 5920 7071 NA
## 3 25-34 Male 2014 21217 151 512 1611 6323 5910 6710 NA
## 4 25-34 Male 2013 20816 161 582 1747 6058 5749 6519 NA
## 5 25-34 Male 2012 20464 161 579 1707 6127 5619 6270 NA
## 6 25-34 Male 2011 20985 190 657 1791 6444 5750 6151 NA
## 7 25-34 Male 2010 20689 186 641 1866 6458 5587 5951 NA
## 8 25-34 Male 2009 20440 184 695 1806 6495 5508 5752 NA
## 9 25-34 Male 2008 20210 172 714 1874 6356 5277 5816 NA
## 10 25-34 Male 2007 20024 246 757 1930 6361 5137 5593 NA
## # … with 356 more rows
edu_t <- gather(data = edu,
key = school,
value = freq,
elem4:coll4)
head(edu_t)
## # A tibble: 6 x 7
## age sex year total median school freq
## <chr> <chr> <int> <int> <dbl> <chr> <dbl>
## 1 25-34 Male 2016 21845 NA elem4 116
## 2 25-34 Male 2015 21427 NA elem4 166
## 3 25-34 Male 2014 21217 NA elem4 151
## 4 25-34 Male 2013 20816 NA elem4 161
## 5 25-34 Male 2012 20464 NA elem4 161
## 6 25-34 Male 2011 20985 NA elem4 190
tail(edu_t)
## # A tibble: 6 x 7
## age sex year total median school freq
## <chr> <chr> <int> <int> <dbl> <chr> <dbl>
## 1 55> Female 1959 16263 8.3 coll4 688
## 2 55> Female 1957 15581 8.2 coll4 630
## 3 55> Female 1952 13662 7.9 coll4 628
## 4 55> Female 1950 13150 8.4 coll4 436
## 5 55> Female 1947 11810 7.6 coll4 343
## 6 55> Female 1940 9777 8.3 coll4 219
Generate some sample stratified data
gen_cats <- function(x, N = 1000) {
sample(x, N, replace = TRUE)
}
set.seed(101)
N <- 1000
income <- rnorm(N, 100, 50)
vars <- list(stratum = c(1:8),
sex = c("M", "F"),
race = c("B", "W"),
educ = c("HS", "BA"))
df <- as_tibble(map_dfc(vars, gen_cats))
df <- add_column(df, income)
df
## # A tibble: 1,000 x 5
## stratum sex race educ income
## <int> <chr> <chr> <chr> <dbl>
## 1 6 F W BA 83.7
## 2 7 F B HS 128.
## 3 1 F B BA 66.3
## 4 2 M B BA 111.
## 5 4 M B BA 116.
## 6 6 F W BA 159.
## 7 7 F W BA 131.
## 8 6 M W HS 94.4
## 9 4 F W HS 146.
## 10 7 M B BA 88.8
## # … with 990 more rows
Basic grouped summary (no gathering)
tv_wide1 <- df %>% group_by(sex, race, stratum, educ) %>%
summarize(mean_inc = mean(income), N = n())
tv_wide1
## # A tibble: 64 x 6
## # Groups: sex, race, stratum [32]
## sex race stratum educ mean_inc N
## <chr> <chr> <int> <chr> <dbl> <int>
## 1 F B 1 BA 102. 16
## 2 F B 1 HS 116. 23
## 3 F B 2 BA 93.0 12
## 4 F B 2 HS 90.5 15
## 5 F B 3 BA 114. 18
## 6 F B 3 HS 104. 11
## 7 F B 4 BA 90.5 13
## 8 F B 4 HS 103. 15
## 9 F B 5 BA 111. 12
## 10 F B 5 HS 70.7 10
## # … with 54 more rows
Gather mean income and N into a key-value pair
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
summarize(mean_inc = mean(income), N = n()) %>%
gather(variable, value, -(sex:educ))
tv_wide2
## # A tibble: 128 x 6
## # Groups: sex, race, stratum [32]
## sex race stratum educ variable value
## <chr> <chr> <int> <chr> <chr> <dbl>
## 1 F B 1 BA mean_inc 102.
## 2 F B 1 HS mean_inc 116.
## 3 F B 2 BA mean_inc 93.0
## 4 F B 2 HS mean_inc 90.5
## 5 F B 3 BA mean_inc 114.
## 6 F B 3 HS mean_inc 104.
## 7 F B 4 BA mean_inc 90.5
## 8 F B 4 HS mean_inc 103.
## 9 F B 5 BA mean_inc 111.
## 10 F B 5 HS mean_inc 70.7
## # … with 118 more rows
Unite the variable
(key) column with the Education measure
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
summarize(mean_inc = mean(income), N = n()) %>%
gather(variable, value, -(sex:educ)) %>%
unite(temp, educ, variable)
tv_wide2
## # A tibble: 128 x 5
## # Groups: sex, race, stratum [32]
## sex race stratum temp value
## <chr> <chr> <int> <chr> <dbl>
## 1 F B 1 BA_mean_inc 102.
## 2 F B 1 HS_mean_inc 116.
## 3 F B 2 BA_mean_inc 93.0
## 4 F B 2 HS_mean_inc 90.5
## 5 F B 3 BA_mean_inc 114.
## 6 F B 3 HS_mean_inc 104.
## 7 F B 4 BA_mean_inc 90.5
## 8 F B 4 HS_mean_inc 103.
## 9 F B 5 BA_mean_inc 111.
## 10 F B 5 HS_mean_inc 70.7
## # … with 118 more rows
Finally, spread the resulting temporary column, to get the desired result
tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
summarize(mean_inc = mean(income), N = n()) %>%
gather(variable, value, -(sex:educ)) %>%
unite(temp, educ, variable) %>%
spread(temp, value)
tv_wide2
## # A tibble: 32 x 7
## # Groups: sex, race, stratum [32]
## sex race stratum BA_mean_inc BA_N HS_mean_inc HS_N
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 F B 1 102. 16 116. 23
## 2 F B 2 93.0 12 90.5 15
## 3 F B 3 114. 18 104. 11
## 4 F B 4 90.5 13 103. 15
## 5 F B 5 111. 12 70.7 10
## 6 F B 6 97.8 9 88.7 17
## 7 F B 7 70.2 17 99.0 21
## 8 F B 8 116. 15 101. 8
## 9 F W 1 86.4 20 93.0 8
## 10 F W 2 104. 14 93.4 12
## # … with 22 more rows
A function to do this gather, spread, unite sequence
multi_spread <- function(df, key, value) {
# quote key
keyq <- rlang::enquo(key)
# break value vector into quotes
valueq <- rlang::enquo(value)
s <- rlang::quos(!!valueq)
df %>% gather(variable, value, !!!s) %>%
unite(temp, !!keyq, variable) %>%
spread(temp, value)
}
Use the function directly
## Final version
tv_wide3 <- df %>% group_by(sex, race, stratum, educ) %>%
summarize(mean_inc = mean(income), N = n()) %>%
multi_spread(educ, c(mean_inc, N))
tv_wide3
## # A tibble: 32 x 7
## # Groups: sex, race, stratum [32]
## sex race stratum BA_mean_inc BA_N HS_mean_inc HS_N
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 F B 1 102. 16 116. 23
## 2 F B 2 93.0 12 90.5 15
## 3 F B 3 114. 18 104. 11
## 4 F B 4 90.5 13 103. 15
## 5 F B 5 111. 12 70.7 10
## 6 F B 6 97.8 9 88.7 17
## 7 F B 7 70.2 17 99.0 21
## 8 F B 8 116. 15 101. 8
## 9 F W 1 86.4 20 93.0 8
## 10 F W 2 104. 14 93.4 12
## # … with 22 more rows
Nesting
out_le <- gapminder %>%
group_by(continent, year) %>%
nest()
out_le
## # A tibble: 60 x 3
## continent year data
## <fct> <int> <list>
## 1 Asia 1952 <tibble [33 × 4]>
## 2 Asia 1957 <tibble [33 × 4]>
## 3 Asia 1962 <tibble [33 × 4]>
## 4 Asia 1967 <tibble [33 × 4]>
## 5 Asia 1972 <tibble [33 × 4]>
## 6 Asia 1977 <tibble [33 × 4]>
## 7 Asia 1982 <tibble [33 × 4]>
## 8 Asia 1987 <tibble [33 × 4]>
## 9 Asia 1992 <tibble [33 × 4]>
## 10 Asia 1997 <tibble [33 × 4]>
## # … with 50 more rows
out_le %>% filter(continent == "Europe" & year == 1977) %>%
unnest()
## # A tibble: 30 x 6
## continent year country lifeExp pop gdpPercap
## <fct> <int> <fct> <dbl> <int> <dbl>
## 1 Europe 1977 Albania 68.9 2509048 3533.
## 2 Europe 1977 Austria 72.2 7568430 19749.
## 3 Europe 1977 Belgium 72.8 9821800 19118.
## 4 Europe 1977 Bosnia and Herzegovina 69.9 4086000 3528.
## 5 Europe 1977 Bulgaria 70.8 8797022 7612.
## 6 Europe 1977 Croatia 70.6 4318673 11305.
## 7 Europe 1977 Czech Republic 70.7 10161915 14800.
## 8 Europe 1977 Denmark 74.7 5088419 20423.
## 9 Europe 1977 Finland 72.5 4738902 15605.
## 10 Europe 1977 France 73.8 53165019 18293.
## # … with 20 more rows
fit_ols <- function(df) {
lm(lifeExp ~ log(gdpPercap), data = df)
}
out_le <- gapminder %>%
group_by(continent, year) %>%
nest() %>%
mutate(model = map(data, fit_ols))
out_le
## # A tibble: 60 x 4
## continent year data model
## <fct> <int> <list> <list>
## 1 Asia 1952 <tibble [33 × 4]> <S3: lm>
## 2 Asia 1957 <tibble [33 × 4]> <S3: lm>
## 3 Asia 1962 <tibble [33 × 4]> <S3: lm>
## 4 Asia 1967 <tibble [33 × 4]> <S3: lm>
## 5 Asia 1972 <tibble [33 × 4]> <S3: lm>
## 6 Asia 1977 <tibble [33 × 4]> <S3: lm>
## 7 Asia 1982 <tibble [33 × 4]> <S3: lm>
## 8 Asia 1987 <tibble [33 × 4]> <S3: lm>
## 9 Asia 1992 <tibble [33 × 4]> <S3: lm>
## 10 Asia 1997 <tibble [33 × 4]> <S3: lm>
## # … with 50 more rows
library(broom) ## more later
fit_ols <- function(df) {
lm(lifeExp ~ log(gdpPercap), data = df)
}
out_tidy <- gapminder %>%
group_by(continent, year) %>%
nest() %>%
mutate(model = map(data, fit_ols),
tidied = map(model, tidy)) %>%
unnest(tidied, .drop = TRUE)
out_tidy
## # A tibble: 120 x 7
## continent year term estimate std.error statistic p.value
## <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Asia 1952 (Intercept) 15.8 9.27 1.71 0.0978
## 2 Asia 1952 log(gdpPercap) 4.16 1.25 3.33 0.00228
## 3 Asia 1957 (Intercept) 18.1 9.70 1.86 0.0720
## 4 Asia 1957 log(gdpPercap) 4.17 1.28 3.26 0.00271
## 5 Asia 1962 (Intercept) 16.6 9.52 1.74 0.0911
## 6 Asia 1962 log(gdpPercap) 4.59 1.24 3.72 0.000794
## 7 Asia 1967 (Intercept) 19.8 9.05 2.19 0.0364
## 8 Asia 1967 log(gdpPercap) 4.50 1.15 3.90 0.000477
## 9 Asia 1972 (Intercept) 21.9 8.14 2.69 0.0113
## 10 Asia 1972 log(gdpPercap) 4.44 1.01 4.41 0.000116
## # … with 110 more rows
out_tidy <- gapminder %>%
group_by(continent, year) %>%
nest() %>%
mutate(model = map(data, fit_ols),
tidied = map(model, tidy)) %>%
unnest(tidied, .drop = TRUE) %>%
filter(term %nin% "(Intercept)")
out_tidy
## # A tibble: 60 x 7
## continent year term estimate std.error statistic p.value
## <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Asia 1952 log(gdpPercap) 4.16 1.25 3.33 2.28e-3
## 2 Asia 1957 log(gdpPercap) 4.17 1.28 3.26 2.71e-3
## 3 Asia 1962 log(gdpPercap) 4.59 1.24 3.72 7.94e-4
## 4 Asia 1967 log(gdpPercap) 4.50 1.15 3.90 4.77e-4
## 5 Asia 1972 log(gdpPercap) 4.44 1.01 4.41 1.16e-4
## 6 Asia 1977 log(gdpPercap) 4.87 1.03 4.75 4.42e-5
## 7 Asia 1982 log(gdpPercap) 4.78 0.852 5.61 3.77e-6
## 8 Asia 1987 log(gdpPercap) 5.17 0.727 7.12 5.31e-8
## 9 Asia 1992 log(gdpPercap) 5.09 0.649 7.84 7.60e-9
## 10 Asia 1997 log(gdpPercap) 5.11 0.628 8.15 3.35e-9
## # … with 50 more rows
Misc dplyr
Selection
organdata %>%
select(matches("_lag"))
## # A tibble: 238 x 2
## gdp_lag health_lag
## <int> <dbl>
## 1 16591 1224
## 2 16774 1300
## 3 17171 1379
## 4 17914 1455
## 5 18883 1540
## 6 19849 1626
## 7 21079 1737
## 8 21923 1846
## 9 22961 1948
## 10 24148 2077
## # … with 228 more rows
organdata %>%
select(world, everything())
## # A tibble: 238 x 21
## world country year donors pop pop_dens gdp gdp_lag health
## <chr> <chr> <date> <dbl> <int> <dbl> <int> <int> <dbl>
## 1 Libe… Austra… NA NA 17065 0.220 16774 16591 1300
## 2 Libe… Austra… 1991-01-01 12.1 17284 0.223 17171 16774 1379
## 3 Libe… Austra… 1992-01-01 12.4 17495 0.226 17914 17171 1455
## 4 Libe… Austra… 1993-01-01 12.5 17667 0.228 18883 17914 1540
## 5 Libe… Austra… 1994-01-01 10.2 17855 0.231 19849 18883 1626
## 6 Libe… Austra… 1995-01-01 10.2 18072 0.233 21079 19849 1737
## 7 Libe… Austra… 1996-01-01 10.6 18311 0.237 21923 21079 1846
## 8 Libe… Austra… 1997-01-01 10.3 18518 0.239 22961 21923 1948
## 9 Libe… Austra… 1998-01-01 10.5 18711 0.242 24148 22961 2077
## 10 Libe… Austra… 1999-01-01 8.67 18926 0.244 25445 24148 2231
## # … with 228 more rows, and 12 more variables: health_lag <dbl>,
## # pubhealth <dbl>, roads <dbl>, cerebvas <int>, assault <int>,
## # external <int>, txp_pop <dbl>, opt <chr>, consent_law <chr>,
## # consent_practice <chr>, consistent <chr>, ccode <chr>
Summaries
organdata %>%
summarize_all(funs(min, max), na.rm = TRUE)
## # A tibble: 1 x 42
## country_min year_min donors_min pop_min pop_dens_min gdp_min
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Australia 1991-01-01 5.2 3514 0.220 12917
## # … with 36 more variables: gdp_lag_min <dbl>, health_min <dbl>,
## # health_lag_min <dbl>, pubhealth_min <dbl>, roads_min <dbl>,
## # cerebvas_min <dbl>, assault_min <dbl>, external_min <dbl>,
## # txp_pop_min <dbl>, world_min <chr>, opt_min <chr>,
## # consent_law_min <chr>, consent_practice_min <chr>,
## # consistent_min <chr>, ccode_min <chr>, country_max <chr>,
## # year_max <date>, donors_max <dbl>, pop_max <dbl>, pop_dens_max <dbl>,
## # gdp_max <dbl>, gdp_lag_max <dbl>, health_max <dbl>,
## # health_lag_max <dbl>, pubhealth_max <dbl>, roads_max <dbl>,
## # cerebvas_max <dbl>, assault_max <dbl>, external_max <dbl>,
## # txp_pop_max <dbl>, world_max <chr>, opt_max <chr>,
## # consent_law_max <chr>, consent_practice_max <chr>,
## # consistent_max <chr>, ccode_max <chr>
organdata %>%
summarize_if(is.numeric, funs(min, max), na.rm = TRUE)
## # A tibble: 1 x 26
## donors_min pop_min pop_dens_min gdp_min gdp_lag_min health_min
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.2 3514 0.220 12917 11434 791
## # … with 20 more variables: health_lag_min <dbl>, pubhealth_min <dbl>,
## # roads_min <dbl>, cerebvas_min <dbl>, assault_min <dbl>,
## # external_min <dbl>, txp_pop_min <dbl>, donors_max <dbl>,
## # pop_max <dbl>, pop_dens_max <dbl>, gdp_max <dbl>, gdp_lag_max <dbl>,
## # health_max <dbl>, health_lag_max <dbl>, pubhealth_max <dbl>,
## # roads_max <dbl>, cerebvas_max <dbl>, assault_max <dbl>,
## # external_max <dbl>, txp_pop_max <dbl>
organdata %>%
group_by(country) %>%
summarize_if(is.numeric, funs(min, max), na.rm = TRUE)
## # A tibble: 17 x 27
## country donors_min pop_min pop_dens_min gdp_min gdp_lag_min health_min
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Austra… 8.67 17065 0.220 16774 16591 1300
## 2 Austria 19.5 7678 9.16 18914 17425 1344
## 3 Belgium 19.4 9967 30.1 18008 16848 1340
## 4 Canada 12.6 27701 0.278 19044 18699 1714
## 5 Denmark 11 5141 11.9 18285 17460 1554
## 6 Finland 16.3 4986 1.47 16943 16943 1390
## 7 France 15 56709 10.3 18162 17113 1555
## 8 Germany 12.2 63254 17.7 17511 17511 1729
## 9 Ireland 16.8 3514 5.00 12917 11434 791
## 10 Italy 5.2 56719 18.8 17430 16525 1397
## 11 Nether… 10.9 14952 36.0 17707 16580 1419
## 12 Norway 13.7 4242 1.31 17905 16942 1385
## 13 Spain 20.2 38850 7.68 12971 12051 865
## 14 Sweden 10.9 8559 1.90 18660 17915 1552
## 15 Switze… 10.4 6712 16.3 24648 23009 2040
## 16 United… 12.1 57238 23.6 16228 15804 977
## 17 United… 17.6 249623 2.59 23038 22039 2738
## # … with 20 more variables: health_lag_min <dbl>, pubhealth_min <dbl>,
## # roads_min <dbl>, cerebvas_min <dbl>, assault_min <dbl>,
## # external_min <dbl>, txp_pop_min <dbl>, donors_max <dbl>,
## # pop_max <dbl>, pop_dens_max <dbl>, gdp_max <dbl>, gdp_lag_max <dbl>,
## # health_max <dbl>, health_lag_max <dbl>, pubhealth_max <dbl>,
## # roads_max <dbl>, cerebvas_max <dbl>, assault_max <dbl>,
## # external_max <dbl>, txp_pop_max <dbl>
Scoped filtering
organdata %>%
filter_all(any_vars(is.na(.)))
## # A tibble: 73 x 21
## country year donors pop pop_dens gdp gdp_lag health health_lag
## <chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl>
## 1 Austra… NA NA 17065 0.220 16774 16591 1300 1224
## 2 Austra… NA NA NA NA NA 28168 2754 2629
## 3 Austria NA NA 7678 9.16 18914 17425 1344 1255
## 4 Austria NA NA NA NA NA 28842 2266 2220
## 5 Belgium NA NA 9967 30.1 18008 16848 1340 1246
## 6 Belgium 1991-01-01 21 10005 30.2 18796 18008 1460 1340
## 7 Belgium 1992-01-01 20.6 10045 30.3 19444 18796 1547 1460
## 8 Belgium 1993-01-01 21 10085 30.5 19733 19444 1595 1547
## 9 Belgium NA NA NA NA NA 27652 2589 2515
## 10 Canada NA NA 27701 0.278 19044 18699 1714 1600
## # … with 63 more rows, and 12 more variables: pubhealth <dbl>,
## # roads <dbl>, cerebvas <int>, assault <int>, external <int>,
## # txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## # consent_practice <chr>, consistent <chr>, ccode <chr>
Rowwise operations
iris %>%
select(contains("Length")) %>%
rowwise() %>%
mutate(avg_length =
mean(c(Petal.Length, Sepal.Length)))
## Source: local data frame [150 x 3]
## Groups: <by row>
##
## # A tibble: 150 x 3
## Sepal.Length Petal.Length avg_length
## <dbl> <dbl> <dbl>
## 1 5.1 1.4 3.25
## 2 4.9 1.4 3.15
## 3 4.7 1.3 3
## 4 4.6 1.5 3.05
## 5 5 1.4 3.2
## 6 5.4 1.7 3.55
## 7 4.6 1.4 3
## 8 5 1.5 3.25
## 9 4.4 1.4 2.9
## 10 4.9 1.5 3.2
## # … with 140 more rows