More on dplyr

Due by 11:59 PM on Sunday, December 31, 0000

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0           ✔ purrr   0.2.5      
## ✔ tibble  2.0.99.9000     ✔ dplyr   0.8.0.9000 
## ✔ tidyr   0.8.2           ✔ stringr 1.3.1      
## ✔ readr   1.3.1           ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::is_null() masks testthat::is_null()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::matches() masks testthat::matches()
library(janitor)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(socviz)
## 
## Attaching package: 'socviz'
## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(gapminder)

Scoping

organdata %>%
  group_by(world) %>%
  summarize_if(is.numeric, mean, na.rm = TRUE) %>%
  select(world, donors, pubhealth, roads) %>%
  select_all(tools::toTitleCase)
## # A tibble: 4 x 4
##   World       Donors Pubhealth Roads
##   <chr>        <dbl>     <dbl> <dbl>
## 1 <NA>          28.1      5.45 161. 
## 2 Corporatist   16.8      6.40 132. 
## 3 Liberal       15.6      5.75 111. 
## 4 SocDem        14.8      6.54  82.7
organdata %>%
  group_by(country) %>%
  summarize_if(is.numeric, 
               funs(avg = mean, sd = sd), 
               na.rm = TRUE) %>%
  select(country, donors_avg, 
         donors_sd, roads_avg, roads_sd) %>%
  arrange(desc(donors_avg))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
## # A tibble: 17 x 5
##    country        donors_avg donors_sd roads_avg roads_sd
##    <chr>               <dbl>     <dbl>     <dbl>    <dbl>
##  1 Spain                28.1     4.96      161.     35.3 
##  2 Austria              23.5     2.42      150.     30.3 
##  3 Belgium              21.9     1.94      155.     20.6 
##  4 United States        20.0     1.33      155.      8.35
##  5 Ireland              19.8     2.48      118.     10.8 
##  6 Finland              18.4     1.53       93.6    19.0 
##  7 France               16.8     1.60      156.     20.1 
##  8 Norway               15.4     1.11       70.0     6.68
##  9 Switzerland          14.2     1.71       96.4    21.7 
## 10 Canada               14.0     0.751     109.     17.7 
## 11 Netherlands          13.7     1.55       76.1     9.93
## 12 United Kingdom       13.5     0.775      67.9    10.5 
## 13 Sweden               13.1     1.75       72.3    13.2 
## 14 Denmark              13.1     1.47      102.     12.4 
## 15 Germany              13.0     0.611     113.     25.9 
## 16 Italy                11.1     4.28      122.     10.2 
## 17 Australia            10.6     1.14      105.     14.3
out <- lm(donors ~ pop + gdp + roads, data = organdata)

summary(out)
## 
## Call:
## lm(formula = donors ~ pop + gdp + roads, data = organdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.423  -2.658  -0.080   1.963  15.864 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.506e+00  2.364e+00   1.906   0.0580 .  
## pop         -1.153e-05  5.643e-06  -2.043   0.0423 *  
## gdp          1.082e-04  7.527e-05   1.438   0.1521    
## roads        8.988e-02  1.032e-02   8.710 1.14e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.325 on 200 degrees of freedom
##   (34 observations deleted due to missingness)
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.2838 
## F-statistic: 27.81 on 3 and 200 DF,  p-value: 4.486e-15
names(summary(out))
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"  "na.action"
organdata %>%
  split(.$world) %>%
  map(~ lm(donors ~ pop + gdp + roads, data = .)) %>%
  map(summary) %>%
  map_dbl("r.squared")
## Corporatist     Liberal      SocDem 
##   0.6681724   0.4060604   0.2521456

Zero Counts in Dplyr

Grab the data remotely, to make this self-contained.

url <- "https://datamgmt.org/data/first_terms.csv"

df <- read_csv(url)
## Parsed with column specification:
## cols(
##   pid = col_double(),
##   start_year = col_date(format = ""),
##   party = col_character(),
##   sex = col_character()
## )
df
## # A tibble: 293 x 4
##      pid start_year party      sex  
##    <dbl> <date>     <chr>      <chr>
##  1  3160 2013-01-03 Republican M    
##  2  3161 2013-01-03 Democrat   F    
##  3  3162 2013-01-03 Democrat   M    
##  4  3163 2013-01-03 Republican M    
##  5  3164 2013-01-03 Democrat   M    
##  6  3165 2013-01-03 Republican M    
##  7  3166 2013-01-03 Republican M    
##  8  3167 2013-01-03 Democrat   F    
##  9  3168 2013-01-03 Republican M    
## 10  3169 2013-01-03 Democrat   M    
## # … with 283 more rows
df %>%
    group_by(start_year, party, sex) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N))
## # A tibble: 14 x 5
## # Groups:   start_year, party [8]
##    start_year party      sex       N   freq
##    <date>     <chr>      <chr> <int>  <dbl>
##  1 2013-01-03 Democrat   F        21 0.362 
##  2 2013-01-03 Democrat   M        37 0.638 
##  3 2013-01-03 Republican F         8 0.101 
##  4 2013-01-03 Republican M        71 0.899 
##  5 2015-01-03 Democrat   M         1 1     
##  6 2015-01-03 Republican M         5 1     
##  7 2017-01-03 Democrat   F         6 0.24  
##  8 2017-01-03 Democrat   M        19 0.76  
##  9 2017-01-03 Republican F         2 0.0667
## 10 2017-01-03 Republican M        28 0.933 
## 11 2019-01-03 Democrat   F        34 0.567 
## 12 2019-01-03 Democrat   M        26 0.433 
## 13 2019-01-03 Republican F         1 0.0286
## 14 2019-01-03 Republican M        34 0.971
## Hex colors for sex
sex_colors <- c("#E69F00", "#993300")

## Hex color codes for Dem Blue and Rep Red
party_colors <- c("#2E74C0", "#CB454A")

## Group labels
mf_labs <- tibble(M = "Men", F = "Women")

theme_set(theme_minimal())
df %>%
    group_by(start_year, party, sex) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N)) %>%
    ggplot(aes(x = start_year,
               y = freq,
               fill = sex)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = sex_colors, labels = c("Women", "Men")) +
    labs(x = "Year", y = "Percent", fill = "Group") +
    facet_wrap(~ party)

df %>%
    group_by(start_year, party, sex) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N)) %>%
    ggplot(aes(x = start_year,
               y = freq,
               color = sex)) +
    geom_line(size = 1.1) +
    scale_y_continuous(labels = scales::percent) +
    scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
    guides(color = guide_legend(reverse = TRUE)) +
    labs(x = "Year", y = "Percent", color = "Group") +
    facet_wrap(~ party)

Option 1: recode to factors

df_f <- df %>% modify_if(is.character, as.factor)

df_f %>%
    group_by(start_year, party, sex) %>%
    tally()
## # A tibble: 16 x 4
## # Groups:   start_year, party [8]
##    start_year party      sex       n
##    <date>     <fct>      <fct> <int>
##  1 2013-01-03 Democrat   F        21
##  2 2013-01-03 Democrat   M        37
##  3 2013-01-03 Republican F         8
##  4 2013-01-03 Republican M        71
##  5 2015-01-03 Democrat   F         0
##  6 2015-01-03 Democrat   M         1
##  7 2015-01-03 Republican F         0
##  8 2015-01-03 Republican M         5
##  9 2017-01-03 Democrat   F         6
## 10 2017-01-03 Democrat   M        19
## 11 2017-01-03 Republican F         2
## 12 2017-01-03 Republican M        28
## 13 2019-01-03 Democrat   F        34
## 14 2019-01-03 Democrat   M        26
## 15 2019-01-03 Republican F         1
## 16 2019-01-03 Republican M        34

Option 2: ungroup() and complete()

df %>%
    group_by(start_year, party, sex) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N)) %>%
    ungroup() %>%
    complete(start_year, party, sex,
             fill = list(N = 0, freq = 0))
## # A tibble: 16 x 5
##    start_year party      sex       N   freq
##    <date>     <chr>      <chr> <dbl>  <dbl>
##  1 2013-01-03 Democrat   F        21 0.362 
##  2 2013-01-03 Democrat   M        37 0.638 
##  3 2013-01-03 Republican F         8 0.101 
##  4 2013-01-03 Republican M        71 0.899 
##  5 2015-01-03 Democrat   F         0 0     
##  6 2015-01-03 Democrat   M         1 1     
##  7 2015-01-03 Republican F         0 0     
##  8 2015-01-03 Republican M         5 1     
##  9 2017-01-03 Democrat   F         6 0.24  
## 10 2017-01-03 Democrat   M        19 0.76  
## 11 2017-01-03 Republican F         2 0.0667
## 12 2017-01-03 Republican M        28 0.933 
## 13 2019-01-03 Democrat   F        34 0.567 
## 14 2019-01-03 Democrat   M        26 0.433 
## 15 2019-01-03 Republican F         1 0.0286
## 16 2019-01-03 Republican M        34 0.971
df_f %>%
    group_by(start_year, party, sex) %>%
    summarize(N = n()) %>%
    mutate(freq = N / sum(N)) %>%
    ggplot(aes(x = start_year,
               y = freq,
               color = sex)) +
    geom_line(size = 1.1) +
    scale_y_continuous(labels = scales::percent) +
    scale_color_manual(values = sex_colors, labels = c("Women", "Men")) +
    guides(color = guide_legend(reverse = TRUE)) +
    labs(x = "Year", y = "Percent", color = "Group") +
    facet_wrap(~ party)

Spreading multiple values

edu
## # A tibble: 366 x 11
##    age   sex    year total elem4 elem8   hs3   hs4 coll3 coll4 median
##    <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1 25-34 Male   2016 21845   116   468  1427  6386  6015  7432     NA
##  2 25-34 Male   2015 21427   166   488  1584  6198  5920  7071     NA
##  3 25-34 Male   2014 21217   151   512  1611  6323  5910  6710     NA
##  4 25-34 Male   2013 20816   161   582  1747  6058  5749  6519     NA
##  5 25-34 Male   2012 20464   161   579  1707  6127  5619  6270     NA
##  6 25-34 Male   2011 20985   190   657  1791  6444  5750  6151     NA
##  7 25-34 Male   2010 20689   186   641  1866  6458  5587  5951     NA
##  8 25-34 Male   2009 20440   184   695  1806  6495  5508  5752     NA
##  9 25-34 Male   2008 20210   172   714  1874  6356  5277  5816     NA
## 10 25-34 Male   2007 20024   246   757  1930  6361  5137  5593     NA
## # … with 356 more rows
edu_t <- gather(data = edu,
                key = school,
                value = freq,
                elem4:coll4)

head(edu_t) 
## # A tibble: 6 x 7
##   age   sex    year total median school  freq
##   <chr> <chr> <int> <int>  <dbl> <chr>  <dbl>
## 1 25-34 Male   2016 21845     NA elem4    116
## 2 25-34 Male   2015 21427     NA elem4    166
## 3 25-34 Male   2014 21217     NA elem4    151
## 4 25-34 Male   2013 20816     NA elem4    161
## 5 25-34 Male   2012 20464     NA elem4    161
## 6 25-34 Male   2011 20985     NA elem4    190
tail(edu_t)
## # A tibble: 6 x 7
##   age   sex     year total median school  freq
##   <chr> <chr>  <int> <int>  <dbl> <chr>  <dbl>
## 1 55>   Female  1959 16263    8.3 coll4    688
## 2 55>   Female  1957 15581    8.2 coll4    630
## 3 55>   Female  1952 13662    7.9 coll4    628
## 4 55>   Female  1950 13150    8.4 coll4    436
## 5 55>   Female  1947 11810    7.6 coll4    343
## 6 55>   Female  1940  9777    8.3 coll4    219

Generate some sample stratified data

gen_cats <- function(x, N = 1000) {
    sample(x, N, replace = TRUE)
}

set.seed(101)
N <- 1000

income <- rnorm(N, 100, 50)

vars <- list(stratum = c(1:8),
          sex = c("M", "F"),
          race =  c("B", "W"),
          educ = c("HS", "BA"))

df <- as_tibble(map_dfc(vars, gen_cats))
df <- add_column(df, income)

df
## # A tibble: 1,000 x 5
##    stratum sex   race  educ  income
##      <int> <chr> <chr> <chr>  <dbl>
##  1       6 F     W     BA      83.7
##  2       7 F     B     HS     128. 
##  3       1 F     B     BA      66.3
##  4       2 M     B     BA     111. 
##  5       4 M     B     BA     116. 
##  6       6 F     W     BA     159. 
##  7       7 F     W     BA     131. 
##  8       6 M     W     HS      94.4
##  9       4 F     W     HS     146. 
## 10       7 M     B     BA      88.8
## # … with 990 more rows

Basic grouped summary (no gathering)

tv_wide1 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n())

tv_wide1
## # A tibble: 64 x 6
## # Groups:   sex, race, stratum [32]
##    sex   race  stratum educ  mean_inc     N
##    <chr> <chr>   <int> <chr>    <dbl> <int>
##  1 F     B           1 BA       102.     16
##  2 F     B           1 HS       116.     23
##  3 F     B           2 BA        93.0    12
##  4 F     B           2 HS        90.5    15
##  5 F     B           3 BA       114.     18
##  6 F     B           3 HS       104.     11
##  7 F     B           4 BA        90.5    13
##  8 F     B           4 HS       103.     15
##  9 F     B           5 BA       111.     12
## 10 F     B           5 HS        70.7    10
## # … with 54 more rows

Gather mean income and N into a key-value pair

tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ))

tv_wide2
## # A tibble: 128 x 6
## # Groups:   sex, race, stratum [32]
##    sex   race  stratum educ  variable value
##    <chr> <chr>   <int> <chr> <chr>    <dbl>
##  1 F     B           1 BA    mean_inc 102. 
##  2 F     B           1 HS    mean_inc 116. 
##  3 F     B           2 BA    mean_inc  93.0
##  4 F     B           2 HS    mean_inc  90.5
##  5 F     B           3 BA    mean_inc 114. 
##  6 F     B           3 HS    mean_inc 104. 
##  7 F     B           4 BA    mean_inc  90.5
##  8 F     B           4 HS    mean_inc 103. 
##  9 F     B           5 BA    mean_inc 111. 
## 10 F     B           5 HS    mean_inc  70.7
## # … with 118 more rows

Unite the variable (key) column with the Education measure

tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ)) %>%
    unite(temp, educ, variable)

tv_wide2
## # A tibble: 128 x 5
## # Groups:   sex, race, stratum [32]
##    sex   race  stratum temp        value
##    <chr> <chr>   <int> <chr>       <dbl>
##  1 F     B           1 BA_mean_inc 102. 
##  2 F     B           1 HS_mean_inc 116. 
##  3 F     B           2 BA_mean_inc  93.0
##  4 F     B           2 HS_mean_inc  90.5
##  5 F     B           3 BA_mean_inc 114. 
##  6 F     B           3 HS_mean_inc 104. 
##  7 F     B           4 BA_mean_inc  90.5
##  8 F     B           4 HS_mean_inc 103. 
##  9 F     B           5 BA_mean_inc 111. 
## 10 F     B           5 HS_mean_inc  70.7
## # … with 118 more rows

Finally, spread the resulting temporary column, to get the desired result

tv_wide2 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    gather(variable, value, -(sex:educ)) %>%
    unite(temp, educ, variable) %>%
    spread(temp, value)

tv_wide2
## # A tibble: 32 x 7
## # Groups:   sex, race, stratum [32]
##    sex   race  stratum BA_mean_inc  BA_N HS_mean_inc  HS_N
##    <chr> <chr>   <int>       <dbl> <dbl>       <dbl> <dbl>
##  1 F     B           1       102.     16       116.     23
##  2 F     B           2        93.0    12        90.5    15
##  3 F     B           3       114.     18       104.     11
##  4 F     B           4        90.5    13       103.     15
##  5 F     B           5       111.     12        70.7    10
##  6 F     B           6        97.8     9        88.7    17
##  7 F     B           7        70.2    17        99.0    21
##  8 F     B           8       116.     15       101.      8
##  9 F     W           1        86.4    20        93.0     8
## 10 F     W           2       104.     14        93.4    12
## # … with 22 more rows

A function to do this gather, spread, unite sequence

multi_spread <- function(df, key, value) {
    # quote key
    keyq <- rlang::enquo(key)
    # break value vector into quotes
    valueq <- rlang::enquo(value)
    s <- rlang::quos(!!valueq)
    df %>% gather(variable, value, !!!s) %>%
        unite(temp, !!keyq, variable) %>%
        spread(temp, value)
}

Use the function directly

## Final version
tv_wide3 <- df %>% group_by(sex, race, stratum, educ) %>%
    summarize(mean_inc = mean(income), N = n()) %>%
    multi_spread(educ, c(mean_inc, N))

tv_wide3
## # A tibble: 32 x 7
## # Groups:   sex, race, stratum [32]
##    sex   race  stratum BA_mean_inc  BA_N HS_mean_inc  HS_N
##    <chr> <chr>   <int>       <dbl> <dbl>       <dbl> <dbl>
##  1 F     B           1       102.     16       116.     23
##  2 F     B           2        93.0    12        90.5    15
##  3 F     B           3       114.     18       104.     11
##  4 F     B           4        90.5    13       103.     15
##  5 F     B           5       111.     12        70.7    10
##  6 F     B           6        97.8     9        88.7    17
##  7 F     B           7        70.2    17        99.0    21
##  8 F     B           8       116.     15       101.      8
##  9 F     W           1        86.4    20        93.0     8
## 10 F     W           2       104.     14        93.4    12
## # … with 22 more rows

Nesting

out_le <- gapminder %>%
    group_by(continent, year) %>%
    nest()

out_le
## # A tibble: 60 x 3
##    continent  year data             
##    <fct>     <int> <list>           
##  1 Asia       1952 <tibble [33 × 4]>
##  2 Asia       1957 <tibble [33 × 4]>
##  3 Asia       1962 <tibble [33 × 4]>
##  4 Asia       1967 <tibble [33 × 4]>
##  5 Asia       1972 <tibble [33 × 4]>
##  6 Asia       1977 <tibble [33 × 4]>
##  7 Asia       1982 <tibble [33 × 4]>
##  8 Asia       1987 <tibble [33 × 4]>
##  9 Asia       1992 <tibble [33 × 4]>
## 10 Asia       1997 <tibble [33 × 4]>
## # … with 50 more rows
out_le %>% filter(continent == "Europe" & year == 1977) %>% 
           unnest()
## # A tibble: 30 x 6
##    continent  year country                lifeExp      pop gdpPercap
##    <fct>     <int> <fct>                    <dbl>    <int>     <dbl>
##  1 Europe     1977 Albania                   68.9  2509048     3533.
##  2 Europe     1977 Austria                   72.2  7568430    19749.
##  3 Europe     1977 Belgium                   72.8  9821800    19118.
##  4 Europe     1977 Bosnia and Herzegovina    69.9  4086000     3528.
##  5 Europe     1977 Bulgaria                  70.8  8797022     7612.
##  6 Europe     1977 Croatia                   70.6  4318673    11305.
##  7 Europe     1977 Czech Republic            70.7 10161915    14800.
##  8 Europe     1977 Denmark                   74.7  5088419    20423.
##  9 Europe     1977 Finland                   72.5  4738902    15605.
## 10 Europe     1977 France                    73.8 53165019    18293.
## # … with 20 more rows
fit_ols <- function(df) {
    lm(lifeExp ~ log(gdpPercap), data = df)
}

out_le <- gapminder %>%
    group_by(continent, year) %>%
    nest() %>% 
    mutate(model = map(data, fit_ols)) 


out_le
## # A tibble: 60 x 4
##    continent  year data              model   
##    <fct>     <int> <list>            <list>  
##  1 Asia       1952 <tibble [33 × 4]> <S3: lm>
##  2 Asia       1957 <tibble [33 × 4]> <S3: lm>
##  3 Asia       1962 <tibble [33 × 4]> <S3: lm>
##  4 Asia       1967 <tibble [33 × 4]> <S3: lm>
##  5 Asia       1972 <tibble [33 × 4]> <S3: lm>
##  6 Asia       1977 <tibble [33 × 4]> <S3: lm>
##  7 Asia       1982 <tibble [33 × 4]> <S3: lm>
##  8 Asia       1987 <tibble [33 × 4]> <S3: lm>
##  9 Asia       1992 <tibble [33 × 4]> <S3: lm>
## 10 Asia       1997 <tibble [33 × 4]> <S3: lm>
## # … with 50 more rows
library(broom) ## more later

fit_ols <- function(df) {
    lm(lifeExp ~ log(gdpPercap), data = df)
}

out_tidy <- gapminder %>%
    group_by(continent, year) %>%
    nest() %>% 
    mutate(model = map(data, fit_ols),
           tidied = map(model, tidy)) %>%
    unnest(tidied, .drop = TRUE)

out_tidy
## # A tibble: 120 x 7
##    continent  year term           estimate std.error statistic  p.value
##    <fct>     <int> <chr>             <dbl>     <dbl>     <dbl>    <dbl>
##  1 Asia       1952 (Intercept)       15.8       9.27      1.71 0.0978  
##  2 Asia       1952 log(gdpPercap)     4.16      1.25      3.33 0.00228 
##  3 Asia       1957 (Intercept)       18.1       9.70      1.86 0.0720  
##  4 Asia       1957 log(gdpPercap)     4.17      1.28      3.26 0.00271 
##  5 Asia       1962 (Intercept)       16.6       9.52      1.74 0.0911  
##  6 Asia       1962 log(gdpPercap)     4.59      1.24      3.72 0.000794
##  7 Asia       1967 (Intercept)       19.8       9.05      2.19 0.0364  
##  8 Asia       1967 log(gdpPercap)     4.50      1.15      3.90 0.000477
##  9 Asia       1972 (Intercept)       21.9       8.14      2.69 0.0113  
## 10 Asia       1972 log(gdpPercap)     4.44      1.01      4.41 0.000116
## # … with 110 more rows
out_tidy <- gapminder %>%
    group_by(continent, year) %>%
    nest() %>% 
    mutate(model = map(data, fit_ols),
           tidied = map(model, tidy)) %>%
    unnest(tidied, .drop = TRUE) %>% 
    filter(term %nin% "(Intercept)")

out_tidy
## # A tibble: 60 x 7
##    continent  year term           estimate std.error statistic      p.value
##    <fct>     <int> <chr>             <dbl>     <dbl>     <dbl>        <dbl>
##  1 Asia       1952 log(gdpPercap)     4.16     1.25       3.33      2.28e-3
##  2 Asia       1957 log(gdpPercap)     4.17     1.28       3.26      2.71e-3
##  3 Asia       1962 log(gdpPercap)     4.59     1.24       3.72      7.94e-4
##  4 Asia       1967 log(gdpPercap)     4.50     1.15       3.90      4.77e-4
##  5 Asia       1972 log(gdpPercap)     4.44     1.01       4.41      1.16e-4
##  6 Asia       1977 log(gdpPercap)     4.87     1.03       4.75      4.42e-5
##  7 Asia       1982 log(gdpPercap)     4.78     0.852      5.61      3.77e-6
##  8 Asia       1987 log(gdpPercap)     5.17     0.727      7.12      5.31e-8
##  9 Asia       1992 log(gdpPercap)     5.09     0.649      7.84      7.60e-9
## 10 Asia       1997 log(gdpPercap)     5.11     0.628      8.15      3.35e-9
## # … with 50 more rows

Misc dplyr

Selection

organdata %>% 
  select(matches("_lag"))
## # A tibble: 238 x 2
##    gdp_lag health_lag
##      <int>      <dbl>
##  1   16591       1224
##  2   16774       1300
##  3   17171       1379
##  4   17914       1455
##  5   18883       1540
##  6   19849       1626
##  7   21079       1737
##  8   21923       1846
##  9   22961       1948
## 10   24148       2077
## # … with 228 more rows
organdata %>% 
  select(world, everything())
## # A tibble: 238 x 21
##    world country year       donors   pop pop_dens   gdp gdp_lag health
##    <chr> <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>
##  1 Libe… Austra… NA          NA    17065    0.220 16774   16591   1300
##  2 Libe… Austra… 1991-01-01  12.1  17284    0.223 17171   16774   1379
##  3 Libe… Austra… 1992-01-01  12.4  17495    0.226 17914   17171   1455
##  4 Libe… Austra… 1993-01-01  12.5  17667    0.228 18883   17914   1540
##  5 Libe… Austra… 1994-01-01  10.2  17855    0.231 19849   18883   1626
##  6 Libe… Austra… 1995-01-01  10.2  18072    0.233 21079   19849   1737
##  7 Libe… Austra… 1996-01-01  10.6  18311    0.237 21923   21079   1846
##  8 Libe… Austra… 1997-01-01  10.3  18518    0.239 22961   21923   1948
##  9 Libe… Austra… 1998-01-01  10.5  18711    0.242 24148   22961   2077
## 10 Libe… Austra… 1999-01-01   8.67 18926    0.244 25445   24148   2231
## # … with 228 more rows, and 12 more variables: health_lag <dbl>,
## #   pubhealth <dbl>, roads <dbl>, cerebvas <int>, assault <int>,
## #   external <int>, txp_pop <dbl>, opt <chr>, consent_law <chr>,
## #   consent_practice <chr>, consistent <chr>, ccode <chr>

Summaries

organdata %>% 
  summarize_all(funs(min, max), na.rm = TRUE)
## # A tibble: 1 x 42
##   country_min year_min   donors_min pop_min pop_dens_min gdp_min
##   <chr>       <date>          <dbl>   <dbl>        <dbl>   <dbl>
## 1 Australia   1991-01-01        5.2    3514        0.220   12917
## # … with 36 more variables: gdp_lag_min <dbl>, health_min <dbl>,
## #   health_lag_min <dbl>, pubhealth_min <dbl>, roads_min <dbl>,
## #   cerebvas_min <dbl>, assault_min <dbl>, external_min <dbl>,
## #   txp_pop_min <dbl>, world_min <chr>, opt_min <chr>,
## #   consent_law_min <chr>, consent_practice_min <chr>,
## #   consistent_min <chr>, ccode_min <chr>, country_max <chr>,
## #   year_max <date>, donors_max <dbl>, pop_max <dbl>, pop_dens_max <dbl>,
## #   gdp_max <dbl>, gdp_lag_max <dbl>, health_max <dbl>,
## #   health_lag_max <dbl>, pubhealth_max <dbl>, roads_max <dbl>,
## #   cerebvas_max <dbl>, assault_max <dbl>, external_max <dbl>,
## #   txp_pop_max <dbl>, world_max <chr>, opt_max <chr>,
## #   consent_law_max <chr>, consent_practice_max <chr>,
## #   consistent_max <chr>, ccode_max <chr>
organdata %>% 
  summarize_if(is.numeric, funs(min, max), na.rm = TRUE)
## # A tibble: 1 x 26
##   donors_min pop_min pop_dens_min gdp_min gdp_lag_min health_min
##        <dbl>   <dbl>        <dbl>   <dbl>       <dbl>      <dbl>
## 1        5.2    3514        0.220   12917       11434        791
## # … with 20 more variables: health_lag_min <dbl>, pubhealth_min <dbl>,
## #   roads_min <dbl>, cerebvas_min <dbl>, assault_min <dbl>,
## #   external_min <dbl>, txp_pop_min <dbl>, donors_max <dbl>,
## #   pop_max <dbl>, pop_dens_max <dbl>, gdp_max <dbl>, gdp_lag_max <dbl>,
## #   health_max <dbl>, health_lag_max <dbl>, pubhealth_max <dbl>,
## #   roads_max <dbl>, cerebvas_max <dbl>, assault_max <dbl>,
## #   external_max <dbl>, txp_pop_max <dbl>
organdata %>%
  group_by(country) %>%
  summarize_if(is.numeric, funs(min, max), na.rm = TRUE)
## # A tibble: 17 x 27
##    country donors_min pop_min pop_dens_min gdp_min gdp_lag_min health_min
##    <chr>        <dbl>   <dbl>        <dbl>   <dbl>       <dbl>      <dbl>
##  1 Austra…       8.67   17065        0.220   16774       16591       1300
##  2 Austria      19.5     7678        9.16    18914       17425       1344
##  3 Belgium      19.4     9967       30.1     18008       16848       1340
##  4 Canada       12.6    27701        0.278   19044       18699       1714
##  5 Denmark      11       5141       11.9     18285       17460       1554
##  6 Finland      16.3     4986        1.47    16943       16943       1390
##  7 France       15      56709       10.3     18162       17113       1555
##  8 Germany      12.2    63254       17.7     17511       17511       1729
##  9 Ireland      16.8     3514        5.00    12917       11434        791
## 10 Italy         5.2    56719       18.8     17430       16525       1397
## 11 Nether…      10.9    14952       36.0     17707       16580       1419
## 12 Norway       13.7     4242        1.31    17905       16942       1385
## 13 Spain        20.2    38850        7.68    12971       12051        865
## 14 Sweden       10.9     8559        1.90    18660       17915       1552
## 15 Switze…      10.4     6712       16.3     24648       23009       2040
## 16 United…      12.1    57238       23.6     16228       15804        977
## 17 United…      17.6   249623        2.59    23038       22039       2738
## # … with 20 more variables: health_lag_min <dbl>, pubhealth_min <dbl>,
## #   roads_min <dbl>, cerebvas_min <dbl>, assault_min <dbl>,
## #   external_min <dbl>, txp_pop_min <dbl>, donors_max <dbl>,
## #   pop_max <dbl>, pop_dens_max <dbl>, gdp_max <dbl>, gdp_lag_max <dbl>,
## #   health_max <dbl>, health_lag_max <dbl>, pubhealth_max <dbl>,
## #   roads_max <dbl>, cerebvas_max <dbl>, assault_max <dbl>,
## #   external_max <dbl>, txp_pop_max <dbl>

Scoped filtering

organdata %>% 
  filter_all(any_vars(is.na(.)))
## # A tibble: 73 x 21
##    country year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Austra… NA           NA   17065    0.220 16774   16591   1300       1224
##  2 Austra… NA           NA      NA   NA        NA   28168   2754       2629
##  3 Austria NA           NA    7678    9.16  18914   17425   1344       1255
##  4 Austria NA           NA      NA   NA        NA   28842   2266       2220
##  5 Belgium NA           NA    9967   30.1   18008   16848   1340       1246
##  6 Belgium 1991-01-01   21   10005   30.2   18796   18008   1460       1340
##  7 Belgium 1992-01-01   20.6 10045   30.3   19444   18796   1547       1460
##  8 Belgium 1993-01-01   21   10085   30.5   19733   19444   1595       1547
##  9 Belgium NA           NA      NA   NA        NA   27652   2589       2515
## 10 Canada  NA           NA   27701    0.278 19044   18699   1714       1600
## # … with 63 more rows, and 12 more variables: pubhealth <dbl>,
## #   roads <dbl>, cerebvas <int>, assault <int>, external <int>,
## #   txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## #   consent_practice <chr>, consistent <chr>, ccode <chr>

Rowwise operations

iris %>% 
     select(contains("Length")) %>%
     rowwise() %>%
     mutate(avg_length = 
              mean(c(Petal.Length, Sepal.Length)))
## Source: local data frame [150 x 3]
## Groups: <by row>
## 
## # A tibble: 150 x 3
##    Sepal.Length Petal.Length avg_length
##           <dbl>        <dbl>      <dbl>
##  1          5.1          1.4       3.25
##  2          4.9          1.4       3.15
##  3          4.7          1.3       3   
##  4          4.6          1.5       3.05
##  5          5            1.4       3.2 
##  6          5.4          1.7       3.55
##  7          4.6          1.4       3   
##  8          5            1.5       3.25
##  9          4.4          1.4       2.9 
## 10          4.9          1.5       3.2 
## # … with 140 more rows