Randomization Tests

Dr. Mine Dogucu

1 / 21

library(openintro)
library(tidyverse)
library(janitor)
# Preview the data: one row per personnel file, with its gender and decision.
glimpse(gender_discrimination)

## Rows: 48
## Columns: 2
## $ gender   <fct> male, male, male, male, male, male, male, male, male, male, m…
## $ decision <fct> promoted, promoted, promoted, promoted, promoted, promoted, p…

Example from the OpenIntro Introductory Statistics with Randomization and Simulation Book

2 / 21

# Two-way table of gender by promotion decision, with a totals row appended.
gender_discrimination %>%
  tabyl(gender, decision) %>%
  adorn_totals(where = "row")

##  gender promoted not promoted
##    male       21            3
##  female       14           10
##   Total       35           13

3 / 21

Hypotheses

$H_{0} : \pi_{m} - \pi_{f} \leq 0$
$H_{A} : \pi_{m} - \pi_{f} > 0$

4 / 21

Sample Statistic

# Store the gender-by-decision counts (plus a totals row) for reuse when
# computing the sample statistic.
summary_table <- gender_discrimination %>%
  tabyl(gender, decision) %>%
  adorn_totals(where = "row")

5 / 21

Sample Statistic

# Print the stored gender-by-decision table.
summary_table

##  gender promoted not promoted
##    male       21            3
##  female       14           10
##   Total       35           13

# Group sizes are read from the data instead of hard-coding 24.
n_male <- sum(gender_discrimination$gender == "male")
n_female <- sum(gender_discrimination$gender == "female")
# Row 1 is male, row 2 is female; column 2 holds the "promoted" counts.
p_m <- summary_table[1, 2] / n_male
p_f <- summary_table[2, 2] / n_female
# Difference in promotion proportions (male minus female).
p_m - p_f

## [1] 0.2916667

6 / 21

# Keep the observed difference in promotion proportions (male minus female)
# so the simulated differences can be compared against it later.
observed_diff_p <- p_m - p_f

7 / 21

Can this observed difference in promotion rates (0.2916667) be due to chance rather than gender discrimination?

8 / 21

Steps

1. Shuffle the 48 personnel files.
2. Deal the 48 files into two stacks. Stack 1 will have 35 files that represent the promoted files. Stack 2 will have 13 files that represent the files that are not promoted.
3. Calculate the difference in promotion rates of males and females.
4. Repeat this process multiple times.

9 / 21

# Print the original data (gender and decision columns only).
gender_discrimination

## # A tibble: 48 x 2
##    gender decision
##    <fct>  <fct>   
##  1 male   promoted
##  2 male   promoted
##  3 male   promoted
##  4 male   promoted
##  5 male   promoted
##  6 male   promoted
##  7 male   promoted
##  8 male   promoted
##  9 male   promoted
## 10 male   promoted
## # … with 38 more rows

10 / 21

set.seed(12345)
# Shuffle the decisions once and keep the permutation as a new column; the
# gender column is left in its original order.
gender_discrimination <- gender_discrimination %>%
  mutate(simulated_decision = sample(decision))

11 / 21

# Print the data again; it now also carries the shuffled simulated_decision.
gender_discrimination

## # A tibble: 48 x 3
##    gender decision simulated_decision
##    <fct>  <fct>    <fct>             
##  1 male   promoted promoted          
##  2 male   promoted promoted          
##  3 male   promoted promoted          
##  4 male   promoted promoted          
##  5 male   promoted not promoted      
##  6 male   promoted promoted          
##  7 male   promoted promoted          
##  8 male   promoted promoted          
##  9 male   promoted not promoted      
## 10 male   promoted promoted          
## # … with 38 more rows

12 / 21

# Two-way table of gender by the shuffled decision, with a totals row appended.
gender_discrimination %>%
  tabyl(gender, simulated_decision) %>%
  adorn_totals(where = "row")

##  gender promoted not promoted
##    male       18            6
##  female       17            7
##   Total       35           13

13 / 21

# Re-tabulate using the shuffled decisions, then compute the difference in
# promotion proportions the same way as for the observed data.
summary_table <- gender_discrimination %>%
  tabyl(gender, simulated_decision) %>%
  adorn_totals("row")
# Group sizes are read from the data instead of hard-coding 24.
n_male <- sum(gender_discrimination$gender == "male")
n_female <- sum(gender_discrimination$gender == "female")
# Row 1 is male, row 2 is female; column 2 holds the "promoted" counts.
p_m <- summary_table[1, 2] / n_male
p_f <- summary_table[2, 2] / n_female
p_m - p_f

## [1] 0.04166667

14 / 21

set.seed(12345)
# Two demonstration shuffles: print each permuted decision vector, followed by
# its replicate number repeated once per observation.
for (i in 1:2) {
  simulated_decision <- sample(gender_discrimination$decision)
  print(simulated_decision)
  # Named rep_id rather than rep so the base function rep() is not masked.
  rep_id <- rep(i, length(simulated_decision))
  print(rep_id)
}

##  [1] promoted     promoted     promoted     promoted     not promoted
##  [6] promoted     promoted     promoted     not promoted promoted    
## [11] not promoted not promoted promoted     promoted     promoted    
## [16] not promoted promoted     promoted     promoted     promoted    
## [21] promoted     promoted     promoted     not promoted promoted    
## [26] promoted     not promoted promoted     promoted     promoted    
## [31] not promoted promoted     promoted     not promoted promoted    
## [36] promoted     promoted     not promoted promoted     promoted    
## [41] not promoted not promoted promoted     not promoted promoted    
## [46] promoted     promoted     promoted    
## Levels: promoted not promoted
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1
##  [1] promoted     promoted     not promoted promoted     promoted    
##  [6] promoted     promoted     not promoted promoted     promoted    
## [11] promoted     promoted     not promoted not promoted not promoted
## [16] promoted     not promoted promoted     promoted     promoted    
## [21] promoted     promoted     promoted     not promoted not promoted
## [26] promoted     promoted     promoted     promoted     not promoted
## [31] promoted     not promoted promoted     promoted     not promoted
## [36] promoted     promoted     promoted     not promoted promoted    
## [41] not promoted promoted     promoted     promoted     promoted    
## [46] promoted     promoted     promoted    
## Levels: promoted not promoted
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2

15 / 21

set.seed(12345)
# Two demonstration shuffles: print each permuted decision vector, followed by
# its replicate number repeated once per observation.
for (i in 1:2) {
  simulated_decision <- sample(gender_discrimination$decision)
  print(simulated_decision)
  # Named rep_id rather than rep so the base function rep() is not masked.
  rep_id <- rep(i, length(simulated_decision))
  print(rep_id)
}

##  [1] promoted     promoted     promoted     promoted     not promoted
##  [6] promoted     promoted     promoted     not promoted promoted    
## [11] not promoted not promoted promoted     promoted     promoted    
## [16] not promoted promoted     promoted     promoted     promoted    
## [21] promoted     promoted     promoted     not promoted promoted    
## [26] promoted     not promoted promoted     promoted     promoted    
## [31] not promoted promoted     promoted     not promoted promoted    
## [36] promoted     promoted     not promoted promoted     promoted    
## [41] not promoted not promoted promoted     not promoted promoted    
## [46] promoted     promoted     promoted    
## Levels: promoted not promoted
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1
##  [1] promoted     promoted     not promoted promoted     promoted    
##  [6] promoted     promoted     not promoted promoted     promoted    
## [11] promoted     promoted     not promoted not promoted not promoted
## [16] promoted     not promoted promoted     promoted     promoted    
## [21] promoted     promoted     promoted     not promoted not promoted
## [26] promoted     promoted     promoted     promoted     not promoted
## [31] promoted     not promoted promoted     promoted     not promoted
## [36] promoted     promoted     promoted     not promoted promoted    
## [41] not promoted promoted     promoted     promoted     promoted    
## [46] promoted     promoted     promoted    
## Levels: promoted not promoted
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2

16 / 21

set.seed(12345)
# Number of shuffles, and the group sizes used as denominators.
n_reps <- 100
n_male <- sum(gender_discrimination$gender == "male")
n_female <- sum(gender_discrimination$gender == "female")
# Preallocate the result vector instead of growing it with c() each iteration.
prop_diff <- numeric(n_reps)
for (i in seq_len(n_reps)) {
  # Shuffle the decisions; gender stays in its original order.
  gender_discrimination$simulated_decision <-
    sample(gender_discrimination$decision)
  summary_table <- gender_discrimination %>%
    tabyl(gender, simulated_decision) %>%
    adorn_totals("row")
  # Row 1 is male, row 2 is female; column 2 holds the "promoted" counts.
  p_m <- summary_table[1, 2] / n_male
  p_f <- summary_table[2, 2] / n_female
  prop_diff[i] <- p_m - p_f
}

17 / 21

# Print the simulated differences in promotion proportions, one per shuffle.
prop_diff

##   [1]  0.04166667 -0.04166667  0.20833333  0.04166667 -0.12500000 -0.04166667
##   [7] -0.12500000  0.12500000  0.12500000  0.04166667  0.04166667  0.04166667
##  [13] -0.04166667  0.20833333  0.12500000 -0.12500000  0.04166667  0.12500000
##  [19]  0.12500000  0.12500000  0.04166667 -0.04166667  0.04166667  0.12500000
##  [25] -0.12500000 -0.12500000  0.12500000  0.12500000 -0.12500000 -0.04166667
##  [31] -0.04166667  0.04166667  0.04166667  0.12500000  0.12500000 -0.04166667
##  [37]  0.12500000  0.29166667  0.04166667 -0.12500000  0.04166667 -0.04166667
##  [43] -0.12500000  0.20833333  0.29166667  0.04166667 -0.20833333  0.12500000
##  [49]  0.12500000  0.04166667 -0.04166667 -0.04166667 -0.04166667  0.04166667
##  [55] -0.20833333 -0.04166667  0.12500000 -0.04166667 -0.04166667 -0.12500000
##  [61] -0.12500000  0.20833333 -0.20833333  0.04166667 -0.12500000  0.04166667
##  [67] -0.12500000  0.12500000 -0.04166667  0.04166667 -0.04166667  0.04166667
##  [73]  0.20833333  0.04166667 -0.29166667 -0.12500000  0.04166667  0.04166667
##  [79]  0.04166667  0.29166667 -0.04166667 -0.04166667  0.12500000 -0.04166667
##  [85] -0.04166667 -0.04166667  0.04166667 -0.12500000  0.04166667 -0.04166667
##  [91] -0.12500000 -0.12500000 -0.04166667 -0.04166667  0.12500000 -0.04166667
##  [97]  0.20833333 -0.12500000 -0.29166667 -0.20833333

18 / 21

# Histogram of the simulated differences in promotion proportions.
hist(prop_diff)

19 / 21

p-value

# Indices of the shuffles whose difference is at least as large as the
# observed difference.
which(prop_diff >= observed_diff_p)

## [1] 38 45 80

20 / 21

p-value

# Indices of the shuffles whose difference is at least as large as the
# observed difference.
which(prop_diff >= observed_diff_p)

## [1] 38 45 80

# p-value: the proportion of simulated differences at least as large as the
# observed one. mean() of a logical vector gives that proportion directly, so
# the number of shuffles is not hard-coded.
mean(prop_diff >= observed_diff_p)

## [1] 0.03

↑, ←, Pg Up, k	Go to previous slide
↓, →, Pg Dn, Space, j	Go to next slide
Home	Go to first slide
End	Go to last slide
Number + Return	Go to specific slide
b / m / f	Toggle blackout / mirrored / fullscreen mode
c	Clone slideshow
p	Toggle presenter mode
t	Restart the presentation timer
?, h	Toggle this help

Randomization Tests

Dr. Mine Dogucu

Hypotheses

Sample Statistic

Sample Statistic

Steps

p-value

p-value

Help