---
title: "Should we redefine statistical significance?"
subtitle: "RSS international conference // Cardiff, UK"
author: "Richard D. Morey"
date: "5 September 2018"
output:
  ioslides_presentation:
    dev: png
    logo: img/cardiff_logo_sq.png
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, dpi = 300)
quartzFonts(avenir = c("Avenir Book", "Avenir Black", "Avenir Book Oblique", "Avenir Black Oblique"))

# Shade the area under a curve between `from` and `to`.
# `expr` is a function name, or a call/expression in `x` (interface mirrors curve()).
fill.under = function(expr, from, to, n = 101, ...){
  xname = "x"
  sexpr <- substitute(expr)
  if (is.name(sexpr)) {
    expr <- call(as.character(sexpr), as.name(xname))
  } else {
    if (!((is.call(sexpr) || is.expression(sexpr)) && xname %in% all.vars(sexpr)))
      stop(gettextf("'expr' must be a function, or a call or an expression containing '%s'",
                    xname), domain = NA)
    expr <- sexpr
  }
  x = seq(from, to, len = n)
  ll <- list(x = x)
  y <- eval(expr, envir = ll, enclos = parent.frame())
  polygon(c(x, rev(x)), c(y, y*0), ...)
}

# Bayes factors against a point null for an observed z statistic, under a point
# alternative at effect size delta: on the side consistent with the data (Bc),
# on the inconsistent side (Bi), and averaged over both sides (B2).
Bc = Vectorize(function(z, delta){
  exp(-delta^2/2 + abs(z*delta))
}, c("z","delta"))
Bi = Vectorize(function(z, delta){
  exp(-delta^2/2 - abs(z*delta))
}, c("z","delta"))
B2 = Vectorize(function(z, delta){
  (Bi(z,delta) + Bc(z,delta))/2
}, c("z","delta"))

# One-sided p value whose best-case Bayes factor bound equals b;
# numerically solves b = (2 - p) / (-e * p * log(p)) for p in (0, 1/e).
find.p.one = function(b){
  lnb = log(b)
  optimize(function(p){
    (lnb - log(2-p) + 1 + log(p) + log(-log(p)))^2
  }, interval = c(0, 1/exp(1)))$minimum
}
```

## Science and statistics

Statistics: contextual problem solving

* Research design
* Summary
* Visualization
* Fallacies/"paradoxes"

## When all you've got is a hammer

### "Null hypothesis significance testing"

* Bastardization of classical significance testing logic
* Focus on single hypothesis
* Focus on single "effect"
* Focus on single criterion for "existence"
* Statistical rejection mistaken for theoretical support

## A replication crisis?

Newly popular: Replication attempts

* Reproducibility Project: Psychology
* "Many labs" projects
* Large-scale replications by skeptics

### The hammer strikes again

* Single effects
* Primary outcome: "Is replication 'significant'?"

### Many replications "fail" (!)

## Redefine statistical significance (RS)?

## RS primary arguments

### "We propose to change the default P-value threshold for statistical significance from 0.05 to 0.005 for claims of new discoveries."

1. **$p\approx.05$ represents "weak [Bayesian] evidence"**
2. $p<.05$ inflates "false discovery rate"
3. $p<.05$ results less likely to "replicate" than $p<.005$

### ➡ Using $p<.05$ "leading cause of non-reproducibility"
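
Argument 1 rests on calibrations like the standard upper bound $1/(-e\,p\log p)$ on the Bayes factor against a point null implied by a two-sided $p < 1/e$; a minimal sketch of that bound at the two thresholds:

```{r echo=TRUE}
# Upper bound on the BF against a point null implied by a two-sided p value;
# about 2.5 at p = .05 ("weak evidence") and about 13.9 at p = .005
bf_bound = function(p) 1 / (-exp(1) * p * log(p))
bf_bound(c(.05, .005))
```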

## Counter-arguments

### The RS proposal...

* ...encourages contextless, simplistic view of "discovery"
* ...is based on restrictive, unrealistic prior assumptions
* ...exaggerates role of necessarily rare p values near .05

### Among others:

* Argument 2: "False discovery rate" is flawed (Mayo & Morey, in prep.)
* Argument 3: true for *all* $\alpha_2<\alpha_1$
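
Argument 3's pattern is easy to reproduce in a toy simulation; all of the settings below (true effects varying across studies as Normal(0, 0.5), one-sample z tests with n = 50, "replicates" meaning p < .05 in an exact repeat) are illustrative assumptions:

```{r echo=TRUE}
# Stricter original thresholds select studies with larger true effects,
# so replication rates rise for *any* alpha2 < alpha1, not just .005
set.seed(123)
nsim = 1e5; n = 50
delta = rnorm(nsim, 0, 0.5)            # true effect varies by study (assumption)
z_orig = rnorm(nsim, delta * sqrt(n))  # original study z statistics
z_rep  = rnorm(nsim, delta * sqrt(n))  # exact replications share each study's delta
p_orig = 2 * pnorm(-abs(z_orig))
p_rep  = 2 * pnorm(-abs(z_rep))
sapply(c(.05, .01, .005), function(a) mean(p_rep[p_orig < a] < .05))
```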

## Priors/models

```{r fig.width = 6, fig.height=5}
xx = seq(-4, 4, len = 500)
layout(matrix(c(1,4,2,4,2,5,3,5), 2, 4))
par(mar = c(4,1,.5,1), family = 'avenir')
yy = dnorm(xx)
plot.new()

## Dividing
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
fill.under(dnorm(x), 0, max(xx),
           col = rgb(0,.7,1,.7), border = NA)
mtext("Dividing", 1, 1, adj=.5, cex=1.7)
plot.new()

## Two-sided
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[1]), col = rgb(0,0,1,.7), cex = 2, adj = -.2)
text(par()$usr[2], par()$usr[4]/2, expression(H[1]), col = rgb(0,0,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), min(xx), max(xx),
           col = rgb(0,0,1,.7), border = NA)
mtext("Point vs 2-sided", 1, 1, adj=.5, cex=1.7)

## One-sided, with point
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)), xlim = range(xx))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), 0, max(xx),
           col = rgb(0,.7,1,.7), border = NA)
mtext("Point vs 1-sided", 1, 1, adj=.5, cex=1.7)
```

## Dividing, p value

```{r}
# Scenario: one-sample z test of mu0 = 100, sigma known
N = 250
z = -1.53
#z = -1.4075
sigma = 15
mu0 = 100
se = sigma / sqrt(N)
```

```{r fig.width=6,fig.height=4}
pval1 = pnorm(z)           # one-sided p value
pval2 = 2*pnorm(-abs(z))   # two-sided p value
xbar = mu0 + z*sigma/sqrt(N)

# Bayesian: conjugate normal prior on mu; precision-weighted update
mu_mu = mu0
sigma_mu = 5
precisions = c(data = 1/se^2, prior = 1/sigma_mu^2)
means = c(data = xbar, prior = mu_mu)
posterior_precision = sum(precisions)
posterior_sd = 1/sqrt(posterior_precision)
posterior_mean = sum(precisions * means) / posterior_precision
pr_lt_mu0 = pnorm(mu0, posterior_mean, posterior_sd)
odds_lt_mu0 = pr_lt_mu0 / (1 - pr_lt_mu0)

xx = seq(-5, 5, len = 500)*se + mu0
yy = dnorm(xx, mu0, se)
par(family = 'avenir', cex.lab = 1.3)
plot(xx, yy, ty='l', col = "purple", lwd=2, ylab = "Density", axes=FALSE, xlab = expression(bar(X)), xaxs="i", yaxs="i", xpd = TRUE)
axis(1)
abline(v = xbar, lty=2)
abline(v = mu0+(mu0-xbar), col = "gray", lty=2)
text(xbar, par()$usr[4],
     substitute(paste(bar(x)==x0), list(x0=round(xbar,2))),
     srt = 90, adj = c(1.1, -.1), cex = 1.2)
fill.under(dnorm(x, mu0, se), min(xx), xbar,
           col = rgb(1,0,1,.7), border = NA)
fill.under(dnorm(x, mu0, se), mu0+(mu0-xbar), max(xx),
           col = rgb(1,0,1,.2), border = NA)
text(par()$usr[2], par()$usr[4] / 3,
     "Sampling distribution", col = "purple",
     adj = 1, cex = 1, xpd = TRUE)
text(par()$usr[2], par()$usr[4] / 4,
     expression(mu==100), col = "purple",
     adj = 1, cex = 1, xpd = TRUE)
#mtext(expression(mu==100), 3, .1, adj=.5, col="purple", cex = 1.2)
```

$p = `r round(pval2, 2)`$

## Dividing, Bayes factor

```{r fig.width=6,fig.height=4}
par(family = 'avenir', cex.lab = 1.3)
xx = seq(-3, 3, len = 500)*sigma_mu + mu_mu
yy0 = dnorm(xx, mu_mu, sigma_mu)               # prior density
yy1 = dnorm(xx, posterior_mean, posterior_sd)  # posterior density
plot(xx, yy1, ty='l', col = "darkblue", lwd=2, ylab = "Density", axes=FALSE, xlab = expression(mu), xaxs="i", yaxs="i", xpd = TRUE)
lines(xx, yy0, col = "red", lwd = 2, lty=2)
axis(1)
abline(v = mu0, lty = 3)
text(mu0, par()$usr[4],
     substitute(paste(mu==x0), list(x0=mu0)),
     srt = 90, adj = c(1.1, 1.1), cex = 1.2)
fill.under(dnorm(x, mu_mu, sigma_mu), mu0, qnorm(.999, mu_mu, sigma_mu),
           col = rgb(1,0,0,.2), border = NA)
fill.under(dnorm(x, posterior_mean, posterior_sd), mu0, qnorm(.999, posterior_mean, posterior_sd),
           col = rgb(0,0,1,.5), border = NA)
text(posterior_mean, par()$usr[4],
     "Posterior", col = "darkblue",
     adj = 1.1, cex = 1.2, xpd = TRUE)
text(par()$usr[2], par()$usr[4] / 7,
     "Prior", col = "red",
     adj = 2, cex = 1.2, xpd = TRUE)
```

$BF_{np} = \left.\frac{`r round(pr_lt_mu0, 2)`}{1 - `r round(pr_lt_mu0, 2)`}\middle/\frac{1}{1}\right. = `r round(odds_lt_mu0, 2)`$
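
Because the dividing hypotheses get prior odds of 1, the Bayes factor above is just the posterior odds. A sketch of the connection to the p value: in the diffuse-prior limit, the posterior probability that $\mu < \mu_0$ tends to one minus the one-sided p value, so $BF_{np} \approx (1-p)/p$:

```{r echo=TRUE}
# Diffuse-prior limit: posterior Pr(mu < mu0) -> 1 - (one-sided p value),
# so the dividing BF is about (1 - p)/p; compare odds_lt_mu0 above (~14)
(1 - pval1) / pval1
```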

## Two-sided, point null: BF and p

## One-sided, point null: Bayes factors

* Idea: choose *half* the prior for a one-sided test
* Bayesian can choose side consistent with data
* Evidential BF "boost": approx. $2 - p$ (for smallish $p$); see the sketch after the figure

```{r fig.width = 6, fig.height = 3}
par(family = 'avenir', cex.lab = 1.3)
xx = seq(-4, 0, len = 500)
yy = dnorm(xx)
## One-sided alternative, with point null
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)), xlim = c(-4,4))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
#text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
#fill.under(dnorm(x), 0, max(xx),
#           col = rgb(0,.7,1,.7), border = NA)
mtext("Point vs 1-sided", 1, 1, adj=.5, cex=1.7)
```
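
A quick check on the size of the "boost", using the `Bc()` and `B2()` helpers from the setup chunk: the ratio of the data-consistent one-sided BF to the two-sided BF is $2/(1 + B_i/B_c)$, never more than 2. The $z$ values and the choice $\delta = z$ below are illustrative:

```{r echo=TRUE}
# Boost from choosing the data-consistent half of a symmetric point prior:
# Bc/B2 = 2/(1 + Bi/Bc), which approaches its maximum of 2 as |z| grows
z = c(1.5, 1.96, 2.58)
delta = z   # illustrative: point alternative at the observed z
round(Bc(z, delta) / B2(z, delta), 3)
```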

## One-sided, point null: BF and p

## *Best-case* Bayesian argument?

* Point null may be appropriate for some problems
* No reason for Bayesians to use two-sided hypotheses!
* So: $p<.03$ or $p<.02$ more appropriate calibration (see the sketch below)

### But...are these p values that common without cheating?
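
A sketch of that calibration, using `find.p.one()` from the setup chunk (it inverts the best-case one-sided bound $(2-p)/(-e\,p\log p)$); the Bayes factor thresholds 6 and 10 are illustrative choices:

```{r echo=TRUE}
# One-sided p values whose best-case BF bound equals 6 and 10
# (illustrative thresholds): roughly .036 and .018, well above .005
round(sapply(c(6, 10), find.p.one), 3)
```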

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
# Pr(.02 < p < .05) for a noncentral F(1, N - 1) test, by N and true effect size d
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); #axis(2)
  },
  #ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis
  #main = paste0("Pr(", critp, " < p < .05)")
)
```

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); axis(2)
  },
  ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis
  #main = paste0("Pr(", critp, " < p < .05)")
)
```

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2, cex.main = 1.4)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); axis(2)
  },
  ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis,
  main = paste0("Pr(", critp, " < p < .05)"))
```

## These p values are rare!

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2, cex.main = 1.4)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    contour(N, d, ps, levels = seq(0, max.p, .05),
            drawlabels = TRUE, axes = FALSE,
            frame.plot = FALSE, add = TRUE)
    axis(1); axis(2)
  },
  ylab = "True standardized effect size", xlab = "Sample size",
  color.palette = viridis::viridis,
  main = paste0("Pr(", critp, " < p < .05)"))
```

## RS's arguments fail

* "Discovery" is more than just a significant p value
* A new criterion is unlikely to help the situation
* No general calibration of p values to Bayes factors
    * Sometimes p will "appear" to be more lenient...
    * ...sometimes BF will "appear" to be more lenient
* Unlikely that marginal p values have helped cause the crisis
    * Best-case Bayesian argument: weakly evidential p's are rare

### Thank you.