---
title: "Should we redefine statistical significance?"
subtitle: "RSS international conference // Cardiff, UK"
author: "Richard D. Morey"
date: "5 September 2018"
output:
  ioslides_presentation:
    dev: png
    logo: img/cardiff_logo_sq.png
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, dpi = 300)
quartzFonts(avenir = c("Avenir Book", "Avenir Black", "Avenir Book Oblique", "Avenir Black Oblique"))

# Shade the area under a curve between `from` and `to`.
# `expr` is a function name, or a call/expression in `x` (interface mirrors curve()).
fill.under = function(expr, from, to, n = 101, ...){
  xname = "x"
  sexpr <- substitute(expr)
  if (is.name(sexpr)) {
    expr <- call(as.character(sexpr), as.name(xname))
  } else {
    if (!((is.call(sexpr) || is.expression(sexpr)) && xname %in% all.vars(sexpr)))
      stop(gettextf("'expr' must be a function, or a call or an expression containing '%s'",
                    xname), domain = NA)
    expr <- sexpr
  }
  x = seq(from, to, len = n)
  ll <- list(x = x)
  y <- eval(expr, envir = ll, enclos = parent.frame())
  polygon(c(x, rev(x)), c(y, y*0), ...)
}

# Bayes factors against a point null for an observed z statistic, under a point
# alternative at effect size delta: on the side consistent with the data (Bc),
# on the inconsistent side (Bi), and averaged over both sides (B2).
Bc = Vectorize(function(z, delta){
  exp(-delta^2/2 + abs(z*delta))
}, c("z","delta"))
Bi = Vectorize(function(z, delta){
  exp(-delta^2/2 - abs(z*delta))
}, c("z","delta"))
B2 = Vectorize(function(z, delta){
  (Bi(z,delta) + Bc(z,delta))/2
}, c("z","delta"))

# One-sided p value whose best-case Bayes factor bound equals b;
# numerically solves b = (2 - p) / (-e * p * log(p)) for p in (0, 1/e).
find.p.one = function(b){
  lnb = log(b)
  optimize(function(p){
    (lnb - log(2-p) + 1 + log(p) + log(-log(p)))^2
  }, interval = c(0, 1/exp(1)))$minimum
}
```

## Science and statistics

Statistics: contextual problem solving

* Research design
* Summary
* Visualization
* Fallacies/"paradoxes"

## When all you've got is a hammer

### "Null hypothesis significance testing"

* Bastardization of classical significance testing logic
* Focus on single hypothesis
* Focus on single "effect"
* Focus on single criterion for "existence"
* Statistical rejection mistaken for theoretical support

## A replication crisis?

Newly popular: Replication attempts

* Reproducibility Project: Psychology
* "Many labs" projects
* Large-scale replications by skeptics

### The hammer strikes again

* Single effects
* Primary outcome: "Is replication 'significant'?"

### Many replications "fail" (!)

## Redefine statistical significance (RS)?

## RS primary arguments

### "We propose to change the default P-value threshold for statistical significance from 0.05 to 0.005 for claims of new discoveries."

1. **$p\approx.05$ represents "weak [Bayesian] evidence"**
2. $p<.05$ inflates "false discovery rate"
3. $p<.05$ results less likely to "replicate" than $p<.005$

### ➡ Using $p<.05$ "leading cause of non-reproducibility"
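
Argument 1 rests on calibrations like the standard upper bound $1/(-e\,p\log p)$ on the Bayes factor against a point null implied by a two-sided $p < 1/e$; a minimal sketch of that bound at the two thresholds:

```{r echo=TRUE}
# Upper bound on the BF against a point null implied by a two-sided p value;
# about 2.5 at p = .05 ("weak evidence") and about 13.9 at p = .005
bf_bound = function(p) 1 / (-exp(1) * p * log(p))
bf_bound(c(.05, .005))
```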

## Counter-arguments

### The RS proposal...

* ...encourages contextless, simplistic view of "discovery"
* ...is based on restrictive, unrealistic prior assumptions
* ...exaggerates role of necessarily rare p values near .05

### Among others:

* Argument 2: "False discovery rate" is flawed (Mayo & Morey, in prep.)
* Argument 3: true for *all* $\alpha_2<\alpha_1$
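
Argument 3's pattern is easy to reproduce in a toy simulation; all of the settings below (true effects varying across studies as Normal(0, 0.5), one-sample z tests with n = 50, "replicates" meaning p < .05 in an exact repeat) are illustrative assumptions:

```{r echo=TRUE}
# Stricter original thresholds select studies with larger true effects,
# so replication rates rise for *any* alpha2 < alpha1, not just .005
set.seed(123)
nsim = 1e5; n = 50
delta = rnorm(nsim, 0, 0.5)            # true effect varies by study (assumption)
z_orig = rnorm(nsim, delta * sqrt(n))  # original study z statistics
z_rep  = rnorm(nsim, delta * sqrt(n))  # exact replications share each study's delta
p_orig = 2 * pnorm(-abs(z_orig))
p_rep  = 2 * pnorm(-abs(z_rep))
sapply(c(.05, .01, .005), function(a) mean(p_rep[p_orig < a] < .05))
```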

## Priors/models

```{r fig.width = 6, fig.height=5}
xx = seq(-4, 4, len = 500)
layout(matrix(c(1,4,2,4,2,5,3,5), 2, 4))
par(mar = c(4,1,.5,1), family = 'avenir')
yy = dnorm(xx)
plot.new()

## Dividing
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
fill.under(dnorm(x), 0, max(xx),
           col = rgb(0,.7,1,.7), border = NA)
mtext("Dividing", 1, 1, adj=.5, cex=1.7)
plot.new()

## Two-sided
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[1]), col = rgb(0,0,1,.7), cex = 2, adj = -.2)
text(par()$usr[2], par()$usr[4]/2, expression(H[1]), col = rgb(0,0,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), min(xx), max(xx),
           col = rgb(0,0,1,.7), border = NA)
mtext("Point vs 2-sided", 1, 1, adj=.5, cex=1.7)

## One-sided, with point
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)), xlim = range(xx))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
fill.under(dnorm(x), 0, max(xx),
           col = rgb(0,.7,1,.7), border = NA)
mtext("Point vs 1-sided", 1, 1, adj=.5, cex=1.7)
```

## Dividing, p value

```{r}
# Scenario: one-sample z test of mu0 = 100, sigma known
N = 250
z = -1.53
#z = -1.4075
sigma = 15
mu0 = 100
se = sigma / sqrt(N)
```

```{r fig.width=6,fig.height=4}
pval1 = pnorm(z)           # one-sided p value
pval2 = 2*pnorm(-abs(z))   # two-sided p value
xbar = mu0 + z*sigma/sqrt(N)

# Bayesian: conjugate normal prior on mu; precision-weighted update
mu_mu = mu0
sigma_mu = 5
precisions = c(data = 1/se^2, prior = 1/sigma_mu^2)
means = c(data = xbar, prior = mu_mu)
posterior_precision = sum(precisions)
posterior_sd = 1/sqrt(posterior_precision)
posterior_mean = sum(precisions * means) / posterior_precision
pr_lt_mu0 = pnorm(mu0, posterior_mean, posterior_sd)
odds_lt_mu0 = pr_lt_mu0 / (1 - pr_lt_mu0)

xx = seq(-5, 5, len = 500)*se + mu0
yy = dnorm(xx, mu0, se)
par(family = 'avenir', cex.lab = 1.3)
plot(xx, yy, ty='l', col = "purple", lwd=2, ylab = "Density", axes=FALSE, xlab = expression(bar(X)), xaxs="i", yaxs="i", xpd = TRUE)
axis(1)
abline(v = xbar, lty=2)
abline(v = mu0+(mu0-xbar), col = "gray", lty=2)
text(xbar, par()$usr[4],
     substitute(paste(bar(x)==x0), list(x0=round(xbar,2))),
     srt = 90, adj = c(1.1, -.1), cex = 1.2)
fill.under(dnorm(x, mu0, se), min(xx), xbar,
           col = rgb(1,0,1,.7), border = NA)
fill.under(dnorm(x, mu0, se), mu0+(mu0-xbar), max(xx),
           col = rgb(1,0,1,.2), border = NA)
text(par()$usr[2], par()$usr[4] / 3,
     "Sampling distribution", col = "purple",
     adj = 1, cex = 1, xpd = TRUE)
text(par()$usr[2], par()$usr[4] / 4,
     expression(mu==100), col = "purple",
     adj = 1, cex = 1, xpd = TRUE)
#mtext(expression(mu==100), 3, .1, adj=.5, col="purple", cex = 1.2)
```

$p = `r round(pval2, 2)`$

## Dividing, Bayes factor

```{r fig.width=6,fig.height=4}
par(family = 'avenir', cex.lab = 1.3)
xx = seq(-3, 3, len = 500)*sigma_mu + mu_mu
yy0 = dnorm(xx, mu_mu, sigma_mu)               # prior density
yy1 = dnorm(xx, posterior_mean, posterior_sd)  # posterior density
plot(xx, yy1, ty='l', col = "darkblue", lwd=2, ylab = "Density", axes=FALSE, xlab = expression(mu), xaxs="i", yaxs="i", xpd = TRUE)
lines(xx, yy0, col = "red", lwd = 2, lty=2)
axis(1)
abline(v = mu0, lty = 3)
text(mu0, par()$usr[4],
     substitute(paste(mu==x0), list(x0=mu0)),
     srt = 90, adj = c(1.1, 1.1), cex = 1.2)
fill.under(dnorm(x, mu_mu, sigma_mu), mu0, qnorm(.999, mu_mu, sigma_mu),
           col = rgb(1,0,0,.2), border = NA)
fill.under(dnorm(x, posterior_mean, posterior_sd), mu0, qnorm(.999, posterior_mean, posterior_sd),
           col = rgb(0,0,1,.5), border = NA)
text(posterior_mean, par()$usr[4],
     "Posterior", col = "darkblue",
     adj = 1.1, cex = 1.2, xpd = TRUE)
text(par()$usr[2], par()$usr[4] / 7,
     "Prior", col = "red",
     adj = 2, cex = 1.2, xpd = TRUE)
```

$BF_{np} = \left.\frac{`r round(pr_lt_mu0, 2)`}{1 - `r round(pr_lt_mu0, 2)`}\middle/\frac{1}{1}\right. = `r round(odds_lt_mu0, 2)`$
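
Because the dividing hypotheses get prior odds of 1, the Bayes factor above is just the posterior odds. A sketch of the connection to the p value: in the diffuse-prior limit, the posterior probability that $\mu < \mu_0$ tends to one minus the one-sided p value, so $BF_{np} \approx (1-p)/p$:

```{r echo=TRUE}
# Diffuse-prior limit: posterior Pr(mu < mu0) -> 1 - (one-sided p value),
# so the dividing BF is about (1 - p)/p; compare odds_lt_mu0 above (~14)
(1 - pval1) / pval1
```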

## Two-sided, point null: BF and p

## One-sided, point null: Bayes factors

* Idea: choose *half* the prior for a one-sided test
* Bayesian can choose side consistent with data
* Evidential BF "boost": approx. $2 - p$ (for smallish $p$); see the sketch after the figure

```{r fig.width = 6, fig.height = 3}
par(family = 'avenir', cex.lab = 1.3)
xx = seq(-4, 0, len = 500)
yy = dnorm(xx)
## One-sided alternative, with point null
plot(xx, yy, axes=FALSE, ylab = "", xlab = "", ty='l', xaxs='i', yaxs='i', xpd=TRUE, lwd=3, ylim = c(0, 1.5*max(yy)), xlim = c(-4,4))
abline(h=0, lwd=3, col=rgb(0,0,0,.5))
#mtext(expression(mu), cex = 1.5, 1, 2)
segments(0, 0, 0, 1.3*max(yy), lwd=3, col = rgb(1,0,0,.7))
points(0, 1.3*max(yy), pch = 19, cex = 2, col = rgb(1,0,0,.7))
text(0, 1.3*max(yy), expression(H[0]), col = rgb(1,0,0,.7), cex = 2, adj = c(-.5,1))
text(par()$usr[1], par()$usr[4]/2, expression(H[n]), col = rgb(.5,0,1,.7), cex = 2, adj = -.2)
fill.under(dnorm(x), min(xx), 0,
           col = rgb(.5,0,1,.7), border = NA)
#text(par()$usr[2], par()$usr[4]/2, expression(H[p]), col = rgb(0,.7,1,.7), cex = 2, adj = 1.2)
#fill.under(dnorm(x), 0, max(xx),
#           col = rgb(0,.7,1,.7), border = NA)
mtext("Point vs 1-sided", 1, 1, adj=.5, cex=1.7)
```
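
A quick check on the size of the "boost", using the `Bc()` and `B2()` helpers from the setup chunk: the ratio of the data-consistent one-sided BF to the two-sided BF is $2/(1 + B_i/B_c)$, never more than 2. The $z$ values and the choice $\delta = z$ below are illustrative:

```{r echo=TRUE}
# Boost from choosing the data-consistent half of a symmetric point prior:
# Bc/B2 = 2/(1 + Bi/Bc), which approaches its maximum of 2 as |z| grows
z = c(1.5, 1.96, 2.58)
delta = z   # illustrative: point alternative at the observed z
round(Bc(z, delta) / B2(z, delta), 3)
```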

## One-sided, point null: BF and p

## *Best-case* Bayesian argument?

* Point null may be appropriate for some problems
* No reason for Bayesians to use two-sided hypotheses!
* So: $p<.03$ or $p<.02$ more appropriate calibration (see the sketch below)

### But...are these p values that common without cheating?
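
A sketch of that calibration, using `find.p.one()` from the setup chunk (it inverts the best-case one-sided bound $(2-p)/(-e\,p\log p)$); the Bayes factor thresholds 6 and 10 are illustrative choices:

```{r echo=TRUE}
# One-sided p values whose best-case BF bound equals 6 and 10
# (illustrative thresholds): roughly .036 and .018, well above .005
round(sapply(c(6, 10), find.p.one), 3)
```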

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
# Pr(.02 < p < .05) for a noncentral F(1, N - 1) test, by N and true effect size d
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); #axis(2)
  },
  #ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis
  #main = paste0("Pr(", critp, " < p < .05)")
)
```

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); axis(2)
  },
  ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis
  #main = paste0("Pr(", critp, " < p < .05)")
)
```

## How common are these p values?

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2, cex.main = 1.4)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    rect(par()$usr[1], par()$usr[3], par()$usr[2], par()$usr[4], col = "white", border = NA)
    #contour(N, d, ps, levels = seq(0, max.p, .05),
    #        drawlabels = TRUE, axes = FALSE,
    #        frame.plot = FALSE, add = TRUE);
    axis(1); axis(2)
  },
  ylab = "True standardized effect size",
  xlab = "Sample size", color.palette = viridis::viridis,
  main = paste0("Pr(", critp, " < p < .05)"))
```

## These p values are rare!

```{r fig.width = 6, fig.height = 5}
critp = .02
N = seq(20, 120, len = 50)
d = seq(0, 1, len = 50)
ps = outer(N, d, function(N, d){
  critf05 = qf(.95, 1, N - 1)
  critf02 = qf(1 - critp, 1, N - 1)
  pf(critf02, 1, N - 1, d^2*N) - pf(critf05, 1, N - 1, d^2*N)
})
max.p = ceiling(max(ps)*20)/20
par(family = 'avenir', cex.lab = 1.3, cex.axis = 1.2, cex.main = 1.4)
filled.contour(N, d, ps, levels = seq(0, max.p, .05), plot.axes = {
    contour(N, d, ps, levels = seq(0, max.p, .05),
            drawlabels = TRUE, axes = FALSE,
            frame.plot = FALSE, add = TRUE)
    axis(1); axis(2)
  },
  ylab = "True standardized effect size", xlab = "Sample size",
  color.palette = viridis::viridis,
  main = paste0("Pr(", critp, " < p < .05)"))
```

## RS's arguments fail

* "Discovery" is more than just a significant p value
* A new criterion is unlikely to help the situation
* No general calibration of p values to Bayes factors
    * Sometimes p will "appear" to be more lenient...
    * ...sometimes BF will "appear" to be more lenient
* Unlikely that marginal p values have helped cause the crisis
    * Best-case Bayesian argument: weakly evidential p's are rare

### Thank you.