%pylab inline
from scipy.special import comb
from scipy.stats import norm
import scipy.stats as stats
def underlying_dist(bins, loc=15, scale=5, size=100000):
    """Plot a histogram of draws from the underlying tumor-size distribution.

    Parameters
    ----------
    bins : int
        Number of histogram bins.
    loc, scale : float, optional
        Mean and standard deviation of the normal distribution
        (defaults 15 and 5, matching the rest of the notebook).
    size : int, optional
        Number of random draws (default 100000).
    """
    fig, ax = subplots(ncols=1, sharey=True)  # `subplots` comes from %pylab
    fig.set_figwidth(5)
    samples = norm.rvs(loc=loc, scale=scale, size=size)
    ax.hist(samples, bins=bins)
def measure_tumor(group):
    """Simulate one control-vs-drug experiment (3 animals each) and scatter-plot it.

    Both groups are drawn from the SAME Normal(15, 5) distribution, i.e. the
    drug has no real effect; any apparent difference is sampling noise.

    Parameters
    ----------
    group : str
        Label for the treatment group on the x axis (e.g. 'drug1').
    """
    loc, scale = 15, 5
    fig, ax = subplots(ncols=1, sharey=True)  # `subplots` comes from %pylab
    fig.set_figwidth(5)
    control = norm.rvs(loc=loc, scale=scale, size=3)
    drug = norm.rvs(loc=loc, scale=scale, size=3)
    ax.scatter([0, 0, 0], control)
    ax.scatter([1, 1, 1], drug)
    ax.set_ylim(0, 25)
    ax.set_xlim(-1, 2)
    ax.set_ylabel('Tumor size', fontsize=18)
    ax.set_xlabel('Treatment', fontsize=18)
    # Fix the tick positions before assigning labels; setting labels alone
    # triggered the "FixedFormatter should only be used together with
    # FixedLocator" warning seen in the notebook output.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['control', group], fontsize=18)
def measure_many_tumor():
    """Repeat the no-effect control/drug experiment 100 times on a 10x10 grid.

    Every panel draws both groups from the same Normal(15, 5) distribution,
    so every 'significant' t-test among the results is a false positive.

    Returns
    -------
    list of float
        Two-sample t-test p-value for each of the 100 experiments.
    """
    loc, scale = 15, 5
    fig, axs = subplots(ncols=10, nrows=10, sharey=True, sharex=True)
    fig.set_figwidth(20)
    fig.set_figheight(20)
    axs = axs.flatten()
    pvals = []
    for ax in axs:
        control = norm.rvs(loc=loc, scale=scale, size=3)
        drug = norm.rvs(loc=loc, scale=scale, size=3)
        ax.scatter([0, 0, 0], control)
        ax.scatter([1, 1, 1], drug)
        ax.set_ylim(0, 25)
        ax.set_xlim(-1, 2)
        # Remove ticks entirely instead of assigning empty label strings,
        # which triggered the FixedFormatter/FixedLocator warning seen in
        # the notebook output.
        ax.set_xticks([])
        ax.set_yticks([])
        pvals.append(stats.ttest_ind(control, drug)[1])
    fig.text(0.5, 0.1, 'Treatment', ha='center', fontsize=18)
    fig.text(0.1, 0.5, 'Tumor size', va='center', rotation='vertical', fontsize=18)
    return pvals
Populating the interactive namespace from numpy and matplotlib
Your lab wants to develop a drug to reduce the size of tumors in a particular kind of cancer. In a screening fashion, your lab is exhaustively testing compounds to find out whether any of them work. So, using an animal model, you measured the size of the tumors from three specimens after a certain time without any drug and with each candidate drug.
# One simulated screen for drug1: both groups come from the same distribution.
measure_tumor('drug1')
<ipython-input-1-eec1c45e2606>:28: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(['','','control','',group,'',''], fontsize = 18)
# A second independent screen, this time labeled drug2.
measure_tumor('drug2')
<ipython-input-1-eec1c45e2606>:28: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(['','','control','',group,'',''], fontsize = 18)
That means that if we did 100 tests we would expect about 5 false positives (5 percent), and if we did 10,000 tests we would expect about 500 false positives. In other words, the more tests we do, the more false positives we have to deal with.
# Run 100 no-effect experiments and collect their t-test p-values.
pvals = measure_many_tumor()
<ipython-input-1-eec1c45e2606>:48: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_yticklabels(['',''], fontsize = 18) <ipython-input-1-eec1c45e2606>:49: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(['',''], fontsize = 18)
# Which of the 100 tests would be (falsely) declared significant at p < 0.05?
(np.array(pvals)<0.05)
array([False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False])
def running_p(alpha=0.05, max_batches=5000):
    """Simulate optional stopping ("peeking"): add data until p drops below alpha.

    Both groups are drawn from the same Normal(15, 5) distribution — the drug
    has no effect — so any run that stops early is a false positive.

    Parameters
    ----------
    alpha : float, optional
        Significance threshold used to stop the experiment (default 0.05).
    max_batches : int, optional
        Maximum number of 3-sample batches per group, including the initial
        one (default 5000).

    Returns
    -------
    list of float
        The running t-test p-value after each batch is added; the last entry
        is either below `alpha` or the value after `max_batches` batches.
    """
    loc, scale = 15, 5  # assume the drug has no effect on tumor!
    control_all = norm.rvs(loc=loc, scale=scale, size=3)
    drug_all = norm.rvs(loc=loc, scale=scale, size=3)
    p_values = [stats.ttest_ind(control_all, drug_all)[1]]
    K = 1
    # `and` (not bitwise `&`) — both operands are plain scalar comparisons.
    while p_values[-1] >= alpha and K < max_batches:
        control = norm.rvs(loc=loc, scale=scale, size=3)
        drug = norm.rvs(loc=loc, scale=scale, size=3)
        control_all = np.concatenate([control_all, control])
        drug_all = np.concatenate([drug_all, drug])
        p_values.append(stats.ttest_ind(control_all, drug_all)[1])
        K = K + 1
    return p_values
# Trace how the running p-value evolves as more data is added to one experiment.
plt.plot(running_p());
plt.xlabel('Number of new datasets added to the experiment',fontsize=12);
plt.ylabel('p-value',fontsize=12);
# takes a long time to simulate!
# Each run's length = number of batches added before p dipped below 0.05
# (or before the 5000-batch cap was reached).
experiment_length = [len(running_p()) for _ in range(100)]
print("The fraction of experiments that terminated before all datasets are used:")
print(sum(np.array(experiment_length) < 5000) / 100)
The fraction of experiments that terminated before all datasets are used: 0.6
Apparently, this approach will result in over 50% of experiments being falsely reported as positive (i.e., ineffective drugs being reported as effective!).
# Distribution of experiment lengths: most early-stopping false positives occur quickly.
plt.hist(experiment_length,bins=50);
plt.xlabel('Experiment length');
# Visualize the shared underlying distribution, then show empirically that
# under the null hypothesis the t-test p-value is uniform on [0, 1].
underlying_dist(bins=10000)
loc, scale = 15, 5
pvals = []
for _ in range(10000):
    sample_a = norm.rvs(loc=loc, scale=scale, size=3)
    sample_b = norm.rvs(loc=loc, scale=scale, size=3)
    pvals.append(stats.ttest_ind(sample_a, sample_b)[1])
plt.hist(pvals, bins=25);
Under the null hypothesis, the distribution of $p$ value is uniform over $[0,1]$. Thus, among $n$ independent tests, it's the smallest $p$ value that controls if you will see false-positives among the results.
Suppose the ground truth is that all tests are performed on data following their own null distributions (i.e., there is no significance whatsoever); what is the distribution of the smallest $p$ value? Suppose the significance threshold for each test is fixed at $\hat{p}$; then the probability of observing at least one false positive is
$$ \begin{align} &\mathbb{P}(\min(p_1,p_2,\cdots,p_n)\leq \hat{p})\\ =&1-\mathbb{P}(\text{All of } p_1,p_2,\cdots,p_n > \hat{p})\\ =&1-\prod_{i=1}^n \mathbb{P}(p_i > \hat{p})\\ =&1-(1-\hat{p})^n\\ \approx & 1-\exp(-n\hat{p}) \end{align} $$This tells you that as the number of tests $n$ goes up, the possibility that at least one test becomes false-positive rises exponentially fast to 1. To reduce this effect, one could choose $\hat{p}$ depending on the number of tests $n$, for instance, a popular correction method is to set
$$ \hat{p}=\frac{0.05}{n} $$Then, the probability that at least one test becomes false-positive is acceptably small:
$$ 1-\exp(-0.05)\approx 0.04877 $$With this correction (Bonferroni correction), if you have 1000 tests, then the significance threshold for each test will be as small as $0.00005$.