%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Datapoints $(x,t)$ are believed to come from a straight line with Gaussian noise $\varepsilon \sim N(0, \sigma^2)$:
$$ t = w_0 + w_1 x + \varepsilon. $$
Now we want to infer the parameters $w_0, w_1$ and sample from their posterior.
# Simulate
np.random.seed(1)
w1 = 0.5
w0 = 3
sigma2 = 3
N = 100
err = np.random.normal(0, np.sqrt(sigma2), N)  # np.random.normal takes the standard deviation, not the variance
X = np.array(sorted(np.random.uniform(-10,10,N)))
T = w0 + X*w1 + err
plt.plot(X,T,".")
plt.xlabel('x')
plt.ylabel('t')
plt.show()
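For reference, with Gaussian noise the maximum-likelihood point estimate coincides with ordinary least squares; a quick sketch (not part of the pipeline below, `np.polyfit` returns the highest-degree coefficient first):
w1_ml, w0_ml = np.polyfit(X, T, deg=1)  # least-squares line fit
print("OLS estimate: w0 =", w0_ml, " w1 =", w1_ml)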
Assuming a uniform prior for $w_0$, $w_1$ from $-2$ to $5$ (the grid range used below):
$$
\begin{aligned}
P(D \mid w_0, w_1)\, P(w_0, w_1) &\propto
\begin{cases}
\prod_i \sqrt{\frac 1 {2 \pi \sigma^2}}\, e^{\frac {-(t_i-w_0-w_1 x_i)^2} {2 \sigma^2}} & \text{if } w_0, w_1 \in [-2, 5]\\
0 & \text{otherwise}
\end{cases}\\
P(D) &= \int_{-2}^{5} \int_{-2}^{5} \prod_i \sqrt{\frac 1 {2 \pi \sigma^2}}\, e^{\frac {-(t_i-w_0-w_1 x_i)^2} {2 \sigma^2}} \, dw_0\, dw_1
\end{aligned}
$$
If we wanted to pursue Bayesian hypothesis comparison, assuming $p(H) = \text{constant}$:
$$
\begin{aligned}
P(H_\text{linear} \mid D) &\propto \int_{-2}^{5} \int_{-2}^{5} \prod_i \sqrt{\frac 1 {2 \pi \sigma^2}}\, e^{\frac {-(t_i-w_0-w_1 x_i)^2} {2 \sigma^2}} \, dw_0\, dw_1\\
P(H_\text{linear} \mid D) &\propto \int_{-2}^{5} \int_{-2}^{5} e^{\sum_i \left( -\frac 1 2 \ln 2\pi \sigma^2 - \frac {(t_i-w_0-w_1 x_i)^2} {2\sigma^2} \right)} \, dw_0\, dw_1\\
\log P(H_\text{linear} \mid D) &\propto \log \int_{-2}^{5} \int_{-2}^{5} e^{\sum_i \left( -\frac 1 2 \ln 2\pi \sigma^2 - \frac {(t_i-w_0-w_1 x_i)^2} {2\sigma^2} \right)} \, dw_0\, dw_1
\end{aligned}
$$
etc.
def posterior(x, t, w0, w1, sigma2):
    logconst = -0.5*np.log(2*np.pi*sigma2)  # per-point log normalization constant of the Gaussian
    logprob = 0  # flat prior: contributes only a constant in log space
    logprob += (-(t - w0 - w1*x)*(t - w0 - w1*x)/2/sigma2 + logconst).sum()
    return np.exp(logprob)
nbin = 500
w0_min = -2
w0_max = +5
w1_min = -2
w1_max = +5
x0 = np.linspace(w0_min, w0_max, nbin)
x1 = np.linspace(w1_min, w1_max, nbin)
def posterior_pdf(X, T):
    h2 = []
    bsize_w0 = (w0_max - w0_min)/nbin  # x0[1]-x0[0]
    bsize_w1 = (w1_max - w1_min)/nbin
    for b0 in range(nbin):
        w0 = w0_min + b0*bsize_w0
        row = []
        for b1 in range(nbin):
            w1 = w1_min + b1*bsize_w1
            ev = posterior(X, T, w0, w1, sigma2) * bsize_w0 * bsize_w1
            row.append(ev)
        h2.append(row)
    # normalization: the grid now sums to 1
    h2 = np.array(h2)/np.sum(np.array(h2))
    return h2
pdf = posterior_pdf(X, T)
pdf.sum()
1.0
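The double loop above is slow, and the product of many small densities can underflow for larger $N$. A vectorized sketch (not part of the original pipeline, equivalent up to bin-edge conventions) that expands the squared residual into sufficient statistics and works in log space:
def posterior_pdf_fast(X, T):
    # evaluate the log-posterior on the whole grid at once
    W0, W1 = np.meshgrid(x0, x1, indexing='ij')
    Sx, St = X.sum(), T.sum()
    Sxx, Sxt, Stt = (X*X).sum(), (X*T).sum(), (T*T).sum()
    # sum_i (t_i - w0 - w1 x_i)^2 expanded via sufficient statistics
    sq = Stt - 2*W0*St - 2*W1*Sxt + 2*W0*W1*Sx + len(X)*W0**2 + W1**2*Sxx
    logp = -sq / (2*sigma2)
    logp -= logp.max()  # subtract the max before exp to avoid underflow
    p = np.exp(logp)
    return p / p.sum()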
Finding the grid cell with maximal probability is equivalent to $\arg\max_{(w_0,w_1)} P(w_0,w_1 \mid D)$, i.e. the MAP estimate.
w0, w1 = np.unravel_index(pdf.argmax(), pdf.shape)  # grid indices of the MAP estimate
w0,w1
(370, 182)
x0[w0], x1[w1]
(3.1903807615230457, 0.5531062124248498)
fig, axs = plt.subplots(ncols=2, nrows=1)
fig.set_figwidth(20)
fig.set_figheight(10)
pcm0 = axs[0].imshow(pdf, extent=[w1_min, w1_max, w0_max, w0_min])
pcm1 = axs[1].imshow(pdf[int(w0-30):int(w0+30), int(w1-30):int(w1+30)],
                     extent=[x1[w1-30], x1[w1+30], x0[w0+30], x0[w0-30]])
axs[0].axvline(x1[w1], c='r', ls='--')
axs[0].axhline(x0[w0], c='r', ls='--')
fig.colorbar(pcm0, ax=axs[0], shrink=0.35)  # each colorbar gets its own panel's mappable
fig.colorbar(pcm1, ax=axs[1], shrink=0.35)
axs[0].set_title('joint posterior')
axs[0].set_ylabel(r'$w_0$')
axs[0].set_xlabel(r'$w_1$')
axs[1].set_xlabel(r'$w_1$')
axs[1].set_title('joint posterior (zoomed in)')
Let's marginalize the posterior to get a distribution for each parameter.
Sampling $w_0$ and $w_1$ independently from these marginals amounts to a mean-field (independence) approximation of the joint posterior: it introduces some bias when the parameters are correlated, but it is a common simplifying assumption used to keep complex models tractable. A quick sanity check of this approximation follows the marginalization below.
# from scipy.stats.contingency import margins  # would compute the same marginals
def marginalization(pdf):
    # sum out the other parameter to obtain each marginal, then its CDF
    pdf_w0, pdf_w1 = pdf.sum(axis=1), pdf.sum(axis=0)
    cdf_w0 = np.cumsum(pdf_w0)
    cdf_w1 = np.cumsum(pdf_w1)
    return cdf_w0, cdf_w1, pdf_w0, pdf_w1
cdf_w0,cdf_w1, pdf_w0,pdf_w1 = marginalization(pdf)
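As a quick sanity check on the mean-field assumption, a sketch comparing the joint grid posterior with the outer product of its marginals (names and the check itself are additions, not part of the original):
mean_field = np.outer(pdf_w0, pdf_w1)  # independent product q(w0)q(w1)
tv = 0.5 * np.abs(pdf - mean_field).sum()  # total variation distance
print("TV distance, joint vs product of marginals:", tv)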
fig, axs = plt.subplots(ncols=2, nrows=2)
fig.set_figwidth(20)
fig.set_figheight(10)
axs[0, 0].plot(x0, pdf_w0)
axs[0, 1].plot(x1, pdf_w1)
axs[1, 0].plot(x0, cdf_w0)
axs[1, 1].plot(x1, cdf_w1)
axs[0, 0].set_title('$w_0$', fontsize=18)
axs[0, 1].set_title('$w_1$', fontsize=18)
def parameters_sample(cdf_w0, cdf_w1):
    # inverse-transform sampling: draw u ~ U(0,1) and invert the empirical CDF
    r = np.random.uniform(low=0.0, high=1.0, size=2)
    i = int(np.argwhere(cdf_w0 > r[0])[0] - 1)
    i = max(i, 0)
    w0_sampled = x0[i]
    i = int(np.argwhere(cdf_w1 > r[1])[0] - 1)
    i = max(i, 0)
    w1_sampled = x1[i]  # was x0[i]; only worked because both grids happen to coincide
    return w0_sampled, w1_sampled
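Equivalently (a sketch, up to off-by-one bin handling; not in the original), NumPy's searchsorted performs the same inverse-CDF lookup and vectorizes over many draws:
def parameters_sample_vec(cdf_w0, cdf_w1, n=5):
    r = np.random.uniform(size=(n, 2))
    i0 = np.clip(np.searchsorted(cdf_w0, r[:, 0]), 0, nbin - 1)
    i1 = np.clip(np.searchsorted(cdf_w1, r[:, 1]), 0, nbin - 1)
    return x0[i0], x1[i1]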
def plot_sample(X, T, cdf_w0, cdf_w1):
    plt.plot(X, T, ".")
    for _ in range(5):
        w0_sampled, w1_sampled = parameters_sample(cdf_w0, cdf_w1)
        print("w0: ", w0_sampled, " w1: ", w1_sampled)
        Y = [w0_sampled + w1_sampled * x for x in X]
        plt.plot(X, Y)
    plt.show()
plot_sample(X, T, cdf_w0, cdf_w1)
w0: 2.797595190380761 w1: 0.5390781563126255
w0: 3.148296593186373 w1: 0.5671342685370742
w0: 3.092184368737475 w1: 0.5811623246492985
w0: 3.2044088176352705 w1: 0.5110220440881763
w0: 3.3166332665330662 w1: 0.5531062124248498
plot_sample(X,T,cdf_w0,cdf_w1)
w0: 2.895791583166333 w1: 0.5390781563126255
w0: 3.260521042084169 w1: 0.5811623246492985
w0: 2.5871743486973946 w1: 0.5951903807615229
w0: 3.1202404809619235 w1: 0.5951903807615229
w0: 3.2184368737474953 w1: 0.5671342685370742
def sub_sampling(sub_idx):
    # rerun the grid-posterior + sampling pipeline on a subset of the data
    X1 = X[sub_idx]
    T1 = T[sub_idx]
    pdf1 = posterior_pdf(X1, T1)
    cdf_w0_1, cdf_w1_1, _, _ = marginalization(pdf1)
    plot_sample(X1, T1, cdf_w0_1, cdf_w1_1)
sub_idx = np.random.choice(N, 10)  # 10 random indices (drawn with replacement)
sub_sampling(sub_idx)
w0: 2.7835671342685373 w1: 0.7074148296593186
w0: 3.0360721442885774 w1: 0.6092184368737477
w0: 2.643286573146293 w1: 0.6372745490981964
w0: 2.19438877755511 w1: 0.623246492985972
w0: 2.222444889779559 w1: 0.6092184368737477
sub_idx = range(17,32)
sub_sampling(sub_idx)
w0: 2.1663326653306614 w1: 0.1603206412825653
w0: 4.719438877755511 w1: 0.42685370741482975
w0: -0.4288577154308617 w1: 0.1603206412825653
w0: 0.5110220440881763 w1: 0.5531062124248498
w0: 3.9058116232464934 w1: 0.6933867735470942
sub_idx = range(15)
sub_sampling(sub_idx)
w0: 3.1903807615230457 w1: 0.2585170340681362
w0: 0.5250501002004007 w1: 0.6653306613226455
w0: 2.4048096192384767 w1: 0.623246492985972
w0: 2.6713426853707416 w1: 0.5811623246492985
w0: 2.3486973947895793 w1: 0.6092184368737477
sub_idx = range(35,50)
sub_sampling(sub_idx)
w0: 4.158316633266534 w1: 0.8056112224448899
w0: 3.849699398797595 w1: 0.3426853707414832
w0: 3.9058116232464934 w1: 0.3707414829659319
w0: 4.915831663326653 w1: 0.5390781563126255
w0: 4.49498997995992 w1: 1.2124248496993988
sub_idx = list(range(7))+list(range(42,50))
sub_sampling(sub_idx)
w0: 4.438877755511022 w1: 0.6513026052104207
w0: 3.6392785571142285 w1: 0.623246492985972
w0: 3.9759519038076157 w1: 0.5671342685370742
w0: 4.312625250501002 w1: 0.5951903807615229
w0: 4.803607214428858 w1: 0.7494989979959921
From Wikipedia: "the Metropolis–Hastings algorithm is a Markov chain Monte Carlo (MCMC) method for obtaining a sequence of random samples from a probability distribution from which direct sampling is difficult" https://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm
In our case the full posterior is a 2D pdf that is hard to sample from directly. The algorithm goes as follows:
we start with some initial parameters $\Theta$ and evaluate the (unnormalized) posterior probability at $\Theta$.
In each iteration: propose new parameters $\Theta'$ from a proposal distribution centered at the current $\Theta$; accept the jump with probability $\min\left(1, \frac{P(\Theta' \mid D)}{P(\Theta \mid D)}\right)$, otherwise stay at $\Theta$.
import scipy.stats
# adapted from https://letianzj.github.io/mcmc-linear-regression.html
def prior_probability(w_vector):
    a = w_vector[0]  # intercept
    b = w_vector[1]  # slope
    # flat prior: every (a, b) is equally likely
    a_prior = 1
    b_prior = 1
    # a_prior = scipy.stats.norm(0.5, 0.5).pdf(a)
    # b_prior = scipy.stats.norm(0.5, 0.5).pdf(b)
    return np.log(a_prior) + np.log(b_prior)
# Given w, the likelihood of seeing x and t
def likelihood_probability(w_vector):
    a = w_vector[0]  # intercept
    b = w_vector[1]  # slope
    t_predict = a + b * X
    single_likelihoods = scipy.stats.norm(t_predict, np.sqrt(sigma2)).pdf(T)  # noise variance sigma2 is assumed known
    return np.sum(np.log(single_likelihoods))
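# Sketch of an alternative (an addition, not in the original): scipy's logpdf
# returns the log-density directly, avoiding the exp/log round trip that can
# underflow when many per-point likelihoods are multiplied together.
def likelihood_probability_logpdf(w_vector):
    a, b = w_vector
    return scipy.stats.norm(a + b * X, np.sqrt(sigma2)).logpdf(T).sum()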
# We don't need the normalizing constant P(D):
# it cancels out in the acceptance ratio
def posterior_probability(w_vector):
    return likelihood_probability(w_vector) + prior_probability(w_vector)
# jump from w to a new w
# proposal function is a Gaussian centered at w
def proposal_function(w_vector):
    a = w_vector[0]
    b = w_vector[1]
    a_new = np.random.normal(a, 0.5)
    b_new = np.random.normal(b, 0.5)
    w_new = [a_new, b_new]
    return w_new
# run the Monte Carlo
n_samples = 30000
initial_w_vector = [0.5, 0.5]  # start value
results = np.zeros([n_samples, 2])  # record the chain
results[0, 0] = initial_w_vector[0]
results[0, 1] = initial_w_vector[1]
for step in range(1, n_samples):  # loop 30,000 times
    if step % 5000 == 0:
        print('step: {}'.format(step))
    w_old = results[step-1, :]
    w_proposal = proposal_function(w_old)
    # np.exp turns the log-posterior difference back into a probability ratio
    prob = np.exp(posterior_probability(w_proposal) - posterior_probability(w_old))
    if np.random.uniform(0, 1) < prob:
        results[step, :] = w_proposal  # jump
    else:
        results[step, :] = w_old  # stay
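# Sketch of an added diagnostic (not in the original): the fraction of
# accepted proposals. For random-walk Metropolis, an acceptance rate of
# roughly 0.2-0.5 is commonly considered healthy.
accepted = np.any(results[1:] != results[:-1], axis=1)
print('acceptance rate: {:.2f}'.format(accepted.mean()))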
burn_in = 10000
w_posterior = results[burn_in:, :]
print(w_posterior.mean(axis=0)) # use average as point estimates
step: 5000
step: 10000
step: 15000
step: 20000
step: 25000
[3.18341861 0.54985753]
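Beyond point estimates, the same samples give credible intervals directly; a sketch using percentiles (an addition, not in the original):
lo, hi = np.percentile(w_posterior, [2.5, 97.5], axis=0)  # 95% credible intervals
print('w0 in [{:.3f}, {:.3f}], w1 in [{:.3f}, {:.3f}]'.format(lo[0], hi[0], lo[1], hi[1]))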
# present the results
fig, axes = plt.subplots(1,3, figsize=(12,3), dpi=100)
axes[0].hist(w_posterior[:,0], bins=20, color='blue')
axes[0].axvline(w_posterior.mean(axis=0)[0], color='red', linestyle='dashed', linewidth=2)
axes[0].title.set_text('Posterior -- w0')
axes[1].hist(w_posterior[:,1], bins=20, color='blue')
axes[1].axvline(w_posterior.mean(axis=0)[1], color='red', linestyle='dashed', linewidth=2)
axes[1].title.set_text('Posterior -- w1')
axes[2].hist2d(*w_posterior.T, bins=50)
axes[2].title.set_text('Posterior -- w0,w1')
axes[2].axvline(w_posterior.mean(axis=0)[0], color='red', linestyle='dashed', linewidth=2)
axes[2].axhline(w_posterior.mean(axis=0)[1], color='red', linestyle='dashed', linewidth=2)
axes[2].set_xlabel('w0')
axes[2].set_ylabel('w1')
plt.subplots_adjust(wspace=(0.4))
plt.show()
import seaborn as sns
sns.jointplot(x=w_posterior[:,0], y=w_posterior[:,1])
plt.plot(results[:,0], label='full chain (incl. burn-in)')
plt.plot(np.arange(burn_in, len(results)), w_posterior[:,0], label='$w_0$ samples')
plt.legend()
plt.show()
from statsmodels.graphics import tsaplots
tsaplots.plot_acf(w_posterior[:,0], lags=60)
plt.ylabel('Correlation')
plt.xlabel('Lag')
plt.show()
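Given the autocorrelation above, one can thin the chain to get roughly independent draws; a sketch assuming a thinning interval of 60, read off where the ACF decays (both the interval and this step are additions):
thin = 60  # assumed interval, roughly where the autocorrelation dies out
w_thinned = w_posterior[::thin]
print(w_thinned.shape[0], 'approximately independent samples, mean:', w_thinned.mean(axis=0))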
lag = 300
plt.scatter(w_posterior[:-lag,0], w_posterior[lag:,0], alpha=0.05)
plt.ylabel(r'$w_0^{\tau _0+l}$', fontsize=14)
plt.xlabel(r'$w_0^{\tau _0}$', fontsize=14)
plt.title(f'Lag = {lag}')
plt.show()