## PyMC – Markov Chain Monte Carlo regression – canonical example

```python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymc as pm
import scipy.stats as sps

np.random.seed(4711)

observations = 100

# simulate predictor x values, e.g. the share of foreign-born residents
xvals = np.linspace(0, 0.45, observations)

# simulate response y values, e.g. the vote share
def yval_generator(x, slope, intercept):
    return x * slope + intercept

f = yval_generator(xvals,0.5,0.1)

def noise(y):
    # apply multiplicative Gaussian noise (mean 1, sd 0.1)
    return np.random.normal(1, 0.1, len(y)) * y

data_points = noise(f)
# frequentist least-squares fit to the noisy data, for comparison
slope, intercept, _, _, _ = sps.linregress(xvals, data_points)

def markov_monte_carlo(xvals, data_points):
    # priors for the linear regression parameters
    alpha = pm.Normal('alpha', mu=0.5, tau=1 / (0.25 * 0.25))
    beta = pm.Normal('beta', mu=1.0, tau=1 / (0.5 * 0.5))
    sigma = pm.Uniform('sigma', 0, 1)

    # deterministic node: the regression line for the current alpha and beta
    @pm.deterministic()
    def mu(xvals=xvals, beta=beta, alpha=alpha):
        return xvals * beta + alpha

    # likelihood of the observed data points around the regression line
    lines = pm.Normal('lines', mu=mu, tau=1 / (sigma * sigma),
                      value=data_points, observed=True)

    model = pm.Model([alpha, beta, sigma, mu, lines])
    mcmc = pm.MCMC(model)
    # 10000 iterations, discard the first 4000 as burn-in, keep every 2nd sample
    mcmc.sample(iter=10000, burn=4000, thin=2)

    alpha_samples = mcmc.trace('alpha')[:]
    beta_samples = mcmc.trace('beta')[:]
    sigma_samples = mcmc.trace('sigma')[:]
    mu_samples = mcmc.trace('mu')[:, 0]

    result = pd.DataFrame({'alpha': alpha_samples, 'beta': beta_samples,
                           'sigma': sigma_samples, 'mu': mu_samples})

    return result

result = markov_monte_carlo(xvals,data_points)

result.plot.hist(alpha=0.7, subplots=True, grid=True, title='Param Posteriors',
                 figsize=(18, 12))

plt.xticks(np.arange(0,1.1,0.05))
plt.savefig('pymc_regression_canonical_param_posteriors.jpg',format='jpg')
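# (optional, not in the original script) numeric posterior summaries to go
# with the histograms, e.g. posterior means and standard deviations
print(result[['alpha', 'beta', 'sigma']].describe())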

# draw parameter samples from the posterior
nr_rows = 10

rows = np.random.choice(result.index,replace=True,size=nr_rows)
posterior_samples = result.iloc[rows,:]

X = np.linspace(0, 0.45, nr_rows)  # length must match the size used for posterior_yvals below

posterior_yvals = np.array(
    [np.random.normal(loc=X[i] * posterior_samples['beta']
                          + posterior_samples['alpha'],
                      scale=posterior_samples['sigma'],
                      size=nr_rows) for i in range(nr_rows)])

# 89% interval of the posterior y values (5.5th to 94.5th percentiles)
low_high = np.array(list(zip(*np.percentile(posterior_yvals,
                                            [5.5, 94.5], axis=1))))

lines = np.array([X[i] * result['beta'] + result['alpha']
                  for i in range(nr_rows)])
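# (optional, not in the original script) a single posterior-mean regression
# line can also be summarized from the draws and overlaid on the figure below
mean_line = X * result['beta'].mean() + result['alpha'].mean()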

# plot the posterior regression lines and the 89% interval band
plt.figure(figsize=(18, 12))
plt.title('Posterior regression lines & 89% credible interval')
plt.grid(which='both')
plt.scatter(xvals,data_points,color='r')
plt.plot(X,lines,color='orange',alpha=0.005)
plt.fill_between(X,low_high[:,0],low_high[:,1],color='c',alpha=0.1)
plt.xticks(np.arange(0,0.50,0.05))
plt.savefig('pymc_regression_canonical_reg_lines.jpg',format='jpg')

plt.figure(figsize=(18,12))
plt.title('Noisy data points & the generating base function')
plt.grid(which='both')
plt.xlabel('x')
plt.ylabel('y')
plt.plot(xvals, f, ls='-.', color='r',
         label='generating base function', alpha=1)
plt.scatter(xvals, data_points, color='r', label='noise applied to f')
plt.plot(xvals, slope * xvals + intercept, 'x-', label='Regression',
         color='c', ls=':')
plt.legend(loc='upper left')
plt.savefig('pymc_regression_canonical_data_gen.jpg',format='jpg')
plt.show()
```
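
The listing above uses the legacy PyMC2 interface (precision `tau` parameters, `@pm.deterministic`, `pm.MCMC`, `mcmc.trace`). In model terms it assumes alpha ~ Normal(0.5, 0.25²), beta ~ Normal(1.0, 0.5²), sigma ~ Uniform(0, 1) and y_i ~ Normal(alpha + beta·x_i, sigma²). Current PyMC releases (v4 and later, again imported as `pymc`) replace that interface with a model-context API and NUTS sampling; a minimal sketch of the same model under that assumption could look like this:

```python
import numpy as np
import pymc as pm

# same simulated data as above
np.random.seed(4711)
xvals = np.linspace(0, 0.45, 100)
data_points = np.random.normal(1, 0.1, len(xvals)) * (xvals * 0.5 + 0.1)

with pm.Model():
    # priors, parameterized by standard deviation instead of precision (tau)
    alpha = pm.Normal('alpha', mu=0.5, sigma=0.25)
    beta = pm.Normal('beta', mu=1.0, sigma=0.5)
    sigma = pm.Uniform('sigma', 0, 1)

    # expected value of the regression line
    mu = pm.Deterministic('mu', alpha + beta * xvals)

    # likelihood of the observed noisy y values
    pm.Normal('lines', mu=mu, sigma=sigma, observed=data_points)

    # NUTS sampling; draws/tune roughly mirror iter/burn in the script above
    idata = pm.sample(draws=3000, tune=2000, random_seed=4711)

# flatten chains to get plain sample arrays, analogous to mcmc.trace(...)[:]
alpha_samples = idata.posterior['alpha'].values.ravel()
beta_samples = idata.posterior['beta'].values.ravel()
```

The posterior histograms, regression lines, and interval band can then be reproduced from these sample arrays exactly as in the original script.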