## PyMC – Markov Chain Monte Carlo regression – canonical example

```python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymc as pm
import scipy.stats as sps

np.random.seed(4711)

observations = 100

# simulate predictor x values, e.g. the share of foreign-born residents
xvals = np.linspace(0, 0.45, observations)

# simulate response y values, e.g. the vote share
def yval_generator(x, slope, intercept):
    return x * slope + intercept

f = yval_generator(xvals,0.5,0.1)

def noise(y):
    # apply multiplicative Gaussian noise (mean 1, sd 0.1)
    return np.random.normal(1, 0.1, len(y)) * y

data_points = noise(f)
# frequentist least-squares fit to the noisy data, for comparison
slope, intercept, _, _, _ = sps.linregress(xvals, data_points)

def markov_monte_carlo(xvals, data_points):
    # priors for the linear regression parameters
    alpha = pm.Normal('alpha', mu=0.5, tau=1 / (0.25 * 0.25))
    beta = pm.Normal('beta', mu=1.0, tau=1 / (0.5 * 0.5))
    sigma = pm.Uniform('sigma', 0, 1)

    # deterministic node: the regression line for the current alpha and beta
    @pm.deterministic()
    def mu(xvals=xvals, beta=beta, alpha=alpha):
        return xvals * beta + alpha

    # likelihood of the observed data points around the regression line
    lines = pm.Normal('lines', mu=mu, tau=1 / (sigma * sigma),
                      value=data_points, observed=True)

    model = pm.Model([alpha, beta, sigma, mu, lines])
    mcmc = pm.MCMC(model)
    # 10000 iterations, discard the first 4000 as burn-in, keep every 2nd sample
    mcmc.sample(iter=10000, burn=4000, thin=2)

    alpha_samples = mcmc.trace('alpha')[:]
    beta_samples = mcmc.trace('beta')[:]
    sigma_samples = mcmc.trace('sigma')[:]
    mu_samples = mcmc.trace('mu')[:, 0]

    result = pd.DataFrame({'alpha': alpha_samples, 'beta': beta_samples,
                           'sigma': sigma_samples, 'mu': mu_samples})

    return result

result = markov_monte_carlo(xvals,data_points)

result.plot.hist(alpha=0.7, subplots=True, grid=True, title='Param Posteriors',
                 figsize=(18, 12))

plt.xticks(np.arange(0,1.1,0.05))
plt.savefig('pymc_regression_canonical_param_posteriors.jpg',format='jpg')
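# (optional, not in the original script) numeric posterior summaries to go
# with the histograms, e.g. posterior means and standard deviations
print(result[['alpha', 'beta', 'sigma']].describe())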

# draw parameter samples from the posterior
nr_rows = 10

rows = np.random.choice(result.index,replace=True,size=nr_rows)
posterior_samples = result.iloc[rows,:]

X = np.linspace(0, 0.45, nr_rows)  # length must match the size used for posterior_yvals below

posterior_yvals = np.array(
    [np.random.normal(loc=X[i] * posterior_samples['beta']
                          + posterior_samples['alpha'],
                      scale=posterior_samples['sigma'],
                      size=nr_rows) for i in range(nr_rows)])

# 89% interval of the posterior y values (5.5th to 94.5th percentiles)
low_high = np.array(list(zip(*np.percentile(posterior_yvals,
                                            [5.5, 94.5], axis=1))))

lines = np.array([X[i] * result['beta'] + result['alpha']
                  for i in range(nr_rows)])
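# (optional, not in the original script) a single posterior-mean regression
# line can also be summarized from the draws and overlaid on the figure below
mean_line = X * result['beta'].mean() + result['alpha'].mean()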

# plot the posterior regression lines and the 89% interval band
plt.figure(figsize=(18, 12))
plt.title('Posterior regression lines & 89% credible interval')
plt.grid(which='both')
plt.scatter(xvals,data_points,color='r')
plt.plot(X,lines,color='orange',alpha=0.005)
plt.fill_between(X,low_high[:,0],low_high[:,1],color='c',alpha=0.1)
plt.xticks(np.arange(0,0.50,0.05))
plt.savefig('pymc_regression_canonical_reg_lines.jpg',format='jpg')

plt.figure(figsize=(18,12))
plt.title('Noisy data points & the generating base function')
plt.grid(which='both')
plt.xlabel('x')
plt.ylabel('y')
plt.plot(xvals, f, ls='-.', color='r',
         label='generating base function', alpha=1)
plt.scatter(xvals, data_points, color='r', label='noise applied to f')
plt.plot(xvals, slope * xvals + intercept, 'x-', label='Regression',
         color='c', ls=':')
plt.legend(loc='upper left')
plt.savefig('pymc_regression_canonical_data_gen.jpg',format='jpg')
plt.show()
```
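
The listing above uses the legacy PyMC2 interface (precision `tau` parameters, `@pm.deterministic`, `pm.MCMC`, `mcmc.trace`). In model terms it assumes alpha ~ Normal(0.5, 0.25²), beta ~ Normal(1.0, 0.5²), sigma ~ Uniform(0, 1) and y_i ~ Normal(alpha + beta·x_i, sigma²). Current PyMC releases (v4 and later, again imported as `pymc`) replace that interface with a model-context API and NUTS sampling; a minimal sketch of the same model under that assumption could look like this:

```python
import numpy as np
import pymc as pm

# same simulated data as above
np.random.seed(4711)
xvals = np.linspace(0, 0.45, 100)
data_points = np.random.normal(1, 0.1, len(xvals)) * (xvals * 0.5 + 0.1)

with pm.Model():
    # priors, parameterized by standard deviation instead of precision (tau)
    alpha = pm.Normal('alpha', mu=0.5, sigma=0.25)
    beta = pm.Normal('beta', mu=1.0, sigma=0.5)
    sigma = pm.Uniform('sigma', 0, 1)

    # expected value of the regression line
    mu = pm.Deterministic('mu', alpha + beta * xvals)

    # likelihood of the observed noisy y values
    pm.Normal('lines', mu=mu, sigma=sigma, observed=data_points)

    # NUTS sampling; draws/tune roughly mirror iter/burn in the script above
    idata = pm.sample(draws=3000, tune=2000, random_seed=4711)

# flatten chains to get plain sample arrays, analogous to mcmc.trace(...)[:]
alpha_samples = idata.posterior['alpha'].values.ravel()
beta_samples = idata.posterior['beta'].values.ravel()
```

The posterior histograms, regression lines, and interval band can then be reproduced from these sample arrays exactly as in the original script.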