A Python, Pandas & PyMC example of Bayesian linear regression, adapted from Richard McElreath’s “Statistical Rethinking” class, where he uses R rather than PyMC as the modeling language.

The data, in a CSV file, describe various attributes — weight, height, age, gender, etc. — of an indigenous people, the !Kung. Below is an extract:

"height";"weight";"age";"male"
151.765;47.8256065;63;1
139.7;36.4858065;63;0
136.525;31.864838;65;0
156.845;53.0419145;41;1
145.415;41.276872;51;0
163.83;62.992589;35;1
149.225;38.2434755;32;0
168.91;55.4799715;27;1

Here, I’m using weight as a predictor variable, and the objective is to predict height based on weight.

First, a histogram on population heights:

Next, a histogram of regression priors:

Next, posterior distributions:

Finally, a regression plot including samples drawn from the posterior.

"""Bayesian linear regression of !Kung height on weight (PyMC2 API).

Reads the Howell1 data set, restricts it to adults, standardizes the
weight predictor, then fits height ~ alpha + beta * weight_s with a
Normal prior on the intercept, a Lognormal prior on the slope, and a
Uniform prior on sigma.  Produces four figures: the raw height
distribution, the priors, the posteriors, and a regression plot with
posterior samples and an 89% credible interval.

NOTE(review): this uses the legacy PyMC2 interface (pm.deterministic,
pm.MAP, pm.MCMC, tau parameterization); it will not run under
PyMC >= 3 without porting.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns

sns.set()
np.random.seed(4711)

bins = 20

# --- data: adults only, with centered and standardized weight ---
df = pd.read_csv('../Stat_Rethink/Howell1.csv', header=0, sep=';')
df = df.loc[df['age'] >= 18]
df['weight_c'] = df['weight'] - df['weight'].mean()
df['weight_s'] = df['weight_c'] / df['weight_c'].std()
print(df.describe())

# --- figure 0: overall height distribution ---
df['height'].hist(bins=bins, figsize=(18, 12))
plt.title('Overall Height Distribution')
plt.xlabel('Height')
plt.ylabel('Frequency')
plt.savefig('pymc_basic_linear_0.jpg', format='jpg')

# --- priors on the regression coefficients ---
# alpha ~ Normal(178, sd=50); PyMC2 parameterizes by tau = 1/sd^2.
# beta  ~ Lognormal(0, sd=1), which keeps the slope positive.
alphas = pm.Normal('alphas', mu=178, tau=1 / (50 * 50))
betas = pm.Lognormal('betas', 0, 1 / (1 * 1))

# --- figure 1: histograms of draws from the priors ---
alpha_prior_samples = np.array([alphas.random() for _ in range(10000)])
beta_prior_samples = np.array([betas.random() for _ in range(10000)])
priors = pd.DataFrame({'alpha_prior': alpha_prior_samples,
                       'beta_prior': beta_prior_samples})
priors.hist(bins=bins, figsize=(18, 12))
# renamed from 'pymc_linear_basic_1.jpg' for consistency with figures 0/2/3
plt.savefig('pymc_basic_linear_1.jpg', format='jpg')


# regression function: distribution of mean heights per (standardized) weight
@pm.deterministic()
def heights_mu(x=df.weight_s, alpha=alphas, beta=betas):
    return x * beta + alpha


sigma = pm.Uniform('sigma', 0, 100)

# likelihood: observed heights ~ Normal(heights_mu, sigma)
fit = pm.Normal('mean_heights', mu=heights_mu, tau=1 / sigma ** 2,
                value=df.height, observed=True)

model = pm.Model([alphas, betas, heights_mu, sigma, fit])

# MAP fit first so the sampler starts near the posterior mode
map_ = pm.MAP(model)
map_.fit()

# 10000 iterations, 2000 burn-in, thin by 2 -> 4000 retained draws
mcmc = pm.MCMC(model)
mcmc.sample(10000, 2000, 2)

alpha_posterior = mcmc.trace('alphas')[:]
beta_posterior = mcmc.trace('betas')[:]
# the deterministic's trace is (draws, n_obs); keep only the first column
heights_mu_posterior = mcmc.trace('heights_mu')[:, 0]
sigma_posterior = mcmc.trace('sigma')[:]

results = pd.DataFrame({'alpha_posterior': alpha_posterior,
                        'beta_posterior': beta_posterior,
                        'heights_mu_posterior': heights_mu_posterior,
                        'sigma_posterior': sigma_posterior})
print(results.head())
print(results.describe())

# --- figure 2: posterior histograms ---
results.hist(bins=bins, figsize=(18, 12))
plt.savefig('pymc_basic_linear_2.jpg', format='jpg')

# --- figure 3: regression plot with posterior samples ---
nr_samples = 1000
sample_index = np.random.choice(range(len(results)), replace=True,
                                size=nr_samples)
sample_alphas = results.alpha_posterior.iloc[sample_index]
sample_betas = results.beta_posterior.iloc[sample_index]
sample_sigmas = results.sigma_posterior.iloc[sample_index]

# must sort for fill_between to work!
sample_weights = df.weight_s.sort_values()

# one mean-height line per posterior draw: shape (nr_samples, n_weights)
sample_mean_heights = np.array([sample_weights * sample_betas.iloc[i]
                                + sample_alphas.iloc[i]
                                for i in range(nr_samples)])

plt.figure(figsize=(18, 12))
plt.title('Posterior Samples (green) and Data Points (red)')
for line in sample_mean_heights:
    plt.plot(sample_weights, line, color='yellow', alpha=0.05)

# generated heights ~ N(mu, sigma) per weight: shape (n_weights, nr_samples)
sample_heights = np.array([np.random.normal(sample_mean_heights[:, i],
                                            sample_sigmas)
                           for i in range(len(sample_weights))])

# 89% credible interval over the generated heights
ci = np.percentile(sample_heights, [5.5, 94.5], axis=1)

for s in range(nr_samples):
    if s == 0:
        # label only the first scatter so the legend shows a single entry
        plt.scatter(sample_weights, sample_heights[:, s], color='g',
                    alpha=0.7, label='Generated Sample Point')
    else:
        plt.scatter(sample_weights, sample_heights[:, s], color='g',
                    alpha=0.05)

plt.scatter(df['weight_s'], df['height'], color='r')
plt.fill_between(sample_weights, ci[0, :], ci[1, :],
                 color='orange', alpha=0.4, label='credible interval 89%')
plt.xlabel('Standardized weight')
plt.ylabel('Height')
plt.legend()
plt.savefig('pymc_basic_linear_3.jpg', format='jpg')
plt.show()