Bayesian Linear Regression with PYMC

Is there a relationship between human height and weight? Probably. But what does that relationship look like?

Those kinds of questions can be answered by Linear Regression. Below is an example, using simulated data for weights, heights, etc.
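In statistical notation, the model the script below fits is a standard Bayesian simple linear regression. The priors are taken straight from the code; note that the PyMC2-style Normal is parameterized by precision, tau = 1/sd^2, so tau=1/(100*100) means a standard deviation of 100:

$\text{height}_i \sim \mathcal{N}(\alpha + \beta\,\text{weight}_i,\ \sigma^2)$

$\alpha \sim \mathcal{N}(178,\ 100^2)$, $\beta \sim \mathcal{N}(0,\ 10^2)$, $\sigma \sim \text{Uniform}(0,\ 50)$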

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymc as pm

def create_fake_data():
    ### create simulated data, 200 rows, with columns ->
    ### height,weight,male,age ### 
    entries = 200
    height_series = pd.Series(np.random.normal(170,10,entries))
    weight_series = pd.Series(np.random.normal(60,10,entries))
    male_series = pd.Series(np.random.uniform(0,2,entries)).astype(int)
    age_series = pd.Series(np.random.normal(40,10,entries)).astype(int)

    df = pd.DataFrame({'height':height_series,'weight':weight_series,
                       'male':male_series,'age':age_series})

    mean_height = np.mean(df['height'])
    std_height = np.std(df['height'])
    mean_weight = np.mean(df['weight'])
    std_weight = np.std(df['weight'])

    ### increase weight of tall persons, reduce it for short ones ###
    df.loc[df['height'] > mean_height * 1.01,'weight'] = df['weight'].apply(
        lambda x: float(np.random.normal(mean_weight + 30,10,1)))

    df.loc[df['height'] < mean_height * 0.995,'weight'] = df['weight'].apply(
        lambda x: float(np.random.normal(mean_weight - 10,5,1)))

    ### increase weight and height of men ###
    df.loc[df['male'] == 1,'weight'] = df['weight'] * 1.1
    df.loc[df['male'] == 1,'height'] = df['height'] * 1.1

    df = df[df['age'] >= 18] # unnecessary here, but left for selection syntax
    return df
    
df = create_fake_data()

x = df['weight']

### define priors ###
alpha = pm.Normal('alpha',mu=178,tau=1/(100*100))
beta = pm.Normal('beta',mu=0,tau=1/(10*10))
sigma = pm.Uniform('sigma',lower=0,upper=50)

### sample the priors for plotting ###
alpha_prior_samples = [alpha.random() for i in range(10000)]
beta_prior_samples = [beta.random() for i in range(10000)]
sigma_prior_samples = [sigma.random() for i in range(10000)]

### define regression ###
@pm.deterministic()
def mu(x=x,beta=beta,alpha=alpha):
    return x*beta+alpha

### define data generation scheme, connect to data ###
height = pm.Normal('height',
                   mu=mu,tau=1 / (sigma * sigma),
                   value=df['height'],observed=True)

### define PYMC model ###
model = pm.Model([alpha,beta,sigma,height,mu])
mcmc = pm.MCMC(model)

mcmc.sample(iter=100000,burn=25000,thin=2)

### get traces ###
alpha_samples = mcmc.trace('alpha')[:]
beta_samples = mcmc.trace('beta')[:]
sigma_samples = mcmc.trace('sigma')[:]
mu_samples = np.array(mcmc.trace('mu')[:,0])

result = pd.DataFrame({'alpha':alpha_samples,'beta':beta_samples,
                       'sigma':sigma_samples,'mu':mu_samples})

print (result.head())
print (result.describe())

print ('height mean:',np.mean(df['height']))
print ('height std:',np.std(df['height']))
print ('height women mean:',np.mean(df[df['male'] == 0]['height']))
print ('height men mean:',np.mean(df[df['male'] == 1]['height']))
       
print ('posterior mu mean:',np.mean(mu_samples))
print ('posterior sigma mean:',np.mean(sigma_samples))
print ('posterior alpha mean:',np.mean(alpha_samples))
print ('posterior beta mean:',np.mean(beta_samples))

nr_rows = 1000
rows = np.random.choice(result.index,replace=True,size=nr_rows)

posterior_samples = result.iloc[rows,:]

X = np.linspace(min(df['weight']),max(df['weight']),nr_rows)

### for each weight, generate random heights ->
### with mu and sigma from the posterior ###
posterior_mus = np.array([np.random.normal(X[i] * posterior_samples[
    'beta'] + posterior_samples['alpha'],posterior_samples[
        'sigma']) for i in range(len(posterior_samples))])

### 89% credible interval (5.5th to 94.5th percentile) ###
low,high = np.percentile(posterior_mus,[5.5,94.5],axis=1)

#### plotting ####
sub=10
lines = np.array([X[i] * result['beta'][::sub] + result[
        'alpha'][::sub] for i in range(len(X))])

plt.figure(figsize=(18,12))
plt.subplot(3,1,1)
plt.title(r'Prior $\alpha$')
plt.hist(alpha_prior_samples,bins=30)

plt.subplot(3,1,2)
plt.title(r'Prior $\beta$')
plt.hist(beta_prior_samples,bins=30)

plt.subplot(3,1,3)
plt.title(r'Prior $\sigma$')
plt.hist(sigma_prior_samples,bins=30)

plt.tight_layout()
plt.savefig('priors.jpg',format='jpg')

plt.figure(figsize=(18,12))
plt.subplot(4,1,1)
plt.title(r'Posterior $\alpha$')
plt.hist(alpha_samples,bins=30)

plt.subplot(4,1,2)
plt.title(r'Posterior $\beta$')
plt.hist(beta_samples,bins=30)

plt.subplot(4,1,3)
plt.title(r'Posterior $\sigma$')
plt.hist(sigma_samples,bins=30)

plt.subplot(4,1,4)
plt.title(r'Posterior $\mu$')
plt.hist(mu_samples,bins=30)
plt.tight_layout()
plt.savefig('posteriors.jpg',format='jpg')

plt.figure(figsize=(18,12))
plt.title(r'Bivariate Linear Regression $\alpha$:{:.2f} $\beta$:{:.2f}, CI=89%'.format(np.round(np.mean(alpha_samples),2),np.round(np.mean(beta_samples),2)))

plt.scatter(df['weight'],df['height'])
plt.plot(X,lines,alpha=0.005,color='r')
plt.plot(X,X*np.mean(beta_samples)+np.mean(alpha_samples),color='k',lw=3)
plt.fill_between(np.linspace(min(df['weight']),
                             max(df['weight']),high.size),
                 low,high,color='c',alpha=0.1)

plt.xlabel('Weight [kg]')
plt.ylabel('Height [cm]')
plt.savefig('regression.jpg',format='jpg')
plt.show()

Running that code produces the graphs below:

First the priors for Alpha, Beta & Sigma:

[Figure: priors]

Next, the Posteriors:

[Figure: posteriors]

And finally, the regression itself: the blue dots are the actual (simulated) data points, the black line is the regression line drawn at the posterior means of alpha and beta (roughly the maximum a posteriori fit), the red band around it shows the uncertainty in mu, and the baby blue area represents the 89% credible interval for the heights the model predicts at each weight.
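The difference between the two shaded regions comes straight from the code: the red band is built from posterior draws of the line itself, while the baby blue band also folds in the residual noise sigma, i.e. it is built from simulated new heights rather than from the mean:

$\mu(w) = \alpha + \beta\,w$  (red band: posterior draws of the mean line)

$\tilde{h} \mid w \sim \mathcal{N}(\mu(w),\ \sigma^2)$  (baby blue band: posterior predictive heights)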

[Figure: regression]
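A side note on the API: the script above uses the classic PyMC2 interface (precision-parameterized Normals, @pm.deterministic, pm.MCMC). On a current PyMC (v4/v5) installation the same model would be written differently; the sketch below is one way it might look, with variable names mirroring the script and purely illustrative sampler settings:

import pymc as pm   # PyMC >= 4

with pm.Model() as model:
    ### same priors as above, parameterized with sigma instead of tau ###
    alpha = pm.Normal('alpha', mu=178, sigma=100)
    beta = pm.Normal('beta', mu=0, sigma=10)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    mu = pm.Deterministic('mu', alpha + beta * df['weight'].values)
    height = pm.Normal('height', mu=mu, sigma=sigma,
                       observed=df['height'].values)

    ### NUTS sampling; draws/tune chosen arbitrarily ###
    idata = pm.sample(draws=2000, tune=1000)

### posterior draws come back as an ArviZ InferenceData object ###
alpha_samples = idata.posterior['alpha'].values.ravel()
beta_samples = idata.posterior['beta'].values.ravel()

The resulting traces can then be used much like the ones in the plotting code above.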
