Is there a relationship between human height and weight…? Probably. But what does that relationship look like ?
Those kinds of questions can be answered by Linear Regression. Below an example, using simulated data for weights,heights etc:
import numpy as np import matplotlib.pyplot as plt import pandas as pd import pymc as pm def create_fake_data(): ### create simulated data, 200 rows, with columns -> ### height,weight,male,age ### entries = 200 height_series = pd.Series(np.random.normal(170,10,entries)) weight_series = pd.Series(np.random.normal(60,10,entries)) male_series = pd.Series(np.random.uniform(0,2,entries)).astype(int) age_series = pd.Series(np.random.normal(40,10,entries)).astype(int) df = pd.DataFrame({'height':height_series,'weight':weight_series, 'male':male_series,'age':age_series}) mean_height = np.mean(df['height']) std_height = np.std(df['height']) mean_weight = np.mean(df['weight']) std_weight = np.std(df['weight']) ### increase weight of tall persons, reduce of short ### df.loc[df['height'] > mean_height * 1.01,'weight'] = df['weight'].apply( lambda x: float(np.random.normal(mean_weight + 30,10,1))) df.loc[df['height'] < mean_height * 0.995,'weight'] = df['weight'].apply( lambda x: float(np.random.normal(mean_weight - 10,5,1))) ### increase weight and height of men ### df.loc[df['male'] == 1,'weight'] = df['weight'] * 1.1 df.loc[df['male'] == 1,'height'] = df['height'] * 1.1 ### ### df = df[df['age'] >= 18] # unnecessary here,but left for selection syntax return df df = create_fake_data() x = df['weight'] ### define priors ### alpha = pm.Normal('alpha',mu=178,tau=1/(100*100)) beta = pm.Normal('beta',mu=0,tau=1/(10*10)) sigma = pm.Uniform('sigma',lower=0,upper=50) ### sample the priors for plotting ### alpha_prior_samples = [alpha.random() for i in range(10000)] beta_prior_samples = [beta.random() for i in range(10000)] sigma_prior_samples = [sigma.random() for i in range(10000)] ### define regression ### @pm.deterministic() def mu(x=x,beta=beta,alpha=alpha): return x*beta+alpha ### define data generations scheme, connect to data ### height = pm.Normal('height', mu=mu,tau=1 / (sigma * sigma), value=df['height'],observed=True) ### define PYMC model ### model = pm.Model([alpha,beta,sigma,height,mu]) mcmc = pm.MCMC(model) mcmc.sample(iter=100000,burn=25000,thin=2) ### get traces ### alpha_samples = mcmc.trace('alpha')[:] beta_samples = mcmc.trace('beta')[:] sigma_samples = mcmc.trace('sigma')[:] mu_samples = np.array(mcmc.trace('mu')[:,0]) result = pd.DataFrame({'alpha':alpha_samples,'beta':beta_samples, 'sigma':sigma_samples,'mu':mu_samples}) print (result.head()) print (result.describe()) print ('height mean:',np.mean(df['height'])) print ('height std:',np.std(df['height'])) print ('height women mean:',np.mean(df[df['male'] == 0]['height'])) print ('height men mean:',np.mean(df[df['male'] == 1]['height'])) print ('posterior mu mean:',np.mean(mu_samples)) print ('posterior sigma mean:',np.mean(sigma_samples)) print ('posterior alpha mean:',np.mean(alpha_samples)) print ('posterior beta mean:',np.mean(beta_samples)) nr_rows = 1000 rows = np.random.choice(result.index,replace=True,size=nr_rows) posterior_samples = result.iloc[rows,:] X = np.linspace(min(df['weight']),max(df['weight']),nr_rows) ### for each weight, generate random heights -> ### with mu and sigma from from posterior ### posterior_mus = np.array([np.random.normal(X[i] * posterior_samples[ 'beta'] + posterior_samples['alpha'],posterior_samples[ 'sigma']) for i in range(len(posterior_samples))]) ### 89% confidence interval ### low,high = np.percentile(posterior_mus,[5.5,94.5],axis=1) #### plotting #### sub=10 lines = np.array([X[i] * result['beta'][::sub] + result[ 'alpha'][::sub] for i in range(len(X))]) plt.figure(figsize=(18,12)) plt.subplot(3,1,1) plt.title(r'Prior $\alpha$') plt.hist(alpha_prior_samples,bins=30) plt.subplot(3,1,2) plt.title(r'Prior $\beta$') plt.hist(beta_prior_samples,bins=30) plt.subplot(3,1,3) plt.title(r'Prior $\sigma$') plt.hist(sigma_prior_samples,bins=30) plt.tight_layout() plt.savefig('priors.jpg',format='jpg') plt.figure(figsize=(18,12)) plt.subplot(4,1,1) plt.title(r'Posterior $\alpha$') plt.hist(alpha_samples,bins=30) plt.subplot(4,1,2) plt.title(r'Posterior $\beta$') plt.hist(beta_samples,bins=30) plt.subplot(4,1,3) plt.title(r'Posterior $\sigma$') plt.hist(sigma_samples,bins=30) plt.subplot(4,1,4) plt.title(r'Posterior $\mu$') plt.hist(mu_samples,bins=30) plt.tight_layout() plt.savefig('posteriors.jpg',format='jpg') plt.figure(figsize=(18,12)) plt.title(r'Bivariate Linear Regression $\alpha$:{:.2f} $\beta$:{:.2f}, CI=89%'.format(np.round(np.mean(alpha_samples),2),np.round(np.mean(beta_samples),2))) plt.scatter(df['weight'],df['height']) plt.plot(X,lines,alpha=0.005,color='r') plt.plot(X,X*np.mean(beta_samples)+np.mean(alpha_samples),color='k',lw=3) plt.fill_between(np.linspace(min(df['weight']), max(df['weight']),high.size), low,high,color='c',alpha=0.1) plt.xlabel('Weight [kg]') plt.ylabel('Height [cm]') plt.savefig('regression.jpg',format='jpg') plt.show()
Running that code gives the below graphs:
First the priors for Alpha, Beta & Sigma:
Next, the Posteriors:
And finally, the Regression itself: The blue dots are the actual (simulated) data points, the black line is the Maximum Aposteriori for mu, the red area around mu is the uncertainty for mu, and the baby blue area represents the 89% confidence interval for what the model believes are the heights of people for each weight.