Exploring Statistical Interactions with Python

In the previous post, I analyzed the correlation between four development indicators (income per person, life expectancy, polity score, and urban rate) and HIV rate. I found a negative linear association between urban rate and HIV rate (correlation coefficient = -0.276, p-value = 0.001).

This time, I am interested in assessing whether income level and polity score moderate the relationship between urban rate and HIV rate.

The code:

# -*- coding: utf-8 -*-
"""
Created on Mon Sep 21 10:18:43 2015

@author: Degninou
"""
#%%
# Correlation and interaction analysis
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt

# Load the Gapminder extract. low_memory=False asks pandas to infer each
# column's dtype from the whole file instead of chunk by chunk.
gapmind = pandas.read_csv('gapminder.csv', low_memory=False)

# Set variables of interest to numeric. errors='coerce' turns blank or
# otherwise non-numeric cells into NaN, so no separate "replace ' ' with NaN"
# pass is needed (that replace ran after the numeric conversion and therefore
# never matched anything). pandas.to_numeric replaces Series.convert_objects,
# which has been removed from pandas.
for column in ('urbanrate', 'incomeperperson', 'hivrate', 'polityscore'):
    gapmind[column] = pandas.to_numeric(gapmind[column], errors='coerce')

# Keep complete cases only. NOTE(review): dropna() without `subset` drops a
# row when ANY column is NaN, not just the four variables above -- confirm
# that this is intended. .copy() gives an independent frame so that the group
# columns added later do not trigger SettingWithCopyWarning.
gapmind1 = gapmind.dropna().copy()

# Overall association between urban rate and HIV rate (all countries pooled).
print(scipy.stats.pearsonr(gapmind1['urbanrate'], gapmind1['hivrate']))

def incomegrp(row, low_cutoff=744.239, high_cutoff=9425.326):
    """Return the income group (1, 2 or 3) for one record.

    Args:
        row: Mapping-like record (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with an 'incomeperperson' entry.
        low_cutoff: Largest income still classified as group 1 (inclusive).
        high_cutoff: Largest income still classified as group 2 (inclusive).

    Returns:
        1 if income <= low_cutoff, 2 if income <= high_cutoff, 3 if income is
        above high_cutoff, and None when no comparison holds (e.g. NaN).
    """
    income = row['incomeperperson']
    if income <= low_cutoff:
        return 1
    if income <= high_cutoff:
        return 2
    if income > high_cutoff:
        return 3
    # NaN compares False against every cutoff; leave such records unclassified.
    return None

def politygrp(row, cutoff=5):
    """Return the polity group (1 or 2) for one record.

    Args:
        row: Mapping-like record (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a 'polityscore' entry.
        cutoff: Largest polity score still classified as group 1 (inclusive).

    Returns:
        1 if the score is <= cutoff, 2 if it is above cutoff, and None when
        neither comparison holds (e.g. NaN).
    """
    score = row['polityscore']
    if score <= cutoff:
        return 1
    if score > cutoff:
        return 2
    # NaN compares False both ways; leave such records unclassified.
    return None
# Classify every country (row) into an income group and a polity group.
# The classifier functions take a row, so they can be passed to apply()
# directly without a lambda wrapper.
gapmind1['incomegrp'] = gapmind1.apply(incomegrp, axis=1)
gapmind1['politygrp'] = gapmind1.apply(politygrp, axis=1)

# Sanity check: how many countries fall in each group (unclassified rows
# are counted too because of dropna=False).
print('Frequency count of income group')
chk1 = gapmind1['incomegrp'].value_counts(sort=False, dropna=False)
print(chk1)
print('Frequency count of polity group')
chk2 = gapmind1['politygrp'].value_counts(sort=False, dropna=False)
print(chk2)

#%% Subset gapmind1 into 3 data sets by income group (1=low, 2=middle, 3=high)
sub1 = gapmind1[gapmind1['incomegrp'] == 1]
sub2 = gapmind1[gapmind1['incomegrp'] == 2]
sub3 = gapmind1[gapmind1['incomegrp'] == 3]
# Subset gapmind1 into 2 data sets by polity group (1=low, 2=high)
sub4 = gapmind1[gapmind1['politygrp'] == 1]
sub5 = gapmind1[gapmind1['politygrp'] == 2]

#%%
# Pearson correlation (r, p-value) between urban rate and HIV rate within
# each income group; differing results across groups suggest moderation.
print('Association between urbanrate and HIV rate for LOW income countries')
print(scipy.stats.pearsonr(sub1['urbanrate'], sub1['hivrate']))
print(' ')
print('Association between urbanrate and HIV rate for MIDDLE income countries')
print(scipy.stats.pearsonr(sub2['urbanrate'], sub2['hivrate']))
print(' ')
print('Association between urbanrate and HIV rate for HIGH income countries')
print(scipy.stats.pearsonr(sub3['urbanrate'], sub3['hivrate']))

#%%
# Same comparison within each polity group.
print('Association between urbanrate and HIV rate for LOW polity countries')
print(scipy.stats.pearsonr(sub4['urbanrate'], sub4['hivrate']))
print(' ')
print('Association between urbanrate and HIV rate for HIGH polity countries')
print(scipy.stats.pearsonr(sub5['urbanrate'], sub5['hivrate']))
#%%
# One scatterplot with fitted regression line per sub-group. Each plot starts
# with plt.figure() so that running the file as a plain script (rather than
# cell by cell) does not draw all five regressions onto the same Axes.
plt.figure()
scat1 = seaborn.regplot(x="urbanrate", y="hivrate", data=sub1)
plt.xlabel('Urban Rate')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Urban Rate and HIV Rate for LOW income countries')
print(scat1)
#%%
plt.figure()
scat2 = seaborn.regplot(x="urbanrate", y="hivrate", data=sub2)
plt.xlabel('Urban Rate')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Urban Rate and HIV Rate for MIDDLE income countries')
print(scat2)
#%%
plt.figure()
scat3 = seaborn.regplot(x="urbanrate", y="hivrate", data=sub3)
plt.xlabel('Urban Rate')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Urban Rate and HIV Rate for HIGH income countries')
print(scat3)

#%%
plt.figure()
scat4 = seaborn.regplot(x="urbanrate", y="hivrate", data=sub4)
plt.xlabel('Urban Rate')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Urban Rate and HIV rate for LOW polity countries')
print(scat4)
#%%
plt.figure()
scat5 = seaborn.regplot(x="urbanrate", y="hivrate", data=sub5)
plt.xlabel('Urban Rate')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Urban Rate and HIV Rate for HIGH polity countries')
print(scat5)
#%%

The graphs:

The output:

Output

The results show that the correlation between urban rate and HIV rate was significant only for countries with high income (correlation coefficient = -0.593, p-value = 0.0007) and only for countries with high polity scores (correlation coefficient = -0.312, p-value = 0.0040). Thus, it can be concluded that income level and polity score both moderate the correlation between urban rate and HIV rate.

Advertisements

One thought on “Exploring Statistical Interactions with Python”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s