Visualizing data with Python

This week, I learnt how to visualize data with the Python programming language in the Spyder integrated development environment (IDE). The purpose was to visualize GapMinder data. The variables of interest are: urban rate, CO2 emissions, suicide rate, breast cancer rate, HIV rate, and income per person. Here is the code I wrote:

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 18 16:32:51 2015
@author: DEGNINOU
"""
# Loading pandas and numpy packages
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt

# Read gapminder.csv data file
gapmind = pandas.read_csv("gapminder.csv", low_memory=False)

# Set PANDAS to show all columns in DataFrame
pandas.set_option('display.max_columns', None)
# Set PANDAS to show all rows in DataFrame
pandas.set_option('display.max_rows', None)

# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Setting variables you will be working with to numeric.
# pandas.to_numeric(errors='coerce') replaces the removed
# Series.convert_objects(convert_numeric=True): values that cannot be
# parsed as numbers become NaN instead of raising.
for column in ('suicide', 'urbanrate', 'incomeperperson',
               'hivrate', 'breastcancer', 'co2emissions'):
    gapmind[column] = pandas.to_numeric(gapmind[column], errors='coerce')

# standard deviation and other descriptive statistics for quantitative variables
print('Describe urban rate')
desc1 = gapmind['urbanrate'].describe()
print(desc1)

print('Describe CO2 emissions')
desc2 = gapmind['co2emissions'].describe()
print(desc2)

print('Describe suicide rate')
desc3 = gapmind["suicide"].describe()
print(desc3)

print('Describe breast cancer rate')
desc4 = gapmind['breastcancer'].describe()
print(desc4)

print('Describe income per person')
desc5 = gapmind['incomeperperson'].describe()
print(desc5)
# quartile split (use qcut function & ask for 4 groups – gives you quartile split)
# The same four labels are used for every quartile-grouped variable.
quartile_labels = ["1=25th%tile", "2=50%tile", "3=75%tile", "4=100%tile"]

print('Income per person – 4 categories – quartiles')
gapmind['incomegrp'] = pandas.qcut(gapmind['incomeperperson'], 4,
                                   labels=quartile_labels)
# Uncomment to inspect the group counts:
# c10 = gapmind['incomegrp'].value_counts(sort=False, dropna=True)
# print(c10)

print('CO2 Emissions – 4 categories – quartiles')
gapmind['co2grp'] = pandas.qcut(gapmind['co2emissions'], 4,
                                labels=quartile_labels)
# Uncomment to inspect the group counts:
# c11 = gapmind['co2grp'].value_counts(sort=False, dropna=True)
# print(c11)

print('Urban rate – 4 categories – quartiles')
gapmind['urbangrp'] = pandas.qcut(gapmind['urbanrate'], 4,
                                  labels=quartile_labels)
# Uncomment to inspect the group counts:
# c12 = gapmind['urbangrp'].value_counts(sort=False, dropna=True)
# print(c12)
# Univariate histograms.
# distplot draws on the current axes, so each histogram gets its own figure;
# otherwise the three distributions would be overlaid and the labels and
# titles would overwrite each other.
# NOTE(review): seaborn.distplot is deprecated in newer seaborn releases in
# favour of histplot/displot - confirm against the installed version.

# Univariate histogram for income
plt.figure()
seaborn.distplot(gapmind['incomeperperson'].dropna(), kde=False)
plt.xlabel('Gross Domestic Product per capita in constant 2000 US$')
plt.ylabel('Number of countries')
plt.title('Distribution of income per person')

# Univariate histogram for CO2 emissions
plt.figure()
seaborn.distplot(gapmind['co2emissions'].dropna(), kde=False)
plt.xlabel('CO2 emission (metric tons)')
plt.ylabel('Number of countries')
plt.title('Distribution of CO2 emissions')

# Univariate histogram for urban rate
plt.figure()
seaborn.distplot(gapmind['urbanrate'].dropna(), kde=False)
plt.xlabel('Urban population (% of total)')
plt.ylabel('Number of countries')
plt.title('Distribution of urban population')
# Basic scatterplots.
# regplot draws on the current axes, so a new figure is opened before each
# plot to keep the three scatterplots separate.

# CO2 emissions vs. breast cancer rate (points only, no fitted line)
plt.figure()
scat1 = seaborn.regplot(x='co2emissions', y='breastcancer', fit_reg=False,
                        data=gapmind)
plt.xlabel('CO2 emissions')
plt.ylabel('Breast cancer new cases per 100,000 females')
plt.title('Scatterplot for the Association Between CO2 Emissions and Breast cancers rate')

# Urban rate vs. suicide rate (with fitted regression line)
plt.figure()
scat2 = seaborn.regplot(x='urbanrate', y='suicide', data=gapmind)
plt.xlabel('Urban Rate')
plt.ylabel('Suicide per 100 000')
plt.title('Scatterplot for the Association Between Urban Rate and Suicide Rate')

# Income per person vs. CO2 emissions (with fitted regression line)
plt.figure()
scat3 = seaborn.regplot(x='incomeperperson', y='co2emissions', data=gapmind)
plt.xlabel('Income per Person')
plt.ylabel('Cumulative CO2 emission (metric tons)')
plt.title('Scatterplot for the Association Between Income per Person and CO2 Emissions')
# Bivariate bar graphs C->Q: mean of a quantitative variable within each
# quartile group, followed by the number of countries in each group.

# seaborn renamed factorplot to catplot and later removed the old name;
# use whichever one the installed seaborn provides.
bar_plot = getattr(seaborn, 'catplot', None) or seaborn.factorplot

# Mean suicide rate by urban-rate quartile
bar_plot(x='urbangrp', y='suicide', data=gapmind, kind="bar", ci=None)
plt.xlabel('Urban rate')
plt.ylabel('Mean suicide rate')
plt.title('Bar graph of suicide rates by urban quartile range')
c14 = gapmind.groupby('urbangrp').size()
print(c14)

# Mean CO2 emissions by income quartile
bar_plot(x='incomegrp', y='co2emissions', data=gapmind, kind="bar", ci=None)
plt.xlabel('Income per Person')
plt.ylabel('Mean CO2 Emission')
plt.title('Bar graph of CO2 emissions by income quartile range')
c15 = gapmind.groupby('incomegrp').size()
print(c15)

# Mean breast cancer rate by CO2-emission quartile
bar_plot(x='co2grp', y='breastcancer', data=gapmind, kind="bar", ci=None)
plt.xlabel('CO2 Emissions')
plt.ylabel('Mean breast cancer rate')
plt.title('Bar graph of breast cancers rates by CO2 emission quartile range')
c13 = gapmind.groupby('co2grp').size()
print(c13)

# Display the figures when the file is run as a plain script
# (outside an IDE that renders figures automatically).
plt.show()

The code produced the following graphs:

The univariate graphs show that most countries have small urban rates and CO2 emissions. Most countries have an income per person around $20 000. The scatter plots show that there is no association between CO2 emissions and breast cancer rate; there is a negative association between urban rate and suicide rate; and there is a positive association between income per person and CO2 emissions. The bivariate bar graphs show that the highest breast cancer rates are in the top quartile (100th percentile) of CO2 emissions; the highest suicide rates are between the 25th and the 75th percentiles of urban rate; and the lowest CO2 emissions are between the 25th and the 75th percentiles of income per person.

Advertisements

One thought on “Visualizing data with Python”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s