Setting the Stage


In [1]:
# Importing the Python libraries we will use below #
import scipy as scipy
import pandas as pd
import ggplot as gg
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from bokeh.plotting import *

In [2]:
# Setting the chose graphical styles #
%matplotlib inline
output_notebook()
sns.set_style("darkgrid", {"grid.linewidth": .9, "axes.facecolor": ".98"})
sns.set_context("notebook") # paper, notebook, talk, poster
# colour_map = dict(unsafe="red", celebs="blue", general="yellow", awareness="grey", myths="purple", stigma="green", safe="#2ecc71", advocates="#34495e", race="#e74c3c", jokes="#3498db",needle="steelblue", questions="indianred")
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)


BokehJS successfully loaded.

Getting the Ground-Truth Data


In [3]:
# Open the ground-truth Excel workbook; individual sheets are parsed in later cells
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")

In [4]:
# List the worksheets contained in the workbook
curitibaData.sheet_names


Out[4]:
[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']

In [5]:
# Read the first sheet (position 0, the public-sector tests) using its first row as column headers
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True)

In [6]:
# Inspect the raw sheet: one row per test group, one column per month
curitibaPublic


Out[6]:
HIV tests performed 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09
0 Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 712 1267 1438
1 Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250
2 Pregnant women in public health care 2760 2360 2095 2258 2473 2086 3423 2020 2571
3 HIV testing center (males) 528 358 389 383 348 307 490 404 377
4 HIV testing center (females) 186 133 158 155 155 94 197 158 107

In [7]:
# Keep only the month columns from 2014-06 through 2014-09 (label slice, both ends included);
# this also drops the descriptive "HIV tests performed" column
monthSpan = slice('2014-06', '2014-09')
curitibaPublic = curitibaPublic.loc[:, monthSpan]
curitibaPublic


Out[7]:
2014-06 2014-07 2014-08 2014-09
0 1174 712 1267 1438
1 840 1197 1009 1250
2 2086 3423 2020 2571
3 307 490 404 377
4 94 197 158 107

Turning the Table


In [8]:
# Flip the table so that months become rows and test groups become columns.
# NOTE: running this cell a second time on its own flips the table back again.
curitibaPublic = curitibaPublic.T
curitibaPublic


Out[8]:
0 1 2 3 4
2014-06 1174 840 2086 307 94
2014-07 712 1197 3423 490 197
2014-08 1267 1009 2020 404 158
2014-09 1438 1250 2571 377 107

In [9]:
# Giving the columns fairly short and explanatory names.
# The order follows rows 0-4 of the original sheet.
curitibaPublic.columns = [
    'Primary Care, Females',
    'Primary Care, Males',
    'Pregnant Women',
    'HIV Testing, Males',
    'HIV Testing, Females',
]
curitibaPublic


Out[9]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females
2014-06 1174 840 2086 307 94
2014-07 712 1197 3423 490 197
2014-08 1267 1009 2020 404 158
2014-09 1438 1250 2571 377 107

In [10]:
curitibaPublic.dtypes # Show the datatype pandas assigned to each column


Out[10]:
Primary Care, Females    int64
Primary Care, Males      int64
Pregnant Women           int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object

In [11]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index)  # Making sure that months are read as dates
curitibaPublic = curitibaPublic.astype('int')  # Convert every (non-index) column to integers
curitibaPublic.dtypes  # Checking that all columns are now integers (previously hidden inside the comment above)

In [12]:
curitibaPublic # Checking that the values are unchanged after the index and datatype conversion


Out[12]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females
2014-06-01 1174 840 2086 307 94
2014-07-01 712 1197 3423 490 197
2014-08-01 1267 1009 2020 404 158
2014-09-01 1438 1250 2571 377 107

In [13]:
# Adding a column with monthly totals.
# Sum only the test-group columns: if this cell is re-run, the existing
# 'Curitiba Total' column must not be added into the new total.
testGroupColumns = [name for name in curitibaPublic.columns if name != 'Curitiba Total']
curitibaPublic['Curitiba Total'] = curitibaPublic[testGroupColumns].sum(axis=1)
curitibaPublic


Out[13]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females Curitiba Total
2014-06-01 1174 840 2086 307 94 4501
2014-07-01 712 1197 3423 490 197 6019
2014-08-01 1267 1009 2020 404 158 4858
2014-09-01 1438 1250 2571 377 107 5743

Looking at the Ground-Truth Data

Now we'll take a graphical look at the data. First, we'll create a time-series graph for each test group and for the monthly totals (that is, one graph per column).


In [14]:
# One chart per column: each test group, followed by the monthly total.
for col in curitibaPublic:
    series = curitibaPublic[col]                            # The column drawn in this iteration
    fig = figure(                                           # "fig" holds all the global settings
        title = col,                                        # The column label doubles as the plot title
        plot_width = 1000,
        plot_height = 600,
        x_axis_label = 'Date',
        y_axis_label = 'Tests',
        x_axis_type = 'datetime',
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # x-axis runs from the first to the last date in the data
        y_range = (0, series.max() * 1.1),                  # y-axis starts at 0 and leaves 10% headroom above the peak
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Plot area
        border_fill = '#FAFAFA',                            # Area surrounding the plot
        outline_line_color = '#FAFAFA',                     # Line surrounding the plot
        # No "tools" argument is passed, so Bokeh picks its default toolset
        )
    fig.line(                                               # Draw the series on "fig"
        curitibaPublic.index,                               # x values: the dates in the index
        series,                                             # y values: the current column
        line_color = '#404040',
        line_width = 10,
        line_alpha = 0.7,                                   # Line opacity
        )
    # Legend styling (no legend label is passed to fig.line above)
    legend().label_text_font='Open Sans'
    legend().label_text_color='#363636'
    legend().border_line_color='#f6f6f6'
    # Fonts and colours shared by both axes
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font="Open Sans"
    axis().major_label_text_font_size="10pt"
    axis().minor_tick_line_color = "#d4d4d4"
    # Light grey x-axis; the y-axis line and its major ticks are hidden
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None
    # Only thin horizontal grid lines
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5
    show(fig)


Below we'll plot all test groups in a single chart to make them easier to compare.


In [15]:
# Overlay every test group in one chart so they can be compared directly.
groupColours = [                                     # (column, line colour), in legend order
    ('Primary Care, Females', '#00aeef'),
    ('Primary Care, Males', '#cf5c42'),
    ('Pregnant Women', '#5d6263'),
    ('HIV Testing, Females', '#00447c'),
    ('HIV Testing, Males', '#e1d8ad'),
    ]
groupColumns = [pair[0] for pair in groupColours]
# Derive the y-axis ceiling from the data (highest value plus 10%) instead of a
# fixed 3500, so the chart still fits if the numbers change.
yMax = curitibaPublic[groupColumns].max().max() * 1.1
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Topics', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,yMax),
    )
for groupName, colour in groupColours:               # One line per test group, labelled in the legend
    fig.line(curitibaPublic.index, curitibaPublic[groupName], line_color = colour, line_width = 5,
             line_alpha = 0.7, legend = groupName)
# Legend styling
legend().label_text_font='Open Sans'
legend().label_text_color='#363636'
legend().border_line_color='#f6f6f6'
# Fonts and colours shared by both axes
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font="Open Sans"
axis().major_label_text_font_size="10pt"
axis().minor_tick_line_color = "#d4d4d4"
# Light grey x-axis; the y-axis line and its major ticks are hidden
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None
# Only thin horizontal grid lines
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5
show(fig)


Correlation Between Test Groups

We can see above that there is some co-variance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig a bit further into any potential correlations to see if changes within the sub-groupings are indeed similar. If that is the case, it would imply that there is a large degree of robustness in the data. Or, in other words, if one group gets tested more in one month, so will others, meaning that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.

Simple Matrix

First we'll make a simple correlation matrix using Pandas' built-in DataFrame correlations function, .corr. It correlates all columns pairwise using either pearson, kendall, or spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.

Pearson assumes that the data is normally distributed. We can't really test that with only four data points, but code has been readied below for future use.

    normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
    normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
    normalTestPregnantWomen = scipy.stats.mstats.normaltest(curitibaPublic["Pregnant Women"])
    normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
    normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
    normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])
    print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
    print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
    print('Normal Distribution Test for "Pregnant Women": %s' % (normalTestPregnantWomen,))
    print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
    print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
    print('Normal Distribution Test for "Total": %s' % (normalTestTotal,))
    #curitibaPublic["Primary Care, Females"].normaltest()

In [16]:
# Histogram of the monthly test counts for "Primary Care, Females"
gg.ggplot(
    gg.aes(x="Primary Care, Females"),
    data=curitibaPublic
) + gg.geom_histogram()


stat_bin: binwidth defaulted to range/30.
    Use 'binwidth = x' to adjust this.
/home/ubuntu/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'rows' keyword is deprecated, use 'index' instead
  warnings.warn(msg, FutureWarning)
Out[16]:
<ggplot: (8789575508069)>

In [17]:
# Histogram of the monthly test counts for "Primary Care, Males"
gg.ggplot(
    gg.aes(x="Primary Care, Males"),
    data=curitibaPublic
) + gg.geom_histogram()


Out[17]:
<ggplot: (8789574913781)>

In [18]:
# Histogram of the monthly test counts for "Pregnant Women"
gg.ggplot(
    gg.aes(x="Pregnant Women"),
    data=curitibaPublic
) + gg.geom_histogram()


Out[18]:
<ggplot: (8789574811461)>

In [19]:
# Histogram of the monthly test counts for "HIV Testing, Females"
gg.ggplot(
    gg.aes(x="HIV Testing, Females"),
    data=curitibaPublic
) + gg.geom_histogram()


Out[19]:
<ggplot: (8789574720549)>

In [20]:
# Histogram of the monthly test counts for "HIV Testing, Males"
gg.ggplot(
    gg.aes(x="HIV Testing, Males"),
    data=curitibaPublic
) + gg.geom_histogram()


Out[20]:
<ggplot: (8789574609437)>

In [22]:
# Histogram of the monthly totals ("Curitiba Total")
gg.ggplot(
    gg.aes(x="Curitiba Total"),
    data=curitibaPublic
) + gg.geom_histogram()


Out[22]:
<ggplot: (8789574442225)>

In [23]:
# Pairwise correlation between all columns; Pearson is also the pandas default
curitibaPublicCorr = curitibaPublic.corr(method='pearson')
curitibaPublicCorr


Out[23]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females Curitiba Total
Primary Care, Females 1.000000 -0.094142 -0.746287 -0.679287 -0.756572 -0.386639
Primary Care, Males -0.094142 1.000000 0.699348 0.671452 0.404677 0.948163
Pregnant Women -0.746287 0.699348 1.000000 0.807249 0.658780 0.889943
HIV Testing, Males -0.679287 0.671452 0.807249 1.000000 0.947279 0.777277
HIV Testing, Females -0.756572 0.404677 0.658780 0.947279 1.000000 0.538360
Curitiba Total -0.386639 0.948163 0.889943 0.777277 0.538360 1.000000
    curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using kendall
    curitibaPublicCorrKendall

Correlation Matrix


In [24]:
# Draw the ground-truth correlation matrix on a square 8x8 figure
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(
    curitibaPublicCorr,
    annot=False,
    sig_stars=True,
    diag_names=False,
    cmap=cmap,          # Diverging palette defined in the style cell
    ax=ax,
)
f.tight_layout()


Main Group Correlations


In [25]:
# Regression joint plot: primary-care tests vs. testing-centre tests, females
sns.jointplot(x="Primary Care, Females", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");



In [26]:
# Regression joint plot: testing-centre tests, males vs. females
sns.jointplot(x="HIV Testing, Males", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");


Getting the Twitter Data


In [46]:
# Checking that the data file looks right #
!cat ../data/all.tsv | head











cat: write error: Broken pipe

In [43]:
# Read in Twitter data file #
twitterData = pd.read_table(
    '../data/all.tsv',
    encoding='utf-8',
    index_col=[1],             # Column at position 1 becomes the index ...
    parse_dates=[1],           # ... and is parsed as dates
    na_values=['NaN', ''],     # Treat these strings as missing values
    # header=None is deliberately left out: the file's first row supplies the column names
)

In [44]:
# Preview the first rows of the Twitter data to check columns and the date index
twitterData.head()


Out[44]:
city lat lon topic
origdate
2014-06-19 06:01:11 Porto Alegre -30.11462 -51.16393 Prevention_Positive
2014-06-19 09:06:28 Fortaleza -3.72271 -38.52465 Discrimination_Negative
2014-06-19 00:22:09 Recife -8.01175 -34.95291 Discrimination_Negative
2014-06-19 02:07:21 Brasília -15.79159 -47.89558 Discrimination_Negative
2014-06-19 23:55:34 Fortaleza -3.72271 -38.52465 Discrimination_Negative

In [31]:
# Keep one column per Twitter topic group.
topicColumns = ['Campaign', 'Discrimination', 'Prevention', 'Testing']
if set(topicColumns).issubset(twitterData.columns):
    # Wide layout: the per-topic columns already exist, so slice them out as before.
    twitterDataSmall = twitterData.loc[:, 'Campaign':'Testing']
else:
    # Long layout (city / lat / lon / topic, as in the preview above): the original
    # label slice fails on this layout, so build one 0/1 indicator column per topic
    # group from the `topic` column instead.
    # NOTE(review): assumes the text before "_" in `topic` (e.g. "Prevention_Positive")
    # names the topic group and that suffixes should be pooled -- TODO confirm.
    topicGroup = twitterData['topic'].str.split('_').str.get(0)
    twitterDataSmall = pd.get_dummies(topicGroup).reindex(columns=topicColumns, fill_value=0)

In [32]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# Keep only the months covered by the ground-truth data (it has no October figures).
# Slicing on the ground-truth date range, rather than taking the first four rows,
# stays correct if the Twitter file starts earlier or ends later.
twitterDataSmallAgg = twitterDataSmallAgg.loc[curitibaPublic.index.min():curitibaPublic.index.max()]
twitterDataSmallAgg


Out[32]:
Campaign Discrimination Prevention Testing Twitter Total
2014-06-01 3754 78987 16729 536 100006
2014-07-01 4346 73833 20443 563 99185
2014-08-01 1667 67650 14883 165 84365
2014-09-01 812 77534 13699 157 92202

Looking at the Twitter Data


In [33]:
# One chart per column: each Twitter topic, followed by the monthly total.
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,              # Plot title (the column name)
        y_axis_label = 'Tweets',                            # These series are tweet counts, not HIV tests
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line surrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),        # x-axis runs from the first to the last date in the data
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),  # y-axis starts at 0 and leaves 10% headroom above the peak
        # No "tools" argument is passed, so Bokeh picks its default toolset
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                          # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                           # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = twitterDataSmallAgg[col].name,            # Label name for the legend (column name)
        )
    # Legend styling (no legend label is passed to fig.line above)
    legend().label_text_font='Open Sans'
    legend().label_text_color='#363636'
    legend().border_line_color='#f6f6f6'
    # Fonts and colours shared by both axes
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font="Open Sans"
    axis().major_label_text_font_size="10pt"
    axis().minor_tick_line_color = "#d4d4d4"
    # Light grey x-axis; the y-axis line and its major ticks are hidden
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None
    # Only thin horizontal grid lines
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5
    show(fig)


Merging Data


In [34]:
# Join the monthly Twitter aggregates onto the ground-truth table by date index.
# A left join keeps every ground-truth month; the arguments dropped here only
# restated pandas' defaults (on/left_on/right_on=None, suffixes=('_x', '_y'), copy=True).
df = pd.merge(
    curitibaPublic,
    twitterDataSmallAgg,
    how='left',
    left_index=True,
    right_index=True,
    sort=True,
)

In [35]:
# Preview the merged table: ground-truth columns followed by the Twitter columns
df.head()


Out[35]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females Curitiba Total Campaign Discrimination Prevention Testing Twitter Total
2014-06-01 1174 840 2086 307 94 4501 3754 78987 16729 536 100006
2014-07-01 712 1197 3423 490 197 6019 4346 73833 20443 563 99185
2014-08-01 1267 1009 2020 404 158 4858 1667 67650 14883 165 84365
2014-09-01 1438 1250 2571 377 107 5743 812 77534 13699 157 92202

Comparisons


In [36]:
# Pairwise correlation across ground-truth and Twitter columns; Pearson is also the pandas default
dfCorr = df.corr(method='pearson')
dfCorr


Out[36]:
Primary Care, Females Primary Care, Males Pregnant Women HIV Testing, Males HIV Testing, Females Curitiba Total Campaign Discrimination Prevention Testing Twitter Total
Primary Care, Females 1.000000 -0.094142 -0.746287 -0.679287 -0.756572 -0.386639 -0.871252 0.100282 -0.990627 -0.793788 -0.556484
Primary Care, Males -0.094142 1.000000 0.699348 0.671452 0.404677 0.948163 -0.328454 -0.053807 0.026313 -0.311391 -0.111999
Pregnant Women -0.746287 0.699348 1.000000 0.807249 0.658780 0.889943 0.445380 0.104931 0.721673 0.445322 0.481301
HIV Testing, Males -0.679287 0.671452 0.807249 1.000000 0.947279 0.777277 0.231663 -0.500540 0.583412 0.111548 -0.054721
HIV Testing, Females -0.756572 0.404677 0.658780 0.947279 1.000000 0.538360 0.371501 -0.649023 0.660181 0.207545 -0.091602
Curitiba Total -0.386639 0.948163 0.889943 0.777277 0.538360 1.000000 -0.011861 0.028211 0.334325 0.002520 0.152247
Campaign -0.871252 -0.328454 0.445380 0.231663 0.371501 -0.011861 1.000000 0.195708 0.922476 0.976264 0.769774
Discrimination 0.100282 -0.053807 0.104931 -0.500540 -0.649023 0.028211 0.195708 1.000000 0.021836 0.403337 0.760921
Prevention -0.990627 0.026313 0.721673 0.583412 0.660181 0.334325 0.922476 0.021836 1.000000 0.868131 0.659190
Testing -0.793788 -0.311391 0.445322 0.111548 0.207545 0.002520 0.976264 0.403337 0.868131 1.000000 0.887252
Twitter Total -0.556484 -0.111999 0.481301 -0.054721 -0.091602 0.152247 0.769774 0.760921 0.659190 0.887252 1.000000

In [37]:
# Draw the merged (ground truth + Twitter) correlation matrix on a square 8x8 figure
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(
    dfCorr,
    annot=False,
    sig_stars=True,
    diag_names=False,
    cmap=cmap,          # Diverging palette defined in the style cell
    ax=ax,
)
f.tight_layout()



In [38]:
# Regression joint plot: monthly test total vs. monthly tweet total
sns.jointplot(x="Curitiba Total", y="Twitter Total", data=df,
              kind="reg", color="#404040");



In [ ]:
# Re-draw the ground-truth test groups in one chart (only curitibaPublic columns are plotted here).
groupColours = [                                     # (column, line colour), in legend order
    ('Primary Care, Females', '#00aeef'),
    ('Primary Care, Males', '#cf5c42'),
    ('Pregnant Women', '#5d6263'),
    ('HIV Testing, Females', '#00447c'),
    ('HIV Testing, Males', '#e1d8ad'),
    ]
groupColumns = [pair[0] for pair in groupColours]
# Derive the y-axis ceiling from the data (highest value plus 10%) instead of a
# fixed 3500, so the chart still fits if the numbers change.
yMax = curitibaPublic[groupColumns].max().max() * 1.1
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Topics', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,yMax),
    )
for groupName, colour in groupColours:               # One line per test group, labelled in the legend
    fig.line(curitibaPublic.index, curitibaPublic[groupName], line_color = colour, line_width = 5,
             line_alpha = 0.7, legend = groupName)
# Legend styling
legend().label_text_font='Open Sans'
legend().label_text_color='#363636'
legend().border_line_color='#f6f6f6'
# Fonts and colours shared by both axes
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font="Open Sans"
axis().major_label_text_font_size="10pt"
axis().minor_tick_line_color = "#d4d4d4"
# Light grey x-axis; the y-axis line and its major ticks are hidden
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None
# Only thin horizontal grid lines
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5
show(fig)

In [40]:
# Regression joint plot: primary-care tests (females) vs. "Prevention" tweets
sns.jointplot(x="Primary Care, Females", y="Prevention", data=df,
              kind="reg", color="#404040");



In [1]:
# Load the contents of custom.css and hand them to IPython's HTML display.
from IPython.core.display import HTML
with open("../css/custom.css", "r") as cssFile:   # Context manager closes the file handle after reading
    styles = cssFile.read()
HTML(styles)


Out[1]:

In [ ]: