Setting the Stage


In [1]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy.stats  # importing the stats submodule explicitly; scipy.stats.mstats.normaltest is used below
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *  # provides figure(), output_notebook(), and show() used below


BokehJS successfully loaded.

In [2]:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,scikit-learn,nltk,gensim,textblob,ggplot,matplotlib,mpld3,seaborn,bokeh,pymysql -d -n -t -z -v -m -g


Rene Clausen Nielsen, UN Global Pulse 10/03/2015 12:57:57 UTC

CPython 2.7.9
IPython 2.4.1

pandas 0.15.1
numpy 1.9.2
scipy 0.15.1
geolocator 0.2.dev0
scikit-learn 0.15.2
nltk 3.0.1
gensim 0.10.3
textblob 0.9.0
ggplot 0.6.5
matplotlib 1.4.3
mpld3 0.2
seaborn 0.5.1
bokeh 0.8.1
pymysql 0.6.3

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : 

In [3]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
sns.set_style("darkgrid", {"grid.linewidth": .9, "axes.facecolor": ".98"})
sns.set_context("notebook") # paper, notebook, talk, poster
# colour_map = dict(unsafe="red", celebs="blue", general="yellow", awareness="grey", myths="purple", stigma="green", safe="#2ecc71", advocates="#34495e", race="#e74c3c", jokes="#3498db",needle="steelblue", questions="indianred")
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)

Getting the Ground-Truth Data


In [4]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")

In [5]:
curitibaData.sheet_names


Out[5]:
[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']

In [6]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)

In [7]:
curitibaPublic


Out[7]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10 2014-11 2014-12
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413 720 686
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187 577 617
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499 1564 1666
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370 405 430
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117 111 156

In [8]:
curitibaPublic = curitibaPublic.loc[:,:'2014-10'] # Excluding November and December, where the ground-truth data are biased
curitibaPublic


Out[8]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117

Turning the Table


In [9]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic


Out[9]:
HIV tests performed Primary Health Care services (females) Primary Health Care services (males) Pregnant women in public health care HIV testing center (males) HIV testing center (females)
2014-01 1465 1057 2760 528 186
2014-02 1425 1056 2360 358 133
2014-03 1378 998 2095 389 158
2014-04 1477 1115 2258 383 155
2014-05 1437 1064 2473 348 155
2014-06 1174 840 2086 307 94
2014-07 1495 1197 2640 490 197
2014-08 1267 1009 2020 404 158
2014-09 1438 1250 2571 377 107
2014-10 1413 1187 2499 370 117

In [10]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', axis=1)
curitibaPublic


Out[10]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01 1465 1057 528 186
2014-02 1425 1056 358 133
2014-03 1378 998 389 158
2014-04 1477 1115 383 155
2014-05 1437 1064 348 155
2014-06 1174 840 307 94
2014-07 1495 1197 490 197
2014-08 1267 1009 404 158
2014-09 1438 1250 377 107
2014-10 1413 1187 370 117

In [11]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as


Out[11]:
Primary Care, Females    int64
Primary Care, Males      int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object

In [12]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such

In [13]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans


Out[13]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01-01 1465 1057 528 186
2014-02-01 1425 1056 358 133
2014-03-01 1378 998 389 158
2014-04-01 1477 1115 383 155
2014-05-01 1437 1064 348 155
2014-06-01 1174 840 307 94
2014-07-01 1495 1197 490 197
2014-08-01 1267 1009 404 158
2014-09-01 1438 1250 377 107
2014-10-01 1413 1187 370 117

In [14]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic


Out[14]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
2014-01-01 1465 1057 528 186 3236
2014-02-01 1425 1056 358 133 2972
2014-03-01 1378 998 389 158 2923
2014-04-01 1477 1115 383 155 3130
2014-05-01 1437 1064 348 155 3004
2014-06-01 1174 840 307 94 2415
2014-07-01 1495 1197 490 197 3379
2014-08-01 1267 1009 404 158 2838
2014-09-01 1438 1250 377 107 3172
2014-10-01 1413 1187 370 117 3087

Looking at the Ground-Truth Data

Now we'll start having a graphical look at the data. First off we'll create a timeseries graph for each test group and for the monthly totals (that is, a timeseries graph for each column).


In [15]:
for col in curitibaPublic:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name,                   # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line surrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(curitibaPublic[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        curitibaPublic.index,                               # Variable values for the x-axis (index = dates)
        curitibaPublic[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    show(fig)


Below we'll draw all the groups as lines in a single chart to make comparison easier.


In [16]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Topics', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,1800),
    )
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)


Correlation Between Test Groups

We can see above that there is some covariance between the sub-groupings, though July looks like a bit of a dividing month. We'll therefore dig further into potential correlations to see whether changes within the sub-groupings really do track each other. If they do, it would imply a large degree of robustness in the data: if one group gets tested more in a given month, so do the others, suggesting that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.

First we'll make a simple correlation matrix using pandas' built-in DataFrame method, .corr, which correlates all columns pairwise using Pearson, Kendall, or Spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.
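For reference, swapping methods is a one-argument change; a quick sketch (the Pearson matrix itself follows further below):

for method in ('pearson', 'kendall', 'spearman'):   # the three methods pandas supports
    print('--- %s ---' % method)
    print(curitibaPublic.corr(method=method))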

Normal Distribution Test

Pearson assumes that the data is normally distributed. We can't really test that with only ten data points per series, but the code below has been readied for future use.


In [17]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
# print('Normal Distribution Test for "Pregnant Women": %s' % (normalTestPregnantWomen,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))

#curitibaPublic["Primary Care, Females"].normaltest()


Normal Distribution Test for "Primary Care, Females": (6.4451966926332762, 0.039851375941810332)
Normal Distribution Test for "Primary Care, Males": (1.0120404797968383, 0.60289016894457592)
Normal Distribution Test for "HIV Testing, Females": (0.26951130991670885, 0.87392942592337231)
Normal Distribution Test for "HIV Testing, Males": (3.3488328623461694, 0.18741751859687544)
Normal Distribution Test for "Curitiba Total": (5.7922243106692832, 0.055237558175648439)
/home/ubuntu/anaconda/lib/python2.7/site-packages/scipy/stats/mstats_basic.py:1613: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  np.min(n))
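
As the warning notes, kurtosistest wants n >= 20. For samples this small, the Shapiro-Wilk test is generally the better fit; a minimal sketch, again readied for future use:

import scipy.stats

for col in curitibaPublic:
    w, p = scipy.stats.shapiro(curitibaPublic[col])   # Shapiro-Wilk statistic and p-value
    print('Shapiro-Wilk for "%s": W=%.3f, p=%.3f' % (col, w, p))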

Histograms

As with the normality tests above, we don't really have enough data points for a histogram to be a useful visual check of normality, but again, this is readied for future use.

# Note: in a notebook cell only the last expression is rendered; wrap each call in print() to display all five.
(gg.ggplot(gg.aes(x="Primary Care, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Primary Care, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Curitiba Total"), data=curitibaPublic) + gg.geom_histogram())
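
The same histograms can also be drawn without ggplot; a pandas-native sketch, with the same small-sample caveat:

curitibaPublic.hist(figsize=(10, 8), bins=5)   # one panel per column
plt.tight_layout()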

Correlation Matrix


In [20]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr


Out[20]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
Primary Care, Females 1.000000 0.784215 0.531210 0.536651 0.935275
Primary Care, Males 0.784215 1.000000 0.348880 0.182559 0.859216
HIV Testing, Males 0.531210 0.348880 1.000000 0.820127 0.713403
HIV Testing, Females 0.536651 0.182559 0.820127 1.000000 0.618473
Curitiba Total 0.935275 0.859216 0.713403 0.618473 1.000000
curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using Kendall
curitibaPublicCorrKendall

In [21]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Main Group Correlations


In [22]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");


Getting the Twitter Data


In [23]:
# Checking that the data file looks right #
!cat ../data/all.csv | head


cat: write error: Broken pipe

In [24]:
# Read in Twitter data file #
twitterData=pd.read_csv('../ipynb/spark/output/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN',''],
                          parse_dates=[3],
                          index_col=[3]
                        )
twitterData.head()


Out[24]:
city lat lon topic
origdate
2014-04-03 21:23:26 Recife -8.057838 -34.882897 Discrimination_Negative
2015-02-07 23:51:00 São Paulo -23.500000 -46.600000 Discrimination_Negative
2014-11-12 20:35:08 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-04-06 12:06:17 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-09-13 15:38:28 São Paulo -23.500000 -46.600000 Discrimination_Negative

In [25]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()


Out[25]:
city topic
origdate
2014-11-12 20:35:08 Curitiba Discrimination_Negative
2014-04-06 12:06:17 Curitiba Discrimination_Negative
2014-09-19 02:54:41 Curitiba Discrimination_Negative
2014-03-25 19:49:02 Curitiba Discrimination_Negative
2014-04-22 06:39:53 Curitiba Discrimination_Negative

In [26]:
twitterDataSmall.describe()


Out[26]:
city topic
count 7252 7252
unique 1 7
top Curitiba Discrimination_Negative
freq 7252 5502

In [27]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()


Out[27]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
origdate
2014-11-12 20:35:08 0 1 0 0 0 0 0
2014-04-06 12:06:17 0 1 0 0 0 0 0
2014-09-19 02:54:41 0 1 0 0 0 0 0
2014-03-25 19:49:02 0 1 0 0 0 0 0
2014-04-22 06:39:53 0 1 0 0 0 0 0

In [28]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg


Out[28]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 6 540 12 1 6 61 1 627
2014-02-01 0 498 38 2 11 63 0 612
2014-03-01 2 509 39 2 21 78 0 651
2014-04-01 0 472 53 1 11 56 1 594
2014-05-01 0 371 38 1 5 46 1 462
2014-06-01 0 415 18 1 9 61 1 505
2014-07-01 5 401 12 2 12 87 2 521
2014-08-01 1 349 59 1 5 53 2 470
2014-09-01 0 349 107 0 7 51 0 514
2014-10-01 0 344 62 3 8 40 0 457
2014-11-01 1 265 30 2 12 75 1 386
2014-12-01 8 302 36 1 28 51 4 430
2015-01-01 1 324 27 1 31 77 3 464
2015-02-01 3 274 37 1 26 89 5 435
2015-03-01 6 89 9 1 4 15 0 124

Looking at the Twitter Data


In [29]:
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = twitterDataSmallAgg[col].name,        # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)



In [30]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr


Out[30]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
Campaign_Portuguese 1.000000 -0.257204 -0.512107 -0.108423 0.235566 -0.080378 0.384959 -0.304044
Discrimination_Negative -0.257204 1.000000 0.068860 0.152680 -0.054787 0.400021 -0.236346 0.969129
Discrimination_Positive -0.512107 0.068860 1.000000 -0.221960 -0.129828 -0.154733 -0.192909 0.212160
Prevention_Negative -0.108423 0.152680 -0.221960 1.000000 -0.003667 0.157744 -0.254824 0.118625
Prevention_Neutral 0.235566 -0.054787 -0.129828 -0.003667 1.000000 0.561457 0.707131 0.094807
Prevention_Positive -0.080378 0.400021 -0.154733 0.157744 0.561457 1.000000 0.463087 0.529034
Testing_Neutral 0.384959 -0.236346 -0.192909 -0.254824 0.707131 0.463087 1.000000 -0.112459
Twitter Total -0.304044 0.969129 0.212160 0.118625 0.094807 0.529034 -0.112459 1.000000

In [31]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(twitterDataSmallAgg, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Merging Data


In [32]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left',
      left_index=True, right_index=True, sort=True) # Left-joining on the shared DatetimeIndex; the remaining arguments were pandas defaults
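
Since both frames share a monthly DatetimeIndex, a plain index join is an equivalent, more concise alternative; a sketch:

df_alt = curitibaPublic.join(twitterDataSmallAgg, how='left')   # join on the shared index
df_alt.equals(df)   # should be True: same rows, columns, and values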

In [33]:
df


Out[33]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
2014-01-01 1465 1057 528 186 3236 6 540 12 1 6 61 1 627
2014-02-01 1425 1056 358 133 2972 0 498 38 2 11 63 0 612
2014-03-01 1378 998 389 158 2923 2 509 39 2 21 78 0 651
2014-04-01 1477 1115 383 155 3130 0 472 53 1 11 56 1 594
2014-05-01 1437 1064 348 155 3004 0 371 38 1 5 46 1 462
2014-06-01 1174 840 307 94 2415 0 415 18 1 9 61 1 505
2014-07-01 1495 1197 490 197 3379 5 401 12 2 12 87 2 521
2014-08-01 1267 1009 404 158 2838 1 349 59 1 5 53 2 470
2014-09-01 1438 1250 377 107 3172 0 349 107 0 7 51 0 514
2014-10-01 1413 1187 370 117 3087 0 344 62 3 8 40 0 457

Comparisons


In [34]:
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1) # Dropping the 'Curitiba Total' and 'Twitter Total' columns
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr


Out[34]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
Primary Care, Females 1.000000 0.784215 0.531210 0.536651 0.354117 0.225466 0.069446 0.128279 0.078527 0.122174 -0.136857
Primary Care, Males 0.784215 1.000000 0.348880 0.182559 0.103346 -0.321404 0.515700 0.073299 -0.123659 -0.116219 -0.155290
HIV Testing, Males 0.531210 0.348880 1.000000 0.820127 0.936994 0.348728 -0.353389 0.036715 -0.025768 0.439405 0.407001
HIV Testing, Females 0.536651 0.182559 0.820127 1.000000 0.784017 0.374796 -0.503709 0.123243 0.135034 0.527283 0.561009
Campaign_Portuguese 0.354117 0.103346 0.936994 0.784017 1.000000 0.438557 -0.586626 0.081242 0.102658 0.606875 0.421848
Discrimination_Negative 0.225466 -0.321404 0.348728 0.374796 0.438557 1.000000 -0.529473 0.084905 0.498054 0.485684 -0.190649
Discrimination_Positive 0.069446 0.515700 -0.353389 -0.503709 -0.586626 -0.529473 1.000000 -0.317560 -0.189156 -0.550650 -0.428773
Prevention_Negative 0.128279 0.073299 0.036715 0.123243 0.081242 0.084905 -0.317560 1.000000 0.414624 0.173084 -0.200446
Prevention_Neutral 0.078527 -0.123659 -0.025768 0.135034 0.102658 0.498054 -0.189156 0.414624 1.000000 0.684826 -0.325051
Prevention_Positive 0.122174 -0.116219 0.439405 0.527283 0.606875 0.485684 -0.550650 0.173084 0.684826 1.000000 0.270588
Testing_Neutral -0.136857 -0.155290 0.407001 0.561009 0.421848 -0.190649 -0.428773 -0.200446 -0.325051 0.270588 1.000000

In [35]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [36]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr


Out[36]:
Curitiba Total Twitter Total
Curitiba Total 1.000000 0.169366
Twitter Total 0.169366 1.000000

In [37]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [38]:
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");



In [39]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [40]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [41]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");


Anomaly Detection

We can only do anomaly detection on the tweets, because we don't have enough data points in our ground-truth data. As we have very fine temporal information for every tweet (down to the second), we can resample to hourly or daily aggregates. That should give us enough data for anomaly detection.

First we resample to daily aggregates.


In [42]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with daily totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()


Out[42]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 0 15 0 0 0 1 0 16
2014-01-02 0 16 0 0 0 0 0 16
2014-01-03 0 15 0 0 0 0 0 15
2014-01-04 0 13 0 0 2 4 0 19
2014-01-05 0 12 0 0 0 4 0 16
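
The hourly counterpart mentioned above is a one-line change; a sketch, not used further here:

twitterDataHourlyAgg = twitterDataSmall.resample('H', how='sum')   # hourly bins instead of daily
twitterDataHourlyAgg['Twitter Total'] = twitterDataHourlyAgg.sum(axis=1)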

As we'll be using R instead of Python for this, we'll use IPython's built-in R interpreter via the so-called magic functions.


In [43]:
%load_ext rmagic
%R update.packages(); install.packages("devtools"); devtools::install_github("twitter/AnomalyDetection") # One-time setup; line magic so it can share a cell with %load_ext

In [44]:
%R library(AnomalyDetection)


Out[44]:
array(['AnomalyDetection', 'tools', 'stats', 'graphics', 'grDevices',
       'utils', 'datasets', 'methods', 'base'], 
      dtype='|S16')

In [45]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)

In [46]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')

In [47]:
!cat TwitterDailyAgg.csv | head


origdate,Twitter Total
2014-01-01,16.0
2014-01-02,16.0
2014-01-03,15.0
2014-01-04,19.0
2014-01-05,16.0
2014-01-06,21.0
2014-01-07,28.0
2014-01-08,14.0
2014-01-09,14.0
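
As an aside, the round-trip through a CSV could be skipped by pushing the values straight into R with the same rmagic extension; a sketch (assuming rmagic's numpy conversion handles the array):

tot = twitterDataDailyAgg['Twitter Total'].values   # plain numpy array for rpy2
%Rpush tot
%R summary(tot)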

In [48]:
%%R

df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)

In [49]:
%R data(df_r)


Out[49]:
array(['df_r'], 
      dtype='|S4')

In [50]:
%R df_r


Out[50]:
array([['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
        '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
        '2014-01-09', '2014-01-10', '2014-01-11', '2014-01-12',
        '2014-01-13', '2014-01-14', '2014-01-15', '2014-01-16',
        '2014-01-17', '2014-01-18', '2014-01-19', '2014-01-20',
        '2014-01-21', '2014-01-22', '2014-01-23', '2014-01-24',
        '2014-01-25', '2014-01-26', '2014-01-27', '2014-01-28',
        '2014-01-29', '2014-01-30', '2014-01-31', '2014-02-01',
        '2014-02-02', '2014-02-03', '2014-02-04', '2014-02-05',
        '2014-02-06', '2014-02-07', '2014-02-08', '2014-02-09',
        '2014-02-10', '2014-02-11', '2014-02-12', '2014-02-13',
        '2014-02-14', '2014-02-15', '2014-02-16', '2014-02-17',
        '2014-02-18', '2014-02-19', '2014-02-20', '2014-02-21',
        '2014-02-22', '2014-02-23', '2014-02-24', '2014-02-25',
        '2014-02-26', '2014-02-27', '2014-02-28', '2014-03-01',
        '2014-03-02', '2014-03-03', '2014-03-04', '2014-03-05',
        '2014-03-06', '2014-03-07', '2014-03-08', '2014-03-09',
        '2014-03-10', '2014-03-11', '2014-03-12', '2014-03-13',
        '2014-03-14', '2014-03-15', '2014-03-16', '2014-03-17',
        '2014-03-18', '2014-03-19', '2014-03-20', '2014-03-21',
        '2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25',
        '2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29',
        '2014-03-30', '2014-03-31', '2014-04-01', '2014-04-02',
        '2014-04-03', '2014-04-04', '2014-04-05', '2014-04-06',
        '2014-04-07', '2014-04-08', '2014-04-09', '2014-04-10',
        '2014-04-11', '2014-04-12', '2014-04-13', '2014-04-14',
        '2014-04-15', '2014-04-16', '2014-04-17', '2014-04-18',
        '2014-04-19', '2014-04-20', '2014-04-21', '2014-04-22',
        '2014-04-23', '2014-04-24', '2014-04-25', '2014-04-26',
        '2014-04-27', '2014-04-28', '2014-04-29', '2014-04-30',
        '2014-05-01', '2014-05-02', '2014-05-03', '2014-05-04',
        '2014-05-05', '2014-05-06', '2014-05-07', '2014-05-08',
        '2014-05-09', '2014-05-10', '2014-05-11', '2014-05-12',
        '2014-05-13', '2014-05-14', '2014-05-15', '2014-05-16',
        '2014-05-17', '2014-05-18', '2014-05-19', '2014-05-20',
        '2014-05-21', '2014-05-22', '2014-05-23', '2014-05-24',
        '2014-05-25', '2014-05-26', '2014-05-27', '2014-05-28',
        '2014-05-29', '2014-05-30', '2014-05-31', '2014-06-01',
        '2014-06-02', '2014-06-03', '2014-06-04', '2014-06-05',
        '2014-06-06', '2014-06-07', '2014-06-08', '2014-06-09',
        '2014-06-10', '2014-06-11', '2014-06-12', '2014-06-13',
        '2014-06-14', '2014-06-15', '2014-06-16', '2014-06-17',
        '2014-06-18', '2014-06-19', '2014-06-20', '2014-06-21',
        '2014-06-22', '2014-06-23', '2014-06-24', '2014-06-25',
        '2014-06-26', '2014-06-27', '2014-06-28', '2014-06-29',
        '2014-06-30', '2014-07-01', '2014-07-02', '2014-07-03',
        '2014-07-04', '2014-07-05', '2014-07-06', '2014-07-07',
        '2014-07-08', '2014-07-09', '2014-07-10', '2014-07-11',
        '2014-07-12', '2014-07-13', '2014-07-14', '2014-07-15',
        '2014-07-16', '2014-07-17', '2014-07-18', '2014-07-19',
        '2014-07-20', '2014-07-21', '2014-07-22', '2014-07-23',
        '2014-07-24', '2014-07-25', '2014-07-26', '2014-07-27',
        '2014-07-28', '2014-07-29', '2014-07-30', '2014-07-31',
        '2014-08-01', '2014-08-02', '2014-08-03', '2014-08-04',
        '2014-08-05', '2014-08-06', '2014-08-07', '2014-08-08',
        '2014-08-09', '2014-08-10', '2014-08-11', '2014-08-12',
        '2014-08-13', '2014-08-14', '2014-08-15', '2014-08-16',
        '2014-08-17', '2014-08-18', '2014-08-19', '2014-08-20',
        '2014-08-21', '2014-08-22', '2014-08-23', '2014-08-24',
        '2014-08-25', '2014-08-26', '2014-08-27', '2014-08-28',
        '2014-08-29', '2014-08-30', '2014-08-31', '2014-09-01',
        '2014-09-02', '2014-09-03', '2014-09-04', '2014-09-05',
        '2014-09-06', '2014-09-07', '2014-09-08', '2014-09-09',
        '2014-09-10', '2014-09-11', '2014-09-12', '2014-09-13',
        '2014-09-14', '2014-09-15', '2014-09-16', '2014-09-17',
        '2014-09-18', '2014-09-19', '2014-09-20', '2014-09-21',
        '2014-09-22', '2014-09-23', '2014-09-24', '2014-09-25',
        '2014-09-26', '2014-09-27', '2014-09-28', '2014-09-29',
        '2014-09-30', '2014-10-01', '2014-10-02', '2014-10-03',
        '2014-10-04', '2014-10-05', '2014-10-06', '2014-10-07',
        '2014-10-08', '2014-10-09', '2014-10-10', '2014-10-11',
        '2014-10-12', '2014-10-13', '2014-10-14', '2014-10-15',
        '2014-10-16', '2014-10-17', '2014-10-18', '2014-10-19',
        '2014-10-20', '2014-10-21', '2014-10-22', '2014-10-23',
        '2014-10-24', '2014-10-25', '2014-10-26', '2014-10-27',
        '2014-10-28', '2014-10-29', '2014-10-30', '2014-10-31',
        '2014-11-01', '2014-11-02', '2014-11-03', '2014-11-04',
        '2014-11-05', '2014-11-06', '2014-11-07', '2014-11-08',
        '2014-11-09', '2014-11-10', '2014-11-11', '2014-11-12',
        '2014-11-13', '2014-11-14', '2014-11-15', '2014-11-16',
        '2014-11-17', '2014-11-18', '2014-11-19', '2014-11-20',
        '2014-11-21', '2014-11-22', '2014-11-23', '2014-11-24',
        '2014-11-25', '2014-11-26', '2014-11-27', '2014-11-28',
        '2014-11-29', '2014-11-30', '2014-12-01', '2014-12-02',
        '2014-12-03', '2014-12-04', '2014-12-05', '2014-12-06',
        '2014-12-07', '2014-12-08', '2014-12-09', '2014-12-10',
        '2014-12-11', '2014-12-12', '2014-12-13', '2014-12-14',
        '2014-12-15', '2014-12-16', '2014-12-17', '2014-12-18',
        '2014-12-19', '2014-12-20', '2014-12-21', '2014-12-22',
        '2014-12-23', '2014-12-24', '2014-12-25', '2014-12-26',
        '2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
        '2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
        '2015-01-04', '2015-01-05', '2015-01-06', '2015-01-07',
        '2015-01-08', '2015-01-09', '2015-01-10', '2015-01-11',
        '2015-01-12', '2015-01-13', '2015-01-14', '2015-01-15',
        '2015-01-16', '2015-01-17', '2015-01-18', '2015-01-19',
        '2015-01-20', '2015-01-21', '2015-01-22', '2015-01-23',
        '2015-01-24', '2015-01-25', '2015-01-26', '2015-01-27',
        '2015-01-28', '2015-01-29', '2015-01-30', '2015-01-31',
        '2015-02-01', '2015-02-02', '2015-02-03', '2015-02-04',
        '2015-02-05', '2015-02-06', '2015-02-07', '2015-02-08',
        '2015-02-09', '2015-02-10', '2015-02-11', '2015-02-12',
        '2015-02-13', '2015-02-14', '2015-02-15', '2015-02-16',
        '2015-02-17', '2015-02-18', '2015-02-19', '2015-02-20',
        '2015-02-21', '2015-02-22', '2015-02-23', '2015-02-24',
        '2015-02-25', '2015-02-26', '2015-02-27', '2015-02-28',
        '2015-03-01', '2015-03-02', '2015-03-03', '2015-03-04',
        '2015-03-05', '2015-03-06', '2015-03-07', '2015-03-08',
        '2015-03-09'],
       ['16.0', '16.0', '15.0', '19.0', '16.0', '21.0', '28.0', '14.0',
        '14.0', '22.0', '10.0', '18.0', '21.0', '15.0', '31.0', '34.0',
        '22.0', '17.0', '20.0', '19.0', '24.0', '13.0', '24.0', '19.0',
        '12.0', '13.0', '26.0', '40.0', '28.0', '12.0', '28.0', '34.0',
        '12.0', '29.0', '15.0', '15.0', '24.0', '21.0', '12.0', '27.0',
        '16.0', '19.0', '19.0', '34.0', '27.0', '18.0', '17.0', '31.0',
        '26.0', '32.0', '14.0', '19.0', '16.0', '23.0', '20.0', '22.0',
        '21.0', '13.0', '36.0', '37.0', '19.0', '19.0', '45.0', '25.0',
        '32.0', '23.0', '21.0', '27.0', '14.0', '12.0', '20.0', '30.0',
        '32.0', '15.0', '12.0', '16.0', '11.0', '28.0', '21.0', '23.0',
        '14.0', '18.0', '15.0', '17.0', '17.0', '14.0', '23.0', '15.0',
        '20.0', '16.0', '17.0', '15.0', '18.0', '27.0', '11.0', '17.0',
        '21.0', '17.0', '16.0', '12.0', '17.0', '16.0', '21.0', '8.0',
        '26.0', '18.0', '24.0', '15.0', '20.0', '15.0', '26.0', '25.0',
        '27.0', '23.0', '26.0', '10.0', '27.0', '26.0', '31.0', '22.0',
        '17.0', '11.0', '20.0', '16.0', '14.0', '16.0', '17.0', '14.0',
        '22.0', '16.0', '11.0', '16.0', '9.0', '18.0', '12.0', '19.0',
        '13.0', '13.0', '15.0', '15.0', '7.0', '16.0', '12.0', '16.0',
        '12.0', '23.0', '13.0', '13.0', '11.0', '15.0', '20.0', '10.0',
        '7.0', '22.0', '26.0', '17.0', '15.0', '14.0', '12.0', '15.0',
        '19.0', '17.0', '24.0', '22.0', '17.0', '9.0', '16.0', '17.0',
        '22.0', '14.0', '17.0', '14.0', '14.0', '27.0', '12.0', '15.0',
        '23.0', '15.0', '22.0', '10.0', '21.0', '14.0', '13.0', '13.0',
        '21.0', '13.0', '9.0', '28.0', '24.0', '11.0', '15.0', '14.0',
        '30.0', '9.0', '17.0', '16.0', '20.0', '11.0', '20.0', '11.0',
        '11.0', '14.0', '25.0', '22.0', '24.0', '10.0', '12.0', '12.0',
        '27.0', '23.0', '16.0', '16.0', '15.0', '11.0', '12.0', '19.0',
        '18.0', '17.0', '24.0', '14.0', '15.0', '13.0', '6.0', '12.0',
        '12.0', '10.0', '16.0', '13.0', '17.0', '9.0', '13.0', '14.0',
        '8.0', '12.0', '8.0', '13.0', '14.0', '11.0', '24.0', '19.0',
        '37.0', '26.0', '18.0', '13.0', '13.0', '26.0', '18.0', '22.0',
        '27.0', '8.0', '15.0', '26.0', '19.0', '15.0', '22.0', '18.0',
        '15.0', '12.0', '19.0', '15.0', '14.0', '23.0', '14.0', '10.0',
        '16.0', '20.0', '11.0', '12.0', '15.0', '12.0', '9.0', '39.0',
        '16.0', '12.0', '16.0', '31.0', '10.0', '16.0', '18.0', '3.0',
        '15.0', '29.0', '12.0', '11.0', '10.0', '18.0', '24.0', '12.0',
        '15.0', '10.0', '10.0', '17.0', '29.0', '16.0', '10.0', '11.0',
        '14.0', '9.0', '12.0', '5.0', '13.0', '16.0', '18.0', '15.0',
        '5.0', '12.0', '11.0', '14.0', '12.0', '12.0', '15.0', '15.0',
        '16.0', '20.0', '16.0', '14.0', '17.0', '14.0', '13.0', '6.0',
        '25.0', '14.0', '15.0', '19.0', '13.0', '3.0', '15.0', '13.0',
        '13.0', '6.0', '7.0', '11.0', '10.0', '10.0', '61.0', '8.0',
        '17.0', '6.0', '14.0', '12.0', '4.0', '14.0', '8.0', '14.0',
        '19.0', '14.0', '18.0', '7.0', '12.0', '12.0', '10.0', '16.0',
        '21.0', '10.0', '11.0', '14.0', '22.0', '13.0', '5.0', '13.0',
        '12.0', '9.0', '21.0', '7.0', '6.0', '4.0', '11.0', '12.0', '14.0',
        '11.0', '10.0', '10.0', '17.0', '20.0', '17.0', '6.0', '11.0',
        '6.0', '11.0', '22.0', '29.0', '11.0', '13.0', '17.0', '19.0',
        '24.0', '16.0', '18.0', '21.0', '12.0', '17.0', '11.0', '9.0',
        '30.0', '24.0', '11.0', '12.0', '14.0', '17.0', '14.0', '18.0',
        '18.0', '17.0', '18.0', '19.0', '17.0', '18.0', '26.0', '25.0',
        '15.0', '7.0', '14.0', '16.0', '17.0', '20.0', '10.0', '12.0',
        '14.0', '15.0', '10.0', '9.0', '16.0', '15.0', '12.0', '9.0',
        '8.0', '8.0', '26.0', '13.0', '19.0', '14.0', '19.0', '8.0']], 
      dtype='|S32')

In [51]:
%%R
data(df_r)
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
In addition: Warning messages:
1: In data(df_r) : data set ‘df_r’ not found
2: In data(df_r) : data set ‘df_r’ not found
3: In max(ares) : no non-missing arguments to max; returning -Inf
4: In max(ares) : no non-missing arguments to max; returning -Inf
Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
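
The failure has two likely causes: data(df_r) looks for a dataset shipped with an R package rather than for our variable (hence the "data set 'df_r' not found" warnings), and AnomalyDetectionTs expects a two-column data frame whose first column holds POSIXct timestamps, while our CSV arrives as character dates. A hedged sketch of how the export might be reshaped for a retry (column names are illustrative; R would still need df_r$timestamp <- as.POSIXct(df_r$timestamp) before the call):

df_retry = twitterDataDailyAgg['Twitter Total'].reset_index()   # hypothetical retry
df_retry.columns = ['timestamp', 'count']                       # illustrative names
df_retry.to_csv('TwitterDailyAgg2.csv', index=False, date_format='%Y-%m-%d %H:%M:%S')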

Sanity Check with the Package's Example Data


In [52]:
%R raw_data


Out[52]:
array([ <DataFrame - Python:0x7f9457410c20 / R:0x796a848>
[Float..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...]
  <no name>: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f94574108c0 / R:0x5f9c910>
[0.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410dd0 / R:0x589e3f0>
[       1,        2,        3, ...,       56,       57,       58]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410b90 / R:0x4c88ff0>
[      14,       14,       14, ...,       13,       13,       13]
  ...
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410560 / R:0x6207940>
[       4,        4,        4, ...,        0,        0,        0]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f94574106c8 / R:0x645d960>
[     268,      268,      268, ...,      278,      278,      278]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410710 / R:0x60587c0>
[       0,        0,        0, ...,        0,        0,        0],
       <FloatVector - Python:0x7f9457410998 / R:0x58f9ed0>
[182.478000, 176.231000, 183.917000, ..., 153.776000, 150.481000, 146.638000]], dtype=object)

In [53]:
%%R

data(raw_data)
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Styling


In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:
