Setting the Stage


In [1]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy as scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *

In [2]:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,ggplot,matplotlib,mpld3,seaborn,bokeh -d -n -t -z -v -m -g


Rene Clausen Nielsen, UN Global Pulse 15/05/2015 18:53:16 UTC

CPython 2.7.9
IPython 3.1.0

pandas 0.16.1
numpy 1.9.2
scipy 0.15.1
geolocator 0.2.dev0
ggplot 0.6.5
matplotlib 1.4.3
mpld3 0.2
seaborn 0.5.1
bokeh 0.8.2

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : 

In [3]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
sns.set_context("poster")
sns.despine()
sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12, 
                        'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'axes.facecolor': ".97",
                        'grid.color': '.9', 'axes.edgecolor': '.9', 'font.family': ['sans-serif'], 'lines.solid_capstyle': 'round',
                        'font.sans-serif': ['Liberation Sans','Bitstream Vera Sans','sans-serif','Arial'],})


BokehJS successfully loaded.
<matplotlib.figure.Figure at 0x7f653e689bd0>

Getting Population Data


In [4]:
populationData = pd.read_csv('../data/BrazilPopulation.csv', encoding='utf-8',)
populationData.sort("Population", ascending=False)


Out[4]:
City State Population
24 São Paulo SP 11152968
21 Rio de Janeiro RJ 6320446
22 Salvador BA 2674923
4 Brasília DF 2481272
9 Fortaleza CE 2452185
2 Belo Horizonte MG 2375151
14 Manaus AM 1792881
7 Curitiba PR 1751907
19 Recife PE 1537704
17 Porto Alegre RS 1409351
1 Belém PA 1381475
10 Goiânia GO 1297154
23 São Luís MA 958545
13 Maceió AL 932078
15 Natal RN 803739
5 Campo Grande MS 776242
25 Teresina PI 767559
11 João Pessoa PB 720954
0 Aracaju SE 571149
6 Cuiabá MT 540814
8 Florianópolis SC 405189
18 Porto Velho RO 392475
12 Macapá AP 381091
26 Vitória ES 327801
20 Rio Branco AC 308545
3 Boa Vista RR 277799
16 Palmas TO 221742

In [5]:
populationData.sort("Population").plot(x = "City",
                   y = "Population",
                   kind = "barh", 
                   title = "Number of Inhabitants per City",
                   legend = False,
                   # colormap = cmap,
                   color = "#00aeef",
                   alpha=0.8)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f653de85110>

Getting the Ground-Truth Data


In [6]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")

In [7]:
curitibaData.sheet_names


Out[7]:
[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']

In [8]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)

In [9]:
curitibaPublic


Out[9]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10 2014-11 2014-12 2015-01 2015-02
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413 720 686 1352 1277
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187 577 617 1157 912
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499 1564 1666 3027 2130
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370 405 430 379 437
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117 111 156 112 144

In [10]:
# curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09'] # Including only months where we also have Twitter data 
# curitibaPublic

Turning the Table


In [11]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic


Out[11]:
HIV tests performed Primary Health Care services (females) Primary Health Care services (males) Pregnant women in public health care HIV testing center (males) HIV testing center (females)
2014-01 1465 1057 2760 528 186
2014-02 1425 1056 2360 358 133
2014-03 1378 998 2095 389 158
2014-04 1477 1115 2258 383 155
2014-05 1437 1064 2473 348 155
2014-06 1174 840 2086 307 94
2014-07 1495 1197 2640 490 197
2014-08 1267 1009 2020 404 158
2014-09 1438 1250 2571 377 107
2014-10 1413 1187 2499 370 117
2014-11 720 577 1564 405 111
2014-12 686 617 1666 430 156
2015-01 1352 1157 3027 379 112
2015-02 1277 912 2130 437 144

In [12]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', 1) # Excluding numbers from matenal health facilities as they are tests delivered, not tests made
curitibaPublic


Out[12]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01 1465 1057 528 186
2014-02 1425 1056 358 133
2014-03 1378 998 389 158
2014-04 1477 1115 383 155
2014-05 1437 1064 348 155
2014-06 1174 840 307 94
2014-07 1495 1197 490 197
2014-08 1267 1009 404 158
2014-09 1438 1250 377 107
2014-10 1413 1187 370 117
2014-11 720 577 405 111
2014-12 686 617 430 156
2015-01 1352 1157 379 112
2015-02 1277 912 437 144

In [13]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as


Out[13]:
Primary Care, Females    int64
Primary Care, Males      int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object

In [14]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such

In [15]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans


Out[15]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01-01 1465 1057 528 186
2014-02-01 1425 1056 358 133
2014-03-01 1378 998 389 158
2014-04-01 1477 1115 383 155
2014-05-01 1437 1064 348 155
2014-06-01 1174 840 307 94
2014-07-01 1495 1197 490 197
2014-08-01 1267 1009 404 158
2014-09-01 1438 1250 377 107
2014-10-01 1413 1187 370 117
2014-11-01 720 577 405 111
2014-12-01 686 617 430 156
2015-01-01 1352 1157 379 112
2015-02-01 1277 912 437 144

In [16]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic


Out[16]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
2014-01-01 1465 1057 528 186 3236
2014-02-01 1425 1056 358 133 2972
2014-03-01 1378 998 389 158 2923
2014-04-01 1477 1115 383 155 3130
2014-05-01 1437 1064 348 155 3004
2014-06-01 1174 840 307 94 2415
2014-07-01 1495 1197 490 197 3379
2014-08-01 1267 1009 404 158 2838
2014-09-01 1438 1250 377 107 3172
2014-10-01 1413 1187 370 117 3087
2014-11-01 720 577 405 111 1813
2014-12-01 686 617 430 156 1889
2015-01-01 1352 1157 379 112 3000
2015-02-01 1277 912 437 144 2770

Looking at the Ground-Truth Data

Now we'll start having a graphical look at the data. First off we'll create a timeseries graph for each topic and the monthly totals (that is, a timeseries graph for each column)


In [17]:
for col in curitibaPublic:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name,                   # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(curitibaPublic[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        curitibaPublic.index,                               # Variable values for the x-axis (index = dates)
        curitibaPublic[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
   # legend().label_text_font='Open Sans'
   # legend().label_text_color='#363636'
   # legend().border_line_color='#f6f6f6'
   # axis().axis_label_text_font = "Open Sans"
   # axis().axis_label_text_font_size = "12pt"
   # axis().axis_label_text_color = "#363636"
   # axis().major_label_text_font="Open Sans"
   # axis().major_label_text_font_size="10pt"
   # axis().minor_tick_line_color = "#d4d4d4"
   # xaxis().axis_line_color = '#d4d4d4'
   # xaxis().major_tick_line_color = "#d4d4d4"
   # yaxis().major_tick_line_color = None
   # yaxis().axis_line_color = None
   # xgrid().grid_line_color = None
   # ygrid().grid_line_color = "#d4d4d4"
    show(fig)


Below we'll insert lines for all topics in one chart to better compare.


In [46]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Groups', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,1800),
    )
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)


Correlation Between Test Groups

We can see above that there is some co-variance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig a bit further into any potential correlations to see if changes within the sub-groupings are indeed similar. If that is the case, it would imply that there is a large degree of robustness in the data. Or, in other words, if one group gets tested more in one month, so will others, meaning that external factors, such as campaigns or increased risk behavoiur across sub-groupings, could be at play.

First we'll make a simple correlation matrix using Pandas' built-in DataFrame correlations function, .corr. It correlates all columns pairwise using either pearson, kendall, or spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.

Normal Distribution Test

Pearson assumes that the data is normal distributed. We can't really test that with only four data points per series, but the code below has been readied for furture use.


In [19]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))

#curitibaPublic["Primary Care, Females"].normaltest()


Normal Distribution Test for "Primary Care, Females": (10.569402877642911, 0.0050685452331196016)
Normal Distribution Test for "Primary Care, Males": (3.844355404418017, 0.14628804320632191)
Normal Distribution Test for "HIV Testing, Females": (0.41082200345958769, 0.81431256375187089)
Normal Distribution Test for "HIV Testing, Males": (3.2429228913257564, 0.19760969212357282)
Normal Distribution Test for "Curitiba Total": (6.0730025398130856, 0.048002543979462284)
/home/ubuntu/anaconda/lib/python2.7/site-packages/scipy/stats/mstats_basic.py:1613: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=14
  np.min(n))

Histograms

As is the case with narmaltests above, we don't really have enough data points for a histogram to be useful as a visual indicator of being normal distributed, but again, for future use.

(gg.ggplot(gg.aes(x="Primary Care, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Primary Care, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Curitiba Total"), data=curitibaPublic) + gg.geom_histogram())

Correlation Matrix


In [20]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr


Out[20]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
Primary Care, Females 1.000000 0.930127 0.021855 0.266027 0.975676
Primary Care, Males 0.930127 1.000000 0.005270 0.152996 0.957776
HIV Testing, Males 0.021855 0.005270 1.000000 0.749629 0.182782
HIV Testing, Females 0.266027 0.152996 0.749629 1.000000 0.367992
Curitiba Total 0.975676 0.957776 0.182782 0.367992 1.000000
curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using kendall curitibaPublicCorrKendall

In [21]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Main Group Correlations


In [22]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");


Getting the Twitter Data


In [23]:
# Checking that the data file looks right #
!cat spark/output-final/all.csv | head


city,lat,lon,origdate,topic









cat: write error: Broken pipe

In [24]:
# Read in Twitter data file #
twitterData=pd.read_csv('spark/output-final/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN',''],
                          parse_dates=[3],
                          index_col=[3]
                        )

In [25]:
twitterData.head()


Out[25]:
city lat lon topic
origdate
2014-04-03 21:23:26 Recife -8.057838 -34.882897 Discrimination_Negative
2015-02-07 23:51:00 São Paulo -23.500000 -46.600000 Discrimination_Negative
2014-11-12 20:35:08 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-04-06 12:06:17 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-09-13 15:38:28 São Paulo -23.500000 -46.600000 Discrimination_Negative

In [26]:
twitterDataCounts = pd.DataFrame({"Tweets" : twitterData.groupby(["city"]).size()}).reset_index()
twitterDataCounts.sort("Tweets", ascending=False)


Out[26]:
city Tweets
24 São Paulo 20908
21 Rio de Janeiro 13271
17 Porto Alegre 10766
7 Curitiba 7546
1 Belo Horizonte 6933
4 Brasília 6492
2 Belém 4257
26 Vitória 3474
8 Florianópolis 3140
19 Recife 3090
9 Fortaleza 2622
14 Manaus 2047
22 Salvador 2036
15 Natal 1759
23 São Luís 1677
5 Campo Grande 1594
3 Boa Vista 1232
13 Maceió 1067
0 Aracaju 904
10 Goiânia 849
12 Macapá 846
6 Cuiabá 808
11 João Pessoa 797
25 Teresina 753
20 Rio Branco 629
18 Porto Velho 450
16 Palmas 186

In [27]:
twitterDataCounts.sort("Tweets").plot(x = "city", y = "Tweets", kind = "barh", title = "Number of Tweets per City",
                                legend = False, color="#00aeef", alpha=0.8)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f653da08910>

In [28]:
cityData = pd.merge(twitterDataCounts, populationData, how='outer', left_on="city", right_on="City", copy=True)
cityData = cityData[["City","Population","Tweets"]]
cityData


Out[28]:
City Population Tweets
0 Aracaju 571149 904
1 Belo Horizonte 2375151 6933
2 Belém 1381475 4257
3 Boa Vista 277799 1232
4 Brasília 2481272 6492
5 Campo Grande 776242 1594
6 Cuiabá 540814 808
7 Curitiba 1751907 7546
8 Florianópolis 405189 3140
9 Fortaleza 2452185 2622
10 Goiânia 1297154 849
11 João Pessoa 720954 797
12 Macapá 381091 846
13 Maceió 932078 1067
14 Manaus 1792881 2047
15 Natal 803739 1759
16 Palmas 221742 186
17 Porto Alegre 1409351 10766
18 Porto Velho 392475 450
19 Recife 1537704 3090
20 Rio Branco 308545 629
21 Rio de Janeiro 6320446 13271
22 Salvador 2674923 2036
23 São Luís 958545 1677
24 São Paulo 11152968 20908
25 Teresina 767559 753
26 Vitória 327801 3474

In [29]:
cityData["Tweets per 1,000 inhabitants"] = cityData["Tweets"]/cityData["Population"]*1000
cityData.sort("Tweets per 1,000 inhabitants").plot(x = "City", y = "Tweets per 1,000 inhabitants", kind = "barh", 
                                                   title = "Tweets per 1,000 inhabitants", legend = False,
                                                  color="#00aeef", alpha=0.8)


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f653d40e510>

Case: Curitiba


In [30]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()


Out[30]:
city topic
origdate
2014-11-12 20:35:08 Curitiba Discrimination_Negative
2014-04-06 12:06:17 Curitiba Discrimination_Negative
2014-09-19 02:54:41 Curitiba Discrimination_Negative
2014-03-25 19:49:02 Curitiba Discrimination_Negative
2014-04-22 06:39:53 Curitiba Discrimination_Negative

In [31]:
twitterDataSmall.describe()


Out[31]:
city topic
count 7546 7546
unique 1 7
top Curitiba Discrimination_Negative
freq 7546 5697

In [32]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()


Out[32]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
origdate
2014-11-12 20:35:08 0 1 0 0 0 0 0
2014-04-06 12:06:17 0 1 0 0 0 0 0
2014-09-19 02:54:41 0 1 0 0 0 0 0
2014-03-25 19:49:02 0 1 0 0 0 0 0
2014-04-22 06:39:53 0 1 0 0 0 0 0

In [33]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg


Out[33]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 6 540 12 1 6 61 1 627
2014-02-01 0 498 38 2 11 63 0 612
2014-03-01 2 509 39 2 21 78 0 651
2014-04-01 0 472 53 1 11 56 1 594
2014-05-01 0 371 38 1 5 46 1 462
2014-06-01 0 415 18 1 9 61 1 505
2014-07-01 5 401 12 2 12 87 2 521
2014-08-01 1 349 59 1 5 53 2 470
2014-09-01 0 349 107 0 7 51 0 514
2014-10-01 0 344 62 3 8 40 0 457
2014-11-01 1 265 30 2 12 75 1 386
2014-12-01 8 302 36 1 28 51 4 430
2015-01-01 1 324 27 1 31 77 3 464
2015-02-01 3 274 37 1 26 89 5 435
2015-03-01 9 284 52 1 5 60 7 418

Looking at the Twitter Data


In [34]:
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)



In [35]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr


Out[35]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
Campaign_Portuguese 1.000000 -0.173546 -0.283673 -0.126847 0.155188 0.157308 0.720007 -0.171034
Discrimination_Negative -0.173546 1.000000 -0.236608 0.124414 -0.237682 -0.033660 -0.591547 0.966719
Discrimination_Positive -0.283673 -0.236608 1.000000 -0.292206 -0.264764 -0.506955 -0.130878 -0.104225
Prevention_Negative -0.126847 0.124414 -0.292206 1.000000 -0.007390 0.106317 -0.302603 0.065439
Prevention_Neutral 0.155188 -0.237682 -0.264764 -0.007390 1.000000 0.516719 0.307874 -0.120481
Prevention_Positive 0.157308 -0.033660 -0.506955 0.106317 0.516719 1.000000 0.254708 0.065843
Testing_Neutral 0.720007 -0.591547 -0.130878 -0.302603 0.307874 0.254708 1.000000 -0.552325
Twitter Total -0.171034 0.966719 -0.104225 0.065439 -0.120481 0.065843 -0.552325 1.000000
f, ax = plt.subplots(figsize=(8, 8)) sns.corrplot(twitterDataSmallAgg, annot=False, sig_stars=True, diag_names=False, cmap=cmap, ax=ax) f.tight_layout()

Merging Data


In [36]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True,
      suffixes=('_x', '_y'), copy=True)

In [37]:
df


Out[37]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
2014-01-01 1465 1057 528 186 3236 6 540 12 1 6 61 1 627
2014-02-01 1425 1056 358 133 2972 0 498 38 2 11 63 0 612
2014-03-01 1378 998 389 158 2923 2 509 39 2 21 78 0 651
2014-04-01 1477 1115 383 155 3130 0 472 53 1 11 56 1 594
2014-05-01 1437 1064 348 155 3004 0 371 38 1 5 46 1 462
2014-06-01 1174 840 307 94 2415 0 415 18 1 9 61 1 505
2014-07-01 1495 1197 490 197 3379 5 401 12 2 12 87 2 521
2014-08-01 1267 1009 404 158 2838 1 349 59 1 5 53 2 470
2014-09-01 1438 1250 377 107 3172 0 349 107 0 7 51 0 514
2014-10-01 1413 1187 370 117 3087 0 344 62 3 8 40 0 457
2014-11-01 720 577 405 111 1813 1 265 30 2 12 75 1 386
2014-12-01 686 617 430 156 1889 8 302 36 1 28 51 4 430
2015-01-01 1352 1157 379 112 3000 1 324 27 1 31 77 3 464
2015-02-01 1277 912 437 144 2770 3 274 37 1 26 89 5 435

Comparisons


In [38]:
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1)
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr


Out[38]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
Primary Care, Females 1.000000 0.930127 0.021855 0.266027 -0.340195 0.572681 0.157711 -0.009420 -0.356660 -0.020886 -0.380191
Primary Care, Males 0.930127 1.000000 0.005270 0.152996 -0.335326 0.372802 0.346881 -0.030731 -0.294514 -0.121403 -0.360078
HIV Testing, Males 0.021855 0.005270 1.000000 0.749629 0.787499 0.115768 -0.341484 0.005826 0.095528 0.370342 0.341341
HIV Testing, Females 0.266027 0.152996 0.749629 1.000000 0.629990 0.398823 -0.361133 0.060185 -0.061313 0.194174 0.188662
Campaign_Portuguese -0.340195 -0.335326 0.787499 0.629990 1.000000 -0.000118 -0.443256 -0.064861 0.387285 0.237851 0.538479
Discrimination_Negative 0.572681 0.372802 0.115768 0.398823 -0.000118 1.000000 -0.211037 0.091550 -0.337992 -0.053778 -0.559711
Discrimination_Positive 0.157711 0.346881 -0.341484 -0.361133 -0.443256 -0.211037 1.000000 -0.280880 -0.243220 -0.504345 -0.299572
Prevention_Negative -0.009420 -0.030731 0.005826 0.060185 -0.064861 0.091550 -0.280880 1.000000 -0.041254 0.099711 -0.298511
Prevention_Neutral -0.356660 -0.294514 0.095528 -0.061313 0.387285 -0.337992 -0.243220 -0.041254 1.000000 0.519410 0.685470
Prevention_Positive -0.020886 -0.121403 0.370342 0.194174 0.237851 -0.053778 -0.504345 0.099711 0.519410 1.000000 0.407901
Testing_Neutral -0.380191 -0.360078 0.341341 0.188662 0.538479 -0.559711 -0.299572 -0.298511 0.685470 0.407901 1.000000

In [39]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [68]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr


Out[68]:
Curitiba Total Twitter Total
Curitiba Total 1.000000 0.564951
Twitter Total 0.564951 1.000000

In [76]:
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()
dfTotals["Curitiba Tests Total %"] = dfTotals["Curitiba Total"]/dfTotals["Curitiba Tests Total"]*100
dfTotals["Curitiba Tweets Total %"] = dfTotals["Twitter Total"]/dfTotals["Curitiba Tweets Total"]*100


/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [77]:
dfTotals


Out[77]:
Curitiba Total Twitter Total Curitiba Tests Total Curitiba Tweets Total Curitiba Tests Total % Curitiba Tweets Total %
2014-01-01 3236 627 39628 7128 8.165943 8.796296
2014-02-01 2972 612 39628 7128 7.499748 8.585859
2014-03-01 2923 651 39628 7128 7.376098 9.132997
2014-04-01 3130 594 39628 7128 7.898456 8.333333
2014-05-01 3004 462 39628 7128 7.580499 6.481481
2014-06-01 2415 505 39628 7128 6.094176 7.084736
2014-07-01 3379 521 39628 7128 8.526799 7.309203
2014-08-01 2838 470 39628 7128 7.161603 6.593715
2014-09-01 3172 514 39628 7128 8.004441 7.210999
2014-10-01 3087 457 39628 7128 7.789947 6.411336
2014-11-01 1813 386 39628 7128 4.575048 5.415264
2014-12-01 1889 430 39628 7128 4.766832 6.032548
2015-01-01 3000 464 39628 7128 7.570405 6.509540
2015-02-01 2770 435 39628 7128 6.990007 6.102694

In [79]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'Tests vs. Tweets', y_axis_label = 'Monthly %', x_axis_label = 'Month',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (dfTotals.index.min(),dfTotals.index.max()), y_range = (0,10),
    )
fig.line(dfTotals.index, dfTotals["Curitiba Tests Total %"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Curitiba Tests"),
fig.line(dfTotals.index, dfTotals["Curitiba Tweets Total %"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "Curitiba Tweets")
show(fig)



In [41]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [86]:
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");



In [43]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [44]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [45]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");


Anomaly Detection

We can only do anomaly detection on tweets, because we don't have enough data points for our ground-truth data. As we have very fine temporal information for all tweets (seconds), we can resample to look at hourly and daily aggregates. That shulkd give us enough data for anomaly detection.

First we resample to daily aggregates.


In [55]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()


Out[55]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 0 15 0 0 0 1 0 16
2014-01-02 0 16 0 0 0 0 0 16
2014-01-03 0 15 0 0 0 0 0 15
2014-01-04 0 13 0 0 2 4 0 19
2014-01-05 0 12 0 0 0 4 0 16

As we'll be using r instead of Python for this, we'll use IPyhon's built-in r interpreter using the so called Magic Functions.


In [56]:
%load_ext rmagic


/home/ubuntu/anaconda/lib/python2.7/site-packages/IPython/extensions/rmagic.py:693: UserWarning: The rmagic extension in IPython is deprecated in favour of rpy2.ipython. If available, that will be loaded instead.
http://rpy.sourceforge.net/
  warnings.warn("The rmagic extension in IPython is deprecated in favour of "
%%R update.packages() install.packages("devtools") devtools::install_github("twitter/AnomalyDetection")

In [57]:
%R library(AnomalyDetection)


Out[57]:
<StrVector - Python:0x7f6b1d9e5bd8 / R:0x3a06f68>
[str, str, str, ..., str, str, str]

In [58]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)

In [59]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')

In [60]:
!cat TwitterDailyAgg.csv | head


origdate,Twitter Total
2014-01-01,16.0
2014-01-02,16.0
2014-01-03,15.0
2014-01-04,19.0
2014-01-05,16.0
2014-01-06,21.0
2014-01-07,28.0
2014-01-08,14.0
2014-01-09,14.0

In [61]:
%%R

df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)

In [62]:
%R data(df_r)


Out[62]:
<StrVector - Python:0x7f6b1d9e5878 / R:0x5cf56d8>
[str]

In [63]:
%R df_r


Out[63]:
<DataFrame - Python:0x7f6b1d8fcef0 / R:0x4e64a38>
[StrVector, FloatVector]
  origdate: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7f6b1d9795f0 / R:0x5baeb60>
[str, str, str, ..., str, str, str]
  Twitter.Total: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f6b1d9797e8 / R:0x5baf9d0>
[16.000000, 16.000000, 15.000000, ..., 11.000000, 12.000000, 17.000000]

In [64]:
%%R
data(df_r)
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
In addition: Warning messages:
1: In data(df_r) : data set ‘df_r’ not found
2: In data(df_r) : data set ‘df_r’ not found
3: In max(ares) : no non-missing arguments to max; returning -Inf
4: In max(ares) : no non-missing arguments to max; returning -Inf
Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero

DELETE


In [51]:
%R raw_data


Out[51]:
array([ <DataFrame - Python:0x7f36222810e0 / R:0x6e78748>
[Float..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...]
  <no name>: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f3622281488 / R:0x785a320>
[0.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9ea8 / R:0x6611f80>
[       1,        2,        3, ...,       56,       57,       58]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e92d8 / R:0x7aae890>
[      14,       14,       14, ...,       13,       13,       13]
  ...
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e93f8 / R:0x6891040>
[       4,        4,        4, ...,        0,        0,        0]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9488 / R:0x76d8910>
[     268,      268,      268, ...,      278,      278,      278]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f36222e9248 / R:0x76e6a40>
[       0,        0,        0, ...,        0,        0,        0],
       <FloatVector - Python:0x7f36222811b8 / R:0x7a673c0>
[182.478000, 176.231000, 183.917000, ..., 153.776000, 150.481000, 146.638000]], dtype=object)

In [52]:
%%R

data(raw_data)
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Styling


In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:

In [ ]: