Setting the Stage


In [1]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy.stats  # importing the stats submodule explicitly; scipy.stats.mstats.normaltest is used below
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *  # provides figure(), output_notebook(), and show() used below


BokehJS successfully loaded.

In [2]:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,scikit-learn,nltk,gensim,textblob,ggplot,matplotlib,mpld3,seaborn,bokeh,pymysql -d -n -t -z -v -m -g


Rene Clausen Nielsen, UN Global Pulse 10/03/2015 12:57:57 UTC

CPython 2.7.9
IPython 2.4.1

pandas 0.15.1
numpy 1.9.2
scipy 0.15.1
geolocator 0.2.dev0
scikit-learn 0.15.2
nltk 3.0.1
gensim 0.10.3
textblob 0.9.0
ggplot 0.6.5
matplotlib 1.4.3
mpld3 0.2
seaborn 0.5.1
bokeh 0.8.1
pymysql 0.6.3

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : 

In [3]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
sns.set_style("darkgrid", {"grid.linewidth": .9, "axes.facecolor": ".98"})
sns.set_context("notebook") # paper, notebook, talk, poster
# colour_map = dict(unsafe="red", celebs="blue", general="yellow", awareness="grey", myths="purple", stigma="green", safe="#2ecc71", advocates="#34495e", race="#e74c3c", jokes="#3498db",needle="steelblue", questions="indianred")
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)

Getting the Ground-Truth Data


In [4]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")

In [5]:
curitibaData.sheet_names


Out[5]:
[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']

In [6]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)

In [7]:
curitibaPublic


Out[7]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10 2014-11 2014-12
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413 720 686
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187 577 617
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499 1564 1666
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370 405 430
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117 111 156

In [8]:
curitibaPublic = curitibaPublic.loc[:,:'2014-10'] # Excluding November and December, where the ground-truth data are biased
curitibaPublic


Out[8]:
2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 2014-10
HIV tests performed
Primary Health Care services (females) 1465 1425 1378 1477 1437 1174 1495 1267 1438 1413
Primary Health Care services (males) 1057 1056 998 1115 1064 840 1197 1009 1250 1187
Pregnant women in public health care 2760 2360 2095 2258 2473 2086 2640 2020 2571 2499
HIV testing center (males) 528 358 389 383 348 307 490 404 377 370
HIV testing center (females) 186 133 158 155 155 94 197 158 107 117

Turning the Table


In [9]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic


Out[9]:
HIV tests performed Primary Health Care services (females) Primary Health Care services (males) Pregnant women in public health care HIV testing center (males) HIV testing center (females)
2014-01 1465 1057 2760 528 186
2014-02 1425 1056 2360 358 133
2014-03 1378 998 2095 389 158
2014-04 1477 1115 2258 383 155
2014-05 1437 1064 2473 348 155
2014-06 1174 840 2086 307 94
2014-07 1495 1197 2640 490 197
2014-08 1267 1009 2020 404 158
2014-09 1438 1250 2571 377 107
2014-10 1413 1187 2499 370 117

In [10]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', axis=1)
curitibaPublic


Out[10]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01 1465 1057 528 186
2014-02 1425 1056 358 133
2014-03 1378 998 389 158
2014-04 1477 1115 383 155
2014-05 1437 1064 348 155
2014-06 1174 840 307 94
2014-07 1495 1197 490 197
2014-08 1267 1009 404 158
2014-09 1438 1250 377 107
2014-10 1413 1187 370 117

In [11]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as


Out[11]:
Primary Care, Females    int64
Primary Care, Males      int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object

In [12]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such

In [13]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans


Out[13]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females
2014-01-01 1465 1057 528 186
2014-02-01 1425 1056 358 133
2014-03-01 1378 998 389 158
2014-04-01 1477 1115 383 155
2014-05-01 1437 1064 348 155
2014-06-01 1174 840 307 94
2014-07-01 1495 1197 490 197
2014-08-01 1267 1009 404 158
2014-09-01 1438 1250 377 107
2014-10-01 1413 1187 370 117

In [14]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic


Out[14]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
2014-01-01 1465 1057 528 186 3236
2014-02-01 1425 1056 358 133 2972
2014-03-01 1378 998 389 158 2923
2014-04-01 1477 1115 383 155 3130
2014-05-01 1437 1064 348 155 3004
2014-06-01 1174 840 307 94 2415
2014-07-01 1495 1197 490 197 3379
2014-08-01 1267 1009 404 158 2838
2014-09-01 1438 1250 377 107 3172
2014-10-01 1413 1187 370 117 3087

Looking at the Ground-Truth Data

Now we'll start having a graphical look at the data. First off we'll create a timeseries graph for each test group and for the monthly totals (that is, a timeseries graph for each column).


In [15]:
for col in curitibaPublic:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name,                   # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line surrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(curitibaPublic[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        curitibaPublic.index,                               # Variable values for the x-axis (index = dates)
        curitibaPublic[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    show(fig)


Below we'll draw all the groups as lines in a single chart to make comparison easier.


In [16]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Topics', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA', border_fill = '#FAFAFA', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(),curitibaPublic.index.max()), y_range = (0,1800),
    )
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7, 
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)


Correlation Between Test Groups

We can see above that there is some covariance between the sub-groupings, though July looks like a bit of a dividing month. We'll therefore dig further into potential correlations to see whether changes within the sub-groupings really do track each other. If they do, it would imply a large degree of robustness in the data: if one group gets tested more in a given month, so do the others, suggesting that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.

First we'll make a simple correlation matrix using pandas' built-in DataFrame method, .corr, which correlates all columns pairwise using Pearson, Kendall, or Spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.
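For reference, swapping methods is a one-argument change; a quick sketch (the Pearson matrix itself follows further below):

for method in ('pearson', 'kendall', 'spearman'):   # the three methods pandas supports
    print('--- %s ---' % method)
    print(curitibaPublic.corr(method=method))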

Normal Distribution Test

Pearson assumes that the data is normally distributed. We can't really test that with only ten data points per series, but the code below has been readied for future use.


In [17]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
# print('Normal Distribution Test for "Pregnant Women": %s' % (normalTestPregnantWomen,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))

#curitibaPublic["Primary Care, Females"].normaltest()


Normal Distribution Test for "Primary Care, Females": (6.4451966926332762, 0.039851375941810332)
Normal Distribution Test for "Primary Care, Males": (1.0120404797968383, 0.60289016894457592)
Normal Distribution Test for "HIV Testing, Females": (0.26951130991670885, 0.87392942592337231)
Normal Distribution Test for "HIV Testing, Males": (3.3488328623461694, 0.18741751859687544)
Normal Distribution Test for "Curitiba Total": (5.7922243106692832, 0.055237558175648439)
/home/ubuntu/anaconda/lib/python2.7/site-packages/scipy/stats/mstats_basic.py:1613: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10
  np.min(n))
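
As the warning notes, kurtosistest wants n >= 20. For samples this small, the Shapiro-Wilk test is generally the better fit; a minimal sketch, again readied for future use:

import scipy.stats

for col in curitibaPublic:
    w, p = scipy.stats.shapiro(curitibaPublic[col])   # Shapiro-Wilk statistic and p-value
    print('Shapiro-Wilk for "%s": W=%.3f, p=%.3f' % (col, w, p))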

Histograms

As with the normality tests above, we don't really have enough data points for a histogram to be a useful visual check of normality, but again, this is readied for future use.

# Note: in a notebook cell only the last expression is rendered; wrap each call in print() to display all five.
(gg.ggplot(gg.aes(x="Primary Care, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Primary Care, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Females"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="HIV Testing, Males"), data=curitibaPublic) + gg.geom_histogram())
(gg.ggplot(gg.aes(x="Curitiba Total"), data=curitibaPublic) + gg.geom_histogram())
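
The same histograms can also be drawn without ggplot; a pandas-native sketch, with the same small-sample caveat:

curitibaPublic.hist(figsize=(10, 8), bins=5)   # one panel per column
plt.tight_layout()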

Correlation Matrix


In [20]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr


Out[20]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total
Primary Care, Females 1.000000 0.784215 0.531210 0.536651 0.935275
Primary Care, Males 0.784215 1.000000 0.348880 0.182559 0.859216
HIV Testing, Males 0.531210 0.348880 1.000000 0.820127 0.713403
HIV Testing, Females 0.536651 0.182559 0.820127 1.000000 0.618473
Curitiba Total 0.935275 0.859216 0.713403 0.618473 1.000000
curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall') # Using Kendall
curitibaPublicCorrKendall

In [21]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Main Group Correlations


In [22]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");


Getting the Twitter Data


In [23]:
# Checking that the data file looks right #
!cat ../data/all.csv | head


cat: write error: Broken pipe

In [24]:
# Read in Twitter data file #
twitterData=pd.read_csv('../ipynb/spark/output/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN',''],
                          parse_dates=[3],
                          index_col=[3]
                        )
twitterData.head()


Out[24]:
city lat lon topic
origdate
2014-04-03 21:23:26 Recife -8.057838 -34.882897 Discrimination_Negative
2015-02-07 23:51:00 São Paulo -23.500000 -46.600000 Discrimination_Negative
2014-11-12 20:35:08 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-04-06 12:06:17 Curitiba -25.428954 -49.267137 Discrimination_Negative
2014-09-13 15:38:28 São Paulo -23.500000 -46.600000 Discrimination_Negative

In [25]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()


Out[25]:
city topic
origdate
2014-11-12 20:35:08 Curitiba Discrimination_Negative
2014-04-06 12:06:17 Curitiba Discrimination_Negative
2014-09-19 02:54:41 Curitiba Discrimination_Negative
2014-03-25 19:49:02 Curitiba Discrimination_Negative
2014-04-22 06:39:53 Curitiba Discrimination_Negative

In [26]:
twitterDataSmall.describe()


Out[26]:
city topic
count 7252 7252
unique 1 7
top Curitiba Discrimination_Negative
freq 7252 5502

In [27]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()


Out[27]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
origdate
2014-11-12 20:35:08 0 1 0 0 0 0 0
2014-04-06 12:06:17 0 1 0 0 0 0 0
2014-09-19 02:54:41 0 1 0 0 0 0 0
2014-03-25 19:49:02 0 1 0 0 0 0 0
2014-04-22 06:39:53 0 1 0 0 0 0 0

In [28]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg


Out[28]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 6 540 12 1 6 61 1 627
2014-02-01 0 498 38 2 11 63 0 612
2014-03-01 2 509 39 2 21 78 0 651
2014-04-01 0 472 53 1 11 56 1 594
2014-05-01 0 371 38 1 5 46 1 462
2014-06-01 0 415 18 1 9 61 1 505
2014-07-01 5 401 12 2 12 87 2 521
2014-08-01 1 349 59 1 5 53 2 470
2014-09-01 0 349 107 0 7 51 0 514
2014-10-01 0 344 62 3 8 40 0 457
2014-11-01 1 265 30 2 12 75 1 386
2014-12-01 8 302 36 1 28 51 4 430
2015-01-01 1 324 27 1 31 77 3 464
2015-02-01 3 274 37 1 26 89 5 435
2015-03-01 6 89 9 1 4 15 0 124

Looking at the Twitter Data


In [29]:
for col in twitterDataSmallAgg:
    fig = figure(                                           # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour for plot area
        outline_line_color = '#FAFAFA',                     # Colour of line sorrounding plot
        border_fill = '#FAFAFA',                            # Background colour for surrounding area
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                               # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                             # Colour of the line
        line_width = 10,                                    # Width of the line
        line_alpha = 0.7,                                   # Opacity of the line
        #legend = twitterDataSmallAgg[col].name,        # Label name for the legend (column name)
        )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)



In [30]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr


Out[30]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
Campaign_Portuguese 1.000000 -0.257204 -0.512107 -0.108423 0.235566 -0.080378 0.384959 -0.304044
Discrimination_Negative -0.257204 1.000000 0.068860 0.152680 -0.054787 0.400021 -0.236346 0.969129
Discrimination_Positive -0.512107 0.068860 1.000000 -0.221960 -0.129828 -0.154733 -0.192909 0.212160
Prevention_Negative -0.108423 0.152680 -0.221960 1.000000 -0.003667 0.157744 -0.254824 0.118625
Prevention_Neutral 0.235566 -0.054787 -0.129828 -0.003667 1.000000 0.561457 0.707131 0.094807
Prevention_Positive -0.080378 0.400021 -0.154733 0.157744 0.561457 1.000000 0.463087 0.529034
Testing_Neutral 0.384959 -0.236346 -0.192909 -0.254824 0.707131 0.463087 1.000000 -0.112459
Twitter Total -0.304044 0.969129 0.212160 0.118625 0.094807 0.529034 -0.112459 1.000000

In [31]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(twitterDataSmallAgg, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()


Merging Data


In [32]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left',
      left_index=True, right_index=True, sort=True) # Left-joining on the shared DatetimeIndex; the remaining arguments were pandas defaults
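
Since both frames share a monthly DatetimeIndex, a plain index join is an equivalent, more concise alternative; a sketch:

df_alt = curitibaPublic.join(twitterDataSmallAgg, how='left')   # join on the shared index
df_alt.equals(df)   # should be True: same rows, columns, and values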

In [33]:
df


Out[33]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Curitiba Total Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
2014-01-01 1465 1057 528 186 3236 6 540 12 1 6 61 1 627
2014-02-01 1425 1056 358 133 2972 0 498 38 2 11 63 0 612
2014-03-01 1378 998 389 158 2923 2 509 39 2 21 78 0 651
2014-04-01 1477 1115 383 155 3130 0 472 53 1 11 56 1 594
2014-05-01 1437 1064 348 155 3004 0 371 38 1 5 46 1 462
2014-06-01 1174 840 307 94 2415 0 415 18 1 9 61 1 505
2014-07-01 1495 1197 490 197 3379 5 401 12 2 12 87 2 521
2014-08-01 1267 1009 404 158 2838 1 349 59 1 5 53 2 470
2014-09-01 1438 1250 377 107 3172 0 349 107 0 7 51 0 514
2014-10-01 1413 1187 370 117 3087 0 344 62 3 8 40 0 457

Comparisons


In [34]:
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1) # Dropping the 'Curitiba Total' and 'Twitter Total' columns
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr


Out[34]:
Primary Care, Females Primary Care, Males HIV Testing, Males HIV Testing, Females Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral
Primary Care, Females 1.000000 0.784215 0.531210 0.536651 0.354117 0.225466 0.069446 0.128279 0.078527 0.122174 -0.136857
Primary Care, Males 0.784215 1.000000 0.348880 0.182559 0.103346 -0.321404 0.515700 0.073299 -0.123659 -0.116219 -0.155290
HIV Testing, Males 0.531210 0.348880 1.000000 0.820127 0.936994 0.348728 -0.353389 0.036715 -0.025768 0.439405 0.407001
HIV Testing, Females 0.536651 0.182559 0.820127 1.000000 0.784017 0.374796 -0.503709 0.123243 0.135034 0.527283 0.561009
Campaign_Portuguese 0.354117 0.103346 0.936994 0.784017 1.000000 0.438557 -0.586626 0.081242 0.102658 0.606875 0.421848
Discrimination_Negative 0.225466 -0.321404 0.348728 0.374796 0.438557 1.000000 -0.529473 0.084905 0.498054 0.485684 -0.190649
Discrimination_Positive 0.069446 0.515700 -0.353389 -0.503709 -0.586626 -0.529473 1.000000 -0.317560 -0.189156 -0.550650 -0.428773
Prevention_Negative 0.128279 0.073299 0.036715 0.123243 0.081242 0.084905 -0.317560 1.000000 0.414624 0.173084 -0.200446
Prevention_Neutral 0.078527 -0.123659 -0.025768 0.135034 0.102658 0.498054 -0.189156 0.414624 1.000000 0.684826 -0.325051
Prevention_Positive 0.122174 -0.116219 0.439405 0.527283 0.606875 0.485684 -0.550650 0.173084 0.684826 1.000000 0.270588
Testing_Neutral -0.136857 -0.155290 0.407001 0.561009 0.421848 -0.190649 -0.428773 -0.200446 -0.325051 0.270588 1.000000

In [35]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [36]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr


Out[36]:
Curitiba Total Twitter Total
Curitiba Total 1.000000 0.169366
Twitter Total 0.169366 1.000000

In [37]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [38]:
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");



In [39]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [40]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");



In [41]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");


Anomaly Detection

We can only do anomaly detection on the tweets, because we don't have enough data points in our ground-truth data. As we have very fine temporal information for every tweet (down to the second), we can resample to hourly or daily aggregates. That should give us enough data for anomaly detection.

First we resample to daily aggregates.


In [42]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with daily totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()


Out[42]:
Campaign_Portuguese Discrimination_Negative Discrimination_Positive Prevention_Negative Prevention_Neutral Prevention_Positive Testing_Neutral Twitter Total
origdate
2014-01-01 0 15 0 0 0 1 0 16
2014-01-02 0 16 0 0 0 0 0 16
2014-01-03 0 15 0 0 0 0 0 15
2014-01-04 0 13 0 0 2 4 0 19
2014-01-05 0 12 0 0 0 4 0 16
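
The hourly counterpart mentioned above is a one-line change; a sketch, not used further here:

twitterDataHourlyAgg = twitterDataSmall.resample('H', how='sum')   # hourly bins instead of daily
twitterDataHourlyAgg['Twitter Total'] = twitterDataHourlyAgg.sum(axis=1)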

As we'll be using R instead of Python for this, we'll use IPython's built-in R interpreter via the so-called magic functions.


In [43]:
%load_ext rmagic
%R update.packages(); install.packages("devtools"); devtools::install_github("twitter/AnomalyDetection") # One-time setup; line magic so it can share a cell with %load_ext

In [44]:
%R library(AnomalyDetection)


Out[44]:
array(['AnomalyDetection', 'tools', 'stats', 'graphics', 'grDevices',
       'utils', 'datasets', 'methods', 'base'], 
      dtype='|S16')

In [45]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)

In [46]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')

In [47]:
!cat TwitterDailyAgg.csv | head


origdate,Twitter Total
2014-01-01,16.0
2014-01-02,16.0
2014-01-03,15.0
2014-01-04,19.0
2014-01-05,16.0
2014-01-06,21.0
2014-01-07,28.0
2014-01-08,14.0
2014-01-09,14.0
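
As an aside, the round-trip through a CSV could be skipped by pushing the values straight into R with the same rmagic extension; a sketch (assuming rmagic's numpy conversion handles the array):

tot = twitterDataDailyAgg['Twitter Total'].values   # plain numpy array for rpy2
%Rpush tot
%R summary(tot)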

In [48]:
%%R

df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)

In [49]:
%R data(df_r)


Out[49]:
array(['df_r'], 
      dtype='|S4')

In [50]:
%R df_r


Out[50]:
array([['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
        '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-08',
        '2014-01-09', '2014-01-10', '2014-01-11', '2014-01-12',
        '2014-01-13', '2014-01-14', '2014-01-15', '2014-01-16',
        '2014-01-17', '2014-01-18', '2014-01-19', '2014-01-20',
        '2014-01-21', '2014-01-22', '2014-01-23', '2014-01-24',
        '2014-01-25', '2014-01-26', '2014-01-27', '2014-01-28',
        '2014-01-29', '2014-01-30', '2014-01-31', '2014-02-01',
        '2014-02-02', '2014-02-03', '2014-02-04', '2014-02-05',
        '2014-02-06', '2014-02-07', '2014-02-08', '2014-02-09',
        '2014-02-10', '2014-02-11', '2014-02-12', '2014-02-13',
        '2014-02-14', '2014-02-15', '2014-02-16', '2014-02-17',
        '2014-02-18', '2014-02-19', '2014-02-20', '2014-02-21',
        '2014-02-22', '2014-02-23', '2014-02-24', '2014-02-25',
        '2014-02-26', '2014-02-27', '2014-02-28', '2014-03-01',
        '2014-03-02', '2014-03-03', '2014-03-04', '2014-03-05',
        '2014-03-06', '2014-03-07', '2014-03-08', '2014-03-09',
        '2014-03-10', '2014-03-11', '2014-03-12', '2014-03-13',
        '2014-03-14', '2014-03-15', '2014-03-16', '2014-03-17',
        '2014-03-18', '2014-03-19', '2014-03-20', '2014-03-21',
        '2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25',
        '2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29',
        '2014-03-30', '2014-03-31', '2014-04-01', '2014-04-02',
        '2014-04-03', '2014-04-04', '2014-04-05', '2014-04-06',
        '2014-04-07', '2014-04-08', '2014-04-09', '2014-04-10',
        '2014-04-11', '2014-04-12', '2014-04-13', '2014-04-14',
        '2014-04-15', '2014-04-16', '2014-04-17', '2014-04-18',
        '2014-04-19', '2014-04-20', '2014-04-21', '2014-04-22',
        '2014-04-23', '2014-04-24', '2014-04-25', '2014-04-26',
        '2014-04-27', '2014-04-28', '2014-04-29', '2014-04-30',
        '2014-05-01', '2014-05-02', '2014-05-03', '2014-05-04',
        '2014-05-05', '2014-05-06', '2014-05-07', '2014-05-08',
        '2014-05-09', '2014-05-10', '2014-05-11', '2014-05-12',
        '2014-05-13', '2014-05-14', '2014-05-15', '2014-05-16',
        '2014-05-17', '2014-05-18', '2014-05-19', '2014-05-20',
        '2014-05-21', '2014-05-22', '2014-05-23', '2014-05-24',
        '2014-05-25', '2014-05-26', '2014-05-27', '2014-05-28',
        '2014-05-29', '2014-05-30', '2014-05-31', '2014-06-01',
        '2014-06-02', '2014-06-03', '2014-06-04', '2014-06-05',
        '2014-06-06', '2014-06-07', '2014-06-08', '2014-06-09',
        '2014-06-10', '2014-06-11', '2014-06-12', '2014-06-13',
        '2014-06-14', '2014-06-15', '2014-06-16', '2014-06-17',
        '2014-06-18', '2014-06-19', '2014-06-20', '2014-06-21',
        '2014-06-22', '2014-06-23', '2014-06-24', '2014-06-25',
        '2014-06-26', '2014-06-27', '2014-06-28', '2014-06-29',
        '2014-06-30', '2014-07-01', '2014-07-02', '2014-07-03',
        '2014-07-04', '2014-07-05', '2014-07-06', '2014-07-07',
        '2014-07-08', '2014-07-09', '2014-07-10', '2014-07-11',
        '2014-07-12', '2014-07-13', '2014-07-14', '2014-07-15',
        '2014-07-16', '2014-07-17', '2014-07-18', '2014-07-19',
        '2014-07-20', '2014-07-21', '2014-07-22', '2014-07-23',
        '2014-07-24', '2014-07-25', '2014-07-26', '2014-07-27',
        '2014-07-28', '2014-07-29', '2014-07-30', '2014-07-31',
        '2014-08-01', '2014-08-02', '2014-08-03', '2014-08-04',
        '2014-08-05', '2014-08-06', '2014-08-07', '2014-08-08',
        '2014-08-09', '2014-08-10', '2014-08-11', '2014-08-12',
        '2014-08-13', '2014-08-14', '2014-08-15', '2014-08-16',
        '2014-08-17', '2014-08-18', '2014-08-19', '2014-08-20',
        '2014-08-21', '2014-08-22', '2014-08-23', '2014-08-24',
        '2014-08-25', '2014-08-26', '2014-08-27', '2014-08-28',
        '2014-08-29', '2014-08-30', '2014-08-31', '2014-09-01',
        '2014-09-02', '2014-09-03', '2014-09-04', '2014-09-05',
        '2014-09-06', '2014-09-07', '2014-09-08', '2014-09-09',
        '2014-09-10', '2014-09-11', '2014-09-12', '2014-09-13',
        '2014-09-14', '2014-09-15', '2014-09-16', '2014-09-17',
        '2014-09-18', '2014-09-19', '2014-09-20', '2014-09-21',
        '2014-09-22', '2014-09-23', '2014-09-24', '2014-09-25',
        '2014-09-26', '2014-09-27', '2014-09-28', '2014-09-29',
        '2014-09-30', '2014-10-01', '2014-10-02', '2014-10-03',
        '2014-10-04', '2014-10-05', '2014-10-06', '2014-10-07',
        '2014-10-08', '2014-10-09', '2014-10-10', '2014-10-11',
        '2014-10-12', '2014-10-13', '2014-10-14', '2014-10-15',
        '2014-10-16', '2014-10-17', '2014-10-18', '2014-10-19',
        '2014-10-20', '2014-10-21', '2014-10-22', '2014-10-23',
        '2014-10-24', '2014-10-25', '2014-10-26', '2014-10-27',
        '2014-10-28', '2014-10-29', '2014-10-30', '2014-10-31',
        '2014-11-01', '2014-11-02', '2014-11-03', '2014-11-04',
        '2014-11-05', '2014-11-06', '2014-11-07', '2014-11-08',
        '2014-11-09', '2014-11-10', '2014-11-11', '2014-11-12',
        '2014-11-13', '2014-11-14', '2014-11-15', '2014-11-16',
        '2014-11-17', '2014-11-18', '2014-11-19', '2014-11-20',
        '2014-11-21', '2014-11-22', '2014-11-23', '2014-11-24',
        '2014-11-25', '2014-11-26', '2014-11-27', '2014-11-28',
        '2014-11-29', '2014-11-30', '2014-12-01', '2014-12-02',
        '2014-12-03', '2014-12-04', '2014-12-05', '2014-12-06',
        '2014-12-07', '2014-12-08', '2014-12-09', '2014-12-10',
        '2014-12-11', '2014-12-12', '2014-12-13', '2014-12-14',
        '2014-12-15', '2014-12-16', '2014-12-17', '2014-12-18',
        '2014-12-19', '2014-12-20', '2014-12-21', '2014-12-22',
        '2014-12-23', '2014-12-24', '2014-12-25', '2014-12-26',
        '2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
        '2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
        '2015-01-04', '2015-01-05', '2015-01-06', '2015-01-07',
        '2015-01-08', '2015-01-09', '2015-01-10', '2015-01-11',
        '2015-01-12', '2015-01-13', '2015-01-14', '2015-01-15',
        '2015-01-16', '2015-01-17', '2015-01-18', '2015-01-19',
        '2015-01-20', '2015-01-21', '2015-01-22', '2015-01-23',
        '2015-01-24', '2015-01-25', '2015-01-26', '2015-01-27',
        '2015-01-28', '2015-01-29', '2015-01-30', '2015-01-31',
        '2015-02-01', '2015-02-02', '2015-02-03', '2015-02-04',
        '2015-02-05', '2015-02-06', '2015-02-07', '2015-02-08',
        '2015-02-09', '2015-02-10', '2015-02-11', '2015-02-12',
        '2015-02-13', '2015-02-14', '2015-02-15', '2015-02-16',
        '2015-02-17', '2015-02-18', '2015-02-19', '2015-02-20',
        '2015-02-21', '2015-02-22', '2015-02-23', '2015-02-24',
        '2015-02-25', '2015-02-26', '2015-02-27', '2015-02-28',
        '2015-03-01', '2015-03-02', '2015-03-03', '2015-03-04',
        '2015-03-05', '2015-03-06', '2015-03-07', '2015-03-08',
        '2015-03-09'],
       ['16.0', '16.0', '15.0', '19.0', '16.0', '21.0', '28.0', '14.0',
        '14.0', '22.0', '10.0', '18.0', '21.0', '15.0', '31.0', '34.0',
        '22.0', '17.0', '20.0', '19.0', '24.0', '13.0', '24.0', '19.0',
        '12.0', '13.0', '26.0', '40.0', '28.0', '12.0', '28.0', '34.0',
        '12.0', '29.0', '15.0', '15.0', '24.0', '21.0', '12.0', '27.0',
        '16.0', '19.0', '19.0', '34.0', '27.0', '18.0', '17.0', '31.0',
        '26.0', '32.0', '14.0', '19.0', '16.0', '23.0', '20.0', '22.0',
        '21.0', '13.0', '36.0', '37.0', '19.0', '19.0', '45.0', '25.0',
        '32.0', '23.0', '21.0', '27.0', '14.0', '12.0', '20.0', '30.0',
        '32.0', '15.0', '12.0', '16.0', '11.0', '28.0', '21.0', '23.0',
        '14.0', '18.0', '15.0', '17.0', '17.0', '14.0', '23.0', '15.0',
        '20.0', '16.0', '17.0', '15.0', '18.0', '27.0', '11.0', '17.0',
        '21.0', '17.0', '16.0', '12.0', '17.0', '16.0', '21.0', '8.0',
        '26.0', '18.0', '24.0', '15.0', '20.0', '15.0', '26.0', '25.0',
        '27.0', '23.0', '26.0', '10.0', '27.0', '26.0', '31.0', '22.0',
        '17.0', '11.0', '20.0', '16.0', '14.0', '16.0', '17.0', '14.0',
        '22.0', '16.0', '11.0', '16.0', '9.0', '18.0', '12.0', '19.0',
        '13.0', '13.0', '15.0', '15.0', '7.0', '16.0', '12.0', '16.0',
        '12.0', '23.0', '13.0', '13.0', '11.0', '15.0', '20.0', '10.0',
        '7.0', '22.0', '26.0', '17.0', '15.0', '14.0', '12.0', '15.0',
        '19.0', '17.0', '24.0', '22.0', '17.0', '9.0', '16.0', '17.0',
        '22.0', '14.0', '17.0', '14.0', '14.0', '27.0', '12.0', '15.0',
        '23.0', '15.0', '22.0', '10.0', '21.0', '14.0', '13.0', '13.0',
        '21.0', '13.0', '9.0', '28.0', '24.0', '11.0', '15.0', '14.0',
        '30.0', '9.0', '17.0', '16.0', '20.0', '11.0', '20.0', '11.0',
        '11.0', '14.0', '25.0', '22.0', '24.0', '10.0', '12.0', '12.0',
        '27.0', '23.0', '16.0', '16.0', '15.0', '11.0', '12.0', '19.0',
        '18.0', '17.0', '24.0', '14.0', '15.0', '13.0', '6.0', '12.0',
        '12.0', '10.0', '16.0', '13.0', '17.0', '9.0', '13.0', '14.0',
        '8.0', '12.0', '8.0', '13.0', '14.0', '11.0', '24.0', '19.0',
        '37.0', '26.0', '18.0', '13.0', '13.0', '26.0', '18.0', '22.0',
        '27.0', '8.0', '15.0', '26.0', '19.0', '15.0', '22.0', '18.0',
        '15.0', '12.0', '19.0', '15.0', '14.0', '23.0', '14.0', '10.0',
        '16.0', '20.0', '11.0', '12.0', '15.0', '12.0', '9.0', '39.0',
        '16.0', '12.0', '16.0', '31.0', '10.0', '16.0', '18.0', '3.0',
        '15.0', '29.0', '12.0', '11.0', '10.0', '18.0', '24.0', '12.0',
        '15.0', '10.0', '10.0', '17.0', '29.0', '16.0', '10.0', '11.0',
        '14.0', '9.0', '12.0', '5.0', '13.0', '16.0', '18.0', '15.0',
        '5.0', '12.0', '11.0', '14.0', '12.0', '12.0', '15.0', '15.0',
        '16.0', '20.0', '16.0', '14.0', '17.0', '14.0', '13.0', '6.0',
        '25.0', '14.0', '15.0', '19.0', '13.0', '3.0', '15.0', '13.0',
        '13.0', '6.0', '7.0', '11.0', '10.0', '10.0', '61.0', '8.0',
        '17.0', '6.0', '14.0', '12.0', '4.0', '14.0', '8.0', '14.0',
        '19.0', '14.0', '18.0', '7.0', '12.0', '12.0', '10.0', '16.0',
        '21.0', '10.0', '11.0', '14.0', '22.0', '13.0', '5.0', '13.0',
        '12.0', '9.0', '21.0', '7.0', '6.0', '4.0', '11.0', '12.0', '14.0',
        '11.0', '10.0', '10.0', '17.0', '20.0', '17.0', '6.0', '11.0',
        '6.0', '11.0', '22.0', '29.0', '11.0', '13.0', '17.0', '19.0',
        '24.0', '16.0', '18.0', '21.0', '12.0', '17.0', '11.0', '9.0',
        '30.0', '24.0', '11.0', '12.0', '14.0', '17.0', '14.0', '18.0',
        '18.0', '17.0', '18.0', '19.0', '17.0', '18.0', '26.0', '25.0',
        '15.0', '7.0', '14.0', '16.0', '17.0', '20.0', '10.0', '12.0',
        '14.0', '15.0', '10.0', '9.0', '16.0', '15.0', '12.0', '9.0',
        '8.0', '8.0', '26.0', '13.0', '19.0', '14.0', '19.0', '8.0']], 
      dtype='|S32')

In [51]:
%%R
data(df_r)
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
In addition: Warning messages:
1: In data(df_r) : data set ‘df_r’ not found
2: In data(df_r) : data set ‘df_r’ not found
3: In max(ares) : no non-missing arguments to max; returning -Inf
4: In max(ares) : no non-missing arguments to max; returning -Inf
Error in R_idx[i] <- data[[1]][temp_max_idx] : 
  replacement has length zero
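
The failure has two likely causes: data(df_r) looks for a dataset shipped with an R package rather than for our variable (hence the "data set 'df_r' not found" warnings), and AnomalyDetectionTs expects a two-column data frame whose first column holds POSIXct timestamps, while our CSV arrives as character dates. A hedged sketch of how the export might be reshaped for a retry (column names are illustrative; R would still need df_r$timestamp <- as.POSIXct(df_r$timestamp) before the call):

df_retry = twitterDataDailyAgg['Twitter Total'].reset_index()   # hypothetical retry
df_retry.columns = ['timestamp', 'count']                       # illustrative names
df_retry.to_csv('TwitterDailyAgg2.csv', index=False, date_format='%Y-%m-%d %H:%M:%S')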

Sanity Check with the Package's Example Data


In [52]:
%R raw_data


Out[52]:
array([ <DataFrame - Python:0x7f9457410c20 / R:0x796a848>
[Float..., IntVe..., IntVe..., ..., IntVe..., IntVe..., IntVe...]
  <no name>: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7f94574108c0 / R:0x5f9c910>
[0.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410dd0 / R:0x589e3f0>
[       1,        2,        3, ...,       56,       57,       58]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410b90 / R:0x4c88ff0>
[      14,       14,       14, ...,       13,       13,       13]
  ...
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410560 / R:0x6207940>
[       4,        4,        4, ...,        0,        0,        0]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f94574106c8 / R:0x645d960>
[     268,      268,      268, ...,      278,      278,      278]
  <no name>: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7f9457410710 / R:0x60587c0>
[       0,        0,        0, ...,        0,        0,        0],
       <FloatVector - Python:0x7f9457410998 / R:0x58f9ed0>
[182.478000, 176.231000, 183.917000, ..., 153.776000, 150.481000, 146.638000]], dtype=object)

In [53]:
%%R

data(raw_data)
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot


Styling


In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:
